In [75]:
import os
import pandas as pd
import re
from collections import Counter

In [76]:
# Build the path to the training TSV from its directory components.
data_dir = "../data"
training_sets_dir = "haspeede2_dev"
training_file = "haspeede2_dev_taskAB.tsv"
train_path = os.path.join(data_dir, training_sets_dir, training_file)

# The first row of the file is the header. The tweet column is keyed as
# 'text ' (with a trailing space) in that header, so it is renamed to 'text'
# as part of loading.
train = pd.read_table(train_path, header=0).rename(columns={'text ': 'text'})

In [77]:
# Tally every raw '#' character per label (hs == 1 vs. any other value),
# whether or not it starts a well-formed hashtag.
texts = train[['hs','text']].values
hs_count = sum(tweet.count('#') for label, tweet in texts if label == 1)
not_hs_count = sum(tweet.count('#') for label, tweet in texts if not (label == 1))

print(f"# in HS tweets: {hs_count}, # in notHS tweets: {not_hs_count}, total: {hs_count+not_hs_count}")

# in HS tweets: 1779, # in notHS tweets: 3239, total: 5018


In [78]:
# A "valid" hashtag is a '#' immediately followed by one or more word
# characters; the capture group yields the tag without the leading '#'.
hashtag_re = re.compile(r'#([\w]+)')
texts = train[['hs','text']].values

# Collect every matched tag (duplicates kept, original casing kept) into the
# list that corresponds to the tweet's label.
hs_ht = []
not_hs_ht = []
for label, tweet in texts:
    target = hs_ht if label == 1 else not_hs_ht
    target.extend(hashtag_re.findall(tweet))

print(f"valid hashtags in HS tweets: {len(hs_ht)}, valid hashtags in notHS tweets: {len(not_hs_ht)}, total: {len(hs_ht+not_hs_ht)}")

valid hashtags in HS tweets: 1760, valid hashtags in notHS tweets: 3222, total: 4982


In [79]:
# Distinct hashtags (case-sensitive, as extracted): HS, notHS, and both combined.
unique_hs_ht = set(hs_ht)
unique_not_hs_ht = set(not_hs_ht)
print(len(unique_hs_ht), len(unique_not_hs_ht), len(unique_hs_ht | unique_not_hs_ht))

895 1538 2168


In [80]:
# How many distinct hashtags occur under both labels (HS and notHS).
len(set(hs_ht) & set(not_hs_ht))

265

In [81]:
# Print every tweet in which at least one '#' is not picked up by the hashtag
# regex, i.e. the number of regex matches differs from the raw '#' count.
hashtag_re = re.compile(r'#([\w]+)')
texts = train[['hs','text']].values
for _label, tweet in texts:
    matched_tags = hashtag_re.findall(tweet)
    if len(matched_tags) != tweet.count('#'):
        print(tweet)

# mostly '#...', '# ' and '#punctuation'

Oltre 600 migranti a Catania, tra loro il piccolo Favour nato dopo il salvataggio - Giornale di Sicilia URL #Catania #… 
#Papa Guerra e terrorismo non centrano con la religione. Anche noi abbiamo i fondamentalisti #Papa: #'Guerra ... URL 
@user a # bianconero diffende i due sequestratori della donna #rom. Voglio vedere se la #rom era sua sorella o sua figlia. Vergogna! 
Macomer, era ricercata da sei mesi: nomade arrestata dopo l'inseguimento sulla 131 - L'Unione Sarda URL #OlbiaTempio #… 
Earth Hour, il mondo spegne la luce per un'ora Le immagini - URL # l'italia pensa ad un altro: al papa dei migranti 
Le vittime del #terrorismo dell'#Isis in #Turquia oltre a fare vittime fra # stranieri hanno ammazzato anche turisti di religione #islamica 
#Roma arrestata la “regina” dei furti una rom di 36 anni #Roma, #arrestata #la #“regina” #dei #furti: #una ... URL 
Roma, cinese morta: da oggi la bonifica nel campo rom. I residenti: «Raggi venga a vedere» - Il Messaggero URL #Roma #… 
Londra-Tehe

In [82]:
# Minimum number of occurrences (case-insensitive, across both labels) a
# hashtag needs in order to become a feature.
# TODO: the cut-off still has to be chosen properly (probably higher than 6?).
MIN_HT_FREQ = 6

# Lower-case every extracted tag so that differently-cased spellings are
# counted together.
total_ht = [tag.lower() for tag in hs_ht + not_hs_ht]
counter = Counter(total_ht)

# Iterating the counter keeps first-seen order, which fixes the order of
# `frequent_ht`.
frequent_ht = [tag for tag, freq in counter.items() if freq >= MIN_HT_FREQ]
print(len(frequent_ht))

124


In [83]:
# Build one feature row per tweet: the tweet id plus, for every frequent
# hashtag, the number of times it occurs in the tweet (matched
# case-insensitively via lower-casing).
#
# NOTE(review): despite the `binary_` prefix these are occurrence counts, so a
# tweet repeating a hashtag gets a value > 1. Confirm whether 0/1 indicators
# were intended before relying on the name.
# NOTE(review): columns are keyed by the hashtag text itself, so a frequent
# hashtag literally equal to 'id' would still collide with the id column.
texts = train[['id','text']].values

# Membership is tested against the hashtag vocabulary, not against `row`:
# `row` also holds the 'id' key, so a tweet containing '#id' / '#ID' would
# otherwise increment its own tweet id.
frequent_ht_set = set(frequent_ht)

binary_ht_features = []
# `tweet_id` rather than `id`, so the builtin `id` is not shadowed for the
# rest of the kernel session.
for tweet_id, text in texts:
    row = {'id': tweet_id}
    # Start every frequent hashtag at 0, in `frequent_ht` order.
    row.update(dict.fromkeys(frequent_ht, 0))
    for ht in hashtag_re.findall(text):
        key = ht.lower()
        if key in frequent_ht_set:
            row[key] += 1
    binary_ht_features.append(row)
# (rows, cols)
len(binary_ht_features), len(binary_ht_features[0])

(6837, 125)

In [84]:
# Turn the list of per-tweet feature dicts into a DataFrame (one column per
# dict key), then show its shape and the first five rows.
hashtags_features_df = pd.DataFrame(data=binary_ht_features)
print(hashtags_features_df.shape)
hashtags_features_df.head(n=5)

(6837, 125)


Unnamed: 0,id,matrix,invasione,stopislam,lagabbia,salvini,italia,migranti,immigrati,parigi,...,chilhavisto,repubblica,ultimora,cronaca,rifugiati,sinti,malta,lucano,3ottobre,globalcompact
0,2066,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2045,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,61,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1259,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,949,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
