# Datasets processing

## Import and preprocessing

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import warnings
warnings.filterwarnings("ignore")

# Root folder of the raw exports; one sub-folder per institution, each holding
# a 'Hashtags.csv' and a 'Mentions.csv'.
RAW_DIR = 'data/raw/Enti'


def _load_tagged(entity, kind):
    """Read one raw export and record how its tweets were collected.

    Parameters
    ----------
    entity : str
        Sub-folder of RAW_DIR to read from (e.g. 'INPS').
    kind : str
        'hashtag' or 'mention'. Selects the file '<Kind>s.csv' and is written
        to every row of the new 'type' column.

    Returns
    -------
    pandas.DataFrame
        The file content plus the 'type' column.
    """
    frame = pd.read_csv(f"{RAW_DIR}/{entity}/{kind.capitalize()}s.csv")
    frame['type'] = kind
    return frame


#INPS
ht_inps = _load_tagged('INPS', 'hashtag')
mn_inps = _load_tagged('INPS', 'mention')

#INAIL
ht_inail = _load_tagged('INAIL', 'hashtag')
mn_inail = _load_tagged('INAIL', 'mention')

#Protezione Civile
ht_pc = _load_tagged('Protezione Civile', 'hashtag')
mn_pc = _load_tagged('Protezione Civile', 'mention')

In [2]:
import numpy as np

def _combine_and_flag(hashtags, mentions):
    """Stack the hashtag and mention exports and add a 'retweet' column.

    Returns the two-element list that was concatenated together with the
    concatenated frame. The original row labels of both inputs are kept, so
    labels may repeat in the result.
    """
    parts = [hashtags, mentions]
    combined = pd.concat(parts)
    # The marker 'RT @' is searched anywhere in the text, not only at the start.
    has_marker = combined['tweet'].str.contains('RT @')
    combined['retweet'] = np.where(has_marker, True, False)
    return parts, combined


#INPS
frames_inps, df_inps = _combine_and_flag(ht_inps, mn_inps)

#INAIL
frames_inail, df_inail = _combine_and_flag(ht_inail, mn_inail)

#Protezione Civile
frames_pc, df_pc = _combine_and_flag(ht_pc, mn_pc)

In [15]:
#Dataset infos
def get_stats(df):
    """Print a short composition report for a tweet frame.

    Reports the overall shape, the shape and percentage share of the rows
    whose 'type' is 'mention' / 'hashtag', and - only when a 'retweet' column
    is present - the shape and percentage share of the retweets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'type' column and, optionally, a 'retweet' column.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("Dataset Shape: ", df.shape)
    print("\t Mentions - Hashtags")
    kind = df['type']
    for label, value in (("#Mentions:", 'mention'), ("#Hashtags:", 'hashtag')):
        print(label, df[kind == value].shape)
    print(kind.value_counts(normalize=True).mul(100))

    # Frames that already had the retweet flag dropped stop here.
    if "retweet" not in df:
        return

    flags = df['retweet']
    print("\t Retweet")
    print("#Retweet:", df[flags == True].shape)
    print(flags.value_counts(normalize=True).mul(100))

In [16]:
# Composition of the combined INPS frame while retweets are still included.
get_stats(df_inps)

Dataset Shape:  (64373, 13)
	 Mentions - Hashtags
#Mentions: (36460, 13)
#Hashtags: (27913, 13)
mention    56.638653
hashtag    43.361347
Name: type, dtype: float64
	 Retweet
#Retweet: (25214, 13)
False    60.831404
True     39.168596
Name: retweet, dtype: float64


In [5]:
# Preview the first rows of the combined INPS frame.
df_inps.head()

Unnamed: 0,author id,created_at,geo,tweet_id,lang,like_count,quote_count,reply_count,retweet_count,source,tweet,type,retweet
0,1171862947951534080,2021-01-30 20:44:02+00:00,,1355617791865450496,it,0,0,0,6,Twitter for Android,RT @FLEPAR__: 4^ tappa maratona FLEPAR nuovo #sistemapubblico: RiStrutturiamo e RiOrganizziamo le #PA l'occasione del #RecoveryFund\n“Traghe…,hashtag,True
1,1171862947951534080,2021-01-30 20:43:51+00:00,,1355617747657482242,it,0,0,0,8,Twitter for Android,"RT @FLEPAR__: #Covid, Tridico: dall'#Inps prova di efficienza, 15 mln di beneficiari. Lo ha detto al convegno Flepar ""Traghettare il ricamb…",hashtag,True
2,1171862947951534080,2021-01-30 20:43:37+00:00,,1355617686559059971,it,0,0,0,7,Twitter for Android,"RT @FLEPAR__: “il problema è che c’è una parte di pensionandi, di lavoratori che si trova in un limbo. In un sistema previdenziale misto: r…",hashtag,True
3,915658126451494912,2021-01-30 20:24:52+00:00,,1355612969942581250,it,0,0,0,0,Twitter for Android,"Ho parlato di settori non essenziali...In lockdown tutti lavoravano nell'#Inps,#AgenziaEntrate Scuole ecc?.fatto solo un es.. https://t.co/8W44EvZaPz",hashtag,False
4,3025288924,2021-01-30 20:13:25+00:00,,1355610087906271232,it,0,0,0,58,Twitter Web App,RT @tetrabondi: Bisognerebbe usare tutte le proprie energie per tentare di vivere al meglio e combattere le proprie patologie. Invece tropp…,hashtag,True


In [17]:
#Removing retweets and unnecessary columns

# Columns kept for the rest of the notebook.
KEPT_COLUMNS = ['created_at', 'tweet_id', 'tweet', 'type']


def _originals_only(frame):
    """Return the non-retweet rows of `frame`, restricted to KEPT_COLUMNS.

    The cell rebinds the same names it reads, so on a second run the
    'retweet' column is already gone. In that case the filter is skipped
    instead of raising KeyError, which makes the cell safe to re-run.
    A copy is returned so that later column assignments never write into a
    slice of the original frame.
    """
    if 'retweet' in frame.columns:
        frame = frame.loc[frame['retweet'] == False]
    return frame[KEPT_COLUMNS].copy()


#INPS
df_inps = _originals_only(df_inps)

#INAIL
df_inail = _originals_only(df_inail)

#Protezione Civile
df_pc = _originals_only(df_pc)

In [19]:
# Composition of the INPS frame once retweets and unused columns are gone.
get_stats(df_inps)

Dataset Shape:  (39159, 4)
	 Mentions - Hashtags
#Mentions: (23133, 4)
#Hashtags: (16026, 4)
mention    59.074542
hashtag    40.925458
Name: type, dtype: float64


## Silver labelling

In [21]:
#Emoji lists
# Emoji treated as markers of a positive tweet.
positive_emoticons = [
    "😀", "😃", "😄", "😁", "😆", "🤣", "😂",
    "🙂", "😊", "😍", "🥰", "🤩", "☺", "🥳",
]
# Emoji treated as markers of a negative tweet.
negative_emoticons = [
    "😒", "😔", "😟", "🙁", "☹", "😥", "😢", "😭",
    "😱", "😞", "😓", "😩", "😫", "😡", "😠", "🤬",
]

In [22]:
#Definition of silver labels based on the presence / absence of emojis within the entire tweet

def _split_by_emoji(frame):
    """Split `frame` into positive, negative and neutral tweets by emoji.

    A tweet is positive (negative) when its text contains at least one emoji
    from `positive_emoticons` (`negative_emoticons`); a tweet containing both
    kinds is returned in both frames. Neutral tweets contain neither.

    The neutral frame is built from the complement of the two masks rather
    than from concat + drop_duplicates(keep=False): that trick also threw
    away every emoji-free row that happened to be duplicated in the input.
    Rows with missing text count as containing no emoji (na=False).

    Returns
    -------
    tuple of pandas.DataFrame
        (positive, negative, neutral), each an independent copy.
    """
    text = frame['tweet']
    has_pos = text.str.contains('|'.join(positive_emoticons), na=False)
    has_neg = text.str.contains('|'.join(negative_emoticons), na=False)
    positive = frame.loc[has_pos].copy()
    negative = frame.loc[has_neg].copy()
    neutral = frame.loc[~(has_pos | has_neg)].copy()
    return positive, negative, neutral


#INPS
pos_df_inps, neg_df_inps, neutral_df_inps = _split_by_emoji(df_inps)

#INAIL
pos_df_inail, neg_df_inail, neutral_df_inail = _split_by_emoji(df_inail)

#Protezione Civile
pos_df_pc, neg_df_pc, neutral_df_pc = _split_by_emoji(df_pc)

In [23]:
# Composition of the Protezione Civile tweets that ended up in the neutral set.
get_stats(neutral_df_pc)

Dataset Shape:  (15104, 4)
	 Mentions - Hashtags
#Mentions: (8130, 4)
#Hashtags: (6974, 4)
mention    53.826801
hashtag    46.173199
Name: type, dtype: float64


In [11]:
#tweets containing both positive and negative emoticons
# No `on=` is passed, so the inner join uses every column the two frames share.
int_df_inps = pos_df_inps.merge(neg_df_inps, how='inner')
int_df_inail = pos_df_inail.merge(neg_df_inail, how='inner')
int_df_pc = pos_df_pc.merge(neg_df_pc, how='inner')

In [12]:
# Size of the INPS positive/negative intersection (tweets matching both emoji lists).
int_df_inps.shape

(13, 4)

In [13]:
#Sampling neutral datasets to balance classes

# Share of each neutral set that is kept.
NEUTRAL_FRACTION = 0.015
# Fixed seed so that re-running the notebook draws the same neutral tweets.
NEUTRAL_SEED = 1

#INPS
sample_neutral_df_inps = neutral_df_inps.sample(frac=NEUTRAL_FRACTION, random_state=NEUTRAL_SEED)

#INAIL
sample_neutral_df_inail = neutral_df_inail.sample(frac=NEUTRAL_FRACTION, random_state=NEUTRAL_SEED)

#Protezione Civile
sample_neutral_df_pc = neutral_df_pc.sample(frac=NEUTRAL_FRACTION, random_state=NEUTRAL_SEED)

In [14]:
# Size of the INAIL negative set.
neg_df_inail.shape

(22, 4)

In [15]:
#Added polarity and topic column

def _with_labels(frame, polarity, topic):
    """Return a new frame with constant 'polarity' and 'topic' columns.

    `assign` builds a new DataFrame instead of writing into `frame`, so no
    chained-assignment warning is produced when `frame` was obtained by
    slicing another frame, and running the cell twice gives the same result.
    """
    return frame.assign(polarity=polarity, topic=topic)


#INPS
pos_df_inps = _with_labels(pos_df_inps, 'positive', 'inps')
neg_df_inps = _with_labels(neg_df_inps, 'negative', 'inps')
sample_neutral_df_inps = _with_labels(sample_neutral_df_inps, 'neutral', 'inps')

#INAIL
pos_df_inail = _with_labels(pos_df_inail, 'positive', 'inail')
neg_df_inail = _with_labels(neg_df_inail, 'negative', 'inail')
sample_neutral_df_inail = _with_labels(sample_neutral_df_inail, 'neutral', 'inail')

#Protezione civile
pos_df_pc = _with_labels(pos_df_pc, 'positive', 'pc')
neg_df_pc = _with_labels(neg_df_pc, 'negative', 'pc')
sample_neutral_df_pc = _with_labels(sample_neutral_df_pc, 'neutral', 'pc')

In [18]:
#concatenation of all dataframes
# Order: positives, then negatives, then the sampled neutrals - each as INPS, INAIL, PC.
labelled_parts = [
    pos_df_inps, pos_df_inail, pos_df_pc,
    neg_df_inps, neg_df_inail, neg_df_pc,
    sample_neutral_df_inps, sample_neutral_df_inail, sample_neutral_df_pc,
]
df_total = pd.concat(labelled_parts)
df_total.head()

Unnamed: 0,created_at,tweet_id,tweet,type,polarity,topic
1266,2021-01-25 07:41:19+00:00,1353608875103166476,"#INPS i conti in rosso ma #tridico si è aumentato lo stipendio,se così fosse scoppierebbe la guerra civile prima di scrivere cazzate pensateci bene @Affaritaliani 🤣😂😂 \nInps, scatta l'allarme rosso.""Buco da 16 miliardi, le pensioni sono a rischio"" https://t.co/uJue2EjReB",hashtag,positive,inps
2359,2021-01-16 14:43:02+00:00,1350453512510070786,"Il ""grande potere"" del mullet. \nFinalmente l' nnovazione sbarca anche all' #INPS 😆 https://t.co/Ir6RBo24Sq",hashtag,positive,inps
2454,2021-01-14 20:57:42+00:00,1349823027593306122,"Come il sito #inps di #Tridico nel ""clickday""? 😂 https://t.co/xFdP4Jkhdj",hashtag,positive,inps
2467,2021-01-14 15:21:59+00:00,1349738539357900804,@borghi_claudio Ma sono gli stessi che violarono i server #inps nel #clickday? 😁🤭,hashtag,positive,inps
2471,2021-01-14 13:23:11+00:00,1349708643248308228,"@INPS_it Buongiorno, ho presentato domanda per il ristori ter il 14/12/2020.. ancora in ATTESA DI ESITO. A quando novità? grazie 🙂 @INPS_it #INPS #BONUS #ristori #ter",hashtag,positive,inps


In [19]:
#Dataset shuffling
# Fixed seed: the row order decides which copy of a duplicated tweet survives
# the de-duplication below, so an unseeded shuffle made the result irreproducible.
SHUFFLE_SEED = 42
df_total_shuffle = df_total.sample(frac=1, random_state=SHUFFLE_SEED)
#Duplicate removal
# NOTE(review): a tweet matching both emoji lists is present in both the
# positive and the negative frame, so the same tweet_id can carry two
# different 'polarity' values here; keep='first' retains whichever copy the
# shuffle put first. Confirm that this is the intended handling.
df_total_shuffle = df_total_shuffle.drop_duplicates(keep='first',subset=['tweet_id'])

In [None]:
#First round annotation file
# Number of rows drawn for the first file.
FIRST_ROUND_SIZE = 322
ann_1 = df_total_shuffle.sample(random_state=1, n=FIRST_ROUND_SIZE)
ann_1.to_csv(path_or_buf="Annotazione_1.csv")

In [None]:
#Second round annotation file
# Keep every tweet that was not drawn for the first round. A membership mask
# states this directly and, unlike concat + drop_duplicates(keep=False), cannot
# drop rows for any reason other than being in ann_1.
in_first_round = df_total_shuffle['tweet_id'].isin(ann_1['tweet_id'])
ann_2 = df_total_shuffle.loc[~in_first_round]
# The '.csv' extension was missing; the name now matches the first-round file.
ann_2.to_csv("Annotazione_2.csv")