# Prep Work
To run this notebook properly on Colab you need to execute the following cell in order to clone the repository and get the necessary data

In [None]:
! git clone https://github.com/collab-uniba/Benchmark_sentimentAnalysis_ITA.git

# Preparing SentiStrenght files from Custom Sentix Lexicon 

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

sentix_SS = pd.read_csv("/content/Benchmark_sentimentAnalysis_ITA/data/external/lexicon/sentix_ss.csv")

In [None]:
#IdiomLookupTable
idiom = sentix_SS[(sentix_SS['Lemma'].str.contains('_'))==True]
idiom["Lemma"]= idiom["Lemma"].str.replace("(","")
idiom["Lemma"]= idiom["Lemma"].str.replace(")","") 

#BoosterWordList
bwl = ["abbastanza", "sufficientemente","bastantemente","troppo","poco","pochino","minimamente","pochissimo","meno","alquanto","piuttosto","assai","molto","grandemente","tanto","massimamente","affatto","più","affatto","proprio","veramente","assolutamente","notevolmente","particolarmente","davvero","super","iper","mega","maxi","completamente"]
boosterWordList = sentix_SS[sentix_SS['Lemma'].isin(bwl)]

#NegatingWordList
nwl = ["non",'nemmeno','neanche','neppure','no']
negatingWordList = sentix_SS[sentix_SS['Lemma'].isin(nwl)]

#EmotionLookupTable
sentix_SS = pd.concat([sentix_SS,idiom,boosterWordList,negatingWordList]).drop_duplicates(keep=False,subset='New_Lemma')

In [None]:
#EmotionLookupTable
elt = sentix_SS[['New_Lemma','Polarity SS']]
elt['Polarity SS'] = elt['Polarity SS'].astype('int') 
elt.to_csv('/content/Benchmark_sentimentAnalysis_ITA/data/external/SentiStrenght/SentiStrength_Italian_Custom/EmotionLookupTable.txt', sep='\t',header=None, index=None)

#IdiomLookupTable
ilt = idiom[["Lemma","Polarity SS"]]
ilt['Polarity SS'] = ilt['Polarity SS'].astype('int') 
ilt.to_csv('/content/Benchmark_sentimentAnalysis_ITA/data/external/SentiStrenght/SentiStrength_Italian_Custom/IdiomLookupTable.txt', sep='\t',header=None, index=None)

#BoosterWordList
bwl = boosterWordList[["New_Lemma","Polarity SS"]]
bwl['Polarity SS'] = bwl['Polarity SS'].astype('int') 
bwl.to_csv('/content/Benchmark_sentimentAnalysis_ITA/data/external/SentiStrenght/SentiStrength_Italian_Custom/BoosterWordList.txt', sep='\t',header=None, index=None)

#NegatingWordList
nwl = negatingWordList[["New_Lemma"]]
nwl.to_csv('/content/Benchmark_sentimentAnalysis_ITA/data/external/SentiStrenght/SentiStrength_Italian_Custom/NegatingWordList.txt',header=None, index=None)

In [3]:
#Idiom handling
idiom['Lemma_2'] = idiom['Lemma'].str.replace('_',' ')
idList = idiom['Lemma_2'].to_list()
idList2 = idiom['Lemma'].to_list()

# Enti Pubblici Dataset
## Preparing Gold Standard for SentiStrenght

In [4]:
import pandas as pd
gs = pd.read_csv("/content/Benchmark_sentimentAnalysis_ITA/data/processed/Enti/test.csv")

In [5]:
#search and replace every idiomatic expression which is in list in tweets
I=pd.Series(gs.tweet_x)
for i in idList:
    I.replace(i, i.replace(" ","_") ,regex=False, inplace = True)
I=I.to_frame()

## Processing gold standard with Spacy

In [None]:
! pip install spacy==3.2.4
! python -m spacy download it_core_news_lg

In [6]:
import spacy

nlp = spacy.load("it_core_news_lg")
I['spacy'] = I['tweet_x'].apply(lambda x: nlp(x))
docs = I['spacy'].to_list()

In [7]:
#Extract lemmatization and POS tagging from individual spaCy doc
def extract_lemma_pos(doc:spacy.tokens.doc.Doc):
    return [token.lemma_+"_"+token.pos_ for token in doc]

In [8]:
#Tokenization and application of lemmatization and POS tagging
def tidy_tokens(docs):
    cols = ["doc_id","processed"]
    meta_df = []
    for ix, doc in enumerate(docs):
        meta = extract_lemma_pos(doc)
        meta = pd.DataFrame(meta)
        meta.columns = cols[1:]
        meta = meta.assign(doc_id = ix).loc[:, cols]
        meta_df.append(meta)
        
    return pd.concat(meta_df)  

In [9]:
df = tidy_tokens(docs)
df.head()

Unnamed: 0,doc_id,processed
0,0,invece_ADV
1,0,mi_PRON
2,0,sapere_VERB
3,0,proprio_ADV
4,0,che_SCONJ


In [10]:
df = df.groupby('doc_id')['processed'].apply(' '.join).reset_index()

In [11]:
list = ["😀","😃","😄","😁","😆","🤣","😂","🙂","😊","😍","🥰","🤩","☺","🥳","😒","😔","😟","🙁","☹","😥","😢","😭","😱","😞","😓","😩","😫","😡","😠","🤬"]

S=pd.Series(df.processed)

#Regular expression to replace the POS tags with the lowercase version or to delete them in order to recognize terms in the support files (QuestionWords)
for i in list:
    S.replace(i+'\S+', i,regex=True, inplace = True)

for i in idList2:
    S.replace(i+'\S+', i,regex=True, inplace = True)

S.replace("NOUN","noun",regex=True, inplace = True)
S.replace("ADJ","adj",regex=True, inplace = True)
S.replace("ADV","adv",regex=True, inplace = True)
S.replace("VERB","verb",regex=True, inplace = True)
S.replace("_ADP","",regex=True, inplace = True)
S.replace("_AUX","",regex=True, inplace = True)
S.replace("_CONJ","",regex=True, inplace = True)
S.replace("_CCONJ","",regex=True, inplace = True)
S.replace("_DET","",regex=True, inplace = True)
S.replace("_INTJ","",regex=True, inplace = True)
S.replace("_NUM","",regex=True, inplace = True)
S.replace("_PART","",regex=True, inplace = True)
S.replace("_PRON","",regex=True, inplace = True)
S.replace("_PROPN","",regex=True, inplace = True)
S.replace("_PUNCT","",regex=True, inplace = True)
S.replace("_SCONJ","",regex=True, inplace = True)
S.replace("_SYM","",regex=True, inplace = True)
S.replace("_X","",regex=True, inplace = True)
S.replace("_SPACE","",regex=True, inplace = True)
S.replace("[_][+]\s"," ",regex=True, inplace = True)

In [12]:
S[1]

'andare_verb ad imparare_verb l inglese_noun a dubai stupefacente_noun incredibilmente_adv ridicolo_adj 😁'

## Convert tweets to txt files to submit to SentiStrenght

In [14]:
S.to_csv("/content/Benchmark_sentimentAnalysis_ITA/data/external/SentiStrenght/In_Out/file.txt",header=None,index=None,sep=' ',mode='a')

# Sentipolc Dataset
## Preparing for SentiStrenght

In [98]:
colnames=['idTwitter','subj','opos','oneg','iro','lpos','lneg','top','text'] 
sp = pd.read_csv("/content/Benchmark_sentimentAnalysis_ITA/data/processed/SentiPolc/test_set_sentipolc16_gold2000.csv",header=None,names=colnames)

#defining gold standard dataframes to get final results
gs2 = pd.read_csv("/content/Benchmark_sentimentAnalysis_ITA/data/processed/SentiPolc/test_set_sentipolc16_gold2000.csv",header=None,names=colnames)

## Dataset Preprocessing

In [16]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

#Keep original tweets
sp['tweetOrig']=sp['text']

#Adding a space before and after each stopword in such a way not to consider the case in which the stopword is contained in a word
words = set(stopwords.words('italian'))
stopwords = [' ' + x + ' ' for x in words]

emoticons = ["😀","😃","😄","😁","😆","🤣","😂","🙂","😊","😍","🥰","🤩","☺","🥳","😒","😔","😟","🙁","☹","😥","😢","😭","😱","😞","😓","😩","😫","😡","😠","🤬"]

sp.text = sp.text.replace("@[\w]*[_-]*[\w]*"," ",regex=True)   # tag removal
sp.text = sp.text.replace("https?://[\w/%-.]*"," ",regex=True) # Url removal
# Removing everything except the letters of the alphabet and the emoticons
sp.text = sp.text.replace('[^ a-zA-Zà-ú'
                            '\😀\😃\😄\😁\😆\🤣\😂\🙂\😊\😍\🥰\🤩\☺\🥳\😒\😔\😟\🙁\☹\😥\😭\😱\😞\😓\😩\😫\😡\😠\🤬]', " ",regex=True)    
for word in emoticons:
    sp.text = sp.text.replace(word, " "+word+" ",regex=True) 

sp.text = sp.text.replace('\s+', ' ',regex=True)               # Removal of excess spaces
sp.text = sp.text.replace('^ ', '', regex=True)                # Removing the space at the beginning
sp.text = sp.text.replace(' $', '', regex=True)                # Removing the space at the end
sp.text = sp.text.apply(lambda x: x.lower())                   # Making everything in lowercase
sp.text = sp.text.replace('^', ' ', regex=True) 
sp.text = sp.text.replace('$', ' ', regex=True)

for word in stopwords:
    sp.text = sp.text.replace(word, ' ',regex=True)

# Removing the spaces at the beginning and at the end of every tweet
sp.text = sp.text.apply(lambda x: x.strip())
# Removing the empty tweets
sp = sp[sp.text != '']

In [17]:
#search and replace every idiomatic expression which is in list in tweets
I=pd.Series(sp.text)
for i in idList:
    I.replace(i, i.replace(" ","_") ,regex=False, inplace = True)
I=I.to_frame()

## Processing with Spacy

In [18]:
import spacy

nlp = spacy.load("it_core_news_lg")
I['spacy'] = I['text'].apply(lambda x: nlp(x))
docs = I['spacy'].to_list()

In [19]:
df = tidy_tokens(docs)
df.head()

Unnamed: 0,doc_id,processed
0,0,minuto_NOUN
1,0,presentazione_ADJ
2,0,piano_NOUN
3,0,scuola_NOUN
4,0,governo_NOUN


In [20]:
df = df.groupby('doc_id')['processed'].apply(' '.join).reset_index()

In [21]:
list = ["😀","😃","😄","😁","😆","🤣","😂","🙂","😊","😍","🥰","🤩","☺","🥳","😒","😔","😟","🙁","☹","😥","😢","😭","😱","😞","😓","😩","😫","😡","😠","🤬"]

S=pd.Series(df.processed)

#Regular expression to replace the POS tags with the lowercase version or to delete them in order to recognize terms in the support files (QuestionWords)
for i in list:
    S.replace(i+'\S+', i,regex=True, inplace = True)

for i in idList2:
    S.replace(i+'\S+', i,regex=True, inplace = True)

S.replace("NOUN","noun",regex=True, inplace = True)
S.replace("ADJ","adj",regex=True, inplace = True)
S.replace("ADV","adv",regex=True, inplace = True)
S.replace("VERB","verb",regex=True, inplace = True)
S.replace("_ADP","",regex=True, inplace = True)
S.replace("_AUX","",regex=True, inplace = True)
S.replace("_CONJ","",regex=True, inplace = True)
S.replace("_CCONJ","",regex=True, inplace = True)
S.replace("_DET","",regex=True, inplace = True)
S.replace("_INTJ","",regex=True, inplace = True)
S.replace("_NUM","",regex=True, inplace = True)
S.replace("_PART","",regex=True, inplace = True)
S.replace("_PRON","",regex=True, inplace = True)
S.replace("_PROPN","",regex=True, inplace = True)
S.replace("_PUNCT","",regex=True, inplace = True)
S.replace("_SCONJ","",regex=True, inplace = True)
S.replace("_SYM","",regex=True, inplace = True)
S.replace("_X","",regex=True, inplace = True)
S.replace("_SPACE","",regex=True, inplace = True)
S.replace("[_][+]\s"," ",regex=True, inplace = True)

## Convert tweets to txt files to submit to SentiStrenght

In [22]:
S.to_csv("/content/Benchmark_sentimentAnalysis_ITA/data/external/SentiStrenght/In_Out/sp.txt",header=None,index=None,sep=' ',mode='a')

# SentiStrenght

In [23]:
import subprocess
import shlex
import os.path
import sys

In [24]:
SentiStrengthLocation = "/content/Benchmark_sentimentAnalysis_ITA/utility/SentiStrengthCom.jar" # path for .jar file
SentiStrengthLanguageFolder = "/content/Benchmark_sentimentAnalysis_ITA/data/external/SentiStrenght/SentiStrength_Italian_Custom/" #path for directory containing SentiStrenght support files

In [25]:
#Test code
if not os.path.isfile(SentiStrengthLocation):
    print("SentiStrength not found at: ", SentiStrengthLocation)
if not os.path.isdir(SentiStrengthLanguageFolder):
    print("SentiStrength data folder not found at: ", SentiStrengthLanguageFolder)

In [26]:
def RateSentiment(sentiString):
    #open a subprocess using shlex to get the command line string into the correct args list format
    p = subprocess.Popen(shlex.split("java -jar '" + SentiStrengthLocation + "' stdin sentidata '" + SentiStrengthLanguageFolder + "'"),stdin=subprocess.PIPE,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
    #communicate via stdin the string to be rated. Note that all spaces are replaced with +
    b = bytes(sentiString.replace(" ","+"), 'utf-8') #Can't send string in Python 3, must send bytes
    stdout_byte, stderr_text = p.communicate(b)
    stdout_text = stdout_byte.decode("utf-8")  #convert from byte
    stdout_text = stdout_text.rstrip().replace("/t"," ") #remove the tab spacing between the positive and negative ratings. e.g. 1    -5 -> 1 -5
    return stdout_text + " " + sentiString

## SentiStrenght from file

## Enti Pubblici

In [28]:
FileToClassify = "/content/Benchmark_sentimentAnalysis_ITA/data/external/SentiStrenght/In_Out/file.txt"
if not os.path.isfile(FileToClassify):
    print("File to classify not found at: ", FileToClassify)

In [None]:
print("Running SentiStrength on file " + FileToClassify + " with command:")
cmd = 'java -jar "' + SentiStrengthLocation + '" sentidata "' + SentiStrengthLanguageFolder + '" input "' + FileToClassify + '"'
print(cmd)
p = subprocess.Popen(shlex.split(cmd),stdin=subprocess.PIPE,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
classifiedSentimentFile = os.path.splitext(FileToClassify)[0] + "0_out.txt"
print("Finished! The results will be in:/n" + classifiedSentimentFile)

### Results

In [None]:
comments = pd.read_csv("/content/Benchmark_sentimentAnalysis_ITA/data/external/SentiStrenght/In_Out/file0_out.txt",sep='\t',encoding='latin-1')
comments.head()

In [9]:
#Reset index to join dataframes
comments=comments.reset_index()
gs=gs.reset_index()
#Mapping SentiStrenght scores to sentiment classes
comments.loc[comments['Positive'] > 1, 'Pos_SS'] = 'yes'
comments.loc[comments['Negative'] < -1, 'Neg_SS'] = 'yes'
comments.loc[(comments['Positive'] <= 1)&(comments['Negative'] >= -1) , 'Neut_SS'] = 'yes'

#Mapping GoldStandard scores to sentiment classes
gs.loc[gs['Class'] =='pos', 'Pos_GS'] = 'yes'
gs.loc[gs['Class'] == 'neg', 'Neg_GS'] = 'yes'
gs.loc[gs['Class'] == 'neut', 'Neut_GS'] = 'yes'
gs.loc[gs['Class'] == 'mix', 'Pos_GS'] = 'yes'
gs.loc[gs['Class'] == 'mix', 'Neg_GS'] = 'yes'

#Join dataframes
val = gs.merge(comments, how='inner', on='index')
val = val.fillna("no")
val = val[['tweet_id','year','month','day','tweet_x','tweetOrig','Pos_GS','Pos_SS', 'Neg_GS','Neg_SS', 'Neut_GS','Neut_SS', 'Irony']]

In [10]:
val.head()

Unnamed: 0,tweet_id,year,month,day,tweet_x,tweetOrig,Pos_GS,Pos_SS,Neg_GS,Neg_SS,Neut_GS,Neut_SS,Irony
0,1462002288835403777,2021,11,20,in arrivo un nuovo bonus inps scopri chi può o...,In arrivo un nuovo #bonus #inps! Scopri chi pu...,yes,yes,no,yes,no,no,no
1,1354381987507744771,2021,1,27,allora riepiloghiamo ai politici la pensione d...,"Allora, riepiloghiamo;\nAi politici la pension...",no,yes,yes,yes,no,no,yes
2,1454050817821003783,2021,10,29,caro e cari fate bene i vostri conti perché no...,Caro @INPS_it e cari @Europarl_IT fate bene i...,no,yes,yes,no,no,no,yes
3,1393675898960982016,2021,5,15,foto a napoli dopo l apertura di una voragine ...,"FOTO - A #napoli, dopo l'apertura di una #vora...",no,yes,no,no,yes,no,no
4,1417876270705164289,2021,7,21,maxi esercitazione di protezionecivile lo scen...,Maxi esercitazione di #protezionecivile. Lo sc...,no,yes,no,yes,yes,no,no


### Metrics
#### Positive

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

print("P: ",precision_score(val['Pos_GS'],val['Pos_SS'],labels=['yes','no'],pos_label='yes'))
print("R: ",recall_score(val['Pos_GS'],val['Pos_SS'],labels=['yes','no'],pos_label='yes'))
print("F: ",f1_score(val['Pos_GS'],val['Pos_SS'],labels=['yes','no'],pos_label='yes'))

#### Negative

In [None]:
print("P: ",precision_score(val['Neg_GS'],val['Neg_SS'],labels=['yes','no'],pos_label='yes'))
print("R: ",recall_score(val['Neg_GS'],val['Neg_SS'],labels=['yes','no'],pos_label='yes'))
print("F: ",f1_score(val['Neg_GS'],val['Neg_SS'],labels=['yes','no'],pos_label='yes'))

#### Neutral

In [None]:
print("P: ",precision_score(val['Neut_GS'],val['Neut_SS'],labels=['yes','no'],pos_label='yes'))
print("R: ",recall_score(val['Neut_GS'],val['Neut_SS'],labels=['yes','no'],pos_label='yes'))
print("F: ",f1_score(val['Neut_GS'],val['Neut_SS'],labels=['yes','no'],pos_label='yes'))

## SentiPolc

In [88]:
FileToClassify = "/content/Benchmark_sentimentAnalysis_ITA/data/external/SentiStrenght/In_Out/sp.txt"
if not os.path.isfile(FileToClassify):
    print("File to classify not found at: ", FileToClassify)

In [None]:
print("Running SentiStrength on file " + FileToClassify + " with command:")
cmd = 'java -jar "' + SentiStrengthLocation + '" sentidata "' + SentiStrengthLanguageFolder + '" input "' + FileToClassify + '"'
print(cmd)
p = subprocess.Popen(shlex.split(cmd),stdin=subprocess.PIPE,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
classifiedSentimentFile = os.path.splitext(FileToClassify)[0] + "0_out.txt"
print("Finished! The results will be in:/n" + classifiedSentimentFile)

### Results

In [99]:
sp_out = pd.read_csv("/content/Benchmark_sentimentAnalysis_ITA/data/external/SentiStrenght/In_Out/sp0_out.txt",sep='\t',encoding='latin-1')
sp_out.head()

Unnamed: 0,Positive,Negative,Text
0,1,-2,intanto_adv partita via nazionale_adj complica...
1,1,-5,falso_adj illusione_noun sgradevole_adj realtÃ...
2,1,-5,falso_adj illusione_noun sgradevole_adj realtÃ...
3,2,-3,mario monte berlusconi risparmio_noun italia b...
4,2,-3,mario monte berlusconi risparmio_noun italia b...


In [100]:
#Reset index to join dataframes
sp_out=sp_out.reset_index()
gs2=gs2.reset_index()
#Mapping SentiStrenght scores to sentiment classes
sp_out.loc[sp_out['Positive'] > 1, 'Pos_SS'] = 'yes'
sp_out.loc[sp_out['Negative'] < -1, 'Neg_SS'] = 'yes'
sp_out.loc[(sp_out['Positive'] <= 1)&(sp_out['Negative'] >= -1) , 'Neut_SS'] = 'yes'

#Mapping SentiPolc scores to sentiment classes
gs2.loc[gs2['opos'] ==1, 'Pos_GS'] = 'yes'
gs2.loc[gs2['oneg'] ==1, 'Neg_GS'] = 'yes'
gs2.loc[(gs2['opos'] ==0) & (gs2['oneg'] ==0), 'Neut_GS'] = 'yes'
gs2 = gs2.fillna("no")

#Join dataframes
val = gs2.merge(sp_out, how='inner', on='index')
val = val.fillna("no")
val = val[['idTwitter','text','Pos_GS','Pos_SS', 'Neg_GS','Neg_SS', 'Neut_GS','Neut_SS','iro']]

### Metrics

#### Positive

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

print(precision_score(val['Pos_GS'],val['Pos_SS'],labels=['yes','no'],pos_label='yes'))
print(recall_score(val['Pos_GS'],val['Pos_SS'],labels=['yes','no'],pos_label='yes'))
print(f1_score(val['Pos_GS'],val['Pos_SS'],labels=['yes','no'],pos_label='yes'))

#### Negative

In [None]:
print(precision_score(val['Neg_GS'],val['Neg_SS'],labels=['yes','no'],pos_label='yes'))
print(recall_score(val['Neg_GS'],val['Neg_SS'],labels=['yes','no'],pos_label='yes'))
print(f1_score(val['Neg_GS'],val['Neg_SS'],labels=['yes','no'],pos_label='yes'))

#### Neutral

In [None]:
print(precision_score(val['Neut_GS'],val['Neut_SS'],labels=['yes','no'],pos_label='yes'))
print(recall_score(val['Neut_GS'],val['Neut_SS'],labels=['yes','no'],pos_label='yes'))
print(f1_score(val['Neut_GS'],val['Neut_SS'],labels=['yes','no'],pos_label='yes'))