# From Sentix to SentiStrength

## Sentix lexicon import and processing

In [1]:
import pandas as pd

# Read the tab-separated Sentix lexicon, supplying the column names explicitly
sentix = pd.read_table(
    'lexicon/sentix',
    names=[
        "Lemma",
        "POS",
        "Wordnet SynsetID",
        "Pos Score",
        "Neg Score",
        "Polarity",
        "Intensity",
    ],
)
# Preview the first rows
sentix.head(n=5)

Unnamed: 0,Lemma,POS,Wordnet SynsetID,Pos Score,Neg Score,Polarity,Intensity
0,abile,a,1740,0.125,0.0,1.0,0.125
1,intelligente,a,1740,0.125,0.0,1.0,0.125
2,valente,a,1740,0.125,0.0,1.0,0.125
3,capace,a,1740,0.125,0.0,1.0,0.125
4,incapace,a,2098,0.0,0.75,-1.0,0.75


In [2]:
# Discard every row that has at least one missing value
sentix = sentix.dropna(axis=0, how='any')
# (rows, columns) left in the lexicon after the clean-up
sentix.shape

(74606, 7)

In [3]:
# Signed polarity score: the Polarity column multiplied element-wise by Intensity
sentix['New_Pol'] = sentix['Polarity'].mul(sentix['Intensity'])

In [4]:
# Preview the lexicon with the freshly added New_Pol column
sentix.head(n=5)

Unnamed: 0,Lemma,POS,Wordnet SynsetID,Pos Score,Neg Score,Polarity,Intensity,New_Pol
0,abile,a,1740,0.125,0.0,1.0,0.125,0.125
1,intelligente,a,1740,0.125,0.0,1.0,0.125,0.125
2,valente,a,1740,0.125,0.0,1.0,0.125,0.125
3,capace,a,1740,0.125,0.0,1.0,0.125,0.125
4,incapace,a,2098,0.0,0.75,-1.0,0.75,-0.75


## Polysemy management

In [5]:
# A single lemma can appear on several rows; 'buono' is one example
sentix.loc[sentix['Lemma'].eq('buono')]

Unnamed: 0,Lemma,POS,Wordnet SynsetID,Pos Score,Neg Score,Polarity,Intensity,New_Pol
759,buono,a,631391,0.625,0.0,1.0,0.625,0.625
770,buono,a,633410,0.625,0.125,0.748668,0.637377,0.477184
1476,buono,a,1123148,0.75,0.0,1.0,0.75,0.75
1486,buono,a,1129977,1.0,0.0,1.0,1.0,1.0
1763,buono,a,1372049,0.625,0.0,1.0,0.625,0.625
2289,buono,a,1800349,0.625,0.125,0.748668,0.637377,0.477184
2526,buono,a,1983162,1.0,0.0,1.0,1.0,1.0
22931,buono,n,4849241,0.875,0.0,1.0,0.875,0.875
25724,buono,n,5142180,0.625,0.0,1.0,0.625,0.625
32577,buono,n,6518068,0.125,0.0,1.0,0.125,0.125


In [6]:
# Distinct part-of-speech codes that occur in the lexicon
uniqueValues = sentix.loc[:, 'POS'].unique()
uniqueValues

array(['a', 'n', 'r', 'v'], dtype=object)

In [7]:
# Translate the lexicon's one-letter POS codes into the labels used for the
# spaCy-based processing in further computation (whole-value matches only)
sentix['POS'] = sentix['POS'].replace(
    {'a': 'adj', 'n': 'noun', 'r': 'adv', 'v': 'verb'}
)

In [8]:
# New_Lemma is the key "<Lemma>_<POS>", e.g. abile_adj
sentix['New_Lemma'] = sentix['Lemma'].add('_').add(sentix['POS'])

In [9]:
# Preview the lexicon with the relabelled POS and the New_Lemma key
sentix.head(n=5)

Unnamed: 0,Lemma,POS,Wordnet SynsetID,Pos Score,Neg Score,Polarity,Intensity,New_Pol,New_Lemma
0,abile,adj,1740,0.125,0.0,1.0,0.125,0.125,abile_adj
1,intelligente,adj,1740,0.125,0.0,1.0,0.125,0.125,intelligente_adj
2,valente,adj,1740,0.125,0.0,1.0,0.125,0.125,valente_adj
3,capace,adj,1740,0.125,0.0,1.0,0.125,0.125,capace_adj
4,incapace,adj,2098,0.0,0.75,-1.0,0.75,-0.75,incapace_adj


In [10]:
# Keep only the first row of every Lemma_POS key; per the author's rationale the
# first entry is the one with the highest absolute frequency in WordNet
is_repeated_key = sentix.duplicated(subset=['New_Lemma'], keep='first')
sentix = sentix.loc[~is_repeated_key]

# Mapping Sentix New Polarity to SentiStrength Polarity score

In [12]:
from sklearn.preprocessing import MinMaxScaler
# Turn off pandas' chained-assignment check (SettingWithCopyWarning).
# The option is global: it stays in effect for every cell executed after this one.
pd.options.mode.chained_assignment = None  

def _scale_to_strength(magnitudes, lower=2, upper=5):
    """Min-max scale non-negative magnitudes onto [lower, upper] and round them.

    Applies the affine map ``x * scale + offset`` with
    ``scale = (upper - lower) / (max - min)`` and ``offset = lower - min * scale``,
    i.e. the same formula a min-max scaler with ``feature_range=(lower, upper)`` uses.

    Parameters
    ----------
    magnitudes : pandas.Series
        Numeric values to scale.
    lower, upper : number
        Bounds of the target range.

    Returns
    -------
    pandas.Series
        Rounded scaled values on the index of ``magnitudes``. An empty input
        yields an empty float Series; if all values are equal they all map to
        ``lower``.
    """
    if magnitudes.empty:
        # Nothing to fit: min()/max() would be NaN, so return an empty float
        # Series the caller can still assign as a column.
        return magnitudes.astype(float)
    smallest = magnitudes.min()
    spread = magnitudes.max() - smallest
    if spread == 0:
        # Constant input: avoid dividing by zero; every value lands on `lower`.
        spread = 1.0
    scale = (upper - lower) / spread
    offset = lower - smallest * scale
    return (magnitudes * scale + offset).round()


def sentix_to_sentistrenght(df):
    """Map the signed Sentix polarity (``New_Pol``) to an integer-valued ``Polarity SS`` score.

    Rows are split by the sign of ``New_Pol``:

    * ``New_Pol > 0``: min-max scaled to [2, 5] and rounded;
    * ``New_Pol < 0``: the absolute value is min-max scaled to [2, 5], rounded,
      and negated (so the result lies in [-5, -2]);
    * ``New_Pol == 0``: ``Polarity SS`` is the (zero) ``New_Pol`` value itself.

    Each sign group is scaled against its own minimum and maximum. A group with
    no rows is skipped, and a group whose values are all equal maps to 2 (or -2).
    Rows whose ``New_Pol`` is NaN match none of the groups and are left out.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Lemma', 'New_Lemma', 'New_Pol', 'Pos Score',
        'Neg Score', 'Polarity' and 'Intensity'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns 'Lemma', 'New_Lemma', 'New_Pol', 'Polarity SS', 'Pos Score',
        'Neg Score', 'Polarity', 'Intensity'; positive rows first, then negative,
        then neutral, each keeping its original index labels.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    # Keep only the columns needed for the conversion; work on a private copy.
    lexicon = df[['Lemma', 'New_Lemma', 'New_Pol', 'Pos Score', 'Neg Score', 'Polarity', 'Intensity']].copy()
    polarity = lexicon['New_Pol']

    # Explicit copies: adding a column below must never touch `lexicon` or `df`.
    positive = lexicon[polarity > 0].copy()
    positive['Polarity SS'] = _scale_to_strength(positive['New_Pol'])

    # Negative scores are scaled on their magnitude, then the sign is restored.
    negative = lexicon[polarity < 0].copy()
    negative['Polarity SS'] = -_scale_to_strength(negative['New_Pol'].abs())

    neutral = lexicon[polarity == 0].copy()
    neutral['Polarity SS'] = neutral['New_Pol']

    # Concatenate only the groups that actually contain rows.
    populated = [group for group in (positive, negative, neutral) if not group.empty]
    if populated:
        combined = pd.concat(populated, axis=0)
    else:
        # No rows at all: any (empty) group already carries the right columns.
        combined = positive

    return combined[['Lemma', 'New_Lemma', 'New_Pol', 'Polarity SS', 'Pos Score', 'Neg Score', 'Polarity', 'Intensity']]

In [13]:
# Convert the de-duplicated lexicon to the SentiStrength-style scores
sentix_SS = sentix_to_sentistrenght(df=sentix)

In [19]:
# Spot-check the conversion on the 'buono' entries
sentix_SS.loc[sentix_SS['Lemma'].eq('buono')]

Unnamed: 0,Lemma,New_Lemma,New_Pol,Polarity SS,Pos Score,Neg Score,Polarity,Intensity
759,buono,buono_adj,0.625,4.0,0.625,0.0,1.0,0.625
22931,buono,buono_noun,0.875,5.0,0.875,0.0,1.0,0.875


In [15]:
# Persist the converted lexicon; with to_csv's defaults the DataFrame index is
# written as the first, unnamed column
sentix_SS.to_csv(path_or_buf="sentix_ss.csv")