# Import translated datasets

In [4]:
import pandas as pd
# Locations of the translated datasets, named once so that the working frames and
# the gold-standard frames are guaranteed to come from the same files.
TW_CSV = "../data/processed/Translated/twitter/dataset.csv"
SPTR_CSV = "../data/processed/Translated/SentiPolc/training_set_sentipolc16.csv"

# Working frames: translated tweets and translated SentiPolc training set
tw = pd.read_csv(TW_CSV)
sptr = pd.read_csv(SPTR_CSV)

# Gold-standard frames used to compute the final results.
# Independent copies of the frames above, so each CSV is parsed only once and
# later changes to tw/sptr cannot leak into the gold standard.
gs1 = tw.copy()
gs2 = sptr.copy()

## Defining list of tweets

In [6]:
# Reduce each dataset to its English-translation column ('eng')
tw, sptr = tw['eng'], sptr['eng']

## Convert tweets to txt files to submit to SentiStrength

In [8]:
# Write the texts to plain-text files, one text per row, as input for SentiStrength.
# - mode='w': the file is rebuilt on every run. With the previous append mode ('a'),
#   re-running this cell added another copy of every text to the file, so the result
#   rows no longer lined up with the gold-standard rows.
# - Line breaks inside a text are collapsed to a single space so that every text
#   occupies exactly one line of the file.
tw.str.replace(r"[\r\n]+", " ", regex=True).to_csv(
    "../data/external/SentiStrenght/toClassify/translated_tw.txt",
    header=False,
    index=False,
    sep=' ',
    mode='w',
)
sptr.str.replace(r"[\r\n]+", " ", regex=True).to_csv(
    "../data/external/SentiStrenght/toClassify/translated_sptr.txt",
    header=False,
    index=False,
    sep=' ',
    mode='w',
)

# SentiStrength

In [9]:
import subprocess
import shlex
import os.path
import sys

In [12]:
# Path of the SentiStrength .jar file.
# The default is specific to one machine; set the SENTISTRENGTH_JAR environment
# variable to point at the jar elsewhere without editing the notebook.
SentiStrengthLocation = os.environ.get(
    "SENTISTRENGTH_JAR",
    "C:/Users/polic/Desktop/Data Science/Tesi/Risorse/SentiStrength/SentiStrengthCom.jar",
)
# Path of the directory containing the SentiStrength support files.
# Can be overridden with the SENTISTRENGTH_DATA environment variable.
SentiStrengthLanguageFolder = os.environ.get(
    "SENTISTRENGTH_DATA",
    "../data/external/SentiStrenght/SentStrength_Data",
)

In [13]:
# Sanity check: report (without raising) any SentiStrength path that is missing.
for exists, message, target in (
    (os.path.isfile, "SentiStrength not found at: ", SentiStrengthLocation),
    (os.path.isdir, "SentiStrength data folder not found at: ", SentiStrengthLanguageFolder),
):
    if exists(target):
        continue
    print(message, target)

In [14]:
def RateSentiment(sentiString):
    """Rate one text with SentiStrength through its stdin interface.

    A new ``java`` process is started on every call. The text is sent on stdin
    with every space replaced by ``+``, and the reply is read from stdout.

    Parameters
    ----------
    sentiString : str
        The text to rate.

    Returns
    -------
    str
        SentiStrength's reply with trailing whitespace removed and tabs turned
        into single spaces, followed by a space and the original text.
    """
    # Pass the arguments as a list instead of building a quoted string for
    # shlex.split: paths containing spaces or quote characters reach java as-is.
    args = [
        "java", "-jar", SentiStrengthLocation,
        "stdin", "sentidata", SentiStrengthLanguageFolder,
    ]
    p = subprocess.Popen(
        args,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Python 3 pipes carry bytes, so the text is encoded before it is sent
    payload = sentiString.replace(" ", "+").encode("utf-8")
    stdout_byte, stderr_text = p.communicate(payload)
    stdout_text = stdout_byte.decode("utf-8")
    # Replace the tab between the positive and negative ratings with a space,
    # e.g. "1<TAB>-5" -> "1 -5". (The previous code looked for the literal
    # two characters "/t", so the tab was never replaced.)
    stdout_text = stdout_text.rstrip().replace("\t", " ")
    return stdout_text + " " + sentiString

## SentiStrength from file

# Translated tweets

In [15]:
# Text file with the translated tweets that SentiStrength will classify
FileToClassify = "../data/external/SentiStrenght/toClassify/translated_tw.txt"
# Report a missing input file here, before SentiStrength is launched on it
input_found = os.path.isfile(FileToClassify)
if not input_found:
    print("File to classify not found at: ", FileToClassify)

In [None]:
print("Running SentiStrength on file " + FileToClassify + " with command:")
# Human-readable form of the command, printed for reference only
cmd = 'java -jar "' + SentiStrengthLocation + '" sentidata "' + SentiStrengthLanguageFolder + '" input "' + FileToClassify + '"'
print(cmd)
# Launch from an argument list rather than re-splitting `cmd`, so paths containing
# spaces or quote characters are passed to java unchanged.
args = [
    "java", "-jar", SentiStrengthLocation,
    "sentidata", SentiStrengthLanguageFolder,
    "input", FileToClassify,
]
p = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# Wait for SentiStrength to exit and drain its pipes. Without this the cell printed
# "Finished!" as soon as the process had been started, and a process writing to a
# pipe that nobody reads can stall once the pipe buffer is full.
stdout_bytes, stderr_bytes = p.communicate()
# NOTE(review): the results cell reads from ".../output/translated_tw0_out.txt",
# not from this path - confirm whether the file is moved by hand.
classifiedSentimentFile = os.path.splitext(FileToClassify)[0] + "0_out.txt"
if p.returncode == 0:
    print("Finished! The results will be in:\n" + classifiedSentimentFile)
else:
    # Surface the failure instead of announcing results that were never produced
    print("SentiStrength exited with return code", p.returncode)
    print(stderr_bytes.decode("utf-8", errors="replace"))

## Results

In [20]:
# Load the tab-separated SentiStrength results for the translated tweets
# and preview the first rows
tw_out = pd.read_csv(
    "../data/external/SentiStrenght/output/translated_tw0_out.txt",
    encoding='latin-1',
    sep='\t',
)
tw_out.head()

Unnamed: 0,Positive,Negative,Text
0,2,-1,A new #bonus #inps is coming! Find out who can...
1,1,-1,"So, let's recap;\n3\t-1\tThe survivor's pensio..."
2,2,-1,Dear @INPS_it and dear @Europarl_IT do your ac...
3,1,-1,"PHOTO - In #napoli, after the opening of a #vo..."
4,1,-1,Maxi #protezionecivile tutorial. The scenario ...


In [21]:
# Expose the row position as an 'index' column in both frames so they can be joined
# row by row. Guarded so that re-running the cell is harmless: an unguarded second
# reset_index() adds a 'level_0' column and a third one raises ValueError.
if 'index' not in tw_out.columns:
    tw_out = tw_out.reset_index()
if 'index' not in gs1.columns:
    gs1 = gs1.reset_index()

# The join below pairs rows purely by position, so a differing row count means the
# scores may be attached to the wrong tweets. Warn instead of failing silently.
if len(tw_out) != len(gs1):
    print("WARNING: SentiStrength output has", len(tw_out),
          "rows but the gold standard has", len(gs1),
          "- rows may be misaligned in the join")

# Mapping SentiStrength scores to sentiment classes:
# positive when Positive > 1, negative when Negative < -1, neutral when neither holds
tw_out.loc[tw_out['Positive'] > 1, 'Pos_SS'] = 'yes'
tw_out.loc[tw_out['Negative'] < -1, 'Neg_SS'] = 'yes'
tw_out.loc[(tw_out['Positive'] <= 1)&(tw_out['Negative'] >= -1) , 'Neut_SS'] = 'yes'

# Mapping gold-standard classes to sentiment classes;
# 'mix' counts as both positive and negative
gs1.loc[gs1['Class'] =='pos', 'Pos_GS'] = 'yes'
gs1.loc[gs1['Class'] == 'neg', 'Neg_GS'] = 'yes'
gs1.loc[gs1['Class'] == 'neut', 'Neut_GS'] = 'yes'
gs1.loc[gs1['Class'] == 'mix', 'Pos_GS'] = 'yes'
gs1.loc[gs1['Class'] == 'mix', 'Neg_GS'] = 'yes'

# Join the two frames on the positional 'index' column
val = gs1.merge(tw_out, how='inner', on='index')
# Flags that were never set to 'yes' are still NaN; turn them into 'no'.
# Note that this fills every NaN in the joined frame, not only the flag columns.
val = val.fillna("no")
val = val[['tweet_id','year','month','day','tweet_x','tweetOrig','Pos_GS','Pos_SS', 'Neg_GS','Neg_SS', 'Neut_GS','Neut_SS', 'Irony']]

## Metrics
### Positive

In [23]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Precision, recall and F1 of the 'yes' label for the positive class:
# gold standard (Pos_GS) against SentiStrength (Pos_SS)
for tag, scorer in (("P: ", precision_score), ("R: ", recall_score), ("F: ", f1_score)):
    print(tag, scorer(val['Pos_GS'], val['Pos_SS'], labels=['yes', 'no'], pos_label='yes'))

P:  0.1816860465116279
R:  0.6510416666666666
F:  0.28409090909090906


### Negative

In [24]:
# Precision, recall and F1 of the 'yes' label for the negative class:
# gold standard (Neg_GS) against SentiStrength (Neg_SS)
for tag, scorer in (("P: ", precision_score), ("R: ", recall_score), ("F: ", f1_score)):
    print(tag, scorer(val['Neg_GS'], val['Neg_SS'], labels=['yes', 'no'], pos_label='yes'))

P:  0.630048465266559
R:  0.41980624327233584
F:  0.5038759689922481


### Neutral

In [25]:
# Precision, recall and F1 of the 'yes' label for the neutral class:
# gold standard (Neut_GS) against SentiStrength (Neut_SS)
for tag, scorer in (("P: ", precision_score), ("R: ", recall_score), ("F: ", f1_score)):
    print(tag, scorer(val['Neut_GS'], val['Neut_SS'], labels=['yes', 'no'], pos_label='yes'))

P:  0.5417185554171855
R:  0.5370370370370371
F:  0.5393676379417235


# Translated SentiPolc

In [26]:
# Text file with the translated SentiPolc texts that SentiStrength will classify
FileToClassify = "../data/external/SentiStrenght/toClassify/translated_sptr.txt"
# Report a missing input file here, before SentiStrength is launched on it
input_found = os.path.isfile(FileToClassify)
if not input_found:
    print("File to classify not found at: ", FileToClassify)

In [None]:
print("Running SentiStrength on file " + FileToClassify + " with command:")
# Human-readable form of the command, printed for reference only
cmd = 'java -jar "' + SentiStrengthLocation + '" sentidata "' + SentiStrengthLanguageFolder + '" input "' + FileToClassify + '"'
print(cmd)
# Launch from an argument list rather than re-splitting `cmd`, so paths containing
# spaces or quote characters are passed to java unchanged.
args = [
    "java", "-jar", SentiStrengthLocation,
    "sentidata", SentiStrengthLanguageFolder,
    "input", FileToClassify,
]
p = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# Wait for SentiStrength to exit and drain its pipes. Without this the cell printed
# "Finished!" as soon as the process had been started, and a process writing to a
# pipe that nobody reads can stall once the pipe buffer is full.
stdout_bytes, stderr_bytes = p.communicate()
# NOTE(review): the results cell reads from ".../output/translated_sptr0_out.txt",
# not from this path - confirm whether the file is moved by hand.
classifiedSentimentFile = os.path.splitext(FileToClassify)[0] + "0_out.txt"
if p.returncode == 0:
    print("Finished! The results will be in:\n" + classifiedSentimentFile)
else:
    # Surface the failure instead of announcing results that were never produced
    print("SentiStrength exited with return code", p.returncode)
    print(stderr_bytes.decode("utf-8", errors="replace"))

## Results

In [31]:
# Load the tab-separated SentiStrength results for the translated SentiPolc set
# and preview the first rows
sptr_out = pd.read_csv(
    "../data/external/SentiStrenght/output/translated_sptr0_out.txt",
    encoding='latin-1',
    sep='\t',
)
sptr_out.head()

Unnamed: 0,Positive,Negative,Text
0,1,-2,"Meanwhile, the game for Via Nazionale becomes ..."
1,1,-3,"False illusions, unpleasant realities Mario Mo..."
2,1,-3,"False illusions, unpleasant realities #editori..."
3,1,-2,Mario Monti: Berlusconi spare Italy the blame ...
4,1,-2,Mario Monti: Berlusconi spare Italy the blame ...


In [32]:
# Expose the row position as an 'index' column in both frames so they can be joined
# row by row. Guarded so that re-running the cell is harmless: an unguarded second
# reset_index() adds a 'level_0' column and a third one raises ValueError.
if 'index' not in sptr_out.columns:
    sptr_out = sptr_out.reset_index()
if 'index' not in gs2.columns:
    gs2 = gs2.reset_index()

# The join below pairs rows purely by position, so a differing row count means the
# scores may be attached to the wrong texts. Warn instead of failing silently.
if len(sptr_out) != len(gs2):
    print("WARNING: SentiStrength output has", len(sptr_out),
          "rows but the gold standard has", len(gs2),
          "- rows may be misaligned in the join")

# Mapping SentiStrength scores to sentiment classes:
# positive when Positive > 1, negative when Negative < -1, neutral when neither holds
sptr_out.loc[sptr_out['Positive'] > 1, 'Pos_SS'] = 'yes'
sptr_out.loc[sptr_out['Negative'] < -1, 'Neg_SS'] = 'yes'
sptr_out.loc[(sptr_out['Positive'] <= 1)&(sptr_out['Negative'] >= -1) , 'Neut_SS'] = 'yes'

# Mapping SentiPolc annotations (opos / oneg flags) to sentiment classes;
# neutral means both flags are 0
gs2.loc[gs2['opos'] ==1, 'Pos_GS'] = 'yes'
gs2.loc[gs2['oneg'] ==1, 'Neg_GS'] = 'yes'
gs2.loc[(gs2['opos'] ==0) & (gs2['oneg'] ==0), 'Neut_GS'] = 'yes'
# Flags never set to 'yes' are still NaN; this fills every NaN in gs2 with 'no'
gs2 = gs2.fillna("no")

# Join the two frames on the positional 'index' column
val = gs2.merge(sptr_out, how='inner', on='index')
# Same fill for the SentiStrength flags (applies to every NaN in the joined frame)
val = val.fillna("no")
val = val[['idtwitter','text','eng','Pos_GS','Pos_SS', 'Neg_GS','Neg_SS', 'Neut_GS','Neut_SS']]

## Metrics
### Positive

In [34]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Precision, recall and F1 (printed in that order) of the 'yes' label for the
# positive class: gold standard (Pos_GS) against SentiStrength (Pos_SS)
for scorer in (precision_score, recall_score, f1_score):
    print(scorer(val['Pos_GS'], val['Pos_SS'], labels=['yes', 'no'], pos_label='yes'))


0.4459755715627936
0.6942954656265237
0.5430968726163234


### Negative

In [35]:
# Precision, recall and F1 (printed in that order) of the 'yes' label for the
# negative class: gold standard (Neg_GS) against SentiStrength (Neg_SS)
for scorer in (precision_score, recall_score, f1_score):
    print(scorer(val['Neg_GS'], val['Neg_SS'], labels=['yes', 'no'], pos_label='yes'))

0.6290617013508579
0.577606436473349
0.6022369800768962


### Neutral

In [36]:
# Precision, recall and F1 (printed in that order) of the 'yes' label for the
# neutral class: gold standard (Neut_GS) against SentiStrength (Neut_SS)
for scorer in (precision_score, recall_score, f1_score):
    print(scorer(val['Neut_GS'], val['Neut_SS'], labels=['yes', 'no'], pos_label='yes'))

0.5898807075277663
0.5092329545454546
0.5465980560320183
