In [1]:
from __future__ import division, print_function

# Import data science libraries
from everything import *  # https://github.com/fnielsen/everything

In [2]:
# The afinn sentiment analysis package
from afinn import Afinn

In [3]:
# Load the sentence data set from CSV; the first column is used as the row index
df = read_csv(
    'dan_mixed_2014_10K-sentences.csv',
    index_col=0,
    encoding='utf-8',
)

In [4]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0_level_0,valence,text
number,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,09:05 DR2 Morgen - med Camilla Thorning og Mor...
2,2.0,09-10 sæson Spa Francorchamps S2000 Vinter Cup...
3,0.0,½ time og pensl dem derefter med et sammenpisk...
4,0.0,10-06-2010 Forslag til sportsudvalg. 06-06-201...
5,0.0,"100% økologisk rå naturlig bomuld, som efterfø..."


In [5]:
# Automated sentiment scores with afinn: one score per sentence, in row order.
# Iterating over the 'text' column directly avoids iterrows(), which builds a
# full Series object for every row just to read a single field.
# NOTE(review): Afinn() is constructed with the package's default language,
# while the data file name suggests Danish text -- confirm whether
# Afinn(language='da') was intended.
afinn = Afinn()
scores = [afinn.score(text) for text in df.text]

In [6]:
# Append the automated scores to the dataset as a new 'afinn' column
# (this modifies df in place; 'scores' holds one value per row of df)
df['afinn'] = scores

In [7]:
# Preview the first five rows, now showing the manual score ('valence')
# alongside the automated score ('afinn')
df.head(n=5)

Unnamed: 0_level_0,valence,text,afinn
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.0,09:05 DR2 Morgen - med Camilla Thorning og Mor...,0.0
2,2.0,09-10 sæson Spa Francorchamps S2000 Vinter Cup...,1.0
3,0.0,½ time og pensl dem derefter med et sammenpisk...,0.0
4,0.0,10-06-2010 Forslag til sportsudvalg. 06-06-201...,0.0
5,0.0,"100% økologisk rå naturlig bomuld, som efterfø...",0.0


In [8]:
# Boolean mask (one entry per row) that is True where a manual annotation
# exists, i.e. where 'valence' is not null
indices = df['valence'].notnull()

In [9]:
# Count the True entries of the mask and report how many sentences were
# manually scored
n_annotated = sum(indices)
print("Number of annotated sentences: {}".format(n_annotated))

Number of annotated sentences: 401


In [10]:
# Cross-tabulation of manual ('valence', rows) against automated ('afinn',
# columns) scores, restricted to the manually annotated sentences
annotated = df[indices]
pd.crosstab(annotated.valence, annotated.afinn)

afinn,-6.0,-3.0,-2.0,0.0,1.0,2.0,3.0
valence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
-3.0,0,0,1,1,0,0,0
-2.0,0,0,3,36,1,0,0
-1.0,0,0,0,15,0,0,0
0.0,1,2,3,223,5,6,2
1.0,0,0,1,25,1,1,0
2.0,0,0,2,47,5,2,1
3.0,0,1,0,9,0,1,4
4.0,0,0,0,1,0,0,0
9.0,0,0,0,1,0,0,0


In [11]:
# Collapse both scores to three classes via their sign (-1, 0, +1) and
# cross-tabulate manual (rows) against automated (columns) classes.
# NOTE(review): the full cross-tabulation shows manual valence values 4.0 and
# 9.0; sign() maps both to +1 -- confirm 9.0 is a genuine score and not a
# sentinel code.
valence_sign = sign(df[indices].valence)
afinn_sign = sign(df[indices].afinn)
C = pd.crosstab(valence_sign, afinn_sign)
C

afinn,-1.0,0.0,1.0
valence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1.0,4,52,1
0.0,6,223,13
1.0,4,83,15


In [12]:
# Three-class baseline accuracy: the share of annotated sentences whose manual
# class is 0.0 (row label 0.0 of C), i.e. the accuracy obtained by always
# predicting that class.
# Label-based .loc replaces .ix, which was removed in pandas 1.0. The
# DataFrame/ndarray .sum() methods replace sum(sum(C)), which only works when
# `sum` is NumPy's version (the builtin would add up the column labels).
C.loc[0.0, :].sum() / C.values.sum()

0.60349127182044893

In [13]:
# Three-class accuracy: sentences on the diagonal of C (manual and automated
# class agree) divided by the total number of tabulated sentences.
# Array .sum() methods replace sum(sum(C)), which only works when `sum` is
# NumPy's version (the builtin would add up the column labels).
# Assumes C is square with identically ordered row and column labels, as in
# the 3x3 table displayed for C; a class missing on either side would
# misalign the diagonal.
diag(C.values).sum() / C.values.sum()

0.60349127182044893