In [1]:
from __future__ import division, print_function

# Import data science libraries
from everything import *  # https://github.com/fnielsen/everything

In [2]:
# The afinn sentiment analysis package
from afinn import Afinn

In [3]:
# Read annotated text data 
filenames = ['dan_mixed_2014_10K-sentences.csv', 'dan_newscrawl_2011_10K-sentences.csv']
df = read_csv(filenames[1], encoding='utf-8', index_col=0)

In [4]:
# Show the first few sentences
df.head()

Unnamed: 0_level_0,valence,text
number,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,Fold ud Et af kerneelementerne i EU-samarbejde...
2,-2.0,"Det viser sig nu, at det kan have forurenet gr..."
3,2.0,"Uddannelseschef for Frisør og service, Mie Pou..."
4,-2.0,"Men vi begynder ikke ved ingenting, vi begynde..."
5,-2.0,"Mange af dem føler, at deres skamlæber er for ..."


In [5]:
# Automated sentiment scores with afinn
afinn = Afinn()
scores = []
for idx, row in df.iterrows():
    scores.append(afinn.score(row.text))

In [6]:
# Append automated scores to dataset
df['afinn'] = scores

In [7]:
# Show the first few sentences with both manual and automated afinn sentiment score
df.head()

Unnamed: 0_level_0,valence,text,afinn
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.0,Fold ud Et af kerneelementerne i EU-samarbejde...,0.0
2,-2.0,"Det viser sig nu, at det kan have forurenet gr...",0.0
3,2.0,"Uddannelseschef for Frisør og service, Mie Pou...",0.0
4,-2.0,"Men vi begynder ikke ved ingenting, vi begynde...",0.0
5,-2.0,"Mange af dem føler, at deres skamlæber er for ...",0.0


In [8]:
# Find indices for sentence where there is a manual annocation
indices = df.valence.notnull()

In [9]:
# Report the number of manually scored sentences
print("Number of annotated sentences: {}".format(sum(indices)))

Number of annotated sentences: 98


In [10]:
# Cross-tabulation of manual and annotated scores
pd.crosstab(df[indices].valence, df[indices].afinn)

afinn,-3.0,-2.0,-1.0,0.0,2.0,3.0
valence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
-3.0,0,0,0,1,0,0
-2.0,1,2,1,28,1,0
-1.0,0,0,0,3,0,0
0.0,1,0,0,32,0,1
1.0,0,0,0,6,0,0
2.0,1,0,0,20,0,0


In [11]:
# Cross-tabulation of three-class'ed manual and automated scores
C = pd.crosstab(sign(df[indices].valence), sign(df[indices].afinn))
C

afinn,-1.0,0.0,1.0
valence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1.0,4,32,1
0.0,1,32,1
1.0,1,26,0


In [12]:
# Three-class baseline accuracy
sum(C.ix[0.0, :]) / sum(sum(C))

0.34693877551020408

In [13]:
# Three-class accuracy
sum(diag(C)) / sum(sum(C))

0.36734693877551022