In [1]:
from __future__ import division

from afinn import Afinn
import numpy as np
import pandas as pd

In [2]:
# http://www.crowdflower.com/data-for-everyone
url = 'http://cdn2.hubspot.net/hubfs/346378/DFE_CSVs/Twitter-sentiment-self-drive-DFE.csv?t=1436301804871'
df = pd.read_csv(url)

In [3]:
# Sentiment analysis with AFINN
afinn = Afinn(emoticons=True)
afinn_scores = [afinn.score(text) for text in df.text]
df['afinn'] = afinn_scores
df[['sentiment', 'afinn', 'text']].head(10)

Unnamed: 0,sentiment,afinn,text
0,5,0,Two places I'd invest all my money if I could:...
1,5,5,Awesome! Google driverless cars will help the ...
2,2,1,If Google maps can't keep up with road constru...
3,2,0,Autonomous cars seem way overhyped given the t...
4,3,0,Just saw Google self-driving car on I-34. It w...
5,3,0,Will driverless cars eventually replace taxi d...
6,not_relevant,0,Chicago metro expected to be fully autonomous ...
7,not_relevant,3,I love the infotainment system in my new car. ...
8,5,-3,Autonomous vehicles could reduce traffic fatal...
9,1,1,Driverless cars are not worth the risk. Don't...


In [4]:
pd.crosstab(df.sentiment, df.afinn)

afinn,-13.0,-9.0,-8.0,-7.0,-6.0,-5.0,-4.0,-3.0,-2.0,-1.0,...,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,12.0
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,2,4,5,3,9,15,10,8,...,6,6,1,0,2,0,0,0,0,0
2,0,1,2,5,10,15,25,49,97,65,...,62,38,19,7,6,0,2,0,1,0
3,0,0,1,1,4,14,38,83,199,232,...,324,172,86,28,12,8,4,1,2,0
4,1,0,1,3,6,8,19,38,84,72,...,184,157,92,38,23,7,7,1,1,0
5,0,0,0,1,0,1,5,11,14,12,...,55,65,59,14,15,6,5,1,2,1
not_relevant,0,0,0,2,2,1,3,3,7,17,...,22,18,12,4,4,2,0,0,0,1


In [5]:
# Ignore tweets that are marked as 'not relevant'
df_relevant = df[df.sentiment != 'not_relevant']
df_relevant = df_relevant.convert_objects(convert_numeric=True)

In [6]:
# Compute 3-class confusion matrix
confusion = pd.crosstab(np.sign(df_relevant.sentiment - 3), np.sign(df_relevant.afinn))
confusion

afinn,-1.0,0.0,1.0
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,326,243,226
0,572,2698,975
1,276,624,1003


In [7]:
accuracy_3_class = np.sum(np.diag(confusion)) / np.sum(confusion.values)
accuracy_3_class

0.5800086417974939

In [8]:
# Compute 2-class confusion matrix
confusion_2_class = confusion.iloc[[0, 2], [0, 2]]
confusion_2_class

afinn,-1.0,1.0
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,326,226
1,276,1003


In [9]:
accuracy_2_class = np.sum(np.diag(confusion_2_class)) / np.sum(confusion_2_class.values)
accuracy_2_class

0.72583287820862918

In [10]:
accuracy_2_class_baseline = confusion_2_class.sum().max() / np.sum(confusion_2_class.values)
accuracy_2_class_baseline 

0.67121791370835604