In [1]:
## import necessary libraries
import pandas as pd
from collections import Counter
import math
import random

## load the processed training-set tweets
df = pd.read_pickle('tweet_data/training_data/training_data_processed.pkl')
## load the processed cv-set tweets
df_sa_cv = pd.read_pickle('tweet_data/cv_data/sanders_analytics/sanders_analytics_cv_data_processed.pkl')
df_s140_cv = pd.read_pickle('tweet_data/cv_data/sentiment140/sentiment140_cv_data_processed.pkl')

In [2]:
## drop all neutral tweets
neutral_drop = (df['sentiment']==0)
droplist = list(neutral_drop)
not_droplist = [not i for i in droplist]
df = df[list(not_droplist)]
df = df.reset_index(drop=True)

## drop all neutral tweets
neutral_drop = (df_sa_cv['sentiment']==0)
droplist = list(neutral_drop)
not_droplist = [not i for i in droplist]
df_sa_cv = df_sa_cv[list(not_droplist)]
df_sa_cv = df_sa_cv.reset_index(drop=True)

## drop all neutral tweets
neutral_drop = (df_s140_cv['sentiment']==0)
droplist = list(neutral_drop)
not_droplist = [not i for i in droplist]
df_s140_cv = df_s140_cv[list(not_droplist)]
df_s140_cv = df_s140_cv.reset_index(drop=True)

In [3]:
## split off a separate cv set from the training data (we already shuffled the data)
cvratio = .1 # fraction to split off
df_cv = df.copy(deep=True)
df_cv = df_cv.iloc[:int(math.floor(cvratio*len(df)))]
df_cv = df_cv.reset_index(drop=True)
df = df.iloc[int(math.floor(cvratio*len(df))):]
df = df.reset_index(drop=True)
print 'training data set size:', len(df)
print 'cv data set size:', + len(df_cv)
print 'sanders anlaytics cv data set size:', len(df_sa_cv)
print 'sentiment 140 cv data set size:', len(df_s140_cv)

training data set size: 423867
cv data set size: 47096
sanders anlaytics cv data set size: 846
sentiment 140 cv data set size: 358


In [4]:
## throw away some fraction (to be treated as a hyperparameter) of the emoticons
## probably should treat +/- differently, for now let's treat as the same
pThreshold = 0.06 #fraction to keep
emoticoncounter = 0
droppedcounter = 0
for i in range(len(df)):
  for w in df['words'].iloc[i]:
    if w == 'posemoticontoken' or w == 'negemoticontoken':
      emoticoncounter += 1
      if random.random() > pThreshold:
        df['words'].iloc[i].remove(w)
        droppedcounter += 1
print 'dropped ' + str(100*float(droppedcounter)/emoticoncounter) + '% of emoticons'

dropped 93.9903151697% of emoticons


In [5]:
## build the bag of words
words_tot_pos = [item for sublist in df['words'][df['sentiment'] == 1] for item in sublist]
words_tot_neg = [item for sublist in df['words'][df['sentiment'] == -1] for item in sublist]
words_tot = words_tot_pos + words_tot_neg
counter_pos = Counter(words_tot_pos)
counter_neg = Counter(words_tot_neg)
counter = Counter(words_tot)

## priors
prior_pos = float(sum(df['sentiment']==1))/len(df)
prior_neg = float(sum(df['sentiment']==-1))/len(df)

In [6]:
## conditional probabilities of word given class (with Laplace smoothing parameter alpha)
def p_word_given_pos(word):
  return float(counter_pos[word] + alpha)/(len(words_tot_pos) + alpha*len(words_tot))
def p_word_given_neg(word):
  return float(counter_neg[word] + alpha)/(len(words_tot_neg) + alpha*len(words_tot))

In [7]:
def predict(word_list):
  p_pos = math.log(prior_pos) + sum([math.log(p_word_given_pos(i)) for i in word_list])
  p_neg = math.log(prior_neg) + sum([math.log(p_word_given_neg(i)) for i in word_list])
  return 2*([p_neg, p_pos].index(max([p_pos, p_neg])))-1

In [8]:
## evaluate against cv sets
alpha = 1 # Laplace smoothing parameter
df_cv['predictions'] = df_cv['words'].map(predict)
df_sa_cv['predictions'] = df_sa_cv['words'].map(predict)
df_s140_cv['predictions'] = df_s140_cv['words'].map(predict)

print 'correctly classified:', 100*float(sum(df_cv['predictions'] == df_cv['sentiment']))/len(df_cv), '%'
print 'correctly classified:', 100*float(sum(df_sa_cv['predictions'] == df_sa_cv['sentiment']))/len(df_sa_cv), '%'
print 'correctly classified:', 100*float(sum(df_s140_cv['predictions'] == df_s140_cv['sentiment']))/len(df_s140_cv), '%'

correctly classified: 99.9617801936 %
correctly classified: 73.1678486998 %
correctly classified: 74.3016759777 %


In [None]:
i=9
print df_s140_cv['words'].iloc[i]
print df_s140_cv['sentiment'].iloc[i]
print predict(df_s140_cv['words'].iloc[i])

In [None]:
df_s140_cv['predictions'] == df_s140_cv['sentiment']