In [56]:
%load_ext autoreload
%autoreload 2

import math
import gensim #added this instead of the above 
import pickle as pk
import sklearn.metrics as met
import scipy.stats as stats
import sklearn
import numpy as np

from sklearn.model_selection import cross_validate


from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

import twitter_sentiment_dataset as tsd
import phrase2vec as p2v
from twitter_sentiment_dataset import TweetTrainingExample
from model import ModelParams

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Setup
Load the three vector representations from files. In general, any variable with the word 'none' in it refers to Google News word2vec w/o any emoji vectors, 'ours' to Google News word2vec w/ vectors we trained, and 'theirs' to Google News word2vec with the vectors trained by Barbieri et. al.

In [57]:
w2v_path='./data/word2vec/GoogleNews-vectors-negative300.bin'

in_dim = 300   # Length of word2vec vectors
out_dim = 300  # Desired dimension of output vectors
pos_ex = 4
neg_ratio = 1
max_epochs = 40
dropout = 0.0

params = ModelParams(in_dim=in_dim, out_dim=out_dim, pos_ex=pos_ex, max_epochs=max_epochs,
                    neg_ratio=neg_ratio, learning_rate=0.001, dropout=dropout, class_threshold=0.5)



#e2v_ours_path = params.model_folder('unicode') + '/emoji2vec.bin'
e2v_ours_path = './data/word2vec/emojional_latest.bin'
e2v_theirs_path = './data/word2vec/emojional2.bin'

In [58]:
#out of date
#w2v = gs.Word2Vec.load_word2vec_format(w2v_path, binary=True)
#e2v_ours = gs.Word2Vec.load_word2vec_format(e2v_ours_path, binary=True)
#e2v_theirs = gs.Word2Vec.load_word2vec_format(e2v_theirs_path, binary=True)

In [59]:
w2v = gensim.models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)
e2v_ours = gensim.models.KeyedVectors.load_word2vec_format(e2v_ours_path, binary=True)
e2v_theirs = gensim.models.KeyedVectors.load_word2vec_format(e2v_theirs_path, binary=True)



In [60]:
p2v_no_emoji = p2v.Phrase2Vec(out_dim, w2v, e2v=None)
p2v_our_emoji = p2v.Phrase2Vec(out_dim, w2v, e2v=e2v_ours)
p2v_their_emoji = p2v.Phrase2Vec(out_dim, w2v, e2v=e2v_theirs)

Using stats scraped from emojitracker.com at a certain point in time, we generate two sets of emoji: the top 173 most frequently used emoji, whose usage constitutes 90% of emoji usage on Twitter, and the bottom 612 least frequently used emoji, whose usage constitutes 10% of emoji usage on Twitter.

Subsequently, 'common' will refer to the former group, while 'rare' will refer to the latter.

In [61]:
p = open('./data/tweets/frequencies_w_emoji.txt', 'r')
ems = p.readlines()
ems = [l.split('\t')[0] for l in ems]
p.close()
top90 = set(ems[:173])
bottom10 = set(ems[173:])
p.close()

In [62]:
def emoji_dataset_stats(tweets):
    total_tweets = len(tweets)
    total_emoji = tsd.num_tweets_with_emoji(tweets, e2v_ours, e2v_theirs, ems)
    top_90_total = tsd.num_tweets_with_emoji(tweets, set(), set(), top90)
    bottom_10_total = tsd.num_tweets_with_emoji(tweets, set(), set(), bottom10)
    return total_tweets, total_emoji, top_90_total, bottom_10_total

Statistics for the entire Twitter corpus. Counts refer to # of tweets containing emoji of a type.

In [63]:
train_tweets, test_tweets = tsd.load_training_test_sets()
print('All Tweets in corpus: %s, total emoji: %s, common emoji: %s, rare emoji: %s' % emoji_dataset_stats(tsd.get_all_examples()))
print('Training set: total tweets: %s, total emoji: %s, common emoji: %s, rare emoji: %s' % emoji_dataset_stats(train_tweets))
print('Test set: total tweets: %s, total emoji: %s, common emoji: %s, rare emoji: %s' % emoji_dataset_stats(test_tweets))

All Tweets in corpus: 64599, total emoji: 22040, common emoji: 11137, rare emoji: 1576
Training set: total tweets: 51679, total emoji: 17671, common emoji: 8950, rare emoji: 1268
Test set: total tweets: 12920, total emoji: 4369, common emoji: 2187, rare emoji: 308


In [64]:
def emoji_dataset_label_stats(tweets):
    res = dict()
    res['Positive'] = 0
    res['Negative'] = 0
    res['Neutral'] = 0
    for tweet in tweets:
        res[tweet.label] += 1/len(tweets)
    print(res)

In [65]:
emoji_dataset_label_stats(train_tweets)
emoji_dataset_label_stats(test_tweets)

{'Positive': 0.2882215213142463, 'Negative': 0.25207531105472025, 'Neutral': 0.45970316763111974}
{'Positive': 0.2876160990712081, 'Negative': 0.25069659442723424, 'Neutral': 0.4616873065016006}


## Prepare Training and Testing Vectors
Given the raw training and test tweets, calculate the vector representations for each tweet for each model.

In [66]:
train_none, train_y = tsd.prepare_tweet_vector_averages(train_tweets, p2v_no_emoji)
train_ours, _ = tsd.prepare_tweet_vector_averages(train_tweets, p2v_our_emoji)
train_theirs, _ = tsd.prepare_tweet_vector_averages(train_tweets, p2v_their_emoji)

In [67]:
test_none, test_y = tsd.prepare_tweet_vector_averages(test_tweets, p2v_no_emoji)
test_ours, _ = tsd.prepare_tweet_vector_averages(test_tweets, p2v_our_emoji)
test_theirs, _ = tsd.prepare_tweet_vector_averages(test_tweets, p2v_their_emoji)

# Classification

In [68]:
classifiers = {
   # 'SGD (n_iter=50)' : SGDClassifier(n_iter=50)
    'SGD (n_iter=50)' : SGDClassifier(),
    'Random Forest (n_estimators=60)' : RandomForestClassifier(n_estimators=60)
}

In [69]:
#out of date
#def train_all_with_cross_validation(train_none, train_ours, train_theirs, train_y, clf, clf_name, cv=5):
 #   scores_none = cross_validation.cross_val_score(clf, train_none, train_y, cv=cv)
  #  print("None: %s Train Accuracy: %0.2f (+/- %0.3f)" % (clf_name, scores_none.mean(), scores_none.std() * 2))
    
 #   scores_ours = cross_validation.cross_val_score(clf, train_ours, train_y, cv=cv)
 #   print("Ours: %s Train Accuracy: %0.2f (+/- %0.3f)" % (clf_name, scores_ours.mean(), scores_ours.std() * 2))
    
 #   scores_theirs = cross_validation.cross_val_score(clf, train_theirs, train_y, cv=cv)
 #   print("Theirs: %s Train Accuracy: %0.2f (+/- %0.3f)" % (clf_name, scores_theirs.mean(), scores_theirs.std() * 2))

In [70]:
def train_all_with_cross_validation(train_none, train_ours, train_theirs, train_y, clf, clf_name, cv=5):
    scores_none = sklearn.model_selection.cross_val_score(clf, train_none, train_y, cv=cv)
    print("None: %s Train Accuracy: %0.2f (+/- %0.3f)" % (clf_name, scores_none.mean(), scores_none.std() * 2))
    
    scores_ours = sklearn.model_selection.cross_val_score(clf, train_ours, train_y, cv=cv)
    print("Ours: %s Train Accuracy: %0.2f (+/- %0.3f)" % (clf_name, scores_ours.mean(), scores_ours.std() * 2))
    
    scores_theirs = sklearn.model_selection.cross_val_score(clf, train_theirs, train_y, cv=cv)
    print("Theirs: %s Train Accuracy: %0.2f (+/- %0.3f)" % (clf_name, scores_theirs.mean(), scores_theirs.std() * 2))

In [71]:
def train_and_predict(train_data, train_y, test_data, test_y, clf):
    clf.fit(train_data, train_y)
    predictions = clf.predict(test_data)
    score = met.accuracy_score(test_y, predictions)
    f1 = met.f1_score(test_y, predictions, average='weighted')
    return predictions, score, f1

In [72]:
def train_and_predict_all(train_none, test_none, train_ours, test_ours, train_theirs, test_theirs, test_y, clf, clf_name):
    none_pred, none_acc, none_f1 = train_and_predict(train_none, train_y, test_none, test_y, clf)
    print('None: %s Test Accuracy: %0.5f, f1=%0.5f' % (clf_name, none_acc, none_f1))
    
    ours_pred, ours_acc, ours_f1 = train_and_predict(train_ours, train_y, test_ours, test_y, clf)
    ours_p = tsd.calculate_mcnemars(none_pred, ours_pred, test_y)
    print('Ours: %s Test Accuracy: %0.5f, p=%0.5f, f1=%0.5f' % (clf_name, ours_acc, ours_p, ours_f1))
    
    theirs_pred, theirs_acc, theirs_f1 = train_and_predict(train_theirs, train_y, test_theirs, test_y, clf)
    theirs_p = tsd.calculate_mcnemars(none_pred, theirs_pred, test_y)
    print('Theirs: %s Test Accuracy: %0.5f, p=%0.5f, f1=%0.5f' % (clf_name, theirs_acc, theirs_p, theirs_f1))
    
    ours_theirs_p = tsd.calculate_mcnemars(ours_pred, theirs_pred, test_y)
    print('Significance between ours and theirs: p=%0.5f' % ours_theirs_p)

## Performance on Training Set and Complete Test Set
For each classifier, we calculate the average performance of the classifier on the training set when cross validation is applied, as well as the accuracy on the complete test set.

In [73]:
for clf_name, clf in classifiers.items():
    print(clf_name)
    
    print()
    
    print('Cross Validation Accuracy on Training Set\n')
    train_all_with_cross_validation(train_none, train_ours, train_theirs, train_y, clf, clf_name, cv=5)
    
    print()
    
    print('Accuracy on Test Set\n')
    train_and_predict_all(train_none, test_none, train_ours, test_ours, train_theirs, test_theirs, test_y, clf, clf_name)
    
    print()

SGD (n_iter=50)

Cross Validation Accuracy on Training Set

None: SGD (n_iter=50) Train Accuracy: 0.61 (+/- 0.014)
Ours: SGD (n_iter=50) Train Accuracy: 0.62 (+/- 0.018)
Theirs: SGD (n_iter=50) Train Accuracy: 0.61 (+/- 0.013)

Accuracy on Test Set

None: SGD (n_iter=50) Test Accuracy: 0.60658, f1=0.58868
Ours: SGD (n_iter=50) Test Accuracy: 0.62446, p=0.00000, f1=0.61176
Theirs: SGD (n_iter=50) Test Accuracy: 0.62152, p=0.00000, f1=0.61108
Significance between ours and theirs: p=0.17965

Random Forest (n_estimators=60)

Cross Validation Accuracy on Training Set

None: Random Forest (n_estimators=60) Train Accuracy: 0.58 (+/- 0.007)
Ours: Random Forest (n_estimators=60) Train Accuracy: 0.60 (+/- 0.007)
Theirs: Random Forest (n_estimators=60) Train Accuracy: 0.60 (+/- 0.008)

Accuracy on Test Set

None: Random Forest (n_estimators=60) Test Accuracy: 0.58111, f1=0.56444
Ours: Random Forest (n_estimators=60) Test Accuracy: 0.60186, p=0.00000, f1=0.58929
Theirs: Random Forest (n_estimators

In [74]:
def train_and_predict_all_on_test_subset(test_tweets, clf, clf_name):
    test_none, test_y = tsd.prepare_tweet_vector_averages(test_tweets, p2v_no_emoji)
    test_ours, _ = tsd.prepare_tweet_vector_averages(test_tweets, p2v_our_emoji)
    test_theirs, _ = tsd.prepare_tweet_vector_averages(test_tweets, p2v_their_emoji)

    train_and_predict_all(train_none, test_none, train_ours, test_ours, train_theirs, test_theirs, test_y, clf, clf_name)

In [75]:
emoji_test_tweets = tsd.get_tweets_with_emoji(test_tweets, e2v_ours, e2v_theirs, ems)
emoji_test_tweets_top90 = tsd.get_tweets_with_emoji(test_tweets, set(), set(), top90)
emoji_test_tweets_bottom10 = tsd.get_tweets_with_emoji(test_tweets, set(), set(), bottom10)

## Test Subset - All Tweets with Emoji
For each classifier, we calculate the accuracy on the subset of test examples that contain emoji.

In [76]:
for clf_name, clf in classifiers.items():
    print(clf_name)
    train_and_predict_all_on_test_subset(emoji_test_tweets, clf, clf_name)
    print()

SGD (n_iter=50)
None: SGD (n_iter=50) Test Accuracy: 0.52071, f1=0.51885
Ours: SGD (n_iter=50) Test Accuracy: 0.59396, p=0.00000, f1=0.59862
Theirs: SGD (n_iter=50) Test Accuracy: 0.58366, p=0.00000, f1=0.58879
Significance between ours and theirs: p=0.01481

Random Forest (n_estimators=60)
None: Random Forest (n_estimators=60) Test Accuracy: 0.49874, f1=0.50124
Ours: Random Forest (n_estimators=60) Test Accuracy: 0.57153, p=0.00000, f1=0.57551
Theirs: Random Forest (n_estimators=60) Test Accuracy: 0.57977, p=0.00000, f1=0.58264
Significance between ours and theirs: p=0.21637



## Test Subset - All Tweets with Common Emoji
For each classifier, we calculate the accuracy on the subset of test examples that contain common (Top 90%) emoji.

In [77]:
for clf_name, clf in classifiers.items():
    print(clf_name)
    train_and_predict_all_on_test_subset(emoji_test_tweets_top90, clf, clf_name)
    print()

SGD (n_iter=50)
None: SGD (n_iter=50) Test Accuracy: 0.51532, f1=0.52970
Ours: SGD (n_iter=50) Test Accuracy: 0.63329, p=0.00000, f1=0.63021
Theirs: SGD (n_iter=50) Test Accuracy: 0.63009, p=0.00000, f1=0.62749
Significance between ours and theirs: p=0.62491

Random Forest (n_estimators=60)
None: Random Forest (n_estimators=60) Test Accuracy: 0.46685, f1=0.47978
Ours: Random Forest (n_estimators=60) Test Accuracy: 0.62048, p=0.00000, f1=0.60895
Theirs: Random Forest (n_estimators=60) Test Accuracy: 0.61134, p=0.00000, f1=0.60174
Significance between ours and theirs: p=0.27949



## Test Subset - All Tweets with Rare Emoji
For each classifier, we calculate the accuracy on the subset of test examples that contain rare (Bottom 10%) emoji.

In [78]:
for clf_name, clf in classifiers.items():
    print(clf_name)
    train_and_predict_all_on_test_subset(emoji_test_tweets_bottom10, clf, clf_name)
    print()

SGD (n_iter=50)
None: SGD (n_iter=50) Test Accuracy: 0.43182, f1=0.40671
Ours: SGD (n_iter=50) Test Accuracy: 0.56494, p=0.00015, f1=0.56551
Theirs: SGD (n_iter=50) Test Accuracy: 0.55519, p=0.00026, f1=0.55762
Significance between ours and theirs: p=0.60151

Random Forest (n_estimators=60)
None: Random Forest (n_estimators=60) Test Accuracy: 0.42208, f1=0.41578
Ours: Random Forest (n_estimators=60) Test Accuracy: 0.53896, p=0.00350, f1=0.52486
Theirs: Random Forest (n_estimators=60) Test Accuracy: 0.55519, p=0.00107, f1=0.54457
Significance between ours and theirs: p=0.52873

