In [1]:
import pickle as pkl
from zipfile import ZipFile

import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
import sklearn.model_selection

from twitter_nlp_toolkit.tweet_sentiment_classifier import tweet_sentiment_classifier
from twitter_nlp_toolkit.file_fetcher import file_fetcher

Using TensorFlow backend.


In [2]:
# Notebook-wide settings.
chunk = 1  # Fraction of data to train on; lower it for faster debugging runs
redownload = True  # When True, the download cell fetches the data and model archives
model_path = 'Models'  # Directory the model archive is extracted into and loaded from


Here we download pre-trained models and a validation dataset. The models have been trained on the Sentiment140 dataset, taken from here: https://www.kaggle.com/kazanova/sentiment140

The validation data is hand-labeled airline customer feedback taken from https://www.figure-eight.com/data-for-everyone/

In [3]:
if redownload:

    # Validation data.
    # The query string must be introduced by '?' ("...Tweets.zip?dl=1"), matching
    # the model URL below; without it "dl=1" becomes part of the file name.
    file_fetcher.download_file("https://www.dropbox.com/s/440m6x07bjg6c0h/Tweets.zip?dl=1", "Tweets.zip")

    # Compressed model
    # Note that this model has only been trained on 5% of the training data
    # Update when better-trained model is available
    file_fetcher.download_file("https://www.dropbox.com/s/i88eqlja56xncyx/model_test_05.zip?dl=1", "Models.zip")

    # Extract all the contents of the model archive into model_path
    with ZipFile('Models.zip', 'r') as zipObj:
        zipObj.extractall(path=model_path)

Tweets.zip: 298kB [00:00, 1.79MB/s]
Models.zip: 100%|##########| 1.82G/1.82G [02:00<00:00, 16.2MB/s]


In [4]:
# Read the validation data from Tweets.zip. header=0 together with names=...
# replaces the header row of the file with the column names listed here.
validation_columns = [
    'Index', 'Sentiment', 'Sentiment_confidence',
    'Negative_reason', 'Negative_reason_confidence',
    'Airline', 'Airline_sentiment_gold', 'Handle',
    'Negative_reason_gold', 'Retweet_count', 'Text',
    'Tweet_coord', 'Time', 'Location', 'Timezone',
]

test_data = pd.read_csv('Tweets.zip', header=0, names=validation_columns)

In [5]:
# Encode the sentiment as a numeric label:
#   'positive' -> 1.0, 'neutral' -> 0.5, any other value -> 0.0
is_positive = test_data['Sentiment'] == 'positive'
is_neutral = test_data['Sentiment'] == 'neutral'
test_data['Labels'] = (is_positive * 2 + is_neutral * 1) / 2

# Remove the neutral tweets (label 0.5) so that only the 0/1 labels remain.
# (The previous `test_data.set_index('Labels')` call discarded its result and
# therefore had no effect; it has been dropped.)
test_data = test_data[test_data.Labels != 0.5]

In [6]:
# Create the sentiment analyzer and load the pre-trained models from model_path
# (the directory the model archive is extracted into by the download cell).
# Executing this cell takes about 30s on a laptop

Classifier = tweet_sentiment_classifier.SentimentAnalyzer()
Classifier.load_models(path=model_path)

BoW model Models\bow loaded successfully
BoW model Models\bow_005_1 loaded successfully
BoW model Models\bow_005_2 loaded successfully
BoW model Models\bow_005_3 loaded successfully
Pre-trained embedding model loaded successfully
Pre-trained embedding model loaded successfully
Pre-trained embedding model loaded successfully
Pre-trained embedding model loaded successfully
LSTM model Models\lstm loaded successfully
LSTM model Models\lstm_005_1 loaded successfully
LSTM model Models\lstm_005_2 loaded successfully
LSTM model Models\lstm_005_3 loaded successfully
BoW model Models\model_test_05 loaded successfully


We sanity check the models: 

In [7]:
# Four short phrases with unambiguous sentiment: happy / sad / cheerful / mad.
# With 1 = positive and 0 = negative, the expected result is [1, 0, 1, 0].
Classifier.predict(['I am happy', 'I am sad', 'I am cheerful', 'I am mad'])


array([1., 0., 1., 0.])

We test the model on an airline customer feedback dataset.

In [8]:
# Predict a label for the text of every remaining (non-neutral) tweet.
# Executing this cell takes several minutes on a laptop

predictions = Classifier.predict(test_data['Text'])


In [9]:
# Score the predictions against the labels of the full evaluation dataset.
accuracy = sklearn.metrics.accuracy_score(test_data['Labels'], predictions)
mcc = sklearn.metrics.matthews_corrcoef(test_data['Labels'], predictions)

print('Test Accuracy:  {:.3f}'.format(accuracy))
print('Test MCC:  {:.3f}'.format(mcc))

# Last expression of the cell: the confusion matrix is displayed as its output.
sklearn.metrics.confusion_matrix(test_data['Labels'], predictions)

Test Accuracy:  0.773
Test MCC:  0.528


array([[6806, 2372],
       [ 246, 2117]], dtype=int64)

We have an accuracy of just under 80% (77.3%).


We split our evaluation dataset into validation and testing and eliminate the worst-performing models

In [10]:
# Split the evaluation data 50/50 into a validation half and a test half,
# stratified by label so both halves keep the same class balance.
# A fixed random_state makes the split (and therefore every score computed
# from it) reproducible across kernel restarts.
valX, testX, valY, testY = sklearn.model_selection.train_test_split(
    test_data['Text'],
    test_data['Labels'],
    test_size=0.5,
    stratify=test_data['Labels'],
    random_state=42,
)

In [11]:
# Score each loaded model on the validation split and prune the weak ones;
# per the cell output, models scoring below the 0.7 threshold are deleted.
# Executing this cell takes several minutes on a laptop


Classifier.trim_models(valX, valY, threshold=0.7)

Model Models\bow score: 0.801
Model Models\bow_005_1 score: 0.738
Model Models\bow_005_2 score: 0.756
Model Models\bow_005_3 score: 0.746
Model Models\glove score: 0.765
Model Models\glove_005_1 score: 0.724
Model Models\glove_005_2 score: 0.677
Deleting model Models\glove_005_2
Model Models\glove_005_3 score: 0.673
Deleting model Models\glove_005_3
Model Models\lstm score: 0.689
Deleting model Models\lstm
Model Models\lstm_005_1 score: 0.652
Deleting model Models\lstm_005_1
Model Models\lstm_005_2 score: 0.631
Deleting model Models\lstm_005_2
Model Models\lstm_005_3 score: 0.648
Deleting model Models\lstm_005_3
Model Models\model_test_05 score: 0.801


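Our custom-embedding models performed poorly on this dataset and have been pruned. In total, the four LSTM models and two of the GloVe models (`glove_005_2` and `glove_005_3`) scored below the 0.7 threshold and were removed.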

In [12]:
# Re-run the (now pruned) ensemble on the held-out test half and score it.
predictions = Classifier.predict(testX)

accuracy = sklearn.metrics.accuracy_score(testY, predictions)
mcc = sklearn.metrics.matthews_corrcoef(testY, predictions)

print('Test Accuracy:  {:.3f}'.format(accuracy))
print('Test MCC:  {:.3f}'.format(mcc))

# Last expression of the cell: the confusion matrix is displayed as its output.
sklearn.metrics.confusion_matrix(testY, predictions)

Test Accuracy:  0.784
Test MCC:  0.535


array([[3485, 1104],
       [ 140, 1042]], dtype=int64)

Pruning our models had a minor impact on our classification performance, bringing us to an acceptable ~78%. 

To improve our accuracy, we can refine the models on our airline data. The early stopping procedure (enabled by default to use 20% of the training data for validation) should minimize overfitting.

In [13]:
# Fine-tune the remaining models on the validation half of the airline data.
Classifier.refine(valX, valY)

# Predict on the held-out test half once. The original cell ran the same
# prediction twice in a row; the second pass only repeated the inference.
predictions = Classifier.predict(testX)

# Alias kept so that any code still referring to `test_predictions` keeps working.
test_predictions = predictions

Train on 4616 samples, validate on 1154 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500


Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500


Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500
Epoch 129/500
Epoch 130/500
Epoch 131/500
Epoch 132/500
Epoch 133/500
Epoch 134/500
Epoch 135/500
Epoch 136/500
Epoch 137/500
Epoch 138/500
Epoch 139/500
Epoch 140/500
Epoch 141/500
Epoch 142/500
Epoch 143/500
Epoch 144/500
Epoch 145/500
Epoch 146/500
Epoch 147/500
Epoch 148/500
Epoch 149/500
Epoch 150/500
Epoch 151/500
Epoch 152/500
Epoch 153/500
Epoch 154/500
Epoch 155/500
Epoch 156/500
Epoch 157/500
Epoch 158/500
Epoch 159/500
Epoch 00159: early stopping
Train on 4616 samples, validate on 1154 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500


Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500


Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500
Epoch 129/500
Epoch 130/500
Epoch 131/500
Epoch 132/500
Epoch 133/500
Epoch 134/500
Epoch 135/500
Epoch 136/500
Epoch 137/500


Epoch 138/500
Epoch 139/500
Epoch 140/500
Epoch 141/500
Epoch 142/500
Epoch 143/500
Epoch 144/500
Epoch 145/500
Epoch 146/500
Epoch 147/500
Epoch 148/500
Epoch 149/500
Epoch 150/500
Epoch 151/500
Epoch 152/500
Epoch 153/500
Epoch 154/500
Epoch 155/500
Epoch 156/500
Epoch 157/500
Epoch 158/500
Epoch 159/500
Epoch 00159: early stopping


In [14]:
# Score the post-refinement predictions against the held-out test labels.
accuracy = sklearn.metrics.accuracy_score(testY, predictions)
mcc = sklearn.metrics.matthews_corrcoef(testY, predictions)

print('Test Accuracy:  {:.3f}'.format(accuracy))
print('Test MCC:  {:.3f}'.format(mcc))

# Last expression of the cell: the confusion matrix is displayed as its output.
sklearn.metrics.confusion_matrix(testY, predictions)

Test Accuracy:  0.897
Test MCC:  0.656


array([[4511,   78],
       [ 519,  663]], dtype=int64)

Now we have accuracies of nearly 90%! 

In [15]:
# Score every remaining model, and the ensemble of them, on the held-out test
# half; per the cell output the result is a dict keyed by model path plus 'ensembled'.
Classifier.evaluate(testX, testY)

Model Models\bow score: 0.891
Model Models\bow_005_1 score: 0.891
Model Models\bow_005_2 score: 0.892
Model Models\bow_005_3 score: 0.889
Model Models\glove score: 0.883
Model Models\glove_005_1 score: 0.879
Model Models\model_test_05 score: 0.891


{'ensembled': 0.896551724137931,
 'Models\\bow': 0.8911800381216427,
 'Models\\bow_005_1': 0.8910067579275689,
 'Models\\bow_005_2': 0.8920464390920118,
 'Models\\bow_005_3': 0.8894472361809045,
 'Models\\glove': 0.8826893086120257,
 'Models\\glove_005_1': 0.8790504245364755,
 'Models\\model_test_05': 0.8911800381216427}