In [1]:
import pickle as pkl
import numpy as np
import pandas as pd
import sklearn
from twitter_nlp_toolkit.twit_module import twit_module
from twitter_nlp_toolkit.file_fetcher import file_fetcher
from zipfile import ZipFile

Using TensorFlow backend.


In [2]:
chunk = 1 # Fraction of data to train on

Here we download some training and validation data.

The training data is the semi-supervised Sentiment140 dataset, taken form here: https://www.kaggle.com/kazanova/sentiment140

The validation data is hand-labeled airline customer feedback taken from https://www.figure-eight.com/data-for-everyone/

In [3]:
downloader = file_fetcher.downloader(using_notebook=True)

downloader initialized


In [4]:
# Training data 
downloader.download_file("https://www.dropbox.com/s/5zr4e84x83vevbt/training.1600000.processed.noemoticon.csv.zip?dl=1","1_6_m_tweets.zip")

Downloading: 99% [81100800 / 81334274] bytes            


In [5]:
# Validation data
downloader.download_file('https://www.dropbox.com/s/440m6x07bjg6c0h/Tweets.zip?dl=1',"Tweets.zip")

In [6]:
# Pickled model
# Note that this model has only been trained on 1% of the training data
# Update when better-trained model is available
downloader.download_file("https://www.dropbox.com/s/owpldku3kk7aaqj/tweet_classifier_001.zip?dl=1","tweet_classifier_001.zip")

Downloading: 99% [932806656 / 933152569] bytes            


In [7]:
#unzip the pickle file
with ZipFile('tweet_classifier_001.zip', 'r') as zipObj:
   # Extract all the contents of zip file in current directory
   zipObj.extractall()

In [8]:
train_data = pd.read_csv("1_6_m_tweets.zip", encoding='latin-1',
                         names=['Labels', 'Index', 'Time', 'Query', 'Handle', 'Text'])

test_data = pd.read_csv('Tweets.zip', header=0, names=['Index', 'Sentiment', 'Sentiment_confidence',
                                                                'Negative_reason', 'Negative_reason_confidence',
                                                                'Airline', 'Airline_sentiment_gold', 'Handle',
                                                                'Negative_reason_gold', 'Retweet_count', 'Text',
                                                                'Tweet_coord', 'Time', 'Location', 'Timezone'])

In [9]:
test_data['Labels'] = (test_data['Sentiment'] == 'positive') * 2
test_data['Labels'] = test_data['Labels'] + (test_data['Sentiment'] == 'neutral') * 1
test_data['Labels'] = test_data['Labels'] / 2

train_data['Labels'] = np.array(train_data['Labels'])/4


# For debugging, it is possible to choose only a fraction of the data for speed
if chunk < 1:
    train_data = train_data.reindex(np.random.permutation(train_data.index))
    train_data = train_data[0:int(chunk * len(train_data))]

# Remove unlabeled test data
test_data.set_index('Labels')
test_data = test_data[test_data.Labels != 0.5]

In [10]:
# Executing this cell takes about 20min on a laptop

Classifier = twit_module.SentimentAnalyzer(bow_param={}, lstm_param=None, glove_param=None)
Classifier.fit(train_data['Text'], train_data["Labels"])



In [11]:
test_data['Labels'] = (test_data['Sentiment'] == 'positive') * 2

print("Test Accuracy: %.3f" % sklearn.metrics.accuracy_score(test_data['Labels']/2, Classifier.predict(test_data['Text']).reshape(-1)))

Test Accuracy: 0.791


In [12]:
EnsembledClassifier = pkl.load(open('tweet_classifier_001.pkl', 'rb'))

ModuleNotFoundError: No module named 'twit_module'

In [None]:
print("Test Accuracy: %.3f" % sklearn.metrics.accuracy_score(test_data['Labels']/2, EnsembledClassifier.predict(test_data['Text']).reshape(-1)))

In [None]:
EnsembledClassifier.predict(['I am happy', 'I am sad', 'I am cheerful', 'I am mad'])

In [13]:
import os

In [14]:
?os.mk_dir

Object `os.mk_dir` not found.
