<h2>IMDB dataset</h2>

In [3]:
import os
import random

# get the file names in each directory
imdb_train_neg = [f for f in os.listdir('aclImdb/train/neg')]
imdb_train_pos = [f for f in os.listdir('aclImdb/train/pos')]
imdb_test_neg = [f for f in os.listdir('aclImdb/test/neg')]
imdb_test_pos = [f for f in os.listdir('aclImdb/test/pos')]
print("File lists ready.")
def read(directory,path):
    texts = []
    for f in directory:
        f=os.path.join(path,f)
        with open(f, "rt") as inp_file:
            text = inp_file.readlines()
            assert len(text)==1
            texts.append(text[0].strip())
    return texts

# read all the files in each directory into a list
imdb_train_neg = read(imdb_train_neg,'aclImdb/train/neg')
imdb_train_pos = read(imdb_train_pos,'aclImdb/train/pos')
imdb_test_neg = read(imdb_test_neg,'aclImdb/test/neg')
imdb_test_pos = read(imdb_test_pos,'aclImdb/test/pos')

# add labels for texts
imdb_train_neg = [(text,-1) for text in imdb_train_neg]
imdb_train_pos = [(text,1) for text in imdb_train_pos]
imdb_test_neg = [(text,-1) for text in imdb_test_neg]
imdb_test_pos = [(text,1) for text in imdb_test_pos]

# shuffle the training set
imdb_train = imdb_train_neg + imdb_train_pos
random.shuffle(imdb_train)

imdb_train_texts = [text for text, label in imdb_train]
imdb_train_labels = [label for text, label in imdb_train]
#print(train_texts[:3])
print(imdb_train_labels[:20])

imdb_test = imdb_test_neg + imdb_test_pos
imdb_test_texts = [text for text, label in imdb_test]
imdb_test_labels = [label for text, label in imdb_test]

File lists ready.
[-1, 1, 1, 1, -1, -1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, -1]


<h2>Twitter dataset</h2>

In [4]:
training = 'training.1600000.processed.noemoticon.csv'
test = 'testdata.manual.2009.06.14.csv'

import csv

def parse_tweets(filename):
    with open(filename, 'r',errors='ignore') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            assert len(row) == 6
            yield int(row[0]),row[-1]
                
tweets_train = [*parse_tweets(training)]
tweets_test = [*parse_tweets(test)]
tweets_test = [tweet for tweet in tweets_test if tweet[0]!=2]

import random
random.shuffle(tweets_train)

tweets_train_texts = [text for label, text in tweets_train]
tweets_train_labels = [label for label, text in tweets_train]
tweets_train_labels = [-1 if label==0 else 1 for label in tweets_train_labels]
tweets_test_texts = [text for label, text in tweets_test]
tweets_test_labels = [label for label, text in tweets_test]
tweets_test_labels = [-1 if label==0 else 1 for label in tweets_test_labels]

<h2>Vectorizing</h2>

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from eli5 import show_weights

space_tokenizer = lambda text: text.split()


# Featurization and vectorization
vectorizer = TfidfVectorizer(tokenizer=space_tokenizer, ngram_range=(1,2))

imdb_vectorizer = vectorizer.fit(imdb_train_texts)
imdb_train_X = imdb_vectorizer.transform(imdb_train_texts)
#devel_X = vectorizer.transform(devel_texts)
imdb_test_X = imdb_vectorizer.transform(imdb_test_texts)

tweets_vectorizer = vectorizer.fit(tweets_train_texts)
tweets_train_X = tweets_vectorizer.transform(tweets_train_texts)
tweets_test_X = tweets_vectorizer.transform(tweets_test_texts)

<h2>Training SVM classifier</h2>

In [9]:
%%time
# Training and prediction

imdb_classifier = LinearSVC(
    C=1.0,
    class_weight=None,
    max_iter=1000,
    loss='squared_hinge'
)
tweets_classifier = LinearSVC(
    C=1.0,
    class_weight=None,
    max_iter=1000,
    loss='squared_hinge'
)
imdb_classifier.fit(imdb_train_X, imdb_train_labels)
tweets_classifier.fit(tweets_train_X, tweets_train_labels)


CPU times: user 43.5 s, sys: 132 ms, total: 43.7 s
Wall time: 43.7 s


In [10]:
%%time
imdb_pred_labels = imdb_classifier.predict(imdb_test_X)
tweets_pred_labels = tweets_classifier.predict(tweets_test_X)

CPU times: user 72 ms, sys: 0 ns, total: 72 ms
Wall time: 74.4 ms


<h2>Results</h2>

In [14]:
# Evaluation and analysis
imdb_accuracy = accuracy_score(imdb_test_labels, imdb_pred_labels)
tweets_accuracy = accuracy_score(tweets_test_labels, tweets_pred_labels)

print('IMDB accuracy {:.2%}'.format(imdb_accuracy))
print('Tweets accuracy {:.2%}'.format(tweets_accuracy))

IMDB accuracy 89.72%
Tweets accuracy 82.45%


In [11]:
print("IMDB dataset results")
print(classification_report(imdb_test_labels, imdb_pred_labels))
print()
print("Tweet dataset results")
print(classification_report(tweets_test_labels, tweets_pred_labels))

IMDB dataset results
              precision    recall  f1-score   support

          -1       0.90      0.89      0.90     12500
           1       0.89      0.90      0.90     12500

   micro avg       0.90      0.90      0.90     25000
   macro avg       0.90      0.90      0.90     25000
weighted avg       0.90      0.90      0.90     25000


Tweet dataset results
              precision    recall  f1-score   support

          -1       0.84      0.80      0.82       177
           1       0.81      0.85      0.83       182

   micro avg       0.82      0.82      0.82       359
   macro avg       0.83      0.82      0.82       359
weighted avg       0.83      0.82      0.82       359



In [13]:
show_weights(imdb_classifier, vec=imdb_vectorizer)

Weight?,Feature
+4.087,@jonbrak
+2.940,@flyingtweeter
+2.580,:s. goodnight!
+2.348,bike fit
+2.272,@miamendez u
+2.266,@xcarcrashhearts yeah
+2.224,bold using
… 964912 more positive …,… 964912 more positive …
… 904589 more negative …,… 904589 more negative …
-2.204,@fallen_reason haha
