# Imports

In [1]:
import pandas as pd
import gensim.downloader as api
import os
import sys
# Make the parent directory the working directory and put it on sys.path so
# that `src.*` imports and relative `data/...` paths resolve (presumably the
# notebook lives one level below the project root -- confirm).
# The chdir is guarded so that re-running this cell in the same kernel does
# not climb one directory higher each time.
if "ML_FOLDER_PATH" not in globals():
    os.chdir("..")
    ML_FOLDER_PATH = os.path.abspath(os.curdir)
# Avoid stacking duplicate entries on sys.path when the cell is re-executed.
if ML_FOLDER_PATH not in sys.path:
    sys.path.append(ML_FOLDER_PATH)
import src.helpers as hlp
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from nltk.tokenize import TweetTokenizer
from sklearn import svm

# NLTK TweetTokenizer with default options; reused below to tokenize every tweet.
tokenizer = TweetTokenizer()

[nltk_data] Downloading package words to /Users/jdidio/nltk_data...
[nltk_data]   Package words is already up-to-date!


# Load data and preprocessing

In [4]:
# Read each sentiment file into a single string column named 'tweet'
# (malformed lines are skipped) and attach its class label:
# +1 for the positive file, -1 for the negative file.
t_pos = pd.read_table(
    "data/train_pos.txt",
    header=None,
    names=['tweet'],
    dtype=str,
    on_bad_lines='skip',
).assign(label=1)
t_neg = pd.read_table(
    "data/train_neg.txt",
    header=None,
    names=['tweet'],
    dtype=str,
    on_bad_lines='skip',
).assign(label=-1)

# Stack both classes into one frame; the original per-file row indices are kept.
df = pd.concat([t_pos, t_neg])

In [5]:
# Run the project's preprocessing helper (see src/helpers.py for the exact
# steps), then replace every tweet string with its list of tokens.
df = hlp.preprocess_data(df)
df['tweet'] = df['tweet'].progress_apply(tokenizer.tokenize)

100%|██████████| 196970/196970 [00:00<00:00, 1213473.33it/s]
100%|██████████| 196970/196970 [00:00<00:00, 297048.24it/s]
100%|██████████| 196970/196970 [00:00<00:00, 263525.04it/s]
100%|██████████| 196970/196970 [00:00<00:00, 293575.52it/s]
100%|██████████| 196970/196970 [00:00<00:00, 303091.76it/s]
100%|██████████| 196970/196970 [00:01<00:00, 147172.78it/s]
100%|██████████| 196970/196970 [00:00<00:00, 421872.92it/s]
100%|██████████| 196970/196970 [00:00<00:00, 495001.20it/s]
100%|██████████| 196970/196970 [00:00<00:00, 555541.62it/s]
100%|██████████| 196970/196970 [00:00<00:00, 463867.95it/s]
100%|██████████| 196970/196970 [00:03<00:00, 50746.19it/s]
100%|██████████| 173235/173235 [00:06<00:00, 28474.58it/s]


# Tweet embedding using pretrained GloVe
Here we download a pretrained GloVe model that was trained on Twitter data and represents each word as a 100-dimensional vector.

In [6]:
# Show the names of all pretrained models offered by gensim's downloader.
print(list(api.info()['models']))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [7]:
# Load the "glove-twitter-100" word vectors through gensim's downloader API.
glove_twitter_100 = api.load("glove-twitter-100")

In [8]:
def tweet_embedding(model, tweet):
    """Embed a tokenized tweet as the mean of its word vectors.

    Parameters
    ----------
    model : object
        Word-vector model exposing ``get_vector(word)`` and raising
        ``KeyError`` for unknown words. If it has a ``vector_size``
        attribute, that sets the embedding dimension; otherwise 100 is used.
    tweet : sequence of str
        Tokens of a single tweet.

    Returns
    -------
    numpy.ndarray
        1-D array whose length is the embedding dimension. Tokens missing
        from the model's vocabulary count as zero vectors in the mean (they
        still contribute to the denominator). An empty tweet yields the zero
        vector instead of NaN.
    """
    dim = getattr(model, "vector_size", 100)

    # Averaging zero rows would produce NaN (and a RuntimeWarning), which
    # downstream estimators reject; return a neutral zero vector instead.
    if len(tweet) == 0:
        return np.zeros(dim)

    vectors = np.zeros((len(tweet), dim))
    for row, word in enumerate(tweet):
        try:
            vectors[row] = model.get_vector(word)
        except KeyError:
            # Out-of-vocabulary token: leave its row at zero.
            continue
    return vectors.mean(axis=0)

In [9]:
# Replace each token list in df['tweet'] with its averaged GloVe vector.
# NOTE: this overwrites its own input column, so the cell is not idempotent --
# re-run the tokenization cell before re-running this one, otherwise
# already-embedded vectors are fed back into tweet_embedding.
df['tweet'] = df['tweet'].progress_apply(lambda s: tweet_embedding(glove_twitter_100, s))

100%|██████████| 173235/173235 [00:03<00:00, 51418.91it/s]


# Model training and evaluation

In [10]:
# Expand the per-tweet embedding arrays into a feature matrix (one column per
# embedding component) and take the labels as the target.
X = pd.DataFrame(df['tweet'].tolist())
y = df['label']

In [11]:
# Five random 90/10 train/test splits. random_state pins the shuffling so the
# same splits (and therefore comparable scores) are produced on every run.
s_split = ShuffleSplit(n_splits=5, test_size=0.1, random_state=42)

In [12]:
# Linear SVM evaluated by cross-validation on the splitter `s_split`.
# random_state fixes the solver's internal data shuffling so repeated runs on
# the same splits give the same model; max_iter is raised so the solver has
# room to converge, and verbose=1 keeps liblinear's progress log.
svm_model = svm.LinearSVC(verbose=1, max_iter=20000, random_state=42)
svm_score = cross_val_score(svm_model, X, y, cv=s_split)

[LibLinear]..................................................................................................................................*.*.**
optimization finished, #iter = 1325
Objective value = -112552.527883
nSV = 142733
[LibLinear]...................................................................................................................................**.***.*
optimization finished, #iter = 1331
Objective value = -112507.711996
nSV = 142817
[LibLinear]...................................................................................................................................**.*
optimization finished, #iter = 1323
Objective value = -112547.092859
nSV = 142840
[LibLinear]...................................................................................................................................*..............................................................*.***
optimization finished, #iter = 1946
Objective value = -112571.687873
nSV = 142802
[LibLinear]....

In [13]:
# Summarise the per-split cross-validation accuracies.
mean_accuracy = svm_score.mean()
std_accuracy = svm_score.std()
print(f'SVM mean accuracy = {mean_accuracy:.4f}')
print(f'SVM standard deviation accuracy = {std_accuracy:.4f}')

SVM mean accuracy = 0.7365
SVM standard deviation accuracy = 0.0032
