# Imports

In [1]:
import pandas as pd
import gensim.downloader as api
import os
import sys
os.path.abspath(os.curdir)
os.chdir("..")
ML_FOLDER_PATH = os.path.abspath(os.curdir)
sys.path.append(ML_FOLDER_PATH)
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm

# Load data

In [2]:
t_pos = pd.read_table("data/train_pos.txt", header=None, names=['tweet'], dtype=str,on_bad_lines='skip')
t_pos['label'] = 1
t_neg = pd.read_table("data/train_neg.txt", header=None, names=['tweet'], dtype=str,on_bad_lines='skip')
t_neg['label'] = -1
df = pd.concat((t_pos,t_neg))

In [3]:
df['tweet'] = df['tweet'].apply(lambda s: tweet_tokenizer.tokenize(s))

In [4]:
df.head()['tweet'][0]

['<user>',
 'i',
 'dunno',
 'justin',
 'read',
 'my',
 'mention',
 'or',
 'not',
 '.',
 'only',
 'justin',
 'and',
 'god',
 'knows',
 'about',
 'that',
 ',',
 'but',
 'i',
 'hope',
 'you',
 'will',
 'follow',
 'me',
 '#believe',
 '15']

# Tweet embedding using pretrained glove

In [5]:
print(list(api.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [6]:
glove_twitter_100 = api.load("glove-twitter-100")

In [7]:
def tweet_embedding(model, tweet):
    vec = np.zeros((len(tweet), 100))
    for count, w in enumerate(tweet):
        try:
            w_vec = model.get_vector(w)
            vec[count] = w_vec
        except:
            pass
    vectors = np.array(vec)
    return vectors.mean(axis=0)

In [8]:
df['tweet'] = df['tweet'].apply(lambda s: tweet_embedding(glove_twitter_100, s))

In [9]:
X = pd.DataFrame(df.tweet.tolist())
y = df.label

In [10]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [11]:
svm_model = svm.SVC(verbose=1, max_iter=200000)
svm_model.fit(X_train, y_train)

[LibSVM]....................................................
*....
*.
*
optimization finished, #iter = 57551
obj = -77235.318902, rho = 2.774437
nSV = 80793, nBSV = 79926
Total nSV = 80793


SVC(max_iter=200000, verbose=1)

In [12]:
print(f'SVM training accuracy = {svm_model.score(X_train, y_train):.4f}')
print(f'SVM val accuracy = {svm_model.score(X_val, y_val):.4f}')

SVM training accuracy = 0.8001
SVM val accuracy = 0.7952


In [14]:
test = pd.read_table("data/test_data.txt", header=None, names=['tweet'], dtype=str,on_bad_lines='skip')

In [18]:
test['tweet'] = test['tweet'].apply(lambda s: tweet_embedding(glove_twitter_100, s))

In [19]:
test = pd.DataFrame(test.tweet.tolist())

In [21]:
pred = svm_model.predict(test)

In [23]:
pred.mean()

0.9802