# Imports

In [1]:
import pandas as pd
import numpy as np
import os
import sys
os.path.abspath(os.curdir)
os.chdir("..")
ML_FOLDER_PATH = os.path.abspath(os.curdir)
sys.path.append(ML_FOLDER_PATH)

In [2]:
import src.helpers as hlp
from sklearn.model_selection import train_test_split
from nltk.tokenize import TweetTokenizer
from gensim.models import Word2Vec, doc2vec
from tqdm import tqdm
tqdm.pandas()

[nltk_data] Downloading package words to /Users/jdidio/nltk_data...
[nltk_data]   Package words is already up-to-date!


# Load and process data

In [3]:
t_pos = pd.read_table("data/train_pos.txt", header=None, names=['tweet'], dtype=str,on_bad_lines='skip')
t_pos['label'] = 1
t_neg = pd.read_table("data/train_neg.txt", header=None, names=['tweet'], dtype=str,on_bad_lines='skip')
t_neg['label'] = -1
df = pd.concat((t_pos,t_neg), ignore_index=True)

In [4]:
df['tweet'] = df['tweet'].progress_apply(lambda x: hlp.remove_stopwords(x))
df['tweet'] = df['tweet'].progress_apply(lambda x: hlp.remove_punct(x))
df['tweet'] = df['tweet'].progress_apply(lambda x: hlp.add_space(x))
df['tweet'] = df['tweet'].progress_apply(lambda x: hlp.remove_white_space(x))
df['tweet'] = df['tweet'].progress_apply(lambda x: hlp.remove_words_digits(x))
df['tweet'] = df['tweet'].progress_apply(lambda x: hlp.to_lower(x))
df['tweet'] = df['tweet'].progress_apply(lambda x: hlp.remove_specific_words(x))
df['tweet'] = df['tweet'].progress_apply(lambda x: hlp.remove_repeating_char(x))
df['tweet'] = df['tweet'].progress_apply(lambda x: hlp.remove_single_char(x))
df['tweet'] = df['tweet'].progress_apply(lambda x: hlp.remove_non_english_words(x))
df['tweet'] = df['tweet'].progress_apply(lambda x: hlp.lemmatize(x))
df = df[df['tweet'] != '']
df = df.drop_duplicates()
df.reset_index(inplace=True)
df['tweet'] = df['tweet'].progress_apply(lambda x: hlp.tokenize(x))

100%|██████████| 196970/196970 [00:00<00:00, 283933.44it/s]
100%|██████████| 196970/196970 [00:00<00:00, 264959.03it/s]
100%|██████████| 196970/196970 [00:00<00:00, 293068.25it/s]
100%|██████████| 196970/196970 [00:00<00:00, 303712.95it/s]
100%|██████████| 196970/196970 [00:01<00:00, 149172.15it/s]
100%|██████████| 196970/196970 [00:00<00:00, 1273147.93it/s]
100%|██████████| 196970/196970 [00:00<00:00, 425000.70it/s]
100%|██████████| 196970/196970 [00:00<00:00, 499732.37it/s]
100%|██████████| 196970/196970 [00:00<00:00, 565010.83it/s]
100%|██████████| 196970/196970 [00:00<00:00, 209223.34it/s]
100%|██████████| 196970/196970 [00:03<00:00, 54581.86it/s]
100%|██████████| 158945/158945 [00:04<00:00, 36660.20it/s]


# Build and train Word2Vec model

In [9]:
text = 'hello '
print(hlp.tokenize(text))

import nltk 
nltk.download('words')
words = set(nltk.corpus.words.words())

['hello']


[nltk_data] Downloading package words to /Users/jdidio/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [13]:
sent = "hellooo how are you doing"


sent = " ".join(w for w in nltk.wordpunct_tokenize(sent) if w.lower() in words or not w.isalpha())
print(sent)

how are you doing


In [5]:
w2v = Word2Vec(df['tweet'], vector_size=50, window=3, min_count=2, sg=1, seed=16)
w2v.train(df['tweet'], total_examples= len(df['tweet']), epochs=10)

(8164272, 8960540)

In [7]:
df['tweet'] = df['tweet'].progress_apply(lambda x: w2v(x))

  0%|          | 1/158945 [00:00<07:05, 373.86it/s]


AttributeError: 'Word2Vec' object has no attribute 'get_vector'

In [14]:
def tweet_embedding(model, tweet):
    vec = np.zeros((len(tweet), 50))
    for count, w in enumerate(tweet):
        try:
            w_vec = model.wv.get_vector(w)
            vec[count] = w_vec
        except:
            continue
    vectors = np.array(vec)
    return vectors.mean(axis=0)

In [15]:
df['tweet'] = df['tweet'].apply(lambda x: tweet_embedding(w2v, x))

In [65]:
X = pd.DataFrame(df.tweet.tolist())
y = df.label
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.10, random_state = 42)

In [66]:
from sklearn.svm import LinearSVC
LSVC = LinearSVC(verbose=1)
LSVC.fit(X_train, y_train)
print(f'SVM training accuracy = {LSVC.score(X_train, y_train):.4f}')
print(f'SVM val accuracy = {LSVC.score(X_val, y_val):.4f}')

[LibLinear]....................................................................................................
optimization finished, #iter = 1000

Using -s 2 may be faster (also see FAQ)

Objective value = -151404.664229
nSV = 155874
SVM training accuracy = 0.5827
SVM val accuracy = 0.5837




# Doc2Vec

In [69]:
t_pos = pd.read_table("data/train_pos.txt", header=None, names=['tweet'], dtype=str,on_bad_lines='skip')
t_pos['label'] = 1
t_neg = pd.read_table("data/train_neg.txt", header=None, names=['tweet'], dtype=str,on_bad_lines='skip')
t_neg['label'] = -1
df = pd.concat((t_pos,t_neg), ignore_index=True)
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_stopwords(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_punct(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.add_space(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_white_space(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_words_digits(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.to_lower(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_specific_words(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_repeating_char(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_single_char(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.lemmatize(x))
df = df[df['tweet'] != '']
df = df.drop_duplicates()
df.reset_index(inplace=True)
df['tweet'] = df['tweet'].apply(lambda x: hlp.tokenize(x))
df_tagged = df.apply(lambda r: doc2vec.TaggedDocument(words=r.tweet, tags=[r.label]), axis=1)

In [70]:
model_dbow = doc2vec.Doc2Vec(dm=1, vector_size=50, negative=5, hs=0, min_count=2, sample = 0)
model_dbow.build_vocab(df_tagged)

In [71]:
model_dbow.train(df_tagged, total_examples=len(df_tagged), epochs=30)

In [72]:
df['tweet'] = df['tweet'].apply(lambda x: model_dbow.infer_vector(x))

In [73]:
X = pd.DataFrame(df.tweet.tolist())
y = df.label
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.10, random_state = 42)

In [74]:
from sklearn.svm import LinearSVC
LSVC = LinearSVC(verbose=1)
LSVC.fit(X_train, y_train)
print(f'SVM training accuracy = {LSVC.score(X_train, y_train):.4f}')
print(f'SVM val accuracy = {LSVC.score(X_val, y_val):.4f}')

[LibLinear].................................................*
optimization finished, #iter = 495
Objective value = -116026.816099
nSV = 142506
SVM training accuracy = 0.7185
SVM val accuracy = 0.7190


In [75]:
from sklearn.ensemble import AdaBoostClassifier
adaboost = AdaBoostClassifier(n_estimators=100)
adaboost.fit(X_train, y_train)
print(f'SVM training accuracy = {adaboost.score(X_train, y_train):.4f}')
print(f'SVM val accuracy = {adaboost.score(X_val, y_val):.4f}')

SVM training accuracy = 0.7267
SVM val accuracy = 0.7184
