In [28]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn import preprocessing
from keras.preprocessing.text import Tokenizer
from keras import models
from keras import layers
from keras import optimizers
import pickle

import warnings
warnings.filterwarnings('ignore')

from gensim.models import Word2Vec

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
import multiprocessing
from sklearn import utils

from sklearn.linear_model import LogisticRegression

In [14]:
train = pickle.load(open("../pickle/train.pickle", "rb"))
val = pickle.load(open("../pickle/val.pickle", "rb"))
test = pickle.load(open("../pickle/test.pickle", "rb"))

In [16]:
X_tr = train.tweet
X_val = val.tweet
X_tt = test.tweet
y_tr = train.target
y_val = val.target
y_tt = test.target

# Doc2Vec


In [18]:
def labelize_tweets_ug(tweets,label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(LabeledSentence(t.split(), [prefix + '_%s' % i]))
    return result

In [19]:
all_x = pd.concat([X_tr, X_val, X_tt])
all_x_w2v = labelize_tweets_ug(all_x, 'all')

  """


In [20]:
len(all_x_w2v)

24783

## DBOW (Distributed Bag of Words)

In [22]:
model_ug_dbow = Doc2Vec(dm=0, size=100, negative=5, min_count=2, workers=4, alpha=0.065, min_alpha=0.065)
model_ug_dbow.build_vocab([x for x in tqdm(all_x_w2v)])

100%|██████████| 24783/24783 [00:00<00:00, 2386688.31it/s]


In [24]:
%%time
for epoch in range(30):
    model_ug_dbow.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_dbow.alpha -= 0.002
    model_ug_dbow.min_alpha = model_ug_dbow.alpha

100%|██████████| 24783/24783 [00:00<00:00, 2748114.63it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3167487.46it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3303484.27it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3373711.86it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3328341.58it/s]
100%|██████████| 24783/24783 [00:00<00:00, 2543118.76it/s]
100%|██████████| 24783/24783 [00:00<00:00, 2466716.56it/s]
100%|██████████| 24783/24783 [00:00<00:00, 2971200.12it/s]
100%|██████████| 24783/24783 [00:00<00:00, 2778377.46it/s]
100%|██████████| 24783/24783 [00:00<00:00, 2581262.38it/s]
100%|██████████| 24783/24783 [00:00<00:00, 2695242.98it/s]
100%|██████████| 24783/24783 [00:00<00:00, 2827886.07it/s]
100%|██████████| 24783/24783 [00:00<00:00, 2687647.02it/s]
100%|██████████| 24783/24783 [00:00<00:00, 2388498.07it/s]
100%|██████████| 24783/24783 [00:00<00:00, 2247074.86it/s]
100%|██████████| 24783/24783 [00:00<00:00, 2854522.48it/s]
100%|██████████| 24783/24783 [00:00<00:00, 2657686.54it/

In [25]:
def get_vectors(model, corpus, size):
    vecs = np.zeros((len(corpus), size))
    n = 0
    for i in corpus.index:
        prefix = 'all_' + str(i)
        vecs[n] = model.docvecs[prefix]
        n += 1
    return vecs

In [26]:
train_vecs_dbow = get_vectors(model_ug_dbow, X_tr, 100)
validation_vecs_dbow = get_vectors(model_ug_dbow, X_val, 100)

In [29]:
clf = LogisticRegression()
clf.fit(train_vecs_dbow, y_tr)

LogisticRegression()

In [31]:
clf.score(validation_vecs_dbow, y_val)

0.9031476997578692

In [32]:
model_ug_dbow.save('../data/d2v_model_ug_dbow.doc2vec')
model_ug_dbow = Doc2Vec.load('../data/d2v_model_ug_dbow.doc2vec')

In [33]:
model_ug_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

## DMC (Distributed Memory Concatenated) 

In [34]:
cores = multiprocessing.cpu_count()
model_ug_dmc = Doc2Vec(dm=1, dm_concat=1, size=100, window=2, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_dmc.build_vocab([x for x in tqdm(all_x_w2v)])

100%|██████████| 24783/24783 [00:00<00:00, 2682099.19it/s]


In [35]:
%%time
for epoch in range(30):
    model_ug_dmc.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_dmc.alpha -= 0.002
    model_ug_dmc.min_alpha = model_ug_dmc.alpha

100%|██████████| 24783/24783 [00:00<00:00, 2425901.10it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3418761.26it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3300861.71it/s]
100%|██████████| 24783/24783 [00:00<00:00, 2704077.31it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3195825.99it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3389442.94it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3081293.49it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3501328.35it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3198677.91it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3065301.41it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3226477.82it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3557163.64it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3405767.70it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3332396.24it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3332396.24it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3516846.64it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3531303.03it/

In [36]:
model_ug_dmc.save('../data/d2v_model_ug_dmc.doc2vec')
model_ug_dmc = Doc2Vec.load('../data/d2v_model_ug_dmc.doc2vec')

In [42]:
model_ug_dmc.most_similar('jew')

KeyError: "word 'jew' not in vocabulary"

In [43]:
train_vecs_dmc = get_vectors(model_ug_dmc, X_tr, 100)
validation_vecs_dmc = get_vectors(model_ug_dmc, X_val, 100)

In [44]:
clf = LogisticRegression()
clf.fit(train_vecs_dmc, y_tr)

LogisticRegression()

In [45]:
clf.score(validation_vecs_dmc, y_val)

0.9408124831853646

In [46]:
model_ug_dmc.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

## DMM (Distributed Memory Mean)

In [47]:
cores = multiprocessing.cpu_count()
model_ug_dmm = Doc2Vec(dm=1, dm_mean=1, size=100, window=4, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_dmm.build_vocab([x for x in tqdm(all_x_w2v)])

100%|██████████| 24783/24783 [00:00<00:00, 2293025.59it/s]


In [48]:
%%time
for epoch in range(30):
    model_ug_dmm.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_dmm.alpha -= 0.002
    model_ug_dmm.min_alpha = model_ug_dmm.alpha

100%|██████████| 24783/24783 [00:00<00:00, 2419520.41it/s]
100%|██████████| 24783/24783 [00:00<00:00, 2704992.09it/s]
100%|██████████| 24783/24783 [00:00<00:00, 2637389.59it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3378756.25it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3129627.15it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3593936.87it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3669941.96it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3332396.24it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3539961.72it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3405209.85it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3627043.37it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3317612.54it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3419773.52it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3398529.92it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3641273.55it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3394867.11it/s]
100%|██████████| 24783/24783 [00:00<00:00, 3600285.26it/

In [49]:
model_ug_dmm.save('../data/d2v_model_ug_dmm.doc2vec')
model_ug_dmm = Doc2Vec.load('../data/d2v_model_ug_dmm.doc2vec')

In [50]:
model_ug_dmm.most_similar('nigger')

KeyError: "word 'nigger' not in vocabulary"

In [51]:
train_vecs_dmm = get_vectors(model_ug_dmm, X_tr, 100)
validation_vecs_dmm = get_vectors(model_ug_dmm, X_val, 100)

In [52]:
clf = LogisticRegression()
clf.fit(train_vecs_dmm, y_tr)

LogisticRegression()

In [53]:
clf.score(validation_vecs_dmm, y_val)

0.9174065106268496

# ANN with Tfidf Vectorizer

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
tvec1 = TfidfVectorizer(max_features=100000,ngram_range=(1, 3))
tvec1.fit(X_tr)

TfidfVectorizer(max_features=100000, ngram_range=(1, 3))

In [55]:
x_train_tfidf = tvec1.transform(X_tr)

In [56]:
x_validation_tfidf = tvec1.transform(X_val).toarray()

In [57]:
%%time
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(x_train_tfidf, y_tr)

CPU times: user 3.95 s, sys: 2.09 s, total: 6.04 s
Wall time: 1.81 s


LogisticRegression()

In [58]:
clf.score(x_validation_tfidf, y_val)

0.944578961528114

In [59]:
seed = 42
np.random.seed(seed)
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [63]:
def batch_generator(X_data, y_data, batch_size):
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch/batch_size
    counter=0
    index = np.arange(np.shape(y_data)[0])
    while 1:
        index_batch = index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X_data[index_batch,:].toarray()
        y_batch = y_data[y_data.index[index_batch]]
        counter += 1
        yield np.array(X_batch), np.array(y_batch)
        if (counter > number_of_batches):
            counter=0

In [65]:
%%time
model = Sequential()
model.add(Dense(64, activation = 'relu', input_dim = 100000))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'adam',
              loss = 'binary_crossentropy',
              metrics = ['accuracy'])

model.fit_generator(generator = batch_generator(x_train_tfidf, y_tr, 32),
                    epochs = 10, 
                    random_state = 42, 
                    validation_data = (x_validation_tfidf, y_val),
                    steps_per_epoch = x_train_tfidf.shape[0]/32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 16min 6s, sys: 1min 52s, total: 17min 58s
Wall time: 3min 39s


<tensorflow.python.keras.callbacks.History at 0x7f825d08a128>

## Normalizing Inputs

In [66]:
from sklearn.preprocessing import Normalizer
norm = Normalizer().fit(x_train_tfidf)
x_train_tfidf_norm = norm.transform(x_train_tfidf)
x_validation_tfidf_norm = norm.transform(x_validation_tfidf)

In [69]:
%%time
model_n = Sequential()
model_n.add(Dense(64, activation = 'relu', input_dim = 100000))
model_n.add(Dense(1, activation ='sigmoid'))
model_n.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

model_n.fit_generator(generator = batch_generator(x_train_tfidf_norm, y_tr, 32),
                      epochs = 10, 
                      validation_data = (x_validation_tfidf_norm, y_val),
                      steps_per_epoch = x_train_tfidf_norm.shape[0]/32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 16min 14s, sys: 1min 56s, total: 18min 10s
Wall time: 3min 33s


<tensorflow.python.keras.callbacks.History at 0x7f80cfab0f98>

## Using Dropout for Overfitting

In [71]:
model1 = Sequential()
model1.add(Dense(64, activation='relu', input_dim=100000))
model1.add(Dropout(0.2))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model1.fit_generator(generator=batch_generator(x_train_tfidf, y_tr, 32),
                    epochs=10, validation_data=(x_validation_tfidf, y_val),
                    steps_per_epoch=x_train_tfidf.shape[0]/32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f80cf965eb8>

## Shuffling Data

In [72]:
def batch_generator_shuffle(X_data, y_data, batch_size):
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch/batch_size
    counter=0
    index = np.arange(np.shape(y_data)[0])
    np.random.shuffle(index)
    while 1:
        index_batch = index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X_data[index_batch,:].toarray()
        y_batch = y_data[y_data.index[index_batch]]
        counter += 1
        yield np.array(X_batch), np.array(y_batch)
        if (counter > number_of_batches):
            np.random.shuffle(index)
            counter=0

In [73]:
%%time
model_s = Sequential()
model_s.add(Dense(64, activation='relu', input_dim=100000))
model_s.add(Dense(1, activation='sigmoid'))
model_s.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model_s.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_tr, 32),
                    epochs=10, validation_data=(x_validation_tfidf, y_val),
                    steps_per_epoch=x_train_tfidf.shape[0]/32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 15min 44s, sys: 1min 52s, total: 17min 37s
Wall time: 3min 58s


<tensorflow.python.keras.callbacks.History at 0x7f8013777cf8>

## Shuffle and Dropout

In [74]:
%%time
model_s_1 = Sequential()
model_s_1.add(Dense(64, activation='relu', input_dim=100000))
model_s_1.add(Dropout(0.2))
model_s_1.add(Dense(1, activation='sigmoid'))
model_s_1.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model_s_1.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_tr, 32),
                    epochs=10, validation_data=(x_validation_tfidf, y_val),
                    steps_per_epoch=x_train_tfidf.shape[0]/32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 16min 3s, sys: 1min 59s, total: 18min 2s
Wall time: 3min 58s


<tensorflow.python.keras.callbacks.History at 0x7f7fbf528518>

## Learning Rate

In [76]:
%%time
import keras
custom_adam = keras.optimizers.Adam(lr=0.005, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
model_testing_2 = Sequential()
model_testing_2.add(Dense(64, activation='relu', input_dim=100000))
model_testing_2.add(Dense(1, activation='sigmoid'))
model_testing_2.compile(optimizer=custom_adam,
              loss='binary_crossentropy',
              metrics=['accuracy'])

model_testing_2.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_tr, 32),
                    epochs=10, validation_data=(x_validation_tfidf, y_val),
                    steps_per_epoch=x_train_tfidf.shape[0]/32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 16min 32s, sys: 2min 8s, total: 18min 41s
Wall time: 5min 8s


<tensorflow.python.keras.callbacks.History at 0x7f7efc4f9cf8>

In [77]:
%%time
custom_adam = keras.optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
model_testing_3 = Sequential()
model_testing_3.add(Dense(64, activation='relu', input_dim=100000))
model_testing_3.add(Dense(1, activation='sigmoid'))
model_testing_3.compile(optimizer=custom_adam,
              loss='binary_crossentropy',
              metrics=['accuracy'])

model_testing_3.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_tr, 32),
                    epochs=10, validation_data=(x_validation_tfidf, y_val),
                    steps_per_epoch=x_train_tfidf.shape[0]/32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 16min 22s, sys: 2min 9s, total: 18min 31s
Wall time: 4min 58s


<tensorflow.python.keras.callbacks.History at 0x7f7fbf53ae10>

In [78]:
%%time
custom_adam = keras.optimizers.Adam(lr=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
model_testing_4 = Sequential()
model_testing_4.add(Dense(64, activation='relu', input_dim=100000))
model_testing_4.add(Dense(1, activation='sigmoid'))
model_testing_4.compile(optimizer=custom_adam,
              loss='binary_crossentropy',
              metrics=['accuracy'])

model_testing_4.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_tr, 32),
                              epochs=10, 
                              validation_data=(x_validation_tfidf, y_val),
                              steps_per_epoch=x_train_tfidf.shape[0]/32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 16min 38s, sys: 2min 21s, total: 19min
Wall time: 5min 42s


<tensorflow.python.keras.callbacks.History at 0x7f7f004954a8>

In [79]:
%%time
custom_adam = keras.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
model_testing_5 = Sequential()
model_testing_5.add(Dense(64, activation='relu', input_dim=100000))
model_testing_5.add(Dense(1, activation='sigmoid'))
model_testing_5.compile(optimizer=custom_adam,
                        loss='binary_crossentropy',
                        metrics=['accuracy'])

model_testing_5.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_tr, 32),
                              epochs=10, 
                              validation_data=(x_validation_tfidf, y_val),
                              steps_per_epoch=x_train_tfidf.shape[0]/32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 17min 4s, sys: 2min 30s, total: 19min 34s
Wall time: 6min 19s


<tensorflow.python.keras.callbacks.History at 0x7f7fb38cf898>

## Increasing number of hidden nodes

In [80]:
%%time
model_s_2 = Sequential()
model_s_2.add(Dense(128, activation='relu', input_dim=100000))
model_s_2.add(Dense(1, activation='sigmoid'))
model_s_2.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model_s_2.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_tr, 32),
                        epochs=10, 
                        validation_data=(x_validation_tfidf, y_val),
                        steps_per_epoch=x_train_tfidf.shape[0]/32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 29min 22s, sys: 2min 55s, total: 32min 18s
Wall time: 9min 18s


<tensorflow.python.keras.callbacks.History at 0x7f7ef5128cc0>