# Modeling Notebook with Advanced NLP Techniques

In [55]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
import pickle
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from sklearn.model_selection import train_test_split
from sklearn import preprocessing, utils
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.preprocessing.text import Tokenizer
from keras import models, layers, optimizers
from keras.utils.np_utils import to_categorical

import multiprocessing
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import LabeledSentence, TaggedDocument



In [16]:
# Load the pre-split train / validation / test frames.
# Context managers close each file handle as soon as it has been read.
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by this project.
with open("../pickle/train.pickle", "rb") as fh:
    train = pickle.load(fh)
with open("../pickle/val.pickle", "rb") as fh:
    val = pickle.load(fh)
with open("../pickle/test.pickle", "rb") as fh:
    test = pickle.load(fh)

In [17]:
# Peek at the first rows of the training frame (columns: tweet, target, tweet2).
train.head()

Unnamed: 0,tweet,target,tweet2
0,"[reject, constantly, house, threaten, rape, mo...",1,"'reject', 'constantly', 'house', 'threaten', '..."
1,"[convince, lame, nigger, liver, believe, cuz, ...",1,"'convince', 'lame', 'nigger', 'liver', 'believ..."
2,"[peace, fag, remember, best, lux, support, dro...",1,"'peace', 'fag', 'remember', 'best', 'lux', 'su..."
3,"[haha, ight, nig, calm, yoself]",1,"'haha', 'ight', 'nig', 'calm', 'yoself'"
4,"[tits, better, look, face, make, like, asian, ...",1,"'tits', 'better', 'look', 'face', 'make', 'lik..."


In [18]:
# Peek at the first rows of the validation frame (same columns as train).
val.head()

Unnamed: 0,tweet,target,tweet2
0,"[lbum, fotos, gaywrites, make, project, queer,...",1,"'lbum', 'fotos', 'gaywrites', 'make', 'project..."
1,"[yay, america, israel, jew, hat, muslim, trash...",1,"'yay', 'america', 'israel', 'jew', 'hat', 'mus..."
2,"[miss, ofay, friends, day, scar, recent, happe...",1,"'miss', 'ofay', 'friends', 'day', 'scar', 'rec..."
3,"[trash, darkskin, nigga, steal, damn, garbage]",1,"'trash', 'darkskin', 'nigga', 'steal', 'damn',..."
4,"[cody, call, people, nigger, hes, fuck, spaz]",1,"'cody', 'call', 'people', 'nigger', 'hes', 'fu..."


In [29]:
# Human-readable label used as the Doc2Vec tag: 1 -> 'Hate', 0 -> 'Not_Hate'.
# Mapping in one step avoids creating a float NaN column and then writing
# strings into it (an incompatible-dtype assignment that recent pandas
# deprecates). Any other target value still ends up as NaN.
train['target2'] = train.target.map({1: 'Hate', 0: 'Not_Hate'})

In [36]:
# Human-readable label used as the Doc2Vec tag: 1 -> 'Hate', 0 -> 'Not_Hate'.
# Mapping in one step avoids creating a float NaN column and then writing
# strings into it (an incompatible-dtype assignment that recent pandas
# deprecates). Any other target value still ends up as NaN.
val['target2'] = val.target.map({1: 'Hate', 0: 'Not_Hate'})

In [37]:
# Inputs are the `tweet` column; labels are the 0/1 `target` column.
X_tr, y_tr = train["tweet"], train["target"]
X_val, y_val = val["tweet"], val["target"]

In [22]:
# Display the training inputs (the `tweet` column of `train`).
X_tr

0        [reject, constantly, house, threaten, rape, mo...
1        [convince, lame, nigger, liver, believe, cuz, ...
2        [peace, fag, remember, best, lux, support, dro...
3                          [haha, ight, nig, calm, yoself]
4        [tits, better, look, face, make, like, asian, ...
                               ...                        
18581                                   [miss, lil, bitch]
18582          [gotta, hoe, smh, aint, captain, save, hoe]
18583                  [lmao, yeah, bitch, lil, shit, rip]
18584                                    [tbt, bad, bitch]
18585                          [hoe, act, know, imma, let]
Name: tweet, Length: 18586, dtype: object

# Doc2Vec


## DBOW (Distributed Bag of Words)

In [30]:

# Wrap every training row as a TaggedDocument: tweet tokens as words, target2 as the tag.
# NOTE(review): `tags` receives a bare string here, so indexing `tags[0]` on it
# yields a single character rather than the label. gensim documents `tags` as a
# list; consider `tags=[x.target2]`, changing train and val together so their
# labels stay consistent.
train_tagged = train.apply(lambda x: TaggedDocument(words=x['tweet'], tags=x.target2), axis=1)
# Spot-check one document.
train_tagged.values[30]

TaggedDocument(words=['retard', 'bruh', 'lol'], tags='Hate')

In [38]:
# Wrap every validation row as a TaggedDocument: tweet tokens as words, target2 as the tag.
# NOTE(review): `tags` receives a bare string here, so indexing `tags[0]` on it
# yields a single character rather than the label. gensim documents `tags` as a
# list; consider `tags=[x.target2]`, changing train and val together so their
# labels stay consistent.
val_tagged = val.apply(lambda x: TaggedDocument(words=x['tweet'], tags=x.target2), axis=1)
# Spot-check one document.
val_tagged.values[30]

TaggedDocument(words=['fucc', 'nicca', 'pose', 'pullin'], tags='Hate')

In [32]:
import multiprocessing

# One worker thread per CPU core.
cores = multiprocessing.cpu_count()

# dm=0 selects DBOW. alpha == min_alpha keeps the learning rate constant inside
# a single train() call; the training cell lowers it by hand between epochs.
model_dbow = Doc2Vec(
    dm=0,
    vector_size=100,
    negative=5,
    hs=0,
    min_count=2,
    workers=cores,
    alpha=0.065,
    min_alpha=0.065,
)

# Build the vocabulary from the tagged training documents.
model_dbow.build_vocab(list(tqdm(train_tagged.values)))

100%|██████████| 18586/18586 [00:00<00:00, 961604.26it/s]


In [33]:
%%time
# 30 single-epoch passes with a hand-rolled schedule: documents are reshuffled
# before each pass and the learning rate drops by 0.002 afterwards
# (0.065 -> ~0.005 over the run).
# The tqdm wrapper was removed: it only timed the list copy, not training, and
# printed 30 meaningless progress bars.
# NOTE(review): gensim's documentation discourages calling train() in a loop
# with manual alpha management; a single train(..., epochs=30) call is the
# recommended form, but it would change the alpha later used by infer_vector.
for epoch in range(30):
    shuffled_docs = utils.shuffle(list(train_tagged.values))
    model_dbow.train(shuffled_docs, total_examples=len(shuffled_docs), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 18586/18586 [00:00<00:00, 1995120.27it/s]
100%|██████████| 18586/18586 [00:00<00:00, 2923178.87it/s]
100%|██████████| 18586/18586 [00:00<00:00, 3134764.92it/s]
100%|██████████| 18586/18586 [00:00<00:00, 2819463.06it/s]
100%|██████████| 18586/18586 [00:00<00:00, 2862532.01it/s]
100%|██████████| 18586/18586 [00:00<00:00, 2653708.27it/s]
100%|██████████| 18586/18586 [00:00<00:00, 2613319.95it/s]
100%|██████████| 18586/18586 [00:00<00:00, 3047391.98it/s]
100%|██████████| 18586/18586 [00:00<00:00, 2532908.80it/s]
100%|██████████| 18586/18586 [00:00<00:00, 3114973.79it/s]
100%|██████████| 18586/18586 [00:00<00:00, 2907587.71it/s]
100%|██████████| 18586/18586 [00:00<00:00, 2841869.93it/s]
100%|██████████| 18586/18586 [00:00<00:00, 3188357.22it/s]
100%|██████████| 18586/18586 [00:00<00:00, 3045368.16it/s]
100%|██████████| 18586/18586 [00:00<00:00, 3189270.31it/s]
100%|██████████| 18586/18586 [00:00<00:00, 2825902.06it/s]
100%|██████████| 18586/18586 [00:00<00:00, 3099738.92it/

In [34]:
def vec_for_learning(model, tagged_docs):
    """Infer a vector for every tagged document and pair it with its label.

    Parameters
    ----------
    model
        Object exposing ``infer_vector(words, steps=...)`` (a trained Doc2Vec
        model in this notebook).
    tagged_docs
        Object whose ``.values`` iterates over documents that have ``words``
        and ``tags`` attributes (a pandas Series of TaggedDocument here).

    Returns
    -------
    tuple
        ``(targets, regressors)``: two equal-length tuples holding each
        document's label and its inferred vector. Both are empty when there
        are no documents.
    """
    targets, regressors = [], []
    for doc in tagged_docs.values:
        tags = doc.tags
        # A bare string tag ('Hate') must be used whole: tags[0] would be just
        # its first character. List/tuple tags contribute their first entry.
        targets.append(tags if isinstance(tags, str) else tags[0])
        regressors.append(model.infer_vector(doc.words, steps=20))
    return tuple(targets), tuple(regressors)

In [40]:
# Rebind X_tr / X_val to DBOW-inferred vectors and y_tr / y_val to the labels
# taken from the document tags (this replaces the earlier column-based values).
y_tr, X_tr = vec_for_learning(model_dbow, train_tagged)
y_val, X_val = vec_for_learning(model_dbow, val_tagged)

In [41]:
# Logistic regression on the DBOW vectors. In scikit-learn C is the inverse
# regularization strength, so C=1e5 is a very weak penalty.
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_tr, y_tr)

LogisticRegression(C=100000.0, n_jobs=1)

In [42]:
# Predicted labels for the validation vectors.
y_pred = logreg.predict(X_val)

In [43]:
# Validation accuracy of the DBOW + logistic regression model.
logreg.score(X_val, y_val)

0.9418886198547215

In [56]:
# Same accuracy computed from the stored predictions (cross-check of score()).
accuracy_score(y_val, y_pred)

0.9418886198547215

In [44]:
# Persist the trained DBOW model, then reload it from disk so the in-memory
# object is the one restored from the saved file.
model_dbow.save('../data/d2v_model_dbow.doc2vec')
model_dbow = Doc2Vec.load('../data/d2v_model_dbow.doc2vec')

In [45]:
# Drop training-only state; per the keyword flags, doc-tag vectors and
# inference support are kept.
# NOTE(review): presumably a gensim 3.x-only API -- confirm before upgrading gensim.
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

## DMM (Distributed Memory Mean)

In [46]:
# One worker thread per CPU core.
cores = multiprocessing.cpu_count()

# dm=1 with dm_mean=1 selects distributed memory with averaged context vectors.
# alpha == min_alpha keeps the learning rate constant inside a single train()
# call; the training cell lowers it by hand between epochs.
model_dmm = Doc2Vec(
    dm=1,
    dm_mean=1,
    vector_size=100,
    window=10,
    negative=5,
    min_count=1,
    workers=cores,
    alpha=0.065,
    min_alpha=0.065,
)

# Build the vocabulary from the tagged training documents.
model_dmm.build_vocab(list(tqdm(train_tagged.values)))

100%|██████████| 18586/18586 [00:00<00:00, 1954600.56it/s]


In [49]:
%%time
# 30 single-epoch passes with a hand-rolled schedule: documents are reshuffled
# before each pass and the learning rate drops by 0.002 afterwards
# (0.065 -> ~0.005 over the run).
# The tqdm wrapper was removed: it only timed the list copy, not training, and
# printed 30 meaningless progress bars.
# NOTE(review): gensim's documentation discourages calling train() in a loop
# with manual alpha management; a single train(..., epochs=30) call is the
# recommended form, but it would change the alpha later used by infer_vector.
for epoch in range(30):
    shuffled_docs = utils.shuffle(list(train_tagged.values))
    model_dmm.train(shuffled_docs, total_examples=len(shuffled_docs), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|██████████| 18586/18586 [00:00<00:00, 2076427.94it/s]
100%|██████████| 18586/18586 [00:00<00:00, 2755579.15it/s]
100%|██████████| 18586/18586 [00:00<00:00, 3265828.83it/s]
100%|██████████| 18586/18586 [00:00<00:00, 3102329.44it/s]
100%|██████████| 18586/18586 [00:00<00:00, 3038838.89it/s]
100%|██████████| 18586/18586 [00:00<00:00, 2759969.34it/s]
100%|██████████| 18586/18586 [00:00<00:00, 3041921.96it/s]
100%|██████████| 18586/18586 [00:00<00:00, 3170075.81it/s]
100%|██████████| 18586/18586 [00:00<00:00, 3118837.13it/s]
100%|██████████| 18586/18586 [00:00<00:00, 2825902.06it/s]
100%|██████████| 18586/18586 [00:00<00:00, 2849661.29it/s]
100%|██████████| 18586/18586 [00:00<00:00, 2995286.80it/s]
100%|██████████| 18586/18586 [00:00<00:00, 3290643.06it/s]
100%|██████████| 18586/18586 [00:00<00:00, 3121584.68it/s]
100%|██████████| 18586/18586 [00:00<00:00, 3144755.10it/s]
100%|██████████| 18586/18586 [00:00<00:00, 2391708.11it/s]
100%|██████████| 18586/18586 [00:00<00:00, 2946825.97it/

In [50]:
# Persist the trained DMM model, then reload it from disk so the in-memory
# object is the one restored from the saved file.
model_dmm.save('../data/d2v_model_dmm.doc2vec')
model_dmm = Doc2Vec.load('../data/d2v_model_dmm.doc2vec')

In [52]:
# Nearest neighbours of a term in the DMM word-vector space. Word-level queries
# go through the model's `wv` keyed vectors; the model-level `most_similar`
# shortcut is deprecated in gensim 3.x and absent from later releases.
model_dmm.wv.most_similar('nigger')

[('niggers', 0.6091761589050293),
 ('scully', 0.5996459722518921),
 ('strength', 0.5672992467880249),
 ('mariners', 0.5286785364151001),
 ('vin', 0.5232475996017456),
 ('bedroom', 0.5231317281723022),
 ('tbird', 0.49917277693748474),
 ('unroll', 0.49384060502052307),
 ('devil', 0.49172112345695496),
 ('uncalled', 0.4915260970592499)]

In [53]:
# Rebind X_tr / X_val to DMM-inferred vectors and y_tr / y_val to the labels
# taken from the document tags (this replaces the DBOW values).
y_tr, X_tr = vec_for_learning(model_dmm, train_tagged)
y_val, X_val = vec_for_learning(model_dmm, val_tagged)

In [52]:
# Default-parameter logistic regression on the DMM vectors, followed by the
# predicted validation labels (displayed as the cell output).
logreg = LogisticRegression().fit(X_tr, y_tr)
logreg.predict(X_val)

LogisticRegression()

In [53]:
# Validation accuracy of the DMM + logistic regression model. The previous call
# used `clf` and `validation_vecs_dmm`, neither of which exists at this point
# in a top-to-bottom run; the DMM classifier and vectors are `logreg` / `X_val`.
logreg.score(X_val, y_val)

0.9174065106268496

# ANN with TF-IDF Vectorizer

In [54]:

# Rebuild text inputs and labels from the source frames. By this point
# X_tr / X_val hold Doc2Vec vectors and y_tr / y_val hold tag-derived labels,
# which TfidfVectorizer (needs strings) and the batch generators below (index
# y like a pandas Series) cannot consume.
# Each tweet is a list of tokens, so join it into one space-separated document.
# NOTE(review): the recorded scores may have been produced from a different
# text column (e.g. tweet2) -- confirm.
X_tr = train.tweet.str.join(' ')
X_val = val.tweet.str.join(' ')
y_tr = train.target
y_val = val.target

# Uni- to tri-gram TF-IDF, capped at the 100,000 most frequent features.
tvec1 = TfidfVectorizer(max_features=100000, ngram_range=(1, 3))
tvec1.fit(X_tr)

TfidfVectorizer(max_features=100000, ngram_range=(1, 3))

In [55]:
# TF-IDF features for the training inputs; the batch generators later call
# .toarray() on slices of this, so it is kept as returned by transform().
x_train_tfidf = tvec1.transform(X_tr)

In [56]:
# TF-IDF features for the validation inputs, densified with .toarray() because
# they are passed whole as validation_data to the Keras models below.
# NOTE(review): rows x up-to-100000 dense floats -- watch memory on larger splits.
x_validation_tfidf = tvec1.transform(X_val).toarray()

In [57]:
%%time
# Baseline: default logistic regression on the TF-IDF features.
# (LogisticRegression is already imported in the first cell; this re-import is redundant.)
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(x_train_tfidf, y_tr)

CPU times: user 3.95 s, sys: 2.09 s, total: 6.04 s
Wall time: 1.81 s


LogisticRegression()

In [58]:
# Validation accuracy of the TF-IDF + logistic regression baseline.
clf.score(x_validation_tfidf, y_val)

0.944578961528114

In [59]:
# Seed NumPy's global RNG; batch_generator_shuffle draws its permutations from it.
# NOTE(review): only NumPy is seeded here -- Keras/TensorFlow weight
# initialisation is not, so runs are not fully reproducible.
seed = 42
np.random.seed(seed)
# Keras building blocks for the dense networks below. Flatten, Embedding and
# sequence are not used by any cell visible in this notebook.
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [63]:
def batch_generator(X_data, y_data, batch_size):
    """Yield ``(X_batch, y_batch)`` NumPy pairs forever, in fixed row order.

    Parameters
    ----------
    X_data
        2-D feature matrix supporting ``X_data[rows, :]``. Sparse matrices are
        densified one batch at a time via ``.toarray()``.
    y_data
        Labels aligned row-by-row with ``X_data`` (pandas Series, NumPy array
        or any sequence). Labels are taken by position.
    batch_size : int
        Rows per batch; the last batch of an epoch may be smaller.

    Yields
    ------
    tuple of numpy.ndarray
        Dense features and labels for the next batch. After the final batch of
        an epoch the generator wraps around to the first rows.

    Raises
    ------
    ValueError
        On first iteration, if ``batch_size`` is not positive, the data is
        empty, or ``X_data`` and ``y_data`` differ in length.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    labels = np.asarray(y_data)
    n_samples = X_data.shape[0]
    if n_samples == 0:
        raise ValueError("cannot generate batches from empty data")
    if labels.shape[0] != n_samples:
        raise ValueError("X_data and y_data must have the same number of rows")

    # Ceiling division: exactly the number of batches needed to cover every
    # row. The previous float comparison produced one extra, empty batch per
    # epoch whenever n_samples was an exact multiple of batch_size.
    n_batches = (n_samples + batch_size - 1) // batch_size
    order = np.arange(n_samples)
    while True:
        for batch_no in range(n_batches):
            rows = order[batch_size * batch_no:batch_size * (batch_no + 1)]
            X_batch = X_data[rows, :]
            if hasattr(X_batch, "toarray"):
                X_batch = X_batch.toarray()
            yield np.asarray(X_batch), labels[rows]

In [65]:
%%time
# Baseline network: one 64-unit ReLU hidden layer over the TF-IDF features and
# a sigmoid output for the binary target.
n_features = x_train_tfidf.shape[1]  # actual feature count; may be below max_features
batch_size = 32
# steps_per_epoch is documented as an integer; round up so the final partial
# batch of each epoch is consumed.
steps_per_epoch = int(np.ceil(x_train_tfidf.shape[0] / batch_size))

model = Sequential()
model.add(Dense(64, activation='relu', input_dim=n_features))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# `random_state` is not a fit_generator argument (it raises TypeError), so it
# was dropped.
model.fit_generator(generator=batch_generator(x_train_tfidf, y_tr, batch_size),
                    epochs=10,
                    validation_data=(x_validation_tfidf, y_val),
                    steps_per_epoch=steps_per_epoch)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 16min 6s, sys: 1min 52s, total: 17min 58s
Wall time: 3min 39s


<tensorflow.python.keras.callbacks.History at 0x7f825d08a128>

## Normalizing Inputs

In [66]:
# Rescale every sample (row) to unit norm with scikit-learn's Normalizer.
# NOTE(review): tvec1 is built without a `norm` argument and TfidfVectorizer's
# documented default is norm='l2', so these rows are presumably already
# unit-length and this step may be a no-op -- confirm.
from sklearn.preprocessing import Normalizer
norm = Normalizer().fit(x_train_tfidf)
x_train_tfidf_norm = norm.transform(x_train_tfidf)
x_validation_tfidf_norm = norm.transform(x_validation_tfidf)

In [69]:
%%time
# Same architecture as the baseline, trained on the row-normalised features.
n_features = x_train_tfidf_norm.shape[1]  # actual feature count; may be below max_features
batch_size = 32
# steps_per_epoch is documented as an integer; round up so the final partial
# batch of each epoch is consumed.
steps_per_epoch = int(np.ceil(x_train_tfidf_norm.shape[0] / batch_size))

model_n = Sequential()
model_n.add(Dense(64, activation='relu', input_dim=n_features))
model_n.add(Dense(1, activation='sigmoid'))
model_n.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

model_n.fit_generator(generator=batch_generator(x_train_tfidf_norm, y_tr, batch_size),
                      epochs=10,
                      validation_data=(x_validation_tfidf_norm, y_val),
                      steps_per_epoch=steps_per_epoch)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 16min 14s, sys: 1min 56s, total: 18min 10s
Wall time: 3min 33s


<tensorflow.python.keras.callbacks.History at 0x7f80cfab0f98>

## Using Dropout to Reduce Overfitting

In [71]:
# Baseline architecture plus 20% dropout after the hidden layer.
n_features = x_train_tfidf.shape[1]  # actual feature count; may be below max_features
batch_size = 32
# steps_per_epoch is documented as an integer; round up so the final partial
# batch of each epoch is consumed.
steps_per_epoch = int(np.ceil(x_train_tfidf.shape[0] / batch_size))

model1 = Sequential()
model1.add(Dense(64, activation='relu', input_dim=n_features))
model1.add(Dropout(0.2))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(optimizer='adam',
               loss='binary_crossentropy',
               metrics=['accuracy'])

model1.fit_generator(generator=batch_generator(x_train_tfidf, y_tr, batch_size),
                     epochs=10,
                     validation_data=(x_validation_tfidf, y_val),
                     steps_per_epoch=steps_per_epoch)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f80cf965eb8>

## Shuffling Data

In [72]:
def batch_generator_shuffle(X_data, y_data, batch_size):
    """Yield ``(X_batch, y_batch)`` NumPy pairs forever, reshuffled every epoch.

    Parameters
    ----------
    X_data
        2-D feature matrix supporting ``X_data[rows, :]``. Sparse matrices are
        densified one batch at a time via ``.toarray()``.
    y_data
        Labels aligned row-by-row with ``X_data`` (pandas Series, NumPy array
        or any sequence). Labels are taken by position.
    batch_size : int
        Rows per batch; the last batch of an epoch may be smaller.

    Yields
    ------
    tuple of numpy.ndarray
        Dense features and labels for the next batch. Every epoch visits each
        row exactly once, in a fresh random order drawn from NumPy's global
        RNG (``np.random.shuffle``).

    Raises
    ------
    ValueError
        On first iteration, if ``batch_size`` is not positive, the data is
        empty, or ``X_data`` and ``y_data`` differ in length.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    labels = np.asarray(y_data)
    n_samples = X_data.shape[0]
    if n_samples == 0:
        raise ValueError("cannot generate batches from empty data")
    if labels.shape[0] != n_samples:
        raise ValueError("X_data and y_data must have the same number of rows")

    # Ceiling division: exactly the number of batches needed to cover every
    # row. The previous float comparison produced one extra, empty batch per
    # epoch whenever n_samples was an exact multiple of batch_size.
    n_batches = (n_samples + batch_size - 1) // batch_size
    order = np.arange(n_samples)
    while True:
        # New permutation at the start of every epoch.
        np.random.shuffle(order)
        for batch_no in range(n_batches):
            rows = order[batch_size * batch_no:batch_size * (batch_no + 1)]
            X_batch = X_data[rows, :]
            if hasattr(X_batch, "toarray"):
                X_batch = X_batch.toarray()
            yield np.asarray(X_batch), labels[rows]

In [73]:
%%time
# Baseline architecture trained on batches that are reshuffled every epoch.
n_features = x_train_tfidf.shape[1]  # actual feature count; may be below max_features
batch_size = 32
# steps_per_epoch is documented as an integer; round up so the final partial
# batch of each epoch is consumed.
steps_per_epoch = int(np.ceil(x_train_tfidf.shape[0] / batch_size))

model_s = Sequential()
model_s.add(Dense(64, activation='relu', input_dim=n_features))
model_s.add(Dense(1, activation='sigmoid'))
model_s.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

model_s.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_tr, batch_size),
                      epochs=10,
                      validation_data=(x_validation_tfidf, y_val),
                      steps_per_epoch=steps_per_epoch)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 15min 44s, sys: 1min 52s, total: 17min 37s
Wall time: 3min 58s


<tensorflow.python.keras.callbacks.History at 0x7f8013777cf8>

## Shuffle and Dropout

In [74]:
%%time
# Shuffled batches combined with 20% dropout after the hidden layer.
n_features = x_train_tfidf.shape[1]  # actual feature count; may be below max_features
batch_size = 32
# steps_per_epoch is documented as an integer; round up so the final partial
# batch of each epoch is consumed.
steps_per_epoch = int(np.ceil(x_train_tfidf.shape[0] / batch_size))

model_s_1 = Sequential()
model_s_1.add(Dense(64, activation='relu', input_dim=n_features))
model_s_1.add(Dropout(0.2))
model_s_1.add(Dense(1, activation='sigmoid'))
model_s_1.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

model_s_1.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_tr, batch_size),
                        epochs=10,
                        validation_data=(x_validation_tfidf, y_val),
                        steps_per_epoch=steps_per_epoch)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 16min 3s, sys: 1min 59s, total: 18min 2s
Wall time: 3min 58s


<tensorflow.python.keras.callbacks.History at 0x7f7fbf528518>

## Learning Rate

In [76]:
%%time
import keras

# Learning-rate experiment: Adam with lr=0.005, shuffled batches.
n_features = x_train_tfidf.shape[1]  # actual feature count; may be below max_features
batch_size = 32
# steps_per_epoch is documented as an integer; round up so the final partial
# batch of each epoch is consumed.
steps_per_epoch = int(np.ceil(x_train_tfidf.shape[0] / batch_size))

custom_adam = keras.optimizers.Adam(lr=0.005, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
model_testing_2 = Sequential()
model_testing_2.add(Dense(64, activation='relu', input_dim=n_features))
model_testing_2.add(Dense(1, activation='sigmoid'))
model_testing_2.compile(optimizer=custom_adam,
                        loss='binary_crossentropy',
                        metrics=['accuracy'])

model_testing_2.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_tr, batch_size),
                              epochs=10,
                              validation_data=(x_validation_tfidf, y_val),
                              steps_per_epoch=steps_per_epoch)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 16min 32s, sys: 2min 8s, total: 18min 41s
Wall time: 5min 8s


<tensorflow.python.keras.callbacks.History at 0x7f7efc4f9cf8>

In [77]:
%%time
# Learning-rate experiment: Adam with lr=0.01, shuffled batches.
n_features = x_train_tfidf.shape[1]  # actual feature count; may be below max_features
batch_size = 32
# steps_per_epoch is documented as an integer; round up so the final partial
# batch of each epoch is consumed.
steps_per_epoch = int(np.ceil(x_train_tfidf.shape[0] / batch_size))

custom_adam = keras.optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
model_testing_3 = Sequential()
model_testing_3.add(Dense(64, activation='relu', input_dim=n_features))
model_testing_3.add(Dense(1, activation='sigmoid'))
model_testing_3.compile(optimizer=custom_adam,
                        loss='binary_crossentropy',
                        metrics=['accuracy'])

model_testing_3.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_tr, batch_size),
                              epochs=10,
                              validation_data=(x_validation_tfidf, y_val),
                              steps_per_epoch=steps_per_epoch)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 16min 22s, sys: 2min 9s, total: 18min 31s
Wall time: 4min 58s


<tensorflow.python.keras.callbacks.History at 0x7f7fbf53ae10>

In [78]:
%%time
# Learning-rate experiment: Adam with lr=0.1, shuffled batches.
n_features = x_train_tfidf.shape[1]  # actual feature count; may be below max_features
batch_size = 32
# steps_per_epoch is documented as an integer; round up so the final partial
# batch of each epoch is consumed.
steps_per_epoch = int(np.ceil(x_train_tfidf.shape[0] / batch_size))

custom_adam = keras.optimizers.Adam(lr=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
model_testing_4 = Sequential()
model_testing_4.add(Dense(64, activation='relu', input_dim=n_features))
model_testing_4.add(Dense(1, activation='sigmoid'))
model_testing_4.compile(optimizer=custom_adam,
                        loss='binary_crossentropy',
                        metrics=['accuracy'])

model_testing_4.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_tr, batch_size),
                              epochs=10,
                              validation_data=(x_validation_tfidf, y_val),
                              steps_per_epoch=steps_per_epoch)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 16min 38s, sys: 2min 21s, total: 19min
Wall time: 5min 42s


<tensorflow.python.keras.callbacks.History at 0x7f7f004954a8>

In [79]:
%%time
# Learning-rate experiment: Adam with lr=0.0005, shuffled batches.
n_features = x_train_tfidf.shape[1]  # actual feature count; may be below max_features
batch_size = 32
# steps_per_epoch is documented as an integer; round up so the final partial
# batch of each epoch is consumed.
steps_per_epoch = int(np.ceil(x_train_tfidf.shape[0] / batch_size))

custom_adam = keras.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
model_testing_5 = Sequential()
model_testing_5.add(Dense(64, activation='relu', input_dim=n_features))
model_testing_5.add(Dense(1, activation='sigmoid'))
model_testing_5.compile(optimizer=custom_adam,
                        loss='binary_crossentropy',
                        metrics=['accuracy'])

model_testing_5.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_tr, batch_size),
                              epochs=10,
                              validation_data=(x_validation_tfidf, y_val),
                              steps_per_epoch=steps_per_epoch)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 17min 4s, sys: 2min 30s, total: 19min 34s
Wall time: 6min 19s


<tensorflow.python.keras.callbacks.History at 0x7f7fb38cf898>

## Increasing number of hidden nodes

In [80]:
%%time
# Wider hidden layer (128 units instead of 64), shuffled batches, default Adam.
n_features = x_train_tfidf.shape[1]  # actual feature count; may be below max_features
batch_size = 32
# steps_per_epoch is documented as an integer; round up so the final partial
# batch of each epoch is consumed.
steps_per_epoch = int(np.ceil(x_train_tfidf.shape[0] / batch_size))

model_s_2 = Sequential()
model_s_2.add(Dense(128, activation='relu', input_dim=n_features))
model_s_2.add(Dense(1, activation='sigmoid'))
model_s_2.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

model_s_2.fit_generator(generator=batch_generator_shuffle(x_train_tfidf, y_tr, batch_size),
                        epochs=10,
                        validation_data=(x_validation_tfidf, y_val),
                        steps_per_epoch=steps_per_epoch)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 29min 22s, sys: 2min 55s, total: 32min 18s
Wall time: 9min 18s


<tensorflow.python.keras.callbacks.History at 0x7f7ef5128cc0>