In [101]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [2]:
#5300 dimension Glove embeddings
import numpy as np
gloveFile = 'glove.42B.300d.txt'
with open(gloveFile, encoding="utf8" ) as f:
    #content = f.readlines()
    model = {}
    for line in f:
        splitLine = line.split(' ')
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
print ("Done.",len(model)," words loaded!")

Done. 1917494  words loaded!


In [27]:
train_df = pd.read_pickle('train_df.pickle')

In [28]:
train_df = train_df.sample(frac = 1)


In [29]:
train_df['text_split'] = train_df['text'].str.split(' ')

In [30]:
len(set([item for x in train_df['text_split'] for item in x]))

57419

In [31]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [32]:
tokenizer = Tokenizer(num_words= 30000)
tokenizer.fit_on_texts(train_df['text'].values)

In [33]:
embedding_matrix = np.zeros((30000, 300))
for word, index in tokenizer.word_index.items():
    if index > 30000 - 1:
        break
    else:
        if word in model.keys():
            embedding_matrix[index] = model[word]  
        else:
            embedding_matrix[index] = model['unk']  

In [55]:
def make_pu(df,frac):
    positive = df[df['label']==1].sample(n = frac)
    negative = df[df['label']==0].sample(n = frac)
    print(positive)
    selected_examples = positive['comment_id'].values+ negative['comment_id'].values
    
    unlabeled = df[~((df['comment_id'].isin(selected_examples)))]
    total = pd.concat([positive,negative])
    total = total.sample(frac = 1)
    X = total['text'].values
    y = total['label'].values
    
    

    return X,y,unlabeled

In [142]:
X,y,unlabeled = make_pu(train_df,6117)

       comment_id                                               text  \
6072        16765                       Grab em by the pussy mah boi   
22068       38223  Boys like this are literal trash and are the m...   
18131       44521  No place on Broadway for jihadi Muslims like t...   
50708          66  Get ready to be flogged India is ready to flog...   
41370        9993  rBPT Fuck America holding its own people at gu...   
...           ...                                                ...   
79210       28053        @Speech7x7 Deportation is the only cure URL   
3227        15757                           I hope he raped her hard   
14416       14179                                        white trash   
2439         1678                        yeah fuck that stupid whore   
31976         441  Hey Petie kill yourself   EDIT Lol @ all the S...   

       hate_speech_score  label  \
6072                1.72      1   
22068               0.62      1   
18131               0.58      

In [143]:
X = tokenizer.texts_to_sequences(X)

In [144]:
X = pad_sequences(X, padding = 'pre', maxlen = 128)

In [145]:
X[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   32,
          1,  953,   45,  503, 4102, 1977,   48])

In [146]:
C

In [147]:
model_glove = Sequential()
model_glove.add(Embedding(30000,300, input_length=128, weights=[embedding_matrix], trainable=False))
model_glove.add(Dropout(0.2))
model_glove.add(LSTM(32))
model_glove.add(Dense(1, activation='sigmoid'))
opt = Adam(learning_rate = 0.01)
model_glove.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

early_stopping = EarlyStopping(monitor = 'val_loss', patience = 4,restore_best_weights =True)


In [148]:
history = model_glove.fit(X_train,y_train,validation_data=(X_val,y_val),epochs=50,batch_size=64,callbacks = [early_stopping])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


In [149]:
test_ds = pd.read_pickle('test_df.pickle').sample(frac = 1)

In [150]:
X_test = pad_sequences(tokenizer.texts_to_sequences(test_ds['text'].values),  padding = 'pre', maxlen = 128)
y_test = test_ds['label'].values

In [151]:
y_pred = model_glove.predict_classes(X_test)
print("LSTM before EM")
print(classification_report(y_test,y_pred))

LSTM before EM
              precision    recall  f1-score   support

           0       0.80      0.70      0.75      2000
           1       0.73      0.83      0.78      2000

    accuracy                           0.76      4000
   macro avg       0.77      0.76      0.76      4000
weighted avg       0.77      0.76      0.76      4000



In [152]:
unlabeled_indices = unlabeled['comment_id']
unlabeled_text = pad_sequences(tokenizer.texts_to_sequences(unlabeled['text'].values),  padding = 'pre', maxlen = 128)
confidence_list = model_glove.predict(unlabeled_text)
confidence_list[:5]

confidence_list = np.array([i[0] for i in confidence_list])

In [153]:
most_confident_negative = np.where(confidence_list<=0.05 )
most_confident_positive= np.where(confidence_list>=.95 )
x = pd.DataFrame(columns = ['elements','label'])
x['elements'] = most_confident_positive[0]
x = x.reset_index(drop = True)
x['label'] = 1
x
x2= pd.DataFrame(columns = ['elements','label'])
x2['elements'] = most_confident_negative[0]
x2 = x2.reset_index(drop = True)
x2['label'] = 0
x3= pd.concat([x,x2])

x3 = x3.sample(frac= 1)

confident_indices = x3['elements'].values

X_conf = [unlabeled_text[i] for i in confident_indices]

y_conf = x3['label'].values

mask = np.ones(len(unlabeled_text), dtype=bool)
mask[confident_indices] = False
unlabeled_text = unlabeled_text[mask]
X_new =np.concatenate((np.array(X_conf),X_train),axis=0)
y_new = np.concatenate((np.array(y_conf),y_train),axis = 0)

In [154]:
#REtraining 
model_glove = Sequential()
model_glove.add(Embedding(30000,300, input_length=128, weights=[embedding_matrix], trainable=False))
model_glove.add(Dropout(0.2))
model_glove.add(LSTM(32))
model_glove.add(Dense(1, activation='sigmoid'))
new_opt = Adam(learning_rate = 0.001)
model_glove.compile(loss='binary_crossentropy', optimizer=new_opt, metrics=['accuracy'])

model_glove.fit(X_new,y_new,epochs=50,validation_data=(X_val,y_val),batch_size=64,callbacks = [early_stopping])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50


<tensorflow.python.keras.callbacks.History at 0x294d2079c88>

In [155]:
y_pred = model_glove.predict_classes(X_test)
print("LSTM after EM")
print(classification_report(y_test,y_pred))

LSTM after EM
              precision    recall  f1-score   support

           0       0.77      0.73      0.75      2000
           1       0.75      0.79      0.77      2000

    accuracy                           0.76      4000
   macro avg       0.76      0.76      0.76      4000
weighted avg       0.76      0.76      0.76      4000

