In [2]:
import os
import re

import joblib
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

def basic_tokenize(tweet):
    """Lower-case a tweet and keep only letters and basic punctuation.

    Every run of characters other than ASCII letters and ``. , ! ?`` is
    treated as a separator and collapsed to a single space; leading and
    trailing whitespace is then removed. No stemming is applied.

    Args:
        tweet: Raw tweet text (``str``).

    Returns:
        The cleaned, space-separated text as a single ``str``.
    """
    # Use `+`, not `*`: a pattern that can match the empty string makes
    # re.split (Python 3.7+) split between every single character.
    tweet = " ".join(re.split("[^a-zA-Z.,!?]+", tweet.lower())).strip()
    return tweet

In [7]:
df = pd.read_csv("../data/hate_classes.tsv", sep='\t')

# Normalise the raw text before building the vocabulary.
text = df.Text.apply(basic_tokenize)

# filters='' disables the Tokenizer's own character filtering so that the
# punctuation kept by basic_tokenize is not stripped a second time.
tokenizer = Tokenizer(num_words=20000, split=' ', oov_token='<unw>', filters='')
tokenizer.fit_on_texts(text.values)

# Encode each text as a list of word indices, then pad/truncate to 200 ids.
sequences = tokenizer.texts_to_sequences(text.values)
X = pad_sequences(sequences, maxlen=200)

voc = tokenizer.word_index
print('Found %s unique tokens.' % len(voc))

# Load the pre-trained GloVe vectors into a {word: vector} lookup table.
# Each line of the file is "<word> <v1> <v2> ...", separated by whitespace.
embeddings_index = {}
glove_path = os.path.join('./', 'glove.twitter.27B.200d.txt')
# The context manager closes the file even if a line fails to parse; the
# explicit encoding avoids depending on the platform's default codec.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))


Found 20965 unique tokens.
Found 1193515 word vectors.


In [28]:
max_features = 20000
# Word indices in `voc` start at 1 and row 0 is left for the padding id, so
# the matrix needs len(voc) + 1 rows to hold every word when the vocabulary
# is smaller than max_features.
num_words = min(max_features, len(voc) + 1)
print(num_words)

embedding_dim = 200

# Seeded generator so the random vectors given to out-of-vocabulary words are
# identical on every run of the notebook.
rng = np.random.RandomState(42)

# Start from all zeros; row 0 (padding) stays a zero vector.
embedding_matrix = np.zeros((num_words, embedding_dim))

# For each word known to the tokenizer, look up its pre-trained vector.
for word, i in voc.items():
    if i >= num_words:
        # Index falls outside the matrix (word too rare to be kept).
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Word found in the pre-trained embeddings: copy its vector.
        embedding_matrix[i] = embedding_vector
    else:
        # Word missing from the pre-trained embeddings: use a random vector.
        embedding_matrix[i] = rng.randn(embedding_dim)

20000


In [23]:
# Build the target array: one row per example in df, one column per label,
# in the fixed column order listed below.
label_columns = ['REL', 'RAE', 'SXO', 'GEN', 'IDL', 'NAT', 'POL', 'MPH', 'EX', 'IM']
Y = np.array(list(df[label_columns].itertuples(index=False, name=None)))


In [77]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten, Embedding, LSTM
from tensorflow.keras.optimizers import RMSprop

# Stacked-LSTM classifier on top of the frozen pre-trained embeddings.
model = Sequential()
# Non-trainable embedding layer. Its dimensions are read from the weight
# matrix so the layer shape always agrees with the weights it is given.
model.add(Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    weights=[embedding_matrix],
    input_length=200,
    trainable=False,
))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(units=128, return_sequences=True, recurrent_dropout=0.25, dropout=0.25))
model.add(LSTM(units=64, recurrent_dropout=0.1, dropout=0.1))
model.add(Dense(units=32, activation='relu'))
# One independent sigmoid unit per label column of Y.
model.add(Dense(10, activation='sigmoid'))
# binary_crossentropy scores each sigmoid output on its own, which matches the
# sigmoid head and the binary_accuracy metric; categorical_crossentropy would
# treat the 10 outputs as one mutually exclusive distribution.
model.compile(
    optimizer=RMSprop(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)
model.summary()


Model: "sequential_32"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_30 (Embedding)     (None, 200, 200)          4000000   
_________________________________________________________________
lstm_18 (LSTM)               (None, 200, 128)          168448    
_________________________________________________________________
lstm_19 (LSTM)               (None, 64)                49408     
_________________________________________________________________
dense_62 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_63 (Dense)             (None, 10)                330       
Total params: 4,220,266
Trainable params: 220,266
Non-trainable params: 4,000,000
_________________________________________________________________


In [86]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.optimizers import RMSprop

# 1-D convolutional classifier on top of the frozen pre-trained embeddings.
# NOTE: this rebinds `model`, replacing the LSTM model defined in the
# previous cell.
model = Sequential()
# Non-trainable embedding layer. Its dimensions are read from the weight
# matrix so the layer shape always agrees with the weights it is given.
model.add(Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    weights=[embedding_matrix],
    input_length=200,
    trainable=False,
))
model.add(Conv1D(filters=64, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(40, activation='relu'))
# One independent sigmoid unit per label column of Y.
model.add(Dense(10, activation='sigmoid'))
# binary_crossentropy scores each sigmoid output on its own, which matches the
# sigmoid head and the binary_accuracy metric; categorical_crossentropy would
# treat the 10 outputs as one mutually exclusive distribution.
model.compile(
    loss='binary_crossentropy',
    optimizer=RMSprop(learning_rate=0.001),
    metrics=['binary_accuracy'],
)
# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()

Model: "sequential_36"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_34 (Embedding)     (None, 200, 200)          4000000   
_________________________________________________________________
conv1d_20 (Conv1D)           (None, 193, 64)           102464    
_________________________________________________________________
max_pooling1d_19 (MaxPooling (None, 96, 64)            0         
_________________________________________________________________
flatten_21 (Flatten)         (None, 6144)              0         
_________________________________________________________________
dense_70 (Dense)             (None, 40)                245800    
_________________________________________________________________
dense_71 (Dense)             (None, 10)                410       
Total params: 4,348,674
Trainable params: 348,674
Non-trainable params: 4,000,000
_____________________________________

In [87]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the examples; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.1, random_state=42
)

batch_size = 64  # number of examples per mini-batch
epochs = 30

# The held-out split is used as validation data after every epoch.
history = model.fit(
    x_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(x_test, y_test),
    verbose=1,
)

Train on 5830 samples, validate on 648 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [90]:
# Inspect one encoded held-out example: a sequence of word indices padded to
# length 200, with the zero padding at the front (as the output below shows).
print(x_test[1])

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0   243    21     7 14994     8
  1013    48  3885    58  3176  9180     9 14995  1