In [1]:
import csv
import h5py
import numpy as np


from gensim.models.keyedvectors import KeyedVectors
from nltk.tokenize import TweetTokenizer

# Training CSV read by the data-loading cell. Its header row is printed there:
# id, comment_text, toxic, severe_toxic, obscene, threat, insult, identity_hate.
DATA_FILE = './data/train.csv'
# Word-embedding file; it is loaded as word2vec *text* format (binary=False).
# NOTE(review): the "300d" in the file name suggests 300-dimensional vectors,
# which is what the padding and LSTM cells assume — confirm against the file.
W2V_MODEL = './models/w2v.840B.300d.txt'

In [None]:
#

In [2]:
from gensim.models.keyedvectors import KeyedVectors


def load_w2v_model_from_path(model_path, binary_input=False):
    """
    Read word vectors stored in word2vec format from disk.

    :param model_path: location of the word2vec-format file
    :type model_path: string
    :param binary_input: True when the file is in the binary format,
        False (the default) when it is plain text
    :type binary_input: boolean
    :return: the vectors loaded from ``model_path``
    :rtype: KeyedVectors object
    """
    return KeyedVectors.load_word2vec_format(model_path, binary=binary_input)


# Load the embeddings in text format (binary_input keeps its default of False).
# NOTE(review): the name `model` is rebound to the Keras network in the LSTM
# cell further down, so the vectorisation cell must run before that cell.
model = load_w2v_model_from_path(W2V_MODEL)

In [33]:
# Read the whole training CSV into memory: `header` holds the column names and
# `full_data_set` holds one list of strings per data row.
# newline='' is what the csv module asks for so that line breaks inside quoted
# fields are parsed correctly; the encoding is pinned so the result does not
# depend on the platform default.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open(DATA_FILE, newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    header = next(reader)
    full_data_set = list(reader)

# load data into native lists (one list per column, all values still strings)
print(header)
id_data = [row[0] for row in full_data_set]
text_data = [row[1] for row in full_data_set]
toxic_data = [row[2] for row in full_data_set]
severe_toxic_data = [row[3] for row in full_data_set]
obscene_data = [row[4] for row in full_data_set]
threat_data = [row[5] for row in full_data_set]
insult_data = [row[6] for row in full_data_set]
# identity_hate is the eighth column (index 7); reading index 6 here would
# silently duplicate the insult labels.
identity_hate_data = [row[7] for row in full_data_set]

# Per-column view used by the index-removal and train/test-split cells.
# The comment text itself is intentionally not part of this dict.
full_data = {
    'id': id_data,
    'toxic': toxic_data,
    'severe_toxic': severe_toxic_data,
    'obscene': obscene_data,
    'threat': threat_data,
    'insult': insult_data,
    'identity_hate': identity_hate_data,
}

['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [4]:
# Tokenize every comment with NLTK's TweetTokenizer.
# `tokenized_sentences[i]` is the token list for `text_data[i]`, so positions
# stay aligned with the label columns loaded from the CSV.
tknzr = TweetTokenizer()
max_length = 0
tokenized_sentences = []
# The comments live in `text_data`; no variable named `text` is defined
# anywhere in the notebook, so iterating over it fails on a fresh kernel.
for sentence in text_data:
    tokens = tknzr.tokenize(sentence)
    tokenized_sentences.append(tokens)
    # Track the longest comment in tokens (not characters of the raw string),
    # which is the unit the later truncation/padding step works in.
    max_length = max(max_length, len(tokens))


In [8]:
import keras 
#keras.preprocessing.sequence.pad_sequences(sequences, maxlen=None, dtype='int32',
#    padding='pre', truncating='pre', value=0.)


Using TensorFlow backend.
  return f(*args, **kwds)


In [12]:
# Vectorise sentences: turn each token list into a fixed-size matrix of word
# vectors, zero-padded at the end.
# These two sizes must match `timesteps` and `data_dim` in the LSTM cell.
MAX_SENTENCE_TOKENS = 50
EMBEDDING_DIM = 300  # NOTE(review): assumes the loaded vectors are 300-d — confirm

removed_indexes = []       # positions of sentences with no in-vocabulary token
vectorized_sentences = []  # one (MAX_SENTENCE_TOKENS, EMBEDDING_DIM) array per kept sentence
for i, tokenized_sentence in enumerate(tokenized_sentences):
    # Truncate first, then drop out-of-vocabulary tokens. Membership test and
    # item lookup go through the KeyedVectors container protocol, which avoids
    # relying on the `.vocab` / `.wv` attributes that are not available on
    # every gensim release.
    word_vectors = [
        model[word]
        for word in tokenized_sentence[:MAX_SENTENCE_TOKENS]
        if word in model
    ]
    if not word_vectors:
        # Nothing to embed: remember the position so the matching label rows
        # can be removed afterwards.
        removed_indexes.append(i)
        continue
    # Pre-allocate the padded matrix as float32; concatenating with the
    # default float64 output of np.zeros upcasts every sentence matrix.
    padded_sentence = np.zeros((MAX_SENTENCE_TOKENS, EMBEDDING_DIM), dtype=np.float32)
    padded_sentence[:len(word_vectors)] = np.asarray(word_vectors)
    vectorized_sentences.append(padded_sentence)

In [13]:
# Stack the per-sentence matrices into a single array with one leading entry
# per kept sentence; each sentence was padded to 50 rows in the previous cell.
vectorized_sentences_np = np.array(vectorized_sentences)

In [34]:
def safe_remove_indexes_from_list(list_of_indexes, full_data_set):
    """
    Delete the same positions from every list stored in ``full_data_set``.

    The lists are modified in place. ``list_of_indexes`` itself is left
    untouched, and an index that appears more than once is removed only once.

    :param list_of_indexes: positions to delete from each list
    :type list_of_indexes: iterable of int
    :param full_data_set: mapping from a key to a list; every list must be
        long enough to contain all requested positions
    :type full_data_set: dict
    :return: None
    :raises IndexError: if a position is out of range for one of the lists
    """
    # Work on a de-duplicated, descending copy: deleting the largest position
    # first keeps the smaller positions valid, and copying avoids reordering
    # the caller's list as a side effect.
    positions = sorted(set(list_of_indexes), reverse=True)
    for sequence in full_data_set.values():
        for position in positions:
            del sequence[position]

In [35]:
# Drop the label rows of sentences that produced no word vectors, so every
# column in `full_data` lines up with `vectorized_sentences_np` again.
# NOTE(review): this mutates the lists in place, so running the cell twice
# removes rows twice — re-run the data-loading cell first.
safe_remove_indexes_from_list(removed_indexes,full_data)

In [39]:
# Sanity check: every label column must have exactly one entry per
# vectorised sentence.
expected_rows = len(vectorized_sentences_np)
for column_values in full_data.values():
    assert len(column_values) == expected_rows

In [42]:
# Split into train and validation sets: the last SPLIT_SIZE rows are used for
# training and the first SPLIT_SIZE rows for validation. The two slices only
# stay disjoint when there are at least 2 * SPLIT_SIZE rows.
SPLIT_SIZE = 10000

# The CSV reader yields every label as a string; convert once to a numeric
# array so the targets have a proper dtype and shape when passed to Keras.
toxic_labels = np.array(full_data['toxic'], dtype=int)

x_train = vectorized_sentences_np[-SPLIT_SIZE:]
x_test = vectorized_sentences_np[:SPLIT_SIZE]
y_train = toxic_labels[-SPLIT_SIZE:]
y_test = toxic_labels[:SPLIT_SIZE]


In [43]:
from keras.models import Sequential
from keras.layers import LSTM, Dense
import numpy as np

data_dim = 300   # width of one word vector
timesteps = 50   # number of (padded) tokens per sentence
num_classes = 2

# Three stacked LSTM layers followed by a single sigmoid unit.
# Expected input data shape: (batch_size, timesteps, data_dim).
# NOTE: this rebinds `model`, which until here referred to the word vectors.
model = Sequential([
    # returns a sequence of vectors of dimension 32
    LSTM(32, return_sequences=True, input_shape=(timesteps, data_dim)),
    # returns a sequence of vectors of dimension 32
    LSTM(32, return_sequences=True),
    # returns a single vector of dimension 32
    LSTM(32),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          epochs=5, batch_size=64)


Train on 10000 samples, validate on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f727e58a780>

In [44]:
# Score the first 100 *training* samples. These rows were seen during fitting,
# so this is a quick sanity check rather than a held-out evaluation.
x_predict = model.predict(x_train[:100])


In [49]:
# Show each predicted probability next to its ground-truth label; zip stops
# after the last prediction, so only the scored samples are printed.
for prediction, truth in zip(x_predict, y_train):
    print("predicted is {}, truth is {},".format(prediction[0], truth))

predicted is 0.008796238340437412, truth is 0,
predicted is 0.002557283965870738, truth is 0,
predicted is 0.0020868261344730854, truth is 0,
predicted is 0.9687802195549011, truth is 1,
predicted is 0.0025312425568699837, truth is 0,
predicted is 0.968778133392334, truth is 1,
predicted is 0.0023737496230751276, truth is 0,
predicted is 0.0023098685778677464, truth is 0,
predicted is 0.003657362423837185, truth is 0,
predicted is 0.002309787319973111, truth is 0,
predicted is 0.003101394744589925, truth is 0,
predicted is 0.05995829775929451, truth is 0,
predicted is 0.0022372365929186344, truth is 0,
predicted is 0.002550855278968811, truth is 0,
predicted is 0.0402420312166214, truth is 0,
predicted is 0.9661478996276855, truth is 1,
predicted is 0.01763775385916233, truth is 0,
predicted is 0.24867761135101318, truth is 0,
predicted is 0.9689835906028748, truth is 1,
predicted is 0.00342194945551455, truth is 0,
predicted is 0.007008715067058802, truth is 0,
predicted is 0.00308953