In [1]:
import numpy as np 

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional
from keras.utils.np_utils import to_categorical

from sklearn.utils import shuffle
from process_data import process_document
import json

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
data = json.load(open('toxic_data2.json', encoding='UTF8'))

x_train = data['x_train']
x_test = data['x_test']
y_train = data['y_train']
y_test = data['y_test']

In [3]:
print(len(x_train), 'len of x train')
print(len(y_train), 'len of y train')

print(len(x_test), 'len of x test')
print(len(y_test), 'len of y test')

print(x_train[0])
print(y_train[0])

print(x_test[0])
print(y_test[0])

print("Number of 100: {}".format(y_train.count("100"))) #100 is improper
print("Number of 105: {}".format(y_train.count("105")))

print("Number of 100 in val: {}".format(y_test.count("100"))) # 100 is improper
print("Number of 105 in val: {}".format(y_test.count("105")))

X = x_train
X_val = x_test

y = [0 if int(y) == 100 else 1 for y in y_train]
y_val = [0 if int(y) == 100 else 1 for y in y_test]


from statistics import mean
lengths = [len(x_ex.split(' ')) for x_ex in X]
lengths.sort()
print("Mean lengths of comments: {}".format(mean(lengths)))
print("Mean lengths of comments with the last 25 longest comments removed: {}"
      .format(mean(lengths[:-25])))

9944 len of x train
9944 len of y train
2487 len of x test
2487 len of y test
Kohalikud venelased ehk saavadki kuidagi oma eraisiku tasemel ja nÃ¤gemusega hakkama aga kui riik oma kohaolu ignoreerib siis kohalikud hakkavad otsima neid kes neid EI ignereeri ja ulatab nii vajaliku abikÃ¤e.... Loodus tÃ¼hja kohta ei salli . Kui Eesti jaoks on Ida-Virumaa kui regioon pohh-nahh siis ehk siis silmad avanevad kui korraga juhtmed seinast vÃ¤ljatÃµmmatass? ja kus ã¼lejã¤ã¤nud eesti tã¶ã¶tajaskond on kes teenib miinimumi lã¤hedast palka ? istute mã¶kuna oma peldikus ja vahite ukseaugust pealt ? vã¤hemalt ametiã¼hingud peaksid kãµik nagu ã¼ks mees vã¤ljas olema aga nemad soojendavad kotte kuskil kontoora nurgas ja teevad valitsusega isiklikke diile .... aivo peterson 09 09 62.181.220.227
105
HÃ¤lvik on hÃ¤lvik.  tånis 13 09 194.150.65.37
105
Number of 100: 5358
Number of 105: 4586
Number of 100 in val: 1348
Number of 105 in val: 1139
Mean lengths of comments: 50.466311343523735
Mean lengths of c

In [4]:
vocab_size = 50000
seq_len = 60

tokenizer = Tokenizer(num_words=seq_len,
                                   filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n12345687890',
                                   lower=True,
                                   split=" ",
                                   char_level=False,
                                   oov_token=None)


tokenizer.fit_on_texts(X_val + X)
print(len(tokenizer.word_index))

X_tokenized = tokenizer.texts_to_sequences(X)
X_val_tokenized = tokenizer.texts_to_sequences(X_val)

X_tokenized = pad_sequences(X_tokenized, maxlen=seq_len)
X_val_tokenized = pad_sequences(X_val_tokenized, maxlen=seq_len)

print(X_val[0])
print(X_val_tokenized[0])
print(X[0])
print(X_tokenized[0])


print(y[0:3])
y_oh = to_categorical(y, 2)
print(y_oh[0:3])
print(y_val[0:3])
y_val_oh = to_categorical(y_val, 2)
print(y_val_oh[0:3])

75162
HÃ¤lvik on hÃ¤lvik.  tånis 13 09 194.150.65.37
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2]
Kohalikud venelased ehk saavadki kuidagi oma eraisiku tasemel ja nÃ¤gemusega hakkama aga kui riik oma kohaolu ignoreerib siis kohalikud hakkavad otsima neid kes neid EI ignereeri ja ulatab nii vajaliku abikÃ¤e.... Loodus tÃ¼hja kohta ei salli . Kui Eesti jaoks on Ida-Virumaa kui regioon pohh-nahh siis ehk siis silmad avanevad kui korraga juhtmed seinast vÃ¤ljatÃµmmatass? ja kus ã¼lejã¤ã¤nud eesti tã¶ã¶tajaskond on kes teenib miinimumi lã¤hedast palka ? istute mã¶kuna oma peldikus ja vahite ukseaugust pealt ? vã¤hemalt ametiã¼hingud peaksid kãµik nagu ã¼ks mees vã¤ljas olema aga nemad soojendavad kotte kuskil kontoora nurgas ja teevad valitsusega isiklikke diile .... aivo peterson 09 09 62.181.220.227
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0 10  1  9  5 10  7 58 12 

In [5]:
embed_dim = 64
lstm_out = 32

model = Sequential()
model.add(Embedding(vocab_size, embed_dim, input_length=seq_len, dropout=0.4))
model.add(Bidirectional(LSTM(lstm_out, dropout_U=0.4, dropout_W=0.4)))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

  """
  


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 60, 64)            3200000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                24832     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 130       
Total params: 3,224,962
Trainable params: 3,224,962
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
num_epochs = 50
batch_size = 128

model.fit(X_tokenized, y_oh, epochs=num_epochs, batch_size=batch_size, verbose=1, validation_data=(X_val_tokenized, y_val_oh))

Train on 9944 samples, validate on 2487 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
2048/9944 [=====>........................] - ETA: 11s - loss: 0.6289 - acc: 0.6470

KeyboardInterrupt: 