In [1]:
import numpy as np 

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical

from tqdm import tqdm
from sklearn.utils import shuffle
from process_data import process_document
import json

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
data = json.load(open('toxic_data2.json', encoding='UTF8'))

x_train = data['x_train']
x_test = data['x_test']
y_train = data['y_train']
y_test = data['y_test']

In [3]:
print(len(x_train), 'len of x train')
print(len(y_train), 'len of y train')

print(len(x_test), 'len of x test')
print(len(y_test), 'len of y test')

print(x_train[0])
print(y_train[0])

print(x_test[0])
print(y_test[0])

print("Number of 100: {}".format(y_train.count("100"))) #100 is improper
print("Number of 105: {}".format(y_train.count("105")))

print("Number of 100 in val: {}".format(y_test.count("100"))) # 100 is improper
print("Number of 105 in val: {}".format(y_test.count("105")))

X = x_train
X_val = x_test

y = [0 if int(y) == 100 else 1 for y in y_train]
y_val = [0 if int(y) == 100 else 1 for y in y_test]


from statistics import mean
lengths = [len(x_ex.split(' ')) for x_ex in X]
lengths.sort()
print("Mean lengths of comments: {}".format(mean(lengths)))
print("Mean lengths of comments with the last 25 longest comments removed: {}"
      .format(mean(lengths[:-25])))

9944 len of x train
9944 len of y train
2487 len of x test
2487 len of y test
Kohalikud venelased ehk saavadki kuidagi oma eraisiku tasemel ja nÃ¤gemusega hakkama aga kui riik oma kohaolu ignoreerib siis kohalikud hakkavad otsima neid kes neid EI ignereeri ja ulatab nii vajaliku abikÃ¤e.... Loodus tÃ¼hja kohta ei salli . Kui Eesti jaoks on Ida-Virumaa kui regioon pohh-nahh siis ehk siis silmad avanevad kui korraga juhtmed seinast vÃ¤ljatÃµmmatass? ja kus ã¼lejã¤ã¤nud eesti tã¶ã¶tajaskond on kes teenib miinimumi lã¤hedast palka ? istute mã¶kuna oma peldikus ja vahite ukseaugust pealt ? vã¤hemalt ametiã¼hingud peaksid kãµik nagu ã¼ks mees vã¤ljas olema aga nemad soojendavad kotte kuskil kontoora nurgas ja teevad valitsusega isiklikke diile .... aivo peterson 09 09 62.181.220.227
105
HÃ¤lvik on hÃ¤lvik.  tånis 13 09 194.150.65.37
105
Number of 100: 5358
Number of 105: 4586
Number of 100 in val: 1348
Number of 105 in val: 1139
Mean lengths of comments: 50.466311343523735
Mean lengths of c

In [4]:
print(X_val[0:5])
print(y_val[0:5])

['HÃ¤lvik on hÃ¤lvik.  tå\x8dnis 13 09 194.150.65.37', 'KAS TÃ•ESTI KÃ•IK OPOSITSIOON KAASAARVATUD EKRE JA KESKERAKOND ON NII LONTIS KÃ•RVADEGA JA KUULETUVAD JA LÃ„HEVAD KOALITSIOONIGA ÃœHTE PAATI,ET VALIDA. PAGULAS KVOOTE JA KOOSELUSEADUST SOOSIV KANDIDAAT. MIDA ARVAB RAHVAS,NENDE VALIJAD?  eestlane 22 09 90.191.230.90', '25 aastat(saatkonna rahadega-mÃµisa ehitamine)!\nJÃ¤Ã¤b\nAlles JÃ•KS! siimust ei rã¤ã¤gi me 75 aastat, seega jã¤ã¤b helme. helmest ei rã¤ã¤gi 14 09 90.191.198.152', "Lp Ãµiguskantsler vÃµtab ette kriitika Vabariigi Valitsuse ja isegi Riigikogu aadressil ja eks ta peabki seda tegema, see on tema tÃ¶Ã¶. On meil olnud ennegi sÃµnakaid Ãµiguskantslerid, meenutage JÃµks'i. NÃ¤ib, et selle ametiga kÃ¤ib kaasas teatud kompleks olla targem kollektiivsest mÃµistusest, arvamus, et rahva valitud Riigikogu ei ole kompetentne, et Ãµiguskantsler on kompetentsem kui valitsus ja sajad riigiametnikest kÃµrgelt haritud spetsialistid kokku. Lisaks meie enda kollektiivsele mÃµistusele o

In [5]:
class InputProcessor():
    def __init__(self, X_data, vocab_size=30000, comment_length=40):        
        self.vocab_size = vocab_size
        self.comment_length = comment_length
        self.word_to_ix = {}
        
        #Initialize word_to_ix
        self.add_word_to_ix("0")
        for comment in X_data:
            for word in comment.split(' '):        
                self.add_word_to_ix(word)
        
        
    def add_word_to_ix(self, word):
        if(word not in self.word_to_ix):
            if len(self.word_to_ix) >= self.vocab_size:
                self.word_to_ix[word] = self.vocab_size
            else:
                self.word_to_ix[word] = len(self.word_to_ix)
                
    def preprocess_input(self, sentences_to_process):
        processed_sentences = []

        for num, sentence in enumerate(sentences_to_process):
            processed_sentences.append([])
            sentence = sentence.split(' ')
            # Crop sentence or add zero padding
            if len(sentence) >= self.comment_length:
                sentence = sentence[:self.comment_length]
            else:
                new_sentence = []
                for i in range(self.comment_length):
                    if i > len(sentence) - 1:
                        new_sentence.append("0")
                    else:
                        new_sentence.append(sentence[i])
                sentence = new_sentence

            for word in sentence:
                if word in self.word_to_ix:
                    processed_sentences[num].append(self.word_to_ix[word])
                else:
                    self.add_word_to_ix(word)
                    processed_sentences[num].append(self.word_to_ix[word])
                    
        return processed_sentences
    
    @staticmethod
    def create_mini_batches(input_X, input_y, batch_size):
        batched_X = []
        batched_y = []
        for i in range(int(len(input_X) / batch_size) + 1):
            batched_X.append(input_X[i*batch_size:i*batch_size + batch_size])
            batched_y.append(input_y[i*batch_size:i*batch_size + batch_size])
            
        return (batched_X, batched_y)
        
InputProc = InputProcessor(X)

print('Total words in vocab: ', len(InputProc.word_to_ix))            
test_x = InputProc.preprocess_input([X[0]])
print(X[0])
print(test_x, 'Length: ' + str(len(test_x[0])))

Total words in vocab:  110133
Kohalikud venelased ehk saavadki kuidagi oma eraisiku tasemel ja nÃ¤gemusega hakkama aga kui riik oma kohaolu ignoreerib siis kohalikud hakkavad otsima neid kes neid EI ignereeri ja ulatab nii vajaliku abikÃ¤e.... Loodus tÃ¼hja kohta ei salli . Kui Eesti jaoks on Ida-Virumaa kui regioon pohh-nahh siis ehk siis silmad avanevad kui korraga juhtmed seinast vÃ¤ljatÃµmmatass? ja kus ã¼lejã¤ã¤nud eesti tã¶ã¶tajaskond on kes teenib miinimumi lã¤hedast palka ? istute mã¶kuna oma peldikus ja vahite ukseaugust pealt ? vã¤hemalt ametiã¼hingud peaksid kãµik nagu ã¼ks mees vã¤ljas olema aga nemad soojendavad kotte kuskil kontoora nurgas ja teevad valitsusega isiklikke diile .... aivo peterson 09 09 62.181.220.227
[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 6, 15, 16, 17, 18, 19, 20, 21, 22, 21, 23, 24, 9, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]] Length: 40


In [6]:
vocab_size = 35000
embed_dim = 64
lstm_out = 32

def list_to_numpy(input_list):
    inp = np.zeros((len(input_list), len(input_list[0])))
    for num_ex, ex in enumerate(input_list):
        for num_i, i in enumerate(ex):
            inp[num_ex][num_i] = i
    return inp
        
X = list_to_numpy(InputProc.preprocess_input(X))
X_val = list_to_numpy(InputProc.preprocess_input(X_val))

y_val = np.asarray(y_val)
y = np.asarray(y)

In [7]:
print(y[0:5])
y = to_categorical(y, 2)
print(y[0:5])
print(y_val[0:5])
y_val = to_categorical(y_val, 2)
print(y_val[0:5])

[1 0 1 0 0]
[[0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]]
[1 0 0 1 1]
[[0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 1.]]


In [10]:
model = Sequential()
model.add(Embedding(vocab_size, embed_dim, input_length=X.shape[1], dropout=0.4))
model.add(LSTM(lstm_out, dropout_U=0.4, dropout_W=0.4))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

  
  This is separate from the ipykernel package so we can avoid doing imports until


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 40, 64)            2240000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 66        
Total params: 2,252,482
Trainable params: 2,252,482
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
NUM_EPOCHS = 50
BATCH_SIZE = 32
print(X[0])
print(y[0])
model.fit(X, y, epochs = NUM_EPOCHS, batch_size=BATCH_SIZE, verbose = 2, validation_data=(X_val, y_val))

[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14.  6. 15. 16. 17.
 18. 19. 20. 21. 22. 21. 23. 24.  9. 25. 26. 27. 28. 29. 30. 31. 32. 33.
 34. 35. 36. 37.]
[0. 1.]
Train on 9944 samples, validate on 2487 samples
Epoch 1/50
 - 18s - loss: 0.6743 - acc: 0.5764 - val_loss: 0.6121 - val_acc: 0.6590
Epoch 2/50
 - 16s - loss: 0.5627 - acc: 0.7221 - val_loss: 0.5482 - val_acc: 0.7081
Epoch 3/50
 - 16s - loss: 0.4358 - acc: 0.8075 - val_loss: 0.5589 - val_acc: 0.7189
Epoch 4/50
 - 16s - loss: 0.3454 - acc: 0.8610 - val_loss: 0.5594 - val_acc: 0.7346
Epoch 5/50
 - 16s - loss: 0.2787 - acc: 0.8964 - val_loss: 0.5695 - val_acc: 0.7330
Epoch 6/50
 - 16s - loss: 0.2252 - acc: 0.9186 - val_loss: 0.6569 - val_acc: 0.7378
Epoch 7/50
 - 16s - loss: 0.1894 - acc: 0.9333 - val_loss: 0.6751 - val_acc: 0.7346
Epoch 8/50
 - 16s - loss: 0.1579 - acc: 0.9442 - val_loss: 0.7547 - val_acc: 0.7286
Epoch 9/50
 - 16s - loss: 0.1327 - acc: 0.9558 - val_loss: 0.9129 - val_acc: 0.7230
Epoch 10/50
 - 15s - los

KeyboardInterrupt: 

In [1]:
print(out[0][0])
print(out.size())


out = out.view(out.size()[1], out.size()[2])

print(out.size())
print(out[0])


NameError: name 'out' is not defined

In [2]:
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()




def train_model(input_X, input_y, model, optimizer, criterion, InpPrep, epochs=5, batch_size=64):
    
    batched_X, batched_y = InputProc.create_mini_batches(input_X, input_y, batch_size)
    
    inputs_val = InpPrep.preprocess_input(X_val)
    inp_val = autograd.Variable(torch.LongTensor(inputs_val))    

    for epoch in range(epochs): 
        running_loss = 0.0
        running_corrects = 0
        
        for batch_num, (x, y) in tqdm(enumerate(zip(batched_X, batched_y))):
            model.zero_grad()
            
            x = InpPrep.preprocess_input(x)            
            predicted = model(autograd.Variable(torch.LongTensor(x)))
            predicted = predicted.view(predicted.size()[1], predicted.size()[2])
                        
            loss = criterion(predicted, autograd.Variable(torch.LongTensor(y)))
            loss.backward()
            optimizer.step()
            
            
            
            _, preds = torch.max(predicted.data, 1)
            running_loss += loss.data[0]
            running_corrects += torch.sum(preds == torch.LongTensor(y))
            
        
        if epoch % 10 == 0:            
            print('-' * 5)                                    
            
            # VALIDATION ACC
            val_predictions = model(inp_val)
            val_predictions = val_predictions.view(val_predictions.size()[1], val_predictions.size()[2])                
            _, val_preds = torch.max(val_predictions.data, 1)        
            val_summed = np.sum(val_preds.numpy() == y_val)
                                
            print('Traning accuracy: {:.2f}%'.format(running_corrects / len(input_y)))
            print('Validation accuracy: {:.2f}%'.format(val_summed / len(val_preds)))
            print("Epoch {} loss: {:f}, single: {:f}".format(epoch, running_loss, running_loss / BATCH_SIZE))
    
    return model

NameError: name 'optim' is not defined

In [3]:
model = train_model(X, y, model, optimizer, criterion, InputProc, epochs=50, batch_size=BATCH_SIZE)

NameError: name 'train_model' is not defined

In [None]:
training_preds = model(inp)
training_preds = training_preds.view(training_preds.size()[1], training_preds.size()[2])

print(training_preds.size())
_, preds = torch.max(training_preds.data, 1)

summed = np.sum(preds.numpy() == y)
print('Traning accuracy: {}'.format(summed / len(training_preds)))

In [None]:
print(len(X_val))
inputs_val = InputProc.preprocess_input(X_val)
print(X_val[0].split(' ')[:30])
print(inputs_val[0])
inp_val = autograd.Variable(torch.LongTensor(inputs_val))

In [None]:
val_predictions = model(inp_val)
val_predictions = val_predictions.view(val_predictions.size()[1], val_predictions.size()[2])
print(val_predictions.size())

In [None]:
print(y_val[0:5])
_, val_preds = torch.max(val_predictions.data, 1)
print(val_preds[0])

summed_val = np.sum(val_preds.numpy() == y_val)
print('Validation accuracy: {}'.format(summed_val / len(val_preds)))