In [1]:
import numpy as np 
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from process_data import process_document

In [64]:
X, y = process_document()
print(X[0])
print(y[0])
X_val = X[500:600]
y_val = y[500:600]

X = X[:500]
y = y[:500]



print("Number of 0: {}".format(y.count(0)))
print("Number of 1: {}".format(y.count(1)))
from statistics import mean
lengths = [len(x_ex.split(' ')) for x_ex in X]
lengths.sort()
print("Mean lengths of comments: {}".format(mean(lengths)))
print("Mean lengths of comments with the last 25 longest comments removed: {}"
      .format(mean(lengths[:-25])))

mã»ts maha kangelste ees. ja nii on.
0
Number of 0: 122
Number of 1: 378
Mean lengths of comments: 33.038
Mean lengths of comments with the last 25 longest comments removed: 24.76421052631579


In [9]:
class InputProcessor():
    def __init__(self, X_data, vocab_size=10000, comment_length=30):        
        self.vocab_size = vocab_size
        self.comment_length = comment_length
        self.word_to_ix = {}
        
        #Initialize word_to_ix
        self.add_word_to_ix("0")
        for comment in X_data:
            for word in comment.split(' '):        
                self.add_word_to_ix(word)
        
        
    def add_word_to_ix(self, word):
        if(word not in self.word_to_ix):
            if len(self.word_to_ix) >= self.vocab_size:
                self.word_to_ix[word] = self.vocab_size
            else:
                self.word_to_ix[word] = len(self.word_to_ix)
                
    def preprocess_input(self, sentences_to_process):
        processed_sentences = []

        for num, sentence in enumerate(sentences_to_process):
            processed_sentences.append([])
            sentence = sentence.split(' ')
            # Crop sentence or add zero padding
            if len(sentence) >= self.comment_length:
                sentence = sentence[:self.comment_length]
            else:
                new_sentence = []
                for i in range(self.comment_length):
                    if i > len(sentence) - 1:
                        new_sentence.append("0")
                    else:
                        new_sentence.append(sentence[i])
                sentence = new_sentence

            for word in sentence:
                if word in self.word_to_ix:
                    processed_sentences[num].append(self.word_to_ix[word])
                else:
                    self.add_word_to_ix(word)
                    processed_sentences[num].append(self.word_to_ix[word])
                    
        return processed_sentences
    
    @staticmethod
    def create_mini_batches(input_X, input_y, batch_size):
        batched_X = []
        batched_y = []
        for i in range(int(len(input_X) / batch_size) + 1):
            batched_X.append(input_X[i*batch_size:i*batch_size + batch_size])
            batched_y.append(input_y[i*batch_size:i*batch_size + batch_size])
            
        return (batched_X, batched_y)
        
InputProc = InputProcessor(X)

print('Total words in vocab: ', len(InputProc.word_to_ix))            
test_x = InputProc.preprocess_input([X[0]])
print(X[0])
print(test_x, 'Length: ' + str(len(test_x[0])))

Total words in vocab:  7587
mã»ts maha kangelste ees. ja nii on.
[[1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] Length: 30


In [10]:
class RNNClassifier(nn.Module):
    def __init__(self, input_vocab_size, hidden_size, output_size, n_layers=1):
        super(RNNClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        self.fc = nn.Linear(hidden_size, output_size)
        #self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        #self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        #self.hidden = self.init_hidden()    

    def forward(self, input_sentence):
        batch_size = input_sentence.size(0)
        input_sentence = input_sentence.t()
        #print('  input', input_sentence.size())
        
        embedded = self.embedding(input_sentence)
        #print('  embedded', embedded.size())
        
        hidden = self._init_hidden(batch_size)
        output, hidden = self.gru(embedded, hidden)
        #print(' gru hidden output', hidden.size())
        #print(' gru output', output.size())
        
        fc_output = self.fc(hidden)
        #print("  fc output", fc_output.size())
        
        outputs = F.softmax(fc_output, dim=2)
        return outputs
        
    def _init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        return autograd.Variable(hidden)

In [11]:
vocab_size = 10000
HIDDEN_SIZE = 32

model = RNNClassifier(vocab_size, HIDDEN_SIZE, 2)
#loss_function = nn.NLLLoss()
#optimizer = optim.SGD(model.parameters(), lr=0.1)
#print(X)
inputs = InputProc.preprocess_input(X)
print(len(inputs))
print(inputs[0])

inp = autograd.Variable(torch.LongTensor(inputs))
out = model(inp)
print("in", inp.size(), "out", out.size())


500
[1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
in torch.Size([500, 30]) out torch.Size([1, 500, 2])


In [12]:
print(out[0][0])
print(out.size())

out = out.view(out.size()[1], out.size()[2])

print(out.size())
print(out[0])


Variable containing:
 0.6536
 0.3464
[torch.FloatTensor of size 2]

torch.Size([1, 500, 2])
torch.Size([500, 2])
Variable containing:
 0.6536
 0.3464
[torch.FloatTensor of size 2]



In [54]:
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

NUM_EPOCHS = 50
BATCH_SIZE = 32
batched_X, batched_y = InputProc.create_mini_batches(X, y, BATCH_SIZE)



def train_model(input_X, input_y, model, optimizer, criterion, InpPrep, epochs=5):
    for epoch in range(epochs): 
        running_loss = 0.0
        running_corrects = 0
        
        for batch_num, (x, y) in enumerate(zip(input_X, input_y)):                
            model.zero_grad()
            
            x = InpPrep.preprocess_input(x)            
            predicted = model(autograd.Variable(torch.LongTensor(x)))
            predicted = predicted.view(predicted.size()[1], predicted.size()[2])
            
            loss = criterion(predicted, autograd.Variable(torch.LongTensor(y)))
            loss.backward()
            optimizer.step()

            
            #_, preds = torch.max(predicted.data, 1)
            running_loss += loss.data[0]
            
            print(loss)
            #running_corrects += torch.sum(preds == autograd.Variable(torch.LongTensor(y)).data)
            
#             if batch_num % 3 == 0:
#                 print('Epoch {}: batch {}/{} loss: {}, acc: {}'.
#                       format(epoch, batch_num, len(input_X), running_loss / 1,
#                              running_corrects / 1
#                             ))

        print("Epoch loss: {}, single: {}".format(running_loss, running_loss / BATCH_SIZE))

    return model

In [55]:
model = train_model(batched_X, batched_y, model, optimizer, criterion, InputProc, epochs=2)

Variable containing:
 0.3997
[torch.FloatTensor of size 1]

Variable containing:
 0.3445
[torch.FloatTensor of size 1]

Variable containing:
 0.3133
[torch.FloatTensor of size 1]

Variable containing:
 0.3758
[torch.FloatTensor of size 1]

Variable containing:
 0.3133
[torch.FloatTensor of size 1]

Variable containing:
 0.3445
[torch.FloatTensor of size 1]

Variable containing:
 0.3133
[torch.FloatTensor of size 1]

Variable containing:
 0.3133
[torch.FloatTensor of size 1]

Variable containing:
 0.3445
[torch.FloatTensor of size 1]

Variable containing:
 0.3445
[torch.FloatTensor of size 1]

Variable containing:
 0.3133
[torch.FloatTensor of size 1]

Variable containing:
 0.3445
[torch.FloatTensor of size 1]

Variable containing:
 0.3758
[torch.FloatTensor of size 1]

Variable containing:
 0.3133
[torch.FloatTensor of size 1]

Variable containing:
 0.3445
[torch.FloatTensor of size 1]

Variable containing:
 0.3133
[torch.FloatTensor of size 1]

Epoch loss: 5.411207795143127, single: 0

In [70]:
training_preds = model(inp)
training_preds = training_preds.view(training_preds.size()[1], training_preds.size()[2])

print(training_preds.size())
_, preds = torch.max(training_preds.data, 1)

summed = np.sum(preds.numpy() == y)
print('Traning accuracy: {}'.format(summed / len(training_preds)))

torch.Size([500, 2])
Traning accuracy: 0.974


In [69]:
print(len(X_val))
inputs_val = InputProc.preprocess_input(X_val)
print(X_val[0].split(' ')[:30])
print(inputs_val[0])
inp_val = autograd.Variable(torch.LongTensor(inputs_val))

100
['ammun', 'on', 'ilmselge,', 'siililegi', 'selge,', 'et', 'eesti', 'vajab', 'otse', 'valitavat', 'presidenti.', 'presidendivalimistel', 'oleks', 'mingigi', 'mãµte,', 'kui', 'seda', 'teeks', 'rahavas.', 'ja', 'meie', 'lã¤hinaabruses', 'on', 'riigid,', 'kus', 'presidendil', 'pole', 'suuremat', 'vãµimu', 'kui']
[7587, 15, 7588, 7589, 7013, 28, 60, 3145, 80, 7590, 7591, 7592, 64, 3547, 6368, 34, 438, 7593, 7594, 5, 401, 7595, 15, 7596, 897, 7597, 181, 7598, 1549, 34]


In [90]:
val_predictions = model(inp_val)
val_predictions = val_predictions.view(val_predictions.size()[1], val_predictions.size()[2])
print(val_preds.size())

torch.Size([1, 2])


In [91]:
print(y_val[0:5])
_, val_preds = torch.max(val_predictions.data, 1)
print(val_preds[0])

summed_val = np.sum(val_preds.numpy() == y_val)
print('Validation accuracy: {}'.format(summed_val / len(val_preds)))

[1, 1, 0, 1, 1]
1
Validation accuracy: 0.64
