In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


train_data = pd.read_csv("data_vectorized.csv")

In [2]:
tokens = train_data["tokens"].apply(eval)

flat_tokens = []
for i in tokens:
    flat_tokens.extend(i)

In [3]:
# create word 2 index dictionary for mapping later

wd2ix = dict()
ix2wd = dict()

wd2ix[-1] = -1
ix2wd[-1] = -1

cnt = 1
for words in set(flat_tokens):
    wd2ix[words] = cnt
    ix2wd[cnt] = words
    cnt+=1

In [4]:
cut_tokens = 60
padding = 0

tokens_aft_cut = []
for i in tokens:
    if len(i) >= cut_tokens:
        tokens_aft_cut.append([wd2ix[word] for word in i[:cut_tokens]])
    elif len(i) < cut_tokens:
        new_i = [wd2ix[word] for word in i]
        new_i += [padding for k in range(cut_tokens-len(i))]
        tokens_aft_cut.append(new_i)

In [5]:
data_X = pd.Series(tokens_aft_cut)
data_y = train_data["vec"].apply(eval)

In [6]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(data_X, data_y, test_size=0.333)

In [8]:
# Original code is from https://github.com/spro/practical-pytorch
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader

#from name_dataset import NameDataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Parameters and DataLoaders
HIDDEN_SIZE = 100
N_CHARS = 9758  # ASCII
N_CLASSES = 17


class RNNClassifier(nn.Module):

    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super(RNNClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        # Note: we run this all at once (over the whole input sequence)

        # input = B x S . size(0) = B
        batch_size = input.size(0)
        # input:  B x S  -- (transpose) --> S x B
        input = input.t()
        # Embedding S x B -> S x B x I (embedding size)
        print("  input", input.size())
        embedded = self.embedding(input)
        #print(embedded)

        # Make a hidden
        hidden = self._init_hidden(batch_size)
        print("embedded: {}, hidden: {}".format(embedded.size(), hidden.size()))
        output, hidden = self.gru(embedded, hidden)
        print("  gru hidden output", output.size())
        print(" gru hidden size: ", hidden.size())
        #print(output)
        # Use the last layer output as FC's input
        # No need to unpack, since we are going to use hidden
        fc_output = self.fc(output[-1,:,:]).reshape(batch_size,N_CLASSES)
        print("  fc output", fc_output.size())
        #print(fc_output)
        return torch.sigmoid(fc_output)

    def _init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size)
        return Variable(hidden)
    
    
inputs = Variable(torch.LongTensor(train_X.tolist()))
labels = Variable(torch.LongTensor(train_y.tolist()))

classifier = RNNClassifier(N_CHARS, HIDDEN_SIZE, N_CLASSES)
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=0.05)

#labels = labels.squeeze(1)
for epoch in range(50):
    outputs = classifier(inputs)
    optimizer.zero_grad()
    print(outputs.size())
    loss = criterion(outputs, labels.float())
    loss.backward()
    optimizer.step()
    print(outputs)
    print("epoch: %d, loss: %1.3f" % (epoch + 1, loss.item()))
    print("Accuracy: ", ((outputs > .5)== labels).sum().tolist() / (labels.size(0) * labels.size(1)))
    pred = classifier(Variable(torch.LongTensor(test_X.tolist())))
    true_y = Variable(torch.LongTensor(test_y.tolist()))
    print("Test Set Accuracy: ", ((pred > .5) == true_y).sum().tolist() / (true_y.size(0) * true_y.size(1)))
    #res = []
    #for i in range(len(outputs)):
    #    res.append([train_data.columns[4:-1][idx] for idx in (outputs[i]>.5).nonzero().flatten().tolist()])
    #print("Predicted : ", ''.join())

print("Learning finished!")    
    

  input torch.Size([60, 27971])
embedded: torch.Size([60, 27971, 100]), hidden: torch.Size([1, 27971, 100])
  gru hidden output torch.Size([60, 27971, 100])
 gru hidden size:  torch.Size([1, 27971, 100])
  fc output torch.Size([27971, 17])
torch.Size([27971, 17])
tensor([[0.5353, 0.5771, 0.5080,  ..., 0.6109, 0.4819, 0.4715],
        [0.5340, 0.5770, 0.5082,  ..., 0.6106, 0.4813, 0.4717],
        [0.5336, 0.5768, 0.5080,  ..., 0.6110, 0.4810, 0.4720],
        ...,
        [0.5450, 0.5841, 0.5116,  ..., 0.5912, 0.4901, 0.4724],
        [0.5337, 0.5769, 0.5080,  ..., 0.6110, 0.4810, 0.4719],
        [0.5335, 0.5769, 0.5080,  ..., 0.6111, 0.4808, 0.4720]],
       grad_fn=<SigmoidBackward>)
epoch: 1, loss: 0.908
Accuracy:  0.5841953956513785
  input torch.Size([60, 13965])
embedded: torch.Size([60, 13965, 100]), hidden: torch.Size([1, 13965, 100])
  gru hidden output torch.Size([60, 13965, 100])
 gru hidden size:  torch.Size([1, 13965, 100])
  fc output torch.Size([13965, 17])
Test Set Acc

tensor([[4.8121e-07, 5.9482e-07, 7.0428e-07,  ..., 3.9134e-07, 5.1920e-07,
         2.5632e-07],
        [2.4535e-08, 6.9098e-08, 8.3341e-08,  ..., 4.4090e-08, 2.4843e-08,
         1.7069e-08],
        [2.6651e-08, 7.6776e-08, 8.9537e-08,  ..., 4.7349e-08, 2.5620e-08,
         1.7848e-08],
        ...,
        [6.8506e-08, 1.6373e-07, 1.8841e-07,  ..., 9.7729e-08, 5.3089e-08,
         4.0937e-08],
        [9.4503e-08, 1.8551e-07, 2.0165e-07,  ..., 1.2568e-07, 7.3335e-08,
         4.5220e-08],
        [1.7178e-07, 2.8423e-07, 2.4772e-07,  ..., 1.9859e-07, 1.3248e-07,
         8.1883e-08]], grad_fn=<SigmoidBackward>)
epoch: 8, loss: 0.693
Accuracy:  0.8799723242770349
  input torch.Size([60, 13965])
embedded: torch.Size([60, 13965, 100]), hidden: torch.Size([1, 13965, 100])
  gru hidden output torch.Size([60, 13965, 100])
 gru hidden size:  torch.Size([1, 13965, 100])
  fc output torch.Size([13965, 17])
Test Set Accuracy:  0.8799814662707188
  input torch.Size([60, 27971])
embedded: torc

Test Set Accuracy:  0.8799814662707188
  input torch.Size([60, 27971])
embedded: torch.Size([60, 27971, 100]), hidden: torch.Size([1, 27971, 100])
  gru hidden output torch.Size([60, 27971, 100])
 gru hidden size:  torch.Size([1, 27971, 100])
  fc output torch.Size([27971, 17])
torch.Size([27971, 17])
tensor([[1.3607e-10, 3.3045e-10, 4.0817e-10,  ..., 2.2200e-10, 1.2783e-10,
         8.1913e-11],
        [3.5894e-11, 1.3398e-10, 1.6750e-10,  ..., 7.8131e-11, 3.6384e-11,
         2.8571e-11],
        [3.6531e-11, 1.2910e-10, 1.6899e-10,  ..., 7.5520e-11, 3.7676e-11,
         2.9859e-11],
        ...,
        [5.5680e-11, 1.8839e-10, 2.4122e-10,  ..., 1.0323e-10, 4.6838e-11,
         3.9542e-11],
        [5.8228e-11, 1.8167e-10, 2.3805e-10,  ..., 1.0934e-10, 4.3567e-11,
         3.6590e-11],
        [1.1939e-10, 3.1592e-10, 3.1682e-10,  ..., 1.9092e-10, 9.6347e-11,
         7.2833e-11]], grad_fn=<SigmoidBackward>)
epoch: 15, loss: 0.693
Accuracy:  0.8799723242770349
  input torch.Size([6

embedded: torch.Size([60, 13965, 100]), hidden: torch.Size([1, 13965, 100])
  gru hidden output torch.Size([60, 13965, 100])
 gru hidden size:  torch.Size([1, 13965, 100])
  fc output torch.Size([13965, 17])
Test Set Accuracy:  0.8799814662707188
  input torch.Size([60, 27971])
embedded: torch.Size([60, 27971, 100]), hidden: torch.Size([1, 27971, 100])
  gru hidden output torch.Size([60, 27971, 100])
 gru hidden size:  torch.Size([1, 27971, 100])
  fc output torch.Size([27971, 17])
torch.Size([27971, 17])
tensor([[3.6920e-12, 1.0717e-11, 1.4243e-11,  ..., 7.3516e-12, 3.3415e-12,
         2.3007e-12],
        [1.5905e-12, 6.5658e-12, 8.7486e-12,  ..., 3.8219e-12, 1.6201e-12,
         1.3521e-12],
        [1.6041e-12, 6.5469e-12, 8.7701e-12,  ..., 3.7837e-12, 1.6109e-12,
         1.3584e-12],
        ...,
        [1.8885e-12, 7.5925e-12, 1.0087e-11,  ..., 4.0289e-12, 1.6573e-12,
         1.4467e-12],
        [2.7542e-12, 1.4954e-11, 1.7387e-11,  ..., 7.2110e-12, 2.6429e-12,
         2.52

embedded: torch.Size([60, 13965, 100]), hidden: torch.Size([1, 13965, 100])
  gru hidden output torch.Size([60, 13965, 100])
 gru hidden size:  torch.Size([1, 13965, 100])
  fc output torch.Size([13965, 17])
Test Set Accuracy:  0.8799814662707188
  input torch.Size([60, 27971])
embedded: torch.Size([60, 27971, 100]), hidden: torch.Size([1, 27971, 100])
  gru hidden output torch.Size([60, 27971, 100])
 gru hidden size:  torch.Size([1, 27971, 100])
  fc output torch.Size([27971, 17])
torch.Size([27971, 17])
tensor([[4.6048e-13, 1.5926e-12, 2.2223e-12,  ..., 1.0919e-12, 4.1462e-13,
         3.1001e-13],
        [3.1221e-13, 1.4489e-12, 1.9124e-12,  ..., 8.3641e-13, 3.1155e-13,
         2.6896e-13],
        [3.1994e-13, 1.3881e-12, 1.9262e-12,  ..., 7.7796e-13, 3.1935e-13,
         2.8404e-13],
        ...,
        [3.5294e-13, 1.5032e-12, 2.0297e-12,  ..., 8.0587e-13, 3.0680e-13,
         2.6750e-13],
        [1.0878e-12, 4.7121e-12, 3.7529e-12,  ..., 2.8599e-12, 7.6614e-13,
         6.47

embedded: torch.Size([60, 13965, 100]), hidden: torch.Size([1, 13965, 100])
  gru hidden output torch.Size([60, 13965, 100])
 gru hidden size:  torch.Size([1, 13965, 100])
  fc output torch.Size([13965, 17])
Test Set Accuracy:  0.8799814662707188
  input torch.Size([60, 27971])
embedded: torch.Size([60, 27971, 100]), hidden: torch.Size([1, 27971, 100])
  gru hidden output torch.Size([60, 27971, 100])
 gru hidden size:  torch.Size([1, 27971, 100])
  fc output torch.Size([27971, 17])
torch.Size([27971, 17])
tensor([[1.6165e-13, 6.3123e-13, 8.8377e-13,  ..., 4.2249e-13, 1.4525e-13,
         1.1413e-13],
        [1.3370e-13, 6.5685e-13, 8.6791e-13,  ..., 3.7795e-13, 1.3268e-13,
         1.1666e-13],
        [1.1771e-13, 6.4253e-13, 8.7656e-13,  ..., 3.0994e-13, 1.2751e-13,
         1.2476e-13],
        ...,
        [1.4971e-13, 6.5144e-13, 8.8988e-13,  ..., 3.5168e-13, 1.2959e-13,
         1.1306e-13],
        [4.8638e-13, 2.1761e-12, 1.7170e-12,  ..., 1.3129e-12, 3.4780e-13,
         2.81

KeyboardInterrupt: 