#### imports and constants

In [1]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
import random
seed_val = 42
device = "cuda:0"
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [3]:
data_path = "data/aclImdb/imdb"
embedding_path = "resources/word_vectors/glove.6B.200d.txt"

#### load data

In [4]:
df_train = pd.read_csv(data_path+"_train_clean.csv")
df_train, df_val = train_test_split(df_train, test_size=0.1)
df_test = pd.read_csv(data_path+"_test_clean.csv")

In [5]:
def balance_df(df):
    pos = df[df.label==1]
    neg = df[df.label==0].sample(len(pos))
    return pd.concat([pos,neg])

#### load GLOVE embeddings

In [6]:
def load_embedding(path):
    words = [ ]
    vals = [ ]
    with open(path, encoding='utf-8') as fin:
        fin.readline()
        for line in fin:
            line = line.rstrip()
            if line:
                parts = line.split(' ')
                words.append(parts[0])
                vals += [float(x) for x in parts[1:]]
    return words, np.asarray(vals).reshape(len(words),-1)

In [7]:
words, embeddings = load_embedding(embedding_path)

### Dataset

In [8]:
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

In [9]:
def pad_sequences(inp, maxlen, token=0):
    if len(inp) >= maxlen:
        return inp[:maxlen-1] + [inp[-1]]
    else:
        return inp + [token]*(maxlen - len(inp))

In [10]:
class wordLSTMDataset(Dataset):
    def __init__(self, words, text_df, maxlen=128):
        self.word2ind = defaultdict(lambda:len(words))  # this is the out of vocabulary token
        self.maxlen = maxlen           # fixed length for padding sequences
        self.pad_ind = len(words) + 1  # a padding index so that all inputs are the same length
        self.word2ind.update({word:i for i,word in enumerate(words)})
        self.text_df = text_df
        
    def __getitem__(self, i):
        row = self.text_df.iloc[i]
        text, label = row.content, row.label
        word_tokens = [self.word2ind[w] for w in text.split()]
        return torch.Tensor(pad_sequences(word_tokens, self.maxlen, self.pad_ind)).long(), label
    
    def __len__(self):
        return len(self.text_df)

In [11]:
dataset_train = wordLSTMDataset(words, df_train)
dataset_val = wordLSTMDataset(words, df_val)

In [12]:
# dataloaders
dataloader_train = DataLoader(dataset_train, batch_size=32, num_workers=8, shuffle=True)
dataloader_val = DataLoader(dataset_val, batch_size=32, num_workers=8, shuffle=False)

### Model

In [13]:
from torch import nn

In [14]:
class WordLSTM(nn.Module):
    def __init__(self, embedding, hidden_size=150, depth=1, dropout=0.3, nclasses=2, fix_emb=True,
                 normalise=False):
        super(WordLSTM, self).__init__()
        
        if normalise:
            embedding /= np.linalg.norm(embedding,axis=1).reshape(-1, 1)
        
        self.drop = nn.Dropout(dropout)
        self.embedding = nn.Embedding(embedding.shape[0]+2, embedding.shape[1])
        self.embedding.weight.data.uniform_(-0.25, 0.25)

        self.embedding.weight.data[:len(embeddings)].copy_(torch.from_numpy(embeddings))
        
        if fix_emb:
            self.embedding.weight.requires_grad = False
        
        self.lstm = nn.LSTM(embedding.shape[1], hidden_size//2, depth, dropout=dropout, bidirectional=True, batch_first=True)
        self.bn = nn.BatchNorm1d(hidden_size)
        self.out = nn.Linear(hidden_size, nclasses)

    def forward(self, x):
        emb = self.embedding(x)
        emb = self.drop(emb)
        output, hidden = self.lstm(emb)
        output = torch.tanh(self.bn(torch.max(output, dim=1)[0]))
        output = self.drop(output)
        return self.out(output)

In [15]:
model = WordLSTM(embeddings)
device = 'cuda'
model = model.to(device)

  "num_layers={}".format(dropout, num_layers))


### loss and optimiser

In [16]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

### training loop

In [17]:
def val_metrics(model, dataloader, device):
    running_accuracy = 0.0
    running_loss = 0.0
    for i, data in enumerate(dataloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # forward
        inputs = inputs.to(device)
        with torch.no_grad():
            model.eval()
            outputs = model(inputs)
        labels = labels.to(device)
        loss = criterion(outputs, labels.view(-1))

        _, preds = outputs.max(1)
        running_accuracy += ((preds == labels.view(-1)).sum().to(dtype=torch.float)/len(outputs)).item()


        # print statistics
        running_loss += loss.item()
    # loss
    print('[%d, %5d] val loss: %.3f' %
          (1, i + 1, running_loss / (i+1)))

    # accuracy
    print('[%d, %5d] val accuracy: %.3f' %
          (1, i + 1, running_accuracy / (i+1)))


In [18]:
for epoch in range(5):  # loop over the dataset multiple times
    running_accuracy = 0.0
    running_loss = 0.0
    for i, data in enumerate(dataloader_train, 0):
        model.train()
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        inputs = inputs.to(device)
        outputs = model(inputs)
        labels = labels.to(device)
        loss = criterion(outputs, labels.view(-1))
        loss.backward()
        optimizer.step()
        
        _, preds = outputs.max(1)
        running_accuracy += ((preds == labels.view(-1)).sum().to(dtype=torch.float)/len(outputs)).item()
        
        # print statistics
        n = 200
        running_loss += loss.item()
        if i % n == n-1:    # print every n mini-batches
            # loss
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / n))
            running_loss = 0.0
            
            # accuracy
            print('[%d, %5d] accuracy: %.3f' %
                  (epoch + 1, i + 1, running_accuracy / n))
            running_accuracy = 0.0
    val_metrics(model, dataloader_val, device)
print('Finished Training')

[1,   200] loss: 0.541
[1,   200] accuracy: 0.713
[1,   400] loss: 0.420
[1,   400] accuracy: 0.808
[1,    46] val loss: 0.367
[1,    46] val accuracy: 0.833
[2,   200] loss: 0.374
[2,   200] accuracy: 0.830
[2,   400] loss: 0.374
[2,   400] accuracy: 0.838
[1,    46] val loss: 0.355
[1,    46] val accuracy: 0.839
[3,   200] loss: 0.354
[3,   200] accuracy: 0.851
[3,   400] loss: 0.347
[3,   400] accuracy: 0.848
[1,    46] val loss: 0.311
[1,    46] val accuracy: 0.867
[4,   200] loss: 0.341
[4,   200] accuracy: 0.848
[4,   400] loss: 0.331
[4,   400] accuracy: 0.853
[1,    46] val loss: 0.319
[1,    46] val accuracy: 0.857
[5,   200] loss: 0.315
[5,   200] accuracy: 0.863
[5,   400] loss: 0.317
[5,   400] accuracy: 0.864
[1,    46] val loss: 0.311
[1,    46] val accuracy: 0.854
Finished Training


In [19]:
torch.save(model.state_dict(), open(data_path+'_word_lstm.pth', 'wb'))

## Inference

In [20]:
base_path = 'data/aclImdb/imdb'
model_path = base_path + '_word_lstm.pth'
tst_path = base_path + '_test_clean.csv'
out_path = base_path + '_test_pred_lstm.csv'

In [21]:
model.load_state_dict(torch.load(model_path))

<All keys matched successfully>

In [22]:
tst_df = pd.read_csv(tst_path)
tst_df = tst_df.dropna()

In [23]:
# datasets
test_sent_dataset = wordLSTMDataset(words, tst_df)

# dataloaders
valid_dataloader = DataLoader(test_sent_dataset, batch_size=32, num_workers=8, shuffle=False)

In [24]:
pred_list = []
prob_list = []

for data in valid_dataloader:
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # forward + backward + optimize
        inputs = inputs.to(device)
        with torch.no_grad():
            model.eval()
            outputs = model(inputs)
        _, preds = outputs.max(1)
        pred_list.append(preds.cpu().numpy())
        prob_list.append(outputs.cpu().numpy())
        
tst_df['preds'] = np.concatenate(pred_list)

In [25]:
(tst_df['preds']==tst_df['label']).sum()/len(tst_df)

0.8699581815729125

In [26]:
(tst_df['preds']==tst_df['label']).sum()/len(tst_df)

0.8699581815729125

In [28]:
tst_df.to_csv(out_path, index=False)