In [1]:
import sys

sys.path.append("..")

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import yaml
import numpy as np
import scipy.sparse as sp
import pickle as pkl
import os

from preprocess.utils import DocumentStatsBuilder as DSBuilder
from preprocess.utils import TextConverter
from preprocess.data import TextDataForTC
from time import time

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data Prep

In [5]:
start = time()
config_file = 'config.yaml'
text_data = TextDataForTC(config_file, '20ng')
label2idx = text_data.get_label2idx()
vocab = text_data.get_vocab()
word2idx = vocab.get_word2idx()
text_ls_train = text_data.get_text_ls('train')
text_ls_val = text_data.get_text_ls('val')
text_ls_test = text_data.get_text_ls('test')
label_ls_train = text_data.get_label_ls('train')
label_ls_val = text_data.get_label_ls('val')
label_ls_test = text_data.get_label_ls('test')
print("time taken: " + str(time() - start))

time taken: 15.879352569580078


In [6]:
import torch
from torch import nn, LongTensor
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
import torch.nn.functional as F
from IPython.core.debugger import set_trace
from dataloader import MyDataset, batchify, batchify_test

In [7]:
w2i = vocab.get_word2idx()
w2i["<unk>"] = len(w2i)

In [8]:
def text_2_int_list(ls, vocab_dict):
    """ map list of strings to list of list of ints """
    result = []
    for t in ls:
        sent = t.split()
        ints = [vocab_dict[w] if w in vocab_dict else vocab_dict["<unk>"] for w in sent]
        result.append(ints)
    return result

In [9]:
train_x = text_2_int_list(text_ls_train, w2i)
valid_x = text_2_int_list(text_ls_val, w2i)
test_x  = text_2_int_list(text_ls_test, w2i)

In [10]:
train = MyDataset(train_x, label_ls_train)
valid = MyDataset(valid_x, label_ls_val)
test = MyDataset(test_x)

### Training

In [11]:
from torch import nn, LongTensor, Tensor
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
import torch.optim as optim
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

In [12]:
import time
def accuracy(preds, y):
    return (np.array(preds) == np.array(y)).astype(int).mean()


def train_epoch(epoch, model, optimizer, criterion):
    model.train()
    train_loss, n_data = 0, 0
    start = time.time()
    preds = []
    labels = []
    for i, (x, y) in enumerate(train_loader):
        n_data += x.size()[0]
        labels.extend(y.tolist())
        if torch.cuda.is_available(): x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        out = model(x)
        preds.extend(out.argmax(axis=1).tolist())
        loss = criterion(out, y)
        loss.backward()
        if grad_clip: torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        train_loss += loss
        if i % print_iter == print_iter - 1:
            model, valid_preds, valid_labels, valid_loss = validate(model, criterion)
            print("""epoch {} - batch [{}/{}] - train loss: {:.2f} - acc: {:.3f} - valid loss : {:.2f} - acc : {:.3f} time taken: {:.2f}""".format(epoch, i, 
                len(train_loader), train_loss/(i+1),
                accuracy(preds, labels), valid_loss, accuracy(valid_preds, valid_labels),
                time.time()-start), flush=True)

            model.train()
            start = time.time()
            train_loss = 0

    # end of epoch
    model, valid_preds, valid_labels, valid_loss = validate(model, criterion)
    print("""epoch {} - batch [{}/{}] - train loss: {:.2f} - acc: {:.3f} - valid loss : {:.2f} - acc : {:.3f} time taken: {:.2f}""".format(epoch, i, 
        len(train_loader), train_loss/(i+1),
        accuracy(preds, labels), valid_loss, accuracy(valid_preds, valid_labels),
        time.time()-start), flush=True)
    return model

def learning_rate_decay(optimizer):
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * 0.1
    return optimizer

def training(model, epoches, lr, wd):
    if torch.cuda.is_available():
        model.cuda()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    criterion = nn.CrossEntropyLoss()
    for ep in range(epoches):
        model = train_epoch(ep, model, optimizer, criterion)
        optimizer = learning_rate_decay(optimizer)
    return model

def validate(model, criterion):
    model.eval()
    valid_loss = 0
    preds, labels = [], []
    for i, (x, y) in enumerate(valid_loader):
        labels.extend(y.tolist())
        if torch.cuda.is_available(): x, y = x.cuda(), y.cuda()
        out = model(x)
        loss = criterion(out, y)
        preds.extend(out.argmax(axis=1).tolist())
        valid_loss += loss
    return model, preds, labels, valid_loss/(i+1)
    
def predict(model, loader):
    model.eval()
    preds, labels = [], []
    for i, (x, _) in enumerate(loader):
        if torch.cuda.is_available(): x = x.cuda()
        out = model(x)
        preds.extend(out.argmax(axis=1).tolist())
    return preds

In [13]:
from torch.nn.utils import weight_norm

class LSTM_clf(nn.Module):

    def __init__(self, embed_dim, hidden_dim, vocab_size, out_size, 
               layers=1, bidirectional=False):
        super(LSTM_clf, self).__init__()
        self.word_embedding = nn.Embedding(vocab_size, embed_dim)
        self.net = nn.LSTM(embed_dim, hidden_dim,  num_layers=layers, 
                           bidirectional=bidirectional, dropout=0.5)
        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm1d(hidden_dim * (int(bidirectional) + 1))
        self.linear = nn.Linear(hidden_dim * (int(bidirectional) + 1), out_size)

    def forward(self, x):
        out = self.word_embedding(x)
        out = self.net(out)[0]
        out = self.relu(out).transpose(1,2)
        out = F.max_pool1d(out, out.size()[2]).squeeze()
        out = self.linear(self.bn(out))
        return out

### Simple LSTM

In [25]:
torch.manual_seed(1)
embed_dim = 200
lstm_hidden = 200
layers = 2
vocab_size = len(w2i)
bs = 8
n_class = len(label2idx)
lr = 0.002
grad_clip = 1
print_iter = 400
lstm = LSTM_clf(embed_dim, lstm_hidden, vocab_size, n_class, layers)

In [26]:
train_loader = DataLoader(train, batch_size=bs, shuffle=True, collate_fn=batchify)
valid_loader = DataLoader(valid, batch_size=bs, shuffle=False, collate_fn=batchify)
test_loader = DataLoader(test, batch_size=bs, shuffle=False, collate_fn=batchify_test)

In [27]:
%time training(lstm, 10, 2e-3, 1e-4)

epoch 0 - batch [399/1273] - train loss: 3.10 - acc: 0.069 - valid loss : 3.07 - acc : 0.073 time taken: 18.89
epoch 0 - batch [799/1273] - train loss: 1.50 - acc: 0.076 - valid loss : 2.97 - acc : 0.104 time taken: 20.22
epoch 0 - batch [1199/1273] - train loss: 0.97 - acc: 0.085 - valid loss : 3.02 - acc : 0.118 time taken: 20.62
epoch 0 - batch [1272/1273] - train loss: 0.16 - acc: 0.088 - valid loss : 2.97 - acc : 0.121 time taken: 4.40
epoch 1 - batch [399/1273] - train loss: 2.79 - acc: 0.138 - valid loss : 2.88 - acc : 0.137 time taken: 20.35
epoch 1 - batch [799/1273] - train loss: 1.38 - acc: 0.144 - valid loss : 2.81 - acc : 0.155 time taken: 19.70
epoch 1 - batch [1199/1273] - train loss: 0.91 - acc: 0.153 - valid loss : 2.77 - acc : 0.178 time taken: 19.01
epoch 1 - batch [1272/1273] - train loss: 0.15 - acc: 0.155 - valid loss : 2.74 - acc : 0.195 time taken: 4.83
CPU times: user 2min 11s, sys: 26.1 s, total: 2min 37s
Wall time: 2min 8s


LSTM_clf(
  (word_embedding): Embedding(42904, 200)
  (net): LSTM(200, 200, num_layers=2, dropout=0.5)
  (relu): ReLU()
  (bn): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear): Linear(in_features=200, out_features=20, bias=True)
)

### Transformer

In [14]:
from models.classifiers import TransformerModel

In [24]:
torch.manual_seed(1)
embed_dim = 200
nhead = 4
nhid = 200
layers = 2
vocab_size = len(w2i)
bs = 4
n_class = len(label2idx)
lr = 0.002
grad_clip = 1
print_iter = 10
dropout = 0.5

transformer = TransformerModel(n_class, embed_dim, nhead, nhid, layers, len(w2i), dropout)

In [25]:
train_loader = DataLoader(train, batch_size=bs, shuffle=True, collate_fn=batchify)
valid_loader = DataLoader(valid, batch_size=bs, shuffle=False, collate_fn=batchify)
test_loader = DataLoader(test, batch_size=bs, shuffle=False, collate_fn=batchify_test)

In [26]:
%time training(transformer, 10, 2e-3, 1e-4)

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 11.17 GiB total capacity; 10.87 GiB already allocated; 320.00 KiB free; 10.88 GiB reserved in total by PyTorch)