In [1]:
import os
import sys
import torch

# Root of the tutorial project, resolved from the WORKSPACE environment
# variable (a KeyError is raised here if WORKSPACE is not set).
PROJ_DIR = os.path.join(os.environ['WORKSPACE'], 'tutorial/')

# Put the project root on the import path exactly once, so re-running this
# cell does not keep appending duplicate entries.
if sys.path.count(PROJ_DIR) == 0:
    sys.path.append(PROJ_DIR)

# Read datasets

In [2]:
import pickle
from src.dataset import IMDBDatset
from src.utilities import flatten

# Load the pickled corpus; the cells below index it by 'train', 'dev' and 'test'.
# NOTE(review): pickle.load can execute arbitrary code -- only open a
# data.pickle that you created yourself or otherwise trust.
with open('data.pickle', mode='rb') as corpus_file:
    corpus = pickle.load(corpus_file)

# Create dataloader

In [3]:
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler

def collate(batch):
    """Merge a list of ``(tokens, label)`` samples into a single batch.

    Args:
        batch: sequence of two-element samples, ``(tokens, label)``.

    Returns:
        A pair ``(tokens, targets)`` where ``tokens`` is a tuple holding each
        sample's tokens unchanged (no padding or tensor conversion happens
        here) and ``targets`` is a ``torch.long`` tensor of the labels.
    """
    # Transpose [(tokens, label), ...] into one column of tokens and one of labels.
    token_column, label_column = zip(*batch)
    return token_column, torch.tensor(label_column, dtype=torch.long)

def get_dataloader(dataset, batch_size, shuffle=False):
    """Wrap ``dataset`` in a ``DataLoader`` that batches with ``collate``.

    Args:
        dataset: the dataset to iterate over.
        batch_size: number of samples per batch.
        shuffle: when true a ``RandomSampler`` is used, so the sample order
            changes on every pass; otherwise a ``SequentialSampler`` keeps
            the dataset order fixed.

    Returns:
        The configured ``DataLoader``.
    """
    sampler_cls = RandomSampler if shuffle else SequentialSampler
    return DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler_cls(dataset),
        collate_fn=collate,
    )

# One loader per split. Only the training split is shuffled; dev and test use
# larger batches and a fixed order. The insertion order (train, dev, test) is
# the order in which `train` later walks the splits.
dataloaders = {
    split: get_dataloader(corpus[split], batch_size=size, shuffle=randomize)
    for split, size, randomize in (
        ('train', 32, True),
        ('dev', 128, False),
        ('test', 128, False),
    )
}

The training batches come out in a different order every time we iterate over the train loader, because it uses `RandomSampler`:

In [4]:
# Inspect only the first training batch: its size and its label tensor.
for tokens, targets in dataloaders['train']:
    print("Batch size:", len(tokens))
    print(targets)
    break

Batch size: 32
tensor([1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
        0, 1, 1, 1, 1, 0, 0, 1])


The dev and test loaders, by contrast, always yield their batches in the same order because they use `SequentialSampler`:

In [5]:
# Inspect only the first dev batch: its size and its label tensor.
for tokens, targets in dataloaders['dev']:
    print("Batch size:", len(tokens))
    print(targets)
    break

Batch size: 128
tensor([0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
        1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
        1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
        0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,
        0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 1, 1])


# Initialize the classifier

In [6]:
from src.nets.embedder import WordEmbedder
from src.nets.lstm import LSTMLayer
from src.nets.classifier import LSTMClassifier

# The vocabulary is built from the train and dev tokens only; the test split
# does not contribute to it.
train_and_dev_tokens = corpus['train'].tokens + corpus['dev'].tokens
vocab = set(flatten(train_and_dev_tokens))

def create_lstm_classifier():
    """Assemble a fresh ``LSTMClassifier`` from an embedder and an LSTM layer.

    Reads the module-level ``vocab`` and ``PROJ_DIR``. The embedder receives
    the vocabulary together with the path of the GloVe 100-d text file under
    ``PROJ_DIR`` (presumably to load pre-trained vectors -- confirm in
    ``WordEmbedder``). The LSTM layer is sized from ``embedder.emb_dim`` and
    configured as a 2-layer bidirectional network with ``hidden_dim=64``.

    Returns:
        The newly constructed ``LSTMClassifier``.
    """
    glove_path = os.path.join(PROJ_DIR, 'glove.6B/glove.6B.100d.txt')
    word_embedder = WordEmbedder(vocab, glove_path)
    encoder = LSTMLayer(
        word_embedder.emb_dim,
        hidden_dim=64,
        num_layers=2,
        bidirectional=True,
    )
    return LSTMClassifier(word_embedder, encoder)

# Build the classifier once; the training cell further down trains this `model`.
model = create_lstm_classifier()
# Bare expression as the last line so the notebook displays the module's repr.
model

LSTMClassifier(
  (embedder): WordEmbedder(
    (embeddings): Embedding(21695, 100)
  )
  (extractor): LSTMLayer(
    (lstm): LSTM(100, 32, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  )
  (classifier): Linear(in_features=64, out_features=1, bias=True)
  (xentropy): BCEWithLogitsLoss()
)

# Training

In [7]:
from sklearn.metrics import accuracy_score
from src.utilities import process_logits

def track_best_model(model_path, model, epoch, best_acc, dev_acc, dev_loss):
    """Checkpoint ``model`` unless the best accuracy so far beats this epoch.

    A checkpoint is written whenever ``best_acc`` is not strictly greater than
    ``dev_acc``, so an epoch that ties the best accuracy also overwrites it.

    Args:
        model_path: file the checkpoint is saved to.
        model: object whose ``state_dict()`` is stored.
        epoch: epoch number recorded in the checkpoint.
        best_acc: best dev accuracy seen before this epoch.
        dev_acc: dev accuracy of this epoch.
        dev_loss: dev loss of this epoch, recorded in the checkpoint.

    Returns:
        ``(best accuracy after this epoch, marker)`` where the marker is
        ``' * '`` if a checkpoint was written and ``''`` otherwise.
    """
    should_save = not (best_acc > dev_acc)
    if should_save:
        checkpoint = {
            'epoch': epoch,
            'acc': dev_acc,
            'loss': dev_loss,
            'model': model.state_dict(),
        }
        torch.save(checkpoint, model_path)
        return dev_acc, ' * '
    return best_acc, ''


def train(model, dataloaders, optimizer, config):
    """Train ``model`` and return it loaded with the best dev-accuracy weights.

    Every epoch visits each split in ``dataloaders`` in dict iteration order.
    The ``'train'`` split updates the weights; every other split is only
    evaluated. After the ``'dev'`` split the model is checkpointed through
    ``track_best_model``. Once all epochs are done, the checkpoint at
    ``config['checkpoint']`` is loaded back into ``model``.

    Note: a checkpoint is only written while processing a split named
    ``'dev'``. Without such a split, the final ``torch.load`` reads whatever
    file already exists at ``config['checkpoint']`` (or fails if none does).

    Args:
        model: callable as ``model(tokens, targets)``, returning a mapping
            with a ``'loss'`` tensor and an ``'output'`` entry that is passed
            to ``process_logits``.
        dataloaders: dict mapping split name to an iterable of
            ``(tokens, targets)`` batches that also exposes ``.dataset``.
        optimizer: optimizer stepped after every training batch.
        config: dict providing ``'epochs'`` (number of epochs) and
            ``'checkpoint'`` (checkpoint file path).

    Returns:
        ``model`` after loading the state dict stored in the checkpoint.
    """
    best_acc = 0
    for epoch in range(1, config['epochs'] + 1):
        epoch_msg = f'E{epoch:03d}'
        epoch_track = ''

        for dataset in dataloaders:
            is_train = dataset == 'train'
            if is_train:
                model.train()
                model.zero_grad()
            else:
                model.eval()

            epoch_loss = 0
            preds, truth = [], []

            # Autograd is only needed while training. Disabling it for the
            # evaluation splits avoids building a graph that is never
            # back-propagated, which saves memory and time.
            with torch.set_grad_enabled(is_train):
                for tokens, targets in dataloaders[dataset]:
                    result = model(tokens, targets)
                    loss = result['loss']

                    if is_train:
                        loss.backward()
                        optimizer.step()
                        # Clear gradients so the next batch starts from zero.
                        optimizer.zero_grad()
                        model.zero_grad()

                    # Weight the batch loss by its size; the sum is divided by
                    # the dataset size below to get a per-sample average.
                    epoch_loss += loss.item() * len(targets)
                    batch_preds, _ = process_logits(result['output'])

                    preds += batch_preds
                    truth += targets.tolist()

            epoch_acc = accuracy_score(truth, preds)
            epoch_loss /= len(dataloaders[dataset].dataset)
            epoch_msg += ' [{}] Loss: {:.4f}, Acc: {:.4f}'.format(dataset.upper(), epoch_loss, epoch_acc)

            # Model selection is driven by the dev split only.
            if dataset == 'dev':
                best_acc, epoch_track = track_best_model(config['checkpoint'], model, epoch, best_acc, epoch_acc, epoch_loss)

        print(epoch_msg + epoch_track)
    print("Done training!")

    # Restore the weights of the checkpointed (best dev accuracy) epoch.
    state = torch.load(config['checkpoint'])
    model.load_state_dict(state['model'])

    print('Returning best model from epoch {} with loss {:.5f} and accuracy {:.5f}'.format(
        state['epoch'], state['loss'], state['acc']))
    return model

In [8]:
import torch.optim as optim

# Settings for this run: 'lr' and 'momentum' configure SGD below, while
# 'epochs' and 'checkpoint' are read by `train`.
config = dict(
    lr=1e-2,
    momentum=0.99,
    epochs=10,
    checkpoint='lstm_model.pt',
)

# Hand only the trainable parameters to the optimizer.
params = (weight for weight in model.parameters() if weight.requires_grad)
optimizer = optim.SGD(params, lr=config['lr'], momentum=config['momentum'])

# `train` returns the model reloaded from the saved checkpoint.
model = train(model, dataloaders, optimizer, config)

E001 [TRAIN] Loss: 0.6943, Acc: 0.5145 [DEV] Loss: 0.6926, Acc: 0.4942 [TEST] Loss: 0.6922, Acc: 0.5021 * 
E002 [TRAIN] Loss: 0.6803, Acc: 0.5567 [DEV] Loss: 0.6888, Acc: 0.5184 [TEST] Loss: 0.6957, Acc: 0.5081 * 
E003 [TRAIN] Loss: 0.6465, Acc: 0.6233 [DEV] Loss: 0.6905, Acc: 0.5972 [TEST] Loss: 0.6745, Acc: 0.5980 * 
E004 [TRAIN] Loss: 0.5818, Acc: 0.6942 [DEV] Loss: 0.5376, Acc: 0.7318 [TEST] Loss: 0.5385, Acc: 0.7234 * 
E005 [TRAIN] Loss: 0.5194, Acc: 0.7412 [DEV] Loss: 0.5141, Acc: 0.7478 [TEST] Loss: 0.5202, Acc: 0.7392 * 
E006 [TRAIN] Loss: 0.4884, Acc: 0.7595 [DEV] Loss: 0.5008, Acc: 0.7522 [TEST] Loss: 0.5019, Acc: 0.7439 * 
E007 [TRAIN] Loss: 0.4719, Acc: 0.7755 [DEV] Loss: 0.5117, Acc: 0.7490 [TEST] Loss: 0.5136, Acc: 0.7413
E008 [TRAIN] Loss: 0.4459, Acc: 0.7964 [DEV] Loss: 0.5198, Acc: 0.7468 [TEST] Loss: 0.5195, Acc: 0.7428
E009 [TRAIN] Loss: 0.4178, Acc: 0.8115 [DEV] Loss: 0.5471, Acc: 0.7456 [TEST] Loss: 0.5445, Acc: 0.7452
E010 [TRAIN] Loss: 0.3823, Acc: 0.8357 [DEV] L