Build a training pipeline for models with different hyperparameters.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gzip
import json

import torch as th
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.nn import Embedding
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torchtext.vocab import vocab, Vocab, GloVe, build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torchmetrics import MeanSquaredError

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger

from typing import Callable, List, Tuple, Iterable, Dict
from functools import reduce
from collections import OrderedDict

from tqdm import tqdm

import optuna
from optuna.visualization import plot_parallel_coordinate, plot_contour
from optuna.importance import get_param_importances

import matplotlib
matplotlib.rcParams["figure.facecolor"] = "white"

import wandb
import inspect

In [2]:
#!python -m spacy download en_core_web_sm

In [3]:
# Special vocabulary tokens shared by the vocab builders and the padding code below.
PAD_TOKEN = "<pad>"  # its vocab index is the fill value when sequences are padded
EOS_TOKEN = "<eos>"  # end-of-sequence marker; only registered in the vocab in this notebook
UNK_TOKEN = "<unk>"  # its vocab index is the default for out-of-vocabulary tokens
# Passed as `specials` when a vocab is built from texts.
SPECIAL_TOKENS = (PAD_TOKEN, EOS_TOKEN, UNK_TOKEN)

In [4]:
# spaCy-backed tokenizer, called as `tokenizer(text)` throughout the notebook.
# Needs the `en_core_web_sm` spaCy model (see the download command above).
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

# Functions

In [5]:
def nums_from_fractions(total: int, fractions: Tuple[float]) -> Tuple[int]:
    """
    Convert fractional split sizes into absolute item counts.

    :param total: number of items to distribute
    :param fractions: fractions of the total number. Exactly one elem must 
        be -1; it is a placeholder that receives whatever remains after the 
        other fractions have been converted (each truncated towards zero)
    :return: one count per fraction, in the same order, summing to `total`
    """
    assert fractions.count(-1) == 1, (
        "Must have exactly one occurence of -1 to denote a fraction of 'remaining' items"
    )
    placeholder_pos = fractions.index(-1)

    counts = []
    for frac in fractions:
        # The placeholder contributes nothing until the remainder is known
        counts.append(0 if frac == -1 else int(total * frac))
    counts[placeholder_pos] = total - sum(counts)

    # A negative count means the explicit fractions were invalid 
    # (e.g. they add up to more than 1)
    assert not any(count < 0 for count in counts)
    return tuple(counts)

# Inline sanity checks for nums_from_fractions: the -1 slot receives the remainder,
# and fractional counts are truncated (0.155 * 100 -> 15).
assert nums_from_fractions(100, [0.7, 0.3, -1]) == (70, 30, 0)
assert nums_from_fractions(100, [0.7, 0.155, -1]) == (70, 15, 15)
assert nums_from_fractions(100, [0.7, 0, -1]) == (70, 0, 30)
# Checked manually that the following calls fail an assertion, as expected
# (no -1 placeholder / explicit fractions adding up to more than 1):
# nums_from_fractions(100, [0.7, 0.3, -2])
# nums_from_fractions(100, [0.7, 0.5, -1])

def build_vocab_from_texts(
    texts: Iterable[str], tokenizer: Callable, specials=SPECIAL_TOKENS, 
    unk_token=UNK_TOKEN, **kwargs
) -> Vocab:
    """
    Tokenize `texts` and build a vocabulary from the resulting tokens.

    :param texts: raw texts to tokenize (a progress bar is shown)
    :param tokenizer: callable applied to each text
    :param specials: special tokens forwarded to build_vocab_from_iterator
    :param unk_token: token whose index becomes the vocab's default index
    :param kwargs: forwarded unchanged to build_vocab_from_iterator
    :return: the vocabulary, with its default index set to `unk_token`
    """
    tokenized = []
    for text in tqdm(texts):
        tokenized.append(tokenizer(text))

    vocabulary = build_vocab_from_iterator(tokenized, specials=specials, **kwargs)
    # Lookups of unseen tokens fall back to the index of unk_token
    unk_index = vocabulary[unk_token]
    vocabulary.set_default_index(unk_index)
    return vocabulary

def seqs_from_texts(
    texts: List[str], tokenizer: Callable, voc: Vocab, pad_token=PAD_TOKEN
) -> th.Tensor:
    """
    Numericalize every text and combine the results into one padded tensor.

    Each text is tokenized and mapped to vocab indices; the variable-length 
    index sequences are then padded with the index of `pad_token`. 
    pad_sequence is called without batch_first, so the first dimension of the 
    result runs over token positions and the second over texts.
    """
    numericalized = []
    for text in texts:
        token_ids = voc(tokenizer(text))
        numericalized.append(th.tensor(token_ids))
    pad_index = voc[pad_token]
    return pad_sequence(numericalized, padding_value=pad_index)

def count_oov_rate(
    seqs: Iterable[th.Tensor], voc: Vocab, unk_token=UNK_TOKEN, 
    pad_token=PAD_TOKEN
) -> float:
    """
    Fraction of non-padding tokens in `seqs` that are the unknown token.

    :param seqs: iterable of index tensors (e.g. the rows of a padded tensor)
    :param voc: vocabulary used to look up the indices of the two tokens
    :param unk_token: token counted as out-of-vocabulary
    :param pad_token: token excluded from the total token count
    :return: num_oov / num_tokens, or 0.0 when there is no non-padding token
    """
    # The two indices do not change while iterating, so look them up once
    unk_index = voc[unk_token]
    pad_index = voc[pad_token]

    num_oov = 0
    num_tokens = 0
    for item in seqs:
        num_oov += th.sum(item == unk_index).item()
        num_tokens += th.sum(item != pad_index).item()

    if num_tokens == 0:
        # Empty input (or padding only): nothing is out-of-vocabulary, and 
        # dividing would raise ZeroDivisionError
        return 0.0
    return num_oov / num_tokens

def glove_voc_and_embedding(
    embedding_dim: int, 
    glove_embedding_params: Dict,
    pad_token=PAD_TOKEN, 
    eos_token=EOS_TOKEN, 
    unk_token=UNK_TOKEN
) -> Tuple[Vocab, Embedding]:
    """
    Build a vocab from the GloVe tokens plus the special tokens, together 
    with an Embedding whose rows are aligned with that vocab.

    :param embedding_dim: dimensionality of the GloVe vectors to load
    :param glove_embedding_params: dict with keys "name" (GloVe variant) and 
        "freeze_embedding" (whether the embedding weights are frozen)
    :param pad_token: padding token; its index is used as padding_idx
    :param eos_token: end-of-sequence token
    :param unk_token: unknown token; its index is the vocab's default index
    :return: (vocab, embedding). Row i of the embedding is the GloVe vector 
        of vocab token i; tokens without a GloVe vector get an all-zero row
    """
    embedding_vecs = GloVe(name=glove_embedding_params["name"], dim=embedding_dim)

    embedding_dict = OrderedDict()
    embedding_dict.update({pad_token: 1})
    embedding_dict.update({eos_token: 1})
    embedding_dict.update({unk_token: 1})
    embedding_dict.update(embedding_vecs.stoi)
    # min_freq=0 is a hack to read in the 0th token from embedding_vecs.stoi
    voc = vocab(embedding_dict, min_freq=0)
    voc.set_default_index(voc[unk_token])

    # The special tokens sit in front of the GloVe tokens in the vocab, so the 
    # raw GloVe matrix cannot be used directly: every index would be shifted 
    # and the last indices would be out of range. Build a matrix with one row 
    # per vocab entry and copy each GloVe vector to the row of its vocab index.
    glove_tokens = list(embedding_vecs.stoi.keys())
    glove_rows = th.tensor(list(embedding_vecs.stoi.values()))
    vocab_rows = th.tensor(voc(glove_tokens))
    weights = th.zeros(
        len(voc), embedding_vecs.vectors.shape[1], 
        dtype=embedding_vecs.vectors.dtype
    )
    weights[vocab_rows] = embedding_vecs.vectors[glove_rows]

    embedding = Embedding.from_pretrained(
        weights, freeze=glove_embedding_params["freeze_embedding"], 
        padding_idx=voc[pad_token]
    )
    
    return voc, embedding

In [6]:
class TextDataset(Dataset):
    """
    Dataset of numericalized, padded texts and their float targets.

    All texts are tokenized and converted to vocab indices once, at 
    construction time. Items are `((seq, seq_length), target)`.
    """

    def __init__(self, df: pd.DataFrame, tokenizer: Callable, voc: Vocab) -> None:
        """
        :param df: frame with a "text" column and a "target" column
        :param tokenizer: callable applied to each text
        :param voc: vocab mapping token lists to index lists
        """
        for required_column in ("text", "target"):
            assert required_column in df.columns
        self.tokenizer = tokenizer
        self.voc = voc

        # numericalized texts, one 1-D index tensor per row of df
        nz_texts = [
            th.tensor(self.voc(self.tokenizer(text))) for text in tqdm(df.text)
        ]

        # shape of seqs is: T x B, where T is length of longest seq, B is number of texts
        self.seqs = pad_sequence(nz_texts, padding_value=self.voc[PAD_TOKEN])
        # unpadded length of every sequence
        self.seq_lengths = th.tensor([len(nz_text) for nz_text in nz_texts])
        self.targets = th.tensor(df.target.values).float()

    def __len__(self) -> int:
        """Number of examples (one per target)."""
        return len(self.targets)

    def __getitem__(self, i: int) -> Tuple[Tuple[th.Tensor, int], float]:
        """Return `((padded sequence, unpadded length), target)` of example i."""
        # seqs is T x B, so example i is column i
        features = (self.seqs[:, i], self.seq_lengths[i])
        return features, self.targets[i]

In [7]:
class VanillaRNN(pl.LightningModule):
    """
    Embedding -> (stacked) nn.RNN -> Linear -> Sigmoid, trained with MSE loss.

    The final hidden state of the last RNN layer is fed through a linear 
    layer and a sigmoid, giving one value in (0, 1) per sequence.
    """

    def __init__(
        self, 
        embedding: nn.Embedding, 
        hidden_size: int = 128, 
        num_layers: int = 1,
        lr: float = 1e-3, 
        dropout: float = 0.5
    ):
        """
        :param embedding: embedding layer applied to the index sequences
        :param hidden_size: size of the RNN hidden state
        :param num_layers: number of stacked RNN layers
        :param lr: learning rate for Adam
        :param dropout: dropout between stacked RNN layers (not used by the 
            RNN when num_layers == 1)
        """
        super().__init__()
        self.embedding = embedding
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lr = lr
        self.dropout = dropout
        
        self.save_hyperparameters(ignore=['embedding'])

        # nn.RNN applies dropout only between stacked layers and warns when a 
        # non-zero value is given for a single layer, so pass 0 in that case.
        rnn_dropout = self.dropout if self.num_layers > 1 else 0.0

        # TODO: try using bidirectional in rnn
        self.rnn = nn.RNN(
            self.embedding.embedding_dim, self.hidden_size, batch_first=True, 
            dropout=rnn_dropout, num_layers=self.num_layers
        )
        self.fc = nn.Linear(self.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x: List[th.Tensor]) -> th.Tensor:
        """
        :param x: pair `(seqs, seq_lengths)`; seqs is batch-first (B x T) 
            padded indices, seq_lengths the unpadded length of each sequence
        :return: tensor of shape B x 1 with values in (0, 1)
        """
        assert len(x) == 2
        seqs, seq_lengths = x
        embedded = self.embedding(seqs)
        # pack_padded_sequence needs the lengths on the CPU, even when the 
        # batch itself has been moved to an accelerator.
        packed = pack_padded_sequence(
            embedded, th.as_tensor(seq_lengths).cpu(), batch_first=True, 
            enforce_sorted=False
        )
        
        # TODO: try usng a randomly generated initial hidden state 
        # (instead of the zero vector default)
        _, h_n = self.rnn(packed)
        
        # h_n is (num_layers, B, hidden_size) for this unidirectional RNN
        assert (h_n.shape[0], h_n.shape[2]) == (self.num_layers, self.hidden_size)
        
        # final hidden state of the last layer
        x = h_n[-1, :, :]
        x = self.fc(x)
        x = self.sigmoid(x)
        return x
    
    def training_step(self, batch: th.Tensor, batch_idx: int) -> th.Tensor:
        """Compute and log the loss as "train_loss"."""
        return self.generalized_step(batch, batch_idx, "train")

    def validation_step(self, batch: th.Tensor, batch_idx: int) -> th.Tensor:
        """Compute and log the loss as "val_loss"."""
        return self.generalized_step(batch, batch_idx, "val")

    def test_step(self, batch: th.Tensor, batch_idx: int) -> th.Tensor:
        """Compute and log the loss as "test_loss"."""
        return self.generalized_step(batch, batch_idx, "test")
    
    def generalized_step(self, batch: th.Tensor, batch_idx: int, label: str) -> th.Tensor:
        """
        Shared train/val/test step: MSE between predictions and targets, 
        logged under f"{label}_loss".
        """
        x, y = batch
        # B x 1 -> B, to match the shape of the targets
        predicted = self(x).squeeze(1)
        loss = F.mse_loss(predicted, y)
        self.log(f"{label}_loss", loss)
        return loss

    def configure_optimizers(self):
        """Adam over all parameters with learning rate `self.lr`."""
        optimizer = optim.Adam(self.parameters(), self.lr)
        return optimizer
    
    
def construct_model(model_config: Dict, embedding: nn.Embedding = None):
    """
    Instantiate the model selected by `model_config["model_arch"]`.

    Only the entries of `model_config` that match constructor parameters of 
    the selected model class are passed on; all other keys are ignored.

    :param model_config: dict with key "model_arch" plus hyperparameters
    :param embedding: embedding layer for the model. When omitted, the 
        notebook-level variable `embedding` is used, so existing 
        `construct_model(model_config)` calls keep working
    :raises NameError: if no embedding is passed and none is defined globally
    :raises NotImplementedError: if the architecture is not supported
    """
    if embedding is None:
        # Backward-compatible fallback to the notebook global. Passing the 
        # embedding explicitly is preferred, since it avoids hidden state.
        try:
            embedding = globals()["embedding"]
        except KeyError:
            raise NameError(
                "name 'embedding' is not defined; pass `embedding=` explicitly"
            ) from None

    model_arch = model_config["model_arch"]
    if model_arch == "VanillaRNN":
        params = list(inspect.signature(VanillaRNN).parameters)
        relevant_params = [p for p in params if p != "embedding"]
        hparams = {k: v for k, v in model_config.items() if (k in relevant_params)}
        return VanillaRNN(embedding, **hparams)
    else:
        raise NotImplementedError(f"Unsupported model_arch: {model_arch!r}")

In [8]:
# Load the dataset, keeping only the two columns used by this notebook
df = pd.read_csv("data/data_disaster_tweets.csv")[["text", "target"]].reset_index(drop=True)

# Data split / loading. -1 in `fractions` marks the split that gets the remainder
data_config = {
    "fractions": [0.7, 0.15, -1],
    "batch_size": 64,
}
embedding_dim = 100
glove_embedding_config = None  # contains keys: name, freeze_embedding

# Model hyperparameters; construct_model only passes on the keys that the 
# chosen model class accepts
model_config = {
    "model_arch": "VanillaRNN",
    "num_layers": 3,
    "hidden_size": 128,
    "lr": 1e-3,
    "dropout": 0.5,
    "layer_norm": False,
    "residual_connections": False,
    "loss_fn": "MSELoss",
}

# Keyword arguments for WandbLogger
wandb_config = {
    "project": 'scratch',
    "log_model": False,
}

# Trainer settings
train_config = {
    "max_epochs": 10,
}

In [9]:
# Seed the python / numpy / torch RNGs so that the random split, the embedding 
# initialization and the batch shuffling are reproducible across kernel restarts.
# The seed can be overridden with data_config["seed"].
pl.seed_everything(data_config.get("seed", 42))

# Random train / val / test split of the raw texts
texts = {}
texts["train"], texts["val"], texts["test"] = random_split(
    df.text, nums_from_fractions(len(df.text), data_config["fractions"])
)

# Vocab + embedding: either pretrained GloVe, or a vocab built from the 
# training texts only, with a randomly initialized embedding
if glove_embedding_config:
    voc, embedding = glove_voc_and_embedding(embedding_dim, glove_embedding_config)
else:
    voc = build_vocab_from_texts(texts["train"], tokenizer)
    embedding = Embedding(len(voc), embedding_dim, padding_idx=voc[PAD_TOKEN])
    
oov_rates = {}
dls = {}  # dataloaders
for label in ["train", "val", "test"]:
    ds = TextDataset(df.iloc[texts[label].indices], tokenizer, voc)
    # ds.seqs already holds the padded, numericalized texts of this split, so 
    # reuse it instead of tokenizing every text a second time
    oov_rates[label] = count_oov_rate(ds.seqs, voc)
    
    # Only the training data is shuffled; val / test order does not affect 
    # the metrics and Lightning warns about shuffled evaluation loaders
    dls[label] = DataLoader(
        ds, batch_size=data_config["batch_size"], shuffle=(label == "train")
    )

100%|█████████████████████████████████████████████████████████| 5329/5329 [00:01<00:00, 3932.66it/s]
100%|████████████████████████████████████████████████████████| 5329/5329 [00:00<00:00, 12927.25it/s]
100%|████████████████████████████████████████████████████████| 1141/1141 [00:00<00:00, 13577.23it/s]
100%|████████████████████████████████████████████████████████| 1143/1143 [00:00<00:00, 14668.14it/s]


In [13]:
# Build the model selected by model_config["model_arch"]. Called without an 
# embedding argument, construct_model uses the notebook-level `embedding` 
# created in the data-preparation cell.
model = construct_model(model_config)

In [14]:
# Train with Weights & Biases logging
logger = WandbLogger(**wandb_config)
try:
    # log="all" tracks both gradients and parameters of the model
    logger.watch(model, log="all")
    trainer = pl.Trainer(max_epochs=train_config["max_epochs"], logger=logger)
    trainer.fit(model, dls["train"], dls["val"])
finally:
    # Always close the wandb run, even if training fails, so that a later 
    # run in the same kernel does not log into a stale run
    wandb.finish()

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 2.1 M 
1 | rnn       | RNN       | 95.5 K
2 | fc        | Linear    | 129   
3 | sigmoid   | Sigmoid   | 0     
----------------------------------------
2.2 M     Trainable params
0         Non-trainable params
2.2 M     Total params
8.780     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.08MB of 0.08MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▂▂▂▃▃▃▃▃▄▄▄▅▅▅▆▆▆▆▆▇▇▇██
train_loss,▇▆██▆▆▅▅▃▄▃▃▃▃▁▄
trainer/global_step,▁▁▁▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
val_loss,▄▂▂▁▄▅█▇██

0,1
epoch,9.0
train_loss,0.08755
trainer/global_step,839.0
val_loss,0.23524


# Scratch