Investigate differences between using glove vocab vs using training-generated vocab

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gzip
import json

import torch as th
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.nn import Embedding
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torchtext.vocab import vocab, Vocab, GloVe, build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torchmetrics import MeanSquaredError

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger

from typing import Callable, List, Tuple, Iterable
from functools import reduce
from collections import OrderedDict

from tqdm import tqdm

import optuna
from optuna.visualization import plot_parallel_coordinate, plot_contour
from optuna.importance import get_param_importances

import matplotlib
matplotlib.rcParams["figure.facecolor"] = "white"

import wandb

In [4]:
PAD_TOKEN = "<pad>"
EOS_TOKEN = "<eos>"
UNK_TOKEN = "<unk>"
SPECIAL_TOKENS = (PAD_TOKEN, EOS_TOKEN, UNK_TOKEN)

In [5]:
# spacy tokenizer
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

# glove embeddings --> vocab
# embedding_dim = 100
# embedding_vecs = GloVe(name='6B', dim=embedding_dim)

# embedding_dict = OrderedDict()
# embedding_dict.update({PAD_TOKEN: 1})
# embedding_dict.update({EOS_TOKEN: 1})
# embedding_dict.update({UNK_TOKEN: 1})
# embedding_dict.update(embedding_vecs.stoi)
# # min_freq=0 is a hack to read in the 0th token from embedding_vecs.stoi
# voc = vocab(embedding_dict, min_freq=0)
# voc.set_default_index(voc[UNK_TOKEN])

# # glove embeddings --> embedding module
# embedding = Embedding.from_pretrained(
#     embedding_vecs.vectors, freeze=True, padding_idx=voc[PAD_TOKEN]
# )

In [67]:
def build_vocab_from_texts(texts: Iterable[str], tokenizer: Callable, **kwargs) -> Vocab:
    tk_seqs = [tokenizer(s) for s in texts]
    voc = build_vocab_from_iterator(tk_seqs, **kwargs)
    voc.set_default_index(voc[UNK_TOKEN])
    return voc

def num_train_val_test_from_ratios(
    n: int, train_ratio: float = 0.7, val_ratio: float = 0.15
) -> Tuple[int]:
    num_train = int(train_ratio * n)
    num_val = int(val_ratio * n)
    num_test = n - num_train - num_val
    assert num_train > 0
    assert num_val > 0
    assert num_test > 0, num_test
    
    return num_train, num_val, num_test

assert num_train_val_test_from_ratios(10, 0.7, 0.15) == (7, 1, 2)

def oov_rate(seqs: Iterable[th.Tensor], voc: Vocab) -> float:
    num_oov = 0
    num_tokens = 0
    for i, item in enumerate(seqs):
        # item = d[0][0]
        num_oov += th.sum(item == voc[UNK_TOKEN]).item()
        num_tokens += th.sum(item != voc[PAD_TOKEN]).item()
    return num_oov / num_tokens

In [47]:
df = pd.read_csv("data/data_disaster_tweets.csv")
# voc = build_vocab_from_texts(df.text, tokenizer, specials=SPECIAL_TOKENS)

In [48]:
"""
split
build voc from train texts
oov_rate
"""

'\nsplit\nbuild voc from train texts\noov_rate\n'

In [56]:
train_set, val_set, test_set = random_split(
    df.text, num_train_val_test_from_ratios(len(df.text))
)

In [63]:
voc = build_vocab_from_texts(train_set, tokenizer, specials=SPECIAL_TOKENS)

In [70]:
nz_texts = [th.tensor(voc(tokenizer(text))) for text in test_set]
seqs = pad_sequence(nz_texts, padding_value=voc[PAD_TOKEN])
# oov_rate()

In [74]:
oov_rate(seqs, voc)

0.1549107804853937

# Dataset

In [6]:
class TwitterDisasterDataset(Dataset):
    """
    Disaster tweet data. Download: dataset from 
    https://www.kaggle.com/c/nlp-getting-started, rename the `train.csv` 
    as `data_disaster_tweets.csv`.
    """
    def __init__(
        self, tokenizer: Callable, voc: Vocab, df: Optional[pd.DataFrame] = None
    ) -> None:
        self.tokenizer = tokenizer
        self.voc = voc
        
        if not df:
            # Load data and remove unnecessary columns
            df = pd.read_csv("data/data_disaster_tweets.csv")
            df = df[["text", "target"]]
            df = df.reset_index(drop=True)
        
        # TODO: test
        # print("Warning: testing only with 1000 data points")
        # df = df.iloc[0:1000, :] 
        
        nz_texts = []  # numericalized_texts
        seq_lengths = []  # sequence lengths
        for text in tqdm(df.text):
            nz_text = th.tensor(self.voc(self.tokenizer(text)))
            nz_texts.append(nz_text)
            seq_lengths.append(len(nz_text))
        
        # shape of x is: T x B, where T is length of longest seq, B is batch size
        self.seqs = pad_sequence(nz_texts, padding_value=self.voc[PAD_TOKEN])
        self.seq_lengths = th.tensor(seq_lengths)
        self.targets = th.tensor(df.target).float()
        
    def __len__(self) -> int:
        return len(self.targets)
    
    def __getitem__(self, i: int) -> Tuple[Tuple[th.Tensor, int], float]:
        seq = self.seqs[:, i]
        seq_length = self.seq_lengths[i]
        targets = self.targets[i]
        return (seq, seq_length), targets

In [7]:
%%time
full_ds = TwitterDisasterDataset(tokenizer, voc)

num_train = int(0.7 * len(full_ds))
num_val = int(0.15 * len(full_ds))
num_test = len(full_ds) - num_train - num_val
batch_size = 64

train_ds, val_ds, test_ds = random_split(full_ds, [num_train, num_val, num_test])

train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False)  
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

100%|█████████████████████████████████████████████████████████| 7613/7613 [00:01<00:00, 4015.74it/s]


CPU times: user 1.93 s, sys: 122 ms, total: 2.05 s
Wall time: 2.08 s


In [8]:
test = next(iter(val_dl))
assert len(test) == 2
assert len(test[0]) == 2
assert isinstance(test[0][0], th.Tensor)
assert isinstance(test[0][1], th.Tensor)
assert isinstance(test[1], th.Tensor)

# Investigate: oov rate?

In [54]:
def oov_rate(dl: DataLoader, voc: Vocab) -> float:
    num_oov = 0
    num_tokens = 0
    for i, d in enumerate(dl):
        seqs = d[0][0]
        num_oov += th.sum(seqs == voc[UNK_TOKEN]).item()
        num_tokens += th.sum(seqs != voc[PAD_TOKEN]).item()
    return num_oov / num_tokens
        

In [55]:
compute_oov_rate(train_dl, voc)

0.3223282906769

In [56]:
compute_oov_rate(val_dl, voc)

0.33188698782111337

In [57]:
compute_oov_rate(test_dl, voc)

0.3217611780194061

In [46]:
num_train

5329

In [43]:
len(train_dl)

84

In [19]:
test = next(iter(train_dl))[0][0]

In [30]:
th.sum(test == voc[UNK_TOKEN]).item()
th.sum(test != voc[PAD_TOKEN]).item()

351