Investigate differences between using glove vocab vs using training-generated vocab

In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gzip
import json

import torch as th
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.nn import Embedding
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torchtext.vocab import vocab, Vocab, GloVe, build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torchmetrics import MeanSquaredError

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger

from typing import Callable, List, Tuple, Iterable
from functools import reduce
from collections import OrderedDict

from tqdm import tqdm

import optuna
from optuna.visualization import plot_parallel_coordinate, plot_contour
from optuna.importance import get_param_importances

import matplotlib
matplotlib.rcParams["figure.facecolor"] = "white"

import wandb
import os

In [2]:
PAD_TOKEN = "<pad>"
EOS_TOKEN = "<eos>"
UNK_TOKEN = "<unk>"
SPECIAL_TOKENS = (PAD_TOKEN, EOS_TOKEN, UNK_TOKEN)

In [3]:
# spacy tokenizer
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

# glove embeddings --> vocab
embedding_dim = 100
embedding_vecs = GloVe(name='6B', dim=embedding_dim)

embedding_dict = OrderedDict()
embedding_dict.update({PAD_TOKEN: 1})
embedding_dict.update({EOS_TOKEN: 1})
embedding_dict.update({UNK_TOKEN: 1})
embedding_dict.update(embedding_vecs.stoi)
# min_freq=0 is a hack to read in the 0th token from embedding_vecs.stoi
voc_glove = vocab(embedding_dict, min_freq=0)
voc_glove.set_default_index(voc_glove[UNK_TOKEN])

embedding = Embedding.from_pretrained(
    embedding_vecs.vectors, freeze=True, padding_idx=voc_glove[PAD_TOKEN]
)

In [24]:
def build_vocab_from_texts(
    texts: Iterable[str], tokenizer: Callable, specials=SPECIAL_TOKENS, **kwargs
) -> Vocab:
    tk_seqs = [tokenizer(s) for s in tqdm(texts)]
    voc = build_vocab_from_iterator(tk_seqs, specials=specials, **kwargs)
    voc.set_default_index(voc[UNK_TOKEN])
    return voc

def nums_from_fractions(total: int, fractions: Tuple[float]) -> Tuple[int]:
    """
    :param fractions: fractions of the total number. One elem must be -1, 
        which denotes "remaining"
    """
    assert fractions.count(-1) == 1, (
        "Must have exactly one occurence of -1 to denote a fraction of 'remaining' items"
    )
    nums = [int(total * f) if f != -1 else 0 for f in fractions]
    idx_remaining = fractions.index(-1)
    nums[idx_remaining] = total - sum(nums)
    assert all([elem >= 0 for elem in nums])
    return tuple(nums)

assert nums_from_fractions(100, [0.7, 0.3, -1]) == (70, 30, 0)
assert nums_from_fractions(100, [0.7, 0.155, -1]) == (70, 15, 15)
assert nums_from_fractions(100, [0.7, 0, -1]) == (70, 0, 30)
# tested that these lines raise error, as expected: 
# nums_from_fractions(100, [0.7, 0.3, -2])
# nums_from_fractions(100, [0.7, 0.5, -1])

def seqs_from_texts(texts: List[str], tokenizer: Callable, voc: Vocab) -> th.Tensor:
    """
    Returns padded sequences (numericalized texts)
    """
    nz_texts = [th.tensor(voc(tokenizer(text))) for text in texts]
    seqs = pad_sequence(nz_texts, padding_value=voc[PAD_TOKEN])
    return seqs

def count_oov_rate(seqs: Iterable[th.Tensor], voc: Vocab) -> float:
    num_oov = 0
    num_tokens = 0
    for i, item in enumerate(seqs):
        # item = d[0][0]
        num_oov += th.sum(item == voc[UNK_TOKEN]).item()
        num_tokens += th.sum(item != voc[PAD_TOKEN]).item()
    return num_oov / num_tokens

# Disaster tweets dataset

In [63]:
df = pd.read_csv("data/data_disaster_tweets.csv")
texts_train, texts_val, texts_test = random_split(
    df.text, nums_from_fractions(len(df.text), [0.7, 0.15, -1])
)

In [65]:
len(texts_train), len(texts_val), len(texts_test)

(5329, 1141, 1143)

In [66]:
# oov rate, using training-generated vocab
voc_train = build_vocab_from_texts(texts_train, tokenizer)
seqs = seqs_from_texts(texts_test, tokenizer, voc_train)
count_oov_rate(seqs, voc_train)

100%|████████████████████████████████████████████████████████| 5329/5329 [00:00<00:00, 16799.01it/s]


0.1528582850289826

In [67]:
# oov rate, using glove vocab
seqs = seqs_from_texts(texts_test, tokenizer, voc_glove)
count_oov_rate(seqs, voc_glove)

0.3126124325404757

In [None]:
for i in range(5):
    print(df.text[i][:500], "\n")

# Sentiment140 dataset

In [69]:
df = pd.read_csv("data/data_twitter_sentiment.csv", header=None, encoding='latin-1')
df = df.rename(columns={
    0: "sentiment_raw",
    5: "text",
})
texts_train, texts_val, texts_test = random_split(
    df.text, nums_from_fractions(len(df.text), [0.7, 0.15, -1])
)

In [71]:
len(texts_train), len(texts_val), len(texts_test)

(1120000, 240000, 240000)

In [72]:
# oov rate, using training-generated vocab
voc_train = build_vocab_from_texts(texts_train, tokenizer)
seqs = seqs_from_texts(texts_test, tokenizer, voc_train)
count_oov_rate(seqs, voc_train)

100%|██████████████████████████████████████████████████| 1120000/1120000 [01:04<00:00, 17379.70it/s]


0.02634319577525798

In [73]:
# oov rate, using glove vocab
seqs = seqs_from_texts(texts_test, tokenizer, voc_glove)
count_oov_rate(seqs, voc_glove)

0.2038886550684923

In [82]:
for i in range(5):
    print(df.text[i][:500], "\n")

AttributeError: 'DataFrame' object has no attribute 'text'

# Amazon reviews dataset

In [74]:
# used example code from 
# https://colab.research.google.com/drive/1Zv6MARGQcrBbLHyjPVVMZVnRWsRnVMpV#scrollTo=7igYuRaV4bF7

data = []
with gzip.open('data/data_reviews_Office_Products_5.json.gz') as f:
    for l in tqdm(f):
        data.append(json.loads(l.strip()))
    
df = pd.DataFrame.from_dict(data)
df = df.rename(columns={
    "reviewText": "text",
})

texts_train, texts_val, texts_test = random_split(
    df.text, nums_from_fractions(len(df.text), [0.7, 0.15, -1])
)

53258it [00:01, 45504.18it/s]


In [76]:
len(texts_train), len(texts_val), len(texts_test)

(37280, 7988, 7990)

In [77]:
# oov rate, using training-generated vocab
voc_train = build_vocab_from_texts(texts_train, tokenizer)
seqs = seqs_from_texts(texts_test, tokenizer, voc_train)
count_oov_rate(seqs, voc_train)

100%|███████████████████████████████████████████████████████| 37280/37280 [00:12<00:00, 2893.54it/s]


0.007513979454133859

In [78]:
# oov rate, using glove vocab
seqs = seqs_from_texts(texts_test, tokenizer, voc_glove)
count_oov_rate(seqs, voc_glove)

0.11351353328188943

In [None]:
for i in range(5):
    print(df.text[i][:500], "\n")

# Movie reviews

In [81]:
basepath = "data/stanford_movie_reviews/aclImdb/"

labels = {'pos': 1, 'neg': 0}
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in tqdm(sorted(os.listdir(path))):
            with open(os.path.join(path, file),
                      'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]],
                           ignore_index=True)
df.columns = ['review', 'sentiment']


100%|████████████████████████████████████████████████████████| 12500/12500 [00:28<00:00, 440.05it/s]
100%|████████████████████████████████████████████████████████| 12500/12500 [00:35<00:00, 356.94it/s]
100%|████████████████████████████████████████████████████████| 12500/12500 [00:40<00:00, 307.04it/s]
100%|████████████████████████████████████████████████████████| 12500/12500 [00:45<00:00, 275.46it/s]


In [83]:
df = df.rename(columns={
    "review": "text",
})

texts_train, texts_val, texts_test = random_split(
    df.text, nums_from_fractions(len(df.text), [0.7, 0.15, -1])
)

In [84]:
len(texts_train), len(texts_val), len(texts_test)

(35000, 7500, 7500)

In [85]:
# oov rate, using training-generated vocab
voc_train = build_vocab_from_texts(texts_train, tokenizer)
seqs = seqs_from_texts(texts_test, tokenizer, voc_train)
count_oov_rate(seqs, voc_train)

100%|███████████████████████████████████████████████████████| 35000/35000 [00:24<00:00, 1404.71it/s]


0.008642635768701648

In [89]:
len(voc_train.get_itos())

146580

In [86]:
# oov rate, using glove vocab
seqs = seqs_from_texts(texts_test, tokenizer, voc_glove)
count_oov_rate(seqs, voc_glove)

0.13025608124778243

In [90]:
len(voc_glove.get_itos())

400003

In [87]:
for i in range(5):
    print(df.text[i][:500], "\n")

I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the  

Actor turned director Bill Paxton follows up his promising debut, the Gothic-horror "Frailty", with this family friendly sports drama about the 1913 U.S. Open where a young American caddy rises from his humble background to play against his Bristish idol in what was dubbed as "The Greatest Game Ever Played." I'm no fan of golf, and these scrappy underdog sports flicks are a dime a dozen (most recently done to grand effect with "Miracle" and "Cinderella Man"), but some how this film was enthra