## LSTM with attention

In [1]:
from proj.data.data import *
from proj.main import *
from proj.constants import *
from proj.models import all_tokenizers
import os
import pandas as pd

# Load the dataset CSV from DATA_DIR. The `phase` column of this frame is what
# split_col (below) uses to separate train / dev / test rows.
subset_df = pd.read_csv(os.path.join(DATA_DIR, "train_test_split_dataset.csv"))

def split_col(df):
    """Partition ``df`` by the value of its ``phase`` column.

    Args:
        df: DataFrame with a ``phase`` column.

    Returns:
        A 3-tuple ``(train, val, test)`` holding the rows whose ``phase``
        equals ``'train'``, ``'dev'`` and ``'test'`` respectively. Rows with
        any other ``phase`` value appear in none of the three frames.
    """
    phase = df['phase']
    return tuple(df[phase == name] for name in ('train', 'dev', 'test'))

# --- Run configuration ---
bs = 256
model = "lstmAttention"

# Use the tokenizer registered for this model name; stay with None when the
# model has no entry in all_tokenizers.
tokenizer = all_tokenizers[model]() if model in all_tokenizers else None

# split_col returns the frames in (train, dev, test) order.
dfs = split_col(subset_df)

# Build one dataloader per split, in the same order as `dfs`.
dls = []
for i, d in enumerate(dfs):
    ds = NewsDataset(d, tokenizer=tokenizer)
    if i == 0:
        # Only the first (train) split gets a sampler built from its labels.
        sampler = get_weighted_sampler(ds.labels())
    else:
        sampler = None
    # NOTE(review): drop_last=True is passed for the dev and test loaders as
    # well, not just for training -- confirm that is intended.
    dl = to_dataloader(ds, bs, sampler=sampler, drop_last=True)
    dls.append(dl)

# Start from the project defaults and override the per-run settings.
hp = dict(DEFAULT_HP)
hp.update({"model": model, "lr": 2e-4, "epochs": 10})
trainer = Trainer("deep learning", "lstm_attn", dls, hp, bs)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ngbra\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  "num_layers={}".format(dropout, num_layers))


In [2]:
# Restore previously saved weights into the trainer.
# NOTE(review): this trainer was created with the name "lstm_attn" and model
# "lstmAttention", but the file loaded here is "lstm_balanced.pkl" -- confirm
# this is the intended checkpoint for the attention model.
trainer.load_weights("lstm_balanced.pkl")

In [14]:
# Manually set the learning rate of the optimizer's first parameter group to
# 2e-4 (the same value as hp["lr"]). Any further parameter groups are left
# untouched.
trainer.opt.param_groups[0]['lr'] = 2e-4

In [12]:
# Inspect the scheduler's step_size (the recorded output is 2). This is a
# display-only cell; it changes no state.
trainer.scheduler.step_size

2

In [15]:
# Run training. The recorded output shows, for each epoch, a pass over 143
# train batches followed by 20 val batches, with loss / acc / f1Score reported
# for both.
trainer.one_cycle()

  0%|          | 0/143 [00:00<?, ?it/s]epoch number: 0
100%|██████████| 143/143 [00:22<00:00,  6.50it/s]
  0%|          | 0/20 [00:00<?, ?it/s]
epoch train info: loss:1.6976169347763062, acc:0.7632211538461539, f1Score:0.7634022645767745
100%|██████████| 20/20 [00:13<00:00,  1.44it/s]
  0%|          | 0/143 [00:00<?, ?it/s]
epoch val info: loss:1.7456220388412476, acc:0.71484375, f1Score:0.686134552925911
epoch number: 1
100%|██████████| 143/143 [00:21<00:00,  6.76it/s]
  0%|          | 0/20 [00:00<?, ?it/s]
epoch train info: loss:1.6937229633331299, acc:0.7671820367132867, f1Score:0.7670261540831824
100%|██████████| 20/20 [00:13<00:00,  1.44it/s]
  0%|          | 0/143 [00:00<?, ?it/s]
epoch val info: loss:1.7375907897949219, acc:0.7236328125, f1Score:0.6927312594523534
epoch number: 2
100%|██████████| 143/143 [00:21<00:00,  6.77it/s]
  0%|          | 0/20 [00:00<?, ?it/s]
epoch train info: loss:1.6910780668258667, acc:0.7700502622377622, f1Score:0.7701585975469705
100%|██████████| 20

### Balanced LSTM Training

In [1]:
from proj.data.data import *
from proj.main import *
from proj.constants import *
from proj.models import all_tokenizers
import os
import pandas as pd

# Load the dataset CSV from DATA_DIR. The `phase` column of this frame is what
# split_col (below) uses to separate train / dev / test rows.
subset_df = pd.read_csv(os.path.join(DATA_DIR, "train_test_split_dataset.csv"))

def split_col(df):
    """Partition ``df`` by the value of its ``phase`` column.

    Args:
        df: DataFrame with a ``phase`` column.

    Returns:
        A 3-tuple ``(train, val, test)`` holding the rows whose ``phase``
        equals ``'train'``, ``'dev'`` and ``'test'`` respectively. Rows with
        any other ``phase`` value appear in none of the three frames.
    """
    phase = df['phase']
    return tuple(df[phase == name] for name in ('train', 'dev', 'test'))

# split_col returns the frames in (train, dev, test) order.
dfs = split_col(subset_df)
dls = []
bs = 256
model = "lstm"
tokenizer = None

# Use the tokenizer registered for this model name, if there is one.
if model in all_tokenizers:
    tokenizer = all_tokenizers[model]()

# Build one dataloader per split, in the same order as `dfs`.
for i, d in enumerate(dfs):
    ds = NewsDataset(d, tokenizer=tokenizer)
    # Only the first (train) split gets a sampler built from its labels.
    sampler = get_weighted_sampler(ds.labels()) if i == 0 else None
    # NOTE(review): drop_last=True is passed for the dev and test loaders as
    # well, not just for training -- confirm that is intended.
    dl = to_dataloader(ds, bs, sampler=sampler, drop_last=True)
    dls.append(dl)

# Start from the project defaults and override the per-run settings.
hp = {**DEFAULT_HP, "model": model, "lr":2e-4, "epochs":10}
# The trainer must be created here: every later cell in this section uses
# `trainer`, so leaving this line commented out makes a fresh-kernel
# "Restart & Run All" fail with NameError.
trainer = Trainer("deep learning", "lstm_balanced", dls, hp, bs)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ngbra\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [43]:
# Restore previously saved weights from "lstm_balanced.pkl" into the trainer.
# How the file name is resolved to a path is decided inside
# Trainer.load_weights (not visible here).
trainer.load_weights("lstm_balanced.pkl")

In [32]:
# Manual overrides applied before the next training run:
# - scheduler gamma set to 0.5
# - learning rate of the optimizer's first parameter group set to 3e-3, which
#   replaces the 2e-4 configured in hp["lr"]. Other parameter groups, if any,
#   are left untouched.
trainer.scheduler.gamma = 0.5
trainer.opt.param_groups[0]['lr'] = 3e-3

In [33]:
# Run training. The recorded output shows, for each epoch, a pass over 143
# train batches followed by 20 val batches, with loss / acc / f1Score reported
# for both.
trainer.one_cycle()

  0%|          | 0/143 [00:00<?, ?it/s]epoch number: 0
100%|██████████| 143/143 [00:16<00:00,  8.59it/s]
  0%|          | 0/20 [00:00<?, ?it/s]
epoch train info: loss:1.675702691078186, acc:0.7857025786713286, f1Score:0.7875331348136794
100%|██████████| 20/20 [00:08<00:00,  2.47it/s]
  0%|          | 0/143 [00:00<?, ?it/s]
epoch val info: loss:1.7290502786636353, acc:0.7310546875, f1Score:0.6914131040822501
epoch number: 1
100%|██████████| 143/143 [00:12<00:00, 11.75it/s]
  0%|          | 0/20 [00:00<?, ?it/s]
epoch train info: loss:1.6732827425003052, acc:0.7883522727272727, f1Score:0.7897412882438359
100%|██████████| 20/20 [00:08<00:00,  2.39it/s]
  0%|          | 0/143 [00:00<?, ?it/s]
epoch val info: loss:1.7258352041244507, acc:0.7349609375, f1Score:0.6943042482518397
epoch number: 2
100%|██████████| 143/143 [00:12<00:00, 11.73it/s]
  0%|          | 0/20 [00:00<?, ?it/s]
epoch train info: loss:1.6690218448638916, acc:0.792340472027972, f1Score:0.7935759035654084
100%|██████████| 2

In [48]:
# Collect predictions for dataloader index 2. `dls` is built in
# (train, dev, test) order, and the recorded output reports "epoch test info"
# over 41 batches.
# The second positional argument is False -- presumably the same flag that is
# passed as `toSave` elsewhere in this file; confirm against Trainer.getPreds.
# preds = trainer.getPreds(-1, False)
testPreds = trainer.getPreds(2, False)

100%|██████████| 41/41 [00:09<00:00,  4.47it/s]
epoch test info: loss:1.7225946187973022, acc:0.7384717987804879, f1Score:0.6999938135651691



In [73]:
from proj.main import *

# Attach the decoded predictions to the test frame and write it to CSV.
# NOTE(review): `dfCopy` and `predCategories` are defined in the cell printed
# directly after this one, so this cell only works when that cell has already
# been executed. Running the notebook top to bottom on a fresh kernel will
# fail here.
dfCopy[PRED_COL] = predCategories
# Row-wise flag: True where the predicted category equals the label column.
dfCopy["correct"] = dfCopy[PRED_COL] == dfCopy[Y_COL]
# Output file is named after the trainer's model_name, under PREDS_DIR.
csvPath = os.path.join(
    PREDS_DIR, f"{trainer.model_name}_test_preds.csv")
dfCopy.to_csv(csvPath, index=False)

In [72]:
# Align the test predictions with the rows of the test DataFrame and decode
# the predicted class indices into category names.
# NOTE(review): the name suggests getDF() returns a copy -- confirm, since the
# export cell assigns new columns to this frame.
dfCopy = dls[2].dataset.getDF()
# The loader can yield fewer predictions than there are rows (the test loader
# in this section is built with drop_last=True). Pad the tail with the
# sentinel -1 so the lengths match.
if len(testPreds) < len(dfCopy):
    extra = len(dfCopy) - len(testPreds)
    testPreds = torch.cat([testPreds, torch.tensor([-1] * extra)])
# Decode indices to category names. The -1 sentinel must not be used as an
# index: on a sequence CATEGORY_SUBSET[-1] silently returns the LAST category,
# which would label never-predicted rows with a real class. Map it to None so
# those rows carry no prediction.
predCategories = [
    CATEGORY_SUBSET[l] if l >= 0 else None
    for l in testPreds.tolist()
]

In [25]:
# Run training. The recorded output shows, for each epoch, a pass over 143
# train batches followed by 20 val batches.
# NOTE(review): this cell's execution count (25) is lower than that of the
# cells printed above it in this section, so this output comes from an earlier
# point in the session than its position in the notebook suggests.
trainer.one_cycle()

  0%|          | 0/143 [00:00<?, ?it/s]epoch number: 0
100%|██████████| 143/143 [00:13<00:00, 10.94it/s]
  0%|          | 0/20 [00:00<?, ?it/s]
epoch train info: loss:1.7085702419281006, acc:0.7524857954545454, f1Score:0.7529745742038847
100%|██████████| 20/20 [00:08<00:00,  2.45it/s]
  0%|          | 0/143 [00:00<?, ?it/s]
epoch val info: loss:1.756931185722351, acc:0.703125, f1Score:0.6783157826457091
epoch number: 1
100%|██████████| 143/143 [00:12<00:00, 11.57it/s]
  0%|          | 0/20 [00:00<?, ?it/s]
epoch train info: loss:1.7104846239089966, acc:0.7502731643356644, f1Score:0.7502623627367463
100%|██████████| 20/20 [00:07<00:00,  2.51it/s]
  0%|          | 0/143 [00:00<?, ?it/s]
epoch val info: loss:1.7268766164779663, acc:0.7333984375, f1Score:0.6884326424116878
epoch number: 2
100%|██████████| 143/143 [00:11<00:00, 12.00it/s]
  0%|          | 0/20 [00:00<?, ?it/s]
epoch train info: loss:1.7139006853103638, acc:0.7474049388111889, f1Score:0.7479018842502474
100%|██████████| 20/2

## DistilBERT Training

In [2]:
from proj.data.data import *
from proj.main import *
from proj.constants import *
from proj.models import all_tokenizers
import os
import pandas as pd

# Load the dataset CSV from DATA_DIR. The `phase` column of this frame is what
# split_col (below) uses to separate train / dev / test rows.
subset_df = pd.read_csv(os.path.join(DATA_DIR, "train_test_split_dataset.csv"))
# Disabled shortcut: uncomment to keep only the first 2000 rows.
# subset_df = subset_df.iloc[:2000, :]

def split_col(df):
    """Partition ``df`` by the value of its ``phase`` column.

    Args:
        df: DataFrame with a ``phase`` column.

    Returns:
        A 3-tuple ``(train, val, test)`` holding the rows whose ``phase``
        equals ``'train'``, ``'dev'`` and ``'test'`` respectively. Rows with
        any other ``phase`` value appear in none of the three frames.
    """
    phase = df['phase']
    return tuple(df[phase == name] for name in ('train', 'dev', 'test'))

# --- Run configuration ---
bs = 256
model = "distilBert"

# Use the tokenizer registered for this model name; stay with None when the
# model has no entry in all_tokenizers.
tokenizer = all_tokenizers[model]() if model in all_tokenizers else None

# split_col returns the frames in (train, dev, test) order.
dfs = split_col(subset_df)

# Build one dataloader per split, in the same order as `dfs`. Unlike a
# drop_last=True loader, these keep a final partial batch.
dls = []
for i, d in enumerate(dfs):
    ds = NewsDataset(d, tokenizer=tokenizer)
    if i == 0:
        # Only the first (train) split gets a sampler built from its labels.
        sampler = get_weighted_sampler(ds.labels())
    else:
        sampler = None
    dl = to_dataloader(ds, bs, sampler=sampler, drop_last=False)
    dls.append(dl)

# Start from the project defaults and override the per-run settings.
hp = dict(DEFAULT_HP)
hp.update({"model": model, "lr": 2e-4})
trainer = Trainer("deep learning", "distilBert_og", dls, hp, bs)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [2]:
# Restore previously saved weights from "distilBert_og.pkl" into the trainer.
# How the file name is resolved to a path is decided inside
# Trainer.load_weights (not visible here).
trainer.load_weights("distilBert_og.pkl")

In [4]:
# Build a second loader over the train split (dfs[0]) with no sampler, and add
# it to the END of the trainer's loader list.
# NOTE(review): this cell is not idempotent -- each run appends another loader
# to trainer.dls. It also rebinds the global names `ds` and `dl`.
ds = NewsDataset(dfs[0], tokenizer=tokenizer)
dl = to_dataloader(ds, bs, sampler=None, drop_last=False)
trainer.dls.append(dl)

In [5]:
# Collect predictions with index -1 and toSave=True. The recorded output
# reports "epoch train info" over 144 batches.
# Presumably -1 selects the last loader in trainer.dls, and toSave makes
# getPreds persist its result -- confirm both against Trainer.getPreds.
preds = trainer.getPreds(-1, toSave=True)

100%|██████████| 144/144 [00:40<00:00,  3.55it/s]

epoch train info: loss:0.04168923571705818, acc:0.9861924913194444, f1Score:0.9889603945898806


In [3]:
# Call the trainer's freeze hook with True before training.
# NOTE(review): which parameters this affects is decided inside Trainer.freeze
# (not visible here) -- presumably it freezes part of the model; confirm.
trainer.freeze(True)

In [5]:
# Run training. The recorded output shows, for each epoch, a pass over 144
# train batches followed by 21 val batches, with loss / acc / f1Score reported
# for both.
trainer.one_cycle()

  0%|          | 0/144 [00:00<?, ?it/s]epoch number: 0
100%|██████████| 144/144 [01:02<00:00,  2.30it/s]
  0%|          | 0/21 [00:00<?, ?it/s]
epoch train info: loss:0.06283114105463028, acc:0.9781901041666666, f1Score:0.9802947362823728
100%|██████████| 21/21 [00:29<00:00,  1.40s/it]
  0%|          | 0/144 [00:00<?, ?it/s]
epoch val info: loss:0.7902320027351379, acc:0.8143601190476191, f1Score:0.7972760329498467
epoch number: 1
100%|██████████| 144/144 [01:02<00:00,  2.30it/s]
  0%|          | 0/21 [00:00<?, ?it/s]
epoch train info: loss:0.0553152859210968, acc:0.9810655381944444, f1Score:0.9832098484763951
100%|██████████| 21/21 [00:29<00:00,  1.41s/it]
  0%|          | 0/144 [00:00<?, ?it/s]
epoch val info: loss:0.8516273498535156, acc:0.8223586309523809, f1Score:0.8086717794087275
epoch number: 2
100%|██████████| 144/144 [01:03<00:00,  2.28it/s]
  0%|          | 0/21 [00:00<?, ?it/s]
epoch train info: loss:0.04697829484939575, acc:0.9836154513888888, f1Score:0.9858222504106312
10

In [16]:
from torch.optim import AdamW
# Build an AdamW optimizer that applies weight decay (0.01) to every parameter
# except biases and layer-norm weights, which get no decay.
# NOTE(review): this rebinds `model`, which the setup cell uses for the model
# NAME string; from here on `model` is the module held by the trainer.
model = trainer.model
# Substrings that mark a parameter as exempt from weight decay. DistilBERT
# names its transformer-block norms `sa_layer_norm` / `output_layer_norm` and
# only the embedding norm `LayerNorm`, so matching 'LayerNorm.weight' alone
# would leave the per-layer norm weights in the decayed group. Both spellings
# are listed; an entry that matches nothing is harmless.
no_decay = ['bias', 'LayerNorm.weight', 'layer_norm.weight']
optimizer_grouped_parameters = [
    # Group 1: everything that matches none of the no_decay substrings.
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    # Group 2: biases and layer-norm weights.
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# NOTE(review): nothing visible in this part of the notebook hands `optimizer`
# to the trainer (e.g. via trainer.opt) -- confirm it is actually used.
optimizer = AdamW(optimizer_grouped_parameters, lr=hp["lr"])