## Small transformer

In [None]:
from proj.data.data import *
from proj.main import *
from proj.models import all_tokenizers
import os
import pandas as pd

# Run configuration for the small DistilBERT experiment.
bs = 256
model = "distilBertSmall"
model_name = "distilBertSmall"

# Load the pre-split dataset and break it into one frame per split.
csv_path = os.path.join(DATA_DIR, "train_test_split_dataset.csv")
subset_df = pd.read_csv(csv_path)
dfs = split_col(subset_df)

# Models without an entry in all_tokenizers run with tokenizer=None.
tokenizer = all_tokenizers[model]() if model in all_tokenizers else None

# One dataloader per split. Only the first split (index 0, presumably the
# training split -- confirm) is drawn through a weighted sampler built from
# its labels; the remaining splits use no sampler.
dls = []
sampler = None
for i, d in enumerate(dfs):
    ds = NewsDataset(d, tokenizer=tokenizer)
    if i == 0:
        sampler = get_weighted_sampler(ds.labels())
    else:
        sampler = None
    dl = to_dataloader(ds, bs, sampler=sampler, drop_last=False)
    dls.append(dl)

# Start from the project defaults and override only what this run changes.
hp = dict(DEFAULT_HP)
hp.update({"model": model, "lr": 2e-4, "epochs": 5})
trainer = Trainer("sample", model_name, dls, hp, bs)

In [None]:
# Train the model configured in the previous cell. Trainer.one_cycle is
# project code not shown here -- presumably it runs the training schedule
# described by hp; confirm.
trainer.one_cycle()

## Embed training

In [None]:
from proj.data.data import *
from proj.main import *
from proj.models import all_tokenizers
import os
import pandas as pd

# Run configuration for the DistilBERT experiment fed with embed=True datasets.
bs = 256
model = "distilBertEmbed"
model_name = "distilBertEmbed"

# Load the pre-split dataset and break it into one frame per split.
csv_path = os.path.join(DATA_DIR, "train_test_split_dataset.csv")
subset_df = pd.read_csv(csv_path)
# NOTE(review): the other experiment cells call split_col() here; confirm
# that split() is the intended helper for this run.
dfs = split(subset_df)

# Models without an entry in all_tokenizers run with tokenizer=None.
tokenizer = all_tokenizers[model]() if model in all_tokenizers else None

# One dataloader per split. Only the first split (index 0, presumably the
# training split -- confirm) is drawn through a weighted sampler built from
# its labels; the remaining splits use no sampler.
dls = []
sampler = None
for i, d in enumerate(dfs):
    # Datasets are built with embed=True (an earlier variant of this cell
    # passed useBigram=True instead).
    ds = NewsDataset(d, tokenizer=tokenizer, embed=True)
    if i == 0:
        sampler = get_weighted_sampler(ds.labels())
    else:
        sampler = None
    dl = to_dataloader(ds, bs, sampler=sampler, drop_last=False)
    dls.append(dl)

# Start from the project defaults and override only what this run changes.
hp = dict(DEFAULT_HP)
hp.update({"model": model, "lr": 2e-4, "epochs": 5})
trainer = Trainer("sample", model_name, dls, hp, bs)

In [None]:
# Train the model configured in the previous cell. Trainer.one_cycle is
# project code not shown here -- presumably it runs the training schedule
# described by hp; confirm.
trainer.one_cycle()

## POS Tagging

In [None]:
from proj.data.data import *
from proj.main import *
from proj.models import all_tokenizers
import os
import pandas as pd

# Run configuration for the "distilBertPOS" experiment.
bs = 256
model = "distilBertPOS"
model_name = "distilBertPOS"

# Load the pre-split dataset and break it into one frame per split.
csv_path = os.path.join(DATA_DIR, "train_test_split_dataset.csv")
subset_df = pd.read_csv(csv_path)
dfs = split_col(subset_df)

# Models without an entry in all_tokenizers run with tokenizer=None.
tokenizer = all_tokenizers[model]() if model in all_tokenizers else None

# One dataloader per split. Only the first split (index 0, presumably the
# training split -- confirm) is drawn through a weighted sampler built from
# its labels; the remaining splits use no sampler.
dls = []
sampler = None
for i, d in enumerate(dfs):
    # Default dataset options (an earlier variant of this cell passed
    # useBigram=True).
    ds = NewsDataset(d, tokenizer=tokenizer)
    if i == 0:
        sampler = get_weighted_sampler(ds.labels())
    else:
        sampler = None
    dl = to_dataloader(ds, bs, sampler=sampler, drop_last=False)
    dls.append(dl)

# Start from the project defaults and override only what this run changes.
hp = dict(DEFAULT_HP)
hp.update({"model": model, "lr": 2e-4, "epochs": 5})
trainer = Trainer("sample", model_name, dls, hp, bs)

In [None]:
# Train the model configured in the previous cell. Trainer.one_cycle is
# project code not shown here -- presumably it runs the training schedule
# described by hp; confirm.
trainer.one_cycle()

## Training augmentation

In [None]:
from proj.data.data import *
from proj.main import *
from proj.models import all_tokenizers
import os
import pandas as pd

# Run configuration for the DistilBERT experiment with dataset augmentation.
# The architecture key ("model") and the run name ("model_name") differ here.
bs = 256
model = "distilBert"
model_name = "distilBert_synAugment_0.3prob_2"

# Load the pre-split dataset and break it into one frame per split.
csv_path = os.path.join(DATA_DIR, "train_test_split_dataset.csv")
subset_df = pd.read_csv(csv_path)
dfs = split_col(subset_df)

# Models without an entry in all_tokenizers run with tokenizer=None.
tokenizer = all_tokenizers[model]() if model in all_tokenizers else None

# One dataloader per split. The first split (index 0, presumably the
# training split -- confirm) is the only one built with augment=True and
# the only one drawn through a weighted sampler.
dls = []
sampler = None
for i, d in enumerate(dfs):
    ds = NewsDataset(d, tokenizer=tokenizer, augment=(i == 0))
    if i == 0:
        sampler = get_weighted_sampler(ds.labels())
    else:
        sampler = None
    dl = to_dataloader(ds, bs, sampler=sampler, drop_last=False)
    dls.append(dl)

# Start from the project defaults and override only what this run changes.
hp = dict(DEFAULT_HP)
hp.update({"model": model, "lr": 2e-4, "epochs": 5})
trainer = Trainer("sample", model_name, dls, hp, bs)

In [None]:
# Train the model configured in the previous cell. Trainer.one_cycle is
# project code not shown here -- presumably it runs the training schedule
# described by hp; confirm.
trainer.one_cycle()

## LSTM Attention

In [None]:
from proj.data.data import *
from proj.main import *
from proj.models import all_tokenizers
import os
import pandas as pd
from torch.nn.utils.rnn import pad_packed_sequence

# Run configuration for the attention-LSTM experiment.
bs = 128
model = "lstmAttention"
model_name = "lstmAttention_embed_train"

# Load the pre-split dataset and break it into one frame per split.
csv_path = os.path.join(DATA_DIR, "train_test_split_dataset.csv")
subset_df = pd.read_csv(csv_path)
dfs = split_col(subset_df)

# Models without an entry in all_tokenizers run with tokenizer=None.
tokenizer = all_tokenizers[model]() if model in all_tokenizers else None

# One dataloader per split. Only the first split (index 0, presumably the
# training split -- confirm) is drawn through a weighted sampler built from
# its labels; the remaining splits use no sampler.
# NOTE(review): drop_last=True is passed for every split; if to_dataloader
# forwards it to the DataLoader, a trailing partial batch is discarded from
# the non-training loaders too -- confirm that is intended.
dls = []
sampler = None
for i, d in enumerate(dfs):
    ds = NewsDataset(d, tokenizer=tokenizer)
    if i == 0:
        sampler = get_weighted_sampler(ds.labels())
    else:
        sampler = None
    dl = to_dataloader(ds, bs, sampler=sampler, drop_last=True)
    dls.append(dl)

# Start from the project defaults and override only what this run changes.
hp = dict(DEFAULT_HP)
hp.update({"model": model, "lr": 2e-4, "epochs": 10})
trainer = Trainer("sample", model_name, dls, hp, bs)

In [None]:
# Train the model configured in the previous cell. Trainer.one_cycle is
# project code not shown here -- presumably it runs the training schedule
# described by hp; confirm.
trainer.one_cycle()

In [None]:
from proj.data.data import *
from proj.main import *
from proj.models import all_tokenizers
import os
import pandas as pd

# Run configuration for the plain LSTM experiment.
bs = 128
model = "lstm"
model_name = "lstm_balanced"

# Load the pre-split dataset and break it into one frame per split.
csv_path = os.path.join(DATA_DIR, "train_test_split_dataset.csv")
subset_df = pd.read_csv(csv_path)
dfs = split_col(subset_df)

# Models without an entry in all_tokenizers run with tokenizer=None.
tokenizer = all_tokenizers[model]() if model in all_tokenizers else None

# One dataloader per split. Only the first split (index 0, presumably the
# training split -- confirm) is drawn through a weighted sampler built from
# its labels; the remaining splits use no sampler.
# NOTE(review): drop_last=True is passed for every split; if to_dataloader
# forwards it to the DataLoader, a trailing partial batch is discarded from
# the non-training loaders too -- confirm that is intended.
dls = []
sampler = None
for i, d in enumerate(dfs):
    ds = NewsDataset(d, tokenizer=tokenizer)
    if i == 0:
        sampler = get_weighted_sampler(ds.labels())
    else:
        sampler = None
    dl = to_dataloader(ds, bs, sampler=sampler, drop_last=True)
    dls.append(dl)

# Start from the project defaults and override only what this run changes.
# This run uses lr=2e-2, whereas the other cells in this notebook use 2e-4.
hp = dict(DEFAULT_HP)
hp.update({"model": model, "lr": 2e-2, "epochs": 10})
trainer = Trainer("sample", model_name, dls, hp, bs)

In [None]:
# Train the model configured in the previous cell. Trainer.one_cycle is
# project code not shown here -- presumably it runs the training schedule
# described by hp; confirm.
trainer.one_cycle()

## LSTM Bigram

In [None]:
from proj.data.data import *
from proj.main import *
from proj.models import all_tokenizers
import os
import pandas as pd

# Run configuration for the bigram LSTM experiment.
bs = 256
model = "lstmBigram"

# Load the pre-split dataset and break it into one frame per split.
csv_path = os.path.join(DATA_DIR, "train_test_split_dataset.csv")
subset_df = pd.read_csv(csv_path)
dfs = split_col(subset_df)

# Models without an entry in all_tokenizers run with tokenizer=None.
tokenizer = all_tokenizers[model]() if model in all_tokenizers else None

# One dataloader per split. Only the first split (index 0, presumably the
# training split -- confirm) is drawn through a weighted sampler built from
# its labels; the remaining splits use no sampler.
# NOTE(review): drop_last=True is passed for every split; if to_dataloader
# forwards it to the DataLoader, a trailing partial batch is discarded from
# the non-training loaders too -- confirm that is intended.
dls = []
for i, d in enumerate(dfs):
    ds = NewsDataset(d, tokenizer=tokenizer)
    if i == 0:
        sampler = get_weighted_sampler(ds.labels())
    else:
        sampler = None
    dl = to_dataloader(ds, bs, sampler=sampler, drop_last=True)
    dls.append(dl)

# Start from the project defaults and override only what this run changes.
hp = dict(DEFAULT_HP)
hp.update({"model": model, "lr": 2e-4, "epochs": 10})
# The run name is passed inline so the notebook-level model_name is untouched.
trainer = Trainer("sample", "lstm_bigram", dls, hp, bs)

In [None]:
# Train the model configured in the previous cell. Trainer.one_cycle is
# project code not shown here -- presumably it runs the training schedule
# described by hp; confirm.
trainer.one_cycle()

In [None]:
# Warm-start the current trainer's model from the "lstm_balanced" checkpoint,
# skipping the checkpoint's embedding matrix.
import torch  # explicit, so this cell does not rely on a star import for torch

weights_path = os.path.join(WEIGHTS_DIR, "deep learning", "lstm_balanced.pkl")

# map_location remaps the stored tensors onto the trainer's device, so a
# checkpoint written on a GPU machine still loads on a CPU-only one.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
sd = torch.load(weights_path, map_location=trainer.device)

# Drop the checkpoint's embedding weights so the model keeps its own
# (presumably the embedding shape differs between the two models -- confirm).
# pop() with a default tolerates checkpoints that do not carry the key.
sd.pop('embedding.weight', None)

# strict=False accepts a partial match; print the result so missing or
# unexpected keys are visible instead of being silently ignored.
load_result = trainer.model.load_state_dict(sd, strict=False)
print(load_result)

trainer.model.to(trainer.device)