# Testing Notebook

In [11]:
import numpy as np
import pandas as pd
import torch
import transformers

# from koila import lazy
from sklearn import metrics
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers.file_utils import PaddingStrategy

import config

# from src import config

In [12]:
class BERTBaseUncased(nn.Module):
    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.bert = transformers.BertModel.from_pretrained("bert-base-uncased")
        self.bert_drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(768, 1)

    @autocast()
    def forward(self, ids, mask, token_type_ids):
        t, out = self.bert(
            ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False
        )
        bo = self.bert_drop(out)
        out = self.out(bo)
        return out

In [13]:
class SpamDataset(Dataset):
    def __init__(self, texts, target):
        self.texts = texts
        self.target = target
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item) -> dict[str, torch.Tensor]:
        text = str(self.texts[item])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding=PaddingStrategy("max_length"),
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "target": torch.tensor(self.target[item], dtype=torch.float),
            "mask": torch.tensor(mask, dtype=torch.long),
        }

In [14]:
def loss_fn(outputs, targets):
    return nn.BCEWithLogitsLoss()(outputs, targets.view[-1, 1])


def train_fn(
    data_loader: DataLoader,
    model: nn.Module,
    optimizer,
    device: torch.device,
    scheduler=transformers.get_scheduler,
):
    model.train()
    for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        targets = d["target"]

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)

        # ids, token_type_ids, mask = lazy(ids, token_type_ids, mask, batch=0)
        # targets = lazy(targets)

        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        # outputs = lazy(outputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()


def eval_fn(
    data_loader: torch.utils.data.DataLoader,
    model: nn.Module,
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
) -> tuple[list[float], list[float]]:
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for b, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            targets = d["target"]

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            # ids, token_type_ids, mask = lazy(ids, token_type_ids, mask, batch=0)
            # targets = lazy(targets)

            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [15]:
# df = pd.read_csv(config.TRAINING_FILE)
# df.targets.unique()

In [16]:
def run():
    df = pd.read_csv(config.TRAINING_FILE)
    df_train, df_valid = train_test_split(
        df,
        test_size=0.2,
        random_state=42,
        stratify=df.targets.values,
    )
    df_train.reset_index(drop=True, inplace=True)
    df_valid.reset_index(drop=True, inplace=True)

    train_ds = SpamDataset(texts=df_train.texts.values, target=df_train.targets.values)
    train_dl = DataLoader(
        dataset=train_ds,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4,
        pin_memory=True,
    )
    valid_ds = SpamDataset(
        texts=df_valid.texts.values,
        target=df_valid.targets.values,
    )
    valid_dl = DataLoader(
        dataset=valid_ds,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1,
        pin_memory=True,
    )
    device = torch.device("cuda")
    model = BERTBaseUncased().to(device)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )
    best_accuracy = 0
    for epochs in range(config.EPOCHS):
        train_fn(
            data_loader=train_dl,
            model=model,
            optimizer=optimizer,
            device=device,
            scheduler=scheduler,
        )
        outputs, targets = eval_fn(valid_dl, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), f"{epochs}-".join(config.MODEL_PATH))
            best_accuracy = accuracy


run()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/418 [00:00<?, ?it/s]


TypeError: forward() got an unexpected keyword argument 'token_type_ids'