## Module/Library installation + imports


In [None]:
pip install transformers

In [None]:
pip install wandb

In [None]:
pip install "ray[tune]"

In [None]:
# Authenticate with Weights & Biases (W&B) so that the training runs below,
# which report to 'wandb', can be saved and displayed there.
import wandb
wandb.login()

In [1]:
import pandas as pd
import torch
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
)
from sklearn.model_selection import StratifiedShuffleSplit
from transformers import (AutoModelForSequenceClassification, AutoTokenizer, AutoConfig,
                          Trainer, TrainingArguments)

## Load dataset

In [6]:
# Read the preprocessed train and test CSV files directly from GitHub.
# NOTE(review): both URLs carry a `?token=...` query parameter. This looks like
# a raw-content access token committed in plain text -- confirm whether it is
# still required, and if so supply it from the environment instead of
# hardcoding it in the notebook.
train_df = pd.read_csv("https://raw.githubusercontent.com/elip06/covid19-fact-checking/main/dataset_preparation/preprocessed_datasets/train_dataset_final.csv?token=AL3S7USONU7SWCL464X7SOLA34FCO")
test_df = pd.read_csv("https://raw.githubusercontent.com/elip06/covid19-fact-checking/main/dataset_preparation/preprocessed_datasets/test_dataset_final.csv?token=AL3S7USJS5CMY5SRWUR4ED3A34FEW")

In [7]:
# Separate the input sentences (`text` column) from the targets (`labels`
# column) for the training frame and the test frame.
X_train, Y_train = train_df.text, train_df.labels
X_test, Y_test = test_df.text, test_df.labels

## Helper functions/classes

Create custom dataset class

In [None]:
class CovidDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from field name to a per-example indexable sequence
            (it is iterated with ``.items()`` and each value is indexed by
            example position).
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoded field of example `idx` to a tensor, then
        # attach the target under the 'labels' key.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

Compute evaluation metrics

In [None]:
def compute_metrics(pred):
    """Compute accuracy and binary confusion-matrix counts for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        dict with the keys ``accuracy``, ``tn``, ``tp``, ``fp`` and ``fn``.
        All values are plain Python numbers so the dict can be logged and
        serialized without special handling.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Pinning labels=[0, 1] guarantees a 2x2 matrix even when one class is
    # missing from this evaluation set, so the unpacking below cannot fail.
    tn, fp, fn, tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
    # The counts come back as numpy integer scalars, which the json module
    # refuses to serialize; convert them to built-in ints.
    return {
        'accuracy': float(accuracy_score(labels, preds)),
        'tn': int(tn),
        'tp': int(tp),
        'fp': int(fp),
        'fn': int(fn)
    }

Tokenize sentences and initialize custom dataset

In [None]:
def tokenize(X, model_name):
    """Tokenize every sentence in ``X`` with the tokenizer of ``model_name``.

    Args:
        X: Object exposing ``.values.tolist()`` that yields the sentences
            (the notebook passes a pandas Series).
        model_name: Checkpoint identifier used to load the tokenizer.

    Returns:
        The tokenizer output for all sentences, truncated (maximum model
        length set to 512) and padded.
    """
    sentence_tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
    sentences = X.values.tolist()
    return sentence_tokenizer(sentences, truncation=True, padding=True)

In [None]:
def createDataset(X, Y, model_name):
    """Build a ``CovidDataset`` from raw sentences and their labels.

    Args:
        X: Sentences, tokenized via ``tokenize`` for ``model_name``.
        Y: Labels; converted with ``.values.tolist()``.
        model_name: Checkpoint identifier that selects the tokenizer.

    Returns:
        A ``CovidDataset`` wrapping the encodings and the label list.
    """
    encoded_sentences = tokenize(X, model_name)
    label_list = Y.values.tolist()
    return CovidDataset(encoded_sentences, label_list)

Initialize model with given name

In [None]:
def model_init(model_name):
    """Load a pretrained sequence-classification model for ``model_name``.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        The model instance, loaded with ``return_dict=True``.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        return_dict=True,
    )
    return classifier


Basic training procedure

In [None]:
def train(run_name, model_name, train_dataset, val_dataset):
    """Fine-tune ``model_name`` on ``train_dataset`` and evaluate on ``val_dataset``.

    Uses a fixed configuration (2 epochs, train batch size 8, eval batch size
    64, evaluation strategy 'epoch') and reports to Weights & Biases.

    Args:
        run_name: Name forwarded to ``TrainingArguments(run_name=...)``.
        model_name: Checkpoint identifier handed to ``model_init``.
        train_dataset: Dataset the model is trained on.
        val_dataset: Dataset the model is evaluated on.
    """
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=2,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=64,
        logging_dir='./logs',
        logging_steps=100,
        evaluation_strategy='epoch',
        save_steps=2000,
        report_to = ['wandb'],
        disable_tqdm=True,
        run_name = run_name
    )

    trainer = Trainer(
        # `model_init` must be a callable that builds a fresh model; passing
        # the result of model_init(model_name) would hand Trainer a model
        # instance where it expects a factory.
        model_init=lambda: model_init(model_name),
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )
    trainer.train()

In [None]:
def basic_train(run_name, model_name):
    """Run one training/evaluation pass for ``model_name`` as a single W&B run.

    Trains on the notebook-level ``X_train``/``Y_train`` and evaluates on
    ``X_test``/``Y_test``, logging to the "model-comparison" project.

    Args:
        run_name: Name of the W&B run; also forwarded to ``train``.
        model_name: Checkpoint identifier for the tokenizer and the model.
    """
    # Using the run as a context manager finishes it even when tokenization or
    # training raises, so a failed experiment does not leave a dangling run
    # that the next wandb.init() would have to deal with.
    with wandb.init(project="model-comparison", reinit=True, name=run_name):
        train_dataset = createDataset(X_train, Y_train, model_name)
        val_dataset = createDataset(X_test, Y_test, model_name)
        train(run_name, model_name, train_dataset, val_dataset)

Cross-validation procedure (10 stratified shuffle splits of the training set, 30% held out for validation in each split)

In [None]:
def cross_validate(run_name, model_name):
    """Train and evaluate ``model_name`` on 10 stratified shuffle splits.

    Only the notebook-level training data (``X_train``/``Y_train``) is used:
    each split holds out 30% of it for validation. Every split is logged as
    its own run in the "cross-validation" W&B project, named
    ``<run_name>-cv-fold<i>`` with ``i`` starting at 1.

    Args:
        run_name: Prefix for the per-split W&B run names; also forwarded to
            ``train``.
        model_name: Checkpoint identifier for the tokenizer and the model.
    """
    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=3)
    splits = sss.split(X_train, Y_train)
    for fold, (train_index, test_index) in enumerate(splits, start=1):
        fold_name = run_name + "-cv-fold" + str(fold)
        # The context manager finishes the run even if this split fails.
        with wandb.init(project="cross-validation", reinit=True, name=fold_name):
            # split() yields positional indices, so select with .iloc; plain
            # [] indexing is label-based and is only equivalent while the
            # Series still carries a default 0..n-1 index.
            X_train_cv, X_test_cv = X_train.iloc[train_index], X_train.iloc[test_index]
            Y_train_cv, Y_test_cv = Y_train.iloc[train_index], Y_train.iloc[test_index]
            train_dataset = createDataset(X_train_cv, Y_train_cv, model_name)
            val_dataset = createDataset(X_test_cv, Y_test_cv, model_name)
            train(run_name, model_name, train_dataset, val_dataset)

Hyperparameter Tuning

In [None]:
def my_hp_space_ray(trial):
    """Describe the Ray Tune search space used for hyperparameter tuning.

    Args:
        trial: Accepted for the ``hp_space`` callback signature; not read here.

    Returns:
        dict mapping hyperparameter names to Ray Tune sampling
        specifications.
    """
    # Imported lazily so Ray is only required when tuning is actually run.
    from ray import tune

    search_space = {}
    search_space["learning_rate"] = tune.loguniform(1e-5, 1e-3)
    search_space["num_train_epochs"] = tune.choice(range(1, 5))
    search_space["seed"] = tune.choice(range(1, 42))
    search_space["per_device_train_batch_size"] = tune.choice([8, 16])
    search_space["warmup_steps"] = tune.choice(range(0, 1000))
    search_space["weight_decay"] = tune.loguniform(1e-6, 0.1)
    return search_space

In [None]:
def find_best_hyperparameters(project_name, model_name):
    """Search for good training hyperparameters for ``model_name`` with Ray Tune.

    Args:
        project_name: Passed as the first positional argument of
            ``TrainingArguments`` (the output directory).
        model_name: Checkpoint identifier for the tokenizer and the model.

    Returns:
        Whatever ``Trainer.hyperparameter_search`` returns (the best run
        found), so the caller can inspect the selected hyperparameters.
    """
    train_dataset = createDataset(X_train, Y_train, model_name)
    # NOTE(review): trials are scored on the test split, so the selected
    # hyperparameters are tuned against the test data -- confirm this is intended.
    val_dataset = createDataset(X_test, Y_test, model_name)

    # Evaluate during training and a bit more often
    # than the default to be able to prune bad trials early.
    training_args = TrainingArguments(
        project_name,
        evaluation_strategy='epoch',
        save_steps=2000,
        report_to = ['wandb'],
        disable_tqdm=True,
        run_name = 'hyperparameter_tuning',
    )

    trainer = Trainer(
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        # Hyperparameter search re-creates the model for every trial, so this
        # must be a factory callable rather than an already-built model.
        model_init=lambda: model_init(model_name),
        compute_metrics=compute_metrics,
    )
    return trainer.hyperparameter_search(backend="ray", direction="maximize", hp_space=my_hp_space_ray)

## BERT

In [None]:
# Single training run of bert-base-uncased: train on the training set, evaluate on the test set.
basic_train('BERT', 'bert-base-uncased')

In [None]:
# Repeated stratified splits of the training set for bert-base-uncased (one W&B run per split).
cross_validate('bert', 'bert-base-uncased')

## DistilBERT

In [None]:
# Single training run of distilbert-base-uncased: train on the training set, evaluate on the test set.
basic_train('DistilBERT', 'distilbert-base-uncased')

In [None]:
# Repeated stratified splits of the training set for distilbert-base-uncased (one W&B run per split).
cross_validate('distilbert', 'distilbert-base-uncased')

In [None]:
# Ray Tune hyperparameter search for distilbert-base-uncased over the space defined in my_hp_space_ray.
find_best_hyperparameters('distilbert-hyperparameter-tuning', 'distilbert-base-uncased')

## RoBERTa

In [None]:
# Single training run of roberta-base: train on the training set, evaluate on the test set.
basic_train('RoBERTa', 'roberta-base')

## SciBERT

In [None]:
# Single training run of allenai/scibert_scivocab_uncased: train on the training set, evaluate on the test set.
basic_train('SciBERT', 'allenai/scibert_scivocab_uncased')