<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#imports-and-functions" data-toc-modified-id="imports-and-functions-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>imports and functions</a></span></li><li><span><a href="#experiments" data-toc-modified-id="experiments-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>experiments</a></span><ul class="toc-item"><li><span><a href="#0" data-toc-modified-id="0-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>0</a></span></li><li><span><a href="#1" data-toc-modified-id="1-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>1</a></span></li><li><span><a href="#2" data-toc-modified-id="2-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>2</a></span></li></ul></li></ul></div>

# imports and functions

In [None]:
import collections
import pandas as pd
import os
import sys
import json
import matplotlib.pyplot as plt
from tqdm.auto import tqdm, trange

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch import optim
from torch.optim import lr_scheduler
# import torchmetrics

import datasets
from datasets import load_metric
from transformers import AutoConfig, AutoTokenizer, BertModel, RobertaModel
from transformers import BertForSequenceClassification
from transformers import TrainingArguments, Trainer

from sklearn.metrics import mean_squared_error, accuracy_score, precision_recall_fscore_support


In [None]:
# Explicitly set the HuggingFace `tokenizers` parallelism flag before any tokenizer is used.
# Background on the fork-related warning this variable is tied to:
# https://github.com/huggingface/transformers/issues/5486
# Alternative setting kept for reference: use "false" here if that warning shows up.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [None]:
# Use the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# Task registry: the values are passed as `num_labels` when building the model in train_baseline.
# NOTE(review): this cell reads from '../data/xslue' while train_baseline uses
# '../../data/xslue' — confirm which relative root matches the notebook's location.
with open('../data/xslue/tasks.json') as tasks_file:
    tasks = json.loads(tasks_file.read())
tasks

In [None]:
class MyDataset(Dataset):
    """Map-style dataset over a TSV file that has ``text`` and ``label`` columns.

    The whole file is read and tokenized eagerly in ``__init__``; ``__getitem__``
    only converts the pre-computed encodings of one row to tensors.
    (An iterable-style dataset might be an alternative; not evaluated here.)

    Args:
        tsv_file: Path to a tab-separated file with ``text`` and ``label`` columns.
            Rows containing a missing value in any column are dropped.
        tokenizer: Either ``None`` (load ``"bert-base-uncased"``, the previous
            hard-coded behaviour), a model name/path string passed to
            ``AutoTokenizer.from_pretrained``, or an already constructed tokenizer
            callable. Passing an instance lets several splits share one tokenizer.
        max_length: Truncation length forwarded to the tokenizer (default 128).
    """

    def __init__(self, tsv_file, tokenizer=None, max_length=128):
        if tokenizer is None:
            tokenizer = "bert-base-uncased"
        if isinstance(tokenizer, str):
            tokenizer = AutoTokenizer.from_pretrained(tokenizer)
        self.tokenizer = tokenizer

        self.df = pd.read_csv(tsv_file, sep='\t')
        self.df = self.df.dropna()
        self.df = self.df.reset_index(drop=True)

        # padding=True pads every row to the longest sequence in this file
        # (capped at max_length by truncation).
        self.encodings = self.tokenizer(
            self.df['text'].tolist(),
            truncation=True,
            padding=True,
            max_length=max_length,
        )

        # Regression targets are read as float64; store them as float32.
        if self.df['label'].dtype == 'float64':
            self.df['label'] = self.df['label'].astype('float32')
        self.labels = self.df['label'].tolist()

    def __len__(self):
        """Return the number of rows kept after dropping incomplete ones."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors.

        The dict holds one tensor per tokenizer output field plus ``"labels"``,
        a one-element tensor wrapping the row's label.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item


In [None]:
# Correlation metrics from the `datasets` metric hub, bound to module-level names.
pearsonr, spearmanr = [load_metric(metric_id) for metric_id in ("pearsonr", "spearmanr")]

In [None]:
# Per-device batch size; train_baseline uses it for both training and evaluation.
# GPU memory notes recorded by the author:
#   batch size 32 -> 6617-6680 MB
#   batch size 48 -> 7894 (presumably MB)
#   batch size 64 -> out of memory
batch_size = 32

In [None]:
def train_baseline(task, freeze_bert=False):
    """Fine-tune a single-task ``bert-base-uncased`` baseline with the HF ``Trainer``.

    Args:
        task: Key into the module-level ``tasks`` mapping. ``tasks[task]`` is used
            as ``num_labels`` and ``task`` names the TSV files read from
            ``processed/train`` and ``processed/dev``.
        freeze_bert: When True, ``requires_grad`` is switched off for every
            parameter under ``model.bert``; the remaining parameters stay trainable.

    Returns:
        None. Checkpoints and logs are written under
        ``../../result/baselines/<run>``, where ``<run>`` is ``task`` or
        ``task + '_freezed'`` when ``freeze_bert`` is set.
    """
    # Release cached GPU blocks left over from a previous run in the same kernel.
    torch.cuda.empty_cache()
    num_labels = tasks[task]

    # NOTE(review): the cell that loads tasks.json reads from '../data/xslue';
    # confirm which relative root is correct for this notebook's location.
    data_folder = '../../data/xslue'
    train_dataset = MyDataset(f'{data_folder}/processed/train/{task}.tsv')
    valid_dataset = MyDataset(f'{data_folder}/processed/dev/{task}.tsv')
    # The test split is intentionally not loaded: Trainer takes no test dataset
    # and nothing in this function evaluates on it.

    singletaskbert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
    if freeze_bert:
        for param in singletaskbert.bert.parameters():
            param.requires_grad = False

    result_folder = '../../result'
    # Built outside the f-strings: reusing the enclosing quote character inside
    # an f-string expression is a SyntaxError before Python 3.12.
    run_name = task + '_freezed' if freeze_bert else task
    output_dir = f'{result_folder}/baselines/{run_name}'

    training_args = TrainingArguments(
        output_dir=output_dir,                   # checkpoints
        num_train_epochs=5,                      # total number of training epochs
        per_device_train_batch_size=batch_size,  # batch size per device during training
        per_device_eval_batch_size=batch_size,   # batch size per device during evaluation
        warmup_steps=500,                        # warmup steps for the learning-rate scheduler
        weight_decay=0.01,                       # strength of weight decay
        logging_dir=f'{output_dir}/logs',        # directory for storing logs
        evaluation_strategy="epoch",             # evaluate at the end of every epoch
        save_total_limit=1,
        save_strategy='epoch',                   # must match the evaluation strategy for best-model loading
        load_best_model_at_end=True,             # no metric_for_best_model is set, so the Trainer default applies
    )

    if num_labels == 1:
        # Regression task: report root-mean-squared error.
        def compute_metrics(pred):
            predictions, labels = pred
            # sqrt(MSE) instead of the `squared=False` flag, which newer
            # scikit-learn releases deprecated and then removed.
            rmse = float(np.sqrt(mean_squared_error(labels, predictions)))
            return {"rmse": rmse}
    else:
        # Classification task: positive-class scores for two labels,
        # macro-averaged scores otherwise.
        average = 'binary' if num_labels == 2 else 'macro'

        def compute_metrics(pred):
            labels = pred.label_ids
            preds = pred.predictions.argmax(-1)
            precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=average)
            acc = accuracy_score(labels, preds)
            return {
                'accuracy': acc,
                'f1': f1,
                'precision': precision,
                'recall': recall
            }

    trainer = Trainer(
        model=singletaskbert,             # the instantiated Transformers model to be trained
        args=training_args,               # training arguments, defined above
        train_dataset=train_dataset,      # training dataset
        eval_dataset=valid_dataset,       # evaluation dataset (dev split)
        compute_metrics=compute_metrics,  # the callback that computes metrics of interest
    )
    trainer.train()

# experiments

## 0

In [None]:
# Section 0, all weights trainable: the task is the key at index 7 of `tasks`.
task = list(tasks)[7]
train_baseline(task)

In [None]:
# Section 0, BERT encoder frozen: same task (index 7 of `tasks`).
task = list(tasks)[7]
train_baseline(task, freeze_bert=True)

## 1

In [None]:
# Section 1, all weights trainable: the task is the key at index 7 of `tasks`.
task = list(tasks)[7]
train_baseline(task)

In [None]:
# Section 1, BERT encoder frozen: same task (index 7 of `tasks`).
task = list(tasks)[7]
train_baseline(task, freeze_bert=True)

## 2

In [None]:
# Section 2, all weights trainable: the task is the key at index 7 of `tasks`.
task = list(tasks)[7]
train_baseline(task)

In [None]:
# Section 2, BERT encoder frozen: same task (index 7 of `tasks`).
task = list(tasks)[7]
train_baseline(task, freeze_bert=True)