In [None]:
# Work from the project root so the relative data/, results/ and models/ paths used below resolve.
%cd ~/RATER-C

In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import torch
import numpy as np
import pandas as pd
import janitor

from datasets import DatasetDict, Dataset

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, TextClassificationPipeline

from scipy.special import softmax
from sklearn.metrics import roc_auc_score

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device('cuda') if use_gpu else torch.device('cpu')
device

In [None]:
import random
from transformers import set_seed
seed = 123

# Apply the same seed through every seeding entry point used in this notebook:
# the transformers helper, torch, the stdlib `random` module and NumPy.
for apply_seed in (set_seed, torch.manual_seed, random.seed, np.random.seed):
    apply_seed(seed)

In [None]:
# Identifier of the base checkpoint; reused below to load the tokenizer and the
# classifier and to name the saved model and result files.
pretrained_model_name_or_path = 'microsoft/deberta-v3-large'

In [None]:
# Load the pre-split train / val / test sheets in a single pass over the workbook:
# passing a list to `sheet_name` makes pandas return a dict of DataFrames keyed by
# sheet name, so the file is opened and parsed once instead of three times.
# Only the three modelling columns are kept; clean_names() (registered by the
# `janitor` import) normalises the headers that the later cells rely on
# (definition, itemtext, target).
split_workbook = 'data/processed/train_val_test.xlsx'
split_columns = ['definition', 'ItemText', 'Target']

split_frames = pd.read_excel(split_workbook, sheet_name = ['train', 'val', 'test'])

train = split_frames['train'][split_columns].clean_names()
val =   split_frames['val'][split_columns].clean_names()
test =  split_frames['test'][split_columns].clean_names()

In [None]:
# For quick smoke tests only: uncomment the lines below to subsample each split to 1,000 rows.
#train = train.sample(n = 1000, random_state = seed)
#val =     val.sample(n = 1000, random_state = seed)
#test =   test.sample(n = 1000, random_state = seed)

In [None]:
# Peek at the first three training rows to check the cleaned column names.
train.head(3)

In [None]:
# Number of test rows per target value (class balance of the held-out split).
test.groupby('target').size()

In [None]:
def to_pair_dataset (frame):
    """Wrap one split as a Dataset with columns text_a (definition),
    text_b (item text) and labels (target)."""
    columns = {'text_a': frame.definition,
               'text_b': frame.itemtext,
               'labels': frame.target}
    return Dataset.from_dict(columns)

# Same conversion for each of the three splits.
ds_train, ds_val, ds_test = map(to_pair_dataset, (train, val, test))

In [None]:
# Bundle the three splits under the names used by the rest of the notebook.
split_datasets = dict(train = ds_train, val = ds_val, test = ds_test)
dataset_dict = DatasetDict(split_datasets)

dataset_dict

In [None]:
# Inspect the first training example (text_a, text_b, labels) before tokenization.
dataset_dict['train'][0]

In [None]:
# Tokenizer matching the base checkpoint; used by encode(), the padding collator
# and the inference pipeline further down.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)

In [None]:
def encode (examples):
    """Tokenize a batch of (text_a, text_b) pairs for sequence-pair classification.

    examples: a batch mapping with 'text_a', 'text_b' and 'labels' columns.
    Returns the tokenizer output (token type ids requested explicitly) with a
    'labels' entry holding the labels cast to int.
    """
    batch = tokenizer(
        examples['text_a'],
        examples['text_b'],
        return_token_type_ids = True
    )
    batch['labels'] = list(map(int, examples['labels']))
    return batch

In [None]:
# Tokenize every split in batches, using one worker process per CPU reported by
# os.cpu_count(). The raw text columns are dropped afterwards so only the
# tokenizer outputs and the labels remain.
tokenize_options = dict(
    batched = True,
    num_proc = os.cpu_count(),
    remove_columns = ['text_a', 'text_b']
)
dataset_dict_tokenized = dataset_dict.map(encode, **tokenize_options)
dataset_dict_tokenized

In [None]:
# encode() does not pad, so padding is left to the collator, which pads each batch
# as it is assembled.
data_collator = DataCollatorWithPadding(tokenizer, padding = True)

In [None]:
def model_init ():
    """Load the base checkpoint with a two-label sequence classification head.

    Passed to Trainer as `model_init` so the Trainer builds the model itself.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path,
        num_labels = 2
    )
    return classifier

In [None]:
def wandb_hp_space (trial):
    """Return the Weights & Biases sweep configuration for the hyperparameter search.

    trial: accepted because the search backend passes it; not used here.
    Returns a grid sweep over learning rate, train batch size and number of
    epochs that maximizes the metric named 'objective'.
    """
    search_grid = {
        'learning_rate': {'values': [1e-5, 3e-5, 5e-5]},
        'per_device_train_batch_size': {'values': [8]},
        'num_train_epochs': {'values': [1, 2, 3, 5, 10]}
    }
    sweep_config = dict(
        project = 'CV1',
        method = 'grid',
        metric = {'name': 'objective', 'goal': 'maximize'},
        parameters = search_grid
    )
    return sweep_config

In [None]:
def compute_metrics (eval_pred):
    """Compute ROC AUC from the model's raw per-class scores.

    eval_pred: a (predictions, labels) pair; predictions are indexed as a
    2-D array with one column per class.
    Returns {'auc': ...}, scored on the softmax probability of class 1.
    """
    logits, references = eval_pred
    class_probs = softmax(logits, axis = -1)
    positive_probs = class_probs[:, 1]
    return {'auc': roc_auc_score(references, positive_probs)}

def tune_for_auc (metrics):
    """Objective for the hyperparameter search: the 'eval_auc' entry of `metrics`.

    Raises KeyError when the metrics dict has no 'eval_auc' key.
    """
    objective = metrics['eval_auc']
    return objective

In [None]:
# Base configuration for the hyperparameter search: evaluate on the validation
# split after every epoch and keep no checkpoints. Learning rate, train batch size
# and epoch count are supplied per trial by the search space.
args = TrainingArguments(
    'content_validity',
    per_device_eval_batch_size = 32,
    eval_strategy = 'epoch',
    save_strategy = 'no',
    # The Trainer re-seeds the RNGs from `args.seed` (default 42) when it is
    # created and again before model_init() is called for a run, so the notebook
    # seed must be passed here to have any effect on training.
    seed = seed
)

# model_init (rather than a model instance) lets the search start every trial
# from a freshly loaded checkpoint.
trainer = Trainer(
    args = args,
    data_collator = data_collator,
    model_init = model_init,
    train_dataset = dataset_dict_tokenized['train'],
    eval_dataset = dataset_dict_tokenized['val'],
    compute_metrics = compute_metrics
)

In [None]:
# Run the W&B grid sweep defined by wandb_hp_space and keep the trial with the
# highest objective, i.e. the value returned by tune_for_auc.
search_options = {
    'backend': 'wandb',
    'hp_space': wandb_hp_space,
    'compute_objective': tune_for_auc,
    'direction': 'maximize'
}
best_trial = trainer.hyperparameter_search(**search_options)

In [None]:
# Show the trial returned by the hyperparameter search.
best_trial

In [None]:
# Hyperparameter values of that trial; these drive the final refit below.
best_trial.hyperparameters

In [None]:
# Record the selected hyperparameters next to the model name as a one-row table.
best_settings_df = pd.DataFrame({
    'model': pretrained_model_name_or_path,
    'learning_rate': best_trial.hyperparameters['learning_rate'],
    'per_device_train_batch_size': best_trial.hyperparameters['per_device_train_batch_size'],
    'num_train_epochs': best_trial.hyperparameters['num_train_epochs']
}, index = [0])

# to_csv does not create missing parent directories; make sure results/ exists so
# a finished search is not followed by a failed write.
os.makedirs('results', exist_ok = True)

# '/' in the model id is replaced so the id can be used as a flat file name.
best_settings_df.to_csv('results/' + pretrained_model_name_or_path.replace('/', '_') + '_hyperparams.csv', index = False)

In [None]:
# Set to True to ignore the search result and refit with the hand-picked settings
# below; leave False to use the trial selected by the hyperparameter search.
use_manual_settings = False

if use_manual_settings:
    from types import SimpleNamespace

    # The cells below only read `best_trial.hyperparameters`, so a plain namespace
    # carrying that one attribute is a sufficient stand-in for a search result.
    best_trial = SimpleNamespace(hyperparameters = {
        'learning_rate': 1e-05,
        'per_device_train_batch_size': 8,
        'num_train_epochs': 1
    })

best_trial.hyperparameters

In [None]:
# Configuration for the final refit with the selected hyperparameters. A checkpoint
# is saved after every epoch so the best one can be restored when training ends.
final_args = TrainingArguments(
    'content_validity',
    learning_rate = best_trial.hyperparameters['learning_rate'],
    per_device_train_batch_size = best_trial.hyperparameters['per_device_train_batch_size'],
    num_train_epochs = best_trial.hyperparameters['num_train_epochs'],
    per_device_eval_batch_size = 32,
    eval_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True,
    # Without these two settings the "best" checkpoint is chosen by validation
    # loss, whereas the hyperparameters were tuned on validation AUC. Select the
    # checkpoint on the 'auc' value reported by compute_metrics instead.
    metric_for_best_model = 'auc',
    greater_is_better = True,
    # The Trainer re-seeds the RNGs from `args.seed` (default 42), so pass the
    # notebook seed explicitly to make it govern the refit.
    seed = seed,
    # No experiment tracking for the refit.
    report_to = 'none'
)

final_trainer = Trainer(
    args = final_args,
    data_collator = data_collator,
    model_init = model_init,
    train_dataset = dataset_dict_tokenized['train'],
    eval_dataset = dataset_dict_tokenized['val'],
    compute_metrics = compute_metrics
)

In [None]:
# Train the final model, then store it under models/<model id with '/' replaced by '_'>.
model_tag = pretrained_model_name_or_path.replace('/', '_')
model_checkpoint = 'models/' + model_tag

final_trainer.train()
final_trainer.save_model(model_checkpoint)

In [None]:
# One {'text', 'text_pair'} record per test row, in row order, as input for the
# classification pipeline. The two columns are paired positionally with zip, so
# the result does not depend on the DataFrame's index labels and avoids the
# per-row overhead of iterrows().
data_test_dict = [
    {'text': definition, 'text_pair': item_text}
    for definition, item_text in zip(test['definition'], test['itemtext'])
]

data_test_dict[0]

In [None]:
# Reload the saved classifier from disk and move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels = 2)
model = model.to(device)

# top_k = None asks the pipeline for a score per label rather than only the top one.
pipe = TextClassificationPipeline(model = model, tokenizer = tokenizer, top_k = None, device = device)

# temporary workaround for XLNet batch size issue
is_xlnet = 'XLNetModel' in str(model.base_model)
batch_size = 1 if is_xlnet else 128

raw_probs = pipe(data_test_dict, batch_size = batch_size)

In [None]:
# Each pipeline result is a list of {'label', 'score'} entries; pull out the score
# attached to 'LABEL_1' for every test row.
positive_scores = []
for label_scores in raw_probs:
    labels = [entry['label'] for entry in label_scores]
    positive_scores.append(label_scores[labels.index('LABEL_1')]['score'])
probs = np.array(positive_scores)

# Hard predictions at a 0.5 probability threshold.
preds = np.where(probs >= 0.5, 1, 0)

In [None]:
# Test rows together with the predicted probability of LABEL_1 for each.
out = pd.DataFrame({
    'definition': test['definition'],
    'itemtext': test['itemtext'],
    'target': test['target'],
    'prob': probs
})

# to_csv does not create missing parent directories; make sure results/ exists so
# the predictions are not lost to a failed write after training and inference.
os.makedirs('results', exist_ok = True)

out.to_csv('results/' + pretrained_model_name_or_path.replace('/', '_') + '_preds.csv', index = False)