In [None]:
import numpy as np
import seaborn as sns
from tqdm import tqdm
import numpy as np
import pandas as pd
import os
import json
import time
import wandb
import torch
from torch import cuda
from torch.utils.data import DataLoader

from utils import (
    get_AGNews_datasets,
    train, test, accuracy,
    dynamic_masking,
    RobertaMLM_with_classifier
)
%env WANDB_PROJECT=TAPT_roberta
%env WANDB_LOG_MODEL='end'

# Import relevant models, tokenizers, and related libs
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Statics
DEVICE = 'cuda' if cuda.is_available() else 'cpu'
SEED = 69
SEEDED_GEN = torch.Generator().manual_seed(SEED)

# Confirm device type, should say CUDA if you have a GPU
print(DEVICE)

### See utils.py for all helper classes and model definitions

### Data Exploration

How many tokens does each description contain after tokenization?

In [None]:
# Import before first use: the original cell called join() several lines before
# importing it, which raises NameError on a fresh kernel.
from os.path import join

# Tokenizer used to turn each description into token ids
model_type = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_type)

# Show summary statistics of the number of token ids per 'Description',
# first for the train split and then for the test split.
# After the loop, `path` and `df` refer to the test split (the last one processed).
for split_file in ("train.csv", "test.csv"):
    path = join("data", split_file)
    df = pd.read_csv(path)
    df['encoded'] = df['Description'].apply(lambda x: tokenizer(x)['input_ids'])
    # 'maxlength' holds the length of each row's token-id list
    df['maxlength'] = df['encoded'].apply(len)
    display(df['maxlength'].describe())

### Model Training

#### 1) Task Adaptive Pre-Training (starts from a checkpoint named like those saved by step 2, so run step 2 first)

In [None]:
from transformers import DataCollatorForLanguageModeling, LineByLineTextDataset
from transformers import Trainer, TrainingArguments, pipeline
import datasets

# Settings for this masked-language-model pre-training run
hyperparams_TAPT = dict(
    EPOCHS=20,
    MASK_PROB=0.1,
    TRAINING_BATCH_SIZE=32,
    MAX_LEN=77,
)

# Model weights are restored from a previously saved checkpoint folder,
# while the tokenizer is the stock one for the base architecture.
model_type = "distilroberta-base"
checkpoint = "huggingface/TAPT_businessTAPT_1682554710"
robertaMLM_model = AutoModelForMaskedLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_type)

# Training text is read from a local file (alternative, unused here:
# datasets.load_dataset("ag_news")).
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path='data/traincommaed.csv',
    block_size=hyperparams_TAPT['MAX_LEN'],
)

# The collator applies masking with the configured probability
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=hyperparams_TAPT['MASK_PROB'],
)

# Trainer configuration; metrics are reported to Weights & Biases
training_args = TrainingArguments(
    output_dir='checkpoints/TAPT_Roberta_DAPT_TAPT',
    overwrite_output_dir=True,
    num_train_epochs=hyperparams_TAPT['EPOCHS'],
    per_device_train_batch_size=hyperparams_TAPT['TRAINING_BATCH_SIZE'],
    save_steps=10_000,
    save_total_limit=10,
    report_to="wandb",
)

trainer = Trainer(
    model=robertaMLM_model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Run pre-training, then close the W&B run
trainer.train()
wandb.finish()

# Save the resulting model into a folder stamped with the current Unix time
curr_time = int(time.time())
model_folder = f'huggingface/DAPT_TAPT_{curr_time}'
trainer.save_model(model_folder)

#### 2) Domain Adaptive Pre-Training

In [None]:
from transformers import DataCollatorForLanguageModeling, LineByLineTextDataset
from transformers import Trainer, TrainingArguments, pipeline
import datasets

# Settings for this masked-language-model pre-training run.
# (The dict keeps the name `hyperparams_TAPT` even though this is the
# domain-adaptive stage.)
hyperparams_TAPT = dict(
    EPOCHS=5,
    MASK_PROB=0.1,
    TRAINING_BATCH_SIZE=32,
    MAX_LEN=77,
)

# Start from the stock pretrained weights and matching tokenizer
model_type = "distilroberta-base"
robertaMLM_model = AutoModelForMaskedLM.from_pretrained(model_type)
tokenizer = AutoTokenizer.from_pretrained(model_type)

# Training text is read from a local file (alternative, unused here:
# datasets.load_dataset("ag_news")).
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path='data/businessnews.csv',
    block_size=hyperparams_TAPT['MAX_LEN'],
)

# The collator applies masking with the configured probability
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=hyperparams_TAPT['MASK_PROB'],
)

# Trainer configuration; metrics are reported to Weights & Biases
training_args = TrainingArguments(
    output_dir='checkpoints/BusinessDAPT_Roberta',
    overwrite_output_dir=True,
    num_train_epochs=hyperparams_TAPT['EPOCHS'],
    per_device_train_batch_size=hyperparams_TAPT['TRAINING_BATCH_SIZE'],
    save_steps=10_000,
    save_total_limit=5,
    report_to="wandb",
)

trainer = Trainer(
    model=robertaMLM_model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Run pre-training, then close the W&B run
trainer.train()
wandb.finish()

# Save the resulting model into a folder stamped with the current Unix time
curr_time = int(time.time())
model_folder = f'huggingface/TAPT_businessTAPT_{curr_time}'
trainer.save_model(model_folder)

#### 3) Fine Tuning for Text Classification

In [None]:
# Shared configuration for the fine-tuning cells below
tokenizer_type = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_type)

# Hyperparameters; the whole dict is also passed to wandb.init(config=...) later on
hyperparams = dict(
    TRAIN_PCT=0.9,
    TRAIN_BATCH_SIZE=200,
    VALID_BATCH_SIZE=200,
    TEST_BATCH_SIZE=200,
    MAX_LEN=77,
    EPOCHS=25,
    LR=0.005,
    L2_REG=0.0,
    ADAM_BETAS=(0.87, 0.98),
    ADAM_EPS=1e-6,
    FC_HIDDEN=768,
    FC_DROPOUT=0.05,
    SCH_ENDFACTOR=0.1,
    RUN_SUFFIX="_9",
)

project_name = "TAPT_roberta"

### --- LOAD DATA --------------
# Build the three splits via the project helper (see utils.py); the seeded
# generator is handed over together with the train fraction.
train_dataset, valid_dataset, test_dataset = get_AGNews_datasets(
    tokenizer,
    DEVICE,
    max_length=hyperparams['MAX_LEN'],
    train_pct=hyperparams['TRAIN_PCT'],
    generator=SEEDED_GEN,
)

# NOTE(review): the validation and test loaders also shuffle; confirm whether
# train()/test() in utils.py rely on that before switching them to a fixed order.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=hyperparams['TRAIN_BATCH_SIZE'],
    shuffle=True,
)
valid_dataloader = DataLoader(
    dataset=valid_dataset,
    batch_size=hyperparams['VALID_BATCH_SIZE'],
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=hyperparams['TEST_BATCH_SIZE'],
    shuffle=True,
)

# Sanity check: shape of one batch's 'encoding' entry and the keys of a single sample
display(next(iter(train_dataloader))['encoding'].shape)
display(train_dataset[5].keys())

#### 3.1) Pre-TAPT Fine Tuning

In [None]:
# Encoder taken from the stock pretrained masked-LM (its `.roberta` sub-module)
model_type = "distilroberta-base"
MLM_layers = AutoModelForMaskedLM.from_pretrained(model_type).roberta

wandb.init(
    project=project_name,
    job_type='pre_TAPT_finetune',
    name=f"20EPOCH-TAPT-FC_{hyperparams['RUN_SUFFIX']}",
    config=hyperparams
)

### --- TRAIN MODEL ----------------------
# Define Model and freeze pre-trained layers
model = RobertaMLM_with_classifier(MLM_layers, fc_hidden=hyperparams['FC_HIDDEN'], fc_dropout=hyperparams['FC_DROPOUT'])
model.to(DEVICE)
for param in model.mlm.parameters():
    param.requires_grad = False

# Define loss and optimizers (only the classifier parameters are optimized)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    params=model.classifier.parameters(),
    betas=hyperparams['ADAM_BETAS'],
    eps=hyperparams['ADAM_EPS'],
    lr=hyperparams['LR'],
    weight_decay=hyperparams['L2_REG']
)
# Linear LR decay from LR down to LR * SCH_ENDFACTOR over EPOCHS scheduler steps
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=hyperparams['SCH_ENDFACTOR'], total_iters=hyperparams['EPOCHS'])

# Perform Training and Testing
for epoch in range(hyperparams['EPOCHS']):
    print("-----------------------------------")
    print(f"Epoch {epoch+1}")
    print(f"Num. Training Batches = {len(train_dataloader)}")
    print(f"Num. Validation Batches = {len(valid_dataloader)}")
    print(f"Num. Test Batches = {len(test_dataloader)}")
    print("-----------------------------------")
    train(epoch+1, model, train_dataloader, valid_dataloader, optimizer, criterion, wandb)
    scheduler.step()  # one scheduler step per epoch
test(model, test_dataloader, criterion, wandb)

wandb.finish()

# Save model
curr_pretrain_time = int(time.time())
PATH = f"models/{model_type}_base_finetuned_{curr_pretrain_time}.pt"
# torch.save does not create missing directories; make sure the target folder
# exists so the trained weights are not lost to an error at the very end of the run.
os.makedirs(os.path.dirname(PATH), exist_ok=True)
torch.save(model.state_dict(), PATH)

#### 3.2) Post-TAPT Fine Tuning

In [None]:
# Folder of the saved masked-LM checkpoint whose encoder is fine-tuned here
model_type = "huggingface/DAPT_TAPT_1682693602"

MLM_layers = AutoModelForMaskedLM.from_pretrained(model_type).roberta
tokenizer = AutoTokenizer.from_pretrained(tokenizer_type)

wandb.init(
    project=project_name,
    job_type='post_TAPT_finetune',
    name=f"20EPOCH-DAPT_TAPT_FC_{hyperparams['RUN_SUFFIX']}",
    config=hyperparams
)

### --- TRAIN MODEL ----------------------
# Define Model and freeze pre-trained layers
final_model = RobertaMLM_with_classifier(MLM_layers, fc_hidden=hyperparams['FC_HIDDEN'], fc_dropout=hyperparams['FC_DROPOUT'])
final_model.to(DEVICE)
for param in final_model.mlm.parameters():
    param.requires_grad = False

# Define loss and optimizers (only the classifier parameters are optimized)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    params=final_model.classifier.parameters(),
    betas=hyperparams['ADAM_BETAS'],
    eps=hyperparams['ADAM_EPS'],
    lr=hyperparams['LR'],
    weight_decay=hyperparams['L2_REG']
)
# Linear LR decay from LR down to LR * SCH_ENDFACTOR over EPOCHS scheduler steps
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=hyperparams['SCH_ENDFACTOR'], total_iters=hyperparams['EPOCHS'])

# Perform Training and Testing
for epoch in range(hyperparams['EPOCHS']):
    print("-----------------------------------")
    print(f"Epoch {epoch+1}")
    print(f"Num. Training Batches = {len(train_dataloader)}")
    print(f"Num. Validation Batches = {len(valid_dataloader)}")
    print(f"Num. Test Batches = {len(test_dataloader)}")
    print("-----------------------------------")
    train(epoch+1, final_model, train_dataloader, valid_dataloader, optimizer, criterion, wandb)
    scheduler.step()  # one scheduler step per epoch
test(final_model, test_dataloader, criterion, wandb)

wandb.finish()

# Save model
curr_time = int(time.time())
# Derive the file name from the checkpoint folder name so the two cannot drift apart
# (for the checkpoint above this yields "models/DAPT_TAPT_1682693602_finetuned_<time>.pt").
PATH = f"models/{os.path.basename(model_type)}_finetuned_{curr_time}.pt"
# torch.save does not create missing directories; make sure the target folder
# exists so the trained weights are not lost to an error at the very end of the run.
os.makedirs(os.path.dirname(PATH), exist_ok=True)
torch.save(final_model.state_dict(), PATH)

### 4) Analysis

In [None]:
# Get confusion matrix
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import multilabel_confusion_matrix

In [None]:
# Load saved model from disk
model_type = "distilroberta-base"
# This path would be the related saved model - either TAPT, DAPT, DAPT+TAPT
PATH = "models/TAPT_1682353322_finetuned_1682474239.pt"
# Build the same architecture first; the saved state dict is then loaded into it
MLM_layers = AutoModelForMaskedLM.from_pretrained(model_type).roberta
lazarus_model = RobertaMLM_with_classifier(MLM_layers, fc_hidden=hyperparams['FC_HIDDEN'], fc_dropout=hyperparams['FC_DROPOUT'])
# map_location remaps the stored tensors onto the current device, so weights saved
# on a GPU can still be loaded on a CPU-only machine.
lazarus_model.load_state_dict(torch.load(PATH, map_location=DEVICE))
lazarus_model.to(DEVICE)
lazarus_model.eval()
# Inference only: no parameter needs gradients
for param in lazarus_model.parameters():
    param.requires_grad = False

In [None]:
# Perform classification and analysis
tot_batches = len(test_dataloader)
total_acc = 0.
predicted = []
target = []
# Inference only, so autograd bookkeeping is disabled. The DataLoader is passed
# to tqdm directly (not wrapped in iter()) so tqdm can read its length and show
# a progress total.
with torch.no_grad():
    for data in tqdm(test_dataloader):
        # Drop the size-1 dimension at dim 1 (squeeze is a no-op if it is not size 1)
        encodings = data['encoding'].squeeze(dim=1)
        masks = data['mask'].squeeze(dim=1)
        targets = data['label']

        output = lazarus_model(encodings, masks)
        pred = torch.argmax(output, dim=-1)  # index of the highest score per sample
        acc = accuracy(output, targets)
        total_acc += acc

        predicted.append(pred)
        target.append(targets)

# Concatenate the per-batch results into single tensors
predicted = torch.cat(predicted)
target = torch.cat(target)

# Report the aggregate that was accumulated above (unweighted mean over batches
# of whatever accuracy() returns per batch).
print(f"Mean per-batch accuracy over {tot_batches} batches: {total_acc / tot_batches}")

In [None]:
# sklearn's signature is confusion_matrix(y_true, y_pred): the true labels must
# come first so that rows are true classes and columns are predicted classes.
print(confusion_matrix(target.cpu(), predicted.cpu()))

# '0': World
# '1': Sports
# '2': Business
# '3': Sci/Tech

# True label on ith (dim 0), predicted on jth (dim 1)

In [None]:
# sklearn's signature is multilabel_confusion_matrix(y_true, y_pred); passing the
# predictions first would swap false positives and false negatives in each 2x2 matrix.
print(multilabel_confusion_matrix(target.cpu(), predicted.cpu()))
