In [None]:
# Prints the literal 0. NOTE(review): looks like a leftover kernel smoke test
# with no effect on the rest of the notebook — confirm and consider deleting.
print(0)

---
# LLM Finetuning

---

In [None]:
import torch

from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments #, BertModel
from datasets import load_dataset, Dataset

from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR

from sklearn.metrics import accuracy_score

#import tensorflow as tf
#import numpy as np
#import nltk
#import datasets

import pickle


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

In [4]:
def compute_metrics(eval_pred):
    """Compute classification accuracy from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-item iterable unpacked as ``(logits, labels)``.
            Presumably the evaluation output handed over by the Trainer, with
            logits laid out as (examples, classes) — TODO confirm.

    Returns:
        dict: ``{'accuracy': value}`` where ``value`` is whatever
        ``accuracy_score`` returns for the labels and predicted classes.
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along dimension 1.
    predicted_classes = torch.tensor(logits).argmax(dim=1)
    true_classes = torch.tensor(labels)
    return {'accuracy': accuracy_score(true_classes, predicted_classes)}

In [5]:
# Tokenizer loaded from the 'bert-base-uncased' checkpoint; shared by every dataset below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(example):
    """Tokenize the ``'text'`` entry of a dataset example (or batch of examples).

    Args:
        example: Mapping with a ``'text'`` key holding the raw text to encode.

    Returns:
        The tokenizer output for that text, requested with
        ``padding='max_length'`` and ``truncation=True``.
    """
    encode_options = {'padding': 'max_length', 'truncation': True}
    return tokenizer(example['text'], **encode_options)


In [6]:
# Candidate pretrained checkpoints, keyed 1..3 (tiny, mini, small).
modelsizes = dict(enumerate(
    ('prajjwal1/bert-tiny', 'prajjwal1/bert-mini', 'prajjwal1/bert-small'),
    start=1,
))

# Checkpoint that every experiment below loads.
pre_t_model = modelsizes[3]

# Learning rate handed to both TrainingArguments and each AdamW optimizer.
my_lr = 1e-3

# Number of training epochs.
epox = 3


---
# IMDB

---

In [None]:

# Load the 'imdb' dataset and tokenize its 'train' split in batches.
dataset = load_dataset('imdb')

tokenized_datasets = dataset['train'].map(tokenize_function, batched=True)

# Convert the tokenized split to a dict; the split cell reads its
# 'input_ids', 'attention_mask' and 'label' entries.
train_data = tokenized_datasets.to_dict()

# Disabled: cache the tokenized columns on disk.
# NOTE(review): the path is absolute and machine-specific.
#with open(r'C:/Users/ege/git/NLP/datasets/imdb.pkl', 'wb') as file:
#    pickle.dump(train_data, file)


In [None]:

#with open(r'C:\Users\ege\git\NLP\datasets\imdb.pkl', 'rb') as file:
#    train_data = pickle.load(file)


# Hold out 10% of the tokenized training rows for validation.
# random_state pins the shuffle so a Restart & Run All reproduces the same
# partition (and therefore comparable metrics); without it every run drew a
# different validation set.
train_dataset, validation_dataset = train_test_split(
    list(zip(train_data['input_ids'], train_data['attention_mask'], train_data['label'])),
    test_size=0.1,
    random_state=42,
)

#print(type(tokenized_datasets))

# Turn each (input_ids, attention_mask, label) tuple back into a keyed record.
train_dataset = [{'input_ids': x[0], 'attention_mask': x[1], 'label': x[2]} for x in train_dataset]
validation_dataset = [{'input_ids': x[0], 'attention_mask': x[1], 'label': x[2]} for x in validation_dataset]

print("Train:", len(train_dataset))
print("Val:", len(validation_dataset))


# Build column-oriented datasets for the Trainer. The target column is stored
# under 'labels' (plural) here, unlike the 'label' key of the source rows.
train_dataset_hf = Dataset.from_dict({'input_ids': [x['input_ids'] for x in train_dataset],
                                      'attention_mask': [x['attention_mask'] for x in train_dataset],
                                      'labels': [x['label'] for x in train_dataset]})

validation_dataset_hf = Dataset.from_dict({'input_ids': [x['input_ids'] for x in validation_dataset],
                                           'attention_mask': [x['attention_mask'] for x in validation_dataset],
                                           'labels': [x['label'] for x in validation_dataset]})

#print(type(train_dataset_hf))

In [9]:

# Training configuration shared by every Trainer in this notebook.
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=my_lr,
    per_device_train_batch_size=8,  # also read by the trainer cells to size the LR schedule
    per_device_eval_batch_size=8,
    num_train_epochs=epox,
    # NOTE(review): each Trainer below is given its own AdamW, so this
    # weight_decay (and learning_rate) presumably goes unused — confirm.
    weight_decay=0.01,
    report_to=[],
)


In [None]:
# Baseline run: load the pretrained checkpoint with a 2-class head and leave
# every parameter trainable.
model_base = BertForSequenceClassification.from_pretrained(pre_t_model, num_labels=2).to(device)

print("\n The total trainable parameters on the baseline model:", sum(v.numel() for _, v in model_base.named_parameters() if v.requires_grad))

optimizer = AdamW(filter(lambda p: p.requires_grad, model_base.parameters()), lr=my_lr)

# T_max has to cover every optimizer step of the run. Ceiling division counts
# the final partial batch of each epoch; the previous floor division dropped
# it, leaving T_max short so the cosine schedule turned upward on the last steps.
# Assumes one device plus the defaults gradient_accumulation_steps=1 and
# dataloader_drop_last=False, none of which training_args overrides.
batch_size = training_args.per_device_train_batch_size
steps_per_epoch = (len(train_dataset_hf) + batch_size - 1) // batch_size
num_training_steps = steps_per_epoch * training_args.num_train_epochs
scheduler = CosineAnnealingLR(optimizer, T_max=num_training_steps)

# The (optimizer, scheduler) pair is handed to the Trainer explicitly.
trainer_base = Trainer(
    model=model_base,
    args=training_args,
    train_dataset=train_dataset_hf,
    eval_dataset=validation_dataset_hf,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

In [None]:
# Run training for the baseline model.
trainer_base.train()


# Evaluate once more after training (presumably on the eval_dataset given to
# the Trainer — confirm) and print the resulting metrics.
base_metrics = trainer_base.evaluate()
print("Baseline Evaluation Metrics:", base_metrics)

In [None]:
# Bias-only run: start from the same pretrained checkpoint with a 2-class head.
model_bo = BertForSequenceClassification.from_pretrained(pre_t_model, num_labels=2).to(device)


print("\n The total trainable parameters before:", sum(v.numel() for _, v in model_bo.named_parameters() if v.requires_grad))

# Freeze everything except parameters whose name contains "classifier" or "bias".
for name, param in model_bo.named_parameters():
    param.requires_grad = "classifier" in name or "bias" in name

print(" after:", sum(v.numel() for _, v in model_bo.named_parameters() if v.requires_grad))


# Only the parameters left trainable are handed to the optimizer.
optimizer = AdamW(filter(lambda p: p.requires_grad, model_bo.parameters()), lr=my_lr)

# T_max has to cover every optimizer step of the run. Ceiling division counts
# the final partial batch of each epoch; the previous floor division dropped
# it, leaving T_max short so the cosine schedule turned upward on the last steps.
# Assumes one device plus the defaults gradient_accumulation_steps=1 and
# dataloader_drop_last=False, none of which training_args overrides.
batch_size = training_args.per_device_train_batch_size
steps_per_epoch = (len(train_dataset_hf) + batch_size - 1) // batch_size
num_training_steps = steps_per_epoch * training_args.num_train_epochs
scheduler = CosineAnnealingLR(optimizer, T_max=num_training_steps)

bias_trainer = Trainer(
    model=model_bo,
    args=training_args,
    train_dataset=train_dataset_hf,
    eval_dataset=validation_dataset_hf,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

In [None]:
# Run training for the bias-only model.
bias_trainer.train()

# Evaluate after training and print the resulting metrics.
bias_o_metrics = bias_trainer.evaluate()
print("Bias-only Evaluation Metrics:", bias_o_metrics)

---
# AG News

---

In [None]:

# Load the 'ag_news' dataset and tokenize its 'train' split in batches.
# This rebinds dataset / tokenized_datasets / train_data from the previous section.
dataset = load_dataset('ag_news')

tokenized_datasets = dataset['train'].map(tokenize_function, batched=True)

# Convert the tokenized split to a dict; the split cell reads its
# 'input_ids', 'attention_mask' and 'label' entries.
train_data = tokenized_datasets.to_dict()


# Disabled: cache the tokenized columns on disk.
# NOTE(review): the path is absolute and machine-specific.
#with open('C:/Users/ege/git/NLP/datasets/agn.pkl', 'wb') as file:
#    pickle.dump(train_data, file)


In [None]:
#with open('C:/Users/ege/git/NLP/datasets/agn.pkl', 'rb') as file:
#    train_data = pickle.load(file)

# Hold out 10% of the tokenized training rows for validation.
# random_state pins the shuffle so a Restart & Run All reproduces the same
# partition (and therefore comparable metrics); without it every run drew a
# different validation set.
train_dataset, validation_dataset = train_test_split(
    list(zip(train_data['input_ids'], train_data['attention_mask'], train_data['label'])),
    test_size=0.1,
    random_state=42,
)

# Turn each (input_ids, attention_mask, label) tuple back into a keyed record.
train_dataset = [{'input_ids': x[0], 'attention_mask': x[1], 'label': x[2]} for x in train_dataset]
validation_dataset = [{'input_ids': x[0], 'attention_mask': x[1], 'label': x[2]} for x in validation_dataset]

print("Train:", len(train_dataset))
print("Val:", len(validation_dataset))
#print("Test:", len(dataset['test']))


# Build column-oriented datasets for the Trainer. The target column is stored
# under 'labels' (plural) here, unlike the 'label' key of the source rows.
train_dataset_hf = Dataset.from_dict({'input_ids': [x['input_ids'] for x in train_dataset],
                                      'attention_mask': [x['attention_mask'] for x in train_dataset],
                                      'labels': [x['label'] for x in train_dataset]})

validation_dataset_hf = Dataset.from_dict({'input_ids': [x['input_ids'] for x in validation_dataset],
                                           'attention_mask': [x['attention_mask'] for x in validation_dataset],
                                           'labels': [x['label'] for x in validation_dataset]})


In [None]:
# Baseline run: load the pretrained checkpoint with a 4-class head and leave
# every parameter trainable.
model_base = BertForSequenceClassification.from_pretrained(pre_t_model, num_labels=4).to(device)

optimizer = AdamW(filter(lambda p: p.requires_grad, model_base.parameters()), lr=my_lr)

# T_max has to cover every optimizer step of the run. Ceiling division counts a
# final partial batch per epoch when the dataset size is not a multiple of the
# batch size; floor division would drop it and leave T_max short, letting the
# cosine schedule turn upward on the last steps.
# Assumes one device plus the defaults gradient_accumulation_steps=1 and
# dataloader_drop_last=False, none of which training_args overrides.
batch_size = training_args.per_device_train_batch_size
steps_per_epoch = (len(train_dataset_hf) + batch_size - 1) // batch_size
num_training_steps = steps_per_epoch * training_args.num_train_epochs
scheduler = CosineAnnealingLR(optimizer, T_max=num_training_steps)

# The (optimizer, scheduler) pair is handed to the Trainer explicitly.
trainer_base = Trainer(
    model=model_base,
    args=training_args,
    train_dataset=train_dataset_hf,
    eval_dataset=validation_dataset_hf,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

In [None]:
# Run training for the baseline model.
trainer_base.train()


# Evaluate once more after training (presumably on the eval_dataset given to
# the Trainer — confirm) and print the resulting metrics.
base_metrics = trainer_base.evaluate()
print("Baseline Evaluation Metrics:", base_metrics)


In [None]:
# Bias-only run: start from the same pretrained checkpoint with a 4-class head.
model_bo = BertForSequenceClassification.from_pretrained(pre_t_model, num_labels=4).to(device)

# Freeze everything except parameters whose name contains "classifier" or "bias".
for name, param in model_bo.named_parameters():
    param.requires_grad = "classifier" in name or "bias" in name


# Only the parameters left trainable are handed to the optimizer.
optimizer = AdamW(filter(lambda p: p.requires_grad, model_bo.parameters()), lr=my_lr)

# T_max has to cover every optimizer step of the run. Ceiling division counts a
# final partial batch per epoch when the dataset size is not a multiple of the
# batch size; floor division would drop it and leave T_max short, letting the
# cosine schedule turn upward on the last steps.
# Assumes one device plus the defaults gradient_accumulation_steps=1 and
# dataloader_drop_last=False, none of which training_args overrides.
batch_size = training_args.per_device_train_batch_size
steps_per_epoch = (len(train_dataset_hf) + batch_size - 1) // batch_size
num_training_steps = steps_per_epoch * training_args.num_train_epochs
scheduler = CosineAnnealingLR(optimizer, T_max=num_training_steps)

bias_trainer = Trainer(
    model=model_bo,
    args=training_args,
    train_dataset=train_dataset_hf,
    eval_dataset=validation_dataset_hf,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

In [None]:
# Run training for the bias-only model.
bias_trainer.train()

# Evaluate after training and print the resulting metrics.
bias_o_metrics = bias_trainer.evaluate()
print("Bias-only Evaluation Metrics:", bias_o_metrics)

In [None]:

# Load the 'fancyzhx/yelp_polarity' dataset and tokenize its 'train' split in batches.
# This rebinds dataset / tokenized_datasets / train_data from the previous section.
dataset = load_dataset('fancyzhx/yelp_polarity')

tokenized_datasets = dataset['train'].map(tokenize_function, batched=True)

# Convert the tokenized split to a dict; the split cell reads its
# 'input_ids', 'attention_mask' and 'label' entries.
train_data = tokenized_datasets.to_dict()


# Disabled: cache the tokenized columns on disk.
# NOTE(review): the path is absolute and machine-specific.
#with open('C:/Users/ege/git/NLP/datasets/yelp.pkl', 'wb') as file:
#    pickle.dump(train_data, file)


In [None]:

#with open('C:/Users/ege/git/NLP/datasets/yelp.pkl', 'rb') as file:
#    train_data = pickle.load(file)


# Hold out 10% of the tokenized training rows for validation.
# random_state pins the shuffle so a Restart & Run All reproduces the same
# partition (and therefore comparable metrics); without it every run drew a
# different validation set.
train_dataset, validation_dataset = train_test_split(
    list(zip(train_data['input_ids'], train_data['attention_mask'], train_data['label'])),
    test_size=0.1,
    random_state=42,
)

# Turn each (input_ids, attention_mask, label) tuple back into a keyed record.
train_dataset = [{'input_ids': x[0], 'attention_mask': x[1], 'label': x[2]} for x in train_dataset]
validation_dataset = [{'input_ids': x[0], 'attention_mask': x[1], 'label': x[2]} for x in validation_dataset]

print("Train:", len(train_dataset))
print("Val:", len(validation_dataset))
#print("Test:", len(dataset['test']))

# Build column-oriented datasets for the Trainer. The target column is stored
# under 'labels' (plural) here, unlike the 'label' key of the source rows.
train_dataset_hf = Dataset.from_dict({'input_ids': [x['input_ids'] for x in train_dataset],
                                      'attention_mask': [x['attention_mask'] for x in train_dataset],
                                      'labels': [x['label'] for x in train_dataset]})

validation_dataset_hf = Dataset.from_dict({'input_ids': [x['input_ids'] for x in validation_dataset],
                                           'attention_mask': [x['attention_mask'] for x in validation_dataset],
                                           'labels': [x['label'] for x in validation_dataset]})


# Disabled cache-writing snippet, kept as a bare string literal: nothing inside
# it is executed. NOTE(review): the paths are absolute and machine-specific.
"""
with open('C:/Users/ege/git/NLP/datasets/yelp-train.pkl', 'wb') as file:
    pickle.dump(train_dataset_hf, file)

with open('C:/Users/ege/git/NLP/datasets/yelp-val.pkl', 'wb') as file:
    pickle.dump(validation_dataset_hf, file)
"""

In [22]:
# Disabled cache-loading snippet, kept as a bare string literal: nothing inside
# it is executed, so train_dataset_hf / validation_dataset_hf are not touched here.
# NOTE(review): if this is re-enabled, only unpickle files you created yourself
# (pickle.load can execute arbitrary code) and replace the absolute paths.
"""
with open('C:/Users/ege/git/NLP/datasets/yelp-train.pkl', 'rb') as file:
    train_dataset_hf = pickle.load(file)

with open('C:/Users/ege/git/NLP/datasets/yelp-val.pkl', 'rb') as file:
    validation_dataset_hf = pickle.load(file)
"""

In [None]:
# Baseline run: load the pretrained checkpoint with a 2-class head and leave
# every parameter trainable.
model_base = BertForSequenceClassification.from_pretrained(pre_t_model, num_labels=2).to(device)

optimizer = AdamW(filter(lambda p: p.requires_grad, model_base.parameters()), lr=my_lr)

# T_max has to cover every optimizer step of the run. Ceiling division counts a
# final partial batch per epoch when the dataset size is not a multiple of the
# batch size; floor division would drop it and leave T_max short, letting the
# cosine schedule turn upward on the last steps.
# Assumes one device plus the defaults gradient_accumulation_steps=1 and
# dataloader_drop_last=False, none of which training_args overrides.
batch_size = training_args.per_device_train_batch_size
steps_per_epoch = (len(train_dataset_hf) + batch_size - 1) // batch_size
num_training_steps = steps_per_epoch * training_args.num_train_epochs
scheduler = CosineAnnealingLR(optimizer, T_max=num_training_steps)

# The (optimizer, scheduler) pair is handed to the Trainer explicitly.
trainer_base = Trainer(
    model=model_base,
    args=training_args,
    train_dataset=train_dataset_hf,
    eval_dataset=validation_dataset_hf,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

In [None]:
# Run training for the baseline model.
trainer_base.train()


# Evaluate once more after training (presumably on the eval_dataset given to
# the Trainer — confirm) and print the resulting metrics.
base_metrics = trainer_base.evaluate()
print("Baseline Evaluation Metrics:", base_metrics)

In [None]:
# Bias-only run: start from the same pretrained checkpoint with a 2-class head.
model_bo = BertForSequenceClassification.from_pretrained(pre_t_model, num_labels=2).to(device)


# Freeze everything except parameters whose name contains "classifier" or "bias".
for name, param in model_bo.named_parameters():
    param.requires_grad = "classifier" in name or "bias" in name


# Only the parameters left trainable are handed to the optimizer.
optimizer = AdamW(filter(lambda p: p.requires_grad, model_bo.parameters()), lr=my_lr)

# T_max has to cover every optimizer step of the run. Ceiling division counts a
# final partial batch per epoch when the dataset size is not a multiple of the
# batch size; floor division would drop it and leave T_max short, letting the
# cosine schedule turn upward on the last steps.
# Assumes one device plus the defaults gradient_accumulation_steps=1 and
# dataloader_drop_last=False, none of which training_args overrides.
batch_size = training_args.per_device_train_batch_size
steps_per_epoch = (len(train_dataset_hf) + batch_size - 1) // batch_size
num_training_steps = steps_per_epoch * training_args.num_train_epochs
scheduler = CosineAnnealingLR(optimizer, T_max=num_training_steps)

bias_trainer = Trainer(
    model=model_bo,
    args=training_args,
    train_dataset=train_dataset_hf,
    eval_dataset=validation_dataset_hf,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

In [None]:
# Run training for the bias-only model.
bias_trainer.train()

# Evaluate after training and print the resulting metrics.
bias_o_metrics = bias_trainer.evaluate()
print("Bias-only Evaluation Metrics:", bias_o_metrics)

In [None]:
# Prints the literal 1. NOTE(review): looks like an end-of-notebook marker left
# over from debugging — confirm and consider deleting.
print(1)