In [1]:
print(0)

0


---
# LLM Finetuning

---

In [2]:
import torch

from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments #, BertModel
from datasets import load_dataset, Dataset

from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR

from sklearn.metrics import accuracy_score

#import tensorflow as tf
#import numpy as np
#import nltk
#import datasets

import pickle





In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [4]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=1)
    labels = torch.tensor(labels)
    accuracy = accuracy_score(labels, predictions)
    return {'accuracy': accuracy}

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(example):  
    return tokenizer(example['text'], padding='max_length', truncation=True)


In [6]:
modelsizes = {1:'prajjwal1/bert-tiny', 2:'prajjwal1/bert-mini', 3:'prajjwal1/bert-small'}

pre_t_model = modelsizes[1]

my_lr = 0.001

epox = 3

---
# IMDB

---

In [7]:

dataset = load_dataset('imdb')

tokenized_datasets = dataset['train'].map(tokenize_function, batched=True)

train_data = tokenized_datasets.to_dict()

#with open(r'C:/Users/ege/git/NLP/datasets/imdb.pkl', 'wb') as file:
#    pickle.dump(train_data, file)


In [8]:

#with open(r'C:\Users\ege\git\NLP\datasets\imdb.pkl', 'rb') as file:
#    train_data = pickle.load(file)


train_dataset, validation_dataset = train_test_split(list(zip(train_data['input_ids'], train_data['attention_mask'], train_data['label'])), test_size=0.1)

#print(type(tokenized_datasets))

train_dataset = [{'input_ids': x[0], 'attention_mask': x[1], 'label': x[2]} for x in train_dataset]
validation_dataset = [{'input_ids': x[0], 'attention_mask': x[1], 'label': x[2]} for x in validation_dataset]

print("Train:", len(train_dataset))
print("Val:", len(validation_dataset))


train_dataset_hf = Dataset.from_dict({'input_ids': [x['input_ids'] for x in train_dataset],
                                      'attention_mask': [x['attention_mask'] for x in train_dataset],
                                      'labels': [x['label'] for x in train_dataset]})

validation_dataset_hf = Dataset.from_dict({'input_ids': [x['input_ids'] for x in validation_dataset],
                                           'attention_mask': [x['attention_mask'] for x in validation_dataset],
                                           'labels': [x['label'] for x in validation_dataset]})

#print(type(train_dataset_hf))

Train: 22500
Val: 2500


In [9]:

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=my_lr,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=epox,
    weight_decay=0.01,
    report_to=[],
)


In [10]:
model_base = BertForSequenceClassification.from_pretrained(pre_t_model, num_labels=2).to(device)

print("\n The total trainable parameters on the baseline model:", sum([v.numel() for _,v in model_base.named_parameters() if v.requires_grad]))

optimizer = AdamW(filter(lambda p: p.requires_grad, model_base.parameters()), lr=my_lr)

num_training_steps = len(train_dataset_hf) // training_args.per_device_train_batch_size * training_args.num_train_epochs
scheduler = CosineAnnealingLR(optimizer, T_max=num_training_steps)

trainer_base = Trainer(
    model=model_base,
    args=training_args,
    train_dataset=train_dataset_hf,
    eval_dataset=validation_dataset_hf,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 The total trainable parameters on the baseline model: 4386178


In [11]:
trainer_base.train()


base_metrics = trainer_base.evaluate()
print("Baseline Evaluation Metrics:", base_metrics)

  0%|          | 0/8439 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

{'loss': 0.6384, 'grad_norm': 2.881664752960205, 'learning_rate': 0.0009913572595317598, 'epoch': 0.18}
{'loss': 0.6465, 'grad_norm': 4.051089763641357, 'learning_rate': 0.0009657278259782398, 'epoch': 0.36}
{'loss': 0.6238, 'grad_norm': 10.590774536132812, 'learning_rate': 0.0009239977335096455, 'epoch': 0.53}
{'loss': 0.6243, 'grad_norm': 1.5676418542861938, 'learning_rate': 0.0008676096315616631, 'epoch': 0.71}
{'loss': 0.6134, 'grad_norm': 1.2421149015426636, 'learning_rate': 0.0007985129110568284, 'epoch': 0.89}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.5919720530509949, 'eval_accuracy': 0.7264, 'eval_runtime': 1.9941, 'eval_samples_per_second': 1253.696, 'eval_steps_per_second': 156.963, 'epoch': 1.0}
{'loss': 0.605, 'grad_norm': 1.2453041076660156, 'learning_rate': 0.0007190963120852617, 'epoch': 1.07}
{'loss': 0.609, 'grad_norm': 2.9887382984161377, 'learning_rate': 0.0006321053428620858, 'epoch': 1.24}
{'loss': 0.6144, 'grad_norm': 4.765998840332031, 'learning_rate': 0.0005405473648676087, 'epoch': 1.42}
{'loss': 0.6208, 'grad_norm': 11.15176773071289, 'learning_rate': 0.000447587625468245, 'epoch': 1.6}
{'loss': 0.621, 'grad_norm': 11.793036460876465, 'learning_rate': 0.00035643983227049005, 'epoch': 1.78}
{'loss': 0.6235, 'grad_norm': 3.8998804092407227, 'learning_rate': 0.00027025505215778884, 'epoch': 1.96}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.6148188710212708, 'eval_accuracy': 0.7028, 'eval_runtime': 1.8705, 'eval_samples_per_second': 1336.533, 'eval_steps_per_second': 167.334, 'epoch': 2.0}
{'loss': 0.6157, 'grad_norm': 3.6160497665405273, 'learning_rate': 0.00019201277587744715, 'epoch': 2.13}
{'loss': 0.6253, 'grad_norm': 4.627112865447998, 'learning_rate': 0.00012441791417960822, 'epoch': 2.31}
{'loss': 0.6361, 'grad_norm': 4.108634948730469, 'learning_rate': 6.98072864508354e-05, 'epoch': 2.49}
{'loss': 0.6599, 'grad_norm': 4.4647016525268555, 'learning_rate': 3.006883462019922e-05, 'epoch': 2.67}
{'loss': 0.6499, 'grad_norm': 1.1900697946548462, 'learning_rate': 6.576355190827588e-06, 'epoch': 2.84}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.6363691091537476, 'eval_accuracy': 0.6224, 'eval_runtime': 1.9294, 'eval_samples_per_second': 1295.728, 'eval_steps_per_second': 162.225, 'epoch': 3.0}
{'train_runtime': 126.9305, 'train_samples_per_second': 531.787, 'train_steps_per_second': 66.485, 'train_loss': 0.628055985214897, 'epoch': 3.0}


  0%|          | 0/313 [00:00<?, ?it/s]

Baseline Evaluation Metrics: {'eval_loss': 0.6363691091537476, 'eval_accuracy': 0.6224, 'eval_runtime': 2.0197, 'eval_samples_per_second': 1237.785, 'eval_steps_per_second': 154.971, 'epoch': 3.0}


In [12]:
model_bo = BertForSequenceClassification.from_pretrained(pre_t_model, num_labels=2).to(device)


print("\n The total trainable parameters before:", sum([v.numel() for _,v in model_bo.named_parameters() if v.requires_grad]))

for k, v in model_bo.named_parameters():
    v.requires_grad = (("classifier" in k) or ("bias" in k))

print(" after:", sum([v.numel() for _,v in model_bo.named_parameters() if v.requires_grad]))


optimizer = AdamW(filter(lambda p: p.requires_grad, model_bo.parameters()), lr=my_lr)

num_training_steps = len(train_dataset_hf) // training_args.per_device_train_batch_size * training_args.num_train_epochs
scheduler = CosineAnnealingLR(optimizer, T_max=num_training_steps)

bias_trainer = Trainer(
    model=model_bo,
    args=training_args,
    train_dataset=train_dataset_hf,
    eval_dataset=validation_dataset_hf,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 The total trainable parameters before: 4386178
 after: 3330


In [13]:
bias_trainer.train()

bias_o_metrics = bias_trainer.evaluate()
print("Bias-only Evaluation Metrics:", bias_o_metrics)

  0%|          | 0/8439 [00:00<?, ?it/s]

{'loss': 0.6445, 'grad_norm': 1.7688113451004028, 'learning_rate': 0.0009913572595317598, 'epoch': 0.18}
{'loss': 0.5996, 'grad_norm': 2.289224863052368, 'learning_rate': 0.0009657278259782398, 'epoch': 0.36}
{'loss': 0.5717, 'grad_norm': 1.9591431617736816, 'learning_rate': 0.0009239977335096455, 'epoch': 0.53}
{'loss': 0.5653, 'grad_norm': 1.0617870092391968, 'learning_rate': 0.0008676096315616631, 'epoch': 0.71}
{'loss': 0.5559, 'grad_norm': 1.5690187215805054, 'learning_rate': 0.0007985129110568284, 'epoch': 0.89}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.5413217544555664, 'eval_accuracy': 0.7248, 'eval_runtime': 1.9397, 'eval_samples_per_second': 1288.886, 'eval_steps_per_second': 161.369, 'epoch': 1.0}
{'loss': 0.5431, 'grad_norm': 1.9532030820846558, 'learning_rate': 0.0007190963120852617, 'epoch': 1.07}
{'loss': 0.5252, 'grad_norm': 2.4829888343811035, 'learning_rate': 0.0006321053428620858, 'epoch': 1.24}
{'loss': 0.5116, 'grad_norm': 1.3111385107040405, 'learning_rate': 0.0005405473648676087, 'epoch': 1.42}
{'loss': 0.5089, 'grad_norm': 0.8862146735191345, 'learning_rate': 0.000447587625468245, 'epoch': 1.6}
{'loss': 0.5031, 'grad_norm': 2.504058599472046, 'learning_rate': 0.00035643983227049005, 'epoch': 1.78}
{'loss': 0.5045, 'grad_norm': 2.343111276626587, 'learning_rate': 0.00027025505215778884, 'epoch': 1.96}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.47035160660743713, 'eval_accuracy': 0.78, 'eval_runtime': 1.8754, 'eval_samples_per_second': 1333.015, 'eval_steps_per_second': 166.894, 'epoch': 2.0}
{'loss': 0.4906, 'grad_norm': 1.1119333505630493, 'learning_rate': 0.00019201277587744715, 'epoch': 2.13}
{'loss': 0.488, 'grad_norm': 1.352243423461914, 'learning_rate': 0.00012441791417960822, 'epoch': 2.31}
{'loss': 0.4909, 'grad_norm': 1.6634961366653442, 'learning_rate': 6.98072864508354e-05, 'epoch': 2.49}
{'loss': 0.4861, 'grad_norm': 2.299226999282837, 'learning_rate': 3.006883462019922e-05, 'epoch': 2.67}
{'loss': 0.4831, 'grad_norm': 1.7252436876296997, 'learning_rate': 6.576355190827588e-06, 'epoch': 2.84}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.46440595388412476, 'eval_accuracy': 0.7876, 'eval_runtime': 1.9311, 'eval_samples_per_second': 1294.581, 'eval_steps_per_second': 162.082, 'epoch': 3.0}
{'train_runtime': 106.3299, 'train_samples_per_second': 634.817, 'train_steps_per_second': 79.366, 'train_loss': 0.5270893880295123, 'epoch': 3.0}


  0%|          | 0/313 [00:00<?, ?it/s]

Bias-only Evaluation Metrics: {'eval_loss': 0.46440595388412476, 'eval_accuracy': 0.7876, 'eval_runtime': 1.9372, 'eval_samples_per_second': 1290.541, 'eval_steps_per_second': 161.576, 'epoch': 3.0}


---
# AG N

---

In [14]:

dataset = load_dataset('ag_news')

tokenized_datasets = dataset['train'].map(tokenize_function, batched=True)

train_data = tokenized_datasets.to_dict()


#with open('C:/Users/ege/git/NLP/datasets/agn.pkl', 'wb') as file:
#    pickle.dump(train_data, file)


In [15]:
#with open('C:/Users/ege/git/NLP/datasets/agn.pkl', 'rb') as file:
#    train_data = pickle.load(file)

train_dataset, validation_dataset = train_test_split(
    list(zip(train_data['input_ids'], train_data['attention_mask'], train_data['label'])), 
    test_size=0.1
)

train_dataset = [{'input_ids': x[0], 'attention_mask': x[1], 'label': x[2]} for x in train_dataset]
validation_dataset = [{'input_ids': x[0], 'attention_mask': x[1], 'label': x[2]} for x in validation_dataset]

print("Train:", len(train_dataset))
print("Val:", len(validation_dataset))
#print("Test:", len(dataset['test']))


train_dataset_hf = Dataset.from_dict({'input_ids': [x['input_ids'] for x in train_dataset],
                                      'attention_mask': [x['attention_mask'] for x in train_dataset],
                                      'labels': [x['label'] for x in train_dataset]})

validation_dataset_hf = Dataset.from_dict({'input_ids': [x['input_ids'] for x in validation_dataset],
                                           'attention_mask': [x['attention_mask'] for x in validation_dataset],
                                           'labels': [x['label'] for x in validation_dataset]})


Train: 108000
Val: 12000


In [16]:
model_base = BertForSequenceClassification.from_pretrained(pre_t_model, num_labels=4).to(device)

optimizer = AdamW(filter(lambda p: p.requires_grad, model_base.parameters()), lr=my_lr)

num_training_steps = len(train_dataset_hf) // training_args.per_device_train_batch_size * training_args.num_train_epochs
scheduler = CosineAnnealingLR(optimizer, T_max=num_training_steps)

trainer_base = Trainer(
    model=model_base,
    args=training_args,
    train_dataset=train_dataset_hf,
    eval_dataset=validation_dataset_hf,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
trainer_base.train()


base_metrics = trainer_base.evaluate()
print("Baseline Evaluation Metrics:", base_metrics)


  0%|          | 0/40500 [00:00<?, ?it/s]

{'loss': 0.623, 'grad_norm': 6.52887487411499, 'learning_rate': 0.0009996239762521157, 'epoch': 0.04}
{'loss': 0.602, 'grad_norm': 18.90766143798828, 'learning_rate': 0.000998496470583894, 'epoch': 0.07}
{'loss': 0.5608, 'grad_norm': 26.298551559448242, 'learning_rate': 0.0009966191788709695, 'epoch': 0.11}
{'loss': 0.5327, 'grad_norm': 3.426032304763794, 'learning_rate': 0.0009939949247383998, 'epoch': 0.15}
{'loss': 0.5353, 'grad_norm': 4.635737419128418, 'learning_rate': 0.0009906276553136876, 'epoch': 0.19}
{'loss': 0.459, 'grad_norm': 4.3768110275268555, 'learning_rate': 0.0009865224352899075, 'epoch': 0.22}
{'loss': 0.4585, 'grad_norm': 9.536322593688965, 'learning_rate': 0.0009816854393079391, 'epoch': 0.26}
{'loss': 0.4379, 'grad_norm': 2.895192861557007, 'learning_rate': 0.0009761239426692084, 'epoch': 0.3}
{'loss': 0.4027, 'grad_norm': 0.3314124047756195, 'learning_rate': 0.0009698463103929548, 'epoch': 0.33}
{'loss': 0.3984, 'grad_norm': 0.36109113693237305, 'learning_rate':

  0%|          | 0/1500 [00:00<?, ?it/s]

{'eval_loss': 0.33774396777153015, 'eval_accuracy': 0.9085833333333333, 'eval_runtime': 9.2516, 'eval_samples_per_second': 1297.068, 'eval_steps_per_second': 162.134, 'epoch': 1.0}
{'loss': 0.2448, 'grad_norm': 3.579455614089966, 'learning_rate': 0.0007330217598512688, 'epoch': 1.04}
{'loss': 0.2432, 'grad_norm': 5.710777282714844, 'learning_rate': 0.000715693032840626, 'epoch': 1.07}
{'loss': 0.2528, 'grad_norm': 0.19930502772331238, 'learning_rate': 0.0006980398830195779, 'epoch': 1.11}
{'loss': 0.2521, 'grad_norm': 0.14150141179561615, 'learning_rate': 0.0006800888624023555, 'epoch': 1.15}
{'loss': 0.2512, 'grad_norm': 3.360746145248413, 'learning_rate': 0.0006618669710291599, 'epoch': 1.19}
{'loss': 0.2686, 'grad_norm': 0.9398163557052612, 'learning_rate': 0.0006434016163555446, 'epoch': 1.22}
{'loss': 0.2524, 'grad_norm': 3.7759838104248047, 'learning_rate': 0.0006247205720289912, 'epoch': 1.26}
{'loss': 0.2325, 'grad_norm': 0.1811368614435196, 'learning_rate': 0.00060585193611470

  0%|          | 0/1500 [00:00<?, ?it/s]

{'eval_loss': 0.2597005069255829, 'eval_accuracy': 0.9251666666666667, 'eval_runtime': 8.9632, 'eval_samples_per_second': 1338.804, 'eval_steps_per_second': 167.35, 'epoch': 2.0}
{'loss': 0.1714, 'grad_norm': 1.8038910627365112, 'learning_rate': 0.00023339778359915425, 'epoch': 2.04}
{'loss': 0.1523, 'grad_norm': 0.5998036861419678, 'learning_rate': 0.00021719656225673072, 'epoch': 2.07}
{'loss': 0.1503, 'grad_norm': 0.06428992003202438, 'learning_rate': 0.0002014207041486068, 'epoch': 2.11}
{'loss': 0.1503, 'grad_norm': 2.199031352996826, 'learning_rate': 0.00018609393766395055, 'epoch': 2.15}
{'loss': 0.1462, 'grad_norm': 3.2295351028442383, 'learning_rate': 0.0001712393157154678, 'epoch': 2.19}
{'loss': 0.1461, 'grad_norm': 3.1945831775665283, 'learning_rate': 0.00015687918106563315, 'epoch': 2.22}
{'loss': 0.1544, 'grad_norm': 0.20841926336288452, 'learning_rate': 0.0001430351327210503, 'epoch': 2.26}
{'loss': 0.1407, 'grad_norm': 3.0840165615081787, 'learning_rate': 0.000129727993

  0%|          | 0/1500 [00:00<?, ?it/s]

{'eval_loss': 0.29290860891342163, 'eval_accuracy': 0.9286666666666666, 'eval_runtime': 8.8333, 'eval_samples_per_second': 1358.489, 'eval_steps_per_second': 169.811, 'epoch': 3.0}
{'train_runtime': 592.7552, 'train_samples_per_second': 546.6, 'train_steps_per_second': 68.325, 'train_loss': 0.2647604305126049, 'epoch': 3.0}


  0%|          | 0/1500 [00:00<?, ?it/s]

Baseline Evaluation Metrics: {'eval_loss': 0.29290860891342163, 'eval_accuracy': 0.9286666666666666, 'eval_runtime': 9.2696, 'eval_samples_per_second': 1294.553, 'eval_steps_per_second': 161.819, 'epoch': 3.0}


In [18]:
model_bo = BertForSequenceClassification.from_pretrained(pre_t_model, num_labels=4).to(device)

for k, v in model_bo.named_parameters():
    v.requires_grad = (("classifier" in k) or ("bias" in k))


optimizer = AdamW(filter(lambda p: p.requires_grad, model_bo.parameters()), lr=my_lr)

num_training_steps = len(train_dataset_hf) // training_args.per_device_train_batch_size * training_args.num_train_epochs
scheduler = CosineAnnealingLR(optimizer, T_max=num_training_steps)

bias_trainer = Trainer(
    model=model_bo,
    args=training_args,
    train_dataset=train_dataset_hf,
    eval_dataset=validation_dataset_hf,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
bias_trainer.train()

bias_o_metrics = bias_trainer.evaluate()
print("Bias-only Evaluation Metrics:", bias_o_metrics)

  0%|          | 0/40500 [00:00<?, ?it/s]

{'loss': 0.7799, 'grad_norm': 1.6287389993667603, 'learning_rate': 0.0009996239762521157, 'epoch': 0.04}
{'loss': 0.5315, 'grad_norm': 1.3879408836364746, 'learning_rate': 0.000998496470583894, 'epoch': 0.07}
{'loss': 0.4902, 'grad_norm': 2.950274705886841, 'learning_rate': 0.0009966191788709695, 'epoch': 0.11}
{'loss': 0.4596, 'grad_norm': 2.4422459602355957, 'learning_rate': 0.0009939949247383998, 'epoch': 0.15}
{'loss': 0.4847, 'grad_norm': 2.190997362136841, 'learning_rate': 0.0009906276553136876, 'epoch': 0.19}
{'loss': 0.4569, 'grad_norm': 2.0108556747436523, 'learning_rate': 0.0009865224352899075, 'epoch': 0.22}
{'loss': 0.4472, 'grad_norm': 2.3280515670776367, 'learning_rate': 0.0009816854393079391, 'epoch': 0.26}
{'loss': 0.4306, 'grad_norm': 1.058864712715149, 'learning_rate': 0.0009761239426692084, 'epoch': 0.3}
{'loss': 0.423, 'grad_norm': 0.17063932120800018, 'learning_rate': 0.0009698463103929548, 'epoch': 0.33}
{'loss': 0.4322, 'grad_norm': 1.2033809423446655, 'learning_

  0%|          | 0/1500 [00:00<?, ?it/s]

{'eval_loss': 0.37920036911964417, 'eval_accuracy': 0.871, 'eval_runtime': 8.8836, 'eval_samples_per_second': 1350.801, 'eval_steps_per_second': 168.85, 'epoch': 1.0}
{'loss': 0.3925, 'grad_norm': 2.4410951137542725, 'learning_rate': 0.0007330217598512688, 'epoch': 1.04}
{'loss': 0.4197, 'grad_norm': 2.512885093688965, 'learning_rate': 0.000715693032840626, 'epoch': 1.07}
{'loss': 0.4143, 'grad_norm': 1.9350003004074097, 'learning_rate': 0.0006980398830195779, 'epoch': 1.11}
{'loss': 0.403, 'grad_norm': 0.9242910742759705, 'learning_rate': 0.0006800888624023555, 'epoch': 1.15}
{'loss': 0.3994, 'grad_norm': 2.113967180252075, 'learning_rate': 0.0006618669710291599, 'epoch': 1.19}
{'loss': 0.4356, 'grad_norm': 1.4415346384048462, 'learning_rate': 0.0006434016163555446, 'epoch': 1.22}
{'loss': 0.3956, 'grad_norm': 1.478810429573059, 'learning_rate': 0.0006247205720289912, 'epoch': 1.26}
{'loss': 0.3998, 'grad_norm': 0.8033536672592163, 'learning_rate': 0.0006058519361147064, 'epoch': 1.3}

  0%|          | 0/1500 [00:00<?, ?it/s]

{'eval_loss': 0.3634941875934601, 'eval_accuracy': 0.8771666666666667, 'eval_runtime': 8.8475, 'eval_samples_per_second': 1356.322, 'eval_steps_per_second': 169.54, 'epoch': 2.0}
{'loss': 0.4166, 'grad_norm': 1.6198127269744873, 'learning_rate': 0.00023339778359915425, 'epoch': 2.04}
{'loss': 0.4061, 'grad_norm': 1.3523964881896973, 'learning_rate': 0.00021719656225673072, 'epoch': 2.07}
{'loss': 0.3999, 'grad_norm': 1.3780587911605835, 'learning_rate': 0.0002014207041486068, 'epoch': 2.11}
{'loss': 0.3951, 'grad_norm': 1.654579520225525, 'learning_rate': 0.00018609393766395055, 'epoch': 2.15}
{'loss': 0.3908, 'grad_norm': 1.672650933265686, 'learning_rate': 0.0001712393157154678, 'epoch': 2.19}
{'loss': 0.3669, 'grad_norm': 2.9738245010375977, 'learning_rate': 0.00015687918106563315, 'epoch': 2.22}
{'loss': 0.4048, 'grad_norm': 0.7107691168785095, 'learning_rate': 0.0001430351327210503, 'epoch': 2.26}
{'loss': 0.3902, 'grad_norm': 2.18562388420105, 'learning_rate': 0.00012972799344549

  0%|          | 0/1500 [00:00<?, ?it/s]

{'eval_loss': 0.3626594543457031, 'eval_accuracy': 0.8791666666666667, 'eval_runtime': 8.9671, 'eval_samples_per_second': 1338.223, 'eval_steps_per_second': 167.278, 'epoch': 3.0}
{'train_runtime': 498.1922, 'train_samples_per_second': 650.351, 'train_steps_per_second': 81.294, 'train_loss': 0.4161712567364728, 'epoch': 3.0}


  0%|          | 0/1500 [00:00<?, ?it/s]

Bias-only Evaluation Metrics: {'eval_loss': 0.3626594543457031, 'eval_accuracy': 0.8791666666666667, 'eval_runtime': 9.0612, 'eval_samples_per_second': 1324.324, 'eval_steps_per_second': 165.54, 'epoch': 3.0}


---
# Yelp

---

In [20]:

dataset = load_dataset('fancyzhx/yelp_polarity')

tokenized_datasets = dataset['train'].map(tokenize_function, batched=True)

train_data = tokenized_datasets.to_dict()


#with open('C:/Users/ege/git/NLP/datasets/yelp.pkl', 'wb') as file:
#    pickle.dump(train_data, file)


In [21]:

#with open('C:/Users/ege/git/NLP/datasets/yelp.pkl', 'rb') as file:
#    train_data = pickle.load(file)


train_dataset, validation_dataset = train_test_split(list(zip(train_data['input_ids'], train_data['attention_mask'], train_data['label'])), test_size=0.1)

train_dataset = [{'input_ids': x[0], 'attention_mask': x[1], 'label': x[2]} for x in train_dataset]
validation_dataset = [{'input_ids': x[0], 'attention_mask': x[1], 'label': x[2]} for x in validation_dataset]

print("Train:", len(train_dataset))
print("Val:", len(validation_dataset))
#print("Test:", len(dataset['test']))

train_dataset_hf = Dataset.from_dict({'input_ids': [x['input_ids'] for x in train_dataset],
                                      'attention_mask': [x['attention_mask'] for x in train_dataset],
                                      'labels': [x['label'] for x in train_dataset]})

validation_dataset_hf = Dataset.from_dict({'input_ids': [x['input_ids'] for x in validation_dataset],
                                           'attention_mask': [x['attention_mask'] for x in validation_dataset],
                                           'labels': [x['label'] for x in validation_dataset]})


"""
with open('C:/Users/ege/git/NLP/datasets/yelp-train.pkl', 'wb') as file:
    pickle.dump(train_dataset_hf, file)

with open('C:/Users/ege/git/NLP/datasets/yelp-val.pkl', 'wb') as file:
    pickle.dump(validation_dataset_hf, file)
"""

Train: 504000
Val: 56000


"\nwith open('C:/Users/ege/git/NLP/datasets/yelp-train.pkl', 'wb') as file:\n    pickle.dump(train_dataset_hf, file)\n\nwith open('C:/Users/ege/git/NLP/datasets/yelp-val.pkl', 'wb') as file:\n    pickle.dump(validation_dataset_hf, file)\n"

In [22]:
"""
with open('C:/Users/ege/git/NLP/datasets/yelp-train.pkl', 'rb') as file:
    train_dataset_hf = pickle.load(file)

with open('C:/Users/ege/git/NLP/datasets/yelp-val.pkl', 'rb') as file:
    validation_dataset_hf = pickle.load(file)
"""

"\nwith open('C:/Users/ege/git/NLP/datasets/yelp-train.pkl', 'rb') as file:\n    train_dataset_hf = pickle.load(file)\n\nwith open('C:/Users/ege/git/NLP/datasets/yelp-val.pkl', 'rb') as file:\n    validation_dataset_hf = pickle.load(file)\n"

In [23]:
model_base = BertForSequenceClassification.from_pretrained(pre_t_model, num_labels=2).to(device)

optimizer = AdamW(filter(lambda p: p.requires_grad, model_base.parameters()), lr=my_lr)

num_training_steps = len(train_dataset_hf) // training_args.per_device_train_batch_size * training_args.num_train_epochs
scheduler = CosineAnnealingLR(optimizer, T_max=num_training_steps)

trainer_base = Trainer(
    model=model_base,
    args=training_args,
    train_dataset=train_dataset_hf,
    eval_dataset=validation_dataset_hf,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
trainer_base.train()


base_metrics = trainer_base.evaluate()
print("Baseline Evaluation Metrics:", base_metrics)

  0%|          | 0/189000 [00:00<?, ?it/s]

{'loss': 0.5474, 'grad_norm': 27.016056060791016, 'learning_rate': 0.0009999827315381878, 'epoch': 0.01}
{'loss': 0.5582, 'grad_norm': 1.2602927684783936, 'learning_rate': 0.0009999309273455508, 'epoch': 0.02}
{'loss': 0.5842, 'grad_norm': 1.9077694416046143, 'learning_rate': 0.0009998445910004079, 'epoch': 0.02}
{'loss': 0.5973, 'grad_norm': 3.1886630058288574, 'learning_rate': 0.0009997237284663364, 'epoch': 0.03}
{'loss': 0.5988, 'grad_norm': 1.525062084197998, 'learning_rate': 0.0009995683480917817, 'epoch': 0.04}
{'loss': 0.6403, 'grad_norm': 12.969871520996094, 'learning_rate': 0.0009993784606094616, 'epoch': 0.05}
{'loss': 0.6168, 'grad_norm': 6.565064430236816, 'learning_rate': 0.0009991540791356366, 'epoch': 0.06}
{'loss': 0.6212, 'grad_norm': 4.467358589172363, 'learning_rate': 0.0009988952191691935, 'epoch': 0.06}
{'loss': 0.5964, 'grad_norm': 88.54228973388672, 'learning_rate': 0.0009986018985905876, 'epoch': 0.07}
{'loss': 0.5739, 'grad_norm': 3.242130994796753, 'learning_

  0%|          | 0/7000 [00:00<?, ?it/s]

{'eval_loss': 0.4237401783466339, 'eval_accuracy': 0.829375, 'eval_runtime': 42.2303, 'eval_samples_per_second': 1326.062, 'eval_steps_per_second': 165.758, 'epoch': 1.0}
{'loss': 0.4245, 'grad_norm': 0.5322812795639038, 'learning_rate': 0.000746392598937408, 'epoch': 1.01}
{'loss': 0.441, 'grad_norm': 1.0613926649093628, 'learning_rate': 0.0007427681785900739, 'epoch': 1.02}
{'loss': 0.4427, 'grad_norm': 1.8324155807495117, 'learning_rate': 0.0007391269893106565, 'epoch': 1.02}
{'loss': 0.421, 'grad_norm': 1.2388644218444824, 'learning_rate': 0.0007354692826101063, 'epoch': 1.03}
{'loss': 0.439, 'grad_norm': 0.6214081645011902, 'learning_rate': 0.0007317953111402983, 'epoch': 1.04}
{'loss': 0.4318, 'grad_norm': 26.533971786499023, 'learning_rate': 0.0007281053286765773, 'epoch': 1.05}
{'loss': 0.4232, 'grad_norm': 1.0832068920135498, 'learning_rate': 0.0007243995901002281, 'epoch': 1.06}
{'loss': 0.4418, 'grad_norm': 2.0614871978759766, 'learning_rate': 0.0007206783513808714, 'epoch':

  0%|          | 0/7000 [00:00<?, ?it/s]

{'eval_loss': 0.37215036153793335, 'eval_accuracy': 0.8515714285714285, 'eval_runtime': 41.1002, 'eval_samples_per_second': 1362.524, 'eval_steps_per_second': 170.315, 'epoch': 2.0}
{'loss': 0.3591, 'grad_norm': 5.145537853240967, 'learning_rate': 0.0002464098673992169, 'epoch': 2.01}
{'loss': 0.3466, 'grad_norm': 0.7021046876907349, 'learning_rate': 0.00024283725124451996, 'epoch': 2.02}
{'loss': 0.3261, 'grad_norm': 4.521476745605469, 'learning_rate': 0.00023928239831024807, 'epoch': 2.02}
{'loss': 0.3557, 'grad_norm': 37.373878479003906, 'learning_rate': 0.0002357455541437694, 'epoch': 2.03}
{'loss': 0.3647, 'grad_norm': 1.9204293489456177, 'learning_rate': 0.00023222696304851776, 'epoch': 2.04}
{'loss': 0.3402, 'grad_norm': 6.177971363067627, 'learning_rate': 0.00022872686806711714, 'epoch': 2.05}
{'loss': 0.344, 'grad_norm': 2.4199740886688232, 'learning_rate': 0.0002252455109645942, 'epoch': 2.06}
{'loss': 0.3405, 'grad_norm': 1.5955873727798462, 'learning_rate': 0.00022178313221

  0%|          | 0/7000 [00:00<?, ?it/s]

{'eval_loss': 0.26412487030029297, 'eval_accuracy': 0.8984285714285715, 'eval_runtime': 42.8509, 'eval_samples_per_second': 1306.856, 'eval_steps_per_second': 163.357, 'epoch': 3.0}
{'train_runtime': 2682.002, 'train_samples_per_second': 563.758, 'train_steps_per_second': 70.47, 'train_loss': 0.3945663015375692, 'epoch': 3.0}


  0%|          | 0/7000 [00:00<?, ?it/s]

Baseline Evaluation Metrics: {'eval_loss': 0.26412487030029297, 'eval_accuracy': 0.8984285714285715, 'eval_runtime': 43.4737, 'eval_samples_per_second': 1288.135, 'eval_steps_per_second': 161.017, 'epoch': 3.0}


In [25]:
model_bo = BertForSequenceClassification.from_pretrained(pre_t_model, num_labels=2).to(device)


for k, v in model_bo.named_parameters():
    v.requires_grad = (("classifier" in k) or ("bias" in k))


optimizer = AdamW(filter(lambda p: p.requires_grad, model_bo.parameters()), lr=my_lr)

num_training_steps = len(train_dataset_hf) // training_args.per_device_train_batch_size * training_args.num_train_epochs
scheduler = CosineAnnealingLR(optimizer, T_max=num_training_steps)

bias_trainer = Trainer(
    model=model_bo,
    args=training_args,
    train_dataset=train_dataset_hf,
    eval_dataset=validation_dataset_hf,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
bias_trainer.train()

bias_o_metrics = bias_trainer.evaluate()
print("Bias-only Evaluation Metrics:", bias_o_metrics)

  0%|          | 0/189000 [00:00<?, ?it/s]

{'loss': 0.5965, 'grad_norm': 1.5719420909881592, 'learning_rate': 0.0009999827315381878, 'epoch': 0.01}
{'loss': 0.543, 'grad_norm': 1.2778270244598389, 'learning_rate': 0.0009999309273455508, 'epoch': 0.02}
{'loss': 0.501, 'grad_norm': 2.6090776920318604, 'learning_rate': 0.0009998445910004079, 'epoch': 0.02}
{'loss': 0.4877, 'grad_norm': 1.8127431869506836, 'learning_rate': 0.0009997237284663364, 'epoch': 0.03}
{'loss': 0.4636, 'grad_norm': 1.4270974397659302, 'learning_rate': 0.0009995683480917817, 'epoch': 0.04}
{'loss': 0.4473, 'grad_norm': 3.2317471504211426, 'learning_rate': 0.0009993784606094616, 'epoch': 0.05}
{'loss': 0.4383, 'grad_norm': 1.7092515230178833, 'learning_rate': 0.0009991540791356366, 'epoch': 0.06}
{'loss': 0.4335, 'grad_norm': 4.4599289894104, 'learning_rate': 0.0009988952191691935, 'epoch': 0.06}
{'loss': 0.4092, 'grad_norm': 1.508431077003479, 'learning_rate': 0.0009986018985905876, 'epoch': 0.07}
{'loss': 0.4268, 'grad_norm': 2.352168321609497, 'learning_ra

  0%|          | 0/7000 [00:00<?, ?it/s]

{'eval_loss': 0.3011280298233032, 'eval_accuracy': 0.8739285714285714, 'eval_runtime': 42.462, 'eval_samples_per_second': 1318.827, 'eval_steps_per_second': 164.853, 'epoch': 1.0}
{'loss': 0.3375, 'grad_norm': 2.1949007511138916, 'learning_rate': 0.000746392598937408, 'epoch': 1.01}
{'loss': 0.3452, 'grad_norm': 0.9887603521347046, 'learning_rate': 0.0007427681785900739, 'epoch': 1.02}
{'loss': 0.3494, 'grad_norm': 1.8697959184646606, 'learning_rate': 0.0007391269893106565, 'epoch': 1.02}
{'loss': 0.3548, 'grad_norm': 0.2774420380592346, 'learning_rate': 0.0007354692826101063, 'epoch': 1.03}
{'loss': 0.357, 'grad_norm': 2.3704824447631836, 'learning_rate': 0.0007317953111402983, 'epoch': 1.04}
{'loss': 0.343, 'grad_norm': 2.596898317337036, 'learning_rate': 0.0007281053286765773, 'epoch': 1.05}
{'loss': 0.3415, 'grad_norm': 1.284769058227539, 'learning_rate': 0.0007243995901002281, 'epoch': 1.06}
{'loss': 0.3417, 'grad_norm': 2.996037483215332, 'learning_rate': 0.0007206783513808714, '

  0%|          | 0/7000 [00:00<?, ?it/s]

{'eval_loss': 0.308550089597702, 'eval_accuracy': 0.8688571428571429, 'eval_runtime': 44.7734, 'eval_samples_per_second': 1250.743, 'eval_steps_per_second': 156.343, 'epoch': 2.0}
{'loss': 0.3343, 'grad_norm': 3.1756298542022705, 'learning_rate': 0.0002464098673992169, 'epoch': 2.01}
{'loss': 0.3361, 'grad_norm': 1.007728099822998, 'learning_rate': 0.00024283725124451996, 'epoch': 2.02}
{'loss': 0.3301, 'grad_norm': 3.272610664367676, 'learning_rate': 0.00023928239831024807, 'epoch': 2.02}
{'loss': 0.3477, 'grad_norm': 2.4069912433624268, 'learning_rate': 0.0002357455541437694, 'epoch': 2.03}
{'loss': 0.3312, 'grad_norm': 1.4213545322418213, 'learning_rate': 0.00023222696304851776, 'epoch': 2.04}
{'loss': 0.3226, 'grad_norm': 1.8949302434921265, 'learning_rate': 0.00022872686806711714, 'epoch': 2.05}
{'loss': 0.3389, 'grad_norm': 2.0771470069885254, 'learning_rate': 0.0002252455109645942, 'epoch': 2.06}
{'loss': 0.3272, 'grad_norm': 1.2956396341323853, 'learning_rate': 0.00022178313221

  0%|          | 0/7000 [00:00<?, ?it/s]

{'eval_loss': 0.2939109206199646, 'eval_accuracy': 0.8771607142857143, 'eval_runtime': 44.3504, 'eval_samples_per_second': 1262.673, 'eval_steps_per_second': 157.834, 'epoch': 3.0}
{'train_runtime': 2273.6144, 'train_samples_per_second': 665.02, 'train_steps_per_second': 83.128, 'train_loss': 0.34639559863862535, 'epoch': 3.0}


  0%|          | 0/7000 [00:00<?, ?it/s]

Bias-only Evaluation Metrics: {'eval_loss': 0.2939109206199646, 'eval_accuracy': 0.8771607142857143, 'eval_runtime': 45.9629, 'eval_samples_per_second': 1218.373, 'eval_steps_per_second': 152.297, 'epoch': 3.0}


In [27]:
print(1)

1
