In [None]:
import pandas as pd 
from datasets import load_dataset, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
def prepare_datasets(tokenizer, dataset_name="financial_phrasebank", subset_name="sentences_50agree", max_length=128, random_state=42):
    # Load the dataset
    dataset = load_dataset(dataset_name, subset_name, trust_remote_code=True)
    
    # Convert to Pandas DataFrame
    df = pd.DataFrame(dataset['train'])

    # Stratify split into train, validation, and test
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        df['sentence'], df['label'], test_size=0.2, stratify=df['label'], random_state=random_state
    )
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        train_texts, train_labels, test_size=0.1, stratify=train_labels, random_state=random_state
    )

    # Create DataFrames for each split
    train_df = pd.DataFrame({'sentence': train_texts, 'label': train_labels})
    val_df = pd.DataFrame({'sentence': val_texts, 'label': val_labels})
    test_df = pd.DataFrame({'sentence': test_texts, 'label': test_labels})

    # Convert DataFrames to Hugging Face Dataset format
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)
    test_dataset = Dataset.from_pandas(test_df)

    # Define tokenization function
    def tokenize_function(example):
        return tokenizer(
            example["sentence"], 
            padding="max_length", 
            truncation=True, 
            max_length=max_length
        )

    # Tokenize datasets
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)

    # Remove raw text and prepare for Hugging Face Trainer
    train_dataset = train_dataset.remove_columns(["sentence"])
    val_dataset = val_dataset.remove_columns(["sentence"])
    test_dataset = test_dataset.remove_columns(["sentence"])

    train_dataset = train_dataset.rename_column("label", "labels")
    val_dataset = val_dataset.rename_column("label", "labels")
    test_dataset = test_dataset.rename_column("label", "labels")

    train_dataset.set_format("torch")
    val_dataset.set_format("torch")
    test_dataset.set_format("torch")

    return train_dataset, val_dataset, test_dataset

In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}


def get_training_args(MODEL_NAME):
    
    training_args = TrainingArguments(
        output_dir=f"./results/{MODEL_NAME}",        # Directory to save checkpoints
        evaluation_strategy="epoch",                # Evaluate at the end of each epoch
        learning_rate=5e-5,                         # Typical learning rate for BERT
        per_device_train_batch_size=16,             # Adjust based on hardware
        per_device_eval_batch_size=16,    
        num_train_epochs=3,                         # Number of training epochs
        weight_decay=0.01,                          # Regularization
        logging_dir=f"./logs/{MODEL_NAME}",         # Directory for logs
        logging_steps=10,                           # Log every 10 steps
        save_total_limit=2,                         # Limit number of saved checkpoints
        save_strategy="epoch",                      # Save at the end of each epoch
        report_to=["tensorboard"],                  # Enable TensorBoard logging
        load_best_model_at_end=True,                # Automatically load the best model at the end
    )

    return training_args

## BERT

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 3
)

train_dataset, val_dataset, test_dataset = prepare_datasets(tokenizer)

In [None]:
training_args = get_training_args("bert-base-uncased")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)
pd.DataFrame(pred_labels, columns=["prediction"]).to_csv("predictions/BERT.csv", index=False)

## LoRA

In [None]:
training_args = get_training_args("lora")

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 3
)

train_dataset, val_dataset, test_dataset = prepare_datasets(tokenizer)

In [None]:
lora_config = LoraConfig(
    r=8,  # Low-rank dimension
    lora_alpha=32,  # Scaling factor
    target_modules=["query", "value"],  # Apply LoRA to attention layers
    lora_dropout=0.1,  # Dropout rate
    bias="none",  # Options: "none", "all", or "lora_only"
    task_type="SEQ_CLS"  # Task type: Sequence Classification
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)
pd.DataFrame(pred_labels, columns=["prediction"]).to_csv("predictions/LoRA.csv", index=False)

## Distil-BERT

In [None]:
training_args = get_training_args("distilbert")

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3
)

train_dataset, val_dataset, test_dataset = prepare_datasets(tokenizer)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

 20%|█▉        | 130/654 [00:49<03:23,  2.57it/s]

{'loss': 0.4206, 'grad_norm': 7.684927940368652, 'learning_rate': 4.00611620795107e-05, 'epoch': 0.6}


 21%|██▏       | 140/654 [00:53<03:19,  2.58it/s]

{'loss': 0.4357, 'grad_norm': 8.195291519165039, 'learning_rate': 3.929663608562692e-05, 'epoch': 0.64}


 23%|██▎       | 150/654 [00:57<03:14,  2.59it/s]

{'loss': 0.3931, 'grad_norm': 2.8741869926452637, 'learning_rate': 3.8532110091743125e-05, 'epoch': 0.69}


 24%|██▍       | 160/654 [01:01<03:11,  2.58it/s]

{'loss': 0.4071, 'grad_norm': 7.499188423156738, 'learning_rate': 3.7767584097859326e-05, 'epoch': 0.73}


 26%|██▌       | 170/654 [01:05<03:07,  2.59it/s]

{'loss': 0.3406, 'grad_norm': 7.340969562530518, 'learning_rate': 3.7003058103975534e-05, 'epoch': 0.78}


 28%|██▊       | 180/654 [01:09<03:03,  2.58it/s]

{'loss': 0.3621, 'grad_norm': 5.2327752113342285, 'learning_rate': 3.623853211009174e-05, 'epoch': 0.83}


 29%|██▉       | 190/654 [01:13<02:59,  2.59it/s]

{'loss': 0.314, 'grad_norm': 8.225922584533691, 'learning_rate': 3.5474006116207956e-05, 'epoch': 0.87}


 31%|███       | 200/654 [01:16<02:55,  2.59it/s]

{'loss': 0.5109, 'grad_norm': 8.109097480773926, 'learning_rate': 3.4709480122324164e-05, 'epoch': 0.92}


 32%|███▏      | 210/654 [01:20<02:52,  2.58it/s]

{'loss': 0.3817, 'grad_norm': 4.40444803237915, 'learning_rate': 3.394495412844037e-05, 'epoch': 0.96}


 33%|███▎      | 218/654 [01:23<02:47,  2.60it/s]
 33%|███▎      | 218/654 [01:30<02:47,  2.60it/s]

{'eval_loss': 0.4215472340583801, 'eval_accuracy': 0.8422680412371134, 'eval_runtime': 7.085, 'eval_samples_per_second': 136.91, 'eval_steps_per_second': 8.61, 'epoch': 1.0}


 34%|███▎      | 220/654 [01:32<15:09,  2.10s/it]

{'loss': 0.4244, 'grad_norm': 3.974126100540161, 'learning_rate': 3.318042813455658e-05, 'epoch': 1.01}


 35%|███▌      | 230/654 [01:36<03:04,  2.29it/s]

{'loss': 0.2293, 'grad_norm': 10.558562278747559, 'learning_rate': 3.241590214067278e-05, 'epoch': 1.06}


 37%|███▋      | 240/654 [01:40<02:40,  2.57it/s]

{'loss': 0.2811, 'grad_norm': 6.891148090362549, 'learning_rate': 3.1651376146788995e-05, 'epoch': 1.1}


 38%|███▊      | 250/654 [01:44<02:36,  2.58it/s]

{'loss': 0.2531, 'grad_norm': 11.127967834472656, 'learning_rate': 3.08868501529052e-05, 'epoch': 1.15}


 40%|███▉      | 260/654 [01:48<02:32,  2.58it/s]

{'loss': 0.2891, 'grad_norm': 6.166630744934082, 'learning_rate': 3.012232415902141e-05, 'epoch': 1.19}


 41%|████▏     | 270/654 [01:52<02:28,  2.58it/s]

{'loss': 0.2955, 'grad_norm': 6.203314781188965, 'learning_rate': 2.9357798165137618e-05, 'epoch': 1.24}


 43%|████▎     | 280/654 [01:56<02:24,  2.58it/s]

{'loss': 0.37, 'grad_norm': 4.4997711181640625, 'learning_rate': 2.8593272171253826e-05, 'epoch': 1.28}


 44%|████▍     | 290/654 [01:59<02:21,  2.58it/s]

{'loss': 0.2656, 'grad_norm': 4.20886754989624, 'learning_rate': 2.782874617737003e-05, 'epoch': 1.33}


 46%|████▌     | 300/654 [02:03<02:17,  2.58it/s]

{'loss': 0.3003, 'grad_norm': 9.955462455749512, 'learning_rate': 2.7064220183486238e-05, 'epoch': 1.38}


 47%|████▋     | 310/654 [02:07<02:12,  2.59it/s]

{'loss': 0.2582, 'grad_norm': 7.390269756317139, 'learning_rate': 2.629969418960245e-05, 'epoch': 1.42}


 49%|████▉     | 320/654 [02:11<02:09,  2.58it/s]

{'loss': 0.2941, 'grad_norm': 4.191864490509033, 'learning_rate': 2.5535168195718656e-05, 'epoch': 1.47}


 50%|█████     | 330/654 [02:15<02:05,  2.58it/s]

{'loss': 0.2196, 'grad_norm': 6.550832748413086, 'learning_rate': 2.4770642201834864e-05, 'epoch': 1.51}


 52%|█████▏    | 340/654 [02:19<02:01,  2.58it/s]

{'loss': 0.2248, 'grad_norm': 3.8720765113830566, 'learning_rate': 2.4006116207951072e-05, 'epoch': 1.56}


 54%|█████▎    | 350/654 [02:23<01:58,  2.56it/s]

{'loss': 0.2363, 'grad_norm': 3.7438924312591553, 'learning_rate': 2.324159021406728e-05, 'epoch': 1.61}


 55%|█████▌    | 360/654 [02:27<01:53,  2.58it/s]

{'loss': 0.2289, 'grad_norm': 0.926530122756958, 'learning_rate': 2.2477064220183487e-05, 'epoch': 1.65}


 57%|█████▋    | 370/654 [02:30<01:50,  2.58it/s]

{'loss': 0.1098, 'grad_norm': 8.342782974243164, 'learning_rate': 2.1712538226299695e-05, 'epoch': 1.7}


 58%|█████▊    | 380/654 [02:34<01:46,  2.58it/s]

{'loss': 0.2361, 'grad_norm': 8.82999324798584, 'learning_rate': 2.0948012232415903e-05, 'epoch': 1.74}


 60%|█████▉    | 390/654 [02:38<01:42,  2.58it/s]

{'loss': 0.2424, 'grad_norm': 8.839984893798828, 'learning_rate': 2.018348623853211e-05, 'epoch': 1.79}


 61%|██████    | 400/654 [02:42<01:38,  2.58it/s]

{'loss': 0.2238, 'grad_norm': 1.3690845966339111, 'learning_rate': 1.9418960244648318e-05, 'epoch': 1.83}


 63%|██████▎   | 410/654 [02:46<01:34,  2.58it/s]

{'loss': 0.2417, 'grad_norm': 8.317001342773438, 'learning_rate': 1.8654434250764526e-05, 'epoch': 1.88}


 64%|██████▍   | 420/654 [02:50<01:32,  2.53it/s]

{'loss': 0.2087, 'grad_norm': 8.061782836914062, 'learning_rate': 1.7889908256880737e-05, 'epoch': 1.93}


 66%|██████▌   | 430/654 [02:54<01:26,  2.59it/s]

{'loss': 0.2635, 'grad_norm': 10.510335922241211, 'learning_rate': 1.712538226299694e-05, 'epoch': 1.97}


 67%|██████▋   | 436/654 [02:56<01:24,  2.59it/s]
 67%|██████▋   | 436/654 [03:03<01:24,  2.59it/s]

{'eval_loss': 0.46196478605270386, 'eval_accuracy': 0.8402061855670103, 'eval_runtime': 7.1334, 'eval_samples_per_second': 135.979, 'eval_steps_per_second': 8.551, 'epoch': 2.0}


 67%|██████▋   | 440/654 [03:06<04:19,  1.21s/it]

{'loss': 0.1515, 'grad_norm': 3.7035465240478516, 'learning_rate': 1.636085626911315e-05, 'epoch': 2.02}


 69%|██████▉   | 450/654 [03:09<01:23,  2.44it/s]

{'loss': 0.1453, 'grad_norm': 3.6326639652252197, 'learning_rate': 1.559633027522936e-05, 'epoch': 2.06}


 70%|███████   | 460/654 [03:13<01:15,  2.59it/s]

{'loss': 0.0978, 'grad_norm': 2.01401948928833, 'learning_rate': 1.4831804281345565e-05, 'epoch': 2.11}


 72%|███████▏  | 470/654 [03:17<01:11,  2.59it/s]

{'loss': 0.1437, 'grad_norm': 16.026092529296875, 'learning_rate': 1.4067278287461774e-05, 'epoch': 2.16}


 73%|███████▎  | 480/654 [03:21<01:06,  2.60it/s]

{'loss': 0.1919, 'grad_norm': 5.962028980255127, 'learning_rate': 1.3302752293577984e-05, 'epoch': 2.2}


 75%|███████▍  | 490/654 [03:25<01:05,  2.52it/s]

{'loss': 0.0848, 'grad_norm': 9.906692504882812, 'learning_rate': 1.253822629969419e-05, 'epoch': 2.25}


 76%|███████▋  | 500/654 [03:29<00:59,  2.59it/s]

{'loss': 0.0865, 'grad_norm': 1.6986289024353027, 'learning_rate': 1.1773700305810397e-05, 'epoch': 2.29}


 78%|███████▊  | 510/654 [03:33<00:55,  2.59it/s]

{'loss': 0.1025, 'grad_norm': 12.729138374328613, 'learning_rate': 1.1009174311926607e-05, 'epoch': 2.34}


 80%|███████▉  | 520/654 [03:37<00:51,  2.59it/s]

{'loss': 0.0731, 'grad_norm': 3.1806912422180176, 'learning_rate': 1.0244648318042814e-05, 'epoch': 2.39}


 81%|████████  | 530/654 [03:40<00:47,  2.60it/s]

{'loss': 0.0616, 'grad_norm': 3.846494197845459, 'learning_rate': 9.480122324159022e-06, 'epoch': 2.43}


 83%|████████▎ | 540/654 [03:44<00:43,  2.59it/s]

{'loss': 0.141, 'grad_norm': 1.93793785572052, 'learning_rate': 8.71559633027523e-06, 'epoch': 2.48}


 84%|████████▍ | 550/654 [03:48<00:40,  2.59it/s]

{'loss': 0.1267, 'grad_norm': 2.4167611598968506, 'learning_rate': 7.951070336391438e-06, 'epoch': 2.52}


 86%|████████▌ | 560/654 [03:52<00:36,  2.60it/s]

{'loss': 0.1131, 'grad_norm': 0.253402441740036, 'learning_rate': 7.186544342507645e-06, 'epoch': 2.57}


 87%|████████▋ | 570/654 [03:56<00:32,  2.59it/s]

{'loss': 0.1327, 'grad_norm': 5.263981819152832, 'learning_rate': 6.422018348623854e-06, 'epoch': 2.61}


 89%|████████▊ | 580/654 [04:00<00:28,  2.58it/s]

{'loss': 0.1067, 'grad_norm': 5.214234828948975, 'learning_rate': 5.657492354740062e-06, 'epoch': 2.66}


 90%|█████████ | 590/654 [04:04<00:24,  2.60it/s]

{'loss': 0.1091, 'grad_norm': 5.230186462402344, 'learning_rate': 4.892966360856269e-06, 'epoch': 2.71}


 92%|█████████▏| 600/654 [04:07<00:20,  2.59it/s]

{'loss': 0.113, 'grad_norm': 3.4041225910186768, 'learning_rate': 4.128440366972477e-06, 'epoch': 2.75}


 93%|█████████▎| 610/654 [04:11<00:16,  2.63it/s]

{'loss': 0.1018, 'grad_norm': 1.8650617599487305, 'learning_rate': 3.363914373088685e-06, 'epoch': 2.8}


 95%|█████████▍| 620/654 [04:15<00:13,  2.60it/s]

{'loss': 0.1666, 'grad_norm': 16.223546981811523, 'learning_rate': 2.599388379204893e-06, 'epoch': 2.84}


 96%|█████████▋| 630/654 [04:19<00:09,  2.59it/s]

{'loss': 0.1742, 'grad_norm': 1.7135577201843262, 'learning_rate': 1.8348623853211011e-06, 'epoch': 2.89}


 98%|█████████▊| 640/654 [04:23<00:05,  2.59it/s]

{'loss': 0.0886, 'grad_norm': 3.7095298767089844, 'learning_rate': 1.0703363914373088e-06, 'epoch': 2.94}


 99%|█████████▉| 650/654 [04:27<00:01,  2.59it/s]

{'loss': 0.0867, 'grad_norm': 0.2855675220489502, 'learning_rate': 3.0581039755351683e-07, 'epoch': 2.98}


100%|██████████| 654/654 [04:28<00:00,  2.59it/s]
100%|██████████| 654/654 [04:36<00:00,  2.59it/s]

{'eval_loss': 0.5613306760787964, 'eval_accuracy': 0.8298969072164949, 'eval_runtime': 7.1046, 'eval_samples_per_second': 136.531, 'eval_steps_per_second': 8.586, 'epoch': 3.0}


100%|██████████| 654/654 [04:37<00:00,  2.36it/s]

{'train_runtime': 277.6209, 'train_samples_per_second': 37.692, 'train_steps_per_second': 2.356, 'train_loss': 0.3004509835341655, 'epoch': 3.0}





TrainOutput(global_step=654, training_loss=0.3004509835341655, metrics={'train_runtime': 277.6209, 'train_samples_per_second': 37.692, 'train_steps_per_second': 2.356, 'total_flos': 346540894838784.0, 'train_loss': 0.3004509835341655, 'epoch': 3.0})

In [13]:
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)
pd.DataFrame(pred_labels, columns=["prediction"]).to_csv("predictions/DistilBERT.csv", index=False)

100%|██████████| 61/61 [00:06<00:00,  8.83it/s]
