In [None]:
from pathlib import Path

import numpy as np
import pandas as pd 
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments

In [None]:
def prepare_datasets(tokenizer, dataset_name="financial_phrasebank", subset_name="sentences_50agree", max_length=128, random_state=42):
    """Load a sentence-classification dataset and return tokenized train/val/test splits.

    Only the dataset's ``"train"`` split is read. It is divided with stratified
    sampling on the ``label`` column: 20% is held out as the test set, and 10%
    of the remaining rows are held out as the validation set.

    Args:
        tokenizer: Callable tokenizer applied to batches of ``sentence`` strings.
        dataset_name: Dataset identifier passed to ``load_dataset``.
        subset_name: Dataset configuration name passed to ``load_dataset``.
        max_length: Length every sequence is padded or truncated to.
        random_state: Seed used for both stratified splits.

    Returns:
        A tuple ``(train_dataset, val_dataset, test_dataset)``. Each dataset has
        the raw ``sentence`` column removed, ``label`` renamed to ``labels``,
        and its output format set to ``"torch"``.
    """
    # NOTE(review): trust_remote_code=True allows the dataset's own loading
    # script to run locally; only use it with dataset repositories you trust.
    dataset = load_dataset(dataset_name, subset_name, trust_remote_code=True)

    # All three splits are carved out of the single "train" split.
    df = dataset["train"].to_pandas()

    # Stratified split: 80% (train + validation) vs. 20% test ...
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        df["sentence"], df["label"], test_size=0.2, stratify=df["label"], random_state=random_state
    )
    # ... then 10% of the remaining rows become the validation set.
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        train_texts, train_labels, test_size=0.1, stratify=train_labels, random_state=random_state
    )

    def tokenize_function(batch):
        # Fixed-length padding so every example has the same tensor shape.
        return tokenizer(
            batch["sentence"],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    def build_split(texts, labels):
        """Turn one (texts, labels) pair into a tokenized, torch-formatted Dataset."""
        frame = pd.DataFrame({"sentence": texts, "label": labels})
        # The split shuffles rows, leaving a non-default pandas index. Without
        # preserve_index=False that index would be stored as an extra
        # "__index_level_0__" column in the resulting Dataset.
        split = Dataset.from_pandas(frame, preserve_index=False)
        split = split.map(tokenize_function, batched=True)
        split = split.remove_columns(["sentence"])
        split = split.rename_column("label", "labels")
        split.set_format("torch")
        return split

    train_dataset = build_split(train_texts, train_labels)
    val_dataset = build_split(val_texts, val_labels)
    test_dataset = build_split(test_texts, test_labels)

    return train_dataset, val_dataset, test_dataset

In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy from an evaluation prediction pair.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        A dict with a single ``"accuracy"`` entry.
    """
    scores, references = eval_pred
    # The predicted class is the index of the highest score along the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return {"accuracy": accuracy_score(references, predicted_classes)}

# NOTE: this single TrainingArguments instance is passed to every Trainer in
# this notebook, so all runs share the same output and logging directories.
training_args = TrainingArguments(
    # --- Optimisation ---
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=16,   # tune to the available hardware
    per_device_eval_batch_size=16,
    # --- Evaluation and checkpointing (both once per epoch) ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,               # keep at most two checkpoints on disk
    load_best_model_at_end=True,      # reload the best checkpoint after training
    output_dir="./results",           # checkpoint directory
    # --- Logging ---
    logging_dir="./logs",
    logging_steps=10,                 # emit a log record every 10 steps
    report_to=["tensorboard"],
)

## BERT

In [None]:
# Tokenizer first: the dataset preparation below depends on it.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_dataset, val_dataset, test_dataset = prepare_datasets(tokenizer)

# BERT encoder with a 3-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

In [None]:
# Per-epoch evaluation (and therefore the checkpoint picked by
# load_best_model_at_end) must use the validation split. Evaluating on the
# test split here would leak test data into model selection and make the
# final test predictions optimistic.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
# Predict on the held-out test split and store the arg-max class per example.
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

# to_csv does not create missing directories, so make sure the target exists
# instead of failing with OSError after a full training run.
output_path = Path("predictions/BERT.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(pred_labels, columns=["prediction"]).to_csv(output_path, index=False)

## LoRA

In [None]:
# Fresh tokenizer and datasets for the LoRA run.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_dataset, val_dataset, test_dataset = prepare_datasets(tokenizer)

# Reload an untrained 3-class BERT classifier; the LoRA adapters are attached
# to it in the next cell.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

In [None]:
# LoRA adapter settings for sequence classification.
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=["query", "value"],  # modules that receive LoRA adapters
    r=8,                                # rank of the low-rank update
    lora_alpha=32,                      # scaling factor
    lora_dropout=0.1,
    bias="none",                        # alternatives: "all", "lora_only"
)

# Wrap the base model with the adapters and report how many parameters train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# Per-epoch evaluation (and therefore the checkpoint picked by
# load_best_model_at_end) must use the validation split. Evaluating on the
# test split here would leak test data into model selection and make the
# final test predictions optimistic.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
# Predict on the held-out test split and store the arg-max class per example.
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

# to_csv does not create missing directories, so make sure the target exists
# instead of failing with OSError after a full training run.
output_path = Path("predictions/LoRA.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(pred_labels, columns=["prediction"]).to_csv(output_path, index=False)

## DistilBERT

In [None]:
# The datasets are re-tokenized here with the DistilBERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
train_dataset, val_dataset, test_dataset = prepare_datasets(tokenizer)

# DistilBERT encoder with a 3-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

In [None]:
# Per-epoch evaluation (and therefore the checkpoint picked by
# load_best_model_at_end) must use the validation split. Evaluating on the
# test split here would leak test data into model selection and make the
# final test predictions optimistic.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
# Predict on the held-out test split and store the arg-max class per example.
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

# to_csv does not create missing directories, so make sure the target exists
# instead of failing with OSError after a full training run.
output_path = Path("predictions/DistilBERT.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(pred_labels, columns=["prediction"]).to_csv(output_path, index=False)