In [1]:
import inspect

import pandas as pd
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "sentences_50agree" configuration of the Financial PhraseBank dataset.
# NOTE(review): trust_remote_code=True opts in to running code shipped with the
# dataset repository -- confirm the source is trusted before re-running.
dataset = load_dataset(
    path="financial_phrasebank",
    name="sentences_50agree",
    trust_remote_code=True,
)
# Bare expression so the notebook renders the DatasetDict summary.
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 4846
    })
})

In [None]:
# The tokenizer and the classifier are loaded from the same checkpoint, so the
# checkpoint name is spelled out only once.
checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Classification head with 3 outputs, one per sentiment class
# (positive, negative, neutral).
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

def tokenize_function(example, max_length=128):
    """Tokenize the ``"sentence"`` entry of ``example`` with the notebook's tokenizer.

    Every sequence is truncated and padded to exactly ``max_length`` tokens.

    Args:
        example: Mapping with a ``"sentence"`` entry. The function is used with
            ``dataset.map(..., batched=True)``, so this is presumably a batch
            (list) of sentences rather than a single string.
        max_length: Fixed sequence length to pad/truncate to. Defaults to 128,
            the value previously hard-coded here.

    Returns:
        Whatever the module-level ``tokenizer`` returns for these inputs.
    """
    return tokenizer(
        example["sentence"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# Seed for the train/test split so that Restart & Run All reproduces the same
# partition (and therefore comparable evaluation numbers).
SPLIT_SEED = 42

# Tokenize every split, drop the raw text, and rename the target column to
# "labels" for the Trainer.
tokenized_dataset = (
    dataset.map(tokenize_function, batched=True)
    .remove_columns(["sentence"])
    .rename_column("label", "labels")
)
tokenized_dataset.set_format("torch")  # Return PyTorch tensors when indexed

# Only a "train" split is used from the loaded data, so hold out 20% of it for
# evaluation.
train_test_split = tokenized_dataset["train"].train_test_split(
    test_size=0.2,
    seed=SPLIT_SEED,
)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

# Sanity check: the split must neither drop nor duplicate rows.
assert len(train_dataset) + len(test_dataset) == len(tokenized_dataset["train"])

In [None]:
# LoRA hyperparameters, collected in one place so they are easy to tweak.
lora_settings = dict(
    task_type="SEQ_CLS",  # Sequence classification
    target_modules=["query", "value"],  # Modules that receive LoRA adapters
    r=8,  # Low-rank dimension
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout rate
    bias="none",  # Options: "none", "all", or "lora_only"
)
lora_config = LoraConfig(**lora_settings)

# NOTE: this rebinds `model` to the PEFT-wrapped model, so the cell is not
# idempotent -- re-run the model-loading cell before running this one again.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# The evaluation-schedule keyword was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old spelling is no
# longer accepted by recent ones. Use whichever name the installed version
# exposes so the cell runs on both.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",  # Directory to save checkpoints
    learning_rate=5e-5,  # Typical learning rate for BERT
    per_device_train_batch_size=16,  # Adjust based on hardware
    per_device_eval_batch_size=16,
    num_train_epochs=3,  # Number of training epochs
    weight_decay=0.01,  # Regularization
    logging_dir="./logs",  # Directory for logs
    logging_steps=10,  # Log every 10 steps
    save_total_limit=2,  # Limit number of saved checkpoints
    save_strategy="epoch",  # Save at the end of each epoch
    **{_eval_strategy_key: "epoch"},  # Evaluate at the end of each epoch
)

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Pick the name the installed Trainer
# accepts so the cell neither warns on new versions nor fails on old ones.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = (
    "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    **{_tokenizer_key: tokenizer},
)

# Last expression of the cell: runs training and displays the returned result.
trainer.train()