# Fine-tuning BERT for Text Classification

## 1. Setup and Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate
import torch

## 2. Configuration

In [None]:
# Pretrained checkpoint name; used below to load both the tokenizer and the model.
model_name = "bert-base-uncased"
# CSV file read in the loading step; its 'text' and 'label' columns are used.
data_path = "clean_data.csv"

## 3. Load Dataset

In [None]:
# Load the cleaned CSV. A failed read is fatal for every later cell, so the
# error is reported and re-raised instead of continuing with `df` undefined
# (which previously surfaced as an unrelated NameError further down).
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"Error: '{data_path}' not found. Please ensure the cleaned data is in the correct directory.")
    raise
except Exception as e:
    print(f"Error loading CSV: {e}")
    raise

# Keep only rows that have a label and non-blank text; a row without a label
# cannot be used for supervised training.
df = df.dropna(subset=['text', 'label'])
# reset_index returns a new frame with a contiguous 0..n-1 index, so later
# cells are not working on a gapped, filtered index.
df = df[df['text'].str.strip() != ''].reset_index(drop=True)

if df.empty:
    # Stop here: every later step (label encoding, split, training) needs data.
    raise ValueError("Dataset is empty after loading/cleaning. Cannot proceed with training.")

print(f"Dataset loaded. Total samples: {len(df)}")
print("Label distribution: \n", df['label'].value_counts())

## 4. Label Encoding

In [None]:
# Rows without a label cannot be encoded; drop them so NaN never becomes a class.
df = df.dropna(subset=['label'])

# Build a deterministic label <-> id mapping:
# - .tolist() turns NumPy scalars (e.g. int64) into plain Python values, which
#   stay JSON-serializable when the mappings are handed to the model config;
# - sorting makes the ids independent of the row order in the CSV. The key
#   groups values by type name first so a mixed-type column cannot raise
#   TypeError during comparison.
unique_labels = sorted(
    df['label'].unique().tolist(),
    key=lambda value: (type(value).__name__, value),
)
label_to_id = {label: i for i, label in enumerate(unique_labels)}
id_to_label = {i: label for label, i in label_to_id.items()}

num_labels = len(unique_labels)
print(f"Found {num_labels} unique labels: {unique_labels}")
print(f"Label to ID mapping: {label_to_id}")

# assign() returns a new frame instead of writing a column into a frame that
# may be a filtered view of the loaded data.
df = df.assign(labels=df['label'].map(label_to_id))

## 5. Create Hugging Face Dataset

In [None]:
# Convert only the columns the model needs. The rows were filtered after
# loading, so the pandas index may no longer be a plain RangeIndex;
# preserve_index=False keeps it from being stored as an extra
# `__index_level_0__` column in the dataset.
hf_dataset = Dataset.from_pandas(df[['text', 'labels']], preserve_index=False)

## 6. Split Dataset

In [None]:
# Local import: ClassLabel is only needed for the cast below.
from datasets import ClassLabel

# `stratify_by_column` only accepts a ClassLabel feature, but a column coming
# from pandas integers is a plain integer Value, which makes train_test_split
# raise ValueError. Cast a copy of the dataset first; the integer ids produced
# by the label-encoding step (0 .. num_labels - 1) are kept unchanged.
stratifiable_dataset = hf_dataset.cast_column("labels", ClassLabel(num_classes=num_labels))

# 80/20 split, stratified on the label ids, with a fixed seed for reproducibility.
train_test_split_dataset = stratifiable_dataset.train_test_split(
    test_size=0.2,
    stratify_by_column="labels",
    seed=42,
)
train_dataset = train_test_split_dataset["train"]
eval_dataset = train_test_split_dataset["test"]

print(f"Training samples: {len(train_dataset)}")
print(f"Evaluation samples: {len(eval_dataset)}")

## 7. Tokenization

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize a batch of examples, truncating over-long texts.

    Args:
        examples: Batch mapping supplied by ``Dataset.map(batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch, left unpadded.
    """
    # No padding here: padding every example to the model maximum makes each
    # batch as expensive as the longest possible input. The Trainer in this
    # notebook is given the tokenizer, so batches are padded to their own
    # longest sequence at collation time instead.
    return tokenizer(examples["text"], truncation=True)


tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

## 8. Model Loading

In [None]:
# Load the pretrained checkpoint for sequence classification, passing the
# number of labels together with both directions of the label mapping.
classifier_kwargs = {
    "num_labels": num_labels,
    "id2label": id_to_label,
    "label2id": label_to_id,
}
model = AutoModelForSequenceClassification.from_pretrained(model_name, **classifier_kwargs)

## 9. Training Setup

In [None]:
# Metric objects are loaded once and reused on every evaluation pass.
accuracy = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Turn raw evaluation output into accuracy and weighted F1.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1_weighted"``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_ids = np.argmax(logits, axis=-1)

    accuracy_result = accuracy.compute(predictions=predicted_ids, references=labels)
    f1_result = f1_metric.compute(
        predictions=predicted_ids,
        references=labels,
        average="weighted",
    )

    return {
        "accuracy": accuracy_result["accuracy"],
        "f1_weighted": f1_result["f1"],
    }

# Hyperparameters and bookkeeping for the training run.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    # load_best_model_at_end requires the save and evaluation strategies to
    # match; with the default step-based saving TrainingArguments raises a
    # ValueError, so checkpoints are saved once per epoch as well.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Key returned by compute_metrics; a higher value is better.
    metric_for_best_model="f1_weighted",
    greater_is_better=True,
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
)

# Wire the model, tokenizer, tokenized splits, metrics and arguments together.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)

## 10. Training

In [None]:
# Report which device class is available before starting the run.
device_message = (
    "CUDA is available! Training will use GPU."
    if torch.cuda.is_available()
    else "CUDA is not available. Training will use CPU, which will be significantly slower."
)
print(device_message)

# Start fine-tuning with the configured trainer.
trainer.train()

## 11. Evaluation

In [None]:
# Run evaluation with the trainer (on the eval dataset it was constructed
# with) and print the returned results.
eval_results = trainer.evaluate()
print(eval_results)

## 12. Save Model

In [None]:
# Save the fine-tuned model and its tokenizer into the same directory.
model_save_path = "./fine_tuned_bert_classifier"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
# The newline escape had been mangled into a backslash line-continuation,
# which silently dropped the intended blank line before the message.
print(f" \nFine-tuned model and tokenizer saved to: '{model_save_path}'")