# Fine-tuning BERT for Text Classification

## 1. Setup and Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import torch

## 2. Configuration

In [2]:
data_path = "clean_data.csv"
model_name = "bert-base-uncased"

## 3. Load Dataset

In [3]:
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"Error: '{data_path}' not found. Please ensure the cleaned data is in the correct directory.")
except Exception as e:
    print(f"Error loading CSV: {e}")

df = df.dropna(subset=['plain_text'])
df = df[df['plain_text'].str.strip() != '']

if df.empty:
    print("Error: Dataset is empty after loading/cleaning. Cannot proceed with training.")
else:
    print(f"Dataset loaded. Total samples: {len(df)}")
    print("Label distribution: \
", df['label'].value_counts())

Dataset loaded. Total samples: 888
Label distribution:  label
applied    500
rejects    388
Name: count, dtype: int64


## 4. Label Encoding

In [4]:
unique_labels = df['label'].unique()
label_to_id = {label: i for i, label in enumerate(unique_labels)}
id_to_label = {i: label for i, label in enumerate(unique_labels)}

num_labels = len(unique_labels)
print(f"Found {num_labels} unique labels: {unique_labels}")
print(f"Label to ID mapping: {label_to_id}")

df['labels'] = df['label'].map(label_to_id)

Found 2 unique labels: ['applied' 'rejects']
Label to ID mapping: {'applied': 0, 'rejects': 1}


## 5. Create Hugging Face Dataset

In [5]:
hf_dataset = Dataset.from_pandas(df[['plain_text', 'labels']])

## 6. Split Dataset

In [6]:
train_test_split_dataset = hf_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split_dataset["train"]
eval_dataset = train_test_split_dataset["test"]

print(f"Training samples: {len(train_dataset)}")
print(f"Evaluation samples: {len(eval_dataset)}")

Training samples: 710
Evaluation samples: 178


## 7. Tokenization

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["plain_text"], padding="max_length", truncation=True)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/710 [00:00<?, ? examples/s]

Map:   0%|          | 0/178 [00:00<?, ? examples/s]

## 8. Model Loading

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 9. Training Setup

In [26]:
import evaluate

accuracy = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    acc = accuracy.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")

    return {"accuracy": acc["accuracy"], "f1_weighted": f1["f1"]}

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",  # Set save_strategy to match eval_strategy
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=15,
    weight_decay=0.005,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",
    greater_is_better=True,
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",
    fp16=torch.cuda.is_available()
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer, # Deprecated
    # processing_class=AutoTokenizer, # Use processing_class instead
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


## 10. Training

In [27]:
if torch.cuda.is_available():
    print("CUDA is available! Training will use GPU.")
else:
    print("CUDA is not available. Training will use CPU, which will be significantly slower.")

trainer.train()

CUDA is available! Training will use GPU.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,No log,0.929433,0.910112,0.908994
2,0.030500,0.564734,0.938202,0.938415
3,0.060800,0.790825,0.921348,0.921962
4,0.030100,0.48184,0.949438,0.9495
5,0.013400,0.469944,0.949438,0.9495
6,0.023500,0.413782,0.949438,0.949613
7,0.022100,0.43064,0.955056,0.955056
8,0.015300,0.563156,0.932584,0.933006
9,0.010900,0.408313,0.955056,0.955056
10,0.010900,0.419472,0.955056,0.955056


TrainOutput(global_step=1335, training_loss=0.01631294431143932, metrics={'train_runtime': 914.7786, 'train_samples_per_second': 11.642, 'train_steps_per_second': 1.459, 'total_flos': 2802132739584000.0, 'train_loss': 0.01631294431143932, 'epoch': 15.0})

## 11. Evaluation

In [28]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.4306398034095764, 'eval_accuracy': 0.9550561797752809, 'eval_f1_weighted': 0.9550561797752809, 'eval_runtime': 1.4635, 'eval_samples_per_second': 121.625, 'eval_steps_per_second': 15.716, 'epoch': 15.0}


## 12. Save Model

In [29]:
model_save_path = "./fine_tuned_bert_classifier"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f" \
Fine-tuned model and tokenizer saved to: '{model_save_path}'")

 Fine-tuned model and tokenizer saved to: './fine_tuned_bert_classifier'


In [30]:
import os
import zipfile

# Define the directory to compress and the name of the zip file
directory_to_compress = "./fine_tuned_bert_classifier"
zip_file_name = "fine_tuned_bert_classifier.zip"

# Create a ZipFile object in write mode
with zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Walk through the directory and add each file to the zip file
    for root, dirs, files in os.walk(directory_to_compress):
        for file in files:
            file_path = os.path.join(root, file)
            # Add file to zip, preserving directory structure relative to the compressed directory
            zipf.write(file_path, os.path.relpath(file_path, directory_to_compress))

print(f"Successfully compressed '{directory_to_compress}' into '{zip_file_name}'")

Successfully compressed './fine_tuned_bert_classifier' into 'fine_tuned_bert_classifier.zip'


Now you can download the compressed model file using the following code:

In [31]:
from google.colab import files

files.download(zip_file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>