<a href="https://colab.research.google.com/github/gKorada/MLPractice/blob/main/BERT_Fine_tuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from tqdm import tqdm

In [None]:
# Choose the compute backend: prefer CUDA, then MPS, and fall back to the CPU.
if torch.cuda.is_available():
    device_name = "cuda"
elif torch.backends.mps.is_available():
    device_name = "mps"
else:
    device_name = "cpu"

device = torch.device(device_name)
print(f"Using device: {device}")


Using device: cuda


In [None]:
# Load the complaints CSV from Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive -- no mount
# step is visible in this notebook, so confirm before a fresh "Run All".
df = pd.read_csv(
    "/content/drive/MyDrive/consumer_complaints.csv",
    # Only these two columns are used downstream; skipping the others saves
    # memory and parsing time.
    usecols=["consumer_complaint_narrative", "product"],
    # Parse the file in one pass so column dtypes are inferred consistently
    # (the saved output of this cell echoed a read_csv warning, presumably the
    # chunked-parsing DtypeWarning -- TODO confirm it is gone).
    low_memory=False,
)

# Work on a reproducible 5% sample of the rows; the fixed random_state keeps
# the selection identical between runs.
df = df.sample(frac=0.05, random_state=42)

  df = pd.read_csv("/content/drive/MyDrive/consumer_complaints.csv")


In [None]:
# Keep only the model input (complaint text) and the target (product).
# Rows missing either field are dropped: a missing narrative has nothing to
# tokenize, and a missing product would otherwise become its own NaN "class".
# .copy() makes the result an independent frame so that adding the 'label'
# column below cannot trigger a chained-assignment warning.
df = (
    df[['consumer_complaint_narrative', 'product']]
    .dropna(subset=['consumer_complaint_narrative', 'product'])
    .copy()
)

# Sort the class names so each product always gets the same integer id,
# regardless of the order in which rows happen to appear in the sample.
labels = np.sort(df['product'].unique())
label_map = {label: idx for idx, label in enumerate(labels)}

# Integer target column consumed by the dataset / model.
df['label'] = df['product'].map(label_map)

print(f"Dataset loaded with {len(df)} valid records")
print(f"Number of unique product classes: {len(labels)}")

Dataset loaded with 3433 valid records
Number of unique product classes: 11


In [None]:
class ConsumerComplaintDataset(Dataset):
    """Dataset that tokenizes one complaint narrative per lookup.

    Args:
        texts: Indexable collection of complaint narratives.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``; its
            result must expose ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length forwarded to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples, i.e. the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` as a dict of tensors.

        The dict holds ``'input_ids'`` and ``'attention_mask'`` (both flattened
        to 1-D) and ``'label'`` (a ``torch.long`` scalar tensor).
        """
        narrative, target = str(self.texts[idx]), self.labels[idx]

        # Tokenizer settings are fixed here; only the length is configurable.
        tokenizer_kwargs = dict(
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(narrative, **tokenizer_kwargs)

        # Flatten each encoded tensor to 1-D before handing it to the caller.
        sample = {
            key: encoded[key].reshape(-1)
            for key in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

In [None]:
# Hyperparameters and model configuration
bert_model_name = 'bert-base-uncased'
batch_size = 8
max_length = 256
learning_rate = 2e-5
epochs = 3

# Seed torch's global generator so DataLoader shuffling and the randomly
# initialised classification head are repeatable between runs.
torch.manual_seed(42)

tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# 70/30 split, stratified on the label so both parts keep the class balance.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df['label'])

# Dataset initialization and loading.
# .values passes plain arrays, so the datasets index by position rather than
# by the (shuffled) DataFrame index.
train_dataset = ConsumerComplaintDataset(
    train_df['consumer_complaint_narrative'].values,
    train_df['label'].values,
    tokenizer,
    max_length,
)

test_dataset = ConsumerComplaintDataset(
    test_df['consumer_complaint_narrative'].values,
    test_df['label'].values,
    tokenizer,
    max_length,
)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Model initialization: size the classification head from the label map
# instead of a hard-coded count, so it always matches the data actually loaded.
num_classes = len(label_map)
model = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels=num_classes)

model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# NOTE: the training loop reads the loss from the model output (it passes
# `labels=` to the model), so this criterion is currently unused; it is kept
# so the name stays defined for any cell that refers to it.
criterion = nn.CrossEntropyLoss()

# One scheduler step is taken per optimizer step, so the schedule spans every
# batch of every epoch; the learning rate decays linearly with no warm-up.
total_steps = len(train_loader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune the classifier for `epochs` passes over the training loader.
for epoch in range(epochs):
    # Enter training mode at the start of every epoch so the loop stays correct
    # even if the model was switched to eval mode in between.
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named `batch_labels` so the global `labels` (the array of class
        # names built during preprocessing) is not overwritten by a tensor.
        batch_labels = batch['label'].to(device)

        # Passing `labels=` makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass, parameter update, then advance the LR schedule
        # (one scheduler step per batch).
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs} - Average training loss: {avg_train_loss:.4f}")

Epoch 1/3 - Average training loss: 1.5775
Epoch 2/3 - Average training loss: 0.7670
Epoch 3/3 - Average training loss: 0.4986


In [None]:
# Evaluate the fine-tuned model on the held-out test loader.
model.eval()
all_preds = []
all_labels = []
all_probs = []

with torch.no_grad():  # inference only: no gradients needed
    for batch in tqdm(test_loader, desc="Evaluating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Pass the mask by keyword (as the training loop does) so it cannot be
        # bound to the wrong positional parameter.
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Class probabilities (for AUC) and hard predictions (for accuracy).
        probs = torch.softmax(logits, dim=1)
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        # Labels are only used on the host for metrics, so they are read
        # straight from the batch without a round-trip through the device.
        all_labels.extend(batch['label'].numpy())
        all_probs.extend(probs.cpu().numpy())

# Calculate metrics
accuracy = accuracy_score(all_labels, all_preds)

# Calculate AUC (one-vs-rest for multi-class)
all_probs = np.array(all_probs)
all_labels = np.array(all_labels)
n_classes = all_probs.shape[1]

# One-hot encode the labels in a single vectorized assignment:
# row i gets a 1 in the column of its true class.
one_hot_labels = np.zeros((len(all_labels), n_classes))
one_hot_labels[np.arange(len(all_labels)), all_labels] = 1

# NOTE(review): with a one-hot indicator matrix as y_true, scikit-learn scores
# each class column separately and averages them (default average='macro');
# this fails if some class has no test examples -- confirm that is acceptable.
auc = roc_auc_score(one_hot_labels, all_probs, multi_class='ovr')

print(f"Accuracy: {accuracy:.4f}")
print(f"AUC: {auc:.4f}")

Evaluating: 100%|██████████| 129/129 [00:19<00:00,  6.54it/s]

Accuracy: 0.8068
AUC: 0.9445



