In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime; the dataset and the saved
# checkpoint paths used later in this notebook live under this directory.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd

# Read the complaints CSV stored on the mounted Google Drive into a DataFrame.
DATASET_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/PhD_Thesis_Experiments/GitHub_ToChair/sample_complaints_2years_006_balanced.csv'
df = pd.read_csv(DATASET_CSV_PATH)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer


In [None]:
# Step 1: assign every distinct hierarchical label an integer class id.
# Ids follow the order in which labels first appear in the dataframe.
unique_labels = df['hierarchical_label'].unique()
label_to_id = dict(zip(unique_labels, range(len(unique_labels))))
id_to_label = {class_id: name for name, class_id in label_to_id.items()}
num_labels = len(unique_labels)

# Store the integer class id next to each row's string label.
df['label_id'] = [label_to_id[name] for name in df['hierarchical_label']]

# Step 2: 80/20 train/test split, stratified on the class id so both splits
# keep the same label proportions. The string labels are split alongside the
# ids so they stay aligned with the narratives.
(
    X_train, X_test,
    y_train_id, y_test_id,
    y_train_full, y_test_full,
) = train_test_split(
    df['consumer_complaint_narrative'],
    df['label_id'],
    df['hierarchical_label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label_id'],
)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# Tokenization / batching hyper-parameters.
MAX_LEN = 256     # tokens per narrative; tune to complaint length and available compute
BATCH_SIZE = 16   # a common batch size for BERT fine-tuning

# Pre-trained checkpoint shared by the tokenizer here and the classifier later on.
MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

class ComplaintDataset(Dataset):
    """Map-style dataset that tokenizes one complaint narrative per item.

    Args:
        texts: Narratives, as an object exposing ``.tolist()`` (e.g. a pandas Series).
        labels: Integer class ids exposing ``.tolist()``, aligned with ``texts``.
        tokenizer: Callable Hugging Face tokenizer used to encode each narrative.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a string, mapping missing values (None/NaN) to ''."""
        # ``value != value`` is only true for NaN; plain str() would otherwise
        # feed the literal words "nan"/"None" to the model as complaint text.
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    def __getitem__(self, item):
        """Return the raw text, encoded tensors and label tensor for one example."""
        text = self._as_text(self.texts[item])
        label = self.labels[item]

        # Call the tokenizer directly; ``encode_plus`` is deprecated in favour
        # of ``__call__`` and takes the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # The tokenizer returns a leading batch dimension; flatten it away
            # so the DataLoader can stack examples into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Wrap each split in a dataset, then in a batch loader.
train_dataset = ComplaintDataset(texts=X_train, labels=y_train_id, tokenizer=tokenizer, max_len=MAX_LEN)
test_dataset = ComplaintDataset(texts=X_test, labels=y_test_id, tokenizer=tokenizer, max_len=MAX_LEN)

# Training batches are reshuffled on every pass; the test loader keeps the
# split's order so predictions line up with y_test_full.
train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
import torch
from transformers import BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
#from torch.optim import AdamW # <-- Use torch.optim.AdamW or import AdamW from a specific submodule if needed
#from transformers import AdamW
#from transformers.optimization import AdamW
from torch.optim import AdamW

# 1. Model Initialization
# Seed PyTorch's random number generators before anything random happens:
# the newly initialized classifier head and the shuffled training DataLoader
# both draw from them, so an unseeded run gives different losses and metrics
# every time the notebook is executed.
SEED = 42  # same value as the train/test split's random_state
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# num_labels, id_to_label and label_to_id come from the data preparation cell.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id
)
model.to(device)

# 2. Optimizer and Scheduler
EPOCHS = 10
optimizer = AdamW(model.parameters(), lr=2e-5)  # torch.optim.AdamW
total_steps = len(train_data_loader) * EPOCHS   # the scheduler is stepped once per batch

# Linear learning-rate schedule over all training steps, with no warmup phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# 3. Training Loop
def train_epoch(model, data_loader, optimizer, device, scheduler):
    """Run one optimization pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and ``labels``
            keyword arguments; its output must expose a ``.loss`` tensor.
        data_loader: Sized iterable of batches, each a mapping holding those
            three tensors under the same keys.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to before the forward pass.
        scheduler: Learning-rate scheduler stepped once per batch, right after
            the optimizer.

    Returns:
        The sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    tensor_keys = ('input_ids', 'attention_mask', 'labels')
    running_loss = 0

    for batch in data_loader:
        # Move only the tensors the model consumes onto the target device.
        inputs = {key: batch[key].to(device) for key in tensor_keys}

        optimizer.zero_grad()

        # Passing labels makes the model compute the loss itself.
        batch_loss = model(**inputs).loss
        running_loss += batch_loss.item()

        batch_loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    return running_loss / len(data_loader)

# Run training for the configured number of epochs (EPOCHS).
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    avg_loss = train_epoch(model, train_data_loader, optimizer, device, scheduler)
    print(f'Train Loss: {avg_loss:.4f}')

# Report the epoch count that actually ran; the previous message hard-coded
# "3 epochs" regardless of EPOCHS and was ambiguous about whether training
# had finished.
print(f"\nModel fine-tuning complete: trained for {EPOCHS} epochs.")

Using device: cuda


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Train Loss: 1.4923
Epoch 2/10
Train Loss: 0.5293
Epoch 3/10
Train Loss: 0.1873
Epoch 4/10
Train Loss: 0.0566
Epoch 5/10
Train Loss: 0.0169
Epoch 6/10
Train Loss: 0.0080
Epoch 7/10
Train Loss: 0.0035
Epoch 8/10
Train Loss: 0.0013
Epoch 9/10
Train Loss: 0.0007
Epoch 10/10
Train Loss: 0.0004

Model fine-tuning setup successful. Training has started (or completed for 3 epochs).


In [None]:
# Persist the fine-tuned weights to Google Drive.
# `model` is the BertForSequenceClassification instance trained in the cell above.
# Only the state dict is written, so label_to_id / id_to_label are not part of
# this file and have to be rebuilt (or saved separately) before reloading.
# NOTE(review): the file name says "epoch15" while EPOCHS is set to 10 in the
# training cell — confirm which one is intended.
OUTPUT_MODEL_PATH = '/content/drive/MyDrive/Colab Notebooks/PhD_Thesis_Experiments/GitHub_ToChair/bert_finetuned_epoch15.pt'

fine_tuned_weights = model.state_dict()
torch.save(fine_tuned_weights, OUTPUT_MODEL_PATH)

print(f"Model saved successfully to: {OUTPUT_MODEL_PATH}")

Model saved successfully to: /content/drive/MyDrive/Colab Notebooks/PhD_Thesis_Experiments/GitHub_ToChair/bert_finetuned_epoch15.pt


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Function to split hierarchical labels into product and sub-product
def split_hierarchical_label(label):
    """Split a ``'product::sub_product'`` label into ``[product, sub_product]``.

    Only the first ``'::'`` acts as the separator, so the result always has
    exactly two elements even when the sub-product text itself contains
    ``'::'`` (a plain ``split('::')`` would return three or more parts and
    callers reading index 1 would silently lose the tail).

    Args:
        label: Hierarchical label string.

    Returns:
        ``[product, sub_product]``; the sub-product is the placeholder string
        ``'None'`` when the label contains no ``'::'``.
    """
    product, separator, sub_product = label.partition('::')
    if not separator:
        return [label, 'None']  # Handle cases with no sub-product
    return [product, sub_product]

# Function to calculate hierarchical metrics
def hierarchical_metrics(y_true, y_pred):
    """Score predictions at the product level, the sub-product level, and their mean.

    Args:
        y_true: Ground-truth hierarchical label strings.
        y_pred: Predicted hierarchical label strings, aligned with ``y_true``.

    Returns:
        Dict of weighted precision/recall/F1 under the prefixes ``product_``
        and ``sub_product_``, plus ``hierarchical_`` entries that are the plain
        average of the two levels.
    """
    def weighted_scores(truth, guess):
        # The same three sklearn calls are used for every level of the hierarchy.
        return tuple(
            scorer(truth, guess, average='weighted', zero_division=0)
            for scorer in (precision_score, recall_score, f1_score)
        )

    true_parts = [split_hierarchical_label(label) for label in y_true]
    pred_parts = [split_hierarchical_label(label) for label in y_pred]

    # Product level: first component of every label.
    product_precision, product_recall, product_f1 = weighted_scores(
        [parts[0] for parts in true_parts],
        [parts[0] for parts in pred_parts],
    )

    # Sub-product level: keep only the positions where BOTH the true and the
    # predicted label carry a real sub-product (i.e. neither is 'None').
    kept_pairs = [
        (true_sub, pred_sub)
        for true_sub, pred_sub in zip(
            (parts[1] for parts in true_parts),
            (parts[1] for parts in pred_parts),
        )
        if true_sub != 'None' and pred_sub != 'None'
    ]
    valid_sub_product_true = [true_sub for true_sub, _ in kept_pairs]
    valid_sub_product_pred = [pred_sub for _, pred_sub in kept_pairs]

    if valid_sub_product_true:
        sub_product_precision, sub_product_recall, sub_product_f1 = weighted_scores(
            valid_sub_product_true, valid_sub_product_pred
        )
    else:
        # Nothing to score at this level.
        sub_product_precision = sub_product_recall = sub_product_f1 = 0

    # Combined score: unweighted average of the two levels.
    return {
        'product_precision': product_precision,
        'product_recall': product_recall,
        'product_f1': product_f1,
        'sub_product_precision': sub_product_precision,
        'sub_product_recall': sub_product_recall,
        'sub_product_f1': sub_product_f1,
        'hierarchical_precision': (product_precision + sub_product_precision) / 2,
        'hierarchical_recall': (product_recall + sub_product_recall) / 2,
        'hierarchical_f1': (product_f1 + sub_product_f1) / 2
    }


In [None]:
def get_predictions(model, data_loader, device, id_to_label):
    """Predict a hierarchical label string for every example in ``data_loader``.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keyword
            arguments; its output must expose a ``.logits`` tensor with one row
            per example.
        data_loader: Iterable of batches holding ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.
        id_to_label: Mapping from integer class id to label string.

    Returns:
        List of predicted label strings, in the order the loader yields examples.
    """
    model.eval()
    predicted_labels = []

    with torch.no_grad():
        for batch in data_loader:
            ids_on_device = batch['input_ids'].to(device)
            mask_on_device = batch['attention_mask'].to(device)
            logits = model(input_ids=ids_on_device, attention_mask=mask_on_device).logits

            # Highest-scoring class per row, looked up as its full label string.
            for class_id in logits.argmax(dim=1).tolist():
                predicted_labels.append(id_to_label[class_id])

    return predicted_labels

# Predict a full hierarchical label string for every test example.
y_pred_full = get_predictions(
    model=model,
    data_loader=test_data_loader,
    device=device,
    id_to_label=id_to_label,
)

In [None]:
# Ground-truth labels for the test split, as a plain list of strings.
y_true_full = y_test_full.tolist()

# Score the predictions at both hierarchy levels.
evaluation_results = hierarchical_metrics(y_true_full, y_pred_full)

# Hierarchical evaluation metrics
print("\nHierarchical Evaluation Results:")
for metric_name in evaluation_results:
    pretty_name = metric_name.replace('_', ' ').title()
    print(f"**{pretty_name}:** {evaluation_results[metric_name]:.4f}")


Hierarchical Evaluation Results:
**Product Precision:** 0.9897
**Product Recall:** 0.9897
**Product F1:** 0.9897
**Sub Product Precision:** 0.9835
**Sub Product Recall:** 0.9834
**Sub Product F1:** 0.9834
**Hierarchical Precision:** 0.9866
**Hierarchical Recall:** 0.9866
**Hierarchical F1:** 0.9866
