In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# --- Install dependencies ---
!pip install -q transformers accelerate bitsandbytes tqdm torch

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# --- Imports ---
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# --- Load Model & Tokenizer ---
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Weights are loaded in half precision; device placement is delegated to the
# library through device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Batched tokenization needs a pad token: reuse EOS when the tokenizer defines
# none, then mirror the chosen id on the model config so both stay in sync.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# --- Hierarchical Metric Functions ---
def split_hierarchical_label(label):
    """Split a ``Product::Sub-product`` label into its components.

    Args:
        label: Label string, optionally containing the ``::`` separator.

    Returns:
        A list whose first element is the product and whose second element is
        the sub-product. When the label has no ``::`` separator, or the text
        after the first separator is empty, the sub-product is the string
        ``'None'``. Any further ``::``-separated parts are kept as extra
        elements, as before.

    Surrounding whitespace is stripped from every part so that free-text
    labels such as ``"Mortgage :: FHA mortgage"`` compare equal to the
    canonical ``"Mortgage::FHA mortgage"``.
    """
    if '::' not in label:
        return [label.strip(), 'None']

    parts = [part.strip() for part in label.split('::')]
    # A trailing separator ("Product::") carries no sub-product information.
    if not parts[1]:
        parts[1] = 'None'
    return parts

def hierarchical_metrics(y_true, y_pred, penalize_missing_sub_product=False):
    """Compute weighted precision/recall/F1 at product and sub-product level.

    Each label is split with ``split_hierarchical_label``; the first component
    is scored as the product and the second as the sub-product.

    Args:
        y_true: Sequence of gold labels (``Product`` or ``Product::Sub-product``).
        y_pred: Sequence of predicted labels, aligned with ``y_true``.
        penalize_missing_sub_product: Controls which rows enter the sub-product
            scores. With the default ``False`` only rows where BOTH the gold
            and the predicted label carry a sub-product are scored, so a
            prediction without a sub-product is dropped rather than counted as
            an error. With ``True`` every row whose gold label has a
            sub-product is scored and a missing predicted sub-product counts
            as a wrong prediction.

    Returns:
        Dict with ``product_*``, ``sub_product_*`` and ``hierarchical_*``
        entries for precision, recall and f1. The hierarchical values are the
        plain mean of the product and sub-product values. Sub-product scores
        are ``0`` when no row qualifies for sub-product scoring.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
    """
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"(got {len(y_true)} and {len(y_pred)})"
        )

    true_parts = [split_hierarchical_label(label) for label in y_true]
    pred_parts = [split_hierarchical_label(label) for label in y_pred]

    score_kwargs = {'average': 'weighted', 'zero_division': 0}

    # --- Product level: every row is scored. ---
    product_true = [parts[0] for parts in true_parts]
    product_pred = [parts[0] for parts in pred_parts]

    product_precision = precision_score(product_true, product_pred, **score_kwargs)
    product_recall = recall_score(product_true, product_pred, **score_kwargs)
    product_f1 = f1_score(product_true, product_pred, **score_kwargs)

    # --- Sub-product level: keep gold/pred pairs together while filtering so
    # the two lists can never drift out of alignment. ---
    sub_pairs = [
        (gold[1], pred[1])
        for gold, pred in zip(true_parts, pred_parts)
        if gold[1] != 'None' and (penalize_missing_sub_product or pred[1] != 'None')
    ]

    if sub_pairs:
        valid_sub_product_true = [gold_sub for gold_sub, _ in sub_pairs]
        valid_sub_product_pred = [pred_sub for _, pred_sub in sub_pairs]
        sub_product_precision = precision_score(valid_sub_product_true, valid_sub_product_pred, **score_kwargs)
        sub_product_recall = recall_score(valid_sub_product_true, valid_sub_product_pred, **score_kwargs)
        sub_product_f1 = f1_score(valid_sub_product_true, valid_sub_product_pred, **score_kwargs)
    else:
        # Nothing to score at the sub-product level.
        sub_product_precision = 0
        sub_product_recall = 0
        sub_product_f1 = 0

    hierarchical_precision = (product_precision + sub_product_precision) / 2
    hierarchical_recall = (product_recall + sub_product_recall) / 2
    hierarchical_f1 = (product_f1 + sub_product_f1) / 2

    return {
        'product_precision': product_precision,
        'product_recall': product_recall,
        'product_f1': product_f1,
        'sub_product_precision': sub_product_precision,
        'sub_product_recall': sub_product_recall,
        'sub_product_f1': sub_product_f1,
        'hierarchical_precision': hierarchical_precision,
        'hierarchical_recall': hierarchical_recall,
        'hierarchical_f1': hierarchical_f1
    }

In [None]:
import pandas as pd

# CSV location on the mounted Google Drive; kept in one named place so it is
# easy to repoint at another file.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/PhD_Thesis_Experiments/GitHub_ToChair/sample_complaints_2years_006_balanced.csv'

# Load the dataset from Google Drive into a pandas DataFrame
df = pd.read_csv(DATA_PATH)

# Fail fast if the columns used by the inference and evaluation cells are
# absent, instead of discovering it after the model has been loaded.
required_columns = {"hierarchical_label", "consumer_complaint_narrative"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {sorted(missing_columns)}")

In [None]:
labels = df["hierarchical_label"].unique().tolist()
complaints = df["consumer_complaint_narrative"].tolist()
predictions = []

batch_size = 16  # A100 can easily handle 16–24
max_new_tokens = 15  # Slightly shorter for faster runs
# NOTE(review): labels longer than max_new_tokens tokens cannot be generated in
# full and will never match exactly — confirm 15 covers the longest label.
max_input_tokens = 512  # Upper bound on the tokenized prompt length
token_slack = 8  # Safety margin: piecewise token counts are not exactly additive

# Pad on the left for batched generation with a causal LM, so every row ends
# with real prompt tokens and generation continues directly from "Answer:".
tokenizer.padding_side = "left"

# The prompt is split into a fixed head/tail around the complaint so that only
# the complaint is shortened when the prompt is too long. Truncating the whole
# prompt from the right would cut off the trailing "Answer:" cue.
prompt_head = (
    "Classify the following consumer complaint into one of these categories:\n"
    f"{', '.join(labels)}.\n"
    "Return the label in the format Product::Sub-product (if applicable).\n"
    "\n"
    "Complaint: "
)
prompt_tail = "\nAnswer:"

fixed_tokens = len(tokenizer(prompt_head + prompt_tail)["input_ids"])
complaint_budget = max_input_tokens - fixed_tokens - token_slack
if complaint_budget <= 0:
    raise ValueError(
        f"The instruction and label list already use {fixed_tokens} tokens, leaving no room "
        f"for the complaint within max_input_tokens={max_input_tokens}."
    )


def _truncate_complaint(text, budget):
    """Return `text` unchanged if it fits in `budget` tokens, else its first `budget` tokens decoded."""
    text = str(text)
    token_ids = tokenizer(
        text,
        add_special_tokens=False,
        truncation=True,
        max_length=budget + 1,  # one extra token is enough to detect overflow
    )["input_ids"]
    if len(token_ids) <= budget:
        return text
    return tokenizer.decode(token_ids[:budget], skip_special_tokens=True)


for i in tqdm(range(0, len(complaints), batch_size)):
    batch_texts = complaints[i:i+batch_size]

    prompts = [
        prompt_head + _truncate_complaint(text, complaint_budget) + prompt_tail
        for text in batch_texts
    ]

    # truncation/max_length remain only as a safety net; the complaint budget
    # above is what normally keeps prompts within max_input_tokens.
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_input_tokens
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=model.config.pad_token_id
        )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on "Answer:" breaks when the complaint itself contains that string or
    # when the cue was truncated away.
    generated_texts = tokenizer.batch_decode(
        outputs[:, prompt_length:],
        skip_special_tokens=True
    )

    # Keep only the first line of the generation so trailing continuation
    # text does not become part of the predicted label.
    batch_preds = [
        (generated.strip().splitlines() or [""])[0].strip()
        for generated in generated_texts
    ]
    predictions.extend(batch_preds)

100%|██████████| 1000/1000 [19:51<00:00,  1.19s/it]


In [None]:
# --- Evaluate ---
# Score the generated predictions against the gold labels from the DataFrame
# and print every metric rounded to four decimals.
true_labels = df["hierarchical_label"].tolist()
results = hierarchical_metrics(true_labels, predictions)

print("\nHierarchical Evaluation Results:")
for metric_name, metric_value in results.items():
    print(f"{metric_name}: {metric_value:.4f}")


Hierarchical Evaluation Results:
product_precision: 0.6102
product_recall: 0.5156
product_f1: 0.5326
sub_product_precision: 0.5945
sub_product_recall: 0.2526
sub_product_f1: 0.2887
hierarchical_precision: 0.6023
hierarchical_recall: 0.3841
hierarchical_f1: 0.4107
