In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# --- Hierarchical Metric Functions ---
def split_hierarchical_label(label):
    if '::' in label:
        return label.split('::')
    else:
        return [label, 'None']

def hierarchical_metrics(y_true, y_pred):
    product_true = [split_hierarchical_label(label)[0] for label in y_true]
    sub_product_true = [split_hierarchical_label(label)[1] for label in y_true]
    product_pred = [split_hierarchical_label(label)[0] for label in y_pred]
    sub_product_pred = [split_hierarchical_label(label)[1] for label in y_pred]

    product_precision = precision_score(product_true, product_pred, average='weighted', zero_division=0)
    product_recall = recall_score(product_true, product_pred, average='weighted', zero_division=0)
    product_f1 = f1_score(product_true, product_pred, average='weighted', zero_division=0)

    valid_sub_product_true = [sub for i, sub in enumerate(sub_product_true) if sub != 'None' and sub_product_pred[i] != 'None']
    valid_sub_product_pred = [sub for i, sub in enumerate(sub_product_pred) if sub != 'None' and sub_product_true[i] != 'None']

    sub_product_precision = precision_score(valid_sub_product_true, valid_sub_product_pred, average='weighted', zero_division=0) if valid_sub_product_true else 0
    sub_product_recall = recall_score(valid_sub_product_true, valid_sub_product_pred, average='weighted', zero_division=0) if valid_sub_product_true else 0
    sub_product_f1 = f1_score(valid_sub_product_true, valid_sub_product_pred, average='weighted', zero_division=0) if valid_sub_product_true else 0

    hierarchical_precision = (product_precision + sub_product_precision) / 2
    hierarchical_recall = (product_recall + sub_product_recall) / 2
    hierarchical_f1 = (product_f1 + sub_product_f1) / 2

    return {
        'product_precision': product_precision,
        'product_recall': product_recall,
        'product_f1': product_f1,
        'sub_product_precision': sub_product_precision,
        'sub_product_recall': sub_product_recall,
        'sub_product_f1': sub_product_f1,
        'hierarchical_precision': hierarchical_precision,
        'hierarchical_recall': hierarchical_recall,
        'hierarchical_f1': hierarchical_f1
    }

In [None]:
import pandas as pd

# Load the dataset from Google Drive into a pandas DataFrame
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/PhD_Thesis_Experiments/GitHub_ToChair/sample_complaints_2years_006_balanced.csv')

In [None]:
# --- Install dependencies ---
!pip install -q transformers accelerate bitsandbytes peft datasets torch tqdm scikit-learn pandas


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# --- Imports ---
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model, PeftModel, TaskType
from datasets import Dataset
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd

In [None]:
# --- Load Dataset ---
labels = df["hierarchical_label"].unique().tolist()

# Prepare prompts
def make_prompt(example):
    return f"""Classify the following consumer complaint into one of these categories:
{', '.join(labels)}.
Return the label in the format Product::Sub-product (if applicable).

Complaint: {example}
Answer:"""

df["prompt"] = df["consumer_complaint_narrative"].apply(make_prompt)
df["target"] = df["hierarchical_label"]

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df[["prompt", "target"]])

# --- Tokenizer & Model ---
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
# --- LoRA Config ---
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)

In [None]:
def combine_prompt_target(example):
    return {"text": example["prompt"] + " " + example["target"]}

In [None]:
dataset = dataset.map(combine_prompt_target)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

In [None]:
max_length = 512

def tokenize_fn(batch):
    tokenized = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length
    )
    # For causal LM, labels are same as input_ids
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

dataset = dataset.map(tokenize_fn, batched=True)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

In [None]:
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


In [None]:
# from datasets import Dataset
# dataset = Dataset.from_pandas(df[["prompt", "target"]])
# dataset = dataset.map(tokenize_fn, batched=True)


In [None]:
# --- Tokenization ---
max_length = 512
def tokenize_fn(batch):
    return tokenizer(
        batch["prompt"],
        batch["target"],
        truncation=True,
        padding="max_length",
        max_length=max_length
    )

# dataset = dataset.map(tokenize_fn, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer, padding=True)

In [None]:
# --- Training ---
training_args = TrainingArguments(
    output_dir="./mistral_lora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=3e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    remove_unused_columns=False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss
10,3.0906
20,2.9107
30,2.6266
40,2.7205
50,2.5487
60,2.7598
70,2.5801
80,2.5656
90,2.6912
100,2.5752


TrainOutput(global_step=6000, training_loss=2.3801529359817506, metrics={'train_runtime': 5667.5346, 'train_samples_per_second': 8.469, 'train_steps_per_second': 1.059, 'total_flos': 1.049978373931008e+18, 'train_loss': 2.3801529359817506, 'epoch': 3.0})

In [None]:
# Save LoRA adapter
model.save_pretrained("./mistral_lora")
tokenizer.save_pretrained("./mistral_lora")


('./mistral_lora/tokenizer_config.json',
 './mistral_lora/special_tokens_map.json',
 './mistral_lora/chat_template.jinja',
 './mistral_lora/tokenizer.model',
 './mistral_lora/added_tokens.json',
 './mistral_lora/tokenizer.json')

In [None]:
# --- Inference using LoRA model ---
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
model = PeftModel.from_pretrained(base_model, "./mistral_lora")

predictions = []
batch_size = 16
max_new_tokens = 15

for i in tqdm(range(0, len(df), batch_size)):
    batch_texts = df["consumer_complaint_narrative"].tolist()[i:i+batch_size]

    prompts = [
        f"""Classify the following consumer complaint into one of these categories:
{', '.join(labels)}.
Return the label in the format Product::Sub-product (if applicable).

Complaint: {text}
Answer:""" for text in batch_texts
    ]

    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=model.config.pad_token_id
        )

    batch_preds = [
        tokenizer.decode(out, skip_special_tokens=True).split("Answer:")[-1].strip()
        for out in outputs
    ]
    predictions.extend(batch_preds)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 1/1000 [00:01<29:59,  1.80s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 2/1000 [00:03<27:21,  1.64s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 3/1000 [00:04<26:31,  1.60s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 4/1000 [00:06<26:06,  1.57s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 5/1000 [00:07<25:55,  1.56s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 6/1000 [00:09<25:42,  1.55s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 7/1000 [00:11<25:34,  1.55s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 8/1000 [00:12<25:34,  1.55s/it]Setting `pad_token_id` to `eos_t

In [None]:
# --- Evaluate ---
results = hierarchical_metrics(df["hierarchical_label"].tolist(), predictions)
print("\nHierarchical Evaluation Results:")
for k, v in results.items():
    print(f"{k}: {v:.4f}")


Hierarchical Evaluation Results:
product_precision: 0.9339
product_recall: 0.7262
product_f1: 0.8125
sub_product_precision: 0.8477
sub_product_recall: 0.1154
sub_product_f1: 0.1404
hierarchical_precision: 0.8908
hierarchical_recall: 0.4208
hierarchical_f1: 0.4764
