In [2]:
import pandas as pd
from datasets import Dataset, ClassLabel
from sklearn.model_selection import train_test_split

# --- 1. Load our Sampled Data ---
data_file = 'complaints_sample_100k.csv' # Make sure this is uploaded to Colab
df = pd.read_csv(data_file)
df = df.dropna(subset=['Consumer complaint narrative', 'Product'])

print(f"Loaded {len(df)} rows.")

# --- 2. Create Training and Validation Sets ---
# We'll split the data again. The 'test' set here is what the model
# will use during training to check its own performance (a validation set).
df_train, df_val = train_test_split(
    df,
    test_size=0.2, # 20% for validation
    random_state=42,
    stratify=df['Product']
)

print(f"Training samples: {len(df_train)}")
print(f"Validation samples: {len(df_val)}")

# --- 3. Convert to Hugging Face Dataset object ---
# This is the format the 'transformers' library expects
train_dataset = Dataset.from_pandas(df_train.reset_index(drop=True))
val_dataset = Dataset.from_pandas(df_val.reset_index(drop=True))

# --- 4. CRITICAL: Create Label Mappings ---
# LLMs don't understand text labels like "Mortgage". They need numbers (0, 1, 2...).
# We must create a mapping from strings to integers (id2label, label2id)

# Get a sorted list of unique product names
class_names = sorted(df['Product'].unique())

# Create the ClassLabel feature
cl = ClassLabel(names=class_names)

# Create the two mapping dictionaries we will need later
label2id = {name: i for i, name in enumerate(class_names)}
id2label = {i: name for i, name in enumerate(class_names)}

num_labels = len(class_names)

print(f"\nNumber of unique labels: {num_labels}")
print(f"Example label2id mapping: {list(label2id.items())[:5]}")

# --- 5. Create a function to convert text labels to integer IDs ---
def map_labels(example):
    example['label'] = label2id[example['Product']]
    return example

# Apply this function to both datasets
train_dataset = train_dataset.map(map_labels, batched=False)
val_dataset = val_dataset.map(map_labels, batched=False)

# We can also rename the text column to what the model expects
train_dataset = train_dataset.rename_column("Consumer complaint narrative", "text")
val_dataset = val_dataset.rename_column("Consumer complaint narrative", "text")

print("\n--- Example Data Point ---")
print(train_dataset[0])

Loaded 100000 rows.
Training samples: 80000
Validation samples: 20000

Number of unique labels: 20
Example label2id mapping: [('Bank account or service', 0), ('Checking or savings account', 1), ('Consumer Loan', 2), ('Credit card', 3), ('Credit card or prepaid card', 4)]


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]


--- Example Data Point ---
{'Product': 'Student loan', 'text': "Nelnet contacted me concerning my student loan and their representative said I needed to consolidate the loan. I was concerned, due to financial losses I was counting on loan forgiveness. I was paying the loan on a regular basis and was under an income repayment plan. I was told the billing would resume in two months. This consolidation occurred for the months of XX/XX/XXXX and XXXX of XXXX. I was deeply concerned when Nelnet wanted to do this because I did not want any lapse in my repayment play. I did not ask for a consolidation. The loans had previously been consolidated early on. The representative reassured me that was not the case and my payment history would be continuous. Now after applying for loan forgiveness, I am informed I was placed on administrative forbearance by Nelnet for those two months so my record was interrupted. I specifically said that I did not want any lapse in my payment schedule. Now only one 

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig


model_id = "meta-llama/Meta-Llama-3.1-8B"


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",       # Use "nf4" (NormalFloat 4) for high precision
    bnb_4bit_compute_dtype=torch.bfloat16, # Compute in bfloat16 for speed
    bnb_4bit_use_double_quant=True, # Use double quantization for better accuracy
)

# --- 2. Load the Tokenizer ---
# The tokenizer converts text to numbers (tokens)
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)

# Llama models don't have a default padding token.
# We'll set it to the End-of-Sequence (EOS) token.
tokenizer.pad_token = tokenizer.eos_token

model_config = {"pad_token_id": tokenizer.eos_token_id}

print("Tokenizer loaded.")

# --- 3. Load the 4-bit Model ---
# We specify 'AutoModelForSequenceClassification'. This adds a classification head (a simple linear layer)on top of the Llama 3 model, ready for fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    quantization_config=bnb_config, # Apply our 4-bit config
    device_map="auto",              # Automatically use the GPU
    num_labels=num_labels,          
    id2label=id2label,              
    label2id=label2id,
    **model_config                  
)

print("Model loaded in 4-bit.")
print(model) 

Tokenizer loaded.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3.1-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded in 4-bit.
LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128001)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSN

In [None]:
# --- 1. Create Tokenization Function ---
def tokenize_function(examples):
    # Tokenize the 'text' field.
    # padding="max_length" ensures all sequences are the same size.
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True, # ensures no sequence is longer than the model can handle
        max_length=512 
    )

# --- 2. Apply Tokenization to Datasets ---
# We use .map() to apply the function to every example in our datasets.
# batched=True processes multiple examples at once for speed.
print("\nTokenizing training dataset...")
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)

print("Tokenizing validation dataset...")
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

# --- 3. Clean Up Columns ---
# The model only needs 'input_ids', 'attention_mask', and 'label'.
# We can remove the old text columns to save space.
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["Product", "text"])
tokenized_val_dataset = tokenized_val_dataset.remove_columns(["Product", "text"])

# Tell the dataset to return PyTorch tensors
tokenized_train_dataset.set_format("torch")
tokenized_val_dataset.set_format("torch")

print("\n--- Example Tokenized Data Point ---")
print(tokenized_train_dataset[0])


Tokenizing training dataset...


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Tokenizing validation dataset...


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]


--- Example Tokenized Data Point ---
{'label': tensor(18), 'input_ids': tensor([128000,     45,    301,   4816,  25559,    757,  18815,    856,   5575,
         11941,    323,    872,  18740,   1071,    358,   4460,    311,  74421,
           279,  11941,     13,    358,    574,  11920,     11,   4245,    311,
          6020,  18151,    358,    574,  26060,    389,  11941,  53150,     13,
           358,    574,  12798,    279,  11941,    389,    264,   5912,   8197,
           323,    574,   1234,    459,   8070,  71118,   3197,     13,    358,
           574,   3309,    279,  34631,   1053,  16063,    304,   1403,   4038,
            13,   1115,  60732,  10222,    369,    279,   4038,    315,  30388,
            14,   6277,     14,  24769,    323,  20572,     55,    315,  20572,
            55,     13,    358,    574,  17693,  11920,    994,  89461,   4816,
          4934,    311,    656,    420,   1606,    358,   1550,    539,   1390,
           904,  90417,    304,    856,  71118,

In [6]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# This function will be called by the Trainer at each evaluation step
def compute_metrics(eval_pred):
    # eval_pred is a tuple containing (logits, labels)
    logits, labels = eval_pred

    # We get the model's predictions by finding the class with the highest logit
    predictions = np.argmax(logits, axis=-1)

    # Calculate basic accuracy
    accuracy = accuracy_score(labels, predictions)

    # Calculate the weighted F1-score, just like our baseline
    f1 = f1_score(labels, predictions, average="weighted")

    # Return a dictionary of the metrics
    return {"accuracy": accuracy, "f1": f1}

In [7]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# --- 1. Prepare model for 4-bit training ---
model = prepare_model_for_kbit_training(model)

# --- 2. Define LoRA Config ---
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, # Sequence Classification task
    r=16,                       # Rank of the adapter
    lora_alpha=32,              # Scaling factor
    lora_dropout=0.05,          # Dropout
    target_modules=["q_proj", "v_proj"] # Attach LoRA to attention layers
)

# --- 3. Wrap the model with LoRA ---
model = get_peft_model(model, peft_config)

model.print_trainable_parameters()

trainable params: 6,897,664 || all params: 7,511,904,256 || trainable%: 0.0918


In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# --- 1. Define Training Arguments ---
training_args = TrainingArguments(
    output_dir="./llama3-classification-v1",
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    max_steps=100,
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    fp16=True,
    optim="paged_adamw_8bit",
    report_to="none"
)

# --- 2. Data Collator ---
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# --- 3. Initialize Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# --- 4. START TRAINING ---
print("Starting training...")
trainer.train()

  trainer = Trainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Starting training...


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Accuracy,F1
50,7.6672,1.844932,0.46015,0.408718
100,6.3316,1.473828,0.51875,0.479638


  return fn(*args, **kwargs)


TrainOutput(global_step=100, training_loss=9.033110885620117, metrics={'train_runtime': 13513.4535, 'train_samples_per_second': 0.118, 'train_steps_per_second': 0.007, 'total_flos': 3.4340377460736e+16, 'train_loss': 9.033110885620117, 'epoch': 0.02})

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# --- OPTIMIZATION: Use a tiny validation set for speed ---
small_val_dataset = tokenized_val_dataset.select(range(500))

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# --- REVISED ARGUMENTS: 4-Hour Run ---
training_args = TrainingArguments(
    output_dir="./llama3-classification-final",
    learning_rate=2e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    max_steps=1200,
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=100,         # Check progress every 100 steps
    save_strategy="steps",
    save_steps=100,         # Save progress every 100 steps
    fp16=True,
    optim="paged_adamw_8bit",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=small_val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Starting optimized training run")
trainer.train()

# Save the final model
trainer.save_model("./llama3-classification-final")
print("Model saved!")

  trainer = Trainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Starting optimized training run


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Accuracy,F1
100,12.822,1.332682,0.542,0.446772
200,8.6375,1.003595,0.648,0.638427
300,7.5537,1.087633,0.646,0.59239
400,6.9416,0.866296,0.714,0.695252
500,7.3103,0.883998,0.686,0.654253
600,7.4147,0.804509,0.726,0.710095
700,5.9699,0.867924,0.672,0.631216
800,6.7732,0.731438,0.746,0.734822
900,5.7815,0.730173,0.738,0.727012
1000,6.0856,0.750126,0.738,0.724813


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Model saved!


In [None]:
from transformers import TrainingArguments, Trainer

# 1. Define NEW arguments just for fast evaluation
# We use batch size 16 to speed up evaluation
eval_args = TrainingArguments(
    output_dir="./eval_temp",
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=True 
)

# 2. Create a NEW Trainer instance
eval_trainer = Trainer(
    model=model,
    args=eval_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("Starting Fast Evaluation (Batch Size 16)")

# 3. Predict
fast_val_dataset = tokenized_val_dataset.shuffle(seed=42).select(range(3000))

# 2. Run Prediction on Validation Set
predictions_output = eval_trainer.predict(fast_val_dataset)

# 4. Process Results
import numpy as np
from sklearn.metrics import classification_report, f1_score

y_preds = np.argmax(predictions_output.predictions, axis=-1)
y_true = predictions_output.label_ids

final_f1 = f1_score(y_true, y_preds, average='weighted')
print(f"\n=== FINAL OFFICIAL F1 SCORE: {final_f1:.4f} ===")

all_label_ids = sorted(id2label.keys())
all_label_names = [id2label[i] for i in all_label_ids]

print("\nClassification Report:")
print(classification_report(y_true, y_preds, target_names=all_label_names, labels=all_label_ids))

  eval_trainer = Trainer(


Starting Fast Evaluation (Batch Size 16)



=== FINAL OFFICIAL F1 SCORE: 0.7333 ===

Classification Report:
                                                                              precision    recall  f1-score   support

                                                     Bank account or service       0.67      0.17      0.27        12
                                                 Checking or savings account       0.79      0.83      0.81       139
                                                               Consumer Loan       0.00      0.00      0.00         5
                                                                 Credit card       0.31      0.24      0.27        79
                                                 Credit card or prepaid card       0.48      0.56      0.52        91
                                                            Credit reporting       0.00      0.00      0.00        31
                         Credit reporting or other personal consumer reports       0.82      0.80      0.81 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
