In [15]:
import polars as pl
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import torch

In [16]:
# pipe = pipeline(
#     "fill-mask",
#     model="answerdotai/ModernBERT-base",
#     dtype=torch.bfloat16
# )

In [17]:
full_df = pl.read_parquet(r"data_proc_steps\training_data\clean_data.parquet")
# Randomly sample 100,000 rows
n_samples = 100
df = full_df.sample(
    n=n_samples,
    with_replacement=False, # Standard practice: don't sample the same row twice
    shuffle=True,           # Ensures the sampling is random
    seed=42                 # Optional: Set a seed for reproducibility
)
df

Date received,Product,Issue,Sub-issue,Consumer complaint narrative,Complaint ID
str,str,str,str,str,str
"""2024-07-25""","""Credit reporting or other pers…","""Incorrect information on your …","""Information belongs to someone…","""I am writing to delete the fol…","""9601904"""
"""2025-02-17""","""Credit reporting or other pers…","""Improper use of your report""","""Reporting company used your re…","""CFPB, Can you please make them…","""12105387"""
"""2024-12-20""","""Credit reporting or other pers…","""Problem with a company's inves…","""Investigation took more than 3…","""In accordance with the Fair Cr…","""11234232"""
"""2023-08-15""","""Credit reporting, credit repai…","""Incorrect information on your …","""Account information incorrect""","""I had my XXXX XXXX bankruptcy …","""7407313"""
"""2024-04-30""","""Credit reporting or other pers…","""Problem with a company's inves…","""Difficulty submitting a disput…","""These accounts and inquiries a…","""8889585"""
…,…,…,…,…,…
"""2023-02-03""","""Vehicle loan or lease""","""Managing the loan or lease""","""Problem with the interest rate""","""Had an auto loan with Wells Fa…","""6530429"""
"""2025-06-13""","""Credit reporting or other pers…","""Incorrect information on your …","""Personal information incorrect""","""Equifax XXXX. XXXX XXXX XXXX G…","""14075894"""
"""2024-06-25""","""Credit reporting or other pers…","""Improper use of your report""","""Reporting company used your re…","""I am writing to formally submi…","""9345010"""
"""2019-12-26""","""Credit card or prepaid card""","""Problem with a credit reportin…","""Was not notified of investigat…","""I was shocked when I reviewed …","""3477215"""


In [18]:
# Convert Polars DataFrame to Hugging Face Dataset
dataset = Dataset.from_polars(df)

# Get unique labels and create mapping dictionaries
unique_labels = sorted(df["Issue"].unique().to_list())
label_to_id = {label: i for i, label in enumerate(unique_labels)}
id_to_label = {i: label for i, label in enumerate(unique_labels)}
num_labels = len(unique_labels)

print(f"Number of classes: {num_labels}")
print(f"Sample labels: {unique_labels[:5]}")

# Map text labels to integers (required for training)
def encode_labels(example):
    example["labels"] = label_to_id[example["Issue"]]
    return example

dataset = dataset.map(encode_labels, remove_columns=["Issue"])

# Split into train/test (80/20 split)
dataset_split = dataset.train_test_split(test_size=0.2, seed=42)
raw_datasets = DatasetDict({
    'train': dataset_split['train'],
    'test': dataset_split['test']
})

print(f"Training samples: {len(raw_datasets['train'])}")
print(f"Test samples: {len(raw_datasets['test'])}")

Number of classes: 22
Sample labels: ['Account opening, closing, or management', 'Attempts to collect debt not owed', 'Closing on a mortgage', 'Closing your account', 'Closing/Cancelling account']


Map: 100%|██████████| 100/100 [00:00<00:00, 9796.57 examples/s]

Training samples: 80
Test samples: 20





In [19]:
print("\nTokenizing dataset...")
model_checkpoint = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    return tokenizer(
        examples["Consumer complaint narrative"], 
        padding="max_length", 
        truncation=True, 
        max_length=512
    )

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Select and rename columns for the trainer
tokenized_datasets = tokenized_datasets.remove_columns(["Consumer complaint narrative"])
tokenized_datasets.set_format("torch")



Tokenizing dataset...


Map: 100%|██████████| 80/80 [00:00<00:00, 2922.17 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 1965.42 examples/s]


In [28]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['Date received', 'Product', 'Sub-issue', 'Complaint ID', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 80
    })
    test: Dataset({
        features: ['Date received', 'Product', 'Sub-issue', 'Complaint ID', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 20
    })
})

In [20]:
# Load ModernBERT with a classification head
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, 
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id,
    #torch_dtype=torch.bfloat16
)

# Define evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    # Calculate weighted F1-score and accuracy
    f1_weighted = f1_score(labels, predictions, average="weighted")
    accuracy = accuracy_score(labels, predictions)
    
    return {"accuracy": accuracy, "f1_weighted": f1_weighted}

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=4,
    per_device_train_batch_size=4,  # Reduced batch size since no mixed precision
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",
    # Remove fp16=True - train in full precision
    dataloader_num_workers=2,
    report_to="none",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [21]:
# --- 5. Train the model ---
print("Starting training...")
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': None, 'bos_token_id': None}.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,No log,3.367567,0.1,0.095556
2,No log,3.199143,0.15,0.092481
3,No log,3.093473,0.1,0.019048
4,No log,3.004783,0.1,0.019048


TrainOutput(global_step=80, training_loss=2.77697811126709, metrics={'train_runtime': 730.5018, 'train_samples_per_second': 0.438, 'train_steps_per_second': 0.11, 'total_flos': 109057629880320.0, 'train_loss': 2.77697811126709, 'epoch': 4.0})

In [22]:
# --- 6. Evaluate on Test Set ---
print("\nEvaluating on test set...")
results = trainer.evaluate()
print("\nFinal Results:")
print(f"Accuracy: {results['eval_accuracy']:.4f}")
print(f"F1 Score (weighted): {results['eval_f1_weighted']:.4f}")


Evaluating on test set...



Final Results:
Accuracy: 0.1000
F1 Score (weighted): 0.0956


In [23]:
# --- 7. Save the Final Model ---
print("\nSaving model...")
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")
print("Model saved to ./final_model")


Saving model...
Model saved to ./final_model


In [27]:
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="./final_model",
    device=0 # Use GPU
)

result = classifier("Your complaint text here")
print(result)

Device set to use cuda:0


[{'label': 'Problem caused by your funds being low', 'score': 0.11127723008394241}]
