In [1]:
import os
from pathlib import Path
import random

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import torch
from torch.nn import CrossEntropyLoss

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)

# 1. Paths & config
ROOT_DIR = Path.cwd().parent

DATA_PATH = ROOT_DIR / "data" / "intent_dataset.csv"
MODEL_DIR = ROOT_DIR / "intent_model"
LOG_DIR  = ROOT_DIR / "logs"

MODEL_NAME = "bert-base-uncased"
MAX_LENGTH = 64
TEST_SIZE = 0.50
RANDOM_SEED = 42

os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

# 2. Reproducibility
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(RANDOM_SEED)

# 3. Load dataset
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}")

df = pd.read_csv(DATA_PATH)

# Try to find text column
TEXT_COL_CANDIDATES = ["text", "query", "question", "utterance"]
LABEL_COL_CANDIDATES = ["label", "intent", "target"]

text_col = next((c for c in TEXT_COL_CANDIDATES if c in df.columns), None)
label_col = next((c for c in LABEL_COL_CANDIDATES if c in df.columns), None)

if text_col is None:
    raise ValueError(
        f"Could not find a text column. Tried: {TEXT_COL_CANDIDATES}. "
        f"Columns in CSV: {list(df.columns)}"
    )

if label_col is None:
    raise ValueError(
        f"Could not find a label/intent column. Tried: {LABEL_COL_CANDIDATES}. "
        f"Columns in CSV: {list(df.columns)}"
    )

df = df[[text_col, label_col]].dropna().reset_index(drop=True)

# Normalise labels to str (e.g. "filter", "aggregate", ...)
df[label_col] = df[label_col].astype(str)

labels = sorted(df[label_col].unique())
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for l, i in label2id.items()}

df["label_id"] = df[label_col].map(label2id)

print("Label mapping:")
for l, i in label2id.items():
    print(f"  {i}: {l}")

# 4. Train / validation split
train_df, val_df = train_test_split(
    df,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    stratify=df["label_id"],
)

print(f"Train size: {len(train_df)}, Val size: {len(val_df)}")

# 5. Tokenizer & dataset class
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class IntentDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=64):
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(label, dtype=torch.long)
        return item

train_ds = IntentDataset(
    train_df[text_col].tolist(),
    train_df["label_id"].tolist(),
    tokenizer,
    max_len=MAX_LENGTH,
)

val_ds = IntentDataset(
    val_df[text_col].tolist(),
    val_df["label_id"].tolist(),
    tokenizer,
    max_len=MAX_LENGTH,
)

# 6. Model with class weights
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

# Class weights: inverse frequency
label_counts = train_df["label_id"].value_counts().sort_index()
class_weights = (1.0 / label_counts).values
class_weights = class_weights / class_weights.sum() * len(class_weights)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

print("Class weights:", class_weights_tensor)


class WeightedTrainer(Trainer):
    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
    
        if self.class_weights is not None:
            loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
        else:
            loss_fct = torch.nn.CrossEntropyLoss()
    
        loss = loss_fct(logits, labels)
    
        if return_outputs:
            return loss, outputs
        return loss


# 7. Metrics
def compute_metrics(eval_pred):
    preds, labels_ids = eval_pred
    preds = np.argmax(preds, axis=1)

    acc = accuracy_score(labels_ids, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels_ids, preds, average="weighted", zero_division=0
    )
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# 8. TrainingArguments (logs + early stopping)
#    NOTE: requires transformers >= 4.x
training_args = TrainingArguments(
    output_dir=str(MODEL_DIR),
    num_train_epochs=8,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir=str(LOG_DIR),
    logging_steps=50,
)

early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,  # stop if no improvement for 2 evals
    early_stopping_threshold=0.0,
)

trainer = WeightedTrainer(
    class_weights=class_weights_tensor,
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 9. Train
print("Starting training...")
train_result = trainer.train()
trainer.save_state()

metrics = train_result.metrics
metrics["train_samples"] = len(train_ds)
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

print("Evaluating on validation set...")
eval_metrics = trainer.evaluate(eval_dataset=val_ds)
eval_metrics["eval_samples"] = len(val_ds)
trainer.log_metrics("eval", eval_metrics)
trainer.save_metrics("eval", eval_metrics)

# 10. Save best model + tokenizer
print(f"Saving best model and tokenizer to {MODEL_DIR}")
trainer.save_model(str(MODEL_DIR))
tokenizer.save_pretrained(str(MODEL_DIR))

print("Done. This model can now be loaded in app.py using model='intent_model'.")

  from scipy.sparse import csr_matrix, issparse
  from .autonotebook import tqdm as notebook_tqdm



Label mapping:
  0: aggregate
  1: compare
  2: count
  3: filter
  4: trend
Train size: 600, Val size: 600


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Class weights: tensor([1.0143, 1.0592, 1.0318, 0.9137, 0.9811])
Starting training...




Step,Training Loss
50,1.1124
100,0.3854
150,0.2849
200,0.2698
250,0.3136
300,0.266
350,0.2854
400,0.2363
450,0.2411
500,0.2359




***** train metrics *****
  epoch                    =        8.0
  total_flos               =   147028GF
  train_loss               =     0.3362
  train_runtime            = 0:16:43.19
  train_samples            =        600
  train_samples_per_second =      4.785
  train_steps_per_second   =      0.598
Evaluating on validation set...




***** eval metrics *****
  epoch                   =        8.0
  eval_accuracy           =     0.8733
  eval_f1                 =     0.8732
  eval_loss               =     0.3839
  eval_precision          =     0.8768
  eval_recall             =     0.8733
  eval_runtime            = 0:00:29.00
  eval_samples            =        600
  eval_samples_per_second =     20.686
  eval_steps_per_second   =      2.586
Saving best model and tokenizer to C:\Users\dangk\OneDrive\Desktop\Fall 2025\ADA\Project\LLMs-powered-natural-language-query-system-for-healthcare\intent_model
Done. This model can now be loaded in app.py using model='intent_model'.


In [2]:
# Test after trained
from transformers import pipeline

clf = pipeline("text-classification", model="../intent_model", tokenizer="../intent_model")

print(clf("How many patients were admitted last year?"))
print(clf("Show average billing by insurance provider"))
print(clf("List diabetic patients"))
print(clf("Compare male and female patients"))
print(clf("Show patients over age 60"))


Device set to use cpu


[{'label': 'count', 'score': 0.9980754852294922}]
[{'label': 'aggregate', 'score': 0.9981518387794495}]
[{'label': 'filter', 'score': 0.9985373020172119}]
[{'label': 'compare', 'score': 0.9982607960700989}]
[{'label': 'filter', 'score': 0.9980431795120239}]
