In [1]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer
import numpy as np

# 1. Load dataset
df = pd.read_csv("../data/intent_dataset_1000.csv")

# Clean & ensure labels are strings
df["label"] = df["label"].astype(str).str.strip().str.lower()

# Build label mappings
labels = sorted(df["label"].unique())
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for l, i in label2id.items()}

# Map to numeric labels (INTS, not lists)
df["labels"] = df["label"].map(label2id).astype(int)

# Ensure "text" column is string
df["text"] = df["text"].astype(str)

# 2. Train-test split (no stratify needed)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert HF Dataset
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

# 4. Tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding=True   # Important
    )

train_ds = train_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

# REMOVE STRING COLUMNS
columns_to_remove = ["label", "__index_level_0__"]  # second one appears sometimes
for col in columns_to_remove:
    if col in train_ds.column_names:
        train_ds = train_ds.remove_columns(col)
    if col in test_ds.column_names:
        test_ds = test_ds.remove_columns(col)

# 5. Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 6. Model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

# 7. Training arguments (no evaluation_strategy for older versions)
args = TrainingArguments(
    output_dir="../intent_model",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="../logs"
)

# 8. Metrics
def compute_metrics(eval_pred):
    logits, y_true = eval_pred
    y_pred = np.argmax(logits, axis=1)
    return {"accuracy": (y_pred == y_true).mean()}

# 9. Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 10. Train model
trainer.train()

# 11. Save everything
trainer.save_model("intent_model")
tokenizer.save_pretrained("intent_model")

print("Training completed! Model saved to intent_model/")


  from .autonotebook import tqdm as notebook_tqdm
  from scipy.sparse import csr_matrix, issparse





Map: 100%|█████████████████████████████████████████████████████████████████| 800/800 [00:00<00:00, 14735.21 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 13613.45 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


Training completed! Model saved to intent_model/


In [2]:
# Test after trained
from transformers import pipeline

clf = pipeline("text-classification", model="intent_model", tokenizer="intent_model")

print(clf("How many patients were admitted last year?"))
print(clf("Show average billing by insurance provider"))
print(clf("List diabetic patients"))
print(clf("Compare male and female patients"))
print(clf("Show admissions trend for the last 12 months"))


Device set to use cpu


[{'label': 'count', 'score': 0.9978487491607666}]
[{'label': 'aggregate', 'score': 0.9985129237174988}]
[{'label': 'filter', 'score': 0.9985072016716003}]
[{'label': 'compare', 'score': 0.9983828067779541}]
[{'label': 'trend', 'score': 0.9976175427436829}]
