## 1. Install Dependencies

In [1]:
!pip install -q transformers>=4.46.0 "datasets>=3.0.0,<4.0.0" trl>=0.12.0 peft>=0.13.0 accelerate>=1.0.0 bitsandbytes>=0.44.0 scikit-learn pandas

## 2. Check GPU & Mount Google Drive

In [2]:
import os
import torch

# A CUDA device is mandatory: stop right away with a hint for Colab users.
if not torch.cuda.is_available():
    print("No GPU available! Go to Runtime > Change runtime type > GPU")
    raise RuntimeError("GPU required")

# Report the first visible device and its memory (bytes divided by 1e9).
gpu_name = torch.cuda.get_device_name(0)
gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
print(f"GPU: {gpu_name}")
print(f"VRAM: {gpu_memory:.1f} GB")

# Make Google Drive available under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Folder on Drive that receives checkpoints and the final model.
OUTPUT_DIR = "/content/drive/MyDrive/models/smollm2-bengali-nid-intent"
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Model output directory: {OUTPUT_DIR}")

# ============================================================
# DATASET PATH - Upload your CSVs to this folder in Google Drive
# ============================================================
DATASET_DIR = "/content/drive/MyDrive"
os.makedirs(DATASET_DIR, exist_ok=True)

# Tell the user which files are expected and where.
print(f"\n>>> Upload your CSV files to: {DATASET_DIR}")
for expected_csv in ("sts_train.csv", "sts_eval.csv", "tag_answer.csv"):
    print(f"    - {expected_csv}")
print("\nOr change DATASET_DIR to where your files are located.")

GPU: NVIDIA A100-SXM4-80GB
VRAM: 85.2 GB
Mounted at /content/drive
Model output directory: /content/drive/MyDrive/models/smollm2-bengali-nid-intent

>>> Upload your CSV files to: /content/drive/MyDrive
    - sts_train.csv
    - sts_eval.csv
    - tag_answer.csv

Or change DATASET_DIR to where your files are located.


## 3. Verify Dataset Files

Make sure your CSV files are in Google Drive at the path shown above.

In [3]:
import os

# Expected CSV locations inside the Drive dataset folder.
train_path = f"{DATASET_DIR}/sts_train.csv"
eval_path = f"{DATASET_DIR}/sts_eval.csv"
tag_path = f"{DATASET_DIR}/tag_answer.csv"

# File name -> True when the file is present on disk.
files_status = {
    csv_name: os.path.exists(csv_path)
    for csv_name, csv_path in (
        ("sts_train.csv", train_path),
        ("sts_eval.csv", eval_path),
        ("tag_answer.csv", tag_path),
    )
}

print("Dataset files status:")
for fname, exists in files_status.items():
    # A bool indexes the tuple: False -> missing label, True -> found label.
    status = ("✗ Missing", "✓ Found")[exists]
    print(f"  {status}: {fname}")

# Abort the notebook as soon as any required file is absent.
missing = [csv_name for csv_name, present in files_status.items() if not present]
if missing:
    print(f"\n❌ Missing files: {missing}")
    print(f"Please upload them to: {DATASET_DIR}")
    raise FileNotFoundError(f"Missing dataset files in {DATASET_DIR}")

print(f"\n✓ All files found in {DATASET_DIR}")

Dataset files status:
  ✓ Found: sts_train.csv
  ✓ Found: sts_eval.csv
  ✓ Found: tag_answer.csv

✓ All files found in /content/drive/MyDrive


## 4. Load and Analyze Dataset

In [4]:
import pandas as pd
from collections import Counter

print("Loading dataset files from Google Drive...")

# ============================================================
# SAMPLE 50% OF TRAINING DATA FOR FASTER TRAINING
# ============================================================
# Read the training CSV and keep a fixed-seed random half, re-indexed from 0.
train_df = (
    pd.read_csv(train_path)
    .sample(frac=0.5, random_state=42)
    .reset_index(drop=True)
)
print(">>> Using 50% of training data for faster training <<<")

# The evaluation split and the tag -> answer table are used in full.
eval_df = pd.read_csv(eval_path)
tag_answer_df = pd.read_csv(tag_path)

# Size and label-coverage summary.
print(f"Train samples: {len(train_df)}")
print(f"Eval samples: {len(eval_df)}")
print(f"Unique tags in train: {train_df['tag'].nunique()}")
print(f"Unique tags in eval: {eval_df['tag'].nunique()}")
print(f"Tags with answers: {len(tag_answer_df)}")

# Peek at the first (sampled) training row.
first_row = train_df.iloc[0]
print(f"\nSample from training data:")
print(f"  Question: {first_row['question']}")
print(f"  Tag: {first_row['tag']}")

Loading dataset files from Google Drive...
>>> Using 50% of training data for faster training <<<
Train samples: 39308
Eval samples: 11457
Unique tags in train: 407
Unique tags in eval: 403
Tags with answers: 407

Sample from training data:
  Question: বাবা-মায়ের পরিচয় জানি না—ভোটার নাম নিবন্ধন করা যাবে?
  Tag: orphan_no_parent_information_registration


In [5]:
# Alphabetically sorted distinct tags: the position in this list is the intent id.
INTENT_TAGS = sorted(train_df['tag'].unique().tolist())
print(f"Total unique intents: {len(INTENT_TAGS)}")

# Forward (id -> tag) and inverse (tag -> id) lookups derived from that order.
ID2INTENT = dict(enumerate(INTENT_TAGS))
INTENT2ID = {intent: idx for idx, intent in ID2INTENT.items()}

# The fifteen most frequent tags in the sampled training data.
print(f"\nTop 15 tags by frequency:")
tag_counts = train_df['tag'].value_counts()
for tag, count in tag_counts.iloc[:15].items():
    print(f"  {tag}: {count}")

Total unique intents: 407

Top 15 tags by frequency:
  fraction: 243
  permanent_address_change_fees: 190
  spouse_name_correction_new: 125
  migration_post_office_code_change: 120
  address_type_difference_confusion: 117
  offline_fee_payment_not_allowed: 117
  land_office_nid_verification_problem: 117
  nid_card_cancellation_procedure: 116
  address_wife_property_ownership: 116
  abroad_smart_card_priority_request: 116
  smart_card_security_breach_concerns: 116
  abroad_current_address_issue: 115
  migration_personal_attendance_required: 115
  living_person_marked_dead_correction: 115
  abroad_address_correction_not_possible: 114


## 5. Configuration

In [6]:
# ============================================================
# CONFIGURATION - Few-Shot Task Learning with Qwen
# ============================================================

# Base checkpoint that is fine-tuned with LoRA adapters.
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"

# Maximum sequence length handed to the trainer and to the eval tokenizer call.
# Each sample holds the tag list, the few-shot examples, the query and the answer.
MAX_SEQ_LENGTH = 2048  # sized for an A100 80GB

# Training schedule
NUM_EPOCHS = 2        # passes over the (sampled) training set
BATCH_SIZE = 16       # per-device train batch size
EVAL_BATCH_SIZE = 32  # per-device eval batch size; also used by the accuracy callback
GRAD_ACCUM_STEPS = 4  # effective batch = BATCH_SIZE * GRAD_ACCUM_STEPS = 64
LEARNING_RATE = 2e-5  # peak learning rate
WARMUP_RATIO = 0.03   # fraction of training used for warmup
EARLY_STOPPING_PATIENCE = 3  # evaluations without improvement before stopping

# LoRA adapter hyper-parameters
LORA_R = 32           # adapter rank
LORA_ALPHA = 64       # scaling factor, 2x the rank
LORA_DROPOUT = 0.05
# Attention and MLP projection layers that receive adapters.
LORA_TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

# Few-shot prompt settings
NUM_FEW_SHOT_EXAMPLES = 5  # worked examples placed in each user message
NUM_CONTEXT_TAGS = 20      # candidate intents listed in each system message

# Seed shared by the data formatting and the trainer.
SEED = 42

# NOTE(review): not referenced by any other cell visible here; the prompts are
# built by format_for_sft_few_shot / create_eval_prompt. Confirm before removing.
INSTRUCTION_TEMPLATE = "Classify the intent of this Bengali customer query: {text}"

# Echo the effective configuration so it is recorded in the notebook output.
print(f"Model: {MODEL_NAME}")
print(f"Number of intents: {len(INTENT_TAGS)}")
print(f"Max sequence length: {MAX_SEQ_LENGTH}")
print(f"Few-shot examples: {NUM_FEW_SHOT_EXAMPLES}")
print(f"Context tags: {NUM_CONTEXT_TAGS}")
print(f"Epochs: {NUM_EPOCHS} (with early stopping patience={EARLY_STOPPING_PATIENCE})")
print(f"Batch size: {BATCH_SIZE} x {GRAD_ACCUM_STEPS} = {BATCH_SIZE * GRAD_ACCUM_STEPS} effective")
print(f"LoRA rank: {LORA_R}")

Model: Qwen/Qwen2.5-0.5B-Instruct
Number of intents: 407
Max sequence length: 2048
Few-shot examples: 5
Context tags: 20
Epochs: 2 (with early stopping patience=3)
Batch size: 16 x 4 = 64 effective
LoRA rank: 32


## 6. Prepare Dataset for SFT

In [7]:
from datasets import Dataset
from tqdm import tqdm
import random

# ============================================================
# STEP 1: Create Tag Descriptions from tag_answer.csv
# ============================================================

def create_tag_descriptions(tag_answer_df):
    """Derive a short description for every tag from its canonical answer.

    The description is the text before the first Bengali full stop (।) of the
    answer, stripped and capped at 150 characters.

    Args:
        tag_answer_df: DataFrame with a 'tag' column and an 'answer' column.

    Returns:
        dict mapping each tag to its description. When the answer is missing
        (e.g. NaN from an empty CSV cell), is not a string, or yields an empty
        first sentence, the tag name with underscores replaced by spaces is
        used, so no tag ends up with a blank description.
    """
    descriptions = {}
    for tag, answer in zip(tag_answer_df['tag'], tag_answer_df['answer']):
        short_desc = ''
        # read_csv turns empty cells into float NaN, which has no .split().
        if isinstance(answer, str):
            # First sentence (up to Bengali period ।) as description, max 150 chars
            short_desc = answer.split('।')[0].strip()[:150]
        if not short_desc:
            # Same fallback wording the prompt builder uses for unknown tags.
            short_desc = str(tag).replace('_', ' ')
        descriptions[tag] = short_desc
    return descriptions

# Tag -> short description lookup used when rendering prompts.
TAG_DESCRIPTIONS = create_tag_descriptions(tag_answer_df)
print(f"Created descriptions for {len(TAG_DESCRIPTIONS)} tags")
# Show the first (tag, description) pair as a sanity check.
print(f"Sample: {list(TAG_DESCRIPTIONS.items())[0]}")

# ============================================================
# STEP 2: Build Few-Shot Example Pool
# ============================================================

def build_example_pool(train_df, examples_per_tag=10):
    """Collect up to `examples_per_tag` questions for every tag.

    Tags appear in order of first occurrence in `train_df`, and the questions
    of each tag keep their row order.

    Returns:
        dict mapping tag -> list of question strings.
    """
    tag_column = train_df['tag']
    return {
        tag: train_df.loc[tag_column == tag, 'question'].tolist()[:examples_per_tag]
        for tag in tag_column.unique()
    }

# Per-tag question pool (at most 10 each) that few-shot examples are drawn from.
EXAMPLE_POOL = build_example_pool(train_df, examples_per_tag=10)
print(f"Built example pool for {len(EXAMPLE_POOL)} tags")

# ============================================================
# STEP 3: Reasoning Generation Function
# ============================================================

def generate_reasoning(question, tag, tag_description):
    """Return the templated three-sentence rationale for assigning `tag`.

    The text depends only on `tag` and `tag_description`; `question` is
    accepted but not used.
    """
    spaced_tag = tag.replace('_', ' ')
    sentences = (
        f'The query contains keywords related to "{spaced_tag}".',
        f'This matches the intent description: "{tag_description}".',
        f'Based on the semantic match, this query belongs to {tag}.',
    )
    return ' '.join(sentences)

# ============================================================
# STEP 4: Select Relevant Tags for Context
# ============================================================

def select_relevant_tags(target_tag, all_tags, k=20):
    """Select target tag + semantically similar distractors"""
    selected = [target_tag]

    # Same prefix tags (e.g., all address_change_* tags)
    prefix = target_tag.split('_')[0]
    same_prefix = [t for t in all_tags if t.startswith(prefix) and t != target_tag]
    selected.extend(same_prefix[:7])

    # Random diverse tags to fill remaining slots
    remaining = [t for t in all_tags if t not in selected]
    random.shuffle(remaining)
    selected.extend(remaining[:k - len(selected)])

    return selected[:k]

# ============================================================
# STEP 5: Main Formatting Function (Qwen Chat Template)
# ============================================================

def format_for_sft_few_shot(row, all_tags, tag_descriptions, example_pool, num_examples=5, num_tags=20):
    """Create Qwen chat format with few-shot examples and CoT reasoning.

    Args:
        row: mapping with a 'question' and a 'tag' entry.
        all_tags: every known tag.
        tag_descriptions: dict tag -> short description.
        example_pool: dict tag -> list of example questions.
        num_examples: number of worked examples in the user message.
        num_tags: number of candidate intents listed in the system message.

    Returns:
        dict with 'messages' (system, user, assistant chat turns) and 'intent'
        (the target tag).

    Few-shot examples never use the target tag, including when the pool is
    topped up from `all_tags`, so the answer is not shown in the prompt. Tags
    with an empty example list are skipped rather than crashing
    `random.choice`. Uses the global `random` module state.
    """
    question = row['question']
    target_tag = row['tag']

    # Select relevant tags for context
    relevant_tags = select_relevant_tags(target_tag, all_tags, k=num_tags)

    # Format tag descriptions
    tag_lines = [
        f"- {t}: {tag_descriptions[t]}"
        for t in relevant_tags
        if t in tag_descriptions
    ]

    def _can_supply_example(t):
        # Usable only if it is not the answer and has at least one question.
        return t != target_tag and bool(example_pool.get(t))

    # Select few-shot examples from different tags
    available_example_tags = [t for t in relevant_tags if _can_supply_example(t)]
    if len(available_example_tags) < num_examples:
        # Add more tags if not enough
        already_available = set(available_example_tags)
        extra_tags = [
            t for t in all_tags
            if _can_supply_example(t) and t not in already_available
        ]
        random.shuffle(extra_tags)
        available_example_tags.extend(extra_tags[:num_examples - len(available_example_tags)])

    example_tags = random.sample(available_example_tags, min(num_examples, len(available_example_tags)))

    # Format examples with reasoning
    examples = []
    for i, et in enumerate(example_tags, 1):
        q = random.choice(example_pool[et])
        r = generate_reasoning(q, et, tag_descriptions.get(et, et.replace('_', ' ')))
        examples.append(f"Example {i}:\nQuery: {q}\nReasoning: {r}\nIntent: {et}")

    # Generate reasoning for target question
    target_reasoning = generate_reasoning(
        question, target_tag,
        tag_descriptions.get(target_tag, target_tag.replace('_', ' '))
    )

    # Build Qwen chat format messages
    system_content = f"""You are an intent classifier for Bengali NID (National ID) customer service.

Your task: Given a Bengali customer query, analyze the text and classify it into the correct intent.

Available intents with descriptions:
{chr(10).join(tag_lines)}

Instructions:
1. Read the Bengali query carefully
2. Identify key Bengali words/phrases
3. Match them to the most relevant intent description
4. Explain your reasoning
5. Output the intent tag"""

    user_content = f"""Classify the following Bengali query.

{chr(10).join(examples)}

Now classify this query:
Query: {question}"""

    assistant_content = f"""Reasoning: {target_reasoning}
Intent: {target_tag}"""

    return {
        "messages": [
            {"role": "system", "content": system_content},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assistant_content}
        ],
        "intent": target_tag
    }

# ============================================================
# STEP 6: Build Training and Eval Datasets
# ============================================================

print("\nFormatting training data with 5-shot CoT...")
random.seed(SEED)  # fixed seed so tag and example sampling is repeatable
# One chat-formatted record per training row, in row order.
train_formatted = [
    format_for_sft_few_shot(
        row, INTENT_TAGS, TAG_DESCRIPTIONS, EXAMPLE_POOL,
        num_examples=NUM_FEW_SHOT_EXAMPLES, num_tags=NUM_CONTEXT_TAGS
    )
    for _, row in tqdm(train_df.iterrows(), total=len(train_df), desc="Train")
]
train_dataset = Dataset.from_list(train_formatted)

print("Formatting evaluation data...")
# Same formatting for the evaluation split; the RNG continues from the state
# left by the training pass.
eval_formatted = [
    format_for_sft_few_shot(
        row, INTENT_TAGS, TAG_DESCRIPTIONS, EXAMPLE_POOL,
        num_examples=NUM_FEW_SHOT_EXAMPLES, num_tags=NUM_CONTEXT_TAGS
    )
    for _, row in tqdm(eval_df.iterrows(), total=len(eval_df), desc="Eval")
]
eval_dataset = Dataset.from_list(eval_formatted)

print(f"\nTrain dataset: {len(train_dataset)} samples")
print(f"Eval dataset: {len(eval_dataset)} samples")

# ============================================================
# DEBUG: Length statistics (whitespace word counts, not tokenizer tokens)
# ============================================================
print(f"\n{'='*60}")
print("TOKEN LENGTH DEBUG (sampling 100 examples)")
print(f"{'='*60}")

# Draw up to 100 formatted examples at random and measure their size.
sample_indices = random.sample(range(len(train_formatted)), min(100, len(train_formatted)))
token_lengths = []
for idx in sample_indices:
    sample = train_formatted[idx]
    # Concatenate the raw message contents. The chat template is NOT applied
    # here and no tokenizer is involved.
    full_text = ''.join([m['content'] for m in sample['messages']])
    # Whitespace-separated word count, used as a rough proxy for length.
    # NOTE(review): the real token count may differ a lot for Bengali text;
    # check against the tokenizer before trusting the MAX_SEQ_LENGTH headroom.
    tokens = len(full_text.split())
    token_lengths.append(tokens)

print(f"  Min word count: {min(token_lengths)}")
print(f"  Max word count: {max(token_lengths)}")
print(f"  Mean word count: {sum(token_lengths)/len(token_lengths):.1f}")
print(f"  Max seq length setting: {MAX_SEQ_LENGTH}")

# Print the first training record, cutting each message at 500 characters.
print(f"\n{'='*60}")
print("SAMPLE FORMATTED TRAINING EXAMPLE:")
print(f"{'='*60}")
sample = train_dataset[0]
for msg in sample['messages']:
    print(f"\n[{msg['role'].upper()}]")
    # The conditional expression binds last: truncated text + '...' if long,
    # otherwise the full content.
    print(msg['content'][:500] + '...' if len(msg['content']) > 500 else msg['content'])

# Target-tag frequencies among the first 1000 formatted records.
print(f"\n{'='*60}")
print("TAG DISTRIBUTION DEBUG")
print(f"{'='*60}")
# Rebinds `tag_counts` (a value_counts Series in the earlier cell) to a plain dict.
tag_counts = {}
for item in train_formatted[:1000]:  # first 1000 records, not a random sample
    tag = item['intent']
    tag_counts[tag] = tag_counts.get(tag, 0) + 1
top_tags = sorted(tag_counts.items(), key=lambda x: x[1], reverse=True)[:10]
print(f"  Top 10 tags in first 1000 samples:")
for tag, count in top_tags:
    print(f"    {tag}: {count}")

Created descriptions for 407 tags
Sample: ('abroad_address_change_impossible', 'উপজেলা নির্বাচন অফিসার বরাবর আবেদন করুন')
Built example pool for 407 tags

Formatting training data with 5-shot CoT...


Train: 100%|██████████| 39308/39308 [00:11<00:00, 3395.56it/s]


Formatting evaluation data...


Eval: 100%|██████████| 11457/11457 [00:03<00:00, 3153.37it/s]



Train dataset: 39308 samples
Eval dataset: 11457 samples

TOKEN LENGTH DEBUG (sampling 100 examples)
  Min word count: 574
  Max word count: 760
  Mean word count: 667.1
  Max seq length setting: 2048

SAMPLE FORMATTED TRAINING EXAMPLE:

[SYSTEM]
You are an intent classifier for Bengali NID (National ID) customer service.

Your task: Given a Bengali customer query, analyze the text and classify it into the correct intent.

Available intents with descriptions:
- orphan_no_parent_information_registration: অবশ্যই, আপনি ভোটার হওয়ার বয়সে পৌঁছালে আবেদন করতে পারবেন
- orphan_address_without_parents: সাধারণ স্থানান্তর কার্যক্রম সম্পন্ন হওয়ার বার্তা প্রাপ্ত হলেও কিছুদিন/ কিছুসময় এর জন্য ডাটা লক অবস্থায় থাকতে পারে
- fire_victim_identification_proce...

[USER]
Classify the following Bengali query.

Example 1:
Query: হারিয়ে ফেলা স্মার্ট কার্ডের জায়গায় স্মার্ট কার্ড পাওয়া যাবে কি?
Reasoning: The query contains keywords related to "smart card second print not available". This matches the intent desc

## 7. Load Model and Apply LoRA

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, TaskType, get_peft_model

# Load the tokenizer that belongs to the base checkpoint.
print(f"Loading tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Fall back to EOS as the padding token only when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# IMPORTANT: Use left-padding for decoder-only models during batched generation.
# The accuracy callback slices generated tokens off after the padded prompt
# length, which relies on every prompt ending at the same position.
tokenizer.padding_side = 'left'

# Debug: Tokenizer info
print(f"\n{'='*60}")
print("TOKENIZER DEBUG INFO")
print(f"{'='*60}")
print(f"  Vocab size: {tokenizer.vocab_size}")
print(f"  Pad token: {tokenizer.pad_token} (id={tokenizer.pad_token_id})")
print(f"  EOS token: {tokenizer.eos_token} (id={tokenizer.eos_token_id})")
print(f"  Padding side: {tokenizer.padding_side}")
print(f"  Has chat template: {tokenizer.chat_template is not None}")

# Render a one-message conversation to show what the chat template produces.
test_msgs = [{"role": "user", "content": "test"}]
test_prompt = tokenizer.apply_chat_template(test_msgs, tokenize=False, add_generation_prompt=True)
print(f"  Chat template test: '{test_prompt[:100]}...'")

# Load the base model in bfloat16 and let `device_map="auto"` place it.
print(f"\n{'='*60}")
print(f"LOADING MODEL: {MODEL_NAME}")
print(f"{'='*60}")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Enable gradient checkpointing on the base model before LoRA is applied.
model.gradient_checkpointing_enable()

# Debug: Model info (all parameters are still trainable at this point)
print(f"\nMODEL DEBUG INFO")
print(f"  Total parameters: {model.num_parameters():,}")
print(f"  Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
print(f"  Model dtype: {model.dtype}")
print(f"  Device: {model.device}")
print(f"  Model class: {model.__class__.__name__}")

# Print core architecture sizes when the model exposes a config.
if hasattr(model, 'config'):
    print(f"  Hidden size: {model.config.hidden_size}")
    print(f"  Num layers: {model.config.num_hidden_layers}")
    print(f"  Num heads: {model.config.num_attention_heads}")

Loading tokenizer: Qwen/Qwen2.5-0.5B-Instruct


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]


TOKENIZER DEBUG INFO
  Vocab size: 151643
  Pad token: <|endoftext|> (id=151643)
  EOS token: <|im_end|> (id=151645)
  Padding side: left
  Has chat template: True
  Chat template test: '<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|...'

LOADING MODEL: Qwen/Qwen2.5-0.5B-Instruct


config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]


MODEL DEBUG INFO
  Total parameters: 494,032,768
  Trainable parameters: 494,032,768
  Model dtype: torch.bfloat16
  Device: cuda:0
  Model class: Qwen2ForCausalLM
  Hidden size: 896
  Num layers: 24
  Num heads: 14


In [9]:
# Echo the LoRA hyper-parameters chosen in the configuration cell.
print(f"\n{'='*60}")
print("LORA CONFIGURATION DEBUG")
print(f"{'='*60}")
print(f"  Rank (r): {LORA_R}")
print(f"  Alpha: {LORA_ALPHA}")
print(f"  Dropout: {LORA_DROPOUT}")
print(f"  Target modules: {LORA_TARGET_MODULES}")

# Adapter definition for causal language modelling; no bias terms are trained.
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the base model. `model` is rebound to the PEFT wrapper, so re-running
# this cell without reloading the base model would wrap it a second time.
model = get_peft_model(model, lora_config)

print(f"\nLoRA applied successfully!")
model.print_trainable_parameters()

# Debug: list modules whose name contains "lora". This also matches container
# entries such as `lora_dropout` and `lora_A`, so the count exceeds the number
# of adapted projection layers.
print(f"\nLoRA MODULES DEBUG:")
lora_layers = [name for name, _ in model.named_modules() if 'lora' in name.lower()]
print(f"  Total LoRA modules: {len(lora_layers)}")
print(f"  Sample LoRA layers: {lora_layers[:5]}")


LORA CONFIGURATION DEBUG
  Rank (r): 32
  Alpha: 64
  Dropout: 0.05
  Target modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']

LoRA applied successfully!
trainable params: 17,596,416 || all params: 511,629,184 || trainable%: 3.4393

LoRA MODULES DEBUG:
  Total LoRA modules: 1512
  Sample LoRA layers: ['base_model.model.model.layers.0.self_attn.q_proj.lora_dropout', 'base_model.model.model.layers.0.self_attn.q_proj.lora_dropout.default', 'base_model.model.model.layers.0.self_attn.q_proj.lora_A', 'base_model.model.model.layers.0.self_attn.q_proj.lora_A.default', 'base_model.model.model.layers.0.self_attn.q_proj.lora_B']


## 8. Train

In [10]:
from trl import SFTTrainer, SFTConfig
from transformers import set_seed, TrainerCallback, EarlyStoppingCallback
from sklearn.metrics import accuracy_score
import re

# Seed the random number generators through transformers' `set_seed` helper so
# that training runs are repeatable.
set_seed(SEED)

# ============================================================
# CUSTOM CALLBACK: Intent Accuracy + Early Stopping (Updated for CoT)
# ============================================================

def extract_intent_from_cot(response, intent_tags):
    """Extract the predicted intent tag from a chain-of-thought response.

    Resolution order:
      1. Every 'Intent: <token>' occurrence (case-insensitive) is tried in
         order. Surrounding quotes and punctuation are stripped from the token
         and the first one equal to a known tag is returned.
      2. Otherwise the longest known tag that occurs anywhere in the response
         is returned. Preferring the longest match stops a short tag that is a
         prefix of a longer one (e.g. 'abroad_address' inside
         'abroad_address_change') from winning.

    Args:
        response: generated text; may be empty or None.
        intent_tags: iterable of valid tag strings.

    Returns:
        The matching tag exactly as spelled in `intent_tags`, or None.
    """
    if not response:
        return None
    response = response.strip()

    # Lower-cased tag -> original spelling; the first occurrence wins.
    canonical = {}
    for intent in intent_tags:
        canonical.setdefault(intent.lower(), intent)

    # Look for 'Intent: tag_name' pattern
    for match in re.finditer(r'Intent:\s*(\S+)', response, re.IGNORECASE):
        # Models often write `tag.` or `"tag"`; drop such decoration.
        candidate = match.group(1).strip('"\'`*.,;:!?()[]{}<>').lower()
        if candidate in canonical:
            return canonical[candidate]

    # Fallback: longest intent tag mentioned anywhere in the response
    response_lower = response.lower()
    contained = [intent for intent in intent_tags if intent.lower() in response_lower]
    if contained:
        return max(contained, key=len)

    return None

def create_eval_prompt(question, tag_descriptions, example_pool, intent_tags, num_examples=3, num_tags=15):
    """Build the system + user chat messages used for generation-based eval.

    The candidate intents are the first `num_tags` keys of `tag_descriptions`,
    independent of `question`. The worked examples come from the first
    `num_examples` of those tags that have an entry in `example_pool`, using
    each tag's first pooled question ("sample query" if the list is empty).
    `intent_tags` is accepted but not used.

    NOTE(review): the wording and the tag selection differ from
    format_for_sft_few_shot, and the true tag of `question` is not guaranteed
    to be listed. Confirm this is the intended evaluation prompt.

    Returns:
        [system_message, user_message] as role/content dicts.
    """
    relevant_tags = list(tag_descriptions)[:num_tags]

    tag_lines = []
    for tag in relevant_tags:
        tag_lines.append(f"- {tag}: {tag_descriptions[tag]}")

    # Tags that can contribute a worked example, capped at num_examples.
    demo_tags = [tag for tag in relevant_tags if tag in example_pool][:num_examples]

    examples = []
    for number, demo_tag in enumerate(demo_tags, start=1):
        pooled_questions = example_pool[demo_tag]
        demo_query = pooled_questions[0] if pooled_questions else "sample query"
        demo_desc = tag_descriptions.get(demo_tag, demo_tag.replace('_', ' '))
        spaced_tag = demo_tag.replace("_", " ")
        demo_reasoning = f'The query relates to "{spaced_tag}". This matches: "{demo_desc}".'
        examples.append(
            f"Example {number}:\nQuery: {demo_query}\nReasoning: {demo_reasoning}\nIntent: {demo_tag}"
        )

    intents_block = chr(10).join(tag_lines[:num_tags])
    examples_block = chr(10).join(examples)

    system = (
        "You are an intent classifier for Bengali NID customer service.\n"
        "\n"
        "Available intents:\n"
        f"{intents_block}\n"
        "\n"
        "Instructions: Analyze the query, explain reasoning, output intent tag."
    )

    user = (
        "Classify this Bengali query.\n"
        "\n"
        f"{examples_block}\n"
        "\n"
        "Now classify:\n"
        f"Query: {question}"
    )

    return [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]

class IntentAccuracyCallback(TrainerCallback):
    """Trainer callback that measures generation-based intent accuracy.

    At every evaluation event it generates answers for a fixed random sample
    of the evaluation frame, extracts the predicted tag from each response and
    compares it with the gold tag. It also logs the training loss periodically
    and asks the trainer to stop once accuracy has not improved for `patience`
    consecutive evaluations.

    Accuracy is computed over ALL sampled queries: a response from which no
    tag can be extracted counts as wrong. Scoring only the parseable responses
    would inflate the metric that drives early stopping.
    """

    def __init__(self, eval_df, tokenizer, intent_tags, tag_descriptions, example_pool,
                 max_seq_length, sample_size=300, batch_size=8, patience=3):
        """Store the configuration and draw the fixed evaluation sample.

        Args:
            eval_df: DataFrame with 'question' and 'tag' columns.
            tokenizer: tokenizer used to build and decode prompts.
            intent_tags: list of valid tags.
            tag_descriptions: dict tag -> description, used in the prompt.
            example_pool: dict tag -> example questions, used in the prompt.
            max_seq_length: truncation length for tokenized prompts.
            sample_size: maximum number of evaluation rows to score.
            batch_size: prompts per generation batch.
            patience: evaluations without improvement before stopping.
        """
        # Fixed random_state keeps the same sample across evaluations and runs.
        self.eval_sample = eval_df.sample(n=min(sample_size, len(eval_df)), random_state=42).reset_index(drop=True)
        self.tokenizer = tokenizer
        self.intent_tags = intent_tags
        self.tag_descriptions = tag_descriptions
        self.example_pool = example_pool
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.patience = patience
        self.best_accuracy = 0.0
        self.no_improve_count = 0
        self.history = []        # one {'step', 'intent_accuracy'} dict per evaluation
        self.train_losses = []   # every logged training loss
        print(f"[CALLBACK INIT] Eval samples: {len(self.eval_sample)}, batch_size={batch_size}")

    def on_train_begin(self, args, state, control, **kwargs):
        """Print a banner with the planned number of optimizer steps."""
        print(f"\n{'='*60}")
        print("[DEBUG] TRAINING STARTED")
        print(f"{'='*60}")
        print(f"  Max steps: {state.max_steps}")

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record each logged loss; every 100 steps print it with a 10-log average."""
        if logs and 'loss' in logs:
            self.train_losses.append(logs['loss'])
            if state.global_step % 100 == 0:
                avg = sum(self.train_losses[-10:]) / min(10, len(self.train_losses))
                print(f"  [Step {state.global_step}] loss={logs['loss']:.4f} | avg_10={avg:.4f}")

    def _compute_accuracy_batched(self, model):
        """Generate answers for the evaluation sample and return the accuracy.

        Returns:
            Fraction of sampled queries whose extracted tag equals the gold
            tag (0.0 for an empty sample). Unparseable responses count as wrong.
        """
        predictions = []
        true_labels = self.eval_sample['tag'].tolist()
        debug_responses = []

        # Generate in eval mode, then restore the previous mode even on error.
        was_training = model.training
        model.eval()
        try:
            for i in range(0, len(self.eval_sample), self.batch_size):
                batch_df = self.eval_sample.iloc[i:i+self.batch_size]

                # Create prompts in chat format
                batch_prompts = []
                for q in batch_df['question']:
                    messages = create_eval_prompt(
                        q, self.tag_descriptions, self.example_pool,
                        self.intent_tags, num_examples=3, num_tags=15
                    )
                    prompt = self.tokenizer.apply_chat_template(
                        messages, tokenize=False, add_generation_prompt=True
                    )
                    batch_prompts.append(prompt)

                inputs = self.tokenizer(batch_prompts, return_tensors="pt", padding=True,
                                       truncation=True, max_length=self.max_seq_length)
                inputs = {k: v.to(model.device) for k, v in inputs.items()}

                with torch.no_grad():
                    outputs = model.generate(
                        **inputs,
                        max_new_tokens=100,  # More tokens for reasoning + intent
                        do_sample=False,
                        pad_token_id=self.tokenizer.pad_token_id,
                    )

                # Prompts share one padded length, so everything after it is
                # newly generated text.
                input_len = inputs['input_ids'].shape[1]
                for idx, output in enumerate(outputs):
                    response = self.tokenizer.decode(output[input_len:], skip_special_tokens=True)
                    if i == 0 and idx < 3:
                        debug_responses.append(response[:150])
                    predictions.append(extract_intent_from_cot(response, self.intent_tags))
        finally:
            model.train(was_training)

        # Debug output
        print(f"\n    [DEBUG] Sample CoT outputs (first 3):")
        for resp in debug_responses[:3]:
            print(f"      Raw: '{resp}'")
        print(f"    [DEBUG] Extracted intents vs true (first 3):")
        for pred, true in zip(predictions[:3], true_labels[:3]):
            print(f"      True: {true} | Predicted: {pred}")

        num_none = sum(1 for p in predictions if p is None)
        print(f"    [DEBUG] Valid predictions: {len(predictions) - num_none}/{len(predictions)} ({num_none} returned None)")

        if not predictions:
            return 0.0
        # None never equals a tag string, so unparsed responses score as wrong.
        num_correct = sum(1 for p, t in zip(predictions, true_labels) if p is not None and p == t)
        return num_correct / len(predictions)

    def on_evaluate(self, args, state, control, model, **kwargs):
        """Score the model, track the best accuracy and request early stopping."""
        accuracy = self._compute_accuracy_batched(model)
        self.history.append({'step': state.global_step, 'intent_accuracy': accuracy})

        print(f"\n>>> Intent Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

        # Only a gain of more than 0.001 counts as an improvement.
        if accuracy > self.best_accuracy + 0.001:
            self.best_accuracy = accuracy
            self.no_improve_count = 0
            print(f"    [NEW BEST] Best accuracy: {self.best_accuracy:.4f}")
        else:
            self.no_improve_count += 1
            print(f"    No improvement for {self.no_improve_count}/{self.patience} evals")

        if self.no_improve_count >= self.patience:
            print(f"\n*** EARLY STOPPING: Intent accuracy hasn't improved for {self.patience} evals ***")
            control.should_training_stop = True

        return control

# ============================================================
# TRAINING CONFIGURATION
# ============================================================

# All trainer hyper-parameters; values come from the configuration cell.
training_args = SFTConfig(
    output_dir=OUTPUT_DIR,

    # Training schedule
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,

    # Optimizer: 8-bit AdamW with cosine decay after warmup
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=WARMUP_RATIO,
    lr_scheduler_type="cosine",
    optim="adamw_8bit",

    # Mixed precision (the model was loaded in bfloat16)
    bf16=True,

    # Logging & saving
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=500,  # also how often the intent-accuracy callback runs
    save_strategy="steps",
    save_steps=1000,  # Must be multiple of eval_steps
    save_total_limit=3,

    # Data - Chat template format; sequences are cut to MAX_SEQ_LENGTH tokens
    max_length=MAX_SEQ_LENGTH,
    packing=False,

    # Other
    seed=SEED,
    report_to="none",
    # The checkpoint with the lowest eval_loss is restored when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Generation-based intent accuracy, computed on a fixed sample of the eval set.
intent_callback = IntentAccuracyCallback(
    eval_df=eval_df,
    tokenizer=tokenizer,
    intent_tags=INTENT_TAGS,
    tag_descriptions=TAG_DESCRIPTIONS,
    example_pool=EXAMPLE_POOL,
    max_seq_length=MAX_SEQ_LENGTH,
    sample_size=500,  # score 500 eval queries instead of the class default of 300
    batch_size=EVAL_BATCH_SIZE,
    patience=EARLY_STOPPING_PATIENCE,
)

# Early stopping on validation loss.
# NOTE(review): IntentAccuracyCallback applies its own patience-based stop on
# accuracy, so whichever of the two criteria triggers first ends training.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=EARLY_STOPPING_PATIENCE,
    early_stopping_threshold=0.001,
)

# Build the SFT trainer on the chat-formatted ("messages") datasets.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=training_args,
    callbacks=[early_stopping_callback, intent_callback],
)

# ============================================================
# DEBUG: Training setup info
# ============================================================
print(f"\n{'='*60}")
print("TRAINING SETUP DEBUG INFO")
print(f"{'='*60}")
print(f"  Model: {MODEL_NAME}")
print(f"  Max sequence length: {MAX_SEQ_LENGTH}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Gradient accumulation: {GRAD_ACCUM_STEPS}")
print(f"  Effective batch size: {BATCH_SIZE * GRAD_ACCUM_STEPS}")
print(f"  Learning rate: {LEARNING_RATE}")
print(f"  Epochs: {NUM_EPOCHS}")
# Estimate only: floor division drops the final partial batch of each epoch.
# The callback prints the trainer's own `state.max_steps` when training starts.
print(f"  Total training steps: {len(train_dataset) // (BATCH_SIZE * GRAD_ACCUM_STEPS) * NUM_EPOCHS}")
print(f"  Eval every: {training_args.eval_steps} steps")
print(f"  Save every: {training_args.save_steps} steps")
print(f"  Early stopping patience: {EARLY_STOPPING_PATIENCE}")

# Debug: Check model state before training (parameter counts after LoRA wrapping)
print(f"\nMODEL STATE BEFORE TRAINING:")
print(f"  Model in training mode: {model.training}")
print(f"  Trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
print(f"  Total params: {model.num_parameters():,}")
print(f"  Trainable %: {100 * sum(p.numel() for p in model.parameters() if p.requires_grad) / model.num_parameters():.2f}%")

# Debug: Quick generation test
print(f"\nQUICK GENERATION TEST (before training):")
test_prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello, test"}],
    tokenize=False, add_generation_prompt=True
)
test_inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    test_out = model.generate(**test_inputs, max_new_tokens=20, do_sample=False, pad_token_id=tokenizer.pad_token_id)
test_response = tokenizer.decode(test_out[0][test_inputs['input_ids'].shape[1]:], skip_special_tokens=True)
print(f"  Test output: '{test_response[:100]}'")
print(f"\nTrainer ready!")

[CALLBACK INIT] Eval samples: 500, batch_size=32


Tokenizing train dataset:   0%|          | 0/39308 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/39308 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/11457 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/11457 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



TRAINING SETUP DEBUG INFO
  Model: Qwen/Qwen2.5-0.5B-Instruct
  Max sequence length: 2048
  Batch size: 16
  Gradient accumulation: 4
  Effective batch size: 64
  Learning rate: 2e-05
  Epochs: 2
  Total training steps: 1228
  Eval every: 500 steps
  Save every: 1000 steps
  Early stopping patience: 3

MODEL STATE BEFORE TRAINING:
  Model in training mode: True
  Trainable params: 17,596,416
  Total params: 511,629,184
  Trainable %: 3.44%

QUICK GENERATION TEST (before training):
  Test output: 'Hello! How can I assist you today?'

Trainer ready!


In [None]:
# Train!
print("Starting training...")
print(f"Dataset: ~78k Bengali NID queries, 407 intents")
print(f"Epochs: {NUM_EPOCHS} (with early stopping)")
print(f"Early stopping patience: {EARLY_STOPPING_PATIENCE} evals")
print("")
print("During training you will see:")
print("  - Training loss (token generation)")
print("  - Validation loss (token generation)")
print("  - Intent Accuracy (tag detection on 500 samples)")
print("-" * 50)

trainer.train()

# ============================================================
# POST-TRAINING DEBUG INFO
# ============================================================
print(f"\n{'='*60}")
print("TRAINING COMPLETE - DEBUG SUMMARY")
print(f"{'='*60}")
print(f"  Best intent accuracy: {intent_callback.best_accuracy:.4f} ({intent_callback.best_accuracy*100:.2f}%)")
print(f"  Total evaluations: {len(intent_callback.history)}")
print(f"  Training loss history (last 5): {intent_callback.train_losses[-5:] if intent_callback.train_losses else 'N/A'}")

# Show accuracy progression
print(f"\nACCURACY PROGRESSION:")
for h in intent_callback.history:
    print(f"  Step {h['step']}: {h['intent_accuracy']:.4f}")

# Quick generation test after training
print(f"\nPOST-TRAINING GENERATION TEST:")
test_q = "একাউন্ট লক হয়ে গেছে কি করব?"
test_msgs = create_eval_prompt(test_q, TAG_DESCRIPTIONS, EXAMPLE_POOL, INTENT_TAGS)
test_prompt = tokenizer.apply_chat_template(test_msgs, tokenize=False, add_generation_prompt=True)
test_inputs = tokenizer(test_prompt, return_tensors="pt", truncation=True, max_length=MAX_SEQ_LENGTH).to(model.device)
# Switch to eval mode before generating: the model may still be in training
# mode after trainer.train(), which would leave dropout active while decoding.
model.eval()
with torch.no_grad():
    test_out = model.generate(**test_inputs, max_new_tokens=100, do_sample=False, pad_token_id=tokenizer.pad_token_id)
test_response = tokenizer.decode(test_out[0][test_inputs['input_ids'].shape[1]:], skip_special_tokens=True)
print(f"  Query: {test_q}")
print(f"  Output: {test_response[:200]}")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Starting training...
Dataset: ~78k Bengali NID queries, 407 intents
Epochs: 2 (with early stopping)
Early stopping patience: 3 evals

During training you will see:
  - Training loss (token generation)
  - Validation loss (token generation)
  - Intent Accuracy (tag detection on 500 samples)
--------------------------------------------------

[DEBUG] TRAINING STARTED
  Max steps: 1230


Step,Training Loss,Validation Loss


## 9. Save Model

In [None]:
# Save model to Google Drive
print(f"Saving model to {OUTPUT_DIR}...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Also save intent mappings
import json
with open(f"{OUTPUT_DIR}/intent_mappings.json", "w", encoding="utf-8") as f:
    json.dump({"id2intent": ID2INTENT, "intent2id": INTENT2ID}, f, ensure_ascii=False, indent=2)

print(f"Model saved successfully!")
print(f"Files in {OUTPUT_DIR}:")
!ls -la {OUTPUT_DIR}

## 10. Evaluate

In [None]:
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from collections import Counter
import re

def extract_intent_cot(response, intent_tags):
    """Extract the predicted intent tag from a CoT model response.

    Resolution order:
      1. The first ``Intent: <token>`` occurrence (case-insensitive). The token
         is matched against ``intent_tags`` exactly, then again with wrapping
         punctuation/quotes removed (e.g. ``card_lost.`` or ``"card_lost"``).
      2. Fallback: any tag that appears as a substring of the response. A tag
         that is itself contained in another matching tag is skipped, so a
         short tag cannot shadow a longer, more specific one. Among the
         remaining candidates the first in ``intent_tags`` order wins.

    Args:
        response: Raw generated text; ``None`` or empty yields ``None``.
        intent_tags: Iterable of known intent tag strings.

    Returns:
        The matching tag with its original spelling from ``intent_tags``,
        or ``None`` when no tag can be identified.
    """
    if not response:
        return None
    response = response.strip()
    intent_tags = list(intent_tags)

    # Case-insensitive lookup back to the canonical spelling; the first
    # occurrence wins, matching a front-to-back scan of the tag list.
    canonical = {}
    for intent in intent_tags:
        canonical.setdefault(intent.lower(), intent)

    # Look for 'Intent: tag_name' pattern
    intent_match = re.search(r'Intent:\s*(\S+)', response, re.IGNORECASE)
    if intent_match:
        raw = intent_match.group(1).strip().lower()
        # \S+ also captures trailing punctuation, quotes or markdown markers,
        # so retry with those stripped when the raw token is not a known tag.
        for candidate in (raw, raw.strip('.,;:!?"\'`()[]{}<>*।')):
            if candidate in canonical:
                return canonical[candidate]

    # Fallback: check if any intent tag appears
    response_lower = response.lower()
    found = [intent for intent in intent_tags if intent.lower() in response_lower]
    for intent in found:
        lowered = intent.lower()
        shadowed = any(
            lowered != other.lower() and lowered in other.lower()
            for other in found
        )
        if not shadowed:
            return intent

    return None

def create_inference_prompt(question, tag_descriptions, example_pool, num_examples=3, num_tags=15):
    """Build the few-shot chat messages used to classify one query.

    Only the first ``num_tags`` keys of ``tag_descriptions`` are listed in the
    system message. Worked examples are rendered for the first ``num_examples``
    of those tags that also have an entry in ``example_pool``.

    Args:
        question: The query text to classify.
        tag_descriptions: Mapping of intent tag -> description.
        example_pool: Mapping of intent tag -> list of example queries.
        num_examples: Maximum number of worked examples in the user message.
        num_tags: Maximum number of tags listed in the system message.

    Returns:
        A two-element list of chat messages: a system message and a user message.
    """
    shown_tags = list(tag_descriptions.keys())[:num_tags]
    tag_block = "\n".join(f"- {tag}: {tag_descriptions[tag]}" for tag in shown_tags)

    def render_example(position, tag):
        # Use the first pooled query for the tag, or a placeholder if the pool is empty.
        pooled = example_pool[tag]
        sample_query = pooled[0] if pooled else "sample"
        description = tag_descriptions.get(tag, '')
        readable_tag = tag.replace("_", " ")
        reasoning = f'This query relates to "{readable_tag}". Matches description: "{description[:80]}"'
        return f"Example {position}:\nQuery: {sample_query}\nReasoning: {reasoning}\nIntent: {tag}"

    tags_with_examples = [tag for tag in shown_tags if tag in example_pool]
    example_block = "\n".join(
        render_example(position, tag)
        for position, tag in enumerate(tags_with_examples[:num_examples], start=1)
    )

    system_content = (
        "You are an intent classifier for Bengali NID customer service.\n"
        "\n"
        "Available intents:\n"
        f"{tag_block}\n"
        "\n"
        "Instructions: Analyze query, explain reasoning, output intent tag."
    )
    user_content = f"{example_block}\n\nNow classify:\nQuery: {question}"

    return [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
    ]

def evaluate_model_cot(model, tokenizer, eval_df, tag_descriptions, example_pool,
                       batch_size=EVAL_BATCH_SIZE, num_samples=None):
    """Evaluate the model with the CoT few-shot prompt format.

    Puts the model in eval mode, builds one chat prompt per row of
    ``eval_df['question']``, generates greedily in batches and parses each
    continuation with ``extract_intent_cot`` against the global ``INTENT_TAGS``.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer with chat-template support.
        eval_df: DataFrame with ``question`` and ``tag`` columns.
        tag_descriptions: Mapping of intent tag -> description (for prompts).
        example_pool: Mapping of intent tag -> example queries (for prompts).
        batch_size: Number of prompts generated per batch.
        num_samples: If truthy, evaluate on a fixed random sample
            (``random_state=42``) of at most this many rows.

    Returns:
        ``(predictions, true_labels)``: two parallel lists. ``predictions``
        holds the parsed intent tag or ``None`` per row; ``true_labels`` holds
        the corresponding ``tag`` column values.
    """
    model.eval()

    if num_samples:
        eval_df = eval_df.sample(n=min(num_samples, len(eval_df)), random_state=42).reset_index(drop=True)

    predictions = []
    true_labels = []
    raw_outputs = []
    num_shown = 3  # number of raw responses echoed for a sanity check

    # Batched generation needs left padding: the continuation is sliced off at
    # input_ids.shape[1], which is only the true prompt end for every row when
    # the pad tokens sit on the left. Restore the caller's setting afterwards.
    original_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        num_batches = (len(eval_df) + batch_size - 1) // batch_size
        for i in tqdm(range(0, len(eval_df), batch_size), total=num_batches, desc="Evaluating (CoT)"):
            batch_df = eval_df.iloc[i:i+batch_size]

            # Create prompts with chat template
            batch_prompts = []
            for q in batch_df['question']:
                messages = create_inference_prompt(q, tag_descriptions, example_pool)
                prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
                batch_prompts.append(prompt)

            inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True,
                              truncation=True, max_length=MAX_SEQ_LENGTH)
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=100,  # More tokens for reasoning
                    do_sample=False,
                    pad_token_id=tokenizer.pad_token_id,
                )

            input_len = inputs['input_ids'].shape[1]
            for output in outputs:
                response = tokenizer.decode(output[input_len:], skip_special_tokens=True)
                if len(raw_outputs) < num_shown:
                    raw_outputs.append(response[:200])
                predictions.append(extract_intent_cot(response, INTENT_TAGS))

            true_labels.extend(batch_df['tag'].tolist())
    finally:
        tokenizer.padding_side = original_padding_side

    # Show sample outputs
    print("\nSample CoT outputs:")
    for i, resp in enumerate(raw_outputs):
        print(f"  [{i+1}] {resp}")

    return predictions, true_labels

# Run the CoT evaluation on a fixed-size random subset of the eval set
num_eval_samples = 1000
divider = '=' * 60
print(f"\n{divider}")
print("EVALUATION DEBUG")
print(divider)
print(f"  Model device: {model.device}")
print(f"  Model training mode: {model.training}")
print(f"  Batch size: {EVAL_BATCH_SIZE}")
print(f"  Num samples: {num_eval_samples}")
print("\nStarting evaluation...")

predictions, true_labels = evaluate_model_cot(
    model, tokenizer, eval_df, TAG_DESCRIPTIONS, EXAMPLE_POOL, num_samples=num_eval_samples
)

# Summarise how many outputs could not be parsed and which tags dominate
print("\nPREDICTION DEBUG:")
null_preds = predictions.count(None)
null_pct = 100*null_preds/len(predictions)
print(f"  Null predictions: {null_preds}/{len(predictions)} ({null_pct:.1f}%)")
pred_counts = Counter(p for p in predictions if p)
top_preds = pred_counts.most_common(5)
print(f"  Top 5 predicted tags: {top_preds}")

In [None]:
# Compute metrics
# valid_mask marks rows where an intent tag could be parsed from the output.
valid_mask = [p is not None for p in predictions]
valid_preds = [INTENT2ID.get(p, -1) for p in predictions]
valid_true = [INTENT2ID.get(t, -1) for t in true_labels]

# Filter valid: keep only rows whose prediction was parsed and maps to a known id
scored_pairs = [
    (t, p) for t, p, m in zip(valid_true, valid_preds, valid_mask) if m and p != -1
]
filtered_true = [t for t, _ in scored_pairs]
filtered_preds = [p for _, p in scored_pairs]

# Calculate metrics (over parsed predictions only)
if len(filtered_preds) > 0:
    accuracy = accuracy_score(filtered_true, filtered_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        filtered_true, filtered_preds, average="weighted", zero_division=0
    )
else:
    accuracy = precision = recall = f1 = 0.0

# Overall accuracy counts every sample; an unparsed output is a miss. The
# metrics above drop those rows, which overstates quality when the model
# often fails to emit a recognisable tag.
total_samples = len(predictions)
num_valid = sum(valid_mask)
num_correct = sum(
    1 for p, t in zip(predictions, true_labels) if p is not None and p == t
)
overall_accuracy = num_correct / total_samples if total_samples else 0.0
valid_pct = 100 * num_valid / total_samples if total_samples else 0.0

print("=" * 50)
print("EVALUATION RESULTS")
print("=" * 50)
print(f"Total samples: {total_samples}")
print(f"Valid predictions: {num_valid} ({valid_pct:.1f}%)")
print(f"")
print(f"Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Overall accuracy (unparsed outputs counted as wrong): {overall_accuracy:.4f} ({overall_accuracy*100:.2f}%)")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print("=" * 50)

# Show top confusions
print("\nTop 10 Confusions:")
confusions = [(t, p) for p, t in zip(predictions, true_labels) if p != t and p is not None]
for (true, pred), count in Counter(confusions).most_common(10):
    print(f"  {true} -> {pred}: {count}")

## 11. Interactive Inference

In [None]:
def classify_intent_cot(query, model, tokenizer, tag_descriptions, example_pool):
    """Run CoT intent classification on one query.

    Builds the few-shot chat prompt, generates greedily (up to 100 new tokens)
    and parses the continuation against the global ``INTENT_TAGS``.

    Returns:
        ``(intent, response)``: the parsed intent tag (or ``None``) and the
        decoded model continuation.
    """
    chat_messages = create_inference_prompt(query, tag_descriptions, example_pool)
    prompt_text = tokenizer.apply_chat_template(chat_messages, tokenize=False, add_generation_prompt=True)

    encoded = tokenizer(prompt_text, return_tensors="pt", truncation=True, max_length=MAX_SEQ_LENGTH)
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    generation_kwargs = dict(
        max_new_tokens=100,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
    )
    with torch.no_grad():
        generated = model.generate(**encoded, **generation_kwargs)

    # Drop the prompt tokens so only the newly generated text is decoded.
    prompt_length = encoded["input_ids"].shape[1]
    response = tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)

    return extract_intent_cot(response, INTENT_TAGS), response

# Get answer for intent from tag_answer_df
def get_answer(intent):
    """Return the answer stored for ``intent`` in ``tag_answer_df``.

    Uses the first row whose ``tag`` equals ``intent``; when no row matches,
    a fixed Bengali fallback message is returned instead.
    """
    matches = tag_answer_df.loc[tag_answer_df['tag'] == intent]
    if len(matches) == 0:
        return "উত্তর পাওয়া যায়নি।"
    return matches.iloc[0]['answer']

# Sample Bengali queries for a quick end-to-end check
test_queries = [
    "আমার এনআইডি একাউন্ট লক হয়ে গেছে, কিভাবে আনলক করবো?",
    "কার্ড হারিয়ে গেলে কি করতে হবে?",
    "জাতীয় পরিচয়পত্রে নাম সংশোধন করতে চাই",
    "ভোটার আইডি কার্ডের ঠিকানা পরিবর্তন করতে কি কি লাগবে?",
    "স্মার্ট কার্ড কবে পাবো?",
]

# Classify each query, then print the model output, parsed intent and answer
print("Testing with Bengali queries (CoT inference):")
print("=" * 70)
for query in test_queries:
    intent, reasoning = classify_intent_cot(query, model, tokenizer, TAG_DESCRIPTIONS, EXAMPLE_POOL)
    answer = get_answer(intent) if intent else "Intent not recognized"

    print(f"Query: {query}")
    # Long model outputs are cut to 150 characters for display
    if len(reasoning) > 150:
        print(f"Model output: {reasoning[:150]}...")
    else:
        print(f"Model output: {reasoning}")
    print(f"Intent: {intent}")
    # Long answers are cut to 100 characters for display
    if len(answer) > 100:
        print(f"Answer: {answer[:100]}...")
    else:
        print(f"Answer: {answer}")
    print("-" * 70)

## 12. Push to HuggingFace Hub

In [None]:
# Authenticate with the Hugging Face Hub before running the push cell.
from huggingface_hub import login
login()

In [None]:
# Upload the model and the tokenizer to the same Hub repository
HF_REPO_NAME = "ehzawad/smollm2-bengali-nid-intent"

print(f"Pushing model to HuggingFace Hub: {HF_REPO_NAME}")

for artifact in (model, tokenizer):
    artifact.push_to_hub(HF_REPO_NAME)

print("\nModel uploaded successfully!")
print(f"View at: https://huggingface.co/{HF_REPO_NAME}")

## Done!

Your Bengali NID intent classification model is now:
- Saved to Google Drive at: `/content/drive/MyDrive/models/smollm2-bengali-nid-intent`
- Pushed to HuggingFace Hub

To load the model later:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # must match the MODEL_NAME used for training
model = PeftModel.from_pretrained(base_model, "ehzawad/smollm2-bengali-nid-intent")
tokenizer = AutoTokenizer.from_pretrained("ehzawad/smollm2-bengali-nid-intent")
```