In [None]:
# === Cell 1: Install & Import Libraries ===

# Ensure necessary libraries are installed (run once if needed)
# !pip install transformers[torch] datasets pandas scikit-learn accelerate tensorboard -U # Added tensorboard just in case

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset, DatasetDict, ClassLabel
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)
import numpy as np
import warnings
import json # To save label mapping
import os # To help with saving/loading paths

# Optional: silence library warnings so the cell outputs stay readable.
warnings.filterwarnings("ignore")

print("Libraries imported successfully.")

# Select the compute device. CUDA has priority; the MPS backend is only
# queried when CUDA is unavailable, and CPU is the last resort.
if not torch.cuda.is_available():
    if torch.backends.mps.is_available():  # For MacOS
        device = torch.device("mps")
        print("Using MPS (Apple Silicon GPU)")
    else:
        device = torch.device("cpu")
        print("Using CPU")
else:
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")

Libraries imported successfully.
Using GPU: Tesla T4


In [None]:
# === Cell 2: Configuration ===

# --- Data source ---
# IMPORTANT: point csv_file_path at your own intent dataset CSV.
csv_file_path = 'intent_dataset_sampled.csv'
text_column = 'text'      # column holding the user utterance
intent_column = 'intent'  # column holding the intent label

# --- Train/test split ---
test_size = 0.2    # fraction of rows held out for the test split
random_state = 42  # seed used when splitting, for reproducibility

# --- Model ---
model_name = 'distilbert-base-uncased'

# --- Output locations ---
output_dir = './intent_model_results'  # model outputs
logging_dir = './intent_model_logs'    # training logs

# --- Training hyperparameters ---
num_train_epochs = 3
# Tune the batch sizes to what fits in your GPU memory.
per_device_train_batch_size = 32
per_device_eval_batch_size = 64
learning_rate = 2e-5
weight_decay = 0.01

# --- Label mapping ---
# num_labels and the intent <-> id mappings are derived from the data in Cell 4;
# only the location of the saved mapping file is fixed here.
label_map_file = os.path.join(output_dir, 'label_mapping.json')

print("Configuration set for Intent Classification.")

Configuration set for Intent Classification.


In [None]:
# === Cell 3: Load Data ===

# Read the CSV into a scratch frame and strip stray whitespace from the header
# and from both configured columns. `df` is bound only after every step has
# succeeded, so a failed load (e.g. a missing column) cannot leave a
# half-cleaned frame behind that would satisfy the `'df' in locals()` guard
# of the preprocessing cell.
try:
    loaded_df = pd.read_csv(csv_file_path)
    # Basic cleaning: remove leading/trailing whitespace from column names and values
    loaded_df.columns = loaded_df.columns.str.strip()
    loaded_df[text_column] = loaded_df[text_column].str.strip()
    loaded_df[intent_column] = loaded_df[intent_column].str.strip()
except FileNotFoundError:
    print(f"Error: File not found at {csv_file_path}. Please check the path.")
    # Optional: raise SystemExit()
except KeyError as e:
    # str(KeyError) is already the quoted repr of the key, so interpolate the
    # raw key instead to avoid doubled quotes in the message.
    missing_column = e.args[0] if e.args else e
    print(f"Error: Column '{missing_column}' not found. Check column names in config (text_column, intent_column).")
    # Optional: raise SystemExit()
except Exception as e:
    print(f"An error occurred while loading the data: {e}")
    # Optional: raise SystemExit()
else:
    df = loaded_df

    print(f"Successfully loaded data from {csv_file_path}")
    print(f"Dataset shape: {df.shape}")
    print("\nFirst 5 rows:")
    print(df.head())
    print(f"\nIntent distribution ({intent_column} column):")
    print(df[intent_column].value_counts())

Successfully loaded data from intent_dataset_sampled.csv
Dataset shape: (1912, 2)

First 5 rows:
                                                text   intent
0                    can I contact customer service?  Contact
1  can you help me talking with customer assistance?  Contact
2                           wanna chat with a person  Contact
3  help me check what hours I can contact custome...  Contact
4             I have got to speak to a damn operator  Contact

Intent distribution (intent column):
intent
Contact     478
Feedback    478
Payment     478
Refund      478
Name: count, dtype: int64


In [None]:
# === Cell 4: Preprocess Data, Encode Labels & Create Datasets ===

# Clean `df`, encode intents as integer ids, build the stratified train/test
# DatasetDict and persist the intent <-> id mapping for later inference.
if 'df' in locals(): # Proceed only if df was loaded successfully
    # 1. Handle missing values
    df = df.dropna(subset=[text_column, intent_column])
    df[text_column] = df[text_column].astype(str)
    df[intent_column] = df[intent_column].astype(str)
    print(f"Shape after dropping NaNs: {df.shape}")

    # 2. Encode Intent Labels
    # Sorting makes the intent -> id assignment deterministic across runs.
    unique_intents = sorted(df[intent_column].unique())
    num_labels = len(unique_intents) # Determine num_labels dynamically
    print(f"\nFound {num_labels} unique intents: {unique_intents}")

    # Create mappings
    intent2id = {intent: i for i, intent in enumerate(unique_intents)}
    id2intent = {i: intent for i, intent in enumerate(unique_intents)}

    # Apply mapping to create the 'labels' column (integer representation)
    df['labels'] = df[intent_column].map(intent2id)

    # Check if any intents failed to map (shouldn't happen with this logic)
    if df['labels'].isnull().any():
        print("Warning: Some intents failed to map to IDs. Check data.")
        # Optional: df = df.dropna(subset=['labels'])

    print("\nDataFrame sample with encoded labels:")
    print(df[[text_column, intent_column, 'labels']].head())

    # 3. Select relevant columns for Dataset (text and integer labels)
    df_final = df[[text_column, 'labels']].rename(columns={text_column: 'text'})

    # 4. Convert pandas DataFrame to Hugging Face Dataset
    # preserve_index=False: once dropna() has removed rows the index is no
    # longer a plain range, and from_pandas would otherwise store it as an
    # extra `__index_level_0__` column in the dataset.
    hg_dataset = Dataset.from_pandas(df_final, preserve_index=False)

    # Cast 'labels' to ClassLabel so the label names are embedded in the
    # dataset features; the stratified split below relies on this column.
    class_label_feature = ClassLabel(num_classes=num_labels, names=unique_intents)
    hg_dataset = hg_dataset.cast_column("labels", class_label_feature)
    print("\nCasted 'labels' column to ClassLabel feature.")


    # 5. Split into train and test sets, keeping the intent proportions equal in both
    train_test_split_dataset = hg_dataset.train_test_split(test_size=test_size, seed=random_state, stratify_by_column="labels")

    # Create a DatasetDict structure
    dataset_dict = DatasetDict({
        'train': train_test_split_dataset['train'],
        'test': train_test_split_dataset['test']
    })

    print("\nDataset structure:")
    print(dataset_dict)
    print("\nTrain dataset features:")
    print(dataset_dict['train'].features) # Show features including ClassLabel info


    # 6. Save the label mapping (important for inference later!)
    # JSON object keys are always strings, so the integer keys of id2intent
    # must be converted back with int() when the file is loaded.
    os.makedirs(output_dir, exist_ok=True) # Ensure output directory exists
    try:
        with open(label_map_file, 'w') as f:
            json.dump({'intent2id': intent2id, 'id2intent': id2intent}, f, indent=4)
        print(f"\nLabel mapping saved to {label_map_file}")
    except Exception as e:
        print(f"\nError saving label mapping: {e}")

else:
    print("\nSkipping preprocessing as data loading failed.")

Shape after dropping NaNs: (1912, 2)

Found 4 unique intents: ['Contact', 'Feedback', 'Payment', 'Refund']

DataFrame sample with encoded labels:
                                                text   intent  labels
0                    can I contact customer service?  Contact       0
1  can you help me talking with customer assistance?  Contact       0
2                           wanna chat with a person  Contact       0
3  help me check what hours I can contact custome...  Contact       0
4             I have got to speak to a damn operator  Contact       0


Casting the dataset:   0%|          | 0/1912 [00:00<?, ? examples/s]


Casted 'labels' column to ClassLabel feature.

Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 1529
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 383
    })
})

Train dataset features:
{'text': Value(dtype='string', id=None), 'labels': ClassLabel(names=['Contact', 'Feedback', 'Payment', 'Refund'], id=None)}

Label mapping saved to ./intent_model_results/label_mapping.json


In [None]:
# === Cell 5: Tokenization ===
# (This cell remains largely the same as the sentiment analysis version)

if 'dataset_dict' not in locals():
    print("\nSkipping tokenization as dataset creation failed.")
else:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"Tokenizer loaded for model: {model_name}")

    def tokenize_function(examples):
        """Tokenize the "text" field of a batch, padded/truncated to 128 tokens."""
        # Lower max_length if your utterances are reliably shorter.
        return tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=128,
        )

    # Tokenize both splits in batches, then drop the raw text column.
    tokenized_datasets = dataset_dict.map(tokenize_function, batched=True).remove_columns(["text"])
    # Return PyTorch tensors when rows are accessed.
    tokenized_datasets.set_format("torch")

    print("\nTokenized dataset structure:")
    print(tokenized_datasets)
    print("\nExample of tokenized input:")
    print(tokenized_datasets['train'][0])

Tokenizer loaded for model: distilbert-base-uncased


Map:   0%|          | 0/1529 [00:00<?, ? examples/s]

Map:   0%|          | 0/383 [00:00<?, ? examples/s]


Tokenized dataset structure:
DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1529
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 383
    })
})

Example of tokenized input:
{'labels': tensor(0), 'input_ids': tensor([ 101, 2097, 2017, 2265, 2033, 2012, 2054, 2051, 8013, 2326, 2800, 2003,
        1029,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,  

In [None]:
# === Cell 6: Load Model ===
# (This cell also remains largely the same)

if 'tokenized_datasets' not in locals() or 'num_labels' not in globals():
    print("\nSkipping model loading as tokenization or label determination failed.")
else:
    # The classification head is sized from num_labels, which Cell 4 derived from the data.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
    )
    model.to(device)

    print(f"\nModel '{model_name}' loaded for sequence classification with {num_labels} labels.")
    print(f"Model moved to device: {device}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model 'distilbert-base-uncased' loaded for sequence classification with 4 labels.
Model moved to device: cuda


In [None]:
# === Cell 7: Define Metrics ===
# (This cell remains the same - weighted metrics are suitable for multi-class intent)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: A (logits, labels) pair; the predicted class of each row
            is the argmax of its logits over the last axis.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    # 'weighted' averaging weights each class by its support, which suits
    # potentially imbalanced intents.
    weighted_scores = precision_recall_fscore_support(
        labels, predicted_ids, average='weighted'
    )

    return {
        'accuracy': accuracy_score(labels, predicted_ids),
        'f1': weighted_scores[2],
        'precision': weighted_scores[0],
        'recall': weighted_scores[1],
    }

print("Metrics computation function defined (for multi-class intent).")

Metrics computation function defined (for multi-class intent).


In [None]:
# === Cell 8: Training Arguments ===
# (Remains similar, uses parameters from Cell 2)

# Build the TrainingArguments from the values configured in Cell 2.
if 'model' in locals():
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        logging_dir=logging_dir,
        logging_steps=50, # Log every 50 steps
        # `eval_strategy` is the newer argument name. On transformers releases
        # that predate it, use `evaluation_strategy="epoch"` instead.
        eval_strategy="epoch",
        # load_best_model_at_end requires the save strategy to match the
        # evaluation strategy.
        save_strategy="epoch",

        load_best_model_at_end=True,
        metric_for_best_model="f1", # Optimize for weighted F1
        greater_is_better=True,
        report_to="tensorboard", # Or "none" if tensorboard not installed/wanted
        fp16=torch.cuda.is_available(), # Enable mixed precision on CUDA
        dataloader_num_workers=4, # Optional: for potentially faster data loading
        # Tie the training seed to the configured random_state so that the
        # split and the training run are controlled by the same setting.
        seed=random_state,
    )

    print("TrainingArguments defined.")
else:
    print("\nSkipping TrainingArguments definition as model loading failed.")

TrainingArguments defined.


In [None]:
# === Cell 9: Initialize Trainer ===
# (Remains the same)

if 'model' not in locals() or 'tokenized_datasets' not in locals() or 'training_args' not in locals():
    print("\nSkipping Trainer initialization due to previous errors.")
else:
    # Train on the 'train' split; the 'test' split doubles as the evaluation set.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    print("Trainer initialized successfully.")

Trainer initialized successfully.


In [None]:
# === Cell 10: Fine-tuning ===
# (Remains the same)

# Run fine-tuning and report the training metrics.
if 'trainer' in locals():
    print("\nStarting model fine-tuning for Intent Classification...")
    try:
        train_result = trainer.train()
        print("Training finished.")
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        # trainer.save_metrics("train", metrics)
    except Exception as e:
        print(f"\nAn error occurred during training: {e}")
        # Re-raise so a "Run All" stops here. Otherwise the following cells,
        # which only check that `trainer` exists, would save and evaluate an
        # untrained or partially trained model as the final one.
        raise
else:
    print("\nSkipping training.")


Starting model fine-tuning for Intent Classification...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.211356,0.997389,0.997389,0.997416,0.997389
2,0.810300,0.046453,0.997389,0.997389,0.997416,0.997389
3,0.102700,0.035705,0.997389,0.997389,0.997416,0.997389


Training finished.
***** train metrics *****
  epoch                    =        3.0
  total_flos               =   141479GF
  train_loss               =     0.3307
  train_runtime            = 0:00:36.20
  train_samples_per_second =    126.687
  train_steps_per_second   =      3.977


In [None]:
# === Cell 11: Save the Final Model & Label Mapping ===

if 'trainer' not in locals():
    print("\nSkipping final model saving.")
else:
    # Write the trainer's current model, plus the tokenizer, into a dedicated
    # sub-directory of output_dir.
    final_model_dir = os.path.join(output_dir, "final_model")
    print(f"\nSaving the final model state to {final_model_dir}...")
    try:
        trainer.save_model(final_model_dir)
        tokenizer.save_pretrained(final_model_dir)
        print(f"Final model and tokenizer saved to {final_model_dir}")

        # Keep a copy of the label mapping next to the model files, provided
        # both mappings exist in this session.
        if {'intent2id', 'id2intent'} <= globals().keys():
            label_map_final_path = os.path.join(final_model_dir, 'label_mapping.json')
            with open(label_map_final_path, 'w') as fh:
                json.dump({'intent2id': intent2id, 'id2intent': id2intent}, fh, indent=4)
            print(f"Label mapping also saved to {label_map_final_path}")

    except Exception as e:
        print(f"\nAn error occurred during final model saving: {e}")


Saving the final model state to ./intent_model_results/final_model...
Final model and tokenizer saved to ./intent_model_results/final_model
Label mapping also saved to ./intent_model_results/final_model/label_mapping.json


In [None]:
# === Cell 12: Inference (Testing on New Data) ===

import json
import os

# --- Ensure paths are correct ---
# Directory where the final model and tokenizer were saved
model_load_path = os.path.join(output_dir, "final_model")
# The label mapping file sits inside the final model directory
label_map_path = os.path.join(model_load_path, 'label_mapping.json')

try:
    # 1. Load the label mapping
    if os.path.exists(label_map_path):
        with open(label_map_path, 'r') as f:
            saved_maps = json.load(f)
        # JSON object keys are always strings: convert them back to integer ids
        id2intent_loaded = {int(k): v for k, v in saved_maps['id2intent'].items()}
        intent2id_loaded = saved_maps['intent2id']
        print(f"Label mapping loaded from {label_map_path}")
        print(f"id2intent map: {id2intent_loaded}")
    else:
        print(f"Error: Label mapping file not found at {label_map_path}")
        id2intent_loaded = None # Set to None to prevent pipeline usage below if map missing

    # Proceed only if mapping was loaded
    if id2intent_loaded:
        # 2. Load the pipeline
        # Reuse the device selected in Cell 1 so inference runs on the same
        # accelerator as training; a CUDA-or-CPU check here would force CPU
        # on Apple Silicon (MPS).
        intent_pipeline = pipeline(
            "text-classification",
            model=model_load_path,
            tokenizer=model_load_path,
            device=device
        )
        print(f"\nIntent classification pipeline loaded from {model_load_path}.")

        # --- Test with some example utterances ---
        utterances = [
            "can I talk to someone?",
            "I want to know my account balance", # Example of a different intent
            "help contacting support",
            "what time do you close?", # Example
            "speak to agent"
        ]

        print("\n--- Testing Pipeline ---")
        results = intent_pipeline(utterances)

        for text, result in zip(utterances, results):
            raw_label = result['label']
            if raw_label in intent2id_loaded:
                # The label is already an intent name; nothing to translate.
                predicted_intent = raw_label
            else:
                # Generic labels look like LABEL_0, LABEL_1, ...: take the
                # trailing id and map it back with the loaded mapping.
                try:
                    predicted_id = int(raw_label.split('_')[-1])
                    predicted_intent = id2intent_loaded.get(predicted_id, "Unknown ID")
                except Exception as e:
                    print(f"Could not parse label {result['label']}: {e}")
                    predicted_intent = "Error Parsing Label"

            print(f"\nUtterance: {text}")
            print(f"Predicted Intent: {predicted_intent} (Confidence: {result['score']:.4f})")

    else:
        print("\nCannot run inference pipeline because label mapping failed to load.")


except FileNotFoundError:
    print(f"\nError loading model/tokenizer. Ensure path is correct: {model_load_path}")
except Exception as e:
    print(f"\nAn error occurred during inference setup or execution: {e}")

Device set to use cuda:0


Label mapping loaded from ./intent_model_results/final_model/label_mapping.json
id2intent map: {0: 'Contact', 1: 'Feedback', 2: 'Payment', 3: 'Refund'}

Intent classification pipeline loaded from ./intent_model_results/final_model.

--- Testing Pipeline ---

Utterance: can I talk to someone?
Predicted Intent: Contact (Confidence: 0.9928)

Utterance: I want to know my account balance
Predicted Intent: Payment (Confidence: 0.9352)

Utterance: help contacting support
Predicted Intent: Contact (Confidence: 0.9925)

Utterance: what time do you close?
Predicted Intent: Contact (Confidence: 0.9743)

Utterance: speak to agent
Predicted Intent: Contact (Confidence: 0.9932)
