<a href="https://colab.research.google.com/github/dushyant3615/AI_Voice_Chatbot/blob/main/Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip -q install torch transformers datasets pandas speechrecognition

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
pip -q install pydub

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
from google.colab import userdata
userdata.get('wandb_login')

'89473432b2be8cead22eda17275063a514ec2e88'

In [35]:
!pip install -q evaluate

In [37]:
import os
import json
import pandas as pd
import speech_recognition as sr
from pydub import AudioSegment
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset, ClassLabel # Removed load_metric from datasets
import torch
import numpy as np
from sklearn.model_selection import train_test_split
import random
from evaluate import load_metric # Import load_metric from evaluate

# Set seed for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

# Load Clinc AI dataset
def load_clinc_data(file_path):
    print(f"Loading Clinc AI data from: {file_path}")
    with open(file_path, "r") as file:
        data = json.load(file)

    queries = [item[0] for item in data]  # User query
    intents = [item[1] for item in data]  # Intent
    print(f"Loaded {len(queries)} queries from Clinc AI dataset.")
    return pd.DataFrame({"query": queries, "intent": intents})

clinc_data = load_clinc_data("/content/drive/MyDrive/AI_voice_chatbot/Clinc_AI_Dataset/train.json")
print(f"Clinc AI data shape: {clinc_data.shape}")

# Load Mozilla Common Voice data (optional, not used for training here)
def load_mozilla_data(csv_path, audio_folder):
    print(f"Loading Mozilla Common Voice data from CSV: {csv_path}")
    df = pd.read_csv(csv_path, sep="\t")  # TSV file
    print(f"Loaded {len(df)} entries from Mozilla Common Voice CSV.")

    sentences = df['sentence'].tolist()
    audio_files = [os.path.join(audio_folder, row['path']) for _, row in df.iterrows()]

    return pd.DataFrame({"sentence": sentences, "audio_path": audio_files})

mozilla_data = load_mozilla_data(
    "/content/drive/MyDrive/AI_voice_chatbot/Mozilla_Common_Voice_Dataset/cv-corpus-20.0-delta-2024-12-06/en/validated.tsv",
    "/content/drive/MyDrive/AI_voice_chatbot/Mozilla_Common_Voice_Dataset/cv-corpus-20.0-delta-2024-12-06/en/clips"
)
print(f"Mozilla Common Voice data shape before filtering: {mozilla_data.shape}")

# Audio to text transcription function (optional)
def audio_to_text(audio_path):
    recognizer = sr.Recognizer()

    if not os.path.exists(audio_path):
        print(f"File not found: {audio_path}")
        return ""

    if audio_path.endswith(".mp3"):
        try:
            audio = AudioSegment.from_mp3(audio_path)
            wav_path = audio_path.replace(".mp3", ".wav")
            audio.export(wav_path, format="wav")
            audio_path = wav_path
        except Exception as e:
            print(f"Error converting {audio_path} to WAV: {e}")
            return ""

    try:
        with sr.AudioFile(audio_path) as source:
            audio = recognizer.record(source)
            return recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        print(f"Could not transcribe audio: {audio_path}")
        return ""
    except sr.RequestError:
        print(f"API error for audio: {audio_path}")
        return ""
    except Exception as e:
        print(f"Unexpected error for audio: {audio_path}: {e}")
        return ""

# Filter missing audio files (optional)
initial_mozilla_rows = mozilla_data.shape[0]
mozilla_data = mozilla_data[mozilla_data["audio_path"].apply(os.path.exists)].copy()
print(f"Mozilla Common Voice data shape after filtering missing audio files: {mozilla_data.shape} ({initial_mozilla_rows - mozilla_data.shape[0]} rows filtered)")

# Transcribe audio (optional, can be slow)
# print("Starting audio transcription for Mozilla Common Voice dataset...")
# mozilla_data["transcribed_text"] = mozilla_data["audio_path"].apply(audio_to_text)
# print("Audio transcription complete.")

# Prepare labeled intent dataset (Clinc AI)
print("Preparing labeled intent dataset (Clinc AI)...")
intent_data = clinc_data.rename(columns={"query": "text", "intent": "label"}).copy()

# Build label mapping
label_to_id = {label: idx for idx, label in enumerate(sorted(intent_data["label"].unique()))}
id_to_label = {v: k for k, v in label_to_id.items()}

# Map labels to integers
intent_data["label"] = intent_data["label"].map(label_to_id)

# Force label dtype to int64
intent_data["label"] = intent_data["label"].astype("int64")

print(f"Number of unique intents: {len(label_to_id)}")

# Split dataset into train and validation sets
train_df, val_df = train_test_split(intent_data, test_size=0.1, stratify=intent_data["label"], random_state=42)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Remove default index column if present
for ds in [train_dataset, val_dataset]:
    if "__index_level_0__" in ds.column_names:
        ds = ds.remove_columns("__index_level_0__")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch tensors
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Load pre-trained BERT model for sequence classification
print(f"Loading pre-trained BERT model (bert-base-uncased) with {len(label_to_id)} labels...")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_to_id),
)

# Define compute metrics function for evaluation
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    seed=42,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("Starting model training (only Clinc dataset)...")
trainer.train()
print("Training complete.")

# Save model and tokenizer
model.save_pretrained("trained_chatbot_model")
tokenizer.save_pretrained("trained_chatbot_model")

print("Model saved successfully!")

ImportError: cannot import name 'load_metric' from 'evaluate' (/usr/local/lib/python3.12/dist-packages/evaluate/__init__.py)

In [28]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the saved model and tokenizer
loaded_model = AutoModelForSequenceClassification.from_pretrained("trained_chatbot_model")
loaded_tokenizer = AutoTokenizer.from_pretrained("trained_chatbot_model")
print("Saved model and tokenizer loaded successfully.")

# Recreate the id_to_label mapping (assuming label_to_id was created during training)
# If label_to_id is not available, you might need to load it from a saved file or re-create it
# based on the unique labels in your training data.
# For this example, we'll assume label_to_id is available from the previous training cell execution.
# If you are running this cell independently, you might need to load or define label_to_id here.
# Example (if you saved label_to_id as a JSON file):
# import json
# with open("label_to_id.json", "r") as f:
#     label_to_id = json.load(f)
# id_to_label = {idx: label for label, idx in label_to_id.items()}

# *** IMPORTANT ***
# If you ran the training cell, label_to_id should be available in the environment.
# If not, you need to recreate it based on your training data or load it.
# Assuming label_to_id is available from a previous cell execution:
try:
    id_to_label = {idx: label for label, idx in label_to_id.items()}
    print("id_to_label mapping created from existing label_to_id.")
except NameError:
    print("label_to_id not found. Please ensure the training cell was run or load label_to_id.")
    # As a fallback, you might attempt to infer labels from the model's config if available,
    # but the most reliable way is to use the mapping from training.
    id_to_label = {i: f"LABEL_{i}" for i in range(loaded_model.config.num_labels)}
    print(f"Using dummy id_to_label mapping: {id_to_label}")


def predict_intent(query):
    """
    Predicts the intent of a given text query using the loaded model.

    Args:
        query (str): The input text query.

    Returns:
        tuple: A tuple containing the predicted intent label (str) and the confidence score (float).
    """
    # Tokenize the input text query
    inputs = loaded_tokenizer(query, return_tensors="pt", padding=True, truncation=True)

    # Pass the tokenized input through the loaded model to get prediction scores
    with torch.no_grad(): # Disable gradient calculation for inference
        outputs = loaded_model(**inputs)

    # Apply softmax to get probabilities and find the predicted class
    probs = outputs.logits.softmax(dim=-1)
    pred_id = probs.argmax().item()
    confidence = probs.max().item()

    # Map the predicted class index back to the original intent label string
    # Ensure id_to_label is accessible (defined globally or passed)
    try:
        pred_label = id_to_label[pred_id]
    except NameError:
        print("Error: id_to_label mapping is not available. Cannot interpret prediction.")
        return f"Unknown (ID: {pred_id})", confidence
    except KeyError:
        print(f"Error: Predicted ID {pred_id} not found in id_to_label mapping.")
        # Fallback if predicted ID is not in the mapping (shouldn't happen with correct setup)
        return f"Unknown (ID: {pred_id})", confidence


    return pred_label, confidence

# Example usage
query = "what is the weather like today?"
intent, confidence = predict_intent(query)
print(f"Query: '{query}' -> Predicted Intent: '{intent}' (confidence: {confidence:.2f})")

query = "tell me a joke"
intent, confidence = predict_intent(query)
print(f"Query: '{query}' -> Predicted Intent: '{intent}' (confidence: {confidence:.2f})")

query = "set a timer for 5 minutes"
intent, confidence = predict_intent(query)
print(f"Query: '{query}' -> Predicted Intent: '{intent}' (confidence: {confidence:.2f})")

Saved model and tokenizer loaded successfully.
id_to_label mapping created from existing label_to_id.
Query: 'what is the weather like today?' -> Predicted Intent: 'r' (confidence: 1.00)
Query: 'tell me a joke' -> Predicted Intent: 'r' (confidence: 1.00)
Query: 'set a timer for 5 minutes' -> Predicted Intent: 'r' (confidence: 1.00)


In [17]:
import os

# Define the path in Google Drive where you want to save the model
save_path = "/content/drive/MyDrive/trained_chatbot_model"

# Create the directory if it doesn't exist
os.makedirs(save_path, exist_ok=True)

# To save the trained model and tokenizer to Google Drive
print(f"Saving trained model and tokenizer to: {save_path}")
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print("Model and tokenizer saved successfully.")

Saving trained model and tokenizer to: /content/drive/MyDrive/trained_chatbot_model
Model and tokenizer saved successfully.


In [27]:
import wandb
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as T
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader, Subset

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize wandb run with config for intent classification model
config = {
    "epochs": 5,
    "batch_size": 128,
    "learning_rate": 0.001,
    "dropout": random.uniform(0.01, 0.8),
    "architecture": "SimpleNN",
    "dataset": "MNIST (as example)",
}
project_name = "ai_voice_chatbot_intent"

with wandb.init(project=project_name, config=config) as run:
    config = run.config

    # Prepare dataset and dataloaders (simulate intent classification with MNIST)
    def get_dataloader(train, batch_size, slice_step=5):
        dataset = MNIST(root=".", train=train, transform=T.ToTensor(), download=True)
        subset = Subset(dataset, indices=range(0, len(dataset), slice_step))
        loader = DataLoader(subset, batch_size=batch_size, shuffle=train, num_workers=2, pin_memory=True)
        return loader

    train_loader = get_dataloader(train=True, batch_size=config.batch_size)
    val_loader = get_dataloader(train=False, batch_size=config.batch_size * 2)

    # Define a simple feedforward model for classification (simulate intent classifier)
    class IntentClassifier(nn.Module):
        def __init__(self, dropout):
            super().__init__()
            self.model = nn.Sequential(
                nn.Flatten(),
                nn.Linear(28 * 28, 256),
                nn.BatchNorm1d(256),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(256, 10),  # 10 classes as example intents
            )

        def forward(self, x):
            return self.model(x)

    model = IntentClassifier(config.dropout).to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

    # Validation function
    def validate(model, loader, loss_fn):
        model.eval()
        val_loss = 0
        correct = 0
        with torch.no_grad():
            for images, labels in loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                val_loss += loss_fn(outputs, labels).item() * labels.size(0)
                preds = outputs.argmax(dim=1)
                correct += (preds == labels).sum().item()
        val_loss /= len(loader.dataset)
        accuracy = correct / len(loader.dataset)
        return val_loss, accuracy

    # Training loop with wandb logging
    for epoch in range(config.epochs):
        model.train()
        running_loss = 0
        for step, (images, labels) in enumerate(train_loader):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            # Log training loss every 50 steps
            if step % 50 == 0:
                wandb.log({"train_loss": loss.item(), "epoch": epoch + step / len(train_loader)})

        val_loss, val_acc = validate(model, val_loader, loss_fn)
        wandb.log({"val_loss": val_loss, "val_accuracy": val_acc, "epoch": epoch + 1})

        # Save model checkpoint to wandb
        checkpoint_path = f"model_epoch_{epoch+1}.pt"
        torch.save(model.state_dict(), checkpoint_path)
        run.log_artifact(checkpoint_path, type="model", aliases=[f"epoch-{epoch+1}"])

        print(f"Epoch {epoch+1}: Train Loss={running_loss/len(train_loader):.4f}, Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}")

    # Finish wandb run
    run.finish()

0,1
train/epoch,▁▁
train/global_step,█▁

0,1
total_flos,789326078976.0
train/epoch,3.0
train/global_step,3.0
train_loss,0.03913
train_runtime,39.845
train_samples_per_second,0.075
train_steps_per_second,0.075


100%|██████████| 9.91M/9.91M [00:00<00:00, 25.4MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 1.00MB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 8.16MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 4.24MB/s]


Epoch 1: Train Loss=0.5965, Val Loss=0.3094, Val Acc=0.9120
Epoch 2: Train Loss=0.2932, Val Loss=0.2507, Val Acc=0.9230
Epoch 3: Train Loss=0.2285, Val Loss=0.2241, Val Acc=0.9260
Epoch 4: Train Loss=0.1990, Val Loss=0.2071, Val Acc=0.9390
Epoch 5: Train Loss=0.1737, Val Loss=0.1928, Val Acc=0.9395


0,1
epoch,▁▂▂▂▃▄▄▅▅▅▆▇▇▇█
train_loss,█▂▂▁▁▁▁▁▁▁
val_accuracy,▁▄▅██
val_loss,█▄▃▂▁

0,1
epoch,5.0
train_loss,0.16762
val_accuracy,0.9395
val_loss,0.19278


In [32]:
pip -q install evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m81.9/84.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h