<a href="https://colab.research.google.com/github/dushyant3615/AI_Voice_Chatbot/blob/main/Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip -q install torch transformers datasets pandas speechrecognition

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
pip -q install pydub

In [3]:
from google.colab import drive
# Mount Google Drive; the dataset paths used below live under /content/drive
drive.mount("/content/drive")

Mounted at /content/drive


In [7]:
import os
import json
import pandas as pd
import speech_recognition as sr
from pydub import AudioSegment
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset

def load_clinc_data(file_path):
    """Load Clinc AI (query, intent) pairs from a JSON file into a DataFrame.

    Two top-level JSON layouts are accepted:

    * a list of ``[query, intent]`` pairs, or
    * an object whose values are such lists, e.g.
      ``{"train": [[query, intent], ...]}``. The pairs from every list value
      are concatenated in the object's key order; non-list values are ignored.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        pandas.DataFrame: Columns ``query`` and ``intent``, one row per pair.
    """
    print(f"Loading Clinc AI data from: {file_path}")
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Iterating a dict yields its keys, so indexing item[0] / item[1] on it
    # would silently return the first two characters of each key instead of a
    # query and an intent. Unwrap the per-split lists before extracting pairs.
    if isinstance(data, dict):
        records = []
        for split_examples in data.values():
            if isinstance(split_examples, list):
                records.extend(split_examples)
    else:
        records = data

    # Extract user queries and intents
    queries = [item[0] for item in records]  # User query
    intents = [item[1] for item in records]  # Intent
    print(f"Loaded {len(queries)} queries from Clinc AI dataset.")
    return pd.DataFrame({"query": queries, "intent": intents})

# Read the Clinc AI training file from Google Drive and report its size
clinc_data = load_clinc_data(
    "/content/drive/MyDrive/AI_voice_chatbot/Clinc_AI_Dataset/train.json"
)
print(f"Clinc AI data shape: {clinc_data.shape}")

def load_mozilla_data(csv_path, audio_folder):
    """Load Mozilla Common Voice metadata and resolve each clip's file path.

    Args:
        csv_path (str): Path to the tab-separated metadata file. It must have
            a ``sentence`` column and a ``path`` column.
        audio_folder (str): Directory that each ``path`` value is joined onto.

    Returns:
        pandas.DataFrame: Columns ``sentence`` (text from the metadata file)
        and ``audio_path`` (``audio_folder`` joined with ``path``), one row
        per metadata row.
    """
    print(f"Loading Mozilla Common Voice data from CSV: {csv_path}")
    df = pd.read_csv(csv_path, sep="\t")  # Use tab separator for TSV files
    print(f"Loaded {len(df)} entries from Mozilla Common Voice CSV.")

    sentences = df["sentence"].tolist()
    # Iterate the column directly; iterrows() would build a full Series for
    # every row just to read one field from it.
    audio_files = [os.path.join(audio_folder, clip_name) for clip_name in df["path"]]

    return pd.DataFrame({"sentence": sentences, "audio_path": audio_files})

# Load the Mozilla Common Voice metadata; the TSV file and the clips folder
# are both read from the corpus directory on the mounted Google Drive
mozilla_data = load_mozilla_data(
    "/content/drive/MyDrive/AI_voice_chatbot/Mozilla_Common_Voice_Dataset/cv-corpus-20.0-delta-2024-12-06/en/validated.tsv",
    "/content/drive/MyDrive/AI_voice_chatbot/Mozilla_Common_Voice_Dataset/cv-corpus-20.0-delta-2024-12-06/en/clips",
)
print(f"Mozilla Common Voice data shape before filtering: {mozilla_data.shape}")

def audio_to_text(audio_path):
    """Transcribe an audio file to text using ``recognize_google``.

    A file whose extension is ``.mp3`` (any letter case) is first converted to
    a WAV file written next to the source (same directory and base name,
    ``.wav`` extension), and that WAV file is what gets transcribed. Every
    failure is reported with ``print`` and mapped to an empty string, so a
    bulk ``Series.apply`` over many clips keeps going.

    Args:
        audio_path (str): Path to the audio file.

    Returns:
        str: The transcription, or ``""`` when the file is missing, the MP3
        conversion fails, or recognition fails for any reason.
    """
    # Check existence first so nothing else is set up for a missing file
    if not os.path.exists(audio_path):
        print(f"File not found: {audio_path}")
        return ""  # To return empty string if file is missing

    # Split off only the final extension. str.replace(".mp3", ".wav") would
    # also rewrite ".mp3" occurring elsewhere in the path (e.g. in a folder
    # name) and would miss an upper-case ".MP3" extension.
    source_root, extension = os.path.splitext(audio_path)
    if extension.lower() == ".mp3":
        try:
            mp3_audio = AudioSegment.from_mp3(audio_path)
            wav_path = source_root + ".wav"
            mp3_audio.export(wav_path, format="wav")
            audio_path = wav_path  # To use the converted WAV file
        except Exception as e:
            print(f"Error converting {audio_path} to WAV: {e}")
            return ""  # To return empty string if conversion fails

    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(audio_path) as source:
            recorded = recognizer.record(source)
        # The audio is fully read at this point, so the file can be closed
        # before the recognition request is made
        return recognizer.recognize_google(recorded)  # To convert the speech to text
    except sr.UnknownValueError:
        print(f"Could not transcribe audio: {audio_path}")
        return ""  # If audio cannot be transcribed
    except sr.RequestError:
        print(f"API error for audio: {audio_path}")
        return ""  # If there's an API error
    except Exception as e:
        print(f"Unexpected error for audio: {audio_path}: {e}")
        return ""  # Handle any other errors

# Filter out rows where audio files are missing before transcription
initial_mozilla_rows = mozilla_data.shape[0]
mozilla_data = mozilla_data[mozilla_data["audio_path"].apply(os.path.exists)].copy()
print(f"Mozilla Common Voice data shape after filtering missing audio files: {mozilla_data.shape} ({initial_mozilla_rows - mozilla_data.shape[0]} rows filtered)")


# To transcribe audio files to text
print("Starting audio transcription for Mozilla Common Voice dataset...")
mozilla_data["transcribed_text"] = mozilla_data["audio_path"].apply(audio_to_text)
print("Audio transcription complete.")

# audio_to_text returns "" whenever conversion or recognition fails. Those
# rows carry no text, so keep them out of the training set instead of
# training on empty strings labelled "unknown".
has_transcription = mozilla_data["transcribed_text"].astype(str).str.strip() != ""
mozilla_transcribed = mozilla_data[has_transcription]
print(f"Dropped {int((~has_transcription).sum())} Mozilla rows with empty transcriptions.")


# To combine the datasets
print("Combining datasets...")
combined_data = pd.concat([
    clinc_data.rename(columns={"query": "text", "intent": "label"}),
    mozilla_transcribed.rename(columns={"transcribed_text": "text"})[["text"]]
], ignore_index=True)
print(f"Combined data shape: {combined_data.shape}")

# To add dummy labels for Mozilla data (since it doesn't have intents):
# the Mozilla rows have no "label" column, so concat leaves NaN there
print("Adding dummy labels for Mozilla data...")
combined_data["label"] = combined_data["label"].fillna("unknown")
print("Dummy labels added.")

# To convert labels to numerical values; ids follow first-appearance order
print("Converting labels to numerical values...")
label_to_id = {label: idx for idx, label in enumerate(combined_data["label"].unique())}
combined_data["label"] = combined_data["label"].map(label_to_id)
print(f"Number of unique labels: {len(label_to_id)}")
print("Labels converted to numerical values.")

def tokenize_function(examples):
    """Tokenize the "text" field of a batch with padding="max_length" and truncation."""
    batch_texts = examples["text"]
    return tokenizer(batch_texts, padding="max_length", truncation=True)


# Fetch the tokenizer that matches the bert-base-uncased checkpoint
print("Loading pre-trained tokenizer (bert-base-uncased)...")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print("Tokenizer loaded.")

# Wrap the DataFrame as a Hugging Face Dataset and tokenize it in batches
print("Tokenizing text data...")
dataset = Dataset.from_pandas(combined_data)
tokenized_dataset = dataset.map(tokenize_function, batched=True)
print("Text data tokenized.")

# To load pre-trained BERT model for intent classification
print(f"Loading pre-trained BERT model (bert-base-uncased) with {len(label_to_id)} labels...")
# Store the label names in the model config so the saved model can name its
# own classes at inference time, without needing the in-memory label_to_id
# mapping from this session.
label2id = {str(label): idx for label, idx in label_to_id.items()}
id2label = {idx: label for label, idx in label2id.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_to_id),
    id2label=id2label,
    label2id=label2id,
)
print("BERT model loaded.")

# To define training settings
training_args = TrainingArguments(
    output_dir="./results",  # Directory to save results
    per_device_train_batch_size=8,  # Batch size for training
    num_train_epochs=3,  # Number of training epochs
    logging_dir="./logs",  # Directory to save logs
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
) # To initialize Trainer

print("Starting model training...")
trainer.train() # To train the model
print("Model training finished.")

# Save model and tokenizer side by side so both can be reloaded from one directory
model.save_pretrained("trained_chatbot_model") # To save the trained model
tokenizer.save_pretrained("trained_chatbot_model")

print("Model Training Complete. Chatbot is Ready!")

Loading Clinc AI data from: /content/drive/MyDrive/AI_voice_chatbot/Clinc_AI_Dataset/train.json
Loaded 1 queries from Clinc AI dataset.
Clinc AI data shape: (1, 2)
Loading Mozilla Common Voice data from CSV: /content/drive/MyDrive/AI_voice_chatbot/Mozilla_Common_Voice_Dataset/cv-corpus-20.0-delta-2024-12-06/en/validated.tsv
Loaded 250 entries from Mozilla Common Voice CSV.
Mozilla Common Voice data shape before filtering: (250, 2)
Mozilla Common Voice data shape after filtering missing audio files: (250, 2) (0 rows filtered)
Starting audio transcription for Mozilla Common Voice dataset...
Audio transcription complete.
Combining datasets...
Combined data shape: (251, 2)
Adding dummy labels for Mozilla data...
Dummy labels added.
Converting labels to numerical values...
Number of unique labels: 2
Labels converted to numerical values.
Loading pre-trained tokenizer (bert-base-uncased)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizer loaded.
Tokenizing text data...


Map:   0%|          | 0/251 [00:00<?, ? examples/s]

Text data tokenized.
Loading pre-trained BERT model (bert-base-uncased) with 2 labels...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT model loaded.
Starting model training...


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdushyant3615[0m ([33mdushyant3615-own[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss


Model training finished.
Model Training Complete. Chatbot is Ready!


# Task
Explain the error in the selected code. If possible, fix the error and incorporate the changes into the existing code. Otherwise, try to diagnose the error. This is the dataset "/content/drive/MyDrive/AI_voice_chatbot/Mozilla_Common_Voice_Dataset/cv-corpus-20.0-delta-2024-12-06/en/validated.tsv".

## Load the trained model and tokenizer

### Subtask:
Load the saved BERT model and tokenizer from the specified directory.


**Reasoning**:
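The training cell wrote the fine-tuned model and tokenizer to the `trained_chatbot_model` directory with `save_pretrained`; reloading both from that directory with `from_pretrained` gives the objects used for inference in the following sections.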



In [9]:
# Reload the fine-tuned artifacts from the local save directory
loaded_tokenizer = AutoTokenizer.from_pretrained("trained_chatbot_model")
loaded_model = AutoModelForSequenceClassification.from_pretrained("trained_chatbot_model")
print("Saved model and tokenizer loaded successfully.")

Saved model and tokenizer loaded successfully.


## Prepare input text

### Subtask:
Define a function to take a text query as input.


**Reasoning**:
Define the `predict_intent` function that takes a text query as input.



In [11]:
def predict_intent(text_query):
    """
    Placeholder for intent prediction: only the signature is defined here.

    Args:
        text_query (str): The input text query.

    Returns:
        None: This stub does no work; later cells redefine the function
        with the actual prediction logic.
    """
    # Receiving the query is handled by the parameter itself; nothing else yet.
    return None

## Tokenize input

### Subtask:
Tokenize the input text using the loaded tokenizer, ensuring it's in the correct format for the model.


**Reasoning**:
Tokenize the input text query using the loaded tokenizer, ensuring padding and truncation, and convert the output to a PyTorch tensor.



In [13]:
import torch

def predict_intent(text_query):
    """
    Intermediate version of intent prediction: tokenizes the query only.

    Args:
        text_query (str): The input text query.

    Returns:
        None: The prediction step is not implemented at this stage.
    """
    # Encode the query as PyTorch tensors (return_tensors="pt"), with the
    # tokenizer's max_length padding and truncation enabled.
    encoded_query = loaded_tokenizer(
        text_query,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
    )

    # The tensors are left where the tokenizer created them; moving them to
    # another device (e.g. a GPU) would need an explicit .to(device) call.

    # The model call and label lookup are added in later cells.
    return None

## Make a prediction

### Subtask:
Pass the tokenized input through the loaded model to get the prediction scores.


**Reasoning**:
Run the forward pass inside `torch.no_grad()`, since gradients are not needed for inference, and read the raw prediction scores from `outputs.logits`.



In [14]:
def predict_intent(text_query):
    """
    Intermediate version of intent prediction: tokenizes and runs the model.

    Args:
        text_query (str): The input text query.

    Returns:
        None: The logits are computed but not yet mapped to a label.
    """
    # Encode the query as PyTorch tensors for the model
    encoded_query = loaded_tokenizer(
        text_query,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
    )

    # Forward pass without gradient tracking, since this is inference only
    with torch.no_grad():
        model_output = loaded_model(**encoded_query)

    # Raw, unnormalized class scores
    logits = model_output.logits

    # Converting the scores into a label is added in a later cell.
    return None

## Interpret the prediction

### Subtask:
Convert the prediction scores into a meaningful intent label using the `label_to_id` mapping created during training.


**Reasoning**:
Apply softmax to the logits, find the index of the highest score, create an inverse mapping from index to label, and return the corresponding label.



In [15]:
import torch.nn.functional as F

def predict_intent(text_query):
    """
    Predicts the intent of a given text query using the trained model.

    Relies on three module-level names: ``loaded_tokenizer`` and
    ``loaded_model`` (the reloaded artifacts) and ``label_to_id`` (the
    label-to-index mapping built in the training cell).

    Args:
        text_query (str): The input text query.

    Returns:
        str: The predicted intent label.

    Raises:
        KeyError: If the predicted class index has no entry in ``label_to_id``.
    """
    # A single query needs no padding: padding it to max_length only makes
    # the forward pass process padding tokens that are masked out anyway.
    encoded_query = loaded_tokenizer(text_query, return_tensors="pt", truncation=True)

    # Run on whichever device currently holds the model's weights
    device = next(loaded_model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in encoded_query.items()}

    # Pass the tokenized input through the loaded model to get prediction scores
    with torch.no_grad():  # Disable gradient calculation for inference
        logits = loaded_model(**inputs).logits

    # Softmax is monotonic, so the argmax of the logits is the same class as
    # the argmax of the probabilities; the normalization step is not needed.
    predicted_class_id = torch.argmax(logits, dim=-1).item()

    # Create an inverse mapping from id to label
    id_to_label = {idx: label for label, idx in label_to_id.items()}

    # Map the predicted class index back to the original intent label string
    return id_to_label[predicted_class_id]

# Example usage (optional, for testing)
# test_query = "what is the weather like today?"
# predicted_intent = predict_intent(test_query)
# print(f"The predicted intent for '{test_query}' is: {predicted_intent}")

## Create an inference function

### Subtask:
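Combine the steps above into a single function that takes text input and returns the predicted intent. The final `predict_intent` definition in the previous section already combines tokenization, the model forward pass, and the label lookup, so no additional code is needed here.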


## Test the inference function

### Subtask:
Test the function with some example queries to demonstrate how to use it.


**Reasoning**:
Test the `predict_intent` function with a list of example queries and print the results.



In [16]:
# Sample utterances used to exercise the inference function
example_queries = [
    "what is the weather like today?",
    "tell me a joke",
    "set a timer for 5 minutes",
    "what is the capital of France?",
    "play some music",
]

# Predict and report the intent for each sample, one line per query
print("Testing predict_intent function with example queries:")
for sample_query in example_queries:
    intent_label = predict_intent(sample_query)
    print(f"Query: '{sample_query}' -> Predicted Intent: '{intent_label}'")

Testing predict_intent function with example queries:
Query: 'what is the weather like today?' -> Predicted Intent: 'unknown'
Query: 'tell me a joke' -> Predicted Intent: 'unknown'
Query: 'set a timer for 5 minutes' -> Predicted Intent: 'unknown'
Query: 'what is the capital of France?' -> Predicted Intent: 'unknown'
Query: 'play some music' -> Predicted Intent: 'unknown'


In [17]:
import os

# Destination folder on the mounted Google Drive
save_path = "/content/drive/MyDrive/trained_chatbot_model"
os.makedirs(save_path, exist_ok=True)  # no error if the folder already exists

# Write the model first, then the tokenizer, into the same folder
print(f"Saving trained model and tokenizer to: {save_path}")
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)
print("Model and tokenizer saved successfully.")

Saving trained model and tokenizer to: /content/drive/MyDrive/trained_chatbot_model
Model and tokenizer saved successfully.
