<a href="https://colab.research.google.com/github/dushyant3615/AI_Voice_Chatbot/blob/main/Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip -q install torch transformers datasets pandas speechrecognition

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
pip -q install pydub

In [3]:
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import json
import pandas as pd
import speech_recognition as sr
from pydub import AudioSegment
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset

def load_clinc_data(file_path):
    """Load a Clinc-style intent file into a two-column DataFrame.

    The JSON document is read as a sequence of records; element 0 of each
    record is taken as the user query and element 1 as the intent label.

    Args:
        file_path: Path to the JSON file.

    Returns:
        pandas.DataFrame with columns ``query`` and ``intent``, one row per
        record, in file order.
    """
    # JSON is UTF-8 by specification; pass the encoding explicitly so decoding
    # does not depend on the platform's locale default.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Extract user queries and intents
    queries = [item[0] for item in data]  # User query
    intents = [item[1] for item in data]  # Intent

    return pd.DataFrame({"query": queries, "intent": intents})

# Read the Clinc AI training split from the mounted Drive
clinc_train_path = "/content/drive/MyDrive/AI_voice_chatbot/Clinc_AI_Dataset/train.json"
clinc_data = load_clinc_data(file_path=clinc_train_path)

def load_mozilla_data(csv_path, audio_folder):
    """Load a Common Voice style TSV and resolve each clip to a full path.

    Args:
        csv_path: Path to a tab-separated file with at least the columns
            ``sentence`` and ``path``.
        audio_folder: Directory that the ``path`` values are relative to.

    Returns:
        pandas.DataFrame with columns ``sentence`` (copied from the TSV) and
        ``audio_path`` (``audio_folder`` joined with each ``path`` value).

    Raises:
        KeyError: If the TSV lacks a ``sentence`` or ``path`` column.
    """
    import csv  # local import: only needed for the QUOTE_NONE constant

    # The TSV is plain tab-separated text, so a sentence may contain literal
    # double-quote characters. With the default quote handling an unbalanced
    # quote swallows the following rows (or aborts parsing); QUOTE_NONE keeps
    # every character of the field as data.
    df = pd.read_csv(csv_path, sep="\t", quoting=csv.QUOTE_NONE)

    # To extract sentences and corresponding audio file paths
    sentences = df["sentence"].tolist()
    # Iterate the column directly; iterrows() builds a Series per row for no benefit.
    audio_files = [os.path.join(audio_folder, clip_name) for clip_name in df["path"]]

    return pd.DataFrame({"sentence": sentences, "audio_path": audio_files})

# Read the Mozilla Common Voice metadata and clip folder,
# assuming the dataset is also stored in your Google Drive
mozilla_tsv_path = "/content/drive/MyDrive/AI_voice_chatbot/Mozilla_Common_Voice_Dataset/cv-corpus-20.0-delta-2024-12-06/en/validated.tsv"
mozilla_clips_dir = "/content/drive/MyDrive/AI_voice_chatbot/Mozilla_Common_Voice_Dataset/cv-corpus-20.0-delta-2024-12-06/en/clips"
mozilla_data = load_mozilla_data(csv_path=mozilla_tsv_path, audio_folder=mozilla_clips_dir)

def audio_to_text(audio_path):
    """Transcribe a single audio file to text with ``recognize_google``.

    MP3 input is decoded with pydub and passed to the recognizer as an
    in-memory WAV stream, so no converted file is written next to the source
    clip. Every failure (missing file, failed conversion, unintelligible
    audio, API error, anything else) is reported on stdout and yields an
    empty string instead of raising.

    Args:
        audio_path: Path to the audio file. Files whose name ends in ``.mp3``
            (any letter case) are converted to WAV first; other files are
            handed to ``sr.AudioFile`` unchanged.

    Returns:
        The recognised text, or ``""`` when the file could not be transcribed.
    """
    import io  # local import: used only for the in-memory WAV buffer

    # Check if the file exists before doing any other work
    if not os.path.exists(audio_path):
        print(f"File not found: {audio_path}")
        return ""  # To return empty string if file is missing

    # sr.AudioFile accepts either a path or a file-like object.
    audio_source = audio_path

    # Convert MP3 to WAV if the file is not already in WAV format.
    # The extension test is case-insensitive, and the conversion goes to a
    # buffer rather than to a path derived with str.replace (which would also
    # rewrite ".mp3" occurring inside directory names).
    if audio_path.lower().endswith(".mp3"):
        try:
            wav_buffer = io.BytesIO()
            AudioSegment.from_mp3(audio_path).export(wav_buffer, format="wav")
            wav_buffer.seek(0)  # rewind so the recognizer reads from the start
            audio_source = wav_buffer
        except Exception as e:
            print(f"Error converting {audio_path} to WAV: {e}")
            return ""  # To return empty string if conversion fails

    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(audio_source) as source:
            audio = recognizer.record(source)
            return recognizer.recognize_google(audio)  # To convert the speech to text
    except sr.UnknownValueError:
        print(f"Could not transcribe audio: {audio_path}")
        return ""  # If audio cannot be transcribed
    except sr.RequestError:
        print(f"API error for audio: {audio_path}")
        return ""  # If there's an API error
    except Exception as e:
        print(f"Unexpected error for audio: {audio_path}: {e}")
        return ""  # Handle any other errors

# Keep only the rows whose audio clip is actually present on disk
clip_is_present = mozilla_data["audio_path"].apply(os.path.exists)
mozilla_data = mozilla_data[clip_is_present].copy()

# Run speech-to-text over every remaining clip
transcripts = mozilla_data["audio_path"].apply(audio_to_text)
mozilla_data["transcribed_text"] = transcripts

# To combine the datasets: Clinc rows carry a text and a label, Mozilla rows
# contribute only their transcribed text.
combined_data = pd.concat([
    clinc_data.rename(columns={"query": "text", "intent": "label"}),
    mozilla_data.rename(columns={"transcribed_text": "text"})[["text"]]
], ignore_index=True)

# audio_to_text returns "" whenever transcription fails; such rows carry no
# signal and would otherwise be trained on as empty "unknown" examples.
has_text = combined_data["text"].fillna("").astype(str).str.strip() != ""
# reset_index keeps a clean 0..n-1 index after the rows are dropped.
combined_data = combined_data[has_text].reset_index(drop=True)

# To add dummy labels for Mozilla data (since it doesn't have intents)
combined_data["label"] = combined_data["label"].fillna("unknown")

# To convert labels to numerical values (ids follow order of first appearance)
label_to_id = {label: idx for idx, label in enumerate(combined_data["label"].unique())}
combined_data["label"] = combined_data["label"].map(label_to_id)

# Pre-trained tokenizer for the bert-base-uncased checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with truncation and max-length padding."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")


# Wrap the DataFrame in a Hugging Face Dataset, then tokenize it batch by batch
dataset = Dataset.from_pandas(combined_data)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# To load pre-trained BERT model for intent classification.
# The label <-> id mapping is stored in the model config so that the
# checkpoint written by save_pretrained can translate predicted class ids
# back to intent names; without it the mapping exists only in this session.
id_to_label = {idx: label for label, idx in label_to_id.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_to_id),
    id2label=id_to_label,
    label2id=label_to_id,
)

# Training hyper-parameters and output locations
training_args = TrainingArguments(
    num_train_epochs=3,             # passes over the training set
    per_device_train_batch_size=8,  # examples per device per step
    output_dir="./results",         # where results are written
    logging_dir="./logs",           # where logs are written
)

# Bind the model, the settings and the tokenized training data together
trainer = Trainer(
    train_dataset=tokenized_dataset,
    args=training_args,
    model=model,
)

# Fine-tune the model
trainer.train()

# Persist the trained model and its tokenizer side by side
save_dir = "trained_chatbot_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

print("Model Training Complete. Chatbot is Ready!")