# Advancing Real Time Sign Language to Speech using Transformers





The Notebook contains the code for translation of Sign Language to Speech Using Transformers. The following code blocks include all necessary steps to process the data, build the model, and evaluate its performance.


In [None]:
from google.colab import drive

# Mount the Google Drive to the Colab environment
drive.mount('/content/drive')


Libraries installed and used for the code

In [None]:
!pip install opencv-python mediapipe numpy matplotlib
!pip install mediapipe
!pip install tensorflow numpy pandas
!pip install gtts
!pip install rouge_score
!pip install googletrans==4.0.0-rc1
!pip install jiwer

The following code sets up the base directory and the file paths for the videos and annotations, then creates an output directory to store the processed data.

In [None]:
import os

# Define the base directory within Google Drive where the dataset is located.
DRIVE_BASE = '/content/drive/MyDrive/sign_language_dataset'

# Specify the directory containing the video files.
VIDEO_DIR = os.path.join(DRIVE_BASE, 'videos')

# Define paths to the annotation files, which include glosses, corpus details, frame details, and word details.
ANNOTATIONS_GLOSSES_PATH = os.path.join(DRIVE_BASE, 'corpus_csv_files', 'ISL Corpus sign glosses.csv')
ANNOTATIONS_CORPUS_DETAILS_PATH = os.path.join(DRIVE_BASE, 'corpus_csv_files', 'ISL_CSLRT_Corpus details.xlsx')
ANNOTATIONS_FRAME_DETAILS_PATH = os.path.join(DRIVE_BASE, 'corpus_csv_files', 'ISL_CSLRT_Corpus_frame_details.xlsx')
ANNOTATIONS_WORD_DETAILS_PATH = os.path.join(DRIVE_BASE, 'corpus_csv_files', 'ISL_CSLRT_Corpus_word_details.xlsx')

# Create an output directory for storing processed data, ensuring the directory exists.
OUTPUT_DIR = os.path.join(DRIVE_BASE, 'processed_data')
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [None]:
# pandas is required by this cell but is not imported by any earlier cell,
# so import it here to keep the notebook runnable from top to bottom.
import pandas as pd

# Load the glosses data from a CSV file into a pandas DataFrame.
glosses_df = pd.read_csv(ANNOTATIONS_GLOSSES_PATH)

# Load the corpus details from an Excel file into a pandas DataFrame.
corpus_details_df = pd.read_excel(ANNOTATIONS_CORPUS_DETAILS_PATH)

# Load the frame details from an Excel file into a pandas DataFrame.
frame_details_df = pd.read_excel(ANNOTATIONS_FRAME_DETAILS_PATH)

# Load the word details from an Excel file into a pandas DataFrame.
word_details_df = pd.read_excel(ANNOTATIONS_WORD_DETAILS_PATH)



The code below includes functions for converting sentence labels to filename-friendly formats, extracting frames at a specified rate from videos, and processing all video files in a directory.


In [None]:
import os
import cv2

# Helper that turns a sentence label into a filename-friendly identifier.
def sentence_to_filename(sentence):
    """Return *sentence* lower-cased, with spaces turned into underscores.

    The characters ``?``, ``.``, ``,`` and ``!`` are removed entirely.
    """
    # One translation table handles both the substitution and the deletions.
    table = str.maketrans(' ', '_', '?.,!')
    return sentence.lower().translate(table)

# Read a video and collect a sampled subset of its frames in memory.
def extract_frames(video_path, frame_rate=1):
    """Return a list of frames read from *video_path*.

    ``frame_rate`` acts as a sampling stride: the frame at index ``i`` is kept
    when ``i % frame_rate == 0``, so the default of 1 keeps every frame.
    """
    capture = cv2.VideoCapture(video_path)
    kept = []
    index = 0
    while True:
        ok, frame = capture.read()
        if not ok:
            # read() reports failure once no further frame can be decoded.
            break
        if index % frame_rate == 0:
            kept.append(frame)
        index += 1
    capture.release()
    return kept

# Extract frames from every .mp4 directly inside a directory and write them to disk.
def batch_extract_frames(video_dir, output_dir, frame_rate=1):
    """Extract and save frames for each ``.mp4`` file found in *video_dir*.

    For a video named ``<video_id>.mp4`` the frames are written to
    ``<output_dir>/<video_id>/frame_00000.jpg``, ``frame_00001.jpg``, ...
    ``frame_rate`` is forwarded unchanged to ``extract_frames``.

    Previously the extracted frames were discarded, leaving only empty
    per-video folders behind.
    """
    video_files = [f for f in os.listdir(video_dir) if f.endswith('.mp4')]
    for video_file in video_files:
        video_path = os.path.join(video_dir, video_file)
        video_id = os.path.splitext(video_file)[0]  # Filename without extension.
        output_folder = os.path.join(output_dir, video_id)
        os.makedirs(output_folder, exist_ok=True)

        frames = extract_frames(video_path, frame_rate)
        for index, frame in enumerate(frames):
            # Zero-padded names keep lexical order equal to temporal order,
            # which matters for callers that sort the directory listing.
            frame_path = os.path.join(output_folder, f"frame_{index:05d}.jpg")
            if not cv2.imwrite(frame_path, frame):
                print(f"Failed to write frame: {frame_path}")

# Execute batch frame extraction for all videos in the specified directory.
batch_extract_frames(VIDEO_DIR, OUTPUT_DIR)


The code below processes the glosses DataFrame, aligns video frames with their corresponding gloss annotations, and stores the results in a structured format for further use.

In [None]:
# Pair each gloss annotation with the frame files stored for its sentence.
def align_data(glosses_df, output_dir):
    """Build a list of ``{'frames', 'gloss', 'video_id'}`` records.

    For every row of *glosses_df* the ``Sentence`` column is converted to a
    video ID, and ``<output_dir>/<video_id>`` is searched for frame files.
    Rows whose directory is missing or empty are reported and skipped.
    """
    records = []
    for _, row in glosses_df.iterrows():
        video_id = sentence_to_filename(row['Sentence'])
        frames_path = os.path.join(output_dir, video_id)

        # Progress output to make mismatched folder names easy to spot.
        print(f"Processing video ID: {video_id}")
        print(f"Frames path: {frames_path}")

        if not (os.path.exists(frames_path) and os.listdir(frames_path)):
            print(f"Frames directory for {video_id} does not exist or is empty.")
            continue

        # Sorted full paths of everything found in the frames directory.
        frame_files = sorted(
            os.path.join(frames_path, name) for name in os.listdir(frames_path)
        )
        records.append({
            'frames': frame_files,
            'gloss': row['SIGN GLOSSES'],
            'video_id': video_id,
        })
    return records

# Directory that is searched for per-sentence frame folders.
output_dir = '/content/drive/MyDrive/sign_language_dataset/processed_data/frames/'

# Build the frame/gloss alignment from the glosses DataFrame.
aligned_data = align_data(glosses_df, output_dir)

# Show the first two aligned records as a sanity check.
print(aligned_data[:2])


In [None]:
import cv2
import numpy as np

# Function to preprocess video frames: resizing to 224x224 pixels and normalizing pixel values.
def preprocess_frames(frame_paths):
    """Load, resize and scale the images listed in *frame_paths*.

    Each readable image is resized to 224x224 and divided by 255.0. Paths that
    OpenCV cannot decode are reported and skipped instead of crashing, since
    ``cv2.imread`` returns ``None`` for them.

    Returns a NumPy array built from the processed frames; the array is empty
    when no frame could be read.
    """
    frames = []
    for path in frame_paths:
        img = cv2.imread(path)
        if img is None:
            print(f"Skipping unreadable frame: {path}")
            continue
        img = cv2.resize(img, (224, 224))
        frames.append(img / 255.0)
    return np.array(frames)

# Preprocess the frames from the first entry in the aligned data, if there is one.
if aligned_data:
    aligned_frames = preprocess_frames(aligned_data[0]['frames'])

    # Print the shape of the preprocessed frames array to verify the output.
    print(aligned_frames.shape)
else:
    print("No aligned data available to preprocess.")


## Video Frame Extraction and Preprocessing

The following code extracts and preprocesses frames from sign language videos and aligns them with their corresponding gloss annotations. It includes functions for frame extraction, data saving, and matching video folders with annotation sentences using approximate string matching.


In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import difflib

# Define paths to the dataset and output directories.
DRIVE_BASE = '/content/drive/MyDrive/sign_language_dataset'
VIDEO_DIR = os.path.join(DRIVE_BASE, 'videos')
ANNOTATIONS_GLOSSES_PATH = os.path.join(DRIVE_BASE, 'corpus_csv_files', 'ISL Corpus sign glosses.csv')
OUTPUT_DIR = os.path.join(DRIVE_BASE, 'processed_data')
os.makedirs(OUTPUT_DIR, exist_ok=True)  # Ensure the output directory exists.

# Load gloss annotations from the CSV file.
glosses_df = pd.read_csv(ANNOTATIONS_GLOSSES_PATH)

# List all available folders in the VIDEO_DIR containing video files.
available_folders = [d for d in os.listdir(VIDEO_DIR) if os.path.isdir(os.path.join(VIDEO_DIR, d))]

# Derive a filename-friendly identifier from a sentence.
def sentence_to_filename(sentence):
    """Lower-case *sentence*, drop ``? . , !`` and replace spaces with ``_``."""
    cleaned = sentence.lower()
    for mark in ('?', '.', ',', '!'):
        cleaned = cleaned.replace(mark, '')
    return cleaned.replace(' ', '_')

# Collect every `frame_rate`-th frame of a video into a list.
def extract_frames(video_path, frame_rate=1):
    """Return the sampled frames of *video_path* as a list.

    A frame is kept when its zero-based position is a multiple of
    ``frame_rate``; all kept frames are held in memory at once.
    """
    cap = cv2.VideoCapture(video_path)
    sampled = []
    position = 0
    grabbed, frame = cap.read()
    while grabbed:
        keep = position % frame_rate == 0
        if keep:
            sampled.append(frame)
        position += 1
        grabbed, frame = cap.read()
    cap.release()
    return sampled

# Save the preprocessed frames to a compressed .npz file.
def save_preprocessed_data(frames, output_path):
    """Write *frames* to *output_path* under the key ``frames``.

    ``np.savez_compressed`` is used so the archive is actually compressed, as
    the original comment promised; plain ``np.savez`` stores raw bytes. The
    result is read back the same way with ``np.load(path)['frames']``.
    """
    np.savez_compressed(output_path, frames=frames)

# Fuzzy-match a sentence-derived video ID against the available folder names.
def find_closest_match(sentence, folders):
    """Return a list holding the single best match from *folders*, or ``[]``.

    Matching is delegated to ``difflib.get_close_matches`` with a similarity
    cutoff of 0.6 and at most one result.
    """
    best_candidates = difflib.get_close_matches(sentence, folders, n=1, cutoff=0.6)
    return best_candidates

# Process each gloss annotation and extract corresponding video frames.
for idx, row in glosses_df.iterrows():
    sentence = row['Sentence']
    video_id = sentence_to_filename(sentence)

    # Find the closest matching folder in the video directory.
    match = find_closest_match(video_id, available_folders)
    if not match:
        print(f"No matching folder found for {video_id}")
        continue

    video_folder_path = os.path.join(VIDEO_DIR, match[0])
    video_files = sorted(f for f in os.listdir(video_folder_path) if f.endswith('.mp4'))
    for video_file in video_files:
        video_path = os.path.join(video_folder_path, video_file)
        frames = extract_frames(video_path)  # Extract frames from the video.

        # A folder with a single video keeps the original "<video_id>.npz"
        # name. With several videos each one gets its own file; previously
        # they all wrote to the same path and only the last one survived.
        if len(video_files) == 1:
            output_name = f"{video_id}.npz"
        else:
            output_name = f"{video_id}_{os.path.splitext(video_file)[0]}.npz"
        output_path = os.path.join(OUTPUT_DIR, output_name)

        save_preprocessed_data(frames, output_path)  # Save the extracted frames.
        print(f"Processed and saved {video_id}")


print("Preprocessing completed.")


## Keypoint Extraction

The following code processes images from a specified directory to extract and normalize human pose keypoints using MediaPipe's Pose estimation model. The normalized keypoints are saved to a CSV file, and the first few images can be visualized to inspect the results.


In [None]:
import os
import cv2
import mediapipe as mp
import numpy as np
import matplotlib.pyplot as plt
import csv

# Initialize MediaPipe Pose with specific settings for static images
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=True, min_detection_confidence=0.5)

# Return pose landmarks as [x, y, z] triples in normalized image coordinates.
def normalize_landmarks(landmarks, image_width, image_height):
    """Return ``[[x, y, z], ...]`` for every entry of ``landmarks.landmark``.

    MediaPipe Pose already reports ``x`` and ``y`` relative to the image width
    and height, with ``z`` on roughly the same scale as ``x``. Dividing by the
    pixel dimensions again, as this function used to do, shrank every
    coordinate by a factor of several hundred, so the values are now passed
    through unchanged.

    ``image_width`` and ``image_height`` are kept only so that existing
    callers keep working; they are not used.
    """
    return [[landmark.x, landmark.y, landmark.z] for landmark in landmarks.landmark]

# Scatter-plot the x/y components of a list of landmarks.
def visualize_landmarks(normalized_landmarks):
    """Show a 5x5 inch scatter plot of the first two values of each landmark.

    The y-axis is inverted so the plot follows image coordinates, where y
    grows downwards.
    """
    xs = [point[0] for point in normalized_landmarks]
    ys = [point[1] for point in normalized_landmarks]

    plt.figure(figsize=(5, 5))
    axes = plt.gca()
    axes.scatter(xs, ys, marker='o')
    axes.set_title("Normalized Pose Landmarks")
    axes.set_xlabel("X")
    axes.set_ylabel("Y")
    axes.invert_yaxis()
    plt.show()

# Process a single image to extract and normalize pose keypoints, with optional visualization
def process_image_for_keypoints(image_path, visualize=False):
    """Return the normalized pose landmarks for *image_path*, or ``None``.

    ``None`` is returned, with a message printed, when the file cannot be
    decoded by OpenCV or when no pose is detected. When *visualize* is true
    and landmarks were found, they are also plotted.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals an unreadable or non-image file by returning None;
        # passing that on to cvtColor would raise.
        print(f"Could not read image: {image_path}")
        return None

    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = pose.process(image_rgb)

    if not results.pose_landmarks:
        print(f"No pose landmarks detected for image: {image_path}")
        return None

    image_height, image_width = image.shape[:2]
    normalized_landmarks = normalize_landmarks(results.pose_landmarks, image_width, image_height)
    if visualize:
        visualize_landmarks(normalized_landmarks)  # Visualize if requested
    return normalized_landmarks

# Process all images in a folder to extract and normalize keypoints, and save them to a CSV
def process_folder_for_keypoints(input_folder, output_csv, max_visualizations=20):
    """Walk *input_folder*, extract pose keypoints and write them to *output_csv*.

    One CSV row is written per landmark, with the columns ``image``,
    ``landmark_index``, ``x_normalized``, ``y_normalized`` and
    ``z_normalized``. Only the first *max_visualizations* images are plotted;
    every image is still processed and written.

    Previously a ``break`` tied to the visualization cap ended the per-folder
    loop, so most images were never written to the CSV once the cap was hit.
    """
    visualization_count = 0

    with open(output_csv, mode='w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['image', 'landmark_index', 'x_normalized', 'y_normalized', 'z_normalized'])  # CSV header

        for subdir, _dirs, files in os.walk(input_folder):
            # `file_name` avoids shadowing the open CSV handle above.
            for file_name in files:
                if not file_name.endswith(('.jpg', '.jpeg', '.png')):
                    continue

                file_path = os.path.join(subdir, file_name)
                print(f"Processing image: {file_path}")

                # The cap limits plotting only; it must not stop extraction.
                visualize = visualization_count < max_visualizations
                normalized_landmarks = process_image_for_keypoints(file_path, visualize=visualize)
                if normalized_landmarks:
                    for idx, landmark in enumerate(normalized_landmarks):
                        writer.writerow([file_name, idx, landmark[0], landmark[1], landmark[2]])

                if visualize:
                    visualization_count += 1

# Define input directory containing images and the output CSV file path
input_folder = '/content/drive/MyDrive/sign_language_dataset/processed_data'
output_csv = '/content/drive/MyDrive/sign_language_dataset/keypoints_normalized.csv'

# Process the folder to extract, normalize, and optionally visualize keypoints
process_folder_for_keypoints(input_folder, output_csv, max_visualizations=20)

# Release MediaPipe Pose resources
pose.close()




The following code loads normalized keypoints from a CSV file, where the keypoints represent human pose landmarks, and visualizes their distribution as a heatmap. The heatmap provides insight into the density and spatial arrangement of keypoints across the dataset.


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import csv

# Read the x/y keypoint columns of a landmarks CSV into a NumPy array.
def load_keypoints_from_csv(csv_file):
    """Return an array of ``[x, y]`` pairs parsed from *csv_file*.

    The first row is treated as a header and skipped; columns 2 and 3 of each
    remaining row are converted to ``float``.
    """
    with open(csv_file, mode='r') as handle:
        rows = csv.reader(handle)
        next(rows)  # Discard the header row.
        points = [[float(record[2]), float(record[3])] for record in rows]
    return np.array(points)

# Render a 2D density heatmap of keypoint positions.
def plot_heatmap(keypoints, bins=(50, 50)):
    """Plot a 2D histogram of column 0 (x) against column 1 (y) of *keypoints*.

    ``bins`` is forwarded to ``np.histogram2d``. The y-axis is inverted after
    plotting so the picture follows image coordinates.
    """
    density, x_edges, y_edges = np.histogram2d(keypoints[:, 0], keypoints[:, 1], bins=bins)
    # The histogram edges give the data range covered by the image.
    bounds = [x_edges[0], x_edges[-1], y_edges[0], y_edges[-1]]

    plt.figure(figsize=(8, 6))
    # histogram2d puts x along the first axis, so transpose for imshow.
    plt.imshow(density.T, extent=bounds, origin='lower', cmap='hot', interpolation='nearest')
    plt.title("Heatmap of Normalized Pose Landmarks")
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.colorbar(label='Density')
    plt.gca().invert_yaxis()
    plt.show()

# CSV file holding the normalized keypoints.
output_csv = '/content/drive/MyDrive/sign_language_dataset/keypoints_normalized.csv'

# Read the keypoints back from disk.
keypoints = load_keypoints_from_csv(output_csv)

# Show how the keypoints are distributed.
plot_heatmap(keypoints)


## Pose Estimation Using MediaPipe

The following code processes a directory of images to extract human pose landmarks using MediaPipe's Pose solution. It annotates the images with the detected landmarks and saves both the annotated images and the landmark coordinates to an output folder. The landmark data is also recorded in a CSV file for further analysis.


In [None]:
import cv2
import mediapipe as mp
import os
import numpy as np
import csv

# Initialize MediaPipe Pose with static image mode and a minimum detection confidence.
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=True, min_detection_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils  # Utility for drawing the pose landmarks on the image

# Process a single image: detect pose landmarks, draw them, and save the annotated image.
def process_image(image_path, save_path):
    """Detect pose landmarks in *image_path* and save an annotated copy.

    Returns the detected ``pose_landmarks`` object, or ``None`` when the image
    cannot be read or no pose is found. The annotated image is written to
    *save_path* only when landmarks were detected.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None for unreadable files; cvtColor would raise.
        print(f"Could not read image: {image_path}")
        return None

    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert the image to RGB format
    results = pose.process(image_rgb)  # Perform pose detection

    if not results.pose_landmarks:
        return None

    # Draw on a copy so the loaded image itself is left untouched.
    annotated_image = image.copy()
    mp_drawing.draw_landmarks(
        annotated_image, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)
    save_annotated_image(annotated_image, save_path)
    return results.pose_landmarks

# Annotate every frame image found one level below a folder and log its landmarks.
def process_folder(folder_path, output_folder):
    """Run pose detection on the images in each subfolder of *folder_path*.

    Annotated copies are written to ``<output_folder>/<subfolder>/<frame>`` and
    the landmarks of every frame with a detection are appended to
    ``<output_folder>/landmarks.csv``. Only the frame's file name (without its
    subfolder) is recorded in the CSV.
    """
    os.makedirs(output_folder, exist_ok=True)
    landmarks_file = os.path.join(output_folder, "landmarks.csv")
    image_suffixes = ('.jpg', '.jpeg', '.png')

    with open(landmarks_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['image', 'landmark_index', 'x', 'y', 'z'])  # CSV header

        for subfolder in os.listdir(folder_path):
            subfolder_path = os.path.join(folder_path, subfolder)
            if not os.path.isdir(subfolder_path):
                continue

            print(f"Processing subfolder: {subfolder_path}")
            for frame_file in sorted(os.listdir(subfolder_path)):
                if not frame_file.endswith(image_suffixes):
                    continue

                frame_path = os.path.join(subfolder_path, frame_file)
                print(f"Processing frame: {frame_path}")

                # Mirror the input layout underneath the output folder.
                save_path = os.path.join(output_folder, subfolder, frame_file)
                os.makedirs(os.path.dirname(save_path), exist_ok=True)

                landmarks = process_image(frame_path, save_path)
                if landmarks:
                    save_landmarks(writer, frame_file, landmarks)

# Save the annotated image to the specified path.
def save_annotated_image(image, save_path):
    """Write *image* to *save_path* and report a failed write.

    ``cv2.imwrite`` signals failure through its boolean return value rather
    than an exception, so the result is checked and a warning is printed
    instead of losing the frame silently.
    """
    if not cv2.imwrite(save_path, image):
        print(f"Warning: failed to write annotated image: {save_path}")

# Append one CSV row per detected landmark of an image.
def save_landmarks(writer, image_name, landmarks):
    """Write ``[image_name, index, x, y, z]`` rows for ``landmarks.landmark``.

    *writer* is a ``csv.writer``-style object; the index is the landmark's
    position within ``landmarks.landmark``.
    """
    writer.writerows(
        [image_name, position, point.x, point.y, point.z]
        for position, point in enumerate(landmarks.landmark)
    )

# Define input and output paths and start processing the folder.
# `dir_path` is scanned one level deep for subfolders of frame images.
dir_path = '/content/drive/MyDrive/sign_language_dataset/processed_data'
# Annotated frames and landmarks.csv are written underneath `output_folder`.
output_folder = '/content/drive/MyDrive/sign_language_dataset/pose_output'
process_folder(dir_path, output_folder)




The following code processes videos from a specified directory, extracts pose landmarks from each frame using MediaPipe's Pose solution, and saves the landmarks to a CSV file.


In [None]:
import os
import cv2
import mediapipe as mp
import csv

# Initialize MediaPipe Pose with static image mode and minimum detection confidence.
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=True, min_detection_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils  # Utility for drawing pose landmarks on images

# Run pose detection on one BGR video frame.
def process_frame(frame):
    """Return the pose landmarks detected in *frame*, or ``None`` if there are none.

    The frame is converted from BGR to RGB before it is handed to the
    module-level ``pose`` detector.
    """
    detection = pose.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    return detection.pose_landmarks if detection.pose_landmarks else None

# Main function to process all videos in a directory.
def process_videos(video_dir, output_dir):
    """Extract pose landmarks for every ``.mp4`` in each subfolder of *video_dir*.

    A subfolder with exactly one video produces ``<output_dir>/<subfolder>.csv``,
    as before. When a subfolder holds several videos, each one is written to
    ``<output_dir>/<subfolder>_<video name>.csv``; previously they all shared
    the same CSV path and every video but the last was overwritten.
    """
    for subfolder in os.listdir(video_dir):
        subfolder_path = os.path.join(video_dir, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        video_files = sorted(f for f in os.listdir(subfolder_path) if f.endswith('.mp4'))
        for video_file in video_files:
            if len(video_files) == 1:
                csv_name = subfolder + '.csv'
            else:
                csv_name = f"{subfolder}_{os.path.splitext(video_file)[0]}.csv"

            video_path = os.path.join(subfolder_path, video_file)
            output_csv = os.path.join(output_dir, csv_name)
            process_video(video_path, output_csv)  # Process each video file

# Function to process a single video and extract pose landmarks.
def process_video(video_path, output_csv):
    """Write the pose landmarks of every frame of *video_path* to *output_csv*.

    The CSV has the columns ``frame_index``, ``landmark_x``, ``landmark_y`` and
    ``landmark_z``, with one row per landmark of each frame in which a pose was
    detected. Nothing is written when the video cannot be opened.

    The unused frame-duration computation (``1 / fps``) was removed: it raised
    ``ZeroDivisionError`` for videos whose FPS metadata reads as 0.
    """
    cap = cv2.VideoCapture(video_path)  # Open the video file
    if not cap.isOpened():
        print(f"Error: Could not open video {video_path}")
        cap.release()
        return

    frame_index = 0
    landmark_data = []

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break  # No more frames.

            landmarks = process_frame(frame)
            if landmarks:
                # Tag every landmark with the index of the frame it came from.
                landmark_data.extend(
                    [frame_index, lm.x, lm.y, lm.z] for lm in landmarks.landmark
                )

            frame_index += 1
    finally:
        # Release the capture even if pose detection raises part-way through.
        cap.release()

    # Save the extracted landmark data to a CSV file
    with open(output_csv, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['frame_index', 'landmark_x', 'landmark_y', 'landmark_z'])  # Write CSV header
        writer.writerows(landmark_data)

# Define the directory paths for input videos and output CSV files
video_dir = '/content/drive/MyDrive/sign_language_dataset/videos'
output_dir = '/content/drive/MyDrive/sign_language_dataset/pose_output'

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Process all videos in the specified directory
process_videos(video_dir, output_dir)


## Data Loading and Feature Extraction

In the following code, we load sentence and gloss data from pre-defined paths and then extract pose features from images associated with each sentence. The extracted features are essential for training the machine learning model later in the workflow.


In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, Trainer, TrainingArguments, EarlyStoppingCallback
from gtts import gTTS
import IPython.display as ipd
from torch.optim.lr_scheduler import ReduceLROnPlateau  # Import learning rate scheduler

# Define the base path to the dataset stored in Google Drive.
DRIVE_BASE = '/content/drive/MyDrive/sign_language_dataset'

# Specify paths to various annotation files used in the project.
ANNOTATIONS_CORPUS_DETAILS_PATH = os.path.join(DRIVE_BASE, 'corpus_csv_files', 'ISL_CSLRT_Corpus details.xlsx')
ANNOTATIONS_FRAME_DETAILS_PATH = os.path.join(DRIVE_BASE, 'corpus_csv_files', 'ISL_CSLRT_Corpus_frame_details.xlsx')
ANNOTATIONS_WORD_DETAILS_PATH = os.path.join(DRIVE_BASE, 'corpus_csv_files', 'ISL_CSLRT_Corpus_word_details.xlsx')
GLOSSES_FILE_PATH = os.path.join(DRIVE_BASE, 'corpus_csv_files', 'ISL Corpus sign glosses.csv')

# Load the corpus details from an Excel file containing sentences.
corpus_details_df = pd.read_excel(ANNOTATIONS_CORPUS_DETAILS_PATH)

# Create a dictionary mapping sentences to themselves, ensuring only valid strings are included.
# Rows whose 'Sentences' cell is not a str (e.g. an empty cell) are dropped here.
text_data = {row['Sentences']: row['Sentences'] for _, row in corpus_details_df.iterrows() if isinstance(row['Sentences'], str)}

# Load gloss data from a CSV file, mapping glosses to corresponding sentences.
# NOTE(review): unlike the sentences above, gloss rows are not filtered by type,
# so a non-string 'SIGN GLOSSES' cell would become a dictionary key as-is.
glosses_df = pd.read_csv(GLOSSES_FILE_PATH)
gloss_text_mapping = {row['SIGN GLOSSES']: row['Sentence'] for _, row in glosses_df.iterrows()}

# Merge glosses with the existing text data, updating the dictionary.
# After this call the keys of text_data are a mix of sentences and gloss
# strings; the values are always sentences.
text_data.update(gloss_text_mapping)

# Define the directory containing pose feature data.
pose_dir = os.path.join(DRIVE_BASE, 'pose_output')

# Extract pose features for each sentence based on the corresponding folder of images.
def find_matching_folder(sentence, base_dir):
    """Return the first entry of *base_dir* whose name contains the sentence.

    The sentence is compared with spaces replaced by underscores and both
    sides lower-cased. Entries are tried in ``os.listdir`` order; ``None`` is
    returned when none of them matches.
    """
    matches = (
        folder_name
        for folder_name in os.listdir(base_dir)
        if sentence.replace(" ", "_").lower() in folder_name.lower()
    )
    return next(matches, None)

def extract_pose_features(pose_dir, folder_name):
    """Return one 128-dimensional feature row per image in the given folder.

    Images with a ``.jpg``, ``.jpeg`` or ``.png`` suffix are counted, matching
    the suffixes the pose-annotation cells accept when writing into this
    directory. A folder without images yields an empty ``(0, 128)`` array
    instead of raising from ``np.vstack`` on an empty list.

    NOTE(review): the features are random placeholders drawn with
    ``np.random.rand``; nothing is read from the image files. Replace this
    with real pose-derived features before relying on trained results.
    """
    folder_path = os.path.join(pose_dir, folder_name)
    image_files = [
        name
        for name in sorted(os.listdir(folder_path))
        if name.endswith(('.jpg', '.jpeg', '.png'))
    ]
    # One random row per image; this is a (0, 128) array when there are none.
    return np.random.rand(len(image_files), 128)

# Map every text_data key that has a matching pose folder to its feature matrix.
pose_features = {}
for sentence in text_data:
    matched_folder = find_matching_folder(sentence, pose_dir)
    if not matched_folder:
        # No pose folder for this key: it gets no entry in pose_features.
        continue
    pose_features[sentence] = extract_pose_features(pose_dir, matched_folder)


## Model Training Preparation and Definition

The following code prepares the data for training by tokenizing the text, encoding the pose features, and creating a custom dataset. Additionally, a custom neural network model is defined, which integrates BERT embeddings with pose features. Dropout is used for regularization to help prevent overfitting during training.


In [None]:
# Model Training Preparation

# The BERT tokenizer is initialized to convert sentences into tokenized input for the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# The maximum sequence length for BERT input and the dimensionality of pose features are defined.
MAX_LENGTH = 512  # BERT's maximum sequence length
FEATURE_DIM = 128  # Pose features are 128-dimensional vectors

# A function is defined to encode and pad/truncate the pose features to ensure consistent input size.
def encode_features(features, max_length):
    """Return *features* truncated or zero-padded to exactly *max_length* rows.

    The padding width is taken from the feature matrix itself, so matrices
    whose width differs from ``FEATURE_DIM`` can be padded too. ``FEATURE_DIM``
    is only used as a fallback for inputs that are not two-dimensional.
    """
    features = np.asarray(features)
    row_count = len(features)

    if row_count > max_length:
        return features[:max_length]  # Truncate if features exceed max_length

    if row_count < max_length:
        width = features.shape[1] if features.ndim >= 2 else FEATURE_DIM
        padding = np.zeros((max_length - row_count, width))  # Padding for shorter sequences
        return np.vstack((features, padding))

    return features

# Pose features are encoded for all sequences to ensure they are of consistent length.
encoded_features = {sequence: encode_features(features, MAX_LENGTH) for sequence, features in pose_features.items()}


# Labels alternate 0/1 by enumeration position over text_data's keys; they are
# not derived from the sentences or the pose data.
# NOTE(review): these look like placeholder targets - confirm the intended labels.
labels = {sentence: 0 if idx % 2 == 0 else 1 for idx, sentence in enumerate(text_data.keys())}

# A custom dataset class is defined to handle input data preparation for the model.
class PoseDataset(Dataset):
    """Dataset that pairs tokenized text with pose features and a label.

    ``features``, ``texts`` and ``labels`` are dictionaries that share the same
    keys; the keys of ``features`` define which samples exist and their order.
    The key order is captured once at construction time, so entries added to
    ``features`` afterwards are not picked up.
    """

    def __init__(self, features, texts, labels, tokenizer, max_length):
        self.features = features
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Snapshot the keys once: rebuilding this list on every __getitem__
        # made each lookup linear in the dataset size.
        self.sequences = list(features.keys())

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        feature = self.features[sequence]
        text = self.texts[sequence]
        label = self.labels[sequence]

        # The text is tokenized and padded/truncated to max_length tokens.
        encoded_text = self.tokenizer(text, add_special_tokens=True, max_length=self.max_length, truncation=True, padding='max_length', return_tensors='pt')

        # squeeze(0) removes only the batch axis added by return_tensors='pt';
        # a bare squeeze() would also drop the token axis if it had length 1.
        return {
            'input_ids': encoded_text['input_ids'].squeeze(0),
            'attention_mask': encoded_text['attention_mask'].squeeze(0),
            'pose_features': torch.tensor(feature, dtype=torch.float),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# The dataset and dataloader for model training are created.
dataset = PoseDataset(encoded_features, text_data, labels, tokenizer, MAX_LENGTH)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)  # The batch size is set to 16 for training.

# Define the Custom Model with Dropout for Regularization

# Two-class classifier that fuses BERT's pooled text embedding with averaged pose features.
class CustomModel(nn.Module):
    """Concatenate BERT's pooled output with mean-pooled pose features and classify.

    ``forward`` returns ``(loss, logits)`` when ``labels`` is given and only
    ``logits`` otherwise.
    """

    def __init__(self, bert_model_name='bert-base-uncased', feature_dim=128):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)  # Pre-trained text encoder.
        self.feature_pooling = nn.AdaptiveAvgPool1d(1)  # Collapses the pooled axis to length 1.
        self.dropout = nn.Dropout(0.5)
        hidden_size = self.bert.config.hidden_size
        self.fc1 = nn.Linear(hidden_size + feature_dim, 256)
        self.fc2 = nn.Linear(256, 2)  # Two output logits.

    def forward(self, input_ids, attention_mask, pose_features, labels=None):
        # Pooled sentence-level embedding from BERT.
        text_embedding = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output

        # Swap axes 1 and 2 so the pooling layer averages over what was axis 1
        # of the input (including any zero padding rows), then drop that axis.
        pooled_pose = self.feature_pooling(pose_features.transpose(1, 2)).squeeze(-1)

        fused = torch.cat((text_embedding, pooled_pose), dim=1)

        # Hidden layer with ReLU, dropout, then the classification head.
        logits = self.fc2(self.dropout(F.relu(self.fc1(fused))))

        if labels is None:
            return logits
        return F.cross_entropy(logits, labels), logits

# The custom model is instantiated.
model = CustomModel()



## Model Optimization, Training, and Speech Generation

In this section, the optimizer, learning rate scheduler, and training arguments are defined to train the custom model. After training, predictions are generated, and the predicted texts are converted to speech using the `gTTS` library.


In [None]:
# Optimizer, Scheduler, and Training Arguments
import inspect

# The Adam optimizer is initialized with a lowered learning rate for stable training.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)  # Learning rate is set to 1e-4

# A learning rate scheduler is defined to reduce the learning rate when the model's performance plateaus.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

# The per-epoch evaluation option is called `eval_strategy` in newer transformers
# releases and `evaluation_strategy` in older ones. Passing both, as this cell
# used to, raises a TypeError on any release that knows only one of the names,
# so pick whichever the installed version accepts.
_eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

# Training arguments are configured for the Trainer API, specifying the number of epochs, batch size, and other parameters.
training_args = TrainingArguments(
    output_dir='./results',  # Directory where the model checkpoints and results are saved
    num_train_epochs=100,  # Number of training epochs
    per_device_train_batch_size=16,  # Batch size is set to 16 per device
    # NOTE(review): presumably ignored because a custom scheduler is supplied
    # to the Trainer below - confirm against the Trainer documentation.
    warmup_steps=750,
    weight_decay=0.01,  # Weight decay setting passed to the Trainer
    logging_dir='./logs',  # Directory to save the logs
    logging_steps=10,  # Log training loss every 10 steps
    save_strategy="epoch",  # Save the model at the end of each epoch
    load_best_model_at_end=True,  # Automatically load the best model after training is complete
    metric_for_best_model="loss",  # Use evaluation loss to determine the best model
    logging_first_step=True,  # Log the first step of training
    disable_tqdm=False,  # Keep the progress bar enabled
    report_to="all",  # Report logs to all available loggers
    **{_eval_strategy_key: "epoch"},  # Evaluate at the end of each epoch
)

# The Trainer API is used to handle the training process, integrating the optimizer and scheduler.
trainer = Trainer(
    model=model,  # The custom model to be trained
    args=training_args,  # The training arguments defined above
    train_dataset=dataset,  # The training dataset
    # The same dataset is used for evaluation, so the "best model" and early
    # stopping decisions are based on training data rather than held-out data.
    eval_dataset=dataset,
    optimizers=(optimizer, scheduler),  # Optimizer and scheduler are passed to the Trainer
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]  # Early stopping if no improvement for 5 evaluations
)

# The model training is initiated using the Trainer API.
trainer.train()

# Generate Predictions and Convert to Speech

# After training, the model is used to generate predictions on the dataset.
test_predictions = trainer.predict(dataset)

# NOTE(review): `predicted_texts` is built directly from text_data's values and
# never reads `test_predictions`, so the spoken output does not depend on the
# model - confirm how predictions are meant to be mapped back to text.
predicted_texts = [text_data[sequence] for sequence in text_data.keys()]  # Assumes model output aligns with text_data

# The predicted texts are converted to speech using the gTTS library and played back in the notebook.
# Every iteration writes to the same 'output.mp3', so only the last clip remains on disk.
for predicted_text in predicted_texts:
    tts = gTTS(predicted_text)  # Generate speech from text
    tts.save('output.mp3')  # Save the generated speech to a file
    ipd.display(ipd.Audio('output.mp3', autoplay=True))  # Play the speech audio
    print(predicted_text)  # Print the predicted text for reference



## Translating and Converting Predicted Texts to Speech in Various Regional Languages

This section of the code translates the predicted texts into a user-specified language and converts the translated text into speech using the `gTTS` library. The user can choose from a predefined list of languages, and the translated speech is played back in the notebook.


In [None]:
from gtts import gTTS
from googletrans import Translator
import IPython.display as ipd

def convert_speech_to_language(predicted_texts):
    """Translate each text into a user-chosen language and play it as speech.

    The user is prompted for a language name. Matching is case-insensitive and
    ignores surrounding whitespace. If the language is not in the supported
    list, a message is printed and the function returns without doing anything.

    Args:
        predicted_texts: Iterable of strings to translate and speak.

    Returns:
        None. Audio is written to 'output.mp3' (overwritten for every text) and
        displayed in the notebook; the translated text is printed.
    """
    # Language codes passed to both googletrans (`dest`) and gTTS (`lang`),
    # keyed by lower-case language name.
    language_map = {
        'hindi': 'hi',
        'bengali': 'bn',
        'tamil': 'ta',
        'telugu': 'te',
        'marathi': 'mr',
        'gujarati': 'gu',
        'kannada': 'kn',
        'malayalam': 'ml',
        'punjabi': 'pa',
        'urdu': 'ur',
        'english': 'en',
    }

    def get_language_code(language_choice):
        """Return the language code for `language_choice`, or None if unsupported."""
        return language_map.get(language_choice.strip().lower())

    # Strip the answer so that stray spaces (e.g. "Hindi ") do not cause a rejection.
    user_language_choice = input(
        "Enter the language in which you want to convert the speech (e.g., Hindi, Tamil, Bengali): "
    ).strip()

    selected_language_code = get_language_code(user_language_choice)
    if selected_language_code is None:
        print(f"Language '{user_language_choice}' is not supported.")
        return

    # Create the translator only once the request is known to be valid.
    translator = Translator()

    for predicted_text in predicted_texts:
        try:
            translated_text = translator.translate(predicted_text, dest=selected_language_code).text

            tts = gTTS(translated_text, lang=selected_language_code)
            tts.save('output.mp3')

            ipd.display(ipd.Audio('output.mp3', autoplay=True))
            print(f"Playing the text in {user_language_choice}: {translated_text}")
        except Exception as e:
            # Best effort: report the failure for this text and continue with the next one.
            print(f"An error occurred: {e}")
            print(f"Could not convert the text: {predicted_text} into {user_language_choice}")




The following code handles the model's training and validation process over multiple epochs. During each epoch, the model's performance is evaluated by calculating the training and validation losses as well as accuracies. The model is trained using batches of data, and the accuracy of predictions is assessed at each step.


In [None]:
from sklearn.metrics import accuracy_score

# Determine the device to use (GPU if available, otherwise CPU) and move the model there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Per-epoch metric histories, used for plotting afterwards.
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []


def _run_metrics_pass(model, dataloader, device):
    """Run one forward-only pass over `dataloader`.

    Returns:
        (average loss per batch, average accuracy per batch).

    The model's train/eval mode is left as set by the caller. Gradients are
    disabled because nothing in this cell calls backward() or an optimizer.
    """
    batch_keys = ('input_ids', 'attention_mask', 'pose_features', 'labels')
    total_loss = 0
    total_accuracy = 0
    with torch.no_grad():
        for batch in dataloader:
            # Move the tensors the model consumes to the selected device.
            inputs = {key: batch[key].to(device) for key in batch_keys}

            outputs = model(**inputs)
            loss = outputs[0]    # first output: loss
            logits = outputs[1]  # second output: logits
            total_loss += loss.item()

            # Class prediction = index of the highest score along dim 1.
            predictions = torch.argmax(logits, dim=1)
            total_accuracy += accuracy_score(batch['labels'].cpu(), predictions.cpu())
    return total_loss / len(dataloader), total_accuracy / len(dataloader)


# NOTE: this loop only measures loss/accuracy. It performs no backward pass or
# optimizer step, so the model weights are not changed here.
# `num_train_epochs` may be stored as a float, which range() rejects; cast to int.
for epoch in range(int(training_args.num_train_epochs)):
    # "Training" metrics: forward passes with the model in training mode.
    model.train()
    avg_train_loss, avg_train_accuracy = _run_metrics_pass(model, dataloader, device)
    train_losses.append(avg_train_loss)
    train_accuracies.append(avg_train_accuracy)

    # "Validation" metrics: same DataLoader, model in evaluation mode.
    # Replace `dataloader` with a validation DataLoader if one is available.
    model.eval()
    avg_val_loss, avg_val_accuracy = _run_metrics_pass(model, dataloader, device)
    val_losses.append(avg_val_loss)
    val_accuracies.append(avg_val_accuracy)

    # Print the metrics for the current epoch.
    print(f"Epoch {epoch+1}: Training loss: {avg_train_loss}, Validation loss: {avg_val_loss}")
    print(f"Training accuracy: {avg_train_accuracy}, Validation accuracy: {avg_val_accuracy}")


## Visualization of Training and Validation Metrics

The following code visualises the training and validation metrics, specifically the loss and accuracy, throughout the training epochs.


In [None]:
import matplotlib.pyplot as plt

# Draw one figure per metric, each comparing the training and validation curves.
metric_curves = (
    ('Loss', train_losses, val_losses),
    ('Accuracy', train_accuracies, val_accuracies),
)
for metric_name, train_curve, val_curve in metric_curves:
    plt.figure(figsize=(12, 6))
    plt.plot(train_curve, label=f'Training {metric_name}')
    plt.plot(val_curve, label=f'Validation {metric_name}')
    plt.title(f'Training and Validation {metric_name} Over Epochs')
    plt.xlabel('Epochs')
    plt.ylabel(metric_name)
    plt.legend()
    plt.show()


## Evaluation of Predicted Texts Using BLEU and ROUGE Scores

This section of the code evaluates the quality of the predicted texts by comparing them to the reference texts using BLEU and ROUGE metrics.


In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import numpy as np

# Download the tokenizer data used by nltk.word_tokenize. Recent NLTK releases
# load 'punkt_tab' rather than 'punkt', so both are requested to keep this cell
# working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# The i-th reference (a key of `text_data`) is paired with the i-th predicted text.
reference_texts = list(text_data.keys())
bleu_scores = []   # one BLEU score per sentence pair
rouge_scores = []  # one dict of ROUGE results per sentence pair

# ROUGE-1 and ROUGE-L with stemming; smoothed sentence-level BLEU.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
smoothing_function = SmoothingFunction().method1

for i, predicted_text in enumerate(predicted_texts):
    reference_text = reference_texts[i]

    # BLEU is computed on lower-cased word tokens.
    reference_tokens = nltk.word_tokenize(reference_text.lower())
    predicted_tokens = nltk.word_tokenize(predicted_text.lower())
    bleu_scores.append(
        sentence_bleu([reference_tokens], predicted_tokens, smoothing_function=smoothing_function)
    )

    # ROUGE is computed on the raw strings (the scorer tokenizes internally).
    rouge_scores.append(scorer.score(reference_text, predicted_text))

# Averages over all sentence pairs.
average_bleu = np.mean(bleu_scores)
average_rouge1 = np.mean([score['rouge1'].fmeasure for score in rouge_scores])
average_rougeL = np.mean([score['rougeL'].fmeasure for score in rouge_scores])

print(f"Average BLEU Score: {average_bleu:.4f}")
print(f"Average ROUGE-1 F1 Score: {average_rouge1:.4f}")
print(f"Average ROUGE-L F1 Score: {average_rougeL:.4f}")

# Bar chart of the average BLEU score (BLEU lies in [0, 1]).
plt.figure(figsize=(4, 6))
plt.bar(['Average BLEU'], [average_bleu], color='purple')
plt.ylim(0, 1)
plt.ylabel('BLEU Score')
plt.title('Average BLEU Score')
plt.show()

# Line plot of the BLEU score of each sentence. `bleu_scores` holds one value
# per sentence (not per epoch), so the x-axis is the sentence number.
sentence_numbers = list(range(1, len(bleu_scores) + 1))
plt.figure(figsize=(10, 6))
plt.plot(sentence_numbers, bleu_scores, marker='o', color='blue', label='Sentence BLEU Score')
plt.xlabel('Sentence')
plt.ylabel('BLEU Score')
plt.title('BLEU Score per Sentence')
plt.grid(True)
plt.legend()
plt.show()

# Bar chart of the average ROUGE-1 and ROUGE-L F1 scores (both lie in [0, 1]).
average_rouge_values = [average_rouge1, average_rougeL]
labels = ['ROUGE-1', 'ROUGE-L']
plt.figure(figsize=(8, 6))
plt.bar(labels, average_rouge_values, color=['orange', 'green'])
plt.ylim(0, 1)
plt.ylabel('ROUGE Score')
plt.title('Average ROUGE Scores')
plt.show()


## Calculating MSE (Mean Squared Error), SNR (Signal-to-Noise Ratio) and MCD (Mel-Cepstral Distortion)

In [None]:
import librosa
import numpy as np
import torch
from gtts import gTTS

def extract_mfcc(file_path, n_mfcc=13):
    """Load an audio file and return its MFCC matrix.

    The file is read with ``librosa.load(..., sr=None)`` and the resulting
    signal and sampling rate are passed to ``librosa.feature.mfcc``.

    Args:
        file_path: Path of the audio file to analyse.
        n_mfcc: Number of MFCC coefficients to compute.
    """
    waveform, sample_rate = librosa.load(file_path, sr=None)
    return librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=n_mfcc)

def calculate_mse(mfcc1, mfcc2):
    """Return the mean squared error between two MFCC matrices.

    Both matrices are cut along axis 1 to the shorter of the two lengths
    before the element-wise difference is taken.
    """
    frames = min(mfcc1.shape[1], mfcc2.shape[1])
    residual = mfcc1[:, :frames] - mfcc2[:, :frames]
    return np.mean(residual ** 2)

# For every predicted text that is also a key of `text_data`, synthesise the
# reference and the predicted sentence with gTTS and compare their MFCCs.
for i, predicted_text in enumerate(predicted_texts):
    if predicted_text not in text_data:
        # No matching entry in text_data: report it and move on.
        print(f"Predicted text '{predicted_text}' not found in text_data. Skipping this sentence.")
        continue

    reference_file = f'reference_{i}.mp3'
    generated_file = f'generated_{i}.mp3'

    # Reference speech: the text_data entry stored under the predicted text.
    actual_sentence = text_data[predicted_text]
    gTTS(actual_sentence).save(reference_file)

    # Generated speech: the predicted text itself.
    gTTS(predicted_text).save(generated_file)

    mfcc_ref = extract_mfcc(reference_file)
    mfcc_gen = extract_mfcc(generated_file)

    mse_value = calculate_mse(mfcc_gen, mfcc_ref)
    print(f"MSE for sentence {i}: {mse_value:.4f}")


In [None]:
# Run the Trainer's training loop.
trainer.train()

# Step 7: Generate Predictions and Convert to Speech
# Run inference over the dataset with the trained model.
test_predictions = trainer.predict(dataset)

# NOTE: the texts used below are read directly from `text_data`;
# `test_predictions` is not decoded in this cell.
predicted_texts = [text_data[sequence_key] for sequence_key in text_data.keys()]

# Synthesise each text with gTTS, play it inline and echo it for reference.
for sentence in predicted_texts:
    speech = gTTS(sentence)
    speech.save('output.mp3')
    ipd.display(ipd.Audio('output.mp3', autoplay=True))
    print(sentence)

def extract_mfcc(file_path, n_mfcc=13):
    """Load an audio file and return its MFCC matrix.

    The file is read with ``librosa.load(..., sr=None)`` and the resulting
    signal and sampling rate are passed to ``librosa.feature.mfcc``.
    """
    waveform, sample_rate = librosa.load(file_path, sr=None)
    return librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=n_mfcc)

def pad_mfcc(mfcc, target_length):
    """Zero-pad a 2-D MFCC matrix along axis 1 up to `target_length` columns.

    A matrix that already has at least `target_length` columns is returned
    unchanged (it is never truncated).
    """
    missing = target_length - mfcc.shape[1]
    if missing <= 0:
        return mfcc
    filler = np.zeros((mfcc.shape[0], missing))
    return np.hstack((mfcc, filler))

# # Function to calculate Signal-to-Noise Ratio (SNR) between reference and generated audio files.
# def calculate_snr(reference_file, generated_file):
#     ref_audio, sr_ref = librosa.load(reference_file, sr=None)  # Load the reference audio file.
#     gen_audio, sr_gen = librosa.load(generated_file, sr=None)  # Load the generated audio file.

#     # Ensure both audio signals have the same length.
#     min_len = min(len(ref_audio), len(gen_audio))
#     ref_audio = ref_audio[:min_len]
#     gen_audio = gen_audio[:min_len]

#     # Calculate the noise (difference between reference and generated audio).
#     noise = ref_audio - gen_audio
#     # Calculate the Signal-to-Noise Ratio (SNR) in decibels (dB).
#     snr = 10 * np.log10(np.sum(ref_audio ** 2) / np.sum(noise ** 2))
#     return snr

def calculate_mse(mfcc1, mfcc2):
    """Return the mean squared error between two MFCC matrices.

    The shorter matrix is zero-padded (via `pad_mfcc`) to the length of the
    longer one, so frames present in only one input count as differences.
    """
    longest = max(mfcc1.shape[1], mfcc2.shape[1])
    residual = pad_mfcc(mfcc1, longest) - pad_mfcc(mfcc2, longest)
    return np.mean(residual ** 2)

# Evaluate each predicted sentence by comparing the MFCCs of its synthesised
# speech with those of the synthesised reference sentence (SNR is disabled below).
# The reference sentences are the keys of `text_data`; the list is built once
# here rather than being rebuilt on every iteration.
reference_texts = list(text_data.keys())
for i, predicted_text in enumerate(predicted_texts):
    # Generate the reference speech for the i-th reference sentence.
    reference_text = reference_texts[i]
    tts_ref = gTTS(reference_text)
    reference_file = f'reference_{i}.mp3'
    tts_ref.save(reference_file)

    # Generate the speech from the predicted text.
    tts_pred = gTTS(predicted_text)
    generated_file = f'generated_{i}.mp3'
    tts_pred.save(generated_file)

    # Extract MFCC features from both audio files.
    mfcc_ref = extract_mfcc(reference_file)
    mfcc_gen = extract_mfcc(generated_file)

    # Mean squared error between the two MFCC matrices.
    mse_value = calculate_mse(mfcc_gen, mfcc_ref)
    print(f"MSE for sentence {i}: {mse_value:.4f}")

    # # Calculate the Signal-to-Noise Ratio (SNR) between the reference and generated audio.
    # snr_value = calculate_snr(reference_file, generated_file)
    # print(f"SNR for sentence {i}: {snr_value:.2f} dB")  # Output the SNR value.

    # Play back the generated speech and echo the text.
    ipd.display(ipd.Audio(generated_file, autoplay=True))
    print(predicted_text)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Compute the MFCC mean squared error for every sentence and collect the
# values in `mse_values` for the statistics and plots below.
mse_values = []

# The reference sentences are the keys of `text_data`; the list is built once
# here rather than being rebuilt on every iteration.
reference_texts = list(text_data.keys())
for i, predicted_text in enumerate(predicted_texts):
    # Generate the reference speech for the i-th reference sentence.
    reference_text = reference_texts[i]
    tts_ref = gTTS(reference_text)
    reference_file = f'reference_{i}.mp3'
    tts_ref.save(reference_file)

    # Generate the speech from the predicted text.
    tts_pred = gTTS(predicted_text)
    generated_file = f'generated_{i}.mp3'
    tts_pred.save(generated_file)

    # Extract MFCC features from both audio files.
    mfcc_ref = extract_mfcc(reference_file)
    mfcc_gen = extract_mfcc(generated_file)

    # Mean squared error between the two MFCC matrices.
    mse_value = calculate_mse(mfcc_gen, mfcc_ref)
    mse_values.append(mse_value)
    print(f"MSE for sentence {i}: {mse_value:.4f}")

    # Play back the generated speech and echo the text.
    ipd.display(ipd.Audio(generated_file, autoplay=True))
    print(predicted_text)

# Summary statistics over all per-sentence MSE values.
mean_mse, median_mse, std_mse = (stat(mse_values) for stat in (np.mean, np.median, np.std))

# A "perfect match" is a sentence whose MSE is exactly 0.0.
total_sentences = len(mse_values)
perfect_matches = np.sum(np.array(mse_values) == 0.0)
percentage_perfect_matches = (perfect_matches / total_sentences) * 100

print("\nSummary Statistics:")
for stat_label, stat_value in (
    ("Mean MSE", mean_mse),
    ("Median MSE", median_mse),
    ("Standard Deviation of MSE", std_mse),
):
    print(f"{stat_label}: {stat_value:.4f}")
print(f"Percentage of Perfect Matches: {percentage_perfect_matches:.2f}%")

# Distribution of the MSE values as a histogram.
_, hist_ax = plt.subplots(figsize=(10, 5))
hist_ax.hist(mse_values, bins=20, color='blue', edgecolor='black')
hist_ax.set_title('Histogram of MSE Values')
hist_ax.set_xlabel('MSE')
hist_ax.set_ylabel('Frequency')
plt.show()

# Spread of the MSE values as a box plot.
_, box_ax = plt.subplots(figsize=(5, 7))
box_ax.boxplot(mse_values)
box_ax.set_title('Box Plot of MSE Values')
box_ax.set_ylabel('MSE')
plt.show()

# MSE of each individual sentence as a bar chart.
_, bar_ax = plt.subplots(figsize=(15, 7))
bar_ax.bar(range(len(mse_values)), mse_values, color='green')
bar_ax.set_title('MSE per Sentence')
bar_ax.set_xlabel('Sentence Index')
bar_ax.set_ylabel('MSE')
plt.show()


In [None]:
import librosa
import numpy as np
from scipy.spatial.distance import euclidean

def calculate_mcd(mfcc1, mfcc2):
    """Return the mean per-frame Euclidean distance between two MFCC matrices.

    Both matrices (coefficients along axis 0, frames along axis 1) are cut to
    the shorter number of frames; the distance between corresponding frame
    vectors is then averaged over all frames.

    NOTE: this is a plain average Euclidean distance. The scaling constant of
    the conventional Mel-Cepstral Distortion formula is not applied.

    Args:
        mfcc1: 2-D array of MFCCs.
        mfcc2: 2-D array of MFCCs with the same number of coefficients.

    Returns:
        The average distance as a NumPy scalar (NaN if there are no frames).
    """
    min_len = min(mfcc1.shape[1], mfcc2.shape[1])
    difference = mfcc1[:, :min_len] - mfcc2[:, :min_len]

    # One vectorized norm over the coefficient axis replaces a Python loop over frames.
    return np.mean(np.linalg.norm(difference, axis=0))

# For every predicted text that is also a key of `text_data`, synthesise the
# reference and the predicted sentence with gTTS and report their MCD.
for i, predicted_text in enumerate(predicted_texts):
    if predicted_text not in text_data:
        # No matching entry in text_data: report it and move on.
        print(f"Predicted text '{predicted_text}' not found in text_data. Skipping this sentence.")
        continue

    reference_file = f'reference_{i}.mp3'
    generated_file = f'generated_{i}.mp3'

    # Reference speech: the text_data entry stored under the predicted text.
    actual_sentence = text_data[predicted_text]
    gTTS(actual_sentence).save(reference_file)

    # Generated speech: the predicted text itself.
    gTTS(predicted_text).save(generated_file)

    mfcc_ref = extract_mfcc(reference_file)
    mfcc_gen = extract_mfcc(generated_file)

    mcd_value = calculate_mcd(mfcc_gen, mfcc_ref)
    print(f"MCD for sentence {i}: {mcd_value:.4f}")


In [None]:
import librosa
import numpy as np
from scipy.spatial.distance import euclidean

def calculate_snr(reference_file, generated_file, epsilon=1e-10):
    """Return the signal-to-noise ratio, in dB, between two audio files.

    The reference waveform is treated as the signal and the sample-wise
    difference between reference and generated waveforms as the noise. Both
    waveforms are cut to the shorter length first.

    Args:
        reference_file: Path of the reference audio file.
        generated_file: Path of the generated audio file.
        epsilon: Small constant added to the noise power to avoid division by zero.
    """
    reference_wave, _ = librosa.load(reference_file, sr=None)
    generated_wave, _ = librosa.load(generated_file, sr=None)

    # Compare only the overlapping part of the two signals.
    n_samples = min(len(reference_wave), len(generated_wave))
    reference_wave = reference_wave[:n_samples]
    residual = reference_wave - generated_wave[:n_samples]

    signal_power = np.sum(reference_wave ** 2)
    noise_power = np.sum(residual ** 2) + epsilon
    return 10 * np.log10(signal_power / noise_power)

def calculate_mcd(mfcc1, mfcc2):
    """Return the mean per-frame Euclidean distance between two MFCC matrices.

    Both matrices (coefficients along axis 0, frames along axis 1) are cut to
    the shorter number of frames; the distance between corresponding frame
    vectors is then averaged over all frames.

    NOTE: this is a plain average Euclidean distance. The scaling constant of
    the conventional Mel-Cepstral Distortion formula is not applied.

    Args:
        mfcc1: 2-D array of MFCCs.
        mfcc2: 2-D array of MFCCs with the same number of coefficients.

    Returns:
        The average distance as a NumPy scalar (NaN if there are no frames).
    """
    min_len = min(mfcc1.shape[1], mfcc2.shape[1])
    difference = mfcc1[:, :min_len] - mfcc2[:, :min_len]

    # One vectorized norm over the coefficient axis replaces a Python loop over frames.
    return np.mean(np.linalg.norm(difference, axis=0))

def extract_mfcc(file_path, n_mfcc=13):
    """Load an audio file and return its MFCC matrix.

    The file is read with ``librosa.load(..., sr=None)`` and the resulting
    signal and sampling rate are passed to ``librosa.feature.mfcc``.
    """
    waveform, sample_rate = librosa.load(file_path, sr=None)
    return librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=n_mfcc)

# Per-sentence results, filled by the loop below.
snr_values = []
mcd_values = []

# For every predicted text that is also a key of `text_data`, synthesise both
# sentences with gTTS and record the SNR and MCD between the two audio files.
for i, predicted_text in enumerate(predicted_texts):
    if predicted_text not in text_data:
        # No matching entry in text_data: report it and move on.
        print(f"Predicted text '{predicted_text}' not found in text_data. Skipping this sentence.")
        continue

    reference_file = f'reference_{i}.mp3'
    generated_file = f'generated_{i}.mp3'

    # Reference speech: the text_data entry stored under the predicted text.
    actual_sentence = text_data[predicted_text]
    gTTS(actual_sentence).save(reference_file)

    # Generated speech: the predicted text itself.
    gTTS(predicted_text).save(generated_file)

    # Waveform-level comparison.
    snr_values.append(calculate_snr(reference_file, generated_file))

    # MFCC-level comparison.
    mfcc_ref = extract_mfcc(reference_file)
    mfcc_gen = extract_mfcc(generated_file)
    mcd_values.append(calculate_mcd(mfcc_gen, mfcc_ref))

# Mean, median and standard deviation of the collected SNR values (in dB).
mean_snr, median_snr, std_snr = (stat(snr_values) for stat in (np.mean, np.median, np.std))
for stat_label, stat_value in (
    ("Mean SNR", mean_snr),
    ("Median SNR", median_snr),
    ("Standard Deviation of SNR", std_snr),
):
    print(f"{stat_label}: {stat_value:.2f} dB")

# Mean, median and standard deviation of the collected MCD values.
mean_mcd, median_mcd, std_mcd = (stat(mcd_values) for stat in (np.mean, np.median, np.std))
for stat_label, stat_value in (
    ("Mean MCD", mean_mcd),
    ("Median MCD", median_mcd),
    ("Standard Deviation of MCD", std_mcd),
):
    print(f"{stat_label}: {stat_value:.4f}")


In [None]:
import matplotlib.pyplot as plt

# One figure per metric: histogram on the left, horizontal box plot on the right.
metric_panels = (
    ('SNR', 'SNR (dB)', snr_values, 'blue'),
    ('MCD', 'MCD', mcd_values, 'green'),
)
for metric_name, axis_label, values, hist_color in metric_panels:
    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    plt.hist(values, bins=20, color=hist_color, edgecolor='black')
    plt.title(f'Histogram of {metric_name} Values')
    plt.xlabel(axis_label)
    plt.ylabel('Frequency')
    plt.grid(True)

    plt.subplot(1, 2, 2)
    plt.boxplot(values, vert=False, patch_artist=True)
    plt.title(f'Boxplot of {metric_name} Values')
    plt.xlabel(axis_label)
    plt.grid(True)

    plt.tight_layout()
    plt.show()


In [None]:
import os
from gtts import gTTS
import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt

# Folder on Google Drive that receives the generated audio (created if missing).
output_dir = '/content/drive/MyDrive/sign_language_dataset/generated_speech'
os.makedirs(output_dir, exist_ok=True)

# Synthesise every predicted text and store it as output_<index>.mp3.
for idx, predicted_text in enumerate(predicted_texts):
    audio_file_path = os.path.join(output_dir, f'output_{idx}.mp3')
    gTTS(predicted_text).save(audio_file_path)

    print(f"Saved audio: {audio_file_path}")
    print(predicted_text)


## Spectrograms of Generated Speech

In [None]:
# Display a mel spectrogram (in dB) for each audio file saved in `output_dir`.
for idx, _ in enumerate(predicted_texts):
    audio_file_path = os.path.join(output_dir, f'output_{idx}.mp3')

    # Load the audio with sr=None, then compute a 128-band mel spectrogram.
    waveform, sample_rate = librosa.load(audio_file_path, sr=None)
    mel_power = librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=128)

    # Express the power in dB relative to the spectrogram's maximum.
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    plt.figure(figsize=(10, 4))
    librosa.display.specshow(mel_db, sr=sample_rate, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title(f'Spectrogram for output_{idx}.mp3')
    plt.tight_layout()
    plt.show()


In [None]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt

# Folder on Google Drive that receives the spectrogram images (created if missing).
spectrogram_dir = '/content/drive/MyDrive/sign_language_dataset/generated_speech/spectrogram_images'
os.makedirs(spectrogram_dir, exist_ok=True)

# Render a mel spectrogram (in dB) for each saved audio file and write it as a PNG.
for idx, _ in enumerate(predicted_texts):
    audio_file_path = os.path.join(output_dir, f'output_{idx}.mp3')

    # Load the audio with sr=None, then compute a 128-band mel spectrogram.
    waveform, sample_rate = librosa.load(audio_file_path, sr=None)
    mel_power = librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=128)

    # Express the power in dB relative to the spectrogram's maximum.
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    plt.figure(figsize=(10, 4))
    librosa.display.specshow(mel_db, sr=sample_rate, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title(f'Spectrogram for output_{idx}.mp3')
    plt.tight_layout()

    # Write the image to disk and close the figure to free memory.
    spectrogram_file_path = os.path.join(spectrogram_dir, f'spectrogram_{idx}.png')
    plt.savefig(spectrogram_file_path)
    plt.close()

    print(f"Spectrogram saved: {spectrogram_file_path}")


In [None]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt

# Folder on Google Drive for spectrogram images (created if missing).
spectrogram_dir = '/content/drive/MyDrive/sign_language_dataset/generated_speech/spectrogram_images'
os.makedirs(spectrogram_dir, exist_ok=True)

# Upper bound on the number of spectrograms stacked in one figure.
max_subplots = 5

# Show the spectrograms in groups of up to `max_subplots`, one figure per group.
for i in range(0, len(predicted_texts), max_subplots):
    num_subplots = min(max_subplots, len(predicted_texts) - i)

    # squeeze=False always yields a 2-D grid, so a single subplot needs no special case.
    fig, axes_grid = plt.subplots(
        num_subplots, 1, figsize=(10, 4 * num_subplots), squeeze=False
    )
    axs = axes_grid[:, 0]

    for j, axis in enumerate(axs):
        idx = i + j
        audio_file_path = os.path.join(output_dir, f'output_{idx}.mp3')

        # Load the audio with sr=None and compute a 128-band mel spectrogram in dB.
        waveform, sample_rate = librosa.load(audio_file_path, sr=None)
        mel_power = librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=128)
        mel_db = librosa.power_to_db(mel_power, ref=np.max)

        # Draw on this group's j-th subplot.
        img = librosa.display.specshow(
            mel_db, sr=sample_rate, x_axis='time', y_axis='mel', ax=axis
        )
        axis.set_title(f'Spectrogram of the generated speech')

    # One colorbar per figure, based on the last spectrogram drawn.
    fig.colorbar(img, ax=axs, format='%+2.0f dB')

    plt.tight_layout()
    plt.show()
    plt.close(fig)  # Close the figure to free memory
