<a href="https://colab.research.google.com/github/devajithsb/Automatic-Speech-Recognition---CNN/blob/main/CNN_working_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Speech-recognition demo: MFCC features fed to a small dense (fully connected) Keras classifier

In [None]:
!pip install SpeechRecognition

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.14.1-py3-none-any.whl.metadata (31 kB)
Downloading SpeechRecognition-3.14.1-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.1


In [None]:
import librosa
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import speech_recognition as sr
# 1. Extract Features from Audio
def extract_features(audio_file):
    """Summarise one audio file as a single averaged MFCC vector.

    The file is loaded through librosa with ``sr=16000``, 13 MFCCs are
    computed, and the coefficient matrix is averaged over axis 0 of its
    transpose (presumably the frame axis, leaving one value per coefficient).

    Args:
        audio_file: Path of the audio file to load.

    Returns:
        The averaged MFCC vector, or ``None`` (after printing a message) when
        the loaded signal is empty, no MFCC columns are produced, or any
        exception is raised while processing.
    """
    try:
        signal, sample_rate = librosa.load(audio_file, sr=16000)
        if not len(signal):
            print(f"Warning: {audio_file} is silent.")
            return None

        coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)
        column_count = coefficients.shape[1]
        if column_count == 0:
            print(f"Warning: No MFCCs extracted from {audio_file}")
            return None

        # Mean pooling: collapse the matrix to one flat feature vector.
        return coefficients.T.mean(axis=0)
    except Exception as err:
        print(f"Error processing {audio_file}: {err}")
        return None

# 2. Convert Speech to Text (using Google Speech Recognition)
def speech_to_text(audio_file):
    """Transcribe an audio file with the Google Speech Recognition backend.

    Args:
        audio_file: Path of the audio file handed to ``sr.AudioFile``.

    Returns:
        The recognised text in lowercase, or ``None`` (after printing a
        message) when the audio cannot be understood, the service request
        fails, or any other exception is raised.
    """
    try:
        recognizer = sr.Recognizer()
        with sr.AudioFile(audio_file) as audio_source:
            recording = recognizer.record(audio_source)
        # Requires internet connection
        transcript = recognizer.recognize_google(recording)
        return transcript.lower()
    except sr.RequestError as err:
        print(f"Could not request results from Google Speech Recognition service; {err}")
        return None
    except sr.UnknownValueError:
        print(f"Could not understand audio: {audio_file}")
        return None
    except Exception as err:  # Any other failure (e.g. unreadable file)
        print(f"Error in speech_to_text: {err}")
        return None

# 3. Dataset
# Each entry pairs an audio path with its reference transcript.
audio_files_and_labels = [
    ("sounds/Life is a beautiful (alfred-british).wav", "Life is a beautiful journey"),  # Use actual text labels
    ("sounds/Hello Myself Devajit.wav", "hello myself devajit"),
    ("sounds/Hello My self Ryan(canadian -ryan).wav", "hello myself ryan"),
    ("sounds/Hello My self Rishi(indian-rishi).wav", "hello myself rishi"),
    ("sounds/Hello Myself Devajit (irish-cillian).wav", "hello myself Devajit"),
    ("sounds/Life is a beautiful (shirley-scottish).wav", "Life is a beautiful journey"),
]

# Build the feature matrix X and the parallel label list y, skipping (with a
# printed message) every file that is missing or yields no features.
X, y = [], []
for audio_path, label in audio_files_and_labels:
    if not os.path.exists(audio_path):
        print(f"Error: Audio file not found: {audio_path}")
        continue

    features = extract_features(audio_path)
    if features is None:
        print(f"Warning: Features not extracted for {audio_path}")
        continue

    X.append(features)
    y.append(label.lower())  # Keep labels lowercase

if not X:
    # X and y always grow together, so checking one is enough. SystemExit is
    # raised directly because the `exit` helper is an interactive-shell
    # convenience and is not reliable inside scripts or notebook kernels.
    raise SystemExit("Error: No valid features extracted. Check your audio files.")

X = np.array(X)

# 4. Encode Labels (fitted once on the full label list, before the split)
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# 5. Split Data (features together with the already-encoded labels)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# 6. Build DNN Model
# The feature width is declared with an explicit Input object as the first
# entry: recent Keras versions emit a UserWarning when `input_shape` is passed
# to a layer inside a Sequential model and recommend `Input(shape)` instead.
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    # One softmax output per distinct label known to the encoder.
    layers.Dense(len(label_encoder.classes_), activation='softmax')
])

# 7. Compile and Train Model
# Labels are integer class ids (not one-hot vectors), hence the sparse loss.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()
# The held-out split is passed as validation data for the per-epoch report.
model.fit(X_train, y_train, epochs=10, batch_size=8, validation_data=(X_test, y_test))

# 8. Evaluate Model (Word-Level Metrics)
def evaluate_model(ground_truth, predicted):
    """Score a predicted transcript against its reference at word level.

    Both strings are lowercased and split on whitespace. Words are compared
    as multisets (bags), so a repeated word counts once per occurrence on
    both sides; an exact match therefore always scores 1.0 even when the
    sentence repeats a word.

    Args:
        ground_truth: Reference transcript, or ``None``.
        predicted: Recognised transcript, or ``None``.

    Returns:
        A dict with keys ``'precision'``, ``'recall'``, ``'f1_score'`` and
        ``'accuracy'``:
          * precision - matched words / predicted words
          * recall    - matched words / reference words
          * f1_score  - harmonic mean of precision and recall
          * accuracy  - matched words / size of the multiset union
        Every value is 0 when either input is ``None`` or the corresponding
        denominator is zero.
    """
    from collections import Counter

    if ground_truth is None or predicted is None:
        return {'precision': 0, 'recall': 0, 'f1_score': 0, 'accuracy': 0}

    truth_counts = Counter(ground_truth.lower().split())
    predicted_counts = Counter(predicted.lower().split())

    # `&` keeps the minimum count per word, `|` keeps the maximum, so the
    # numerator and every denominator are measured in the same unit
    # (word occurrences) instead of mixing set sizes with list lengths.
    matched = sum((truth_counts & predicted_counts).values())
    predicted_total = sum(predicted_counts.values())
    truth_total = sum(truth_counts.values())
    union_total = sum((truth_counts | predicted_counts).values())

    precision = matched / predicted_total if predicted_total else 0
    recall = matched / truth_total if truth_total else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
    accuracy = matched / union_total if union_total else 0
    return {'precision': precision, 'recall': recall, 'f1_score': f1, 'accuracy': accuracy}

# 9. Model Prediction and Evaluation
# Running sums of each metric, plus the total weight used as the divisor.
total_precision = 0
total_recall = 0
total_f1 = 0
total_accuracy = 0
total_weight = 0

for audio_path, ground_truth_label in audio_files_and_labels:
    recognized_text = speech_to_text(audio_path)
    if recognized_text is None:
        # No transcript for this file: it contributes nothing to the totals.
        continue

    print(f"Audio: {audio_path}")
    print(f"Ground Truth: {ground_truth_label}")
    print(f"Recognized Text: {recognized_text}")

    metrics = evaluate_model(ground_truth_label, recognized_text)
    weight = 1  # every transcribed file counts equally
    total_weight += weight
    total_precision += metrics['precision'] * weight
    total_recall += metrics['recall'] * weight
    total_f1 += metrics['f1_score'] * weight
    total_accuracy += metrics['accuracy'] * weight

# 10. Display Weighted Averages
if total_weight <= 0:
    print("No valid predictions to calculate weighted averages.")
else:
    print("\nWeighted Averages:")
    print(f"Weighted Precision: {total_precision / total_weight:.2f}")
    print(f"Weighted Recall: {total_recall / total_weight:.2f}")
    print(f"Weighted F1-Score: {total_f1 / total_weight:.2f}")
    print(f"Weighted Accuracy: {total_accuracy / total_weight:.2f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0000e+00 - loss: 26.8986 - val_accuracy: 0.0000e+00 - val_loss: 25.3912
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step - accuracy: 0.5000 - loss: 13.9438 - val_accuracy: 0.0000e+00 - val_loss: 23.7656
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step - accuracy: 0.2500 - loss: 34.1374 - val_accuracy: 0.0000e+00 - val_loss: 22.1333
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step - accuracy: 0.7500 - loss: 15.1134 - val_accuracy: 0.0000e+00 - val_loss: 19.5149
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step - accuracy: 0.0000e+00 - loss: 28.3133 - val_accuracy: 0.0000e+00 - val_loss: 16.8044
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step - accuracy: 0.0000e+00 - loss: 27.4013 - val_accuracy: 0.0000e+00 - val_loss: 15.602