<a href="https://colab.research.google.com/github/devajithsb/Automatic-Speech-Recognition---ANN/blob/main/ANN_working.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install SpeechRecognition



In [29]:
import librosa
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras import layers
import speech_recognition as sr

def extract_features(audio_file):
    try:
        y, sr = librosa.load(audio_file, sr=16000)
        if len(y) == 0:
            print(f"Warning: {audio_file} is silent.")
            return np.zeros(13)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        if mfccs.shape[1] == 0:
            print(f"Warning: No MFCCs extracted from {audio_file}")
            return np.zeros(13)
        return np.mean(mfccs.T, axis=0)
    except Exception as e:
        print(f"Error processing {audio_file}: {e}")
        return np.zeros(13)

def speech_to_text(audio_file):
    try:
        r = sr.Recognizer()
        with sr.AudioFile(audio_file) as source:
            audio_data = r.record(source)
        return r.recognize_google(audio_data).lower()
    except sr.UnknownValueError:
        print(f"Could not understand audio: {audio_file}")
        return None
    except sr.RequestError as e:
        print(f"Could not request results: {e}")
        return None

audio_files_and_labels = [
    ("/content/sounds/Life is a beautiful (alfred-british).wav", "Life is a beautiful journey"),
    ("/content/sounds/Hello Myself Devajit.wav", "hello myself devajit"),
    ("/content/sounds/Hello My self Ryan(canadian -ryan).wav", "hello myself ryan"),
    ("/content/sounds/Hello My self Rishi(indian-rishi).wav", "hello myself rishi"),
    ("/content/sounds/Life is a beautiful (shirley-scottish).wav", "Life is a beautiful journey"),
    ("/content/sounds/Hello Myself Devajit (irish-cillian).wav", "hello myself Devajit"),
    ("/content/sounds/Hello I m Rose From .wav","Hello I'm Rose From New Zaeland"),
    ("/content/sounds/Hello Myself Devajit (irish-cillian).wav", "hello myself Devajit"),
]
X, y = [], []
for audio_path, label in audio_files_and_labels:
    if os.path.exists(audio_path):
        features = extract_features(audio_path)
        X.append(features)
        y.append(label.lower())
    else:
        print(f"Error: Audio file not found: {audio_path}")

if not X:
    exit("Error: No valid features extracted.")

X = np.array(X)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.2),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=20, batch_size=8, validation_data=(X_test, y_test))

def evaluate_model(ground_truth, predicted):
    if not ground_truth or not predicted:
        return {'precision': 0, 'recall': 0, 'f1_score': 0, 'accuracy': 0}

    gt_words = set(ground_truth.lower().split())
    pred_words = set(predicted.lower().split())

    common = gt_words & pred_words
    precision = len(common) / len(pred_words) if pred_words else 0
    recall = len(common) / len(gt_words) if gt_words else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0
    accuracy = len(common) / len(gt_words | pred_words) if (gt_words | pred_words) else 0

    return {'precision': precision, 'recall': recall, 'f1_score': f1, 'accuracy': accuracy}

total_precision = total_recall = total_f1 = total_accuracy = total_weight = 0

for audio_path, ground_truth_label in audio_files_and_labels:
    recognized_text = speech_to_text(audio_path)
    if recognized_text:
        metrics = evaluate_model(ground_truth_label, recognized_text)
        total_precision += metrics['precision']
        total_recall += metrics['recall']
        total_f1 += metrics['f1_score']
        total_accuracy += metrics['accuracy']
        total_weight += 1

if total_weight > 0:
    print("\nWeighted Averages:")
    print(f"Weighted Precision: {total_precision / total_weight:.2f}")
    print(f"Weighted Recall: {total_recall / total_weight:.2f}")
    print(f"Weighted F1-Score: {total_f1 / total_weight:.2f}")
    print(f"Weighted Accuracy: {total_accuracy / total_weight:.2f}")
else:
    print("No valid predictions to calculate weighted averages.")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.1667 - loss: 39.6902 - val_accuracy: 0.0000e+00 - val_loss: 15.8314
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.0000e+00 - loss: 36.9489 - val_accuracy: 0.0000e+00 - val_loss: 21.7970
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.1667 - loss: 18.8898 - val_accuracy: 0.0000e+00 - val_loss: 20.3339
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.1667 - loss: 18.1170 - val_accuracy: 0.0000e+00 - val_loss: 24.5048
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.6667 - loss: 20.7059 - val_accuracy: 0.0000e+00 - val_loss: 26.4119
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.3333 - loss: 30.9746 - val_accuracy: 0.0000e+00 - val_loss: 25.7224
Epoch 7/