<a href="https://colab.research.google.com/github/codewithroger/TTL-Project-Voice-Recognition-/blob/main/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Step 2: Import libraries
import os
import numpy as np
import pandas as pd
import librosa
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Activation, BatchNormalization
import joblib


In [5]:
# Step 2: Import libraries
import os

import joblib
import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Dropout, Activation, BatchNormalization
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.utils import to_categorical

# Step 3: Locate the project folder inside Google Drive.
# Point base_dir at your own copy of the project. The /content/drive prefix
# presumably requires Drive to be mounted first -- that step is not shown here.
base_dir = '/content/drive/MyDrive/TTL'
voice_dir, model_dir = (
    os.path.join(base_dir, subfolder) for subfolder in ('Voice', 'models')
)

# Step 4: Read the metadata table and preview its columns and first rows.
metadata_path = os.path.join(voice_dir, 'frames_metadata.csv')
metadata = pd.read_csv(metadata_path)
print("Metadata columns:", metadata.columns)
print(metadata.head())

Metadata columns: Index(['frame_name', 'fsID', 'start', 'end', 'salience', 'fold', 'classId',
       'Label'],
      dtype='object')
          frame_name  fsID  start   end  salience  fold  classId   Label
0  Voice_frame_0.wav     0    0.0   5.0         1     1        0  Yogesh
1  Voice_frame_1.wav     1    5.0  10.0         1     1        0  Yogesh
2  Voice_frame_2.wav     2   10.0  15.0         1     1        0  Yogesh
3  Voice_frame_3.wav     3   15.0  20.0         1     1        0  Yogesh
4  Voice_frame_4.wav     4   20.0  25.0         1     1        0  Yogesh


In [6]:
def extract_features(file_name):
    """Summarise one audio file as a single 1-D feature vector.

    The file is loaded with librosa at a 44100 Hz sample rate and five
    descriptors are computed, in this order: 40 MFCCs (with fmax=8000),
    chroma, mel spectrogram, spectral contrast, and tonnetz of the harmonic
    component. Each descriptor matrix is averaged along its last axis
    (the frame axis in librosa's layout) and the averages are concatenated.

    Args:
        file_name: Path of the audio file to load.

    Returns:
        A 1-D numpy array of concatenated averages, or None when any step
        raises. The error is printed rather than propagated, so callers
        must check for None.
    """
    try:
        audio, sample_rate = librosa.load(file_name, sr=44100)

        # Order matters: it fixes the column layout the classifier is trained on.
        descriptor_matrices = (
            librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40, fmax=8000),
            librosa.feature.chroma_stft(y=audio, sr=sample_rate),
            librosa.feature.melspectrogram(y=audio, sr=sample_rate),
            librosa.feature.spectral_contrast(y=audio, sr=sample_rate),
            librosa.feature.tonnetz(y=librosa.effects.harmonic(audio), sr=sample_rate),
        )

        # Collapse every matrix to one value per row before concatenating.
        return np.hstack([np.mean(matrix.T, axis=0) for matrix in descriptor_matrices])
    except Exception as e:
        # Best-effort by design: a bad clip is reported and skipped by the caller.
        print(f"Error processing {file_name}: {e}")
        return None

# Step 6: Build the feature matrix and the matching label list.
# Clips are looked up at <voice_dir>/fold<fold>/<frame_name>; a clip that is
# missing on disk or fails extraction is reported and left out of both lists,
# so features and labels always stay aligned.
features, labels = [], []

for row_number, row in metadata.iterrows():
    file_path = os.path.join(voice_dir, f'fold{row["fold"]}', row["frame_name"])

    if os.path.exists(file_path):
        print(f"Processing file: {file_path}")
        feature = extract_features(file_path)

        if feature is None:
            print(f"Failed to extract features from {file_path}")
        else:
            features.append(feature)
            labels.append(row["Label"])
    else:
        print(f"File does not exist: {file_path}")

# Stop early with a clear message instead of failing later inside the model code.
if not labels:
    raise ValueError("No labels found. Ensure the audio files are accessible and features are being extracted correctly.")

X, y = np.array(features), np.array(labels)

# Step 7: Encode labels
# Map the string labels to integer class ids, then one-hot encode them for
# the categorical cross-entropy loss. `le` is kept so predictions can be
# mapped back to label names later.
le = LabelEncoder()
class_ids = le.fit_transform(y)
yy = to_categorical(class_ids)

# Step 8: Train/test split
# Hold out 20% of the samples; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    yy,
    random_state=42,
    test_size=0.2,
)

# Step 9: Build model
# Fully connected classifier: three hidden stages of
# Dense -> ReLU -> BatchNormalization -> Dropout(0.5), followed by a softmax
# layer with one unit per class known to the fitted LabelEncoder.
model = Sequential()

# Declare the input width with an explicit Input object. Passing `input_shape`
# to the first Dense layer makes Keras emit a UserWarning asking for exactly
# this form in Sequential models.
model.add(Input(shape=(X_train.shape[1],)))

for hidden_units in (512, 256, 128):
    model.add(Dense(hidden_units))
    model.add(Activation('relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

model.add(Dense(len(le.classes_)))
model.add(Activation('softmax'))

# Step 10: Compile model
# Categorical cross-entropy matches the one-hot targets; accuracy is tracked
# as the reporting metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Step 11: Train model (reduce epochs for testing)
# The held-out split doubles as validation data, so the per-epoch val_*
# numbers and the final test score below come from the same samples.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=32,
    verbose=1,
)

# Step 12: Evaluate model
# evaluate() returns [loss, accuracy] in the order given to compile().
score = model.evaluate(X_test, y_test, verbose=0)
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')


Processing file: /content/drive/MyDrive/TTL/Voice/fold1/Voice_frame_0.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold1/Voice_frame_1.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold1/Voice_frame_2.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold1/Voice_frame_3.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold1/Voice_frame_4.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold1/Voice_frame_5.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold1/Voice_frame_6.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold1/Voice_frame_7.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold1/Voice_frame_8.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold1/Voice_frame_9.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold1/Voice_frame_10.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold1/Voice_frame_11.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold1/Voice_frame_12.wav
Processing file: /content/drive/MyD



Processing file: /content/drive/MyDrive/TTL/Voice/fold2/vikram_frame_0.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold2/vikram_frame_1.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold2/vikram_frame_2.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold2/vikram_frame_3.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold2/vikram_frame_4.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold2/vikram_frame_5.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold2/vikram_frame_6.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold2/vikram_frame_7.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold2/vikram_frame_8.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold2/vikram_frame_9.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold2/vikram_frame_10.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold2/vikram_frame_11.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold2/vikram_frame_12.wav
Processing file: /cont



Processing file: /content/drive/MyDrive/TTL/Voice/fold3/sushan_frame_0.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold3/sushan_frame_1.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold3/sushan_frame_2.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold3/sushan_frame_3.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold3/sushan_frame_4.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold3/sushan_frame_5.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold3/sushan_frame_6.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold3/sushan_frame_7.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold3/sushan_frame_8.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold3/sushan_frame_9.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold3/sushan_frame_10.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold3/sushan_frame_11.wav
Processing file: /content/drive/MyDrive/TTL/Voice/fold3/sushan_frame_12.wav
Processing file: /cont

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 41ms/step - accuracy: 0.4566 - loss: 1.5648 - val_accuracy: 0.5051 - val_loss: 4.9426
Epoch 2/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.8162 - loss: 0.5489 - val_accuracy: 0.5859 - val_loss: 2.6180
Epoch 3/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.8879 - loss: 0.3271 - val_accuracy: 0.5152 - val_loss: 1.7588
Epoch 4/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.9535 - loss: 0.1432 - val_accuracy: 0.5556 - val_loss: 1.4096
Epoch 5/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9658 - loss: 0.0995 - val_accuracy: 0.6566 - val_loss: 0.9421
Epoch 6/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.9638 - loss: 0.1320 - val_accuracy: 0.8485 - val_loss: 0.3951
Epoch 7/20
[1m13/13[0m [32m━━━━

In [7]:

# NOTE(review): this cell re-runs Steps 7-12, which the feature-extraction
# cell above already executes. Running it builds a fresh model and retrains
# it, replacing `le`, the split arrays, `model`, `history` and `score`.
# Consider keeping only one of the two copies.

# Step 7: Encode labels
# String labels -> integer class ids -> one-hot rows for the softmax output.
le = LabelEncoder()
yy = to_categorical(le.fit_transform(y))

# Step 8: Train/test split
# 20% hold-out; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, yy, test_size=0.2, random_state=42)

# Step 9: Build model
# Three hidden stages of Dense -> ReLU -> BatchNormalization -> Dropout(0.5),
# then a softmax layer with one unit per encoded class.
model = Sequential()

# Use an explicit Input object for the feature width. Passing `input_shape`
# to the first Dense layer makes Keras emit a UserWarning in Sequential models.
model.add(Input(shape=(X_train.shape[1],)))

for hidden_units in (512, 256, 128):
    model.add(Dense(hidden_units))
    model.add(Activation('relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

model.add(Dense(len(le.classes_)))
model.add(Activation('softmax'))

# Step 10: Compile model
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

# Step 11: Train model (reduce epochs for testing)
# The held-out split is also used as validation data during training.
history = model.fit(X_train, y_train, batch_size=32, epochs=20, validation_data=(X_test, y_test), verbose=1)

# Step 12: Evaluate model
# evaluate() returns [loss, accuracy] in the order given to compile().
score = model.evaluate(X_test, y_test, verbose=0)
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')

Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 42ms/step - accuracy: 0.4459 - loss: 1.6485 - val_accuracy: 0.3939 - val_loss: 5.1773
Epoch 2/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.8331 - loss: 0.4081 - val_accuracy: 0.5960 - val_loss: 2.1000
Epoch 3/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.8876 - loss: 0.3129 - val_accuracy: 0.5152 - val_loss: 1.6791
Epoch 4/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.9689 - loss: 0.1242 - val_accuracy: 0.5657 - val_loss: 1.3669
Epoch 5/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9242 - loss: 0.2146 - val_accuracy: 0.5657 - val_loss: 2.1752
Epoch 6/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9499 - loss: 0.1476 - val_accuracy: 0.6263 - val_loss: 1.2333
Epoch 7/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━

In [8]:
# Step 13: Save model and LabelEncoder
# The network is written in HDF5 format and the fitted LabelEncoder is pickled
# next to it, so predicted class ids can be mapped back to label names later.
model_path = os.path.join(model_dir, 'Voice_recognition.h5')
encoder_path = os.path.join(model_dir, 'Voice_recognition.pkl')

model.save(model_path)
joblib.dump(le, encoder_path)

print("Model and LabelEncoder have been saved successfully.")

# Step 14: Predict function
def predict_sound_label(file_name):
    """Predict the label of one audio file with the trained model.

    Relies on the notebook-level `model` and `le` objects and on
    `extract_features` for the input vector.

    Args:
        file_name: Path of the audio file to classify.

    Returns:
        The label with the highest predicted probability, or None when
        feature extraction fails (a message is printed in that case).
    """
    feature = extract_features(file_name)

    if feature is not None:
        # The model expects a batch, so present the vector as a single row.
        single_row_batch = feature.reshape(1, -1)
        class_probabilities = model.predict(single_row_batch)
        best_class_id = np.argmax(class_probabilities, axis=1)
        return le.inverse_transform(best_class_id)[0]

    print(f"Unable to extract features from {file_name}")
    return None

# Step 15: Test with a sample file
# NOTE(review): this clip was among the files processed during feature
# extraction, so this is a smoke test of the prediction path rather than an
# evaluation on unseen audio.
test_file = os.path.join(voice_dir, 'fold3', 'sushan_frame_0.wav')

if not os.path.exists(test_file):
    print(f"Error: Test file not found at {test_file}. Please ensure the file exists and the path is correct.")
else:
    predicted_label = predict_sound_label(test_file)
    print(f"The predicted label for the sound is: {predicted_label}")



Model and LabelEncoder have been saved successfully.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 161ms/step
The predicted label for the sound is: Sushan
