# Final: speaker identification from voice recordings

In [1]:
import os
import pandas as pd
import numpy as np
import librosa
from sklearn import preprocessing
import tensorflow as tf
import python_speech_features as mfcc

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import joblib

In [2]:
def extract_feature(y, sr, n_mfcc=13):
    """Return the per-coefficient mean of the MFCCs of an audio signal.

    Parameters
    ----------
    y : np.ndarray
        Audio samples, forwarded unchanged to ``librosa.feature.mfcc``.
    sr : int
        Sampling rate of ``y``.
    n_mfcc : int, optional
        Number of MFCC coefficients to compute. Defaults to 13, the value
        that used to be hard-coded here, so existing callers are unaffected.

    Returns
    -------
    np.ndarray
        The MFCC matrix averaged over axis 0 of its transpose, i.e. one
        mean value per coefficient for a 2-D librosa result.
    """
    mfcc_matrix = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    # Note: this is a plain average; no scaling/normalisation happens here.
    return np.mean(mfcc_matrix.T, axis=0)

In [3]:
# Load the saved Keras model once; predict_speaker() reads it through this
# module-level name, so this cell must run before any prediction cell.
model = tf.keras.models.load_model("./model/model_3.h5")

In [4]:
# Locations of the fitted preprocessing objects used at inference time.
_SCALER_PATH = "./preprocessing_helper/standard_scaler.joblib"
_LABEL_ENCODER_PATH = "./preprocessing_helper/label_encoder.joblib"

# Names of the six summary features, in the order they are fed to the scaler.
_SUMMARY_COLUMNS = [
    "chroma_stft",
    "rmse",
    "spectral_centroid",
    "spectral_bandwidth",
    "rolloff",
    "zero_crossing_rate",
]

# path -> loaded object; avoids re-reading the same file on every prediction.
_ARTIFACT_CACHE = {}


def _load_artifact(path):
    """Load a joblib artifact the first time it is requested and reuse it afterwards."""
    if path not in _ARTIFACT_CACHE:
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code. Only point these paths at artifacts you produced yourself.
        _ARTIFACT_CACHE[path] = joblib.load(path)
    return _ARTIFACT_CACHE[path]


def predict_speaker(filename):
    """Predict the speaker label for one audio file.

    The file is loaded as mono audio, leading/trailing silence is trimmed,
    and a single feature row is built from the mean of six spectral
    descriptors followed by the mean MFCCs from ``extract_feature``. The row
    is scaled with the saved scaler, classified by the module-level ``model``
    and decoded with the saved label encoder.

    Parameters
    ----------
    filename : str
        Path of the audio file to classify.

    Returns
    -------
    np.ndarray
        The result of ``label_encoder.inverse_transform`` for the single
        input row (one predicted label).

    Raises
    ------
    ValueError
        If no samples remain after trimming (e.g. a silent recording).
    """
    # audio features
    audio, sr = librosa.load(filename, mono=True)
    audio, _ = librosa.effects.trim(audio)
    if audio.size == 0:
        raise ValueError(f"No audio samples left after trimming: {filename!r}")

    # audio data
    # (named mfcc_means so it does not shadow the imported `mfcc` module alias)
    mfcc_means = extract_feature(audio, sr)
    summary_means = [
        np.mean(librosa.feature.chroma_stft(y=audio, sr=sr)),
        np.mean(librosa.feature.rms(y=audio)),
        np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr)),
        np.mean(librosa.feature.spectral_bandwidth(y=audio, sr=sr)),
        np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sr)),
        np.mean(librosa.feature.zero_crossing_rate(audio)),
    ]

    # One row: six summary values followed by the MFCC means.
    features = np.concatenate([summary_means, mfcc_means]).reshape(1, -1)

    # header for dataframe; the MFCC column count follows the vector length
    df_columns = _SUMMARY_COLUMNS + [f"mfcc_{i}" for i in range(len(mfcc_means))]
    df = pd.DataFrame(features, columns=df_columns)

    # scale the features
    scaler = _load_artifact(_SCALER_PATH)
    X = scaler.transform(df)

    # predict
    y_p = np.argmax(model.predict(X), axis=1)

    # decode the labels
    label_encoder = _load_artifact(_LABEL_ENCODER_PATH)
    return label_encoder.inverse_transform(y_p)

## Test on random data

In [5]:
# Sanity check on a single recording from the test folder.
# (Plain string: the original f-string had no placeholders.)
filename = "./dataset/test/2193263_Awaaaz.wav"
predict_speaker(filename)

array([2193263], dtype=int64)

In [6]:
test_files = "./dataset/test_random/"

for file in os.listdir(test_files):
    # os.path.join keeps the path valid even if test_files loses its trailing slash.
    file_path = os.path.join(test_files, file)
    # The text before the first "_" is printed as the actual roll number;
    # the remaining pieces are shown as the test description.
    actual_roll, *test_words = file.split("_")
    print("Actual roll_no:", actual_roll, "test: ", test_words)
    roll = predict_speaker(file_path)
    print("Predicted roll_no:", roll[0])
    print()

Actual roll_no: 2193119 test:  ['aur', 'vai.wav']
Predicted roll_no: 2193119

Actual roll_no: 2193119 test:  ['awaaz.wav']
Predicted roll_no: 2193230

Actual roll_no: 2193119 test:  ['hello', 'this', 'is', 'me.wav']
Predicted roll_no: 2193119

Actual roll_no: 2193119 test:  ['in', 'middle.wav']
Predicted roll_no: 2196003

Actual roll_no: 2193119 test:  ['i', 'got.wav']
Predicted roll_no: 2196003

Actual roll_no: 2193119 test:  ['k.wav']
Predicted roll_no: 2193119

Actual roll_no: 2193119 test:  ['kaise', 'ho.wav']
Predicted roll_no: 2193119

Actual roll_no: 2193119 test:  ['mic', 'testing.wav']
Predicted roll_no: 2193119

Actual roll_no: 2193274 test:  ['1.wav']
Predicted roll_no: 2193274

Actual roll_no: 2193274 test:  ['2.wav']
Predicted roll_no: 2193274

Actual roll_no: 2193274 test:  ['3.wav']
Predicted roll_no: 2193057

Actual roll_no: 2193274 test:  ['4.wav']
Predicted roll_no: 2193274



In [17]:
# Predict the speaker of a standalone recording.
# (Plain string: the original f-string had no placeholders.)
filename = "./new_file.wav"
predict_speaker(filename)

  return f(*args, **kwargs)


array([2193057], dtype=int64)