In [4]:
import librosa
import numpy as np
import matplotlib.pyplot as plt
from pydub import AudioSegment



In [5]:
def eucl_dist(vec1, vec2):
    """Distance between two feature arrays after trimming to a common width.

    Both arrays are cut along axis 1 to the smaller of the two widths, then
    ``np.linalg.norm`` (default arguments) is applied to their element-wise
    difference.

    Parameters
    ----------
    vec1, vec2 : numpy arrays with at least two dimensions (``shape[1]`` is
        read); their other dimensions must allow the subtraction.

    Returns
    -------
    The norm of ``vec1 - vec2`` over the shared columns.
    """
    shared_cols = min(vec1.shape[1], vec2.shape[1])
    # Slicing the already-shorter array to shared_cols leaves it unchanged,
    # so both operands can be trimmed unconditionally.
    delta = vec1[:, :shared_cols] - vec2[:, :shared_cols]
    return np.linalg.norm(delta)

def cosine_sim(vec1,vec2):
    """Cosine similarity between two feature arrays trimmed to a common width.

    Both arrays are cut along axis 1 to the smaller of the two widths,
    flattened to 1-D, and compared with the usual
    ``dot(a, b) / (|a| * |b|)`` formula.

    Parameters
    ----------
    vec1, vec2 : numpy arrays with at least two dimensions (``shape[1]`` is
        read); after trimming they must flatten to the same length.

    Returns
    -------
    The cosine similarity, or ``0.0`` when either trimmed array has zero
    norm (the ratio is undefined there and would otherwise evaluate to NaN).
    """
    shared_cols = min(vec1.shape[1], vec2.shape[1])
    flat1 = vec1[:, :shared_cols].flatten()
    flat2 = vec2[:, :shared_cols].flatten()

    norm_product = np.linalg.norm(flat1) * np.linalg.norm(flat2)
    # An all-zero input makes the denominator zero; return a defined value
    # instead of dividing 0 by 0 and propagating NaN.
    if norm_product == 0:
        return 0.0
    return np.dot(flat1, flat2) / norm_product

In [6]:
# PROPERTY OF DEEPMIND1234
def extract_features(file_path):
    """Build a stacked feature array for one audio file.

    The file is loaded with ``librosa.load(file_path, sr=22050)`` and six
    librosa feature arrays are computed from the signal. They are joined
    along axis 0 in this order: MFCC (``n_mfcc=13``), chroma (CQT),
    spectral contrast, spectral centroid, zero-crossing rate, spectral
    bandwidth.

    Parameters
    ----------
    file_path : path of the audio file handed to ``librosa.load``.

    Returns
    -------
    The concatenated feature array.
    """
    signal, rate = librosa.load(file_path, sr=22050)

    # The list order fixes the row order of the result; callers compare
    # these arrays element-wise, so it must stay stable.
    feature_blocks = [
        librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13),
        librosa.feature.chroma_cqt(y=signal, sr=rate),
        librosa.feature.spectral_contrast(y=signal, sr=rate),
        librosa.feature.spectral_centroid(y=signal, sr=rate),
        librosa.feature.zero_crossing_rate(signal),
        librosa.feature.spectral_bandwidth(y=signal, sr=rate),
    ]
    return np.concatenate(feature_blocks, axis=0)

In [7]:
# Reference feature arrays, one per enrolled recording, extracted in the
# order the paths are listed.
features_d, features_a, features_s, features_ta = map(
    extract_features,
    (
        'media/deepak_test_1.wav',
        'media/aadarsh_test_1.wav',
        'media/sean_test_1.wav',
        'media/test_vector.wav',
    ),
)

  return f(*args, **kwargs)


In [8]:
# Pairwise distances between the three speaker references, one per line:
# deepak-aadarsh, deepak-sean, sean-aadarsh.
print(
    eucl_dist(features_d, features_a),
    eucl_dist(features_d, features_s),
    eucl_dist(features_s, features_a),
    sep="\n",
)

12053.565439607575
14103.502609334624
16035.969697392145


In [9]:
# Distance from each speaker reference to the test-vector recording, one
# per line: deepak, aadarsh, sean.
print(
    eucl_dist(features_d, features_ta),
    eucl_dist(features_a, features_ta),
    eucl_dist(features_s, features_ta),
    sep="\n",
)

18596.916229174458
19356.916498702696
23659.574096291326


In [10]:
def NearestNeighbour(test_features,voice_set):
    """Return the key in ``voice_set`` whose features are closest to the query.

    Each candidate is scored with ``eucl_dist(test_features, value)`` and the
    score is printed as ``"<key>:<distance>"``. The candidate with the
    strictly smallest distance wins; on ties the first one iterated is kept.

    Parameters
    ----------
    test_features : feature array of the clip to identify.
    voice_set : mapping of label -> reference feature array.

    Returns
    -------
    The winning label, or ``None`` when ``voice_set`` is empty or no
    distance compares below infinity (for example, every distance is NaN).
    """
    # Start from infinity rather than a large finite sentinel so that a
    # candidate is never rejected merely for being very far away.
    best_distance = float("inf")
    similar = None
    for key, value in voice_set.items():
        distance = eucl_dist(test_features, value)
        #distance = cosine_sim(test_features,value)
        print(str(key) + ":"+ str(distance))
        if distance < best_distance:
            best_distance = distance
            similar = key
    return similar

In [12]:
import speech_recognition as sr

# Reference feature arrays keyed by the label NearestNeighbour reports.
feature_dict = dict(
    deepak=features_d,
    aadarsh=features_a,
    sean=features_s,
    TA=features_ta,
)
# NOTE(review): this module-level path is not read by any visible cell;
# recognize_speech takes its own file_path argument.
file_path = 'media/deepak_test_1.wav'
# Shared recognizer instance used by recognize_speech.
r = sr.Recognizer()
def recognize_speech(file_path):
    """Transcribe an audio file with the module-level recognizer ``r``.

    The file is opened with ``sr.AudioFile``, captured via ``r.record`` and
    passed to ``r.recognize_google``.

    Parameters
    ----------
    file_path : path of the audio file to transcribe.

    Returns
    -------
    The recognized text, or ``None`` when recognition fails. In that case a
    diagnostic message is printed: ``sr.UnknownValueError`` means the audio
    could not be understood, ``sr.RequestError`` means the service request
    failed.
    """
    with sr.AudioFile(file_path) as source:
        audio = r.record(source)
    try:
        return r.recognize_google(audio)
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition service; {0}".format(e))
    # Both handled failures land here. Returning None explicitly avoids the
    # UnboundLocalError that returning a never-assigned local would raise.
    return None

In [13]:
# Identify the speaker of the deepak inference clip against the reference
# set, transcribe the same clip, and report both.
# NOTE(review): the recorded output for this cell names "aadarsh" as the
# nearest reference even though the clip is deepak's - the distance measure
# does not separate these speakers reliably.
feature_d_new = extract_features('media/deepak_inference.wav')
detected_speaker = NearestNeighbour(feature_d_new, feature_dict)
r_text = recognize_speech('media/deepak_inference.wav')
print(f'{detected_speaker} says {r_text}')

deepak:24316.4523758576
aadarsh:21842.77581350192
sean:24534.705424865075
TA:24019.941095867864
aadarsh says testing testing


In [14]:
# Identify the speaker of the aadarsh inference clip against the reference
# set, transcribe the same clip, and report both.
# Reuses the detected_speaker / r_text names from the previous cell, so
# their values are overwritten here.
feature_a_new = extract_features('media/aadarsh_inference.wav')
detected_speaker = NearestNeighbour(feature_a_new, feature_dict)
r_text = recognize_speech('media/aadarsh_inference.wav')
print(f'{detected_speaker} says {r_text}')

deepak:24706.525365361664
aadarsh:21631.604161588373
sean:28138.97313845313
TA:25688.89313801095
aadarsh says testing testing my audio
