In [2]:
import librosa
import numpy as np
import matplotlib.pyplot as plt
from pydub import AudioSegment
from IPython.display import Audio
from scipy.io import wavfile



In [3]:
def eucl_dist(vec1, vec2):
    """Return the Euclidean distance between two feature matrices.

    Both inputs are indexed as [row, column]. They are cropped along axis 1
    to the smaller of the two column counts before being subtracted, so
    matrices with a different number of columns can still be compared.
    """
    common_cols = min(vec1.shape[1], vec2.shape[1])
    difference = vec1[:, :common_cols] - vec2[:, :common_cols]
    return np.linalg.norm(difference)

def cosine_sim(vec1,vec2):
    """Return the cosine similarity of two feature matrices.

    Both inputs are cropped along axis 1 to the smaller column count, then
    flattened to 1-D so the similarity is computed over all remaining entries.
    """
    common_cols = min(vec1.shape[1], vec2.shape[1])
    flat1 = vec1[:, :common_cols].flatten()
    flat2 = vec2[:, :common_cols].flatten()
    norm_product = np.linalg.norm(flat1) * np.linalg.norm(flat2)
    return np.dot(flat1, flat2) / norm_product

In [4]:
# PROPERTY OF DEEPMIND1234
def extract_features(file_path):
    """Build a frame-level feature matrix for one audio file.

    The file is loaded with librosa (resampled to 22050 Hz) and these
    features are stacked along axis 0, in this order: 13 MFCCs, CQT chroma,
    spectral contrast, zero-crossing rate and spectral bandwidth.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        The concatenated feature array. Rows are feature coefficients and
        columns are analysis frames; the notebook's recorded output shows
        shape (34, 229) for one of the recordings.
    """
    # The sample rate is named `sample_rate` rather than `sr` so it cannot be
    # confused with the `speech_recognition as sr` alias used in this notebook.
    y, sample_rate = librosa.load(file_path, sr=22050)

    # Spectral-envelope and harmonic descriptors.
    mfcc = librosa.feature.mfcc(y=y, sr=sample_rate, n_mfcc=13)
    chroma = librosa.feature.chroma_cqt(y=y, sr=sample_rate)
    contrast = librosa.feature.spectral_contrast(y=y, sr=sample_rate)

    # Single-row descriptors that are part of the final feature matrix.
    # (The spectral centroid was previously computed here but never used,
    # so that wasted computation has been removed.)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
    spec_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sample_rate)

    # Stack everything along the feature axis; all pieces share the frame axis.
    features = np.concatenate(
        [mfcc, chroma, contrast, zero_crossing_rate, spec_bandwidth], axis=0
    )

    return features

In [5]:
# Enrollment: extract one reference feature matrix per speaker.
features_d = extract_features('media/deepak_inference_3.wav')
# Sanity check of the feature-matrix shape for the first recording.
print(features_d.shape)
features_a = extract_features('media/aadarsh_test_3.wav')
features_s = extract_features('media/sean_test_3.wav')
# Disabled: extra test vector that is not part of the enrolled set.
#features_ta = extract_features('media/test_vector.wav')

(34, 229)


In [230]:
def NearestNeighbour(test_features,voice_set):
    """Return the key of the enrolled entry closest to ``test_features``.

    Args:
        test_features: Feature matrix of the recording to classify.
        voice_set: Mapping of label -> reference feature matrix.

    Returns:
        The label whose reference has the smallest Euclidean distance to
        ``test_features``, or None when ``voice_set`` is empty.

    Side effects:
        Prints one "<label>:<distance>" line per enrolled entry.
    """
    # Start from +inf so that any finite distance, however large, can win.
    # The previous finite sentinel (10e20) made the function return None
    # whenever every distance exceeded it.
    best_distance = float("inf")
    similar = None
    for key, value in voice_set.items():
        cur_dist = eucl_dist(test_features, value)
        # NOTE: cosine_sim is a similarity (larger means closer); switching to
        # it would also require inverting the comparison below.
        print(str(key) + ":"+ str(cur_dist))
        if cur_dist < best_distance:
            best_distance = cur_dist
            similar = key
    return similar

In [231]:
# Enrolled reference features, keyed by speaker name.
feature_dict = dict(deepak=features_d, aadarsh=features_a, sean=features_s)
import speech_recognition as sr

# Module-level recognizer instance used by recognize_speech.
r = sr.Recognizer()
def recognize_speech(file_path):
    """Transcribe an audio file with the Google Speech Recognition service.

    Args:
        file_path: Path of the audio file to transcribe.

    Returns:
        The recognized text, or None when the audio could not be understood
        or the service request failed (a message is printed in both cases).
    """
    with sr.AudioFile(file_path) as source:
        audio = r.record(source)
    try:
        return r.recognize_google(audio)
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition service; {0}".format(e))
    # Reached only after a handled failure. Previously this path returned an
    # unbound local and raised UnboundLocalError.
    return None

In [237]:
# Identify the speaker of a recording that is not in the enrolled set, then
# transcribe it.
test_file_path = 'media/deepak_test_2.wav'
feature_d_new = extract_features(test_file_path)
# Closest enrolled speaker by Euclidean distance (per-speaker distances are printed).
detected_speaker = NearestNeighbour(feature_d_new, feature_dict)
r_text = recognize_speech(test_file_path)
print(f'{detected_speaker} says {r_text}')
# Read the raw samples so the clip can be played back inline as the cell output.
fs, test_audio = wavfile.read(test_file_path)
Audio(test_audio, rate=fs)

deepak:13809.579462586717
aadarsh:16051.267415627404
sean:17280.645511664992
deepak says 1 2 3 4 5 6 7 8 9 10


In [238]:
# Same identification + transcription steps, this time for a recording of sean.
test_file_path = 'media/sean_test_2.wav'
# NOTE(review): the variable is named feature_a_new but holds features of the
# sean recording; consider a neutral name.
feature_a_new = extract_features(test_file_path)
# Closest enrolled speaker by Euclidean distance (per-speaker distances are printed).
detected_speaker = NearestNeighbour(feature_a_new, feature_dict)
r_text = recognize_speech(test_file_path)
print(f'{detected_speaker} says {r_text}')
# Read the raw samples so the clip can be played back inline as the cell output.
fs, test_audio = wavfile.read(test_file_path)
Audio(test_audio, rate=fs)

deepak:14380.900337985273
aadarsh:13929.635976494761
sean:13157.9062985093
sean says 1 2 3 4 5 6 7 8 9 10


In [239]:
# Same identification + transcription steps, this time for a recording of aadarsh.
test_file_path = 'media/aadarsh_test_2.wav'
# NOTE(review): feature_d_new is reused here for the aadarsh recording and
# overwrites the value assigned in the deepak cell.
feature_d_new = extract_features(test_file_path)
# Closest enrolled speaker by Euclidean distance (per-speaker distances are printed).
detected_speaker = NearestNeighbour(feature_d_new, feature_dict)
r_text = recognize_speech(test_file_path)
print(f'{detected_speaker} says {r_text}')
# Read the raw samples so the clip can be played back inline as the cell output.
fs, test_audio = wavfile.read(test_file_path)
Audio(test_audio, rate=fs)

deepak:13186.938509464673
aadarsh:12361.665745757717
sean:12583.827512027688
aadarsh says 1 2 3 4 5 6 7 8 9 10
