In [5]:
import librosa # used for audio analysis
import numpy as np #used to create large, multi-dimensional arrays 
from sklearn.mixture import GaussianMixture #used to create GuassianMixture models
from math import log, ceil
import joblib #used to save gmm model in a file to save on procesing time in pickle format
import time #used to track processing time
from data_class import Speaker_Verification as sv #parent class which contains shared fields
from tabulate import tabulate as tb
from scipy.stats.distributions import chi2

In [6]:
def likelihood_ratio(loglike1, loglike2):
    """Return the likelihood-ratio statistic for two log-likelihoods.

    Args:
        loglike1: log-likelihood that is subtracted.
        loglike2: log-likelihood that is subtracted from.

    Returns:
        Twice the difference ``loglike2 - loglike1``.
    """
    difference = loglike2 - loglike1
    return 2 * difference

In [7]:

# ## Loading Audio Files
# Build the shared configuration object once and read every field from it,
# instead of constructing a new Speaker_Verification instance per field.
main_config = sv("main")
base = main_config.base
speaker = main_config.speaker
enrollment_base = main_config.enrollment_base
enrollment_wav = main_config.enrollment_wav
enrollment_pickle = main_config.enrollment_pickle
enrollment = main_config.enrollment
validationFiles = main_config.validationFiles

# hop length passed to librosa's MFCC extraction (librosa expects it in samples)
hop_length = main_config.hop_length
n_mfcc = main_config.n_mfcc  # num of mfcc features extracted for each frame

# p-value thresholds to evaluate; every entry is processed by the loop below
threshold_values = [0.05]

# this list will contain IDs of speakers. Ex. ["Al", "Bob", ...]
validationSpeakers = [sv.getSpeaker(validation_file) for validation_file in validationFiles]

# likelihood_values[i] == [log_like1, log_like2] for validation file i
likelihood_values = [[0, 0] for _ in validationSpeakers]

# ## Enrollment-side features and model
# None of this depends on the validation file, so it is done once rather than
# once per validation file.
test1, sr1 = librosa.load(enrollment)
# Normalize so that amplitudes are within the -1 to 1 scale
test1 = librosa.util.normalize(test1)
mfccs1 = librosa.feature.mfcc(y=test1, sr=sr1, n_mfcc=n_mfcc, hop_length=hop_length)
# NOTE(review): joblib.load unpickles arbitrary objects - only load model
# files that were produced by the trusted enrollment step.
gm1 = joblib.load(enrollment_pickle)

print('')
for threshold in threshold_values:
    testResults = []
    # one [decision, outcome-label] row per validation file (plus one spare row)
    result_array2 = [[None, None] for _ in range(len(validationFiles) + 1)]
    for i in range(len(validationFiles)):
        validation = str(base + validationFiles[i])

        # load, normalize and extract MFCC features of the validation sample
        test2, sr2 = librosa.load(validation)
        test2 = librosa.util.normalize(test2)
        mfccs2 = librosa.feature.mfcc(y=test2, sr=sr2, n_mfcc=n_mfcc, hop_length=hop_length)

        # GMM of the validation MFCC matrix. Seeded (same seed as the
        # enrollment model) so that repeated runs give identical results.
        # NOTE(review): the MFCC matrix is passed exactly as librosa returns
        # it; confirm its orientation matches what GaussianMixture expects
        # (n_samples, n_features) and matches how the pickled model was fit.
        gm2 = GaussianMixture(n_components=n_mfcc, random_state=42).fit(mfccs2)

        num_of_cols = int(mfccs2.shape[1])

        # Score of the validation features under the enrollment model.
        # NOTE(review): abs() discards the sign of the average log-likelihood;
        # confirm this is intended before interpreting the ratio below.
        log_like1 = abs(gm1.score(mfccs2))

        # Score the enrollment features under the validation model, one
        # validation-sized window at a time, and average the window scores.
        # Every start offset at which a full window still fits is used.
        window_starts = range(0, mfccs1.shape[1] - num_of_cols + 1, num_of_cols)
        window_scores = [
            abs(gm2.score(mfccs1[:, start:start + num_of_cols]))
            for start in window_starts
        ]
        if not window_scores:
            # Previously this surfaced as an unexplained ZeroDivisionError.
            raise ValueError(
                'Enrollment sample has fewer MFCC frames (' + str(mfccs1.shape[1])
                + ') than validation sample ' + validation
                + ' (' + str(num_of_cols) + '); cannot compare them.'
            )
        log_like2 = sum(window_scores) / len(window_scores)

        likelihood_values[i][0] = log_like1
        likelihood_values[i][1] = log_like2

        likeh_ratio = likelihood_ratio(log_like1, log_like2)
        likeh_ratio_p = chi2.sf(likeh_ratio, n_mfcc)  # this is a p-value

        # the value printed after 'Likelihood Ratio:' is the p-value above
        print('Log_LH#1: '+str(log_like1)+'\t'+'Log_LH#2: '+str(log_like2)+'\tLikelihood Ratio: '+str(likeh_ratio_p))

        # null hypothesis is that VALIDATION model is nested in ENROLLMENT model
        # NOTE(review): a p-value below the threshold is treated as "accept"
        # here; confirm this direction is the intended decision rule.
        match = bool(likeh_ratio_p < threshold)
        result_array2[i][0] = 'Accept' if match else 'Reject'
        testResults.append(match)


Log_LH#1: 276409885013.05505	Log_LH#2: 334467032642.38556	Likelihood Ratio: 0.0
Log_LH#1: 251566864015.83417	Log_LH#2: 318301569533.2657	Likelihood Ratio: 0.0
Log_LH#1: 214115301002.39575	Log_LH#2: 307738881130.2546	Likelihood Ratio: 0.0
Log_LH#1: 257145988634.0572	Log_LH#2: 312991207550.7506	Likelihood Ratio: 0.0
Log_LH#1: 265354774720.9699	Log_LH#2: 329671720425.9593	Likelihood Ratio: 0.0
Log_LH#1: 192598849991.76813	Log_LH#2: 278495926367.79126	Likelihood Ratio: 0.0
Log_LH#1: 251190944363.51743	Log_LH#2: 297874517540.5016	Likelihood Ratio: 0.0
Log_LH#1: 294511332108.1603	Log_LH#2: 352634386021.90326	Likelihood Ratio: 0.0
Log_LH#1: 260699686813.46274	Log_LH#2: 346412141909.3382	Likelihood Ratio: 0.0
Log_LH#1: 266847315910.28546	Log_LH#2: 316248224044.6151	Likelihood Ratio: 0.0
Log_LH#1: 285653694778.6527	Log_LH#2: 348840177476.80554	Likelihood Ratio: 0.0
Log_LH#1: 244509089250.15665	Log_LH#2: 341857676076.0903	Likelihood Ratio: 0.0
Log_LH#1: 303257413183.8224	Log_LH#2: 366930384701.

In [8]:
# ## Threshold evaluation and authentication metrics
# For a full sweep use e.g.: threshold_values2 = np.arange(0.0, 0.5, 0.01)
threshold_values2 = [0.05]


def _safe_rate(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def _as_percent(rate):
    """Format a 0-1 rate as a percentage string rounded to 3 decimals."""
    return str(round(rate*100, 3))+'%'


for threshold in threshold_values2:
    # Decisions are rebuilt for every threshold. Appending to the list left
    # over from the previous cell would make the metrics below read the old
    # decisions instead of the ones computed here.
    testResults = []
    for s in range(len(validationFiles)):
        # ratio statistic from the two stored scores of validation file s
        likeh_ratio = 2*log(likelihood_values[s][0]/likelihood_values[s][1])
        match = likeh_ratio <= threshold
        # index with s (this loop's variable), not a stale index from another loop
        result_array2[s][0] = 'Accept' if match else 'Reject'
        testResults.append(match)

    # counts of true accepts, true rejects, false accepts and false rejects
    TA, TR, FA, FR = 0.0, 0.0, 0.0, 0.0

    for i in range(len(validationFiles)):
        isUser = (speaker==validationSpeakers[i])
        accepted = testResults[i]
        if isUser:
            if accepted:
                TA += 1
                result_array2[i][1] = 'This is true accept'
            else:
                FR += 1
                result_array2[i][1] = 'This is false reject'
        else:
            if accepted:
                FA += 1
                result_array2[i][1] = 'This is false accept'
            else:
                TR += 1
                result_array2[i][1] = 'This is true reject'

    print(TA, TR, FA, FR)
    # Rates are NaN (instead of raising ZeroDivisionError) when the validation
    # set contains no genuine samples or no impostor samples.
    TAR = _safe_rate(TA, TA+FR) #TP/TP+FN
    TRR = _safe_rate(TR, TR+FA) #TN/TN+FP
    FAR = _safe_rate(FA, FA+TR) #FP/FP+TN
    FRR = _safe_rate(FR, FR+TA) #FN/FN+TP
    # NOTE(review): this is the mean of FAR and FRR at one threshold; it only
    # equals the equal error rate at the threshold where FAR == FRR.
    EER = (FAR+FRR)*0.5
    accuracy = _safe_rate(TA+TR, TA+TR+FA+FR)

    print('Speaker:', speaker, 'Threshold', threshold, 'Hop Length', hop_length)
    print(tb([['General Accuracy', _as_percent(accuracy)],
            ['Equal Error Rate (EER)', _as_percent(EER)],
            ['True Accept Rate (TAR)', _as_percent(TAR)],
            ['True Reject Rate (TRR)', _as_percent(TRR)],
            ['False Accept Rate (FAR)', _as_percent(FAR)],
            ['False Reject Rate (FRR)', _as_percent(FRR)]],
            headers=['Authentication Measure', 'Percentage']))

3.0 0.0 10.0 0.0
Speaker: geo Threshold 0.05 Hop Length 256
Authentication Measure    Percentage
------------------------  ------------
General Accuracy          23.077%
Equal Error Rate (EER)    50.0%
True Accept Rate (TAR)    100.0%
True Reject Rate (TRR)    0.0%
False Accept Rate (FAR)   100.0%
False Reject Rate (FRR)   0.0%


# Enrollment step: fit a Gaussian Mixture Model on the MFCC features of the
# enrollment recording and save it with joblib so it can be loaded later
# instead of being refit.

# tracks time (reported by the final print of this script)
start_time = time.time()

# ## Loading Audio Files
# paths / names are read from the shared Speaker_Verification settings object
base = sv("enrollment").base
enrollment_base = sv("enrollment").enrollment_base
enrollment = sv("enrollment").enrollment

# load the enrollment audio file; keep the samples and the sample rate
enrollLib, sr1 = librosa.load(enrollment)

# duration as samples / sample rate (not used again in this script)
audio_dur1 = len(enrollLib) / sr1

# frame audio file
hop_length1 = sv("enrollment").hop_length 
n_mfcc = sv("enrollment").n_mfcc #num of mfcc features extracted for each frame

# Normalize the enrollment audio so that amplitudes are within -1 to 1 scale
enrollLib = librosa.util.normalize(enrollLib)

# extract mfcc features in an array
mfccsEnroll = librosa.feature.mfcc(y=enrollLib, sr=sr1, n_mfcc = n_mfcc, hop_length=hop_length1)

# create Gaussian Mixture Model (seeded with random_state so the fit is repeatable)
# NOTE(review): the MFCC array is passed to fit() without a transpose; confirm
# that its orientation is the (n_samples, n_features) layout scikit-learn expects.
gmmEnroll = GaussianMixture(n_components=n_mfcc, random_state = 42, covariance_type='full').fit(mfccsEnroll)

#save model in pickle format for future use - saves on processing time
pkl_file = enrollment_base+".pkl"
# written to base + enrollment_base + ".pkl"
joblib.dump(gmmEnroll, base+pkl_file) 
print(base+pkl_file +' file created')
# shape of the fitted component means, printed as a quick sanity check
print(gmmEnroll.means_.shape)

print("Execution Time:--- %s seconds ---" % (time.time() - start_time))