In [1]:
import IPython.display as ipd
import scipy.io.wavfile as wav
from scipy.fftpack import dct
from zipfile import ZipFile
import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import librosa
import python_speech_features
from scipy.stats import skew
from sklearn import preprocessing
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from IPython.core.interactiveshell import InteractiveShell
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
InteractiveShell.ast_node_interactivity = "all"

In [6]:
#data import
def audio_import(nclass, naudio):
    rate, data = wav.read('Read_Up/IDR' + str(nclass) + '/' + str(naudio) + '.wav')
    filename = 'Data/IDR' + str(nclass) + '/' + str(naudio) + '.csv'
    return rate, data, filename

#parameters
framelength, framestride, nfft, num_fbanks, n_cep_coeff = 0.025, 0.015, 512, 40, 12

#generate frames
def frames_gen(rate, data, framelength, framestride):
    frmlen, frmstrd, signallen = int(round(rate*framelength)), int(round(rate*framestride)), len(data)
    paddinglen = frmstrd - (signallen - frmlen) % frmstrd #making number of frames even
    paddedsig = np.concatenate((data, np.zeros(paddinglen)), axis = 0)
    paddedsiglen = len(paddedsig)
    nframes = int(np.floor((paddedsiglen - frmlen)/frmstrd) + 1)
    indices = np.tile(np.arange(0, frmlen), (nframes, 1)) + np.tile((np.arange(0, nframes*frmstrd, frmstrd)), (frmlen, 1)).T
    frames = paddedsig[indices]
    return frames, frmlen

#apply hamming window to each frame
def hamming_window(frames, frmlen):
    frames *= np.hamming(frmlen)
    return frames

#convert each windowed frame into a power spectrum
def periodogram_gen(frames, nfft):
    frame_fft = np.absolute(np.fft.rfft(frames, n = nfft, axis = 1))
    frame_periodogram = np.square(frame_fft)/nfft
    return frame_periodogram

#helper functions
def freq_to_mel(freq):
    return 2595*np.log10(1+freq/700)
def mel_to_freq(mel):
    return 700*(np.power(10, mel/2595) - 1)

# making mel-scale filterbank
def filter_bank_gen(rate, num_fbanks, nfft):
    #for x filter banks, we need x+2 mel points
    low_mel_lim = 0
    up_mel_lim = freq_to_mel(rate/2)
    mel_range = np.linspace(0, up_mel_lim, num_fbanks + 2)
    freq_range = mel_to_freq(mel_range)
    bins = np.floor((nfft + 1) * freq_range/rate)
    fbank = np.zeros((num_fbanks, int(np.floor(nfft/2 + 1))))
    for m in range(1, num_fbanks + 1):
        lower = int(bins[m - 1]) # lower
        peak = int(bins[m]) # peak
        upper = int(bins[m + 1]) # upper
        for k in range(lower, peak):
            fbank[m - 1, k] = (k - bins[m - 1])/(bins[m] - bins[m - 1])
        for k in range(peak, upper):
            fbank[m - 1, k] = (bins[m + 1] - k)/(bins[m + 1] - bins[m])
    return fbank

# filtered frames
def filtered_frame_gen(frame_periodogram, fbank):
    #multiply each frame with all filterbanks and add up for coefficients.
    filter_banks = np.dot(frame_periodogram, fbank.T)
    #for numerical stability
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks) #if condition is true, return eps, else return original val
    filter_banks = 20*np.log10(filter_banks)
    return filter_banks

#make mfcc coefficients
def mfcc_gen(filter_banks, n_cep_coeff):
    mfcc = dct(filter_banks, type = 2, axis = 1, norm = 'ortho')[:, 1:(n_cep_coeff + 1)]
    return mfcc  

# N = 2
# def ctpn(n_cep_coeff, mfcc, t, n):
#     if((t+n) > n_cep_coeff-1):
#         return mfcc[:,n_cep_coeff-1]
#     elif(0 <= (t+n) <= n_cep_coeff-1):
#         return mfcc[:, t+n]

# def ctmn(n_cep_coeff, mfcc, t, n):
#     if((t-n) < 0):
#         return mfcc[:,0]
#     elif(0 <= (t-n) <= n_cep_coeff-1):
#         return mfcc[:, t-n]  
    
# def deltacoeff(t, mfcc):
#     dt = 0
#     for n in range(1,N):
#         dt += n * (ctpn(n_cep_coeff, mfcc, t, n) - ctmn(n_cep_coeff, mfcc, t, n)) / 2 * np.square(n)
#     return dt

# def deltacoeff_gen(mfcc, n_cep_coeff):
#     deltacoef = np.zeros(mfcc.shape)
#     for t in range(0, n_cep_coeff):
#         dt = deltacoeff(t, mfcc)
#         deltacoef[:, t] = dt
#     return deltacoef

N = 2
def ctpn(n_cep_coeff, coeff_type, t, n):
    if((t+n) > n_cep_coeff-1):
        return coeff_type[:,n_cep_coeff-1]
    elif(0 <= (t+n) <= n_cep_coeff-1):
        return coeff_type[:, t+n]

def ctmn(n_cep_coeff, coeff_type, t, n):
    if((t-n) < 0):
        return coeff_type[:,0]
    elif(0 <= (t-n) <= n_cep_coeff-1):
        return coeff_type[:, t-n]  
    
def deltacoeff(t, coeff_type):
    dt = 0
    for n in range(1,N):
        dt+= n*(ctpn(n_cep_coeff, coeff_type, t, n) - ctmn(n_cep_coeff, coeff_type, t, n))/2*np.square(n)
    return dt

def deltacoeff_gen(coeff_type, n_cep_coeff):
    deltacoef = np.zeros(coeff_type.shape)
    for t in range(0, n_cep_coeff):
        dt = deltacoeff(t, coeff_type)
        deltacoef[:, t] = dt
    return deltacoef

def deltadeltacoeff_gen(deltacoef, n_cep_coeff):
    deltadeltacoef = np.zeros(deltacoef.shape)
    for t in range(0, n_cep_coeff):
        ddt = deltacoeff(t, deltacoef)
        deltadeltacoef[:, t] = ddt
    return deltadeltacoef
    

In [7]:
def csv_data_gen(Data, framelength = framelength, framestride = framestride, nfft = nfft, num_fbanks = num_fbanks, n_cep_coeff = n_cep_coeff):
    for nclass in range(1,10):
        for naudio in range(1, 68):
            #calculating mfcc
            rate, data, filename = audio_import(nclass, naudio)
            frames, frmlen = frames_gen(rate, data, framelength, framestride)
            frames = hamming_window(frames, frmlen)
            frame_periodogram = periodogram_gen(frames, nfft)
            fbank = filter_bank_gen(rate, num_fbanks, nfft)
            filter_banks = filtered_frame_gen(frame_periodogram, fbank)
            mfcc = mfcc_gen(filter_banks, n_cep_coeff)
            #calculating delta_coefficients 
#             mfcc = librosa.feature.mfcc(data.astype(float), sr = rate, n_mfcc=12).T
#             mfcc = python_speech_features.base.mfcc(data, rate, winlen = 0.025, winstep = 0.015, nfilt = 40, nfft = 512, numcep = 12, preemph = 0)
            mfcc = mfcc - np.mean(mfcc, axis = 0)
            delta_coef = deltacoeff_gen(mfcc, n_cep_coeff)
            deltadelta_coef = deltadeltacoeff_gen(delta_coef, n_cep_coeff)
            print(mfcc.shape,nclass,naudio)
#             Data = Data.append(pd.Series(np.hstack((np.mean(mfcc, axis = 0), np.mean(delta_coef, axis = 0), nclass))), ignore_index = True)
            
#     Data.columns = ['MFCC_mean' + str(x) for x in range(0, n_cep_coeff)] + ['DEL_mean' + str(x) for x in range(0, n_cep_coeff)] + ['Dialect']
#     return Data
            Data = Data.append(pd.Series(np.hstack(
                (np.mean(mfcc, axis = 0), np.max(mfcc, axis = 0), np.min(mfcc, axis = 0), np.std(mfcc, axis = 0), np.median(mfcc, axis = 0), skew(mfcc, axis = 0), 
                 np.mean(delta_coef, axis = 0), np.max(delta_coef, axis = 0), np.min(delta_coef, axis = 0), np.std(delta_coef, axis = 0), np.median(delta_coef, axis = 0), skew(delta_coef, axis = 0), 
                 np.mean(deltadelta_coef, axis = 0), np.max(deltadelta_coef, axis = 0), np.min(deltadelta_coef, axis = 0), np.std(deltadelta_coef, axis = 0), np.median(deltadelta_coef, axis = 0), skew(deltadelta_coef, axis = 0), nclass))), ignore_index = True)
            
    Data.columns = ['MFCC_mean' + str(x) for x in range(0, n_cep_coeff)] + ['MFCC_max' + str(x) for x in range(0, n_cep_coeff)] + ['MFCC_min' + str(x) for x in range(0, n_cep_coeff)] + ['MFCC_std' + str(x) for x in range(0, n_cep_coeff)] + ['MFCC_median' + str(x) for x in range(0, n_cep_coeff)] + ['MFCC_skew' + str(x) for x in range(0, n_cep_coeff)] + ['DEL_mean' + str(x) for x in range(0, n_cep_coeff)] + ['DEL_max' + str(x) for x in range(0, n_cep_coeff)] + ['DEL_min' + str(x) for x in range(0, n_cep_coeff)] + ['DEL_std' + str(x) for x in range(0, n_cep_coeff)] + ['DEL_median' + str(x) for x in range(0, n_cep_coeff)] + ['DEL_skew' + str(x) for x in range(0, n_cep_coeff)] + ['DELDEL_mean' + str(x) for x in range(0, n_cep_coeff)] + ['DELDEL_max' + str(x) for x in range(0, n_cep_coeff)] + ['DELDEL_min' + str(x) for x in range(0, n_cep_coeff)] + ['DELDEL_std' + str(x) for x in range(0, n_cep_coeff)] + ['DELDEL_median' + str(x) for x in range(0, n_cep_coeff)] + ['DELDEL_skew' + str(x) for x in range(0, n_cep_coeff)] + ['Speaker']
    return Data

      
  For each speech frame, a set of MFCC is computed. This set of coefficients is called an acoustic vector which represents the phonetically important characteristics of speech and is very useful for further analysis and processing in Speech Recognition.      

In [8]:
Data = pd.DataFrame()
Data = csv_data_gen(Data)

(3918, 12) 1 1
(2911, 12) 1 2
(2989, 12) 1 3
(3571, 12) 1 4
(3695, 12) 1 5
(579, 12) 1 6
(1346, 12) 1 7
(1413, 12) 1 8
(1237, 12) 1 9
(3916, 12) 1 10
(4082, 12) 1 11
(3751, 12) 1 12
(3691, 12) 1 13
(3967, 12) 1 14
(3694, 12) 1 15
(3947, 12) 1 16
(2745, 12) 1 17
(3183, 12) 1 18
(2871, 12) 1 19
(2888, 12) 1 20
(3271, 12) 1 21
(2787, 12) 1 22
(3708, 12) 1 23
(2742, 12) 1 24
(2831, 12) 1 25
(2715, 12) 1 26
(3040, 12) 1 27
(3802, 12) 1 28
(3429, 12) 1 29
(3574, 12) 1 30
(3988, 12) 1 31
(3446, 12) 1 32
(1128, 12) 1 33
(356, 12) 1 34
(3870, 12) 1 35
(3400, 12) 1 36
(3464, 12) 1 37
(4151, 12) 1 38
(3468, 12) 1 39
(3609, 12) 1 40
(3876, 12) 1 41
(3488, 12) 1 42
(3950, 12) 1 43
(3353, 12) 1 44
(1032, 12) 1 45
(215, 12) 1 46
(3891, 12) 1 47
(3397, 12) 1 48
(3441, 12) 1 49
(4090, 12) 1 50
(3548, 12) 1 51
(3094, 12) 1 52
(4093, 12) 1 53
(3623, 12) 1 54
(3722, 12) 1 55
(2600, 12) 1 56
(3005, 12) 1 57
(2914, 12) 1 58
(2656, 12) 1 59
(3788, 12) 1 60
(2825, 12) 1 61
(3600, 12) 1 62
(2601, 12) 1 63
(295

(3877, 12) 8 53
(3751, 12) 8 54
(3875, 12) 8 55
(3912, 12) 8 56
(3432, 12) 8 57
(2070, 12) 8 58
(3862, 12) 8 59
(3441, 12) 8 60
(3173, 12) 8 61
(3665, 12) 8 62
(3037, 12) 8 63
(3777, 12) 8 64
(3506, 12) 8 65
(1266, 12) 8 66
(2626, 12) 8 67
(2998, 12) 9 1
(3831, 12) 9 2
(3842, 12) 9 3
(1436, 12) 9 4
(728, 12) 9 5
(2147, 12) 9 6
(4123, 12) 9 7
(3874, 12) 9 8
(3845, 12) 9 9
(3398, 12) 9 10
(3909, 12) 9 11
(4101, 12) 9 12
(3306, 12) 9 13
(3895, 12) 9 14
(4088, 12) 9 15
(3940, 12) 9 16
(3831, 12) 9 17
(323, 12) 9 18
(1288, 12) 9 19
(3065, 12) 9 20
(2952, 12) 9 21
(2866, 12) 9 22
(2569, 12) 9 23
(2876, 12) 9 24
(3606, 12) 9 25
(2937, 12) 9 26
(2601, 12) 9 27
(3056, 12) 9 28
(3203, 12) 9 29
(3151, 12) 9 30
(3200, 12) 9 31
(1198, 12) 9 32
(2196, 12) 9 33
(4039, 12) 9 34
(3675, 12) 9 35
(3982, 12) 9 36
(3643, 12) 9 37
(3126, 12) 9 38
(3450, 12) 9 39
(3954, 12) 9 40
(3447, 12) 9 41
(3840, 12) 9 42
(3888, 12) 9 43
(3592, 12) 9 44
(3188, 12) 9 45
(687, 12) 9 46
(3574, 12) 9 47
(4050, 12) 9 48
(387

In [10]:
Data.shape

(603, 217)

In [11]:
Data.to_csv('data.csv', index = False)

In [30]:
data = pd.read_csv('data.csv')

In [31]:
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1], data.iloc[:,-1], test_size = 0.15)

In [32]:
def normalize(df):
    result = df.copy()
    for feature_name in df.columns:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result

In [33]:
X_train = normalize(X_train)
X_test = normalize(X_test)

In [34]:
X_train.shape
X_test.shape
y_train.shape
y_test.shape

(512, 216)

(91, 216)

(512,)

(91,)

In [35]:
# lr_clf = LogisticRegression(
#     random_state = 200,
#     max_iter = 1000,
#     verbose = 1,
#     n_jobs = -1,
#     solver = 'newton-cg'
# )
# lr_clf.fit(X_train, y_train)
# predicted = lr_clf.predict_proba(X_test)

# knn_clf = KNeighborsClassifier(
#     n_neighbors = 5,
#     n_jobs = -1,
#     leaf_size = 100
# )
# knn_clf.fit(X_train, y_train)
# predicted = knn_clf.predict_proba(X_test)

# svc_clf = svm.SVC(
#     kernel = 'linear',
#     verbose = True,
#     random_state = True
# )
# svc_clf.fit(X_train, y_train)
# pred_labels = svc_clf.predict(X_test)

clf = svm.SVC(kernel = 'linear', probability=True, C = 10, gamma = 0.1)

clf.fit(X_train, y_train)

print(accuracy_score(clf.predict(X_test), y_test))
# C_grid = [0.001, 0.01, 0.1, 1, 10]
# gamma_grid = [0.001, 0.01, 0.1, 1, 10]
# param_grid = {'C': C_grid, 'gamma' : gamma_grid}

# grid = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv = 3, scoring = "accuracy")
# grid.fit(X_train, y_train)

# # Find the best model
# print(grid.best_score_)

# print(grid.best_params_)

# print(grid.best_estimator_)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

0.9340659340659341


In [67]:
pred_labels = predicted.argmax(axis = 1)
pred_labels

array([2, 1, 2, 2, 4, 0, 2, 1, 6, 3, 6, 3, 6, 1, 0, 2, 5, 6, 0, 4, 4, 1,
       3, 0, 1, 2, 5, 2, 5, 2, 0, 6, 4, 1, 6, 4, 3, 7, 3, 8, 0, 8, 5, 3,
       4, 0, 8, 3, 6, 1, 2, 8, 1, 6, 2, 6, 2, 6, 6, 7, 3, 5, 8, 5, 7, 1,
       5, 7, 2, 2, 0, 4, 3, 2, 4, 5, 2, 6, 0, 4, 7, 0, 7, 7, 6, 6, 8, 2,
       7, 0, 1])

In [68]:
print(("Accuracy score")+str(accuracy_score(y_test, pred_labels)))

Accuracy score0.0
