In [11]:
import numpy as np
from scipy.fftpack import dct
import matplotlib.pyplot as plt
import scipy.io.wavfile as wav
import math

In [89]:
def mfcc(file):
    sample_rate, signal = wav.read(file)
    signal = signal[0:int(10 * sample_rate)] # Keep 1st 5 secs of signal
    numCoefficients = 13 # choose the size of mfcc array
    frame_size = .025 #25ms frames
    frame_separation = .015 #10ms overlap
    
    frame_len_dist = int(round(frame_size * sample_rate))
    frame_sep_dist = int(round(frame_separation * sample_rate))
    
    framed_matrix = frame(signal, sample_rate, frame_len_dist, frame_sep_dist)
    
    framed_matrix = smooth(framed_matrix, frame_len_dist)
    
    power_matrix = getPeriodogram(framed_matrix, frame_len_dist)
    #print(frame_len_dist)
    #print(power_matrix.shape)
    
    
    minHz = 0
    maxHz = sample_rate / 2
    mlfb = getMelFilterBank(minHz, maxHz, frame_len_dist, sample_rate)
    #print(mlfb.shape)
    
    coefficients = getCoefficients(power_matrix, mlfb)
    return coefficients

In [90]:
#FRAMING
def frame(signal, sample_rate, frame_len_dist, frame_sep_dist):
    frame_overlap_dist = frame_len_dist - frame_sep_dist
    signal_length = len(signal)
    num_frames = int(np.floor(signal_length / frame_sep_dist))
    if num_frames * frame_sep_dist + frame_overlap_dist < signal_length:
        num_frames += 1
    corr_signal_length = (num_frames - 1) * frame_sep_dist + frame_len_dist
    zero_padding_length = corr_signal_length - signal_length
    zero_padding = np.zeros(zero_padding_length)
    padded_signal = np.append(signal, zero_padding)
    framed_matrix = np.zeros((num_frames, frame_len_dist))
    for frame_num in range(num_frames):
        for i in range(int(frame_len_dist)):
            framed_matrix[frame_num][i] = padded_signal[frame_num * frame_sep_dist + i]
    return framed_matrix

In [91]:
#WINDOWING y(n) = x(n) * w(n)
def smooth(framed_matrix, frame_len_dist):
    framed_matrix *= np.hamming(frame_len_dist)
    return framed_matrix

In [92]:
#FFT Matrix i.e. periodogram
def getPeriodogram(framed_matrix, frame_len_dist):
    framed_matrix = np.absolute(np.fft.fft(framed_matrix))
    framed_matrix = (1/frame_len_dist) * np.square(framed_matrix)
    return framed_matrix


In [93]:
#Mel to frequency conversions
def freqToMel(freq):
    return 1127.01048 * np.log(1 + freq / 700.0)

def melToFreq(mel):
    return 700 * (np.exp(mel / 1127.01048 ) - 1)

In [94]:
#Generate Filter Bank
#Algorithm to generate filter bank from 
def getMelFilterBank(minHz, maxHz, frame_len_dist, sample_rate, numFilters = 40):
    minMel = freqToMel(minHz)
    maxMel = freqToMel(maxHz)
    
    melAxis = np.linspace(minMel, maxMel, numFilters + 2)
    hzAxis = melToFreq(melAxis)
    roundedHzAxis = np.floor((frame_len_dist+1)*hzAxis/sample_rate)
    #print(roundedHzA)
    
    melfb = np.zeros((numFilters, frame_len_dist))

    for m in range(1, numFilters + 1):
        #Iterate through every row of the filter bank to populate it: note most of them will be 0s
        #m - row; k - column
        left = int(roundedHzAxis[m - 1])   # left
        middle = int(roundedHzAxis[m])             # center
        right = int(roundedHzAxis[m + 1])    # right

        for k in range(left, middle):
            melfb[m - 1, k] = (k - roundedHzAxis[m - 1]) / (roundedHzAxis[m] - roundedHzAxis[m - 1])
        for k in range(middle, right):
            melfb[m - 1, k] = (roundedHzAxis[m + 1] - k) / (roundedHzAxis[m + 1] - roundedHzAxis[m])
    return melfb


In [95]:
#Extract Coefficients after Taking DCT
def getCoefficients(power_matrix, melfb, num_coefficients = 12 ):
    filter_banks = np.dot(power_matrix, melfb.T)
    #print(filter_banks.shape)
    filter_banks = 20 * np.log10(filter_banks)  # dB
    mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 1 : (num_coefficients + 1)] # Keep 2-13
    return mfcc

In [96]:
file = "train/s1.wav"
coeffs = mfcc(file)
#print(coeffs.shape)
#print(coeffs)

In [97]:
from scipy.cluster.vq import vq, kmeans


In [98]:
def train(folder, k = 16, iter = 100, numfiles = 8):
    codebooks = {}
    for i in range(1,numfiles + 1):
        file = str(folder) + "/s" + str(i) + ".wav"
        obs = mfcc(file)
        codebook = kmeans(obs, k, iter, thresh=1e-8)
        codebooks[i] = codebook[0]
    return codebooks


In [99]:
def test(file, codebooks, numfiles = 9):
    min_dist = float('inf')
    best_match = 0
    for i in range(1, numfiles + 1):
        obs = mfcc(file)
        codebook = codebooks[i]
        #print(obs.shape)
        obs = remove_rows_with_nan(obs)
        #print(obs.shape)
        #print(np.shape(codebook))
        #print(np.shape(obs))
        code, dist = vq(obs, codebook)
        avg_dist = np.average(dist)
        print(avg_dist)
        if avg_dist < min_dist:
            min_dist = avg_dist
            best_match = i
    return best_match
        
    

In [100]:
def remove_rows_with_nan(coeffs):
    lines_removed = 0
    to_delete = []
    for i in range(len(coeffs)):
        if True in np.isnan(coeffs[i]):
            lines_removed += 1
            to_delete.append(i)
    coeffs = np.delete(coeffs, to_delete, 0)
    return coeffs
            
            

In [111]:
#Speakers
#1 - Afrikaans Woman 1: http://accent.gmu.edu/searchsaa.php?function=detail&speakerid=1
#2 - Arabic Woman 2: http://accent.gmu.edu/searchsaa.php?function=detail&speakerid=23
#3 - Dutch Man: http://accent.gmu.edu/searchsaa.php?function=detail&speakerid=1300
#4 - Hindi Man: http://accent.gmu.edu/searchsaa.php?function=detail&speakerid=910
#5 - Japanese Woman: http://accent.gmu.edu/searchsaa.php?function=detail&speakerid=223
#6 - Spanish Man: http://accent.gmu.edu/searchsaa.php?function=detail&speakerid=323
#7 - English Man: http://accent.gmu.edu/searchsaa.php?function=detail&speakerid=61
#8 - Russian Woman: http://accent.gmu.edu/searchsaa.php?function=detail&speakerid=302

#Test files are other speakers in the 

In [114]:
codebooks = train("accent-data/train", numfiles = 8)
#print(codebooks[1])
test("accent-data/test/telugu1.wav", codebooks, numfiles=8)

54.8837710493
67.7547459078
81.0025854322
67.5292192605
56.4085001536
91.198137455
97.3002960228
49.6251995147


8

In [None]:
#THE RETURNED NUMBER SHOULD MATCH THE TESTING WAV FILE NUMBER

In [116]:
#print(numFilters)
#print(melfb.T.shape)
#print(framed_matrix.shape)
#print(mfcc.shape)
#print(np.avg([1,3,4]))

In [None]:
#FINISHED WITH SIGNAL PROCESSING

In [65]:
np.isnan([float('nan'), 1])

array([ True, False], dtype=bool)

In [46]:
a = np.zeros(4)
a[1] = 2
a[3] = 6
print(a)
b = np.delete(a,[1,3], 0)
print(a)
print(b)


[ 0.  2.  0.  6.]
[ 0.  2.  0.  6.]
[ 0.  0.]
