In [None]:
import numpy as np

# load data
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects stored in
# the archives, which can execute code while loading -- only use this with
# trusted files.
# np.load returns a lazily-read NpzFile for .npz archives; opening it as a
# context manager closes the underlying file handle once the arrays are read.
with np.load('data/lab1_example.npz', allow_pickle=True) as example_archive:
    # 'example' is stored as a 0-d object array; .item() unwraps the dict
    example = example_archive['example'].item()
with np.load('data/lab1_data.npz', allow_pickle=True) as data_archive:
    data = data_archive['data']

# this code helps to look at the data
# for i in example:
#     print("\n\n")
#     print(i)
#     print(example[i])

# for j in data:
#     for i in data[0]:
#         print("\n")
#         print(i)
#         print(j[i])


In [None]:
#@title Enframe

###############
# 4.1 Enframe #
###############

import matplotlib.pyplot as plt

def enframe(samples, winlen, winshift):
    """
    Cut a 1-D signal into fixed-length, possibly overlapping frames.

    Args:
        samples: sequence of signal samples.
        winlen: window length in samples.
        winshift: shift of consecutive windows in samples
    Returns:
        float numpy array [N x winlen], where N is the number of complete
        windows that fit in the input signal (trailing samples that do not
        fill a whole window are dropped)
    Raises:
        ValueError: if the signal is shorter than one window.
    """
    total = len(samples)
    if total < winlen:
        raise ValueError('Too long winlen wrt input signal.')

    # one window at offset 0, plus one for every whole shift that still fits
    nframes = 1 + int((total - winlen) / winshift)
    frames = np.zeros((nframes, winlen))

    for row in range(nframes):
        offset = row * winshift
        frames[row] = samples[offset:offset + winlen]
    return frames

# Frame the example utterance with 20 ms windows shifted by 10 ms.
samples = example['samples']
winlen = int(example["samplingrate"] * 20 / 1000)       # number of samples in 20 ms
winshift = int(example["samplingrate"] * 10 / 1000)     # number of samples in 10 ms
frames = enframe(samples, winlen, winshift)

# Computed frames next to the reference frames shipped with the example.
for panel_title, panel_values in (
    ('Enframe - computed', frames),
    ('Enframe - example', example['frames']),
):
    plt.figure()
    plt.pcolormesh(panel_values)
    plt.title(panel_title)
plt.show()

In [None]:
#@title Pre-emphasis

####################
# 4.2 Pre-emphasis #
####################

from scipy.signal import lfilter

def preemp(input, p=0.97):
    """
    Apply a first-order pre-emphasis FIR filter along the last axis.

    Each output sample is y[n] = x[n] - p * x[n-1], with x[-1] taken as 0
    at the start of every frame.

    Args:
        input: array of speech frames [N x M] where N is the number of frames and
               M the samples per frame
        p: pre-emphasis factor (defaults to the value specified in the exercise)

    Output:
        array of pre-emphasised speech samples, same shape as input
    """
    feedforward = [1, -p]   # numerator: 1 - p * z^-1
    feedback = [1]          # denominator: pure FIR, no feedback
    return lfilter(feedforward, feedback, input, axis=-1)

# Pre-emphasise the example frames and compare against the reference.
preempcoeff = .97
preemph = preemp(frames, preempcoeff)

for panel_title, panel_values in (
    ('Pre-emphasis - computed', preemph),
    ('Pre-emphasis - example', example['preemph']),
):
    plt.figure()
    plt.pcolormesh(panel_values)
    plt.title(panel_title)
plt.show()

In [None]:
#@title Hamming window

######################
# 4.3 Hamming window #
######################

# The window functions were removed from the top-level scipy.signal namespace
# in SciPy 1.13; scipy.signal.windows is the supported location.
from scipy.signal.windows import hamming

def windowing(input):
    """
    Applies hamming window to the input frames.

    Args:
        input: array of speech samples [N x M] where N is the number of frames and
               M the samples per frame
    Output:
        array of windowed speech samples [N x M]

    The window is generated with sym=False (periodic form) so that the result
    matches the reference values in the example data.
    """
    # one window of M coefficients, broadcast over all N frames below
    w = hamming(input.shape[1], sym=False)

    # window shape (for explanation)
    # plt.figure()
    # plt.plot(w)
    # plt.title('Hamming window')
    # plt.xlabel('sample')
    # plt.ylabel('amplitude')
    # plt.show()

    return input * w

# Window the pre-emphasised frames and compare against the reference.
windowed = windowing(preemph)

for panel_title, panel_values in (
    ('Hamming window - computed', windowed),
    ('Hamming window - example', example['windowed']),
):
    plt.figure()
    plt.pcolormesh(panel_values)
    plt.title(panel_title)
plt.show()

In [None]:
#@title FFT

###########
# 4.4 FFT #
###########

from scipy.fftpack import fft

def powerSpectrum(input, nfft):
    """
    Power spectrum of each frame: the squared magnitude of its FFT.

    Args:
        input: array of speech samples [N x M] where N is the number of frames and
               M the samples per frame
        nfft: length of the FFT (passed to fft as its n argument)
    Output:
        array of power spectra [N x nfft]
    """
    spectrum = fft(input, n=nfft)
    magnitude = np.absolute(spectrum)
    return magnitude ** 2

# Power spectrum of the windowed example frames with a 512-point FFT.
nfft = 512
spec = powerSpectrum(windowed, nfft)

# Computed spectrum next to the reference spectrum from the example data.
plt.figure()
plt.pcolormesh(spec)
plt.title('abs(FFT)^2 - computed')
plt.figure()
plt.pcolormesh(example['spec'])
# this panel shows the reference data, so label it as such
plt.title('abs(FFT)^2 - example')
plt.show()

In [None]:
#@title Mel filterbank

######################
# 4.5 Mel filterbank #
######################

def trfbank(fs, nfft, lowfreq=133.33, linsc=200/3., logsc=1.0711703, nlinfilt=13, nlogfilt=27, equalareas=False):
    """Compute triangular filterbank for MFCC computation.

    Inputs:
    fs:         sampling frequency (rate)
    nfft:       length of the fft
    lowfreq:    frequency of the lowest filter
    linsc:      scale for the linear filters
    logsc:      scale for the logarithmic filters
    nlinfilt:   number of linear filters
    nlogfilt:   number of log filters
    equalareas: if True every filter uses a height factor of 1; otherwise
                filter i uses 2 / (hi_i - low_i), where low_i and hi_i are
                the edge frequencies of its triangle

    Outputs:
    res:  array with shape [N, nfft], with filter amplitudes for each column.
            (N=nlinfilt+nlogfilt)
    From scikits.talkbox"""
    # Total number of filters
    nfilt = nlinfilt + nlogfilt

    #------------------------
    # Compute the filter bank
    #------------------------
    # Compute start/middle/end points of the triangular filters in spectral
    # domain: nlinfilt linearly spaced points followed by geometrically
    # spaced ones (nfilt + 2 points in total).
    freqs = np.zeros(nfilt+2)
    freqs[:nlinfilt] = lowfreq + np.arange(nlinfilt) * linsc
    freqs[nlinfilt:] = freqs[nlinfilt-1] * logsc ** np.arange(1, nlogfilt + 3)
    if equalareas:
        heights = np.ones(nfilt)
    else:
        heights = 2./(freqs[2:] - freqs[0:-2])

    # Compute filterbank coeff (in fft domain, in bins)
    fbank = np.zeros((nfilt, nfft))
    # FFT bins (in Hz)
    nfreqs = np.arange(nfft) / (1. * nfft) * fs
    for i in range(nfilt):
        low = freqs[i]
        cen = freqs[i+1]
        hi = freqs[i+2]

        # First FFT bin strictly above each edge frequency. Plain Python ints
        # are used because the np.int alias was removed in NumPy 1.24.
        low_bin = int(np.floor(low * nfft / fs)) + 1
        cen_bin = int(np.floor(cen * nfft / fs)) + 1
        hi_bin = int(np.floor(hi * nfft / fs)) + 1

        lid = np.arange(low_bin, cen_bin)   # bins on the rising slope
        lslope = heights[i] / (cen - low)
        rid = np.arange(cen_bin, hi_bin)    # bins on the falling slope
        rslope = heights[i] / (hi - cen)
        fbank[i][lid] = lslope * (nfreqs[lid] - low)
        fbank[i][rid] = rslope * (hi - nfreqs[rid])

    return fbank

def logMelSpectrum(input, samplingrate, plot=False):
    """
    Calculates the log output of a Mel filterbank when the input is the power spectrum

    Args:
        input: array of power spectrum coefficients [N x nfft] where N is the number of frames and
               nfft the length of each spectrum
        samplingrate: sampling rate of the original signal (used to calculate the filterbank shapes)
        plot: if True, also draw every fifth filter of the filterbank. Off by
              default so that calling this function in a loop over many
              utterances does not open one figure per call.
    Output:
        array of Mel filterbank log outputs [N x nmelfilters] where nmelfilters is the number
        of filters in the filterbank
    Note: the filterbank shapes and nmelfilters come from trfbank, called with
          its default filter layout.
    """
    nfft = input.shape[1]
    filterbank = trfbank(samplingrate, nfft)

    # filters (for explanation)
    if plot:
        plt.figure()
        # transpose so that each line is one filter over the FFT bins;
        # plt.plot draws the columns of a 2-D array as separate lines
        plt.plot(filterbank[::5].T)
        plt.title('Filterbank')
        plt.xlabel('frequency')
        plt.ylabel('amplitude')
        plt.show()

    # weight every spectrum by every filter, then compress with the log
    return np.log(input @ filterbank.T)

samplingrate = example['samplingrate']
# plot=True: show the filterbank once here, for explanation
mspec_ = logMelSpectrum(spec, samplingrate, plot=True)

plt.figure()
plt.pcolormesh(mspec_)
plt.title('Mel filterbank - computed')
plt.figure()
plt.pcolormesh(example['mspec'])
plt.title('Mel filterbank - example')
plt.show()

In [None]:
#@title DCT

###########
# 4.6 DCT #
###########

from scipy.fftpack.realtransforms import dct

def lifter(mfcc, lifter=22):
    """
    Applies liftering to improve the relative range of MFCC coefficients.

    Coefficient k of every frame is scaled by
    1 + lifter/2 * sin(pi * k / lifter).

       mfcc: NxM matrix where N is the number of frames and M the number of MFCC coefficients
       lifter: liftering coefficient

    Returns:
       NxM array with liftered coefficients
    """
    nframes, nceps = mfcc.shape
    weights = 1.0 + lifter/2.0 * np.sin(np.pi * np.arange(nceps) / lifter)
    # the same weight vector applies to every frame: broadcast it over the rows
    return np.multiply(mfcc, weights[np.newaxis, :])

def cepstrum(input, nceps):
    """
    Calculates cepstral coefficients from the mel spectrum by applying the
    Discrete Cosine Transform (dct with its default settings) to each frame
    and keeping the leading coefficients.

    Args:
        input: array of log outputs of Mel scale filterbank [N x nmelfilters] where N is the
               number of frames and nmelfilters the length of the filterbank
        nceps: number of output cepstral coefficients
    Output:
        array of Cepstral coefficients [N x nceps]
    """
    # transform every frame, then truncate to the first nceps coefficients
    return dct(input)[:, :nceps]

# Cepstral coefficients of the example, before and after liftering.
nceps = 13
mfcc_ = cepstrum(mspec_, nceps)
lmfcc = lifter(mfcc_)

# One pair of figures (computed vs. reference) per feature type.
for computed_title, computed_values, example_title, example_values in (
    ('MFCC - computed', mfcc_, 'MFCC - example', example['mfcc']),
    ('Liftered MFCC - computed', lmfcc, 'Liftered MFCC - example', example['lmfcc']),
):
    plt.figure()
    plt.pcolormesh(computed_values)
    plt.title(computed_title)
    plt.figure()
    plt.pcolormesh(example_values)
    plt.title(example_title)
    plt.show()

In [None]:
# @title Feature correlation

#########################
# 5 Feature correlation #
#########################

def mspec(samples, winlen = 400, winshift = 200, preempcoeff=0.97, nfft=512, samplingrate=20000):
    """Computes Mel Filterbank features.

    Runs the chain enframe -> preemp -> windowing -> powerSpectrum ->
    logMelSpectrum on the given samples.

    Args:
        samples: array of speech samples with shape (N,)
        winlen: length of the analysis window
        winshift: number of samples to shift the analysis window at every time step
        preempcoeff: pre-emphasis coefficient
        nfft: length of the Fast Fourier Transform (power of 2, >= winlen)
        samplingrate: sampling rate of the original signal

    Returns:
        nframes x nfilters array with mel filterbank features (see trfbank for nfilters)
    """
    power = powerSpectrum(
        windowing(preemp(enframe(samples, winlen, winshift), preempcoeff)),
        nfft,
    )
    return logMelSpectrum(power, samplingrate)


def mfcc(samples, winlen = 400, winshift = 200, preempcoeff=0.97, nfft=512, nceps=13, samplingrate=20000, liftercoeff=22):
    """Computes Mel Frequency Cepstrum Coefficients.

    Computes the mel filterbank features with mspec, takes their cepstrum
    and applies liftering.

    Args:
        samples: array of speech samples with shape (N,)
        winlen: length of the analysis window
        winshift: number of samples to shift the analysis window at every time step
        preempcoeff: pre-emphasis coefficient
        nfft: length of the Fast Fourier Transform (power of 2, >= winlen)
        nceps: number of cepstrum coefficients to compute
        samplingrate: sampling rate of the original signal
        liftercoeff: liftering coefficient used to equalise scale of MFCCs

    Returns:
        nframes x nceps array with liftered MFCC coefficients
    """
    filterbank_features = mspec(samples, winlen, winshift, preempcoeff, nfft, samplingrate)
    return lifter(cepstrum(filterbank_features, nceps), liftercoeff)

# MFCCs and mspec of all utterances (stored back into each utterance dict)
for utterance in data:
    samples = utterance['samples']
    utterance['mfcc'] = mfcc(samples, winlen, winshift, preempcoeff, nfft, nceps, samplingrate)
    utterance['mspec'] = mspec(samples, winlen, winshift, preempcoeff, nfft, samplingrate)

# show some utterances
# for utterance in data[[0, 4]]:
#     plt.figure()
#     plt.pcolormesh(utterance['mfcc'])
#     plt.title('Gender: {}, Digit: {}, Repetition: {}'.format(utterance['gender'], utterance['digit'], utterance['repetition']))
# plt.show()

# concatenate frames: stack the frames of all utterances along the frame axis
mfcc_frames = np.concatenate([utterance['mfcc'] for utterance in data], axis=0)
mspec_frames = np.concatenate([utterance['mspec'] for utterance in data], axis=0)

# correlations of features (columns are the variables, rows the observations)
correlations_mfcc = np.corrcoef(mfcc_frames, rowvar=False)
correlations_mspec = np.corrcoef(mspec_frames, rowvar=False)

for panel_title, panel_values in (
    ('Correlation matrix - MFCC', correlations_mfcc),
    ('Correlation matrix - Mel filterbank', correlations_mspec),
):
    plt.figure()
    plt.pcolormesh(panel_values)
    plt.title(panel_title)
    plt.show()

In [None]:
#@title Clustering

################
# 6 Clustering #
################

from sklearn.mixture import GaussianMixture

# Seed NumPy's global random state before fitting the models.
np.random.seed(1)

# train GMM: diagonal-covariance mixtures with 4 and 32 components, each
# fitted on all MFCC frames with 100 initialisations
gmm4 = GaussianMixture(n_components=4, covariance_type='diag', n_init=100).fit(mfcc_frames)

gmm32 = GaussianMixture(n_components=32, covariance_type='diag', n_init=100).fit(mfcc_frames)

In [None]:
# analyze some utterances: per-frame component posteriors, first under the
# 4-component model and then under the 32-component model
for mixture, selection in (
    (gmm4, [4, 5, 26, 27, 6, 2, 3, 24, 25]),
    (gmm32, [16, 17, 38, 39]),
):
    for utterance in data[selection]:
        plt.figure()
        plt.pcolormesh(mixture.predict_proba(utterance['mfcc']))
        plt.title('Gender: {}, Digit: {}, Repetition: {}'.format(utterance['gender'], utterance['digit'], utterance['repetition']))
        plt.xlabel('component')
        plt.ylabel('frame')
plt.show()

In [None]:
#@title Comparing utterances

##########################
# 7 Comparing utterances #
##########################

def dtw(x, y, dist):
    """Dynamic Time Warping.

    Args:
        x, y: arrays of size NxD and MxD respectively, where D is the dimensionality
              and N, M are the respective lengths of the sequences
        dist: distance function (called as dist(x[i], y[j]))

    Outputs:
        d: global distance between the sequences (scalar), i.e. the accumulated
           distance at the last cell divided by len(x)+len(y)
        LD: local distance between frames from x and y (NxM matrix)
        AD: accumulated distance between frames of x and y (NxM matrix)
        path: best path through AD as a list of (i, j) tuples, from (0, 0)
              to (N-1, M-1)
    """
    N = x.shape[0]
    M = y.shape[0]

    # local distance between frames (frame-wise distance)
    LD = np.zeros((N, M))
    for i in range(N):
        for j in range(M):
            LD[i, j] = dist(x[i], y[j])

    # accumulated distances
    AD = np.zeros((N, M))
    # Predecessor of every cell, stored as (row, column). A platform integer
    # is required: with an 8-bit type, indices above 255 wrap around and the
    # backtracking below follows a wrong path.
    pred = np.zeros((N, M, 2), dtype=int)
    for i in range(N):
        for j in range(M):
            if i == 0 and j == 0:
                # start cell has no predecessor
                AD[i, j] = LD[i, j]
                continue

            # allowed moves into (i, j): from above, diagonally, from the left
            candidates = []
            if i > 0:
                candidates.append((AD[i-1, j], (i-1, j)))
            if i > 0 and j > 0:
                candidates.append((AD[i-1, j-1], (i-1, j-1)))
            if j > 0:
                candidates.append((AD[i, j-1], (i, j-1)))

            # min returns the first minimal candidate, so ties are broken in
            # the order the candidates were listed
            best_cost, best_pred = min(candidates, key=lambda candidate: candidate[0])
            pred[i, j] = best_pred

            # compute accumulated distance
            AD[i, j] = LD[i, j] + best_cost

    # global distance
    d = AD[-1, -1] / (N + M)

    # best path (backtracking): walk from the end to the start, then reverse
    path = [(N-1, M-1)]
    while path[-1] != (0, 0):
        i, j = path[-1]
        path.append((int(pred[i, j, 0]), int(pred[i, j, 1])))
    path.reverse()

    # show best path (debugging)
    # LD_ = LD.copy()
    # for node in path:
    #     LD_[node] = 0   # to be visible in black
    # plt.figure()
    # plt.pcolormesh(LD_)
    # plt.show()

    return d, LD, AD, path

def euclidean_distance(x, y):
    """Return the norm of the difference x - y, as computed by np.linalg.norm."""
    difference = x - y
    return np.linalg.norm(difference)


N = len(data)   # number of utterances
D = np.zeros((N, N))

# compute distance matrix: global DTW distance between the MFCCs of every
# pair of utterances
for i in range(N):
    x = data[i]['mfcc']
    for j in range(N):
        y = data[j]['mfcc']
        D[i, j] = dtw(x, y, euclidean_distance)[0]

# display distance matrix
plt.figure()
plt.pcolormesh(D)
plt.title('Distance matrix')
plt.show()

In [None]:
#@title Hierarchical clustering

from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform

def tidigit2labels(tidigitsarray):
    """
    Build one plot label per utterance in tidigitsarray.

    Each label joins the utterance's 'gender', 'speaker', 'digit' and
    'repetition' fields with underscores, in that order. Useful for plots.
    """
    return [
        tidigitsarray[idx]['gender'] + '_' +
        tidigitsarray[idx]['speaker'] + '_' +
        tidigitsarray[idx]['digit'] + '_' +
        tidigitsarray[idx]['repetition']
        for idx in range(len(tidigitsarray))
    ]

# hierarchical clustering: complete linkage on the DTW distance matrix.
# squareform converts the square matrix D to the condensed vector that
# linkage expects.
Z = linkage(squareform(D), method='complete')

plt.figure()
dendrogram(Z, labels=tidigit2labels(data))
plt.show()