In [41]:
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import librosa
import sys
import torch
import random
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import statistics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn import preprocessing
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch import optim
import torch.utils.data as data_utils
import os
import re
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from pomegranate.distributions import Normal
from pomegranate.gmm import GeneralMixtureModel
from pomegranate.hmm import DenseHMM
#from pomegranate import *

In [17]:
import os
from glob import glob

import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm


def parse_free_digits(directory):
    """Load every ``.wav`` file in ``directory`` and parse its metadata.

    File names are expected to look like ``{digit}_{speaker}_{index}.wav``.
    The metadata is taken from the base name of each file, so the function
    works for nested directories, paths containing dots (``./recordings``)
    and platforms that do not use ``/`` as the path separator.

    Args:
        directory: Folder that directly contains the ``.wav`` recordings.

    Returns:
        A tuple ``(wavs, Fs, ids, y, speakers)`` where ``wavs`` is the list of
        loaded waveforms, ``Fs`` is the sampling rate reported for the first
        file, ``ids`` are the recording indices (strings), ``y`` the digit
        labels (ints) and ``speakers`` the speaker names. All lists follow the
        sorted order of the file paths.

    Raises:
        FileNotFoundError: If ``directory`` contains no ``.wav`` files.
        ValueError: If a file name does not have three ``_``-separated fields
            or its first field is not an integer.
    """
    # glob() returns files in arbitrary order; sort for run-to-run stability.
    files = sorted(glob(os.path.join(directory, "*.wav")))
    if not files:
        raise FileNotFoundError("No .wav files found in {!r}".format(directory))

    # Parse relevant dataset info from the base name only.
    ids, y, speakers = [], [], []
    for path in files:
        stem = os.path.splitext(os.path.basename(path))[0]
        fields = stem.split("_")
        if len(fields) < 3:
            raise ValueError(
                "Unexpected file name {!r}; expected digit_speaker_index.wav".format(path)
            )
        y.append(int(fields[0]))
        speakers.append(fields[1])
        ids.append(fields[2])

    # Read all wavs at their native sampling rate (sr=None disables resampling).
    # The sampling rate is taken from the first file instead of loading it twice.
    wavs = []
    Fs = None
    for path in files:
        wav, rate = librosa.load(path, sr=None)
        if Fs is None:
            Fs = rate
        wavs.append(wav)

    # Print dataset info
    print("Total wavs: {}. Fs = {} Hz".format(len(wavs), Fs))

    return wavs, Fs, ids, y, speakers


def extract_features(wavs, n_mfcc=6, Fs=8000):
    """Compute an MFCC matrix for every waveform.

    The analysis window is ``30 * Fs // 1000`` samples long and the hop is the
    window length minus half the window (integer division).

    Args:
        wavs: Iterable of waveforms.
        n_mfcc: Number of MFCC coefficients requested per frame.
        Fs: Sampling rate passed to the MFCC routine.

    Returns:
        A list with one transposed MFCC matrix per waveform.
    """
    win_length = 30 * Fs // 1000
    hop = win_length - win_length // 2

    frames = []
    for signal in tqdm(wavs, desc="Extracting mfcc features..."):
        coefficients = librosa.feature.mfcc(
            y=signal, sr=Fs, n_fft=win_length, hop_length=hop, n_mfcc=n_mfcc
        )
        # Transpose so that the first axis runs over frames.
        frames.append(coefficients.T)

    print("Feature extraction completed with {} mfccs per frame".format(n_mfcc))

    return frames


def split_free_digits(frames, ids, speakers, labels):
    """Split the dataset into train and test parts by recording index.

    A recording goes to the test set when the string form of its index is one
    of "0".."4"; every other recording goes to the train set.

    Args:
        frames: Per-recording feature matrices.
        ids: Per-recording indices.
        speakers: Per-recording speaker names.
        labels: Per-recording digit labels.

    Returns:
        ``(X_train, X_test, y_train, y_test, spk_train, spk_test)``.
    """
    print("Splitting in train test split using the default dataset split")

    held_out = {"0", "1", "2", "3", "4"}
    # Each bucket is (features, labels, speakers).
    train_bucket = ([], [], [])
    test_bucket = ([], [], [])

    for rec_id, feats, digit, speaker in zip(ids, frames, labels, speakers):
        target = test_bucket if str(rec_id) in held_out else train_bucket
        for store, item in zip(target, (feats, digit, speaker)):
            store.append(item)

    X_train, y_train, spk_train = train_bucket
    X_test, y_test, spk_test = test_bucket
    return X_train, X_test, y_train, y_test, spk_train, spk_test


def make_scale_fn(X_train):
    """Build a function that standardizes sequences using ``X_train`` statistics.

    A ``StandardScaler`` is fitted on all frames of ``X_train`` concatenated
    together, and its ``mean_`` and ``scale_`` are printed.

    Args:
        X_train: List of 2-D feature matrices used to fit the scaler.

    Returns:
        A function ``scale(X)`` that applies the fitted scaler to every matrix
        in ``X`` and returns the results as a list.
    """
    stacked_frames = np.concatenate(X_train)
    scaler = StandardScaler().fit(stacked_frames)
    print("Normalization will be performed using mean: {}".format(scaler.mean_))
    print("Normalization will be performed using std: {}".format(scaler.scale_))

    def scale(X):
        # Transform each utterance independently with the shared statistics.
        return [scaler.transform(utterance) for utterance in X]

    return scale


def parser(directory, n_mfcc=6):
    """Run the full loading pipeline for a recordings directory.

    Loads the recordings with ``parse_free_digits``, computes MFCCs with
    ``extract_features`` (using the sampling rate reported by the loader) and
    splits the result with ``split_free_digits``.

    Args:
        directory: Folder that contains the ``.wav`` recordings.
        n_mfcc: Number of MFCC coefficients per frame.

    Returns:
        ``(X_train, X_test, y_train, y_test, spk_train, spk_test)``.
    """
    waveforms, sample_rate, rec_ids, digits, spk_names = parse_free_digits(directory)
    mfccs = extract_features(waveforms, n_mfcc=n_mfcc, Fs=sample_rate)
    return split_free_digits(mfccs, rec_ids, spk_names, digits)

In [36]:
### Step 9: load the recordings, extract MFCC features and apply the default train/test split
X_train, X_test, y_train, y_test, spk_train, spk_test = parser('recordings/') # relative path: resolved against the kernel's working directory

Total wavs: 3000. Fs = 8000 Hz


Extracting mfcc features...: 100%|██████████| 3000/3000 [00:12<00:00, 241.32it/s]

Feature extraction completed with 6 mfccs per frame
Splitting in train test split using the default dataset split





In [37]:
# Hold out 20% of the training utterances as a validation set, stratified by
# digit. The fixed seed makes the split (and the statistics printed below)
# reproducible across kernel restarts.
SPLIT_SEED = 42
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=SPLIT_SEED
)

# The first two calls are only made for the statistics they print, so that the
# three normalization choices can be compared. Only the scaler fitted on the
# training split is kept, so no validation or test data leaks into it.
print("If using all data to calculate normalization statistics")
make_scale_fn(X_train + X_val + X_test)
print("If using X_train + X_dev to calculate normalization statistics")
make_scale_fn(X_train + X_val)
print("If using X_train to calculate normalization statistics")
scale_fn = make_scale_fn(X_train)

# NOTE: X_train / X_test are overwritten with their standardized versions, so
# re-running this cell without re-running the parsing cell would standardize
# already-standardized data.
X_train = scale_fn(X_train)
X_dev = scale_fn(X_val)
# Keep X_val in sync with X_dev so that code reading either name gets the
# standardized validation features instead of the raw ones.
X_val = X_dev
X_test = scale_fn(X_test)

If using all data to calculate normalization statistics
Normalization will be performed using mean: [-517.82970067   62.3857955    18.81777176    9.58994408  -19.21332918
  -10.9054417 ]
Normalization will be performed using std: [152.29960089  51.98705829  36.71929108  29.63888661  24.80403283
  23.39483933]
If using X_train + X_dev to calculate normalization statistics
Normalization will be performed using mean: [-517.77180304   62.41689972   18.86552787    9.61652008  -19.17346574
  -10.77057825]
Normalization will be performed using std: [152.46343541  51.98376561  36.72489087  29.65300818  24.84129996
  23.30360999]
If using X_train to calculate normalization statistics
Normalization will be performed using mean: [-515.81248143   62.70110253   18.93207515    9.70706692  -19.38836838
  -10.70239465]
Normalization will be performed using std: [151.82794745  52.35051782  36.82451718  29.70388096  24.75182463
  23.48335437]


In [38]:
# Count the samples of each digit (0-9) in the train, validation and test sets and print the totals.
y_t = np.array(y_train)
y_te = np.array(y_test)
y_v = np.array(y_val)

train_number = [int(np.count_nonzero(y_t == digit)) for digit in range(10)]
val_number = [int(np.count_nonzero(y_v == digit)) for digit in range(10)]
test_number = [int(np.count_nonzero(y_te == digit)) for digit in range(10)]

for header, counts in (
    ("For train set:", train_number),
    ("For validation set:", val_number),
    ("For test set:", test_number),
):
    print(header)
    for i, count in enumerate(counts):
        print('Number of samples for digit {}:'.format(i), count)

For train set:
Number of samples for digit 0: 216
Number of samples for digit 1: 216
Number of samples for digit 2: 216
Number of samples for digit 3: 216
Number of samples for digit 4: 216
Number of samples for digit 5: 216
Number of samples for digit 6: 216
Number of samples for digit 7: 216
Number of samples for digit 8: 216
Number of samples for digit 9: 216
For validation set:
Number of samples for digit 0: 54
Number of samples for digit 1: 54
Number of samples for digit 2: 54
Number of samples for digit 3: 54
Number of samples for digit 4: 54
Number of samples for digit 5: 54
Number of samples for digit 6: 54
Number of samples for digit 7: 54
Number of samples for digit 8: 54
Number of samples for digit 9: 54
For test set:
Number of samples for digit 0: 30
Number of samples for digit 1: 30
Number of samples for digit 2: 30
Number of samples for digit 3: 30
Number of samples for digit 4: 30
Number of samples for digit 5: 30
Number of samples for digit 6: 30
Number of samples for d

In [45]:
### Step 10: GMM-HMM hyperparameters
n_states = 2  # the number of HMM states
n_mixtures = 2  # the number of Gaussians in each mixture
gmm = True  # whether to use GMM or plain Gaussian
covariance_type = "diag"  # use diagonal covariance; NOTE(review): no visible cell reads this value - confirm it is used further down


# Group the sequences by digit label.
def gather_in_dic(X, labels):
    """Collect the entries of ``X`` that share a label.

    Args:
        X: Indexable collection of sequences, aligned with ``labels``.
        labels: Label of each entry of ``X``.

    Returns:
        A dict mapping each distinct label to a tuple
        ``(sequences, lengths, repeated_labels)``: the entries of ``X`` with
        that label, the ``len`` of each of them, and the label repeated once
        per entry.
    """
    grouped = {}
    for digit in set(labels):
        sequences = [X[pos] for pos, label in enumerate(labels) if label == digit]
        seq_lengths = list(map(len, sequences))
        grouped[digit] = (sequences, seq_lengths, [digit] * len(sequences))
    return grouped

# Build the per-digit dictionaries for each split. The validation dictionary
# uses X_dev - the validation features after scale_fn - so that all three
# dictionaries hold standardized data.
train_dic = gather_in_dic(X_train, y_train)
val_dic = gather_in_dic(X_dev, y_val)
test_dic = gather_in_dic(X_test, y_test)
# sorted() gives a deterministic label order; plain set iteration order is
# not guaranteed.
labels = sorted(set(y_train))

In [51]:
# GMM-HMM for a single digit: a left-to-right HMM whose states emit from
# Gaussian mixtures (or from single Gaussians when gmm=False).
class GMM_HMM:
    """Left-to-right HMM with Gaussian / GMM emissions for one digit.

    Written against the pomegranate >= 1.0 API (``Normal``,
    ``GeneralMixtureModel``, ``DenseHMM``), which is what the notebook imports.

    Args:
        X: Data of a single digit used to initialize the emissions. Either a
            2-D array of frames or a list of 2-D ``(n_frames, n_features)``
            sequences; sequences are stacked along the frame axis.
        n_states: Number of HMM states (>= 1).
        n_mixtures: Number of Gaussians per state when ``gmm`` is true (>= 1).
        gmm: Use a Gaussian mixture per state if true, otherwise one Gaussian.
        covariance_type: Covariance type passed to every ``Normal``.

    Attributes:
        trans_matrix: ``(n_states, n_states)`` initial transition matrix.
        start: Initial state probabilities (all mass on the first state).
        end: End probabilities (all mass on the last state).
        dists: One fitted emission distribution per state.
        model: The underlying ``DenseHMM``.

    Raises:
        ValueError: If ``n_states`` or ``n_mixtures`` is smaller than 1, or if
            ``X`` is empty.
    """

    def __init__(self, X, n_states=2, n_mixtures=2, gmm=True, covariance_type="diag"):
        if n_states < 1:
            raise ValueError("n_states must be >= 1, got {}".format(n_states))
        if n_mixtures < 1:
            raise ValueError("n_mixtures must be >= 1, got {}".format(n_mixtures))

        self.X = X
        self.n_states = n_states
        self.n_mixtures = n_mixtures
        self.gmm = gmm
        self.covariance_type = covariance_type

        # Left-to-right topology: each state either stays where it is or moves
        # to the next state; the last state only loops on itself.
        # np.zeros takes the shape as a single tuple argument.
        self.trans_matrix = np.zeros((self.n_states, self.n_states), dtype=np.float32)
        for i in range(self.n_states):
            self.trans_matrix[i, i] = 0.5
            if i + 1 < self.n_states:
                self.trans_matrix[i, i + 1] = 0.5
        self.trans_matrix[-1, -1] = 1.0

        # Sequences always start in the first state and end in the last one.
        self.start = np.zeros(self.n_states, dtype=np.float32)
        self.start[0] = 1.0
        self.end = np.zeros(self.n_states, dtype=np.float32)
        self.end[-1] = 1.0

        # Every state's emission distribution is initialized on all frames.
        frames = self._stack_frames(X)
        self.dists = []
        for _ in range(self.n_states):
            if self.gmm:
                components = [
                    Normal(covariance_type=self.covariance_type)
                    for _ in range(self.n_mixtures)
                ]
                emission = GeneralMixtureModel(components).fit(frames)
            else:
                emission = Normal(covariance_type=self.covariance_type).fit(frames)
            self.dists.append(emission)

        self.model = DenseHMM(
            distributions=self.dists,
            edges=self.trans_matrix,
            starts=self.start,
            ends=self.end,
        )

    @staticmethod
    def _stack_frames(X):
        """Stack a 2-D array or a list of sequences into one float32 matrix."""
        # np.atleast_2d lets the same code handle rows of a 2-D array and
        # whole (n_frames, n_features) sequences.
        pieces = [np.atleast_2d(np.asarray(seq, dtype=np.float32)) for seq in X]
        if not pieces:
            raise ValueError("X must contain at least one frame")
        return np.concatenate(pieces, axis=0)

    def fit(self, X, max_iters=4):
        """Train the HMM on a list of ``(n_frames, n_features)`` sequences.

        Args:
            X: Iterable of sequences; each is cast to float32.
            max_iters: Maximum number of training iterations.

        Returns:
            ``self``.
        """
        self.max_iters = max_iters
        # The iteration cap is an attribute of the model, not a fit() argument.
        self.model.max_iter = self.max_iters
        self.model.fit([np.asarray(seq, dtype=np.float32) for seq in X])
        return self

    def predict(self, X):
        """Score ``X`` under this digit's model.

        Args:
            X: One ``(n_frames, n_features)`` sequence, or a 3-D batch of
                equal-length sequences.

        Returns:
            The log-probability as a float for a single 2-D sequence, or the
            model's per-sequence log-probabilities for a 3-D batch.
        """
        batch = np.asarray(X, dtype=np.float32)
        single = batch.ndim == 2
        if single:
            # The model scores batches, so add a leading batch axis.
            batch = batch[np.newaxis]
        scores = self.model.log_probability(batch)
        return float(scores[0]) if single else scores


In [None]:
#initialize one gmm-hmm for every digit
def initialize_gmm_hmm(train_dic, n_states, n_mixtures):
    hmm_list = []
    