In [1]:
import sys
sys.path.append('/Users/tim/Desktop/Speech/lab1')
sys.path.append('/Users/tim/Desktop/Speech/lab2')
import numpy as np
from matplotlib import pyplot as plt
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F


from lab3_tools import *
from lab3_proto import *

In [2]:
# phoneHMMs is a dictionary with 21 keys, each corresponding to a phonetic model
phoneHMMs = np.load('../lab2/lab2_models_all.npz', allow_pickle=True)['phoneHMMs'].item()
phones = sorted(phoneHMMs.keys())
nstates = {phone: phoneHMMs[phone]['means'].shape[0] for phone in phones}
# A list of unique states for reference
# Note that we model three segments for each phoneme
stateList = [ph + '_' + str(id) for ph in phones for id in range(nstates[ph])]
stateList[39]

'sil_0'

# Forced Alignment

In [3]:
# Read the audio and compute liftered MFCC features
from lab1_proto import mfcc

filename = 'tidigits/disc_4.1.1/tidigits/train/man/nw/z43a.wav'
samples, samplingrate = loadAudio(filename)
lmfcc = mfcc(samples)

In [4]:
# Recover the sequence of digits (word level transcription) in the file
wordTrans = list(path2info(filename)[2])
wordTrans

['z', '4', '3']

In [5]:
from prondict import prondict
phoneTrans = words2phones(wordTrans, prondict)
phoneTrans

['sil',
 'z',
 'iy',
 'r',
 'ow',
 'sp',
 'f',
 'ao',
 'r',
 'sp',
 'th',
 'r',
 'iy',
 'sp',
 'sil']

In [6]:
from lab2_proto import concatHMMs

# Create a combined model for this specific utterance:
utteranceHMM = concatHMMs(phoneHMMs, phoneTrans)

In [7]:
stateTrans = [phone + '_' + str(stateid) for phone in phoneTrans for stateid in range(nstates[phone])]

In [8]:
from lab2_tools import log_multivariate_normal_density_diag
from lab2_proto import viterbi

# NxM array of emission(observation) log likelihoods, N frames, M states
obsloglik = log_multivariate_normal_density_diag(lmfcc, utteranceHMM['means'], utteranceHMM['covars']) 
log_startprob = np.log(utteranceHMM['startprob'][:-1])
log_transmat = np.log(utteranceHMM['transmat'][:-1, :-1])
vloglik, vpath = viterbi(obsloglik, log_startprob, log_transmat)

stateList = [stateTrans[i] for i in vpath]
stateList

['sil_0',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_2',
 'z_0',
 'z_0',
 'z_0',
 'z_0',
 'z_1',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'iy_0',
 'iy_0',
 'iy_0',
 'iy_0',
 'iy_0',
 'iy_0',
 'iy_0',
 'iy_0',
 'iy_1',
 'iy_2',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_1',
 'r_2',
 'ow_0',
 'ow_1',
 'ow_2',
 'ow_2',
 'ow_2',
 'ow_2',
 'ow_2',
 'ow_2',
 'ow_2',
 'ow_2',
 'ow_2',
 'f_0',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_2',
 'ao_0',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'r_0',
 'r_0',
 'r_0',
 'r_1',
 'r_2',
 'th_0',


# Feature Extraction

In [9]:
# from lab1_proto import mspec

# traindata = []
# for root, dirs, files in os.walk('tidigits/disc_4.1.1/tidigits/train'):
#     for file in files:
#         if file.endswith('.wav'):
#             filename = os.path.join(root, file)
#             samples, samplingrate = loadAudio(filename)
#             lmfcc = mfcc(samples)
#             mspecs = mspec(samples)
#             targets = forcedAlignment(lmfcc, phoneHMMs, phoneTrans)
#             traindata.append({'filename': filename, 'lmfcc': lmfcc, 'mspec': mspecs, 'targets': targets})
# np.savez('traindata.npz', traindata=traindata)

# testdata = []
# for root, dirs, files in os.walk('tidigits/disc_4.2.1/tidigits/test'):
#     for file in files:
#         if file.endswith('.wav'):
#             filename = os.path.join(root, file)
#             samples, samplingrate = loadAudio(filename)
#             lmfcc = mfcc(samples)
#             mspecs = mspec(samples)
#             targets = forcedAlignment(lmfcc, phoneHMMs, phoneTrans)
#             testdata.append({'filename': filename, 'lmfcc': lmfcc, 'mspec': mspecs, 'targets': targets})
# np.savez('testdata.npz', testdata=testdata)

# Training and Validation Sets

In [10]:
train_data = np.load('traindata.npz', allow_pickle=True)['traindata']
test_data = np.load('testdata.npz', allow_pickle=True)['testdata']

In [11]:
random.seed(420)

# Include original data with parsed information
data_list = []
for item in train_data:
    filename = item['filename']
    info_tuple = path2info(filename)
    data_list.append((info_tuple, item))  # Store tuple of parsed info and the full data item

from collections import defaultdict
# Separate data by gender
gender_data = defaultdict(list)
for info, full_data in data_list:
    gender_data[info[0]].append((info, full_data))


train_set = []
valid_set = []

# Split data for each gender
for gender, items in gender_data.items():
    # Group by speaker
    speaker_data = defaultdict(list)
    for info, full_data in items:
        speaker_data[info[1]].append(full_data)
    
    # Create lists of speakers and shuffle them
    speakers = list(speaker_data.keys())
    random.shuffle(speakers)
    
    # Calculate number of speakers for training
    num_train_speakers = int(round(len(speakers) * 0.9))
    
    # Split speakers into training and validation
    train_speakers = speakers[:num_train_speakers]
    valid_speakers = speakers[num_train_speakers:]
    
    # Aggregate the data entries for each set of speakers
    for spkr in train_speakers:
        train_set.extend(speaker_data[spkr])
    for spkr in valid_speakers:
        valid_set.extend(speaker_data[spkr])

# Shuffle the sets to ensure random order
random.shuffle(train_set)
random.shuffle(valid_set)

# Acoustic Context (Dynamic Features)

In [12]:
dlmfcc_train_x = []
dmspec_train_x = []
for i in range(len(train_set)):
    current_utterance = train_set[i]
    num_timesteps = current_utterance['lmfcc'].shape[0]

    for t in range(num_timesteps):
        temp_mfcc_stack = []
        temp_mspec_features = []
        for j in range(t - 3, t + 3 + 1):
            if j < 0:
                # Mirror at the beginning
                temp_mfcc_stack.extend(current_utterance['lmfcc'][abs(j)])
                temp_mspec_features.extend(current_utterance['mspec'][abs(j)])
            elif j >= num_timesteps:
                # Mirror at the end
                temp_mfcc_stack.extend(current_utterance['lmfcc'][2 * num_timesteps - j - 1])
                temp_mspec_features.extend(current_utterance['mspec'][2 * num_timesteps - j - 1])
            else:
                # Normal case
                temp_mfcc_stack.extend(current_utterance['lmfcc'][j])
                temp_mspec_features.extend(current_utterance['mspec'][j])
        
        # Flatten the data structures as specified at 4.6
        dlmfcc_train_x.append(temp_mfcc_stack)
        dmspec_train_x.append(temp_mspec_features)

dlmfcc_train_x = np.array(dlmfcc_train_x)
dmspec_train_x = np.array(dmspec_train_x) 

#---------------------------------------------------------#
dlmfcc_val_x = []
dmspec_val_x = []
for i in range(len(valid_set)):
    current_utterance = valid_set[i]
    num_timesteps = current_utterance['lmfcc'].shape[0]

    for t in range(num_timesteps):
        temp_mfcc_stack = []
        temp_mspec_features = []
        for j in range(t - 3, t + 3 + 1):
            if j < 0:
                # Mirror at the beginning
                temp_mfcc_stack.extend(current_utterance['lmfcc'][abs(j)])
                temp_mspec_features.extend(current_utterance['mspec'][abs(j)])
            elif j >= num_timesteps:
                # Mirror at the end
                temp_mfcc_stack.extend(current_utterance['lmfcc'][2 * num_timesteps - j - 1])
                temp_mspec_features.extend(current_utterance['mspec'][2 * num_timesteps - j - 1])
            else:
                # Normal case
                temp_mfcc_stack.extend(current_utterance['lmfcc'][j])
                temp_mspec_features.extend(current_utterance['mspec'][j])
        
        # Flatten the data structures as specified at 4.6
        dlmfcc_val_x.append(temp_mfcc_stack)
        dmspec_val_x.append(temp_mspec_features)

dlmfcc_val_x = np.array(dlmfcc_val_x)
dmspec_val_x = np.array(dmspec_val_x) 
    
#---------------------------------------------------------#
dlmfcc_test_x = []
dmspec_test_x = []
for i in range(len(train_set)):
    current_utterance = train_set[i]
    num_timesteps = current_utterance['lmfcc'].shape[0]

    for t in range(num_timesteps):
        temp_mfcc_stack = []
        temp_mspec_features = []
        for j in range(t - 3, t + 3 + 1):
            if j < 0:
                # Mirror at the beginning
                temp_mfcc_stack.extend(current_utterance['lmfcc'][abs(j)])
                temp_mspec_features.extend(current_utterance['mspec'][abs(j)])
            elif j >= num_timesteps:
                # Mirror at the end
                temp_mfcc_stack.extend(current_utterance['lmfcc'][2 * num_timesteps - j - 1])
                temp_mspec_features.extend(current_utterance['mspec'][2 * num_timesteps - j - 1])
            else:
                # Normal case
                temp_mfcc_stack.extend(current_utterance['lmfcc'][j])
                temp_mspec_features.extend(current_utterance['mspec'][j])
        
        # Flatten the data structures as specified at 4.6
        dlmfcc_test_x.append(temp_mfcc_stack)
        dmspec_test_x.append(temp_mspec_features)

dlmfcc_test_x = np.array(dlmfcc_test_x)
dmspec_test_x = np.array(dmspec_test_x) 

# Feature Standardisation

### What will happen with the very short utterances in the files containing isolated digits when normalizing each utterance individually?

Short utterances will have less reliable estimates of mean and variance, leading to potentially unstable feature scaling.

Extreme normalization values might occur if a particular feature deviates slightly in a short utterance, as there's less data to average out noise and variability. This can exaggerate the importance of minor variations in short utterances, potentially skewing the model training or performance.

In [13]:
lmfcc_train_x = []
mspec_train_x = []
train_y = []
for i in range(len(train_set)):
    lmfcc_train_x.append(train_set[i]['lmfcc'])
    mspec_train_x.append(train_set[i]['mspec'])
    train_y.append(train_set[i]['targets'])
lmfcc_train_x = np.concatenate(lmfcc_train_x, axis=0)
mspec_train_x = np.concatenate(mspec_train_x, axis=0)
train_y = np.concatenate(train_y, axis=0)
#---------------------------------------------------------#
lmfcc_val_x = []
mspec_val_x = []
val_y = []
for i in range (len(valid_set)):
    lmfcc_val_x.append(valid_set[i]['lmfcc'])
    mspec_val_x.append(valid_set[i]['mspec'])
    val_y.append(valid_set[i]['targets'])
lmfcc_val_x = np.concatenate(lmfcc_val_x, axis=0)
mspec_val_x = np.concatenate(mspec_val_x, axis=0)
val_y = np.concatenate(val_y, axis=0)
#---------------------------------------------------------#
lmfcc_test_x = []
mspec_test_x = []
test_y = []
for i in range (len(test_data)):
    lmfcc_test_x.append(test_data[i]['lmfcc'])
    mspec_test_x.append(test_data[i]['mspec'])
    test_y.append(test_data[i]['targets'])
lmfcc_test_x = np.concatenate(lmfcc_test_x, axis=0)
mspec_test_x = np.concatenate(mspec_test_x, axis=0)
test_y = np.concatenate(test_y, axis=0)

In [14]:
print(lmfcc_train_x.shape)
print(lmfcc_val_x.shape)
print(lmfcc_test_x.shape)
print()
print(mspec_train_x.shape)
print(mspec_val_x.shape)
print(mspec_test_x.shape)
print()
print(dlmfcc_train_x.shape)
print(dlmfcc_val_x.shape)
print(dlmfcc_test_x.shape)
print()
print(dmspec_train_x.shape)
print(dmspec_val_x.shape)
print(dmspec_test_x.shape)

(1363305, 13)
(144087, 13)
(1527014, 13)

(1363305, 40)
(144087, 40)
(1527014, 40)

(1363305, 91)
(144087, 91)
(1363305, 91)

(1363305, 280)
(144087, 280)
(1363305, 280)


In [15]:
from sklearn.preprocessing import StandardScaler

lmfcc_scaler = StandardScaler()
dlmfcc_scaler = StandardScaler()
mspec_scaler = StandardScaler()
dmspec_scaler = StandardScaler()

lmfcc_train_x = lmfcc_scaler.fit_transform(lmfcc_train_x)
dlmfcc_train_x = dlmfcc_scaler.fit_transform(dlmfcc_train_x)
mspec_train_x = mspec_scaler.fit_transform(mspec_train_x)
dmspec_train_x = dmspec_scaler.fit_transform(dmspec_train_x)
lmfcc_train_x = lmfcc_train_x.astype('float32')
dlmfcc_train_x = dlmfcc_train_x.astype('float32')
mspec_train_x = mspec_train_x.astype('float32')
dmspec_train_x = dmspec_train_x.astype('float32')

# Transform validation and test data using the normalization coefficients from training data
lmfcc_val_x = lmfcc_scaler.transform(lmfcc_val_x)
dlmfcc_val_x = dlmfcc_scaler.transform(dlmfcc_val_x)
mspec_val_x = mspec_scaler.transform(mspec_val_x)
dmspec_val_x = dmspec_scaler.transform(dmspec_val_x)
lmfcc_val_x = lmfcc_val_x.astype('float32')
dlmfcc_val_x = dlmfcc_val_x.astype('float32')
mspec_val_x = mspec_val_x.astype('float32')
dmspec_val_x = dmspec_val_x.astype('float32')

lmfcc_test_x = lmfcc_scaler.transform(lmfcc_test_x)
dlmfcc_test_x = dlmfcc_scaler.transform(dlmfcc_test_x)
mspec_test_x = mspec_scaler.transform(mspec_test_x)
dmspec_test_x = dmspec_scaler.transform(dmspec_test_x)
lmfcc_test_x = lmfcc_test_x.astype('float32')
dlmfcc_test_x = dlmfcc_test_x.astype('float32')
mspec_test_x = mspec_test_x.astype('float32')
dmspec_test_x = dmspec_test_x.astype('float32')

In [16]:
output_dim = len(stateList)
train_y = F.one_hot(torch.tensor(train_y), num_classes=output_dim)
val_y = F.one_hot(torch.tensor(val_y), num_classes=output_dim)
test_y = F.one_hot(torch.tensor(test_y), num_classes=output_dim)

# Phoneme Recognition with Deep Neural Networks

In [17]:
# Define the DNN model class
class PhonemeDNN(nn.Module):
    def __init__(self, input_size, num_classes, num_layers=1, num_units=256):
        super(PhonemeDNN, self).__init__()
        self.layers = nn.ModuleList()
        
        # Input layer
        self.layers.append(nn.Linear(input_size, num_units))
        self.layers.append(nn.ReLU())
        
        # Hidden layers
        for _ in range(num_layers - 1):
            self.layers.append(nn.Linear(num_units, num_units))
            self.layers.append(nn.ReLU())
        
        # Output layer
        self.layers.append(nn.Linear(num_units, num_classes))

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

In [18]:
def train_model(model, train_loader, val_loader, epochs=10, learning_rate=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
        
        print(f'Epoch {epoch+1}: Train Loss: {running_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}')


In [19]:
# Example usage:
input_size = 13  # Assuming MFCC features of dimension 13
num_classes = 10  # Total number of phoneme classes
num_layers = 2    # Number of hidden layers

# Load your data into DataLoader
# train_loader and val_loader should be defined with your actual data
# model = PhonemeDNN(input_size, num_classes, num_layers)
# train_model(model, train_loader, val_loader)

# Implement similar setup for 1 to 4 layers, and for different features (MFCC, filterbank)

# Possible questions

### What is the influence of feature kind and size of input context window?
The model performance. 

The context window size determines how much of the temporal information surrounding each frame is considered by the network. A larger window provides more contextual information, potentially improving recognition accuracy but also increasing the complexity of the model. Dynamic features, which include several frames of context, typically lead to better performance compared to static features.

### What is the purpose of normalising (standardising) the input feature vectors depending on the activation functions in the network?
Normalising the feature vectors ensures that each feature contributes equally to the learning process, preventing features with larger numerical ranges from dominating the training dynamics. 

This is crucial when using activation functions like sigmoid or tanh in the network, as these functions are sensitive to input scale and can suffer from saturation issues: inputs with large absolute values cause the function outputs to be near the function's asymptotes, resulting in gradients close to zero.

### What is the influence of the number of units per layer and the number of layers?
Increasing the number of units per layer typically allows the network to learn more complex patterns and representations, improving its capability to differentiate between more subtle features of the input. 

Similarly, more layers can enable deeper (hence, potentially more abstract) feature learning. 

However, larger and deeper networks are more prone to overfitting and require more data and computational resources to train effectively.

### What is the influence of the activation function (when you try other activation functions than ReLU, you do not need to reach convergence in case you do not have enough time)
The choice of activation function affects the training dynamics and the performance of the network. 

ReLU (Rectified Linear Unit) is commonly used because it helps in alleviating the vanishing gradient problem encountered with sigmoid or tanh functions. 

Trying other activation functions like Leaky ReLU, ELU, or sigmoid might affect convergence speed and the final model performance, especially if the network architecture or data characteristics make it susceptible to issues like dying ReLU problem or gradient vanishing/explosion.

### What is the influence of the learning rate/learning rate strategy?
Too high a rate can cause the training to diverge, while too low a rate might result in a painfully slow convergence or getting stuck in local minima. 

Adaptive learning rate strategies like Adam adjust the learning rate during training, which can lead to faster convergence and can alleviate some of the tuning requirements.

### How stable are the posteriograms from the network in time?
Posteriors produced by the network should ideally be stable for consistent phonetic units across similar contexts. 

However, in practice, stability can vary depending on factors like network architecture, training sufficiency, and the inherent variability of speech. Observing fluctuations in posteriograms can indicate issues with model generalization or insufficient training.

### How do the errors distribute depending on phonetic class?
Errors in phoneme recognition often vary by phonetic class. 

Some phonemes might be consistently harder to recognize due to their acoustic similarity to other phonemes, less distinctiveness in their spectral features, or their shorter duration. 

Confusion matrices by phonetic class can help identify which phonemes are most frequently misclassified and which phonemes they are confused with, guiding further model adjustments or targeted data augmentation.
