In [1]:
import sys
sys.path.append('/Users/tim/Desktop/Speech/lab1')
sys.path.append('/Users/tim/Desktop/Speech/lab2')
import math
import warnings
import numpy as np
from matplotlib import pyplot as plt
import os
import random
import time


from lab3_tools import *
from lab3_proto import *

In [2]:
# Load the dictionary of phonetic models (21 keys, one model per phone) saved in lab 2
phoneHMMs = np.load('../lab2/lab2_models_all.npz', allow_pickle=True)['phoneHMMs'].item()
phones = sorted(phoneHMMs)
# Number of states of each phone, read from the row count of its 'means' array
nstates = {}
for phone in phones:
    nstates[phone] = phoneHMMs[phone]['means'].shape[0]
# Reference list of unique state names '<phone>_<index>',
# ordered by phone name and then by state index within the phone
stateList = []
for phone in phones:
    for state_index in range(nstates[phone]):
        stateList.append(phone + '_' + str(state_index))
stateList[39]

'sil_0'

# Forced Alignment

In [3]:
# Read the audio and compute liftered MFCC features
from lab1_proto import mfcc

# Example utterance used for the forced-alignment walkthrough in the cells below
filename = 'tidigits/disc_4.1.1/tidigits/train/man/nw/z43a.wav'
# loadAudio comes from one of the star imports at the top (presumably lab3_tools — confirm)
samples, samplingrate = loadAudio(filename)
# samplingrate is not passed on: mfcc is called with its default settings
lmfcc = mfcc(samples)

In [4]:
# Recover the sequence of digits (word level transcription) in the file
# The third field returned by path2info is split element by element into a list
wordTrans = list(path2info(filename)[2])
wordTrans

['z', '4', '3']

In [5]:
from prondict import prondict
# Expand the word-level transcription into a phone-level one via the pronunciation dictionary
phoneTrans = words2phones(wordTrans, prondict)
phoneTrans

['sil',
 'z',
 'iy',
 'r',
 'ow',
 'sp',
 'f',
 'ao',
 'r',
 'sp',
 'th',
 'r',
 'iy',
 'sp',
 'sil']

In [6]:
from lab2_proto import concatHMMs

# Create a combined model for this specific utterance:
# (presumably the phone models are chained in the order given by phoneTrans — confirm in lab2_proto)
utteranceHMM = concatHMMs(phoneHMMs, phoneTrans)

In [7]:
# Name every state of the utterance model as '<phone>_<index>',
# walking the phones in transcription order
stateTrans = []
for phone in phoneTrans:
    for stateid in range(nstates[phone]):
        stateTrans.append(phone + '_' + str(stateid))

In [8]:
from lab2_tools import log_multivariate_normal_density_diag
from lab2_proto import viterbi

# NxM array of emission (observation) log likelihoods, N frames, M states
obsloglik = log_multivariate_normal_density_diag(lmfcc, utteranceHMM['means'], utteranceHMM['covars'])
# The last entry of startprob and the last row/column of transmat are dropped
# before taking logs (presumably the non-emitting exit state — confirm against concatHMMs).
# np.log of a zero probability yields -inf together with a RuntimeWarning;
# only the warning is suppressed here, the values are unchanged.
with np.errstate(divide='ignore'):
    log_startprob = np.log(utteranceHMM['startprob'][:-1])
    log_transmat = np.log(utteranceHMM['transmat'][:-1, :-1])
vloglik, vpath = viterbi(obsloglik, log_startprob, log_transmat)

# Frame-by-frame state names along the best path.
# Kept under its own name so that stateList keeps holding the unique state
# names defined at the top of the notebook instead of being overwritten
# with the alignment of this single utterance.
viterbiStateTrans = [stateTrans[int(i)] for i in vpath]
viterbiStateTrans

['sil_0',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_2',
 'z_0',
 'z_0',
 'z_0',
 'z_0',
 'z_1',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'iy_0',
 'iy_0',
 'iy_0',
 'iy_0',
 'iy_0',
 'iy_0',
 'iy_0',
 'iy_0',
 'iy_1',
 'iy_2',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_1',
 'r_2',
 'ow_0',
 'ow_1',
 'ow_2',
 'ow_2',
 'ow_2',
 'ow_2',
 'ow_2',
 'ow_2',
 'ow_2',
 'ow_2',
 'ow_2',
 'f_0',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_2',
 'ao_0',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'r_0',
 'r_0',
 'r_0',
 'r_1',
 'r_2',
 'th_0',


# Feature Extraction

In [9]:
# Disabled feature extraction: walks the TIDIGITS train/test trees, computes
# lmfcc and mspec features plus forced-alignment targets for every .wav file,
# and saves the results to traindata.npz / testdata.npz.
# NOTE(review): as written, phoneTrans is never recomputed inside the loops, so
# every file would be aligned against the transcription of the single example
# utterance processed above. Presumably wordTrans/phoneTrans should be derived
# from each filename before calling forcedAlignment — confirm before re-running.
# from lab1_proto import mspec

# traindata = []
# for root, dirs, files in os.walk('tidigits/disc_4.1.1/tidigits/train'):
#     for file in files:
#         if file.endswith('.wav'):
#             filename = os.path.join(root, file)
#             samples, samplingrate = loadAudio(filename)
#             lmfcc = mfcc(samples)
#             mspecs = mspec(samples)
#             targets = forcedAlignment(lmfcc, phoneHMMs, phoneTrans)
#             traindata.append({'filename': filename, 'lmfcc': lmfcc, 'mspec': mspecs, 'targets': targets})
# np.savez('traindata.npz', traindata=traindata)

# testdata = []
# for root, dirs, files in os.walk('tidigits/disc_4.2.1/tidigits/test'):
#     for file in files:
#         if file.endswith('.wav'):
#             filename = os.path.join(root, file)
#             samples, samplingrate = loadAudio(filename)
#             lmfcc = mfcc(samples)
#             mspecs = mspec(samples)
#             targets = forcedAlignment(lmfcc, phoneHMMs, phoneTrans)
#             testdata.append({'filename': filename, 'lmfcc': lmfcc, 'mspec': mspecs, 'targets': targets})
# np.savez('testdata.npz', testdata=testdata)

# Training and Validation Sets

In [10]:
# Load the pre-computed feature sets written by the (commented-out) feature extraction cell.
# allow_pickle=True unpickles arbitrary objects: only load archives you produced yourself.
# Each entry is used below as a dict with 'filename', 'lmfcc', 'mspec' and 'targets' keys.
train_data = np.load('traindata.npz', allow_pickle=True)['traindata']
test_data = np.load('testdata.npz', allow_pickle=True)['testdata']

In [11]:
from collections import defaultdict

random.seed(420)

# Pair every training utterance with the information parsed from its file path
data_list = [(path2info(item['filename']), item) for item in train_data]

# Bucket the (info, utterance) pairs by gender, the first field of the parsed info
gender_data = defaultdict(list)
for entry in data_list:
    gender_data[entry[0][0]].append(entry)

train_set, valid_set = [], []

# Split each gender separately so that both sets keep a similar gender balance,
# and split by speaker so that no speaker ends up in both sets
for items in gender_data.values():
    # Utterances of this gender grouped by speaker (second field of the parsed info)
    speaker_data = defaultdict(list)
    for info, utterance in items:
        speaker_data[info[1]].append(utterance)

    # Random speaker order; the first 90% (rounded) go to training
    speakers = list(speaker_data)
    random.shuffle(speakers)
    num_train_speakers = int(round(len(speakers) * 0.9))

    for position, spkr in enumerate(speakers):
        destination = train_set if position < num_train_speakers else valid_set
        destination.extend(speaker_data[spkr])

# Randomise the utterance order inside each set
random.shuffle(train_set)
random.shuffle(valid_set)

# Acoustic Context (Dynamic Features)

In [12]:
def stack_context(features, context=3):
    """Stack every frame with its `context` neighbours on each side.

    Parameters
    ----------
    features : array-like of shape (num_timesteps, num_features)
        Per-frame feature vectors of one utterance.
    context : int, optional
        Number of frames taken on each side of the current frame (default 3,
        i.e. a window of 7 frames).

    Returns
    -------
    numpy.ndarray of shape (num_timesteps, num_features * (2 * context + 1))
        Row t is the concatenation of frames t - context, ..., t, ..., t + context.
        Frames that fall outside the utterance are mirrored around the first
        and the last frame without repeating that frame, e.g. the first row
        uses frame indices [3, 2, 1, 0, 1, 2, 3] and the last row uses
        [N-4, N-3, N-2, N-1, N-2, N-3, N-4].
    """
    features = np.asarray(features, dtype=np.float64)
    num_timesteps, num_features = features.shape
    window = 2 * context + 1
    if num_timesteps == 0:
        # Nothing to mirror; np.pad cannot reflect an empty axis
        return np.zeros((0, num_features * window))
    # Reflect-pad along time only, so both ends are mirrored the same way
    padded = np.pad(features, ((context, context), (0, 0)), mode='reflect')
    # Slice k of the padded array holds, for every t, the frame at offset k - context
    return np.concatenate(
        [padded[k:k + num_timesteps] for k in range(window)], axis=1)


# Replace the per-frame features of every utterance by their context-stacked
# version (flattened as specified at 4.6).
# Run this cell once per kernel session: the features are overwritten in place,
# so a second run would stack the already stacked features again.
for dataset in (train_set, valid_set, test_data):
    for current_utterance in dataset:
        for feature_kind in ('lmfcc', 'mspec'):
            current_utterance[feature_kind] = stack_context(current_utterance[feature_kind])

# Feature Standardisation

### What will happen with the very short utterances in the files containing isolated digits when normalizing each utterance individually?

Short utterances will have less reliable estimates of mean and variance, leading to potentially unstable feature scaling.

Extreme normalization values might occur if a particular feature deviates slightly in a short utterance, as there's less data to average out noise and variability. This can exaggerate the importance of minor variations in short utterances, potentially skewing the model training or performance.

In [13]:
def _concat_frames(utterances, key):
    """Concatenate the `key` entry of every utterance along the first (frame) axis."""
    return np.concatenate([utterance[key] for utterance in utterances], axis=0)


# Training frames and targets
lmfcc_train_x = _concat_frames(train_set, 'lmfcc')
mspec_train_x = _concat_frames(train_set, 'mspec')
train_y = _concat_frames(train_set, 'targets')
#---------------------------------------------------------#
# Validation frames and targets
lmfcc_val_x = _concat_frames(valid_set, 'lmfcc')
mspec_val_x = _concat_frames(valid_set, 'mspec')
val_y = _concat_frames(valid_set, 'targets')
#---------------------------------------------------------#
# Test frames and targets
lmfcc_test_x = _concat_frames(test_data, 'lmfcc')
mspec_test_x = _concat_frames(test_data, 'mspec')
test_y = _concat_frames(test_data, 'targets')

84
359
106
205
100
164
176
319
100
205
215
193
92
338
143
98
157
79
111
265
347
311
96
91
166
84
169
167
201
119
321
182
127
120
325
132
129
123
114
162
135
101
167
141
169
207
147
215
102
155
157
88
221
98
297
312
225
75
225
98
95
161
93
171
138
112
114
298
123
184
96
175
130
249
120
71
139
115
88
343
271
129
183
221
351
280
214
84
230
111
219
279
279
83
258
119
224
87
124
162
276
93
109
127
242
170
233
184
167
188
220
276
173
246
252
125
84
175
97
180
89
197
210
75
173
83
162
325
84
302
115
398
235
295
87
189
193
111
265
228
95
162
119
156
299
338
173
134
287
148
303
289
248
138
304
390
253
132
115
78
211
210
128
199
100
103
162
152
123
217
139
174
307
134
82
167
87
115
129
327
235
78
214
93
156
92
115
380
256
142
347
159
100
101
164
107
103
114
358
343
83
228
77
260
174
173
160
88
121
98
111
225
130
109
319
169
215
118
248
319
246
137
103
150
178
93
119
92
275
269
150
114
263
178
201
155
116
290
103
175
194
280
135
269
100
164
205
78
151
225
220
179
114
87
130
121
178
111
83
114
247

In [14]:
# Report the shape of every feature matrix, one per line
for feature_matrix in (lmfcc_train_x, lmfcc_val_x, mspec_val_x,
                       mspec_train_x, lmfcc_test_x, mspec_test_x):
    print(feature_matrix.shape)

(1363305, 91)
(144087, 91)
(144087, 280)
(1363305, 280)
(1527014, 91)
(1527014, 280)


In [15]:
from sklearn.preprocessing import StandardScaler

# One scaler per feature kind; the statistics are estimated on the training frames only
lmfcc_scaler = StandardScaler()
mspec_scaler = StandardScaler()
lmfcc_train_x = lmfcc_scaler.fit_transform(lmfcc_train_x).astype('float32')
mspec_train_x = mspec_scaler.fit_transform(mspec_train_x).astype('float32')

# Validation frames reuse the training statistics (transform only, no refit)
lmfcc_val_x = lmfcc_scaler.transform(lmfcc_val_x).astype('float32')
mspec_val_x = mspec_scaler.transform(mspec_val_x).astype('float32')

# Test frames are normalised with the same training statistics
lmfcc_test_x = lmfcc_scaler.transform(lmfcc_test_x).astype('float32')
mspec_test_x = mspec_scaler.transform(mspec_test_x).astype('float32')

In [16]:
import torch
import torch.nn.functional as F

# Number of output classes: one class per state of every phone model.
# Derived from nstates so the value does not depend on whatever the
# stateList variable currently holds.
output_dim = sum(nstates.values())

# F.one_hot only accepts int64 index tensors, so the dtype is forced explicitly
# (assumes the targets are integer state indices — TODO confirm against forcedAlignment).
train_y = F.one_hot(torch.as_tensor(train_y, dtype=torch.long), num_classes=output_dim)
val_y = F.one_hot(torch.as_tensor(val_y, dtype=torch.long), num_classes=output_dim)
test_y = F.one_hot(torch.as_tensor(test_y, dtype=torch.long), num_classes=output_dim)

# Possible questions

### What is the influence of feature kind and size of input context window?

### What is the purpose of normalising (standardising) the input feature vectors depending on the activation functions in the network?

### What is the influence of the number of units per layer and the number of layers?

### What is the influence of the activation function? (When you try activation functions other than ReLU, you do not need to reach convergence if you do not have enough time.)

### What is the influence of the learning rate/learning rate strategy?

### How stable are the posteriorgrams from the network in time?

### How do the errors distribute depending on phonetic class?
