In [1]:
import sys
import os
# Make the lab1/lab2 helper modules importable without a machine-specific absolute path.
# Assumes the notebook is run from a directory that sits next to lab1/ and lab2/ -- the same
# layout the relative '../lab2/lab2_models_all.npz' path used further down relies on.
# abspath() pins the entries so a later os.chdir() does not invalidate them.
sys.path.append(os.path.abspath(os.path.join('..', 'lab1')))
sys.path.append(os.path.abspath(os.path.join('..', 'lab2')))
import math
import warnings
import numpy as np
from matplotlib import pyplot as plt
import random
import time


from lab3_tools import *
from lab3_proto import *

In [2]:
# Load the phonetic models trained in lab 2: a dictionary with 21 entries, one HMM per phone
phoneHMMs = np.load('../lab2/lab2_models_all.npz', allow_pickle=True)['phoneHMMs'].item()
phones = sorted(phoneHMMs)
# State count of every phone model, taken from the first dimension of its 'means' array
nstates = dict((phone, phoneHMMs[phone]['means'].shape[0]) for phone in phones)
# Reference list of unique state names ("<phone>_<index>"), ordered by phone, then by state index
stateList = ['{}_{}'.format(ph, idx) for ph in phones for idx in range(nstates[ph])]
stateList[39]

'sil_0'

# Forced Alignment

In [3]:
# Read the audio and compute liftered MFCC features
from lab1_proto import mfcc

# Example utterance used for the forced-alignment walkthrough in the following cells
filename = 'tidigits/disc_4.1.1/tidigits/train/man/nw/z43a.wav'
# samplingrate is returned by loadAudio but is not passed on to mfcc below
samples, samplingrate = loadAudio(filename)
# presumably one row of liftered MFCC coefficients per frame -- confirm against lab1_proto.mfcc
lmfcc = mfcc(samples)

In [4]:
# Recover the sequence of digits (word level transcription) in the file
# Element 2 of path2info's result is split into single characters, one entry per spoken word
wordTrans = list(path2info(filename)[2])
wordTrans

['z', '4', '3']

In [5]:
from prondict import prondict
# Turn the word-level transcription into a phone-level one using the pronunciation dictionary
# (for this utterance the result starts and ends with 'sil' and has 'sp' after every word)
phoneTrans = words2phones(wordTrans, prondict)
phoneTrans

['sil',
 'z',
 'iy',
 'r',
 'ow',
 'sp',
 'f',
 'ao',
 'r',
 'sp',
 'th',
 'r',
 'iy',
 'sp',
 'sil']

In [6]:
from lab2_proto import concatHMMs

# Create a combined model for this specific utterance:
# presumably the phone models are chained in the order given by phoneTrans -- see lab2_proto.concatHMMs
utteranceHMM = concatHMMs(phoneHMMs, phoneTrans)

In [7]:
# Expand the phone-level transcription into per-state names ("<phone>_<index>") for this utterance
stateTrans = ['{}_{}'.format(phone, stateid)
              for phone in phoneTrans
              for stateid in range(nstates[phone])]

In [8]:
from lab2_tools import log_multivariate_normal_density_diag
from lab2_proto import viterbi

# NxM array of emission(observation) log likelihoods, N frames, M states
obsloglik = log_multivariate_normal_density_diag(lmfcc, utteranceHMM['means'], utteranceHMM['covars'])
# The last startprob entry and the last transmat row/column are dropped before decoding
# (presumably the non-emitting exit state -- confirm against concatHMMs).
# Any zero probability maps to log(0) = -inf, which is the intended value for an impossible
# start/transition, so the divide-by-zero RuntimeWarning is silenced for these two calls only.
with np.errstate(divide='ignore'):
    log_startprob = np.log(utteranceHMM['startprob'][:-1])
    log_transmat = np.log(utteranceHMM['transmat'][:-1, :-1])
vloglik, vpath = viterbi(obsloglik, log_startprob, log_transmat)

# Per-frame state names along the best path. Stored under its own name so that the global
# `stateList` (the reference list of unique states built when the models were loaded) is not
# overwritten; re-using that name would break later name -> index lookups and cell re-runs.
viterbiStateTrans = [stateTrans[i] for i in vpath]
viterbiStateTrans

['sil_0',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_1',
 'sil_2',
 'z_0',
 'z_0',
 'z_0',
 'z_0',
 'z_1',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'z_2',
 'iy_0',
 'iy_0',
 'iy_0',
 'iy_0',
 'iy_0',
 'iy_0',
 'iy_0',
 'iy_0',
 'iy_1',
 'iy_2',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_0',
 'r_1',
 'r_2',
 'ow_0',
 'ow_1',
 'ow_2',
 'ow_2',
 'ow_2',
 'ow_2',
 'ow_2',
 'ow_2',
 'ow_2',
 'ow_2',
 'ow_2',
 'f_0',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_1',
 'f_2',
 'ao_0',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_1',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'ao_2',
 'r_0',
 'r_0',
 'r_0',
 'r_1',
 'r_2',
 'th_0',


# Feature Extraction

In [9]:
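# NOTE(review): the disabled loops below pass the notebook-level `phoneTrans` (computed earlier
# for the single example file z43a.wav) to forcedAlignment for every file. Before re-enabling,
# derive wordTrans/phoneTrans from each file's own name inside the loop.
# from lab1_proto import mspec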

# traindata = []
# for root, dirs, files in os.walk('tidigits/disc_4.1.1/tidigits/train'):
#     for file in files:
#         if file.endswith('.wav'):
#             filename = os.path.join(root, file)
#             samples, samplingrate = loadAudio(filename)
#             lmfcc = mfcc(samples)
#             mspecs = mspec(samples)
#             targets = forcedAlignment(lmfcc, phoneHMMs, phoneTrans)
#             traindata.append({'filename': filename, 'lmfcc': lmfcc, 'mspec': mspecs, 'targets': targets})
# np.savez('traindata.npz', traindata=traindata)

# testdata = []
# for root, dirs, files in os.walk('tidigits/disc_4.2.1/tidigits/test'):
#     for file in files:
#         if file.endswith('.wav'):
#             filename = os.path.join(root, file)
#             samples, samplingrate = loadAudio(filename)
#             lmfcc = mfcc(samples)
#             mspecs = mspec(samples)
#             targets = forcedAlignment(lmfcc, phoneHMMs, phoneTrans)
#             testdata.append({'filename': filename, 'lmfcc': lmfcc, 'mspec': mspecs, 'targets': targets})
# np.savez('testdata.npz', testdata=testdata)

# Training and Validation Sets

In [10]:
# Load the pre-computed training utterances and fix the seed so the split is repeatable
train_data = np.load('traindata.npz', allow_pickle=True)['traindata']
random.seed(420)

# Keep only the path2info tuple of every utterance; element 0 is used as the gender key
# and element 1 as the speaker key below
data_tuple_list = [path2info(entry['filename']) for entry in train_data]

from collections import defaultdict
# Bucket the utterance tuples by gender
gender_data = defaultdict(list)
for info in data_tuple_list:
    gender_data[info[0]].append(info)

train_set, valid_set = [], []

# Within each gender, assign whole speakers to one split so that no speaker appears in both
for gender in gender_data:
    # Bucket this gender's utterances by speaker
    speaker_data = defaultdict(list)
    for info in gender_data[gender]:
        speaker_data[info[1]].append(info)

    # Random speaker order: the first ~90 % of speakers go to training, the rest to validation
    speakers = list(speaker_data)
    random.shuffle(speakers)
    num_train_speakers = int(round(0.9 * len(speakers)))

    for position, spkr in enumerate(speakers):
        destination = train_set if position < num_train_speakers else valid_set
        destination.extend(speaker_data[spkr])

# Finally randomise the utterance order inside each split
random.shuffle(train_set)
random.shuffle(valid_set)

In [11]:
# Peek at the first three entries of the training split
print(train_set[:3])

[('man', 'kd', '8', 'b'), ('man', 'gr', '5o25676', 'a'), ('man', 'bd', '6', 'b')]
