In [1]:
from config import *
from data import *
from hmm import *
from evaluation import *
from encoder import *

In [2]:
# Audio front-end and train/test split settings. The same values are forwarded
# to prepare_hmm_dataset and prepare_nn_datasets further down.
data_config = DataConfig(
    # where the recordings live and how much is held out
    folder_path="audio",
    test_size=0.25,
    # framing / feature-extraction parameters
    sr=8000,
    window_ms=10,
    overlap_pct=0.25,
    mel_banks=20,
    n_mfcc=12,
)

# Hidden-state count used for every per-word HMM trained on the audio data.
hmm_config = HMMConfig(n_states=2)

# Encoder hyperparameters. input_dim follows the MFCC count configured above;
# output_dim stays None here and the Encoder is later built with
# len(label_map) instead.
nn_config = NNConfig(
    # architecture
    input_dim=data_config.n_mfcc,
    output_dim=None,
    encoding_dim=64,
    n_heads=4,
    dropout=0.2,
    # optimisation
    learning_rate=0.001,
    batch_size=32,
    num_epochs=50,
)

# Simulated Data

In [3]:
# Synthetic data that both HMM variants are fitted on in the next two cells.
# NOTE(review): no RNG seed is set in any visible cell; if
# generate_HMM_samples draws random data, re-runs will not reproduce — confirm.
hmm_samples = generate_HMM_samples()

In [4]:
# Fit a GMM-HMM on the synthetic samples.
# The model is bound to a descriptive name instead of `self`: `self` is, by
# convention, the instance parameter inside methods, and reusing it at the top
# level let the next cell silently overwrite this model.
# n_dims=3 presumably matches the dimensionality produced by
# generate_HMM_samples() — TODO confirm.
sim_gmm_hmm = BatchStableGMMHMM(n_states=2, n_dims=3)
for _ in range(25):  # 25 EM iterations, the same count used for the audio models
    sim_gmm_hmm.em_step(hmm_samples)

Initializing mu...
Initializing covs...


In [5]:
# Fit a DNN-HMM on the same synthetic samples.
# The model is bound to its own descriptive name instead of `self`, so it no
# longer replaces the GMM-HMM created in the previous cell and both remain
# available for inspection.
# n_dims=3 presumably matches the dimensionality produced by
# generate_HMM_samples() — TODO confirm.
sim_dnn_hmm = DNNHMM(n_states=2, n_dims=3)
for _ in range(25):  # 25 EM iterations, the same count used for the audio models
    sim_dnn_hmm.em_step(hmm_samples)

Initializing nn...


# HMM Audio Data

In [6]:
# Forward every feature-extraction / split setting from data_config unchanged,
# keyed by the keyword name prepare_hmm_dataset expects.
hmm_dataset_kwargs = {
    field: getattr(data_config, field)
    for field in (
        "folder_path",
        "sr",
        "window_ms",
        "overlap_pct",
        "mel_banks",
        "n_mfcc",
        "test_size",
    )
}
train_hmm, test_hmm = prepare_hmm_dataset(**hmm_dataset_kwargs)

# One entry per word; these keys drive the per-word model loops below.
spoken_words = [*train_hmm.keys()]

Words spoken: ['apple', 'banana', 'kiwi', 'lime', 'orange', 'peach', 'pineapple']


In [7]:
# Train one GMM-HMM per word on that word's training data.
gmm_hmm_models = {}
for word in spoken_words:
    word_model = BatchStableGMMHMM(
        n_dims=data_config.n_mfcc, n_states=hmm_config.n_states
    )
    gmm_hmm_models[word] = word_model
    # First element of the per-word entry is what em_step consumes here.
    word_observations = train_hmm[word][0]
    for _ in range(25):
        word_model.em_step(word_observations)

Initializing mu...
Initializing covs...
Initializing mu...
Initializing covs...
Initializing mu...
Initializing covs...
Initializing mu...
Initializing covs...
Initializing mu...
Initializing covs...
Initializing mu...
Initializing covs...
Initializing mu...
Initializing covs...


In [8]:
# Evaluate the per-word GMM-HMMs on the test split. The recorded output is a
# pair of values; what each element measures is defined outside this notebook
# — TODO confirm and label them here.
calculate_hmm_accuracy(test_hmm, gmm_hmm_models)

(1.0, 1.0)

In [9]:
# Train one DNN-HMM per word on that word's training data.
# NOTE(review): em_step receives two arguments here but only one in the
# simulated-data cell, and the recorded accuracy of these models in the next
# cell starts with 0.1428..., i.e. 1/7 for the 7 words — worth checking that
# this call matches what DNNHMM.em_step expects.
dnn_hmm_models = {}
for word, word_data in train_hmm.items():
    word_model = DNNHMM(
        n_dims=data_config.n_mfcc, n_states=hmm_config.n_states
    )
    dnn_hmm_models[word] = word_model
    first_part, second_part = word_data[0], word_data[1]
    for _ in range(25):
        word_model.em_step(first_part, second_part)

Initializing nn...
Initializing nn...
Initializing nn...
Initializing nn...
Initializing nn...
Initializing nn...
Initializing nn...


In [10]:
# Evaluate the per-word DNN-HMMs on the test split (same call as for the
# GMM-HMMs, which recorded (1.0, 1.0)).
# NOTE(review): the first recorded value, 0.142857..., equals 1/7 and there
# are 7 words, so this looks like chance-level performance — investigate the
# DNN-HMM training before relying on this number.
calculate_hmm_accuracy(test_hmm, dnn_hmm_models)

(0.14285714285714285, 0.4642857142857143)

# NN Audio Data

In [11]:
# Build the NN datasets from exactly the same data_config settings that were
# used for the HMM datasets.
nn_dataset_kwargs = {
    field: getattr(data_config, field)
    for field in (
        "folder_path",
        "sr",
        "window_ms",
        "overlap_pct",
        "mel_banks",
        "n_mfcc",
        "test_size",
    )
}
(
    train_dataset,
    test_dataset,
    label_map,
    reverse_label_map,
) = prepare_nn_datasets(**nn_dataset_kwargs)

Words spoken: ['apple', 'banana', 'kiwi', 'lime', 'orange', 'peach', 'pineapple']


In [12]:
# The output size is taken from the label map rather than from
# nn_config.output_dim, which the config cell leaves as None.
n_classes = len(label_map)
nn_model = Encoder(
    input_dim=nn_config.input_dim,
    output_dim=n_classes,
    encoding_dim=nn_config.encoding_dim,
    n_heads=nn_config.n_heads,
    dropout=nn_config.dropout,
)

In [13]:
# Optimisation settings all come from nn_config; the per-epoch loss lines in
# the output are printed during this call.
train_hyperparams = dict(
    num_epochs=nn_config.num_epochs,
    learning_rate=nn_config.learning_rate,
    batch_size=nn_config.batch_size,
)
train_model(nn_model, train_dataset, **train_hyperparams)

Epoch: 1, Loss: 2.0135
Epoch: 2, Loss: 1.6994
Epoch: 3, Loss: 1.6494
Epoch: 4, Loss: 1.6056
Epoch: 5, Loss: 1.5653
Epoch: 6, Loss: 1.5410
Epoch: 7, Loss: 1.5067
Epoch: 8, Loss: 1.4838
Epoch: 9, Loss: 1.4661
Epoch: 10, Loss: 1.4416
Epoch: 11, Loss: 1.4260
Epoch: 12, Loss: 1.4168
Epoch: 13, Loss: 1.4104
Epoch: 14, Loss: 1.3976
Epoch: 15, Loss: 1.3860
Epoch: 16, Loss: 1.3729
Epoch: 17, Loss: 1.3653
Epoch: 18, Loss: 1.3533
Epoch: 19, Loss: 1.3422
Epoch: 20, Loss: 1.3306
Epoch: 21, Loss: 1.3200
Epoch: 22, Loss: 1.3086
Epoch: 23, Loss: 1.3007
Epoch: 24, Loss: 1.2868
Epoch: 25, Loss: 1.2760
Epoch: 26, Loss: 1.2681
Epoch: 27, Loss: 1.2558
Epoch: 28, Loss: 1.2475
Epoch: 29, Loss: 1.2506
Epoch: 30, Loss: 1.2301
Epoch: 31, Loss: 1.2161
Epoch: 32, Loss: 1.2070
Epoch: 33, Loss: 1.1973
Epoch: 34, Loss: 1.1889
Epoch: 35, Loss: 1.1749
Epoch: 36, Loss: 1.1675
Epoch: 37, Loss: 1.1549
Epoch: 38, Loss: 1.1452
Epoch: 39, Loss: 1.1347
Epoch: 40, Loss: 1.1253
Epoch: 41, Loss: 1.1101
Epoch: 42, Loss: 1.0838
E

In [14]:
# Evaluate the trained encoder on the test dataset. The recorded output is a
# pair of values; what each element measures is defined outside this notebook
# — TODO confirm and label them here.
compute_nn_accuracies(nn_model, test_dataset)

(0.9629629850387573, 1.0)

# Plot

In [15]:
# NOTE(review): this cell is entirely commented out. It needs a waveform `y`
# and sample rate `sr`, neither of which is assigned in any visible cell, plus
# `np` and `plt`, which are not explicitly imported (they may arrive through
# the star imports — confirm). Define those before re-enabling, or delete the
# cell.

# # Create a time array for the x-axis
# t = np.arange(len(y)) / sr

# # Create a plot
# plt.figure(figsize=(14, 5))
# plt.plot(t, y)
# plt.title('Time-Amplitude plot')
# plt.xlabel('Time (s)')
# plt.ylabel('Amplitude')
# plt.show()

In [16]:
# NOTE(review): this cell is entirely commented out. It needs a waveform `y`
# and sample rate `sr`, neither of which is assigned in any visible cell, plus
# `stft`, `librosa` and `plt`, which are not explicitly imported (they may
# arrive through the star imports — confirm). Define those before re-enabling,
# or delete the cell.

# # STFT calculation
# stft_result = stft(y)

# # Time-Frequency plot
# plt.figure(figsize=(14, 5))
# librosa.display.specshow(librosa.amplitude_to_db(stft_result.T), sr=sr, x_axis='time', y_axis='log')
# plt.colorbar(format='%+2.0f dB')
# plt.title('Time-Frequency plot')
# plt.show()

In [17]:
# NOTE(review): this cell is entirely commented out. It needs a waveform `y`
# and sample rate `sr`, neither of which is assigned in any visible cell, plus
# `get_mfcc` and `plt`, which are not explicitly imported (they may arrive
# through the star imports — confirm). Define those before re-enabling, or
# delete the cell.

# # Generate MFCCs
# mfccs = get_mfcc(y, sr)

# # Plot the MFCCs
# plt.figure(figsize=(10, 4))
# plt.imshow(mfccs.T, origin='lower', aspect='auto', cmap='viridis')
# plt.title('MFCC')
# plt.ylabel('MFCC Coefficients')
# plt.xlabel('Frame')
# plt.colorbar()
# plt.tight_layout()
# plt.show()