In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import librosa
from IPython.display import Audio
from batch_hmm import *
from dnn_hmm import *
from mfcc import *
from hmm import *

In [2]:
import numpy as np
from hmmlearn import hmm

# Fixed seed so that "Restart Kernel -> Run All" regenerates the same synthetic
# sequence; without it every run trains the cells below on different data.
SEED = 0

# Number of hidden states
n_components = 2

# Dimensionality of the observations
n_features = 3

# Length of the sampled sequence
n_samples = 100

# Create a Gaussian HMM with one diagonal covariance per state.
# random_state is stored on the model and used by sample() when no explicit
# random_state is passed to it.
model = hmm.GaussianHMM(
    n_components=n_components,
    covariance_type="diag",
    random_state=SEED,
)

# Manually specify the model parameters instead of fitting them.
# The chain always starts in state 0.
model.startprob_ = np.array([1, 0])

# Row i holds the transition probabilities out of state i (each row sums to 1).
model.transmat_ = np.array([
    [0.7, 0.3],
    [0.4, 0.6],
])

# Means of each hidden state, shape (n_components, n_features)
model.means_ = np.array([
    [0.0, 0.0, 0.0],
    [1.0, 1.0, 1.0]
])

# Diagonal covariances of each hidden state (unit variance in every dimension)
model.covars_ = np.ones((n_components, n_features))

# Generate samples: X holds the observations, Z the hidden state of each step
X, Z = model.sample(n_samples)

# Guard the shapes the rest of the notebook relies on.
assert X.shape == (n_samples, n_features)
assert Z.shape == (n_samples,)

print("Shape of X:", X.shape)
print("Shape of Z:", Z.shape)

Shape of X: (100, 3)
Shape of Z: (100,)


In [18]:
# Build the project's BatchStableGMMHMM with 2 states and 3 observation
# dimensions, i.e. the same sizes as the hmmlearn model that generated X.
# NOTE(review): `self` is an unusual name for a notebook-level variable; it is
# kept because the following cells refer to the model by this name.
self = BatchStableGMMHMM(n_states = 2, n_dims = 3)
# Disabled experiment: manually overriding the model's `mu` attribute.
#self.mu = np.array([[3, 3, 3], [10, 10, 10]])

In [36]:
# Run a single EM update on the sampled observations.
# X has shape (100, 3); X[np.newaxis] prepends a length-1 axis, so em_step
# receives shape (1, 100, 3) -- presumably (batch, time, features); confirm
# against em_step's definition.
for _ in range(1):
    self.em_step(X[np.newaxis])

In [37]:
# Display the model's `mu` attribute after the EM update. The recorded output
# is 2x3 (n_states x n_dims) -- presumably the per-state means; confirm.
self.mu

array([[1.23132303, 0.41501557, 1.2354097 ],
       [0.22346937, 0.31676605, 0.18439523]])

In [38]:
# Display the model's `A` attribute after the EM update. The recorded output is
# a 2x2 matrix whose rows sum to 1 -- presumably the state-transition matrix.
self.A

array([[0.55482701, 0.44517299],
       [0.22494962, 0.77505038]])

In [None]:
# Start over with a fresh model, this time with deterministic_start=True.
# Rebinding `self` discards the model trained in the earlier cells.
# NOTE(review): the meaning of deterministic_start is defined by
# BatchStableGMMHMM, which is not visible in this notebook -- see its definition.
self = BatchStableGMMHMM(n_states = 2, n_dims = 3, deterministic_start = True)

In [None]:
# Run 50 EM updates on the sampled observations.
# X[np.newaxis] prepends a length-1 axis so the single (100, 3) sequence is
# passed as shape (1, 100, 3) -- presumably a batch containing one sequence.
for _ in range(50):
    self.em_step(X[np.newaxis])

In [None]:
# Display `mu` of the deterministic_start model after the 50 EM updates
# (presumably the per-state means; confirm against BatchStableGMMHMM).
self.mu

In [None]:
# Display `A` with every entry rounded to 0 decimals (np.around's default), which
# reduces each entry to 0 or 1 so the dominant entries stand out.
np.around(self.A)

In [None]:
# Discover the dataset, laid out as audio/<word>/<recording>.
#   fpaths[i] -- path of the i-th recording
#   labels[i] -- word of the i-th recording (name of its parent directory)
#   spoken    -- every word that has at least one recording, listed once
AUDIO_DIR = 'audio'

fpaths = []
labels = []
spoken = []
# sorted(): os.listdir returns entries in arbitrary order, which would make the
# sample order (and everything derived from it) differ between machines and runs.
for f in sorted(os.listdir(AUDIO_DIR)):
    word_dir = os.path.join(AUDIO_DIR, f)
    # Skip stray files sitting next to the word directories (e.g. .DS_Store);
    # calling os.listdir on them would raise NotADirectoryError.
    if not os.path.isdir(word_dir):
        continue
    for w in sorted(os.listdir(word_dir)):
        fpaths.append(os.path.join(word_dir, w))
        labels.append(f)
        # Register the word the first time one of its recordings is seen, so
        # empty word directories never end up in `spoken`.
        if f not in spoken:
            spoken.append(f)
print('Words spoken:', spoken)

In [None]:
# Sample rate passed to librosa.load when reading each recording.
SR = 8000
# First positional argument given to BatchStableGMMHMM for the per-word models
# (presumably n_states -- confirm against the constructor signature).
N_STATES = 5
# Number of MFCC coefficients requested from get_mfcc (n_mfcc); also the second
# positional argument given to BatchStableGMMHMM.
N_DIMENSIONS = 12
# NOTE(review): not referenced in any visible cell of this notebook -- confirm
# whether it is still needed.
N_ENCODING = 64

In [None]:
def get_path_mfcc(path):
    """Load the audio file at ``path`` and return its MFCC features.

    The file is read with ``librosa.load`` using the notebook-wide sample rate
    ``SR``; the resulting signal is handed to ``get_mfcc`` together with the
    sample rate librosa reports, requesting ``N_DIMENSIONS`` coefficients.
    """
    waveform, sample_rate = librosa.load(path, sr=SR)
    features = get_mfcc(waveform, sample_rate, n_mfcc=N_DIMENSIONS)
    return features

In [None]:
# Extract MFCC features once per recording, keeping each recording's label
# next to its features.
raw_data = [
    {"label": label, "mfcc": get_path_mfcc(path)}
    for path, label in zip(fpaths, labels)
]

# Group the features by word and let pad_and_stack combine each word's
# recordings (in raw_data order) into one entry of `data`.
data = {}
for word in spoken:
    data[word] = pad_and_stack(
        [sample["mfcc"] for sample in raw_data if sample["label"] == word]
    )

In [None]:
# Fit one BatchStableGMMHMM per spoken word.
# NOTE(review): despite its name, dnn_models holds BatchStableGMMHMM instances;
# the name is kept because the validation cell refers to it.
dnn_models = {}
for word in spoken:
    word_hmm = BatchStableGMMHMM(N_STATES, N_DIMENSIONS)
    dnn_models[word] = word_hmm
    # Take the first element of the pad_and_stack result and swap its last two
    # axes before training.
    # NOTE(review): validate_model feeds the per-recording features without
    # this axis swap -- confirm both layouts match what the model expects.
    word_batch = data[word][0].transpose(0, 2, 1)
    for _ in range(1):
        word_hmm.em_step(word_batch)

In [None]:
# gmm_models = {}
# for word in spoken:
#     gmm_models[word] = BatchStableGMMHMM(N_STATES, N_DIMENSIONS)
#     for _ in range(50):
#         gmm_models[word].em_step(data[word])

In [None]:
def validate_model(models, samples=None):
    """Print the top-1 and top-3 classification accuracy of per-label models.

    Every sample is scored by every model through ``log_likelihood``; the
    labels are then ranked by that score (highest first) and compared with the
    sample's true label.

    Parameters
    ----------
    models : dict
        Maps a label to an object exposing ``log_likelihood(obs)``.
    samples : sequence of dict, optional
        Items with a ``'label'`` entry and an ``'mfcc'`` entry. When omitted,
        the notebook-level ``raw_data`` is used, as before.

    Raises
    ------
    ValueError
        If there are no samples, because the accuracies would be undefined
        (previously this surfaced as a ZeroDivisionError).
    """
    if samples is None:
        samples = raw_data

    # Total number of items
    total_count = len(samples)
    if total_count == 0:
        raise ValueError("validate_model needs at least one sample to score")

    # Counters for top 1 and top 3 predictions
    top1_count = 0
    top3_count = 0

    for sample in samples:
        true_label = sample['label']
        # Prepend a length-1 axis so a single recording is scored as a batch of one.
        obs = sample['mfcc'][np.newaxis, :]
        scores = {label: model.log_likelihood(obs) for label, model in models.items()}

        # Labels ordered from highest to lowest log-likelihood; ties keep the
        # iteration order of `models`.
        ranked = sorted(scores, key=scores.get, reverse=True)

        # Exact hit: the best-scoring model belongs to the true label.
        if true_label == ranked[0]:
            top1_count += 1

        # Near hit: the true label is among the three best-scoring models.
        if true_label in ranked[:3]:
            top3_count += 1

    # Calculate percentages
    top1_percentage = (top1_count / total_count) * 100
    top3_percentage = (top3_count / total_count) * 100

    print("Top 1 prediction accuracy: ", top1_percentage, "%")
    print("Top 3 prediction accuracy: ", top3_percentage, "%")


In [None]:
# Score every recording in raw_data against all per-word models and print the
# top-1 / top-3 accuracies.
# NOTE(review): the models were fit on data built from this same raw_data, so
# this is training-set accuracy rather than a held-out estimate.
validate_model(dnn_models)

In [None]:
# # Create a time array for the x-axis
# t = np.arange(len(y)) / sr

# # Create a plot
# plt.figure(figsize=(14, 5))
# plt.plot(t, y)
# plt.title('Time-Amplitude plot')
# plt.xlabel('Time (s)')
# plt.ylabel('Amplitude')
# plt.show()

In [None]:
# # STFT calculation
# stft_result = stft(y)

# # Time-Frequency plot
# plt.figure(figsize=(14, 5))
# librosa.display.specshow(librosa.amplitude_to_db(stft_result.T), sr=sr, x_axis='time', y_axis='log')
# plt.colorbar(format='%+2.0f dB')
# plt.title('Time-Frequency plot')
# plt.show()

In [None]:
# # Generate MFCCs
# mfccs = get_mfcc(y, sr)

# # Plot the MFCCs
# plt.figure(figsize=(10, 4))
# plt.imshow(mfccs.T, origin='lower', aspect='auto', cmap='viridis')
# plt.title('MFCC')
# plt.ylabel('MFCC Coefficients')
# plt.xlabel('Frame')
# plt.colorbar()
# plt.tight_layout()
# plt.show()