In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import librosa
from IPython.display import Audio
from batch_hmm import *
from dnn_hmm import *
from mfcc import *
from hmm import *

In [None]:
import numpy as np
from hmmlearn import hmm

# Number of hidden states
n_components = 2

# Dimensionality of the observations
n_features = 3

# Size of the synthetic data set: X ends up (n_sequences, sequence_length, n_features)
n_sequences = 100
sequence_length = 100

# One seeded generator shared by every sample() call, so a fresh kernel
# regenerates the same data. It is passed per call rather than as an int
# `random_state` on the model: an int seed is re-applied on each sample() call,
# which would make all sequences identical.
sampling_rng = np.random.RandomState(0)

# Create a Gaussian HMM
model = hmm.GaussianHMM(
    n_components=n_components,
    covariance_type="diag"
)

# Manually specify the model parameters (always start in state 0)
model.startprob_ = np.array([1, 0])

model.transmat_ = np.array([
    [0.7, 0.3],
    [0.4, 0.6],
])

# Means of each hidden state
model.means_ = np.array([
    [0.0, 0.0, 0.0],
    [5.0, 5.0, 5.0]
])

# Covariances of each hidden state (unit variance on every dimension)
model.covars_ = np.ones((n_components, n_features))

# Generate samples; sample() returns (observations, states) and only the
# observations are kept
X = np.stack([
    model.sample(sequence_length, random_state=sampling_rng)[0]
    for _ in range(n_sequences)
])

print("Shape of X:", X.shape)

In [None]:
# The instance is bound to the name `self` because the following cells call
# `self.em_step`, `self.mu` and `self.A` on it.
self = BatchStableGMMHMM(n_states = 2, n_dims = 3)
# Initial guess for `mu` (presumably the per-state means; confirm in batch_hmm),
# deliberately offset from the generating means [0, 0, 0] and [5, 5, 5].
# Float dtype so an in-place update of `mu` can never be truncated to integers.
self.mu = np.array([[1, 2, 0], [6, 8, 10]], dtype=float)

In [None]:
# Run a fixed number of EM updates on the synthetic sequences in X.
em_iterations = 25
for _ in range(em_iterations):
    self.em_step(X)

In [None]:
# Display the model's `mu` after the EM steps (presumably the per-state means;
# confirm in batch_hmm). The data was generated with means [0, 0, 0] and [5, 5, 5].
self.mu

In [None]:
# Display the model's `A` after the EM steps (presumably the state-transition
# matrix; confirm in batch_hmm). The data was generated with
# transmat_ = [[0.7, 0.3], [0.4, 0.6]].
self.A

In [None]:
# --- Audio ---
# Presumably the sample rate in Hz; not referenced by any visible cell.
SR = 8_000

# --- Feature / model sizes ---
# Feature size: Encoder `input_dim` and second argument of BatchStableGMMHMM.
N_DIMENSIONS = 12
# Encoder `encoding_dim`.
N_ENCODING = 64
# First argument of BatchStableGMMHMM for the per-word models.
N_STATES = 5

In [52]:
# Classifier with one output per entry in `spoken`.
model = Encoder(
    input_dim=N_DIMENSIONS,
    output_dim=len(spoken),
    encoding_dim=N_ENCODING,
    n_heads=4,
    dropout=0.2,
)

In [57]:
# create dataloaders for train and test set
# Only the training loader shuffles; the evaluation loader keeps a fixed order
# so that evaluation batches are the same on every run.
train_data = DataLoader(WordDataset(x_train, y_train, mask_train), batch_size=32, shuffle=True)
test_data = DataLoader(WordDataset(x_test, y_test, mask_test), batch_size=32, shuffle=False)

In [58]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Put the model back into training mode. The evaluation cell calls model.eval(),
# so re-running this cell afterwards would otherwise train with training-only
# behaviour (the model is built with dropout=0.2) switched off.
model.train()

for epoch in range(50):  # loop over the dataset multiple times
    for x, y, mask in train_data:
        optimizer.zero_grad()  # zero the parameter gradients

        outputs = model(x, mask)  # forward pass, get the output of the network

        loss = criterion(outputs, y)  # calculate the loss
        loss.backward()  # backward pass, compute gradient of the loss with respect to model parameters
        optimizer.step()  # update model parameters

In [60]:
def top1_accuracy(output, target):
    """Return the fraction of rows whose highest-scoring column equals the target.

    `output` is reduced with argmax over dim 1; `target` is reshaped to match
    the predictions before comparison. The result is a 0-dim float tensor.
    """
    with torch.no_grad():
        predicted = output.argmax(dim=1)
        hits = (predicted == target.view_as(predicted)).float()
        return hits.sum() / len(target)


def topk_accuracy(output, target, k=3):
    """Return the fraction of rows whose target is among the k highest scores.

    The indices of the k largest entries along dim 1 are compared against the
    target of each row. The result is a 0-dim float tensor.
    """
    with torch.no_grad():
        _, top_indices = torch.topk(output, k, dim=1)
        # (N, 1) targets broadcast against the (N, k) index matrix.
        hits = top_indices == target.view(-1, 1)
        return hits.float().sum() / len(target)


# Evaluate `model` on the test loader.
# Switch to evaluation mode first (the model is built with dropout=0.2).
model.eval()

top1_acc = 0
top3_acc = 0
total_samples = 0

with torch.no_grad():
    for inputs, targets, masks in test_data:
        # Pass the mask exactly as the training loop does (`model(x, mask)`);
        # calling the model without it evaluates under different inputs than
        # it was trained on.
        outputs = model(inputs, masks)

        # The helpers return per-batch fractions; weight them by batch size so
        # a smaller final batch does not skew the overall average.
        top1_acc += top1_accuracy(outputs, targets) * len(targets)
        top3_acc += topk_accuracy(outputs, targets) * len(targets)
        total_samples += len(targets)

# Calculate the average accuracies
top1_acc = top1_acc / total_samples
top3_acc = top3_acc / total_samples

print("Top 1 Accuracy: ", top1_acc.item())
print("Top 3 Accuracy: ", top3_acc.item())

Top 1 Accuracy:  1.0
Top 3 Accuracy:  1.0


In [None]:
# Build one BatchStableGMMHMM per word and run a single EM step on it.
# `data[word][0]` is presumably that word's feature batch; confirm against the
# cell that builds `data`.
dnn_models = {}
for word in spoken:
    word_model = dnn_models[word] = BatchStableGMMHMM(N_STATES, N_DIMENSIONS)
    word_model.em_step(data[word][0])

In [None]:
# gmm_models = {}
# for word in spoken:
#     gmm_models[word] = BatchStableGMMHMM(N_STATES, N_DIMENSIONS)
#     for _ in range(50):
#         gmm_models[word].em_step(data[word])

In [None]:
def validate_model(models, dataset=None):
    """Print top-1 and top-3 classification accuracy of per-label models.

    Each sample is scored by every model's ``log_likelihood``. Labels are
    ranked from highest to lowest score and compared with the sample's label.

    Args:
        models: Mapping from label to an object exposing ``log_likelihood(obs)``.
        dataset: Indexable collection of samples, each providing ``'label'``
            and ``'mfcc'`` entries. Defaults to the notebook-level ``raw_data``
            so existing ``validate_model(models)`` calls keep working.

    Raises:
        ValueError: If the dataset is empty, since no accuracy can be computed.

    Returns:
        None. The two accuracies are printed as percentages.
    """
    if dataset is None:
        dataset = raw_data

    total_count = len(dataset)
    if total_count == 0:
        raise ValueError("validate_model needs at least one sample to compute accuracy")

    # Counters for samples whose true label is ranked first / within the top three
    top1_count = 0
    top3_count = 0

    for index in range(total_count):
        sample = dataset[index]
        true_label = sample['label']
        # Add a leading axis of size 1 before scoring
        obs = sample['mfcc'][np.newaxis, :]
        scores = {label: model.log_likelihood(obs) for label, model in models.items()}

        # Labels ordered from highest to lowest log-likelihood
        ranked_labels = sorted(scores, key=scores.get, reverse=True)

        # Slicing keeps an empty `models` mapping from raising: it scores as a miss
        if true_label in ranked_labels[:1]:
            top1_count += 1
        if true_label in ranked_labels[:3]:
            top3_count += 1

    # Calculate percentages
    top1_percentage = (top1_count / total_count) * 100
    top3_percentage = (top3_count / total_count) * 100

    print("Top 1 prediction accuracy: ", top1_percentage, "%")
    print("Top 3 prediction accuracy: ", top3_percentage, "%")


In [None]:
# Report top-1 / top-3 accuracy of the per-word models.
validate_model(models=dnn_models)

In [None]:
# # Create a time array for the x-axis
# t = np.arange(len(y)) / sr

# # Create a plot
# plt.figure(figsize=(14, 5))
# plt.plot(t, y)
# plt.title('Time-Amplitude plot')
# plt.xlabel('Time (s)')
# plt.ylabel('Amplitude')
# plt.show()

In [None]:
# # STFT calculation
# stft_result = stft(y)

# # Time-Frequency plot
# plt.figure(figsize=(14, 5))
# librosa.display.specshow(librosa.amplitude_to_db(stft_result.T), sr=sr, x_axis='time', y_axis='log')
# plt.colorbar(format='%+2.0f dB')
# plt.title('Time-Frequency plot')
# plt.show()

In [None]:
# # Generate MFCCs
# mfccs = get_mfcc(y, sr)

# # Plot the MFCCs
# plt.figure(figsize=(10, 4))
# plt.imshow(mfccs.T, origin='lower', aspect='auto', cmap='viridis')
# plt.title('MFCC')
# plt.ylabel('MFCC Coefficients')
# plt.xlabel('Frame')
# plt.colorbar()
# plt.tight_layout()
# plt.show()