# GMM-based Classification and Visualization on Dataset2a_Speech

This notebook demonstrates Gaussian Mixture Model (GMM) based classification and visualization for the Dataset2a_Speech dataset. The workflow includes data loading, clustering initialization, GMM fitting, classification, metrics calculation, and visualization.

## 1. Import Libraries and Utility Functions

Import essential libraries and define utility functions for saving/loading `.npy` and `.json` files, and for directory creation.

In [None]:
import numpy as np
import os
import json
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse

def save_npy(obj, path):
    """Write ``obj`` to ``path`` using NumPy's binary ``.npy`` format.

    This is a thin wrapper around ``np.save``; ``obj`` is whatever
    ``np.save`` accepts as its array argument.
    """
    np.save(file=path, arr=obj)

def load_npy(path):
    """Read back a ``.npy`` file and return whatever ``np.load`` yields."""
    # NOTE: allow_pickle=True lets np.load unpickle arbitrary Python objects,
    # so this must only be pointed at files from a trusted source.
    loaded = np.load(path, allow_pickle=True)
    return loaded

def save_json(obj, path):
    """Serialize ``obj`` as JSON into the file at ``path`` (overwriting it)."""
    with open(path, mode='w') as handle:
        json.dump(obj, fp=handle)

def load_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is decoded explicitly as UTF-8 instead of with the platform's
    locale encoding, so files containing non-ASCII text are read the same
    way on every operating system.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def ensure_dir(path):
    """Create directory ``path`` (and any missing parents) if it is absent.

    Calling this on an existing directory is a no-op.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first, and surfaces a non-directory at ``path``
    # instead of silently skipping creation.
    os.makedirs(path, exist_ok=True)

## 2. Data Loading

Load training and testing data for each class from text files using the `load_txt_data` function.

In [None]:
def load_txt_data(base_path, class_names):
    """Load per-class training and test matrices from text files.

    For every name ``cls`` in ``class_names`` the files
    ``<base_path>/train/<cls>_train.txt`` and
    ``<base_path>/test/<cls>_test.txt`` are read with ``np.loadtxt``.

    Args:
        base_path: Directory containing the ``train`` and ``test`` folders.
        class_names: Iterable of class name prefixes.

    Returns:
        ``(train, test)``: two lists of 2-D arrays, one entry per class and
        in the same order as ``class_names``.
    """
    train, test = [], []
    for cls in class_names:
        train_file = os.path.join(base_path, 'train', f'{cls}_train.txt')
        test_file = os.path.join(base_path, 'test', f'{cls}_test.txt')
        # ndmin=2 keeps a file with a single row (or a single column) as a
        # 2-D array; without it np.loadtxt returns a 1-D array and callers
        # that unpack ``N, D = X.shape`` fail.
        train.append(np.loadtxt(train_file, ndmin=2))
        test.append(np.loadtxt(test_file, ndmin=2))
    return train, test

# Specify dataset path and class names
# NOTE(review): base_path is an absolute, machine-specific Windows path;
# it has to be changed before this cell can run on another machine.
base_path = r'd:\IITDH\Sem1\SPRL\SPR_Assignment\SPR\Dataset\Group04\rd_group4'
class_names = ['class1', 'class2', 'class3']

# train2a / test2a are lists with one array per class, ordered like class_names.
train2a, test2a = load_txt_data(base_path, class_names)

## 3. K-means Clustering Implementation

Implement the `kmeans` function. Its centroids and cluster assignments are used to initialize the means, covariances, and mixture weights of each GMM before EM is run.

In [None]:
def kmeans(X, K, max_iter=100, tol=1e-4):
    """Cluster the rows of ``X`` into ``K`` groups with Lloyd's algorithm.

    Initial centroids are ``K`` distinct rows of ``X`` drawn with a fixed
    seed (42), so repeated calls on the same data give the same result.

    Args:
        X: Array of shape ``(N, D)``.
        K: Number of clusters, ``1 <= K <= N``.
        max_iter: Maximum number of centroid updates.
        tol: Stop once the Frobenius norm of the centroid change is below
            this value.

    Returns:
        ``(centroids, labels)``: ``centroids`` has shape ``(K, D)`` and
        ``labels`` holds, for each row of ``X``, the index of the nearest
        returned centroid.

    Raises:
        ValueError: if ``X`` is not 2-D or ``K`` is outside ``[1, N]``.
    """
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (N, D)")
    N = X.shape[0]
    if not 1 <= K <= N:
        raise ValueError(f"K must be between 1 and N={N}, got {K}")

    def assign(centers):
        # Index of the closest centre (Euclidean distance) for every sample.
        dists = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        return np.argmin(dists, axis=1)

    # A private RandomState(42) draws exactly what np.random.seed(42)
    # followed by np.random.choice would, without resetting the global
    # NumPy random state for the rest of the program.
    rng = np.random.RandomState(42)
    centroids = X[rng.choice(N, K, replace=False)]

    # Assign before the loop so labels exist even when max_iter == 0.
    labels = assign(centroids)
    for _ in range(max_iter):
        # An empty cluster keeps its previous centroid.
        new_centroids = np.array([
            X[labels == k].mean(axis=0) if np.any(labels == k) else centroids[k]
            for k in range(K)
        ])
        if np.linalg.norm(new_centroids - centroids) < tol:
            break
        centroids = new_centroids
        # Re-assign right after moving the centroids so the returned labels
        # always match the returned centroids, also when max_iter runs out.
        labels = assign(centroids)
    return centroids, labels

## 4. Gaussian Mixture Model (GMM) via EM Algorithm

Implement the `gmm_em` function for fitting GMMs to each class using the EM algorithm, including the `gaussian_pdf` helper.

In [None]:
def gaussian_pdf(X, mean, cov):
    """Evaluate a multivariate normal density at every row of ``X``.

    Args:
        X: Array of shape ``(N, D)``.
        mean: Mean vector of length ``D``.
        cov: Covariance matrix of shape ``(D, D)``.

    Returns:
        1-D array of length ``N`` with the density value of each row.

    Raises:
        numpy.linalg.LinAlgError: if ``cov`` is singular or its determinant
            is not positive.
    """
    X = np.asarray(X, dtype=float)
    cov = np.asarray(cov, dtype=float)
    D = X.shape[1]

    # Work with log|cov| instead of det(cov): the determinant can underflow
    # or overflow, and padding it with a constant before the square root
    # skews the normalisation whenever the determinant is small.
    sign, logdet = np.linalg.slogdet(cov)
    if sign <= 0:
        raise np.linalg.LinAlgError(
            "covariance matrix must have a positive determinant"
        )

    diff = X - mean
    # Solve cov @ z = diff^T rather than forming the explicit inverse.
    solved = np.linalg.solve(cov, diff.T).T
    mahalanobis_sq = np.einsum('ij,ij->i', diff, solved)

    log_pdf = -0.5 * (D * np.log(2 * np.pi) + logdet + mahalanobis_sq)
    return np.exp(log_pdf)

def gmm_em(X, K, max_iter=100, tol=1e-4):
    """Fit a ``K``-component full-covariance GMM to ``X`` with EM.

    Means, covariances and weights are initialised from a ``kmeans``
    clustering of ``X``.

    Args:
        X: Array of shape ``(N, D)``.
        K: Number of mixture components.
        max_iter: Maximum number of EM iterations.
        tol: Stop when the absolute change in log-likelihood between two
            consecutive iterations drops below this value.

    Returns:
        ``(params, log_likelihoods)``: ``params`` is a dict with keys
        ``'means'`` (``(K, D)``), ``'covs'`` (``(K, D, D)``) and
        ``'weights'`` (``(K,)``); ``log_likelihoods`` is a list with one
        value per iteration, each evaluated with the parameters that were
        in effect at the start of that iteration.
    """
    N, D = X.shape
    ridge = 1e-6 * np.eye(D)  # small diagonal term keeps covariances invertible
    means, labels = kmeans(X, K)

    init_covs = []
    for k in range(K):
        members = X[labels == k]
        if members.shape[0] > 1:
            init_covs.append(np.cov(members.T) + ridge)
        else:
            # np.cov needs at least two samples; with a single member it
            # returns NaN, so fall back to the identity like an empty cluster.
            init_covs.append(np.eye(D))
    covs = np.array(init_covs)
    weights = np.array([np.mean(labels == k) for k in range(K)])

    log_likelihoods = []
    for it in range(max_iter):
        # E-step: weighted component densities, then normalise per sample.
        resp = np.zeros((N, K))
        for k in range(K):
            resp[:, k] = weights[k] * gaussian_pdf(X, means[k], covs[k])
        resp_sum = resp.sum(axis=1, keepdims=True)
        resp = resp / (resp_sum + 1e-10)

        # M-step.
        Nk = resp.sum(axis=0)
        weights = Nk / N
        # Guard the divisions below against a component that received no
        # responsibility at all (Nk == 0 would otherwise produce NaN).
        Nk_safe = np.maximum(Nk, 1e-10)
        means = np.array([
            np.sum(resp[:, k][:, None] * X, axis=0) / Nk_safe[k]
            for k in range(K)
        ])
        covs = np.array([
            ((resp[:, k][:, None] * (X - means[k])).T @ (X - means[k])) / Nk_safe[k] + ridge
            for k in range(K)
        ])

        # resp_sum holds the per-sample mixture density from the E-step.
        ll = np.sum(np.log(resp_sum + 1e-10))
        log_likelihoods.append(ll)
        if it > 0 and abs(log_likelihoods[-1] - log_likelihoods[-2]) < tol:
            break
    params = {'means': means, 'covs': covs, 'weights': weights}
    return params, log_likelihoods

## 5. Bayes Classifier Implementation

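Implement `bayes_classifier` to predict class labels for test data using fitted GMM parameters. Each sample is assigned to the class whose GMM gives it the highest likelihood; no class priors are applied, i.e. equal priors are assumed.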

In [None]:
def bayes_classifier(X, gmm_params_list):
    """Label each row of ``X`` with the class whose GMM scores it highest.

    Args:
        X: Array of shape ``(N, D)``.
        gmm_params_list: One dict per class with the keys ``'weights'``,
            ``'means'`` and ``'covs'``.

    Returns:
        Integer array of length ``N`` with the index (into
        ``gmm_params_list``) of the winning class. No class priors are
        applied.
    """
    n_samples = X.shape[0]
    per_class_likelihood = []
    for params in gmm_params_list:
        mix_weights = params['weights']
        n_components = len(mix_weights)
        # Column j: weighted density of component j for every sample.
        weighted = np.zeros((n_samples, n_components))
        for j in range(n_components):
            component_pdf = gaussian_pdf(X, params['means'][j], params['covs'][j])
            weighted[:, j] = mix_weights[j] * component_pdf
        per_class_likelihood.append(weighted.sum(axis=1))

    scores = np.column_stack(per_class_likelihood)
    return np.argmax(scores, axis=1)

## 6. Metrics Calculation

Implement `compute_metrics` to calculate accuracy, precision, recall, F1-score, and confusion matrix. Include `print_metrics` for formatted output.

In [None]:
def compute_metrics(y_true, y_pred, num_classes):
    """Build a confusion matrix and derive standard classification metrics.

    Args:
        y_true: Iterable of true class indices.
        y_pred: Iterable of predicted class indices.
        num_classes: Number of classes; indices must lie in
            ``[0, num_classes)``.

    Returns:
        Dict with ``'accuracy'``, per-class lists ``'precision'``,
        ``'recall'`` and ``'f1'``, their means ``'mean_precision'``,
        ``'mean_recall'`` and ``'mean_f1'``, and ``'confusion_matrix'`` as a
        nested list (rows: true class, columns: predicted class).
    """
    cm = np.zeros((num_classes, num_classes), dtype=int)
    for actual, predicted in zip(y_true, y_pred):
        cm[actual, predicted] += 1

    true_pos = np.diag(cm)
    predicted_totals = cm.sum(axis=0)  # column sums: tp + fp
    actual_totals = cm.sum(axis=1)     # row sums: tp + fn

    # The 1e-10 terms keep the ratios finite for classes with no samples.
    precision = true_pos / (predicted_totals + 1e-10)
    recall = true_pos / (actual_totals + 1e-10)
    f1 = 2 * precision * recall / (precision + recall + 1e-10)

    return {
        'accuracy': np.trace(cm) / np.sum(cm),
        'precision': precision.tolist(),
        'mean_precision': np.mean(precision),
        'recall': recall.tolist(),
        'mean_recall': np.mean(recall),
        'f1': f1.tolist(),
        'mean_f1': np.mean(f1),
        'confusion_matrix': cm.tolist(),
    }

def print_metrics(metrics):
    """Print a metrics dict (as built by ``compute_metrics``) to stdout.

    Scalars and per-class values are shown with four decimal places; the
    confusion matrix is printed as a NumPy array.
    """
    def four_dp(value):
        return "{:.4f}".format(value)

    print(f"Accuracy: {metrics['accuracy']:.4f}")

    # (per-class heading, per-class key, mean heading, mean key)
    sections = (
        ("Precision per class:", 'precision', "Mean Precision:", 'mean_precision'),
        ("Recall per class:", 'recall', "Mean Recall:", 'mean_recall'),
        ("F1 per class:", 'f1', "Mean F1:", 'mean_f1'),
    )
    for per_class_heading, per_class_key, mean_heading, mean_key in sections:
        print(per_class_heading, [four_dp(v) for v in metrics[per_class_key]])
        print(mean_heading, four_dp(metrics[mean_key]))

    print("Confusion Matrix:")
    print(np.array(metrics['confusion_matrix']))

## 7. Experiment Runner for Dataset2a_Speech

Use `run_experiment` to fit GMMs for each class, run classification, and save results for multiple values of K.

In [None]:
def run_experiment(dataset_name, train_data, test_data, mixture_counts, result_dir):
    """Train per-class GMMs, classify the test set and save all outputs.

    For every ``K`` in ``mixture_counts`` a ``K``-component GMM is fitted to
    each class in ``train_data``, the pooled ``test_data`` is classified,
    and the following files are written into ``result_dir``: GMM parameters
    and log-likelihood history per class, true and predicted labels, and a
    JSON file with the metrics. A one-line summary is printed per ``K``.

    Args:
        dataset_name: Label used only in the progress message.
        train_data: Sequence of per-class training arrays.
        test_data: Sequence of per-class test arrays; the position of an
            array is used as its true class index.
        mixture_counts: Iterable of component counts to try.
        result_dir: Output directory (created if missing).
    """
    ensure_dir(result_dir)
    num_classes = len(train_data)

    for K in mixture_counts:
        print(f'Running {dataset_name} with {K} mixtures...')

        # Fit and persist one GMM per class.
        gmm_params_list = []
        for cls_idx, cls_train in enumerate(train_data):
            fitted, loglik_trace = gmm_em(cls_train, K)
            gmm_params_list.append(fitted)
            save_npy(fitted, f'{result_dir}/gmm_params_class{cls_idx}_K{K}.npy')
            save_npy(loglik_trace, f'{result_dir}/gmm_loglik_class{cls_idx}_K{K}.npy')

        # Pool the test samples; the class index is repeated once per sample.
        y_true = np.array([
            cls_idx
            for cls_idx, cls_test in enumerate(test_data)
            for _ in range(len(cls_test))
        ])
        X_test = np.vstack(list(test_data))

        y_pred = bayes_classifier(X_test, gmm_params_list)
        save_npy(y_true, f'{result_dir}/y_true_K{K}.npy')
        save_npy(y_pred, f'{result_dir}/y_pred_K{K}.npy')

        metrics = compute_metrics(y_true, y_pred, num_classes)
        save_json(metrics, f'{result_dir}/metrics_K{K}.json')
        print(f'Accuracy: {metrics["accuracy"]:.4f}, Mean F1: {metrics["mean_f1"]:.4f}')

# Run experiment for Dataset2a_Speech
# Tries 1, 2 and 4 mixture components per class; outputs go under result_dir,
# a path relative to the current working directory.
result_dir = 'results/Dataset2a_Speech'
run_experiment('Dataset2a_Speech', train2a, test2a, [1,2,4], result_dir)

## 8. Load and Visualize GMM Results

Load saved GMM parameters, log-likelihoods, and metrics. Visualize results using contour plots, decision regions, log-likelihood curves, and elliptical contours.