In [None]:
from pathlib import Path

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Project locations, each resolved to an absolute path from the current
# working directory.
DATA_FOLDER_PATH = Path("../data/processed/feature_extracted/pickle").resolve()
results_dir = Path("../data/results").resolve()
fig_dir = Path("../figures").resolve()

# Window sizes iterated over when loading the per-window feature files:
# 2**10 .. 2**13, i.e. 1024, 2048, 4096 and 8192.
WIN_SIZES = [2**exponent for exponent in range(10, 14)]

# Data Loading

Original Data

In [None]:
def load_data(data_dir=None, win_sizes=None):
    """
    Load the training and testing features from per-window-size pickle files.

    For every window size, ``train_features_{win_size}.pkl`` and
    ``test_features_{win_size}.pkl`` are read from ``data_dir``. The top-level
    ``"target"`` column is split off the training frame, and the per-window
    frames are concatenated column-wise under an extra outer column level, so
    the resulting column levels are named ``win_size``, ``feature``, ``stat``.

    NOTE: the files are read with ``pd.read_pickle``; unpickling can execute
    arbitrary code, so only point this at files you trust.

    Args:
        data_dir (Path | str | None): Folder holding the pickle files.
            Defaults to the module-level ``DATA_FOLDER_PATH``.
        win_sizes (Iterable[int] | None): Window sizes to load, in the order
            they should appear in the output columns. Defaults to the
            module-level ``WIN_SIZES``.

    Returns:
        X_train (pd.DataFrame): Training data features.
        y_train (np.ndarray): Training data target, flattened to 1-D and
            taken from the first window size's training file.
        X_test (pd.DataFrame): Testing data features.
    """
    # Resolve defaults at call time so the no-argument call keeps using the
    # notebook's configuration constants.
    data_dir = Path(DATA_FOLDER_PATH if data_dir is None else data_dir)
    win_sizes = list(WIN_SIZES if win_sizes is None else win_sizes)

    train_dict = {}
    test_dict = {}

    y_train = None

    for win_size in win_sizes:
        train_df: pd.DataFrame = pd.read_pickle(
            data_dir / f"train_features_{win_size}.pkl"
        )
        test_df: pd.DataFrame = pd.read_pickle(
            data_dir / f"test_features_{win_size}.pkl"
        )

        train_dict[win_size] = train_df.drop(columns=["target"], level=0)
        test_dict[win_size] = test_df

        # The target is only read once, from the first window size loaded.
        if y_train is None:
            y_train = train_df["target"].values.ravel()

    level_names = ["win_size", "feature", "stat"]

    # Keys are taken from the dicts themselves so they always line up with
    # the frames being concatenated.
    X_train = pd.concat(
        list(train_dict.values()),
        axis=1,
        keys=list(train_dict),
        names=level_names,
    )
    X_test = pd.concat(
        list(test_dict.values()),
        axis=1,
        keys=list(test_dict),
        names=level_names,
    )

    return X_train, y_train, X_test

In [None]:
# Only the training split is used below; the test features are discarded.
x_train, y_train, _ = load_data()

# One-hot encode the targets into a dense array. The encoder expects a 2-D
# input, so the target vector is reshaped into a single column first.
y_train_column = y_train.reshape(-1, 1)
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(y_train_column)
y_train_one_hot = encoder.transform(y_train_column)

Logistic Regression Results

In [None]:
# Load the logistic regression CV results, reading the regularization
# parameter column as strings.
lr_results = pd.read_csv(
    results_dir / "logistic_regression_cv_results.csv",
    dtype={"param_logreg__regularization": str},
)

# Make columns a bit more readable by stripping the "param_<name>__" prefix.
# The pattern is a raw string so that `\w` reaches the regex engine instead of
# being parsed as an (invalid) string escape sequence.
lr_results.columns = lr_results.columns.str.replace(r"param_\w+__", "", regex=True)

# Give missing values explicit labels so they can be used as plot/group levels.
lr_results["regularization"] = lr_results["regularization"].fillna("None")
lr_results["lam"] = lr_results["lam"].fillna("NA")
lr_results["regularization"] = lr_results["regularization"].replace(
    ["l1", "l2"], ["L1", "L2"]
)

# Filter out results which did not converge (keeps only rows whose mean test
# score is strictly positive).
lr_results = lr_results.query("mean_test_score > 0")

Model Comparison Results

In [None]:
# Apply seaborn's "darkgrid" theme to the figures created from here on.
sns.set_theme(style="darkgrid")

# Logistic Regression Results

## Regularization vs Accuracy

In [None]:
# Define logarithmic colormap.
# The lambda levels themselves are passed through the LogNorm, so values one
# decade apart land at evenly spaced positions (0, 1/3, 2/3, 1) on viridis.
# Sampling linearly spaced values instead would bunch three of the four
# colours together at the top of the colormap.
lam_levels = [0.01, 0.1, 1.0, 10.0]
cmap = mcolors.LogNorm(vmin=min(lam_levels), vmax=max(lam_levels))
colors = plt.cm.viridis(cmap(lam_levels))

# Create a dictionary mapping hue levels to colors; rows without a lambda
# (labelled "NA") are drawn in grey.
hue_dict = dict(zip(lam_levels, colors))
hue_dict["NA"] = "grey"

# One box per (regularization, lambda) pair showing the spread of CV scores.
ax = sns.boxplot(
    y="mean_test_score",
    data=lr_results,
    x="regularization",
    hue="lam",
    palette=hue_dict,
)
ax.set_ylabel("CV Mean Test Accuracy")
ax.set_xlabel("Regularization Method")
_ = ax.legend(loc="lower left", title="Lambda")
# Accuracy is shown on the full 0-1 range.
ax.set_ylim(0, 1)
plt.savefig(
    fig_dir / "lr_accuracy_regularization_lam.png", dpi=600, bbox_inches="tight"
)

## Tolerance vs Fit Time

In [None]:
# Distribution of CV mean fit time at each tolerance value, with one box per
# regularization method.
ax = sns.boxplot(
    data=lr_results,
    x="tol",
    y="mean_fit_time",
    hue="regularization",
)
ax.set(xlabel="Loss Change Tolerance", ylabel="CV Mean Fit Time (s)")
ax.legend(title="Regularization")
plt.savefig(
    fig_dir / "lr_fit_time_tolerance.png",
    dpi=600,
    bbox_inches="tight",
)

## Window Size vs Accuracy

In [None]:
# Distribution of CV mean test accuracy for each window size, on a fixed
# 0-1 accuracy axis.
ax = sns.boxplot(
    data=lr_results,
    x="win_size",
    y="mean_test_score",
)
ax.set(
    xlabel="Window Size (N_FFT)",
    ylabel="CV Mean Test Accuracy",
    ylim=(0, 1),
)
plt.savefig(
    fig_dir / "lr_accuracy_win_size_uniform.png",
    dpi=600,
    bbox_inches="tight",
)

## Export CV Results to LaTeX Table

In [None]:
# Summarise the CV results for every (regularization, lambda) combination.
gb = lr_results.groupby(["regularization", "lam"])

cols = ["mean_test_score", "mean_fit_time"]
col_map = {
    "mean_test_score": "Mean Fold Test Accuracy",
    "mean_fit_time": "Mean Fold Fit Time (s)",
}

# For each metric keep the group mean and standard deviation, keyed by the
# human-readable column title used in the exported table.
temp_dict = {
    col_map[col]: gb[col].describe()[["mean", "std"]] for col in cols
}

temp_df = pd.concat(temp_dict, axis=1).rename_axis(["Regularization", "Lambda"])

# Set lambdas to scientific notation; the "NA" placeholder is left untouched.
lambda_labels = [
    lam if lam == "NA" else f"{float(lam):.0e}" for lam in temp_df.index.levels[1]
]
temp_df.index = temp_df.index.set_levels(lambda_labels, level=1)

temp_df.to_latex(
    results_dir / "lr_results_summary.tex",
    float_format="%.3f",
    caption="Logistic Regression 5-Fold CV Results Summary",
    label="tab:lr_results_summary",
)
# The summary frame is only needed for the export.
del temp_df

## PCA

In [None]:
# Explained variance of the leading principal components, once for a single
# window size and once for the features of all window sizes together.
plt_window_sizes = [2048, "All"]
n_components = 5
fig, axs = plt.subplots(nrows=len(plt_window_sizes), ncols=1, figsize=(5, 5))

# 1-based component numbers used as the bar categories / tick labels.
component_labels = np.arange(1, n_components + 1)

for ax, win_size in zip(axs, plt_window_sizes):
    # "All" fits on every column; otherwise only on the column block that
    # belongs to the selected window size.
    features = x_train if win_size == "All" else x_train[win_size]
    pca = PCA(n_components=n_components).fit(features)
    explained = pca.explained_variance_ratio_

    # Bars of a categorical axis sit at positions 0..n-1, so the cumulative
    # line is drawn at those same positions.
    sns.lineplot(
        x=np.arange(len(explained)),
        y=np.cumsum(explained),
        ax=ax,
        color="red",
        label="Cumulative Explained Variance",
    )
    # Pass x and y explicitly: a bare 1-D array is treated by seaborn's
    # categorical plots as one distribution, which yields a single bar at the
    # mean instead of one bar per component.
    ax = sns.barplot(x=component_labels, y=explained, ax=ax)

    # Annotate every bar with its share of the variance as a percentage.
    _ = ax.bar_label(ax.containers[0], fontsize=10, fmt="{:.2%}")
    ax.set(
        title=f"Window Size: {win_size}",
        xlabel="Principal Component",
        ylabel="Explained Variance",
    )
    ax.legend(loc="center right")

plt.tight_layout()
plt.savefig(fig_dir / "pca_explained_variance.png", dpi=600, bbox_inches="tight")

## t-SNE

In [None]:
# Fit a 2-D t-SNE embedding of the full training feature matrix with a fixed
# seed.
# NOTE(review): in the visible part of the notebook, `tsne` and `X_tsne` are
# reassigned by the perplexity sweep in the next cell before they are read,
# so this (expensive) fit looks redundant — confirm before removing.
tsne = TSNE(perplexity=60, random_state=42)
X_tsne = tsne.fit_transform(x_train)

In [None]:
# Number of classes, i.e. the number of one-hot columns.
n_classes = y_train_one_hot.shape[1]
# Perplexity values to try, one figure per value.
perplexities = [5, 30, 50, 100, 150]


def fix_label(l):
    """
    Translate a legend label of the one-hot hue into readable text.

    Args:
        l: Legend label; only the exact string "1.0" is treated as a match.

    Returns:
        str: "Genre" when the label is "1.0", otherwise "Rest".
    """
    return "Genre" if l == "1.0" else "Rest"


# Plot a t-SNE for each perplexity to see what looks best
for p in perplexities:
    tsne = TSNE(perplexity=p, random_state=42)
    X_tsne = tsne.fit_transform(x_train)
    ncols = 2
    # Ceiling division: with an odd number of classes the last class still
    # gets a row (floor division would leave it without an axis).
    nrows = -(-n_classes // ncols)
    # squeeze=False keeps `axs` two-dimensional even for a single row, so the
    # [row, col] indexing below always works.
    fig, axs = plt.subplots(
        nrows=nrows, ncols=ncols, figsize=(8, 16), squeeze=False
    )

    # Plot a t-SNE for each class since there's many classes
    for i in range(n_classes):
        genre = encoder.categories_[0][i]
        ax = axs[i // ncols, i % ncols]
        # Same embedding in every panel; the hue marks one class vs. the rest.
        ax = sns.scatterplot(
            x=X_tsne[:, 0], y=X_tsne[:, 1], hue=y_train_one_hot[:, i], ax=ax
        )
        ax.set_title(f"{genre.title()}")
        ax.tick_params(left=False, bottom=False, labelbottom=False, labelleft=False)
        ax.legend_.remove()

        # Steal the legend content from the first plot
        if i == 0:
            handles, labels = ax.get_legend_handles_labels()

    # Hide any grid cell that has no class to show (odd class counts).
    for unused_ax in axs.ravel()[n_classes:]:
        unused_ax.set_axis_off()

    plt.tight_layout()
    # Leave room at the bottom for the shared figure-level legend.
    plt.subplots_adjust(bottom=0.025)
    fig.legend(
        handles=handles,
        labels=[fix_label(label) for label in labels],
        loc="lower center",
        ncol=2,
    )

    plt.savefig(fig_dir / f"tsne_perplexity_{p}.png", dpi=600, bbox_inches="tight")

# Model Comparison