In [270]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

import pickle as pkl

In [271]:
# Load the two pickled distance-matrix objects that the clustering cells below consume.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
with open('dense_rank_distance_matrix_experts_BSU1_BSU2_df.pkl', mode='rb') as dense_matrix_file:
    dense_rank_distance_matrix_experts_BSU1_BSU2_df = pkl.load(dense_matrix_file)

with open('kmeans_rank_distance_matrix_experts_BSU1_BSU2_df.pkl', mode='rb') as kmeans_matrix_file:
    kmeans_rank_distance_matrix_experts_BSU1_BSU2_df = pkl.load(kmeans_matrix_file)

In [272]:
# Load the two previously saved linkage-matrix pickles.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
with open('dense_rank_linkage_matrix_experts_BSU1_BSU2.pkl', mode='rb') as dense_linkage_file:
    dense_rank_linkage_matrix_experts_BSU1_BSU2 = pkl.load(dense_linkage_file)

with open('kmeans_rank_linkage_matrix_experts_BSU1_BSU2.pkl', mode='rb') as kmeans_linkage_file:
    kmeans_rank_linkage_matrix_experts_BSU1_BSU2 = pkl.load(kmeans_linkage_file)

# Cluster Assignments

## Clusters for 30 Trials

In [273]:
# Number of flat clusters to cut from each trial-level tree.
num_clusters_trials = 5

# --- Dense ranking: cluster the columns (trials/audio samples) ---
# The frame is transposed so that each trial becomes one row passed to linkage().
# NOTE(review): a 2-D input is treated by linkage() as observation vectors, not as a
# precomputed distance matrix -- confirm that is the intent for these "distance" frames.
linkage_matrix_dense_audio = linkage(
    dense_rank_distance_matrix_experts_BSU1_BSU2_df.T,
    method='ward',
)
dense_audio_clusters = fcluster(
    linkage_matrix_dense_audio,
    t=num_clusters_trials,
    criterion='maxclust',
)
dense_audio_cluster_df = pd.DataFrame({
    'Audio Sample': dense_rank_distance_matrix_experts_BSU1_BSU2_df.columns,
    'Dense Cluster': dense_audio_clusters,
})

# --- KMeans ranking: same pipeline on the second matrix ---
linkage_matrix_kmeans_audio = linkage(
    kmeans_rank_distance_matrix_experts_BSU1_BSU2_df.T,
    method='ward',
)
kmeans_audio_clusters = fcluster(
    linkage_matrix_kmeans_audio,
    t=num_clusters_trials,
    criterion='maxclust',
)
kmeans_audio_cluster_df = pd.DataFrame({
    'Audio Sample': kmeans_rank_distance_matrix_experts_BSU1_BSU2_df.columns,
    'KMeans Cluster': kmeans_audio_clusters,
})

# Line the two labelings up by audio sample (inner join on the sample name).
merged_audio_clusters = dense_audio_cluster_df.merge(
    right=kmeans_audio_cluster_df,
    on="Audio Sample",
    how="inner",
)

# Count how many trials fall into each (dense label, k-means label) pair.
contingency_table_trials = pd.crosstab(
    index=merged_audio_clusters['Dense Cluster'],
    columns=merged_audio_clusters['KMeans Cluster'],
)

# Last expression of the cell: render the count table.
contingency_table_trials


KMeans Cluster,1,2,3,4,5
Dense Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,0,2,0,2
2,1,0,3,3,2
3,0,1,0,1,1
4,2,1,2,2,0
5,0,1,2,1,2


In [274]:
# Build an empty (dense label x k-means label) table whose cells will hold lists of
# audio samples. The labels run 1..num_clusters_trials -- fcluster(..., criterion='maxclust')
# numbers its flat clusters from 1 -- so the table size follows the configured cluster
# count instead of repeating a literal 5 in three places.
trial_cluster_labels = list(range(1, num_clusters_trials + 1))
contingency_table_trials_specified = pd.DataFrame(
    # One fresh list per cell so no two cells share the same list object.
    [[[] for _ in trial_cluster_labels] for _ in trial_cluster_labels],
    columns=trial_cluster_labels,
    index=trial_cluster_labels,
)
contingency_table_trials_specified.index.name = 'Dense Cluster'
contingency_table_trials_specified.columns.name = 'KMeans Cluster'

def get_audio_samples(dense_label, kmeans_label):
    """Return the audio samples that carry both given cluster labels.

    Reads the notebook-level ``merged_audio_clusters`` DataFrame.

    Parameters
    ----------
    dense_label
        Value matched against the 'Dense Cluster' column.
    kmeans_label
        Value matched against the 'KMeans Cluster' column.

    Returns
    -------
    list
        'Audio Sample' values of the rows matching both labels, in row order
        (empty when no row matches).
    """
    in_dense_cluster = merged_audio_clusters['Dense Cluster'] == dense_label
    in_kmeans_cluster = merged_audio_clusters['KMeans Cluster'] == kmeans_label
    matching_samples = merged_audio_clusters.loc[
        in_dense_cluster & in_kmeans_cluster, 'Audio Sample'
    ]
    return matching_samples.tolist()

# Fill every cell with the audio samples sharing that (dense, k-means) label pair.
# The loops walk the table's own index/columns, so they always match the table's
# shape instead of relying on a second hard-coded list of labels.
for dense in contingency_table_trials_specified.index:
    for kmeans in contingency_table_trials_specified.columns:
        contingency_table_trials_specified.at[dense, kmeans] = get_audio_samples(dense, kmeans)

In [275]:
# Render the membership table: each cell holds the list of audio samples for that cluster pair.
contingency_table_trials_specified

KMeans Cluster,1,2,3,4,5
Dense Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,[PE_CreatureFromTheBlackjackTable],[],"[TM_AmateurOnPurpose, TM_CreatureFromTheBlackj...",[],"[TM_01b_trumpet, DE_ElephantsDream_LD0]"
2,[PE_AmateurOnPurpose],[],"[LP_11_guitar, LP_CreatureFromTheBlackjackTabl...","[LP_23_jazz, LP_AmateurOnPurpose, DE_female_sp...","[UN_AmateurOnPurpose, UN_CreatureFromTheBlackj..."
3,[],[PE_27_castanets],[],[SH_13_glockenspiel],[PE_39_clapping]
4,"[SH_AmateurOnPurpose, SH_CreatureFromTheBlackj...",[DE_SitaSings_remix2_LD6],"[TM_02_violin, UN_21_violin]","[UN_20c_accordion, SH_04_choral]",[]
5,[],[DE_SitaSings_remix1_LD0],"[DE_CosmosLandromat_remix1_LD6, DE_CosmosLandr...",[DE_female_speech_music_3_LD3],"[DE_female_speech_music_1_LD0, DE_Meridian_rem..."


In [276]:
# Write the trial membership table to a CSV file in the current working directory.
# NOTE(review): the cells hold Python lists, which end up in the CSV as text --
# confirm this is acceptable if the file has to be parsed back later.
contingency_table_trials_specified.to_csv(
    path_or_buf='contingency_table_trials_specified.csv'
)

## Clusters for 42 Subjects

In [277]:
# Number of flat clusters to cut from each subject-level tree.
num_clusters_subjects = 3

# --- Dense ranking: cluster the rows (subjects) ---
# NOTE(review): a 2-D input is treated by linkage() as observation vectors, not as a
# precomputed distance matrix -- confirm that is the intent for these "distance" frames.
linkage_matrix_dense_subjects = linkage(
    dense_rank_distance_matrix_experts_BSU1_BSU2_df,
    method='ward',
)
dense_subject_clusters = fcluster(
    linkage_matrix_dense_subjects,
    t=num_clusters_subjects,
    criterion='maxclust',
)
dense_subject_cluster_df = pd.DataFrame({
    'Subject': dense_rank_distance_matrix_experts_BSU1_BSU2_df.index,
    'Dense Cluster': dense_subject_clusters,
})

# --- KMeans ranking: same pipeline on the second matrix ---
linkage_matrix_kmeans_subjects = linkage(
    kmeans_rank_distance_matrix_experts_BSU1_BSU2_df,
    method='ward',
)
kmeans_subject_clusters = fcluster(
    linkage_matrix_kmeans_subjects,
    t=num_clusters_subjects,
    criterion='maxclust',
)
kmeans_subject_cluster_df = pd.DataFrame({
    'Subject': kmeans_rank_distance_matrix_experts_BSU1_BSU2_df.index,
    'KMeans Cluster': kmeans_subject_clusters,
})

# Line the two labelings up by subject (inner join on the subject identifier).
merged_subject_clusters = dense_subject_cluster_df.merge(
    right=kmeans_subject_cluster_df,
    on="Subject",
    how="inner",
)

# Count how many subjects fall into each (dense label, k-means label) pair.
contingency_table_subjects = pd.crosstab(
    index=merged_subject_clusters['Dense Cluster'],
    columns=merged_subject_clusters['KMeans Cluster'],
)

# Last expression of the cell: render the count table.
contingency_table_subjects

KMeans Cluster,1,2,3
Dense Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,2,14
2,9,0,5
3,10,0,1


In [None]:
# Build an empty (dense label x k-means label) table whose cells will hold lists of
# subjects. The labels run 1..num_clusters_subjects -- fcluster(..., criterion='maxclust')
# numbers its flat clusters from 1 -- so the table size follows the configured cluster
# count instead of repeating a literal 3 in three places.
subject_cluster_labels = list(range(1, num_clusters_subjects + 1))
contingency_table_subjects_specified = pd.DataFrame(
    # One fresh list per cell so no two cells share the same list object.
    [[[] for _ in subject_cluster_labels] for _ in subject_cluster_labels],
    columns=subject_cluster_labels,
    index=subject_cluster_labels,
)
contingency_table_subjects_specified.index.name = 'Dense Cluster'
contingency_table_subjects_specified.columns.name = 'KMeans Cluster'

def get_subjects(dense_label, kmeans_label):
    """Return the subjects that carry both given cluster labels.

    Reads the notebook-level ``merged_subject_clusters`` DataFrame.

    Parameters
    ----------
    dense_label
        Value matched against the 'Dense Cluster' column.
    kmeans_label
        Value matched against the 'KMeans Cluster' column.

    Returns
    -------
    list
        'Subject' values of the rows matching both labels, in row order
        (empty when no row matches).
    """
    in_dense_cluster = merged_subject_clusters['Dense Cluster'] == dense_label
    in_kmeans_cluster = merged_subject_clusters['KMeans Cluster'] == kmeans_label
    matching_subjects = merged_subject_clusters.loc[
        in_dense_cluster & in_kmeans_cluster, 'Subject'
    ]
    return matching_subjects.tolist()
# Fill every cell with the subjects sharing that (dense, k-means) label pair.
# The loops walk the table's own index/columns, so they always match the table's
# shape instead of relying on a second hard-coded list of labels.
for dense in contingency_table_subjects_specified.index:
    for kmeans in contingency_table_subjects_specified.columns:
        contingency_table_subjects_specified.at[dense, kmeans] = get_subjects(dense, kmeans)


In [279]:
# Render the membership table: each cell holds the list of subjects for that cluster pair.
contingency_table_subjects_specified

KMeans Cluster,1,2,3
Dense Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,[42],"[16, 23]","[1, 4, 5, 7, 8, 9, 12, 17, 22, 26, 37, 39, 40,..."
2,"[13, 15, 18, 19, 20, 21, 25, 27, 36]",[],"[3, 6, 11, 14, 24]"
3,"[10, 28, 29, 30, 31, 32, 33, 34, 35, 38]",[],[2]


In [280]:
# Write the subject membership table to a CSV file in the current working directory.
# NOTE(review): the cells hold Python lists, which end up in the CSV as text --
# confirm this is acceptable if the file has to be parsed back later.
contingency_table_subjects_specified.to_csv(
    path_or_buf='contingency_table_subjects_specified.csv'
)