In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.cluster import KMeans

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

import os
import kaleido

In [104]:
# Load the three result tables used throughout the notebook: the ODAQ
# listening-test results and the results of the two BSU cohorts (B1, B2).
ODAQ_results, ODAQ_results_BSU1, ODAQ_results_BSU2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        './ODAQ/ODAQ_listening_test/ODAQ_results.csv',
        './ODAQ_v1_BSU/Cohort_B1_results.csv',
        './ODAQ_v1_BSU/Cohort_B2_results.csv',
    )
)

In [105]:
# Distinct values of the categorical columns of the ODAQ results.  `unique`
# keeps the order of first appearance (it does not sort).
methods, conditions, processes, items = (
    ODAQ_results[column].unique()
    for column in ('method', 'condition', 'process', 'item')
)

# Show the four arrays, one after the other.
print(methods, conditions, processes, items, sep='\n')

['LP' 'TM' 'UN' 'SH' 'PE' 'DE']
['LP3.5' 'LP7' 'Q1' 'Q2' 'Q3' 'Q4' 'Q5' 'Ref']
['LP35' 'LP70' 'LP50' 'LP90' 'LP105' 'LP120' 'LP150' 'reference' 'TM3k'
 'TM5k' 'TM7k' 'TM9k' 'TM10.5k' 'UN3k' 'UN5k' 'UN7k' 'UN9k' 'UN10.5k'
 'SH70_MS' 'SH50_MS' 'SH30_MS' 'SH20_MS' 'SH10_MS' 'PE_4096_MS_NMR10'
 'PE_2048_MS_NMR10' 'PE_1024_MS_NMR10' 'PE_2048_MS_NMR16'
 'PE_1024_MS_NMR16' 'OpenUnmix_mid' 'TFC_TDF_U_Net_mid' 'Cocktail_mid'
 'DeepFilterNet2_mid' 'PSM_quantize_mask']
['LP_11_guitar' 'LP_23_jazz' 'LP_AmateurOnPurpose'
 'LP_CreatureFromTheBlackjackTable' 'TM_01b_trumpet' 'TM_02_violin'
 'TM_AmateurOnPurpose' 'TM_CreatureFromTheBlackjackTable'
 'UN_20c_accordion' 'UN_21_violin' 'UN_AmateurOnPurpose'
 'UN_CreatureFromTheBlackjackTable' 'SH_04_choral' 'SH_13_glockenspiel'
 'SH_AmateurOnPurpose' 'SH_CreatureFromTheBlackjackTable'
 'PE_27_castanets' 'PE_39_clapping' 'PE_AmateurOnPurpose'
 'PE_CreatureFromTheBlackjackTable' 'DE_CosmosLandromat_remix1_LD6'
 'DE_CosmosLandromat_remix3_LD3' 'DE_ElephantsD

In [106]:
# Subject identifiers of each result table, in order of first appearance.
unique_subjects = ODAQ_results['subject'].unique()
unique_subjects_BSU1 = ODAQ_results_BSU1['subject'].unique()
unique_subjects_BSU2 = ODAQ_results_BSU2['subject'].unique()

# Publish one module-level DataFrame per subject containing only that
# subject's rows.  Names are numbered from 1 in order of appearance:
#   expert1, expert2, ...   (ODAQ_results)
#   BSU1_1, BSU1_2, ...     (ODAQ_results_BSU1)
#   BSU2_1, BSU2_2, ...     (ODAQ_results_BSU2)
for name_prefix, cohort_results, cohort_subjects in (
    ("expert", ODAQ_results, unique_subjects),
    ("BSU1_", ODAQ_results_BSU1, unique_subjects_BSU1),
    ("BSU2_", ODAQ_results_BSU2, unique_subjects_BSU2),
):
    for position, subject_id in enumerate(cohort_subjects, start=1):
        is_subject = cohort_results['subject'] == subject_id
        globals()[f"{name_prefix}{position}"] = cohort_results[is_subject]

print('Experts: ', unique_subjects)
print('BSU1: ', unique_subjects_BSU1)
print('BSU2: ', unique_subjects_BSU2)

Experts:  ['Subject 1: USLA08' 'Subject 2: DEID44' 'Subject 3: DEID1115'
 'Subject 4: DEID337' 'Subject 5: USLA06' 'Subject 6: DEID5'
 'Subject 7: DEID9' 'Subject 8: DEID4' 'Subject 9: USLG04'
 'Subject 10: USLA04' 'Subject 11: USLA07' 'Subject 12: DEID256'
 'Subject 13: DEID6' 'Subject 14: USLG05' 'Subject 15: USLA09'
 'Subject 16: USLG02' 'Subject 17: USLG03' 'Subject 18: DEID7'
 'Subject 19: USLA12' 'Subject 20: DEID10' 'Subject 21: DEID8'
 'Subject 22: DEID2' 'Subject 23: USLA01' 'Subject 24: USLA05'
 'Subject 25: DEID1' 'Subject 26: DEID3']
BSU1:  ['D001' 'D002' 'D003' 'D004' 'D008' 'D009' 'D010' 'D011']
BSU2:  ['D005' 'D013' 'D014' 'D015' 'D016' 'D017' 'D018' 'D019']


In [107]:
# Collect, for every subject of every cohort, one array of scores per item.
#
# Creates the module-level lists expert{i}_scores, BSU1_{i}_scores and
# BSU2_{i}_scores.  Each list has one entry per element of `items`, in that
# order; entry k holds the 'score' values of the subject's rows for items[k],
# in the row order of the subject's DataFrame.
#
# The number of subjects per cohort is taken from the subject arrays instead
# of being hard-coded, so these loops stay in step with the per-subject
# DataFrames (expert{i}, BSU1_{i}, BSU2_{i}) that were created from the same
# arrays.
for prefix, n_subjects in (
    ("expert", len(unique_subjects)),
    ("BSU1_", len(unique_subjects_BSU1)),
    ("BSU2_", len(unique_subjects_BSU2)),
):
    for i in range(1, n_subjects + 1):
        subject_df = globals()[f"{prefix}{i}"]  # rows of this subject only
        # The list is rebuilt from scratch, so re-running the cell does not
        # accumulate duplicate entries.
        globals()[f"{prefix}{i}_scores"] = [
            subject_df[subject_df['item'] == item]['score'].values
            for item in items
        ]
        

In [108]:
# Build one long-format DataFrame per subject: expert{i}_scores_df,
# BSU1_{i}_scores_df and BSU2_{i}_scores_df.
#
# Each frame starts with one row per item (the item name, that item's score
# array, and the full list of condition labels) and is then exploded so that
# every score gets its own row with columns item / score / condition.
#
# NOTE(review): condition labels are attached by position.  This assumes each
# item's scores are stored in the same order as `conditions`, and exploding
# the two columns together requires every item to have exactly
# len(conditions) scores -- confirm against the CSV files.
for prefix, n_subjects in (
    ("expert", len(unique_subjects)),
    ("BSU1_", len(unique_subjects_BSU1)),
    ("BSU2_", len(unique_subjects_BSU2)),
):
    for i in range(1, n_subjects + 1):
        scores_df = pd.DataFrame()
        scores_df['item'] = items
        scores_df['score'] = globals()[f"{prefix}{i}_scores"]
        scores_df['condition'] = [list(conditions)] * len(items)

        # Expand the paired score / condition lists so that each element
        # becomes its own row; the item value (and the row index) is repeated.
        globals()[f"{prefix}{i}_scores_df"] = scores_df.explode(['score', 'condition'])

# K-Means for Ranking Experts and Students

In [109]:
# K-means clustering of every subject's scores, used as a ranking.
#
# For each subject (experts, BSU1, BSU2) all scores are clustered in one
# dimension with k=8, and the clusters are relabelled 1..8 in order of
# increasing mean score (1 = cluster with the lowest mean).  The result is
# stored in a 'cluster' column of the subject's *_scores_df frame.
for prefix, n_subjects in (
    ("expert", len(unique_subjects)),
    ("BSU1_", len(unique_subjects_BSU1)),
    ("BSU2_", len(unique_subjects_BSU2)),
):
    for i in range(1, n_subjects + 1):
        # Dynamically access the subject's DataFrame
        df = globals()[f"{prefix}{i}_scores_df"]

        # Apply K-means clustering (k=8) to the scores as a single feature
        kmeans = KMeans(n_clusters=8, random_state=0).fit(df['score'].values.reshape(-1, 1))

        # Assign initial (arbitrary) cluster labels
        df['cluster'] = kmeans.labels_

        # Compute mean score for each cluster (indexed by cluster label)
        cluster_means = df.groupby('cluster')['score'].mean()

        # Order the cluster *labels* by mean score.  argsort returns positions
        # within `cluster_means`; they are translated back to labels through
        # the index so the mapping stays correct even if some label is absent
        # from the grouping (when all labels 0..7 are present, positions and
        # labels coincide and this is the same mapping as before).
        labels_by_mean = cluster_means.index[np.argsort(cluster_means.values)]
        cluster_rank = {old_label: new_label for new_label, old_label in enumerate(labels_by_mean, start=1)}

        # Reassign cluster labels based on ranking
        df['cluster'] = df['cluster'].map(cluster_rank)

        # Store back the updated DataFrame
        globals()[f"{prefix}{i}_scores_df"] = df


In [110]:
# Collapse every subject's long-format frame back to one row per item and
# extract the rankings as a matrix.
#
# For each subject (experts, BSU1, BSU2):
#   * {name}_scores_df becomes one row per item with list-valued columns
#     'score', 'condition' and 'rankings' (the former 'cluster' column);
#   * {name}_rankings_kmeans_based is the 'rankings' column as a 2-D array,
#     one row per item (30x8 according to the original notes).
#
# sort=False keeps the items in their order of first appearance.  A default
# groupby sorts the keys alphabetically, which would put the matrix rows in a
# different order than `items`, even though later cells label the rows with
# `items`.
for prefix, n_subjects in (
    ("expert", len(unique_subjects)),
    ("BSU1_", len(unique_subjects_BSU1)),
    ("BSU2_", len(unique_subjects_BSU2)),
):
    for i in range(1, n_subjects + 1):
        df = globals()[f"{prefix}{i}_scores_df"]

        # Group by 'item', aggregate the per-row values back into lists and
        # rename 'cluster' to 'rankings'
        df = (
            df.groupby('item', sort=False)
            .agg({
                'score': list,
                'condition': list,
                'cluster': list
            })
            .reset_index()
            .rename(columns={'cluster': 'rankings'})
        )

        # Store the updated DataFrame back
        globals()[f"{prefix}{i}_scores_df"] = df

        # Convert the list of per-item ranking lists to an array; row k
        # belongs to df['item'][k]
        rankings_matrix = np.array(df['rankings'].tolist())

        # Store as a variable dynamically
        globals()[f"{prefix}{i}_rankings_kmeans_based"] = rankings_matrix

In [111]:
# Item names that the spaghetti plots highlight in green.
easy_easiest_trials_competition_kmeans = [
    'TM_01b_trumpet',
    'DE_ElephantsDream_LD0',
    'LP_23_jazz',
    'LP_AmateurOnPurpose',
    'DE_female_speech_music_2_LD9',
    'UN_AmateurOnPurpose',
    'UN_CreatureFromTheBlackjackTable',
]

# Item names that the spaghetti plots highlight in red.
hard_hardest_trials_competition_kmeans = [
    'SH_AmateurOnPurpose',
    'SH_CreatureFromTheBlackjackTable',
    'DE_SitaSings_remix2_LD6',
    'PE_27_castanets',
]

In [112]:
# Reference ranking 8, 7, ..., 1 -- one value per condition, in the order of
# `conditions`.
reversed_perfect_ranking = np.array(list(range(8, 0, -1)))

# Long-format frame for the reference line; it uses the same columns
# (Sample / Condition / Ranking) as the melted per-subject ranking frames so
# the two can be concatenated.
perfect_df = pd.DataFrame(
    {
        'Sample': ['Perfect Ranking' for _ in conditions],
        'Condition': conditions,
        'Ranking': reversed_perfect_ranking,
    }
)

In [None]:
# Spaghetti plot of the K-means based rankings of BSU1 subject 5: one line per
# item across the conditions, plus a thick black "Perfect Ranking" reference.

# NOTE(review): this mirrors the column order of the rankings matrix, while
# the column labels assigned below are `conditions` in their original order.
# Confirm that mirroring the columns (rather than inverting the rank values)
# is what the plot is meant to show.
reversed_BSU1_5_rankings = BSU1_5_rankings_kmeans_based[:, ::-1]

# Reshape rankings data for Plotly
BSU1_5_rankings_df = pd.DataFrame(reversed_BSU1_5_rankings, columns=conditions)

# Label every row with the item it was computed from.  The matrix rows follow
# the row order of BSU1_5_scores_df, which need not be the order of `items`
# (a default groupby sorts its keys), so the labels come from that frame.
BSU1_5_rankings_df['Sample'] = BSU1_5_scores_df['item'].tolist()

# Melt to long format (one row per sample/condition pair) and put the
# reference ranking in front
BSU1_5_rankings_df_melted = BSU1_5_rankings_df.melt(id_vars=['Sample'], var_name='Condition', value_name='Ranking')
BSU1_5_rankings_df_melted = pd.concat([perfect_df, BSU1_5_rankings_df_melted])

# Custom colors: black reference, green easy items, red hard items (red wins
# if an item is in both lists), light grey for everything else
color_map = {"Perfect Ranking": "black"}
color_map.update(dict.fromkeys(easy_easiest_trials_competition_kmeans, "green"))
color_map.update(dict.fromkeys(hard_hardest_trials_competition_kmeans, "red"))
all_samples = BSU1_5_rankings_df_melted['Sample'].unique()
for item in all_samples:
    color_map.setdefault(item, "lightgrey")

plot_title = "Cohort 1 Student 5 (K-Means) Rankings per Condition with Perfect Ranking Reference"

# Create the line plot with custom colors
fig = px.line(BSU1_5_rankings_df_melted, x='Condition', y='Ranking', color='Sample', markers=True,
              title=plot_title,
              labels={"Ranking": "Ranking", "Condition": "Conditions (Low to High Quality)"},
              color_discrete_map=color_map,
              template="plotly_white")

# Figure dimensions and a larger title font
fig.update_layout(
    width=1000,
    height=800,
    title=dict(text=plot_title, font=dict(size=20)),
)

# Invert y-axis (lower ranks at top)
fig.update_yaxes(autorange="reversed")

# Make the "Perfect Ranking" line stand out
fig.update_traces(
    selector=dict(name="Perfect Ranking"),
    line=dict(width=10, color='black'),
    marker=dict(size=14, color='black')
)

# Draw the lines of all other traces semi-transparently.  Only the line color
# is replaced; "Perfect Ranking" (black) has no entry here and is left as is.
translucent_line_colors = {
    "lightgrey": 'rgba(211,211,211,0.4)',
    "green": 'rgba(0,128,0,0.5)',
    "red": 'rgba(255,0,0,0.5)',
}
fig.for_each_trace(
    lambda trace: trace.update(line=dict(color=translucent_line_colors[color_map[trace.name]]))
    if color_map.get(trace.name) in translucent_line_colors else None
)

fig.show()

In [None]:
# Spaghetti plot of the K-means based rankings of expert 16: one line per item
# across the conditions, plus a thick black "Perfect Ranking" reference.

# NOTE(review): this mirrors the column order of the rankings matrix, while
# the column labels assigned below are `conditions` in their original order.
# Confirm that mirroring the columns (rather than inverting the rank values)
# is what the plot is meant to show.
reversed_expert16_rankings_hard_hardest = expert16_rankings_kmeans_based[:, ::-1]

# Reshape rankings data for Plotly
expert16_rankings_df = pd.DataFrame(reversed_expert16_rankings_hard_hardest, columns=conditions)

# Label every row with the item it was computed from.  The matrix rows follow
# the row order of expert16_scores_df, which need not be the order of `items`
# (a default groupby sorts its keys), so the labels come from that frame.
expert16_rankings_df['Sample'] = expert16_scores_df['item'].tolist()

# Melt to long format (one row per sample/condition pair) and put the
# reference ranking in front
expert16_rankings_df_melted = expert16_rankings_df.melt(id_vars=['Sample'], var_name='Condition', value_name='Ranking')
expert16_rankings_df_melted = pd.concat([perfect_df, expert16_rankings_df_melted])

# Custom colors: black reference, green easy items, red hard items (red wins
# if an item is in both lists), light grey for everything else
color_map = {"Perfect Ranking": "black"}
color_map.update(dict.fromkeys(easy_easiest_trials_competition_kmeans, "green"))
color_map.update(dict.fromkeys(hard_hardest_trials_competition_kmeans, "red"))
all_samples = expert16_rankings_df_melted['Sample'].unique()
for item in all_samples:
    color_map.setdefault(item, "lightgrey")

plot_title = "Expert 16 (K-Means) Rankings per Condition with Perfect Ranking Reference"

# Create plot
fig = px.line(expert16_rankings_df_melted, x='Condition', y='Ranking', color='Sample', markers=True,
              title=plot_title,
              labels={"Ranking": "Ranking", "Condition": "Conditions (Low to High Quality)"},
              color_discrete_map=color_map,
              template="plotly_white")

# Figure dimensions and a larger title font
fig.update_layout(
    width=1000,
    height=800,
    title=dict(text=plot_title, font=dict(size=20)),
)

# Invert y-axis (lower ranks at top)
fig.update_yaxes(autorange="reversed")

# Make the "Perfect Ranking" line stand out
fig.update_traces(
    selector=dict(name="Perfect Ranking"),
    line=dict(width=10, color='black'),
    marker=dict(size=14, color='black')
)

# Draw the lines of all other traces semi-transparently.  Only the line color
# is replaced; "Perfect Ranking" (black) has no entry here and is left as is.
translucent_line_colors = {
    "lightgrey": 'rgba(211,211,211,0.4)',
    "green": 'rgba(0,128,0,0.5)',
    "red": 'rgba(255,0,0,0.5)',
}
fig.for_each_trace(
    lambda trace: trace.update(line=dict(color=translucent_line_colors[color_map[trace.name]]))
    if color_map.get(trace.name) in translucent_line_colors else None
)

# Show figure
fig.show()