In [1]:
import os
import wandb
import numpy as np
from scipy.stats import ttest_ind

In [2]:
# Experiment grid. The first three lists are matched against the W&B run
# config keys `type_a`, `feature_type` and `model`; METRIC holds the run
# summary keys that are collected for every matching run.
TYPE_A = [
    "early",
    "late",
]
FEATURE_TYPES = [
    "coordinates",
    "angles",
    "both",
]
MODELS = [
    "RandomForestClassifier",
    "LSTMModel",
    "CNN1D",
]
METRIC = [
    "Test/Test AUROC",
    "Test/Test AUPR",
    "Test/Test accuracy",
]

In [3]:
def get_runs_iterations_dict(wandb_api: wandb.Api, project_str: str, name_filter: str = "") -> dict:
    """
    Gather summary metrics of W&B runs, grouped by experiment configuration.

    For every combination of TYPE_A x FEATURE_TYPES x MODELS the project is
    queried for runs whose config matches that combination, and each value
    listed in METRIC is read from the run summary.

    Args:
        wandb_api (wandb.Api): API client used to query the runs.
        project_str (str): Project path passed to `wandb_api.runs`.
        name_filter (str): If non-empty, only runs whose name contains this
            substring are considered.

    Returns:
        dict: Nested mapping type_a -> feature_type -> model -> metric ->
            list of summary values (one entry per run that reported the metric).
    """
    results = {}

    for type_a in TYPE_A:
        by_feature = {}
        results[type_a] = by_feature
        for feature_type in FEATURE_TYPES:
            by_model = {}
            by_feature[feature_type] = by_model
            for model in MODELS:
                # One (initially empty) list of values per tracked metric.
                collected = {metric: [] for metric in METRIC}
                by_model[model] = collected

                query = {
                    "config.type_a": type_a,
                    "config.feature_type": feature_type,
                    "config.model": model,
                }
                for run in wandb_api.runs(path=project_str, filters=query):
                    # Skip runs whose name does not contain the requested substring.
                    if name_filter and name_filter not in run.name:
                        continue
                    summary = run.summary
                    for metric in METRIC:
                        if metric not in summary:
                            # Report the gap but keep collecting the other metrics.
                            print(f"Metric {metric} not found in run {run.id}")
                            continue
                        collected[metric].append(summary[metric])

    return results

In [4]:
def do_p_test(group1, group2, equal_var: bool = True) -> tuple:
    """
    Perform a two-sided, two-sample t-test between two independent groups.

    Args:
        group1 (list or np.array): First group of data.
        group2 (list or np.array): Second group of data.
        equal_var (bool): Forwarded to `scipy.stats.ttest_ind`. True (the
            default, and the previous hard-coded behaviour) runs Student's
            t-test, which assumes equal population variances; False runs
            Welch's t-test, which does not make that assumption.

    Returns:
        tuple: t-statistic and p-value.
    """
    result = ttest_ind(group1, group2, equal_var=equal_var)
    return result.statistic, result.pvalue

# Example data
# group1 = np.random.normal(0.81055, 0.11545, 90)
# group2 = np.random.normal(0.5076, 0.19, 30)

# # Two-sample t-test
# t_stat, p_val = ttest_ind(group1, group2)
# print(f"Two-sample t-test: t={t_stat}, p={p_val}")

In [None]:
# Get results for label and track
# Point HTTP_PROXY at a SOCKS5 proxy on localhost:10080 before logging in
# (presumably a tunnel needed to reach the self-hosted W&B server below - confirm).
os.environ['HTTP_PROXY'] = "socks5h://localhost:10080"
# NOTE(review): the API key is assigned inline; consider supplying it from
# outside the notebook so it never ends up in version control.
os.environ['WANDB_API_KEY'] = "<LOCAL_WANDB_API_KEY>"
wandb_host = "http://wandb-vogtlab.leomed.ethz.ch:1337"
wandb.login(host=wandb_host)
api = wandb.Api()
# No name filter is passed, so every run matching the config grid is collected.
label_and_track_iterations = get_runs_iterations_dict(api, "kchincheong/GMA Results Tuned 2")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkcc[0m ([33mkchincheong[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Get Aggpose results
import os
import wandb
# Drop the proxy configured for the self-hosted server before talking to wandb.ai.
# `os.environ` is a mapping and has no public `unsetenv` method (calling it raises
# AttributeError on Python >= 3.9); `pop` with a default removes the variable
# from both the mapping and the process environment, and is a no-op if the
# variable was never set.
os.environ.pop('HTTP_PROXY', None)
# NOTE(review): the API key is assigned inline; consider supplying it from
# outside the notebook so it never ends up in version control.
os.environ['WANDB_API_KEY'] = "<WANDB_API_KEY>"
wandb.login(host="https://api.wandb.ai")
api = wandb.Api()
# Only runs whose name contains "best_params" are collected.
aggpose_iterations = get_runs_iterations_dict(api, "dachopard/GMA Project", "best_params")

In [7]:
# Calculate table values
# For every (type_a, feature_type, model, metric) combination, compare the
# label-and-track runs against the AggPose runs and store:
#   'p-value': p-value of the two-sample t-test between the two groups
#   'diff':    mean(label-and-track) - mean(AggPose)
table_value_dict = {}

for type_a in TYPE_A:
    table_value_dict[type_a] = {}
    for feature_type in FEATURE_TYPES:
        table_value_dict[type_a][feature_type] = {}
        for model in MODELS:
            table_value_dict[type_a][feature_type][model] = {}

            for metric in METRIC:
                label_and_track_values = label_and_track_iterations[type_a][feature_type][model][metric]
                aggpose_values = aggpose_iterations[type_a][feature_type][model][metric]

                if len(label_and_track_values) > 0 and len(aggpose_values) > 0:
                    t_stat, p_val = do_p_test(label_and_track_values, aggpose_values)
                    diff = np.mean(label_and_track_values) - np.mean(aggpose_values)
                else:
                    print(f"No data for {type_a}, {feature_type}, {model}, {metric}. Skipping p-test.")
                    # Store NaN instead of leaving the entry empty: the table
                    # printing cells index 'diff' and 'p-value' unconditionally
                    # and would otherwise fail with a KeyError. NaN compares
                    # False against the 0.05 threshold, so such cells are
                    # printed as "nan" and never marked significant.
                    p_val = np.nan
                    diff = np.nan

                table_value_dict[type_a][feature_type][model][metric] = {
                    'p-value': p_val,
                    'diff': diff,
                }


In [8]:
# Print difference and p-value table rows (latex)
feature_type_map = {
    "coordinates": "Coord.",
    "angles": "Angles",
    "both": "Both",
}

# Both sections share the same layout; only the displayed value differs.
for section_title, value_key in (("Differences", "diff"), ("p-values", "p-value")):
    if value_key == "p-value":
        # Blank line separating the two sections.
        print("")
    print(section_title)
    for metric in METRIC:
        print(metric)
        for feature_type in FEATURE_TYPES:
            cells = [feature_type_map[feature_type]]
            for type_a in TYPE_A:
                for model in MODELS:
                    entry = table_value_dict[type_a][feature_type][model][metric]
                    formatted = f"{entry[value_key]:.4f}"
                    # Bold marks statistically significant results (p < 0.05).
                    if entry['p-value'] < 0.05:
                        formatted = f"\\textbf{{{formatted}}}"
                    cells.append(f"${formatted}$")

            print("& " + " & ".join(cells) + " \\\\")
        print("")

Differences
Test/Test AUROC
& Coord. & $\textbf{0.1005}$ & $\textbf{0.1226}$ & $\textbf{0.2578}$ & $0.0463$ & $-0.0889$ & $0.0071$ \\
& Angles & $\textbf{0.2641}$ & $0.0075$ & $\textbf{0.2546}$ & $\textbf{0.1511}$ & $\textbf{-0.1146}$ & $\textbf{-0.1625}$ \\
& Both & $\textbf{0.2122}$ & $\textbf{0.1091}$ & $\textbf{0.2383}$ & $0.0735$ & $\textbf{-0.1737}$ & $0.0342$ \\

Test/Test AUPR
& Coord. & $\textbf{0.0890}$ & $\textbf{0.0954}$ & $\textbf{0.1865}$ & $-0.0452$ & $\textbf{-0.1412}$ & $0.0008$ \\
& Angles & $\textbf{0.1623}$ & $-0.0290$ & $\textbf{0.1500}$ & $0.0878$ & $\textbf{-0.1529}$ & $\textbf{-0.2629}$ \\
& Both & $\textbf{0.1587}$ & $\textbf{0.0857}$ & $\textbf{0.1591}$ & $-0.0005$ & $\textbf{-0.2397}$ & $-0.0207$ \\

Test/Test accuracy
& Coord. & $0.0671$ & $0.0540$ & $\textbf{0.1980}$ & $\textbf{0.0939}$ & $-0.0122$ & $-0.0277$ \\
& Angles & $\textbf{0.1912}$ & $-0.0235$ & $\textbf{0.2000}$ & $\textbf{0.0504}$ & $\textbf{-0.0693}$ & $\textbf{-0.0876}$ \\
& Both & $\textbf{0.

In [21]:
# Print difference and p-value table rows (markdown)
feature_type_map = {
    "coordinates": "Coord.",
    "angles": "Angles",
    "both": "Both",
}

# Both sections share the same layout; only the displayed value differs.
for section_title, value_key in (("Differences", "diff"), ("p-values", "p-value")):
    if value_key == "p-value":
        # Blank line separating the two sections.
        print("")
    print(section_title)
    for metric in METRIC:
        print(metric)
        for feature_type in FEATURE_TYPES:
            row = f"| | {feature_type_map[feature_type]} | "
            for type_a in TYPE_A:
                # All model values of one type_a share a single table cell.
                texts = []
                for model in MODELS:
                    entry = table_value_dict[type_a][feature_type][model][metric]
                    text = f"{entry[value_key]:.4f}"
                    # Bold marks statistically significant results (p < 0.05).
                    if entry['p-value'] < 0.05:
                        text = f"**{text}**"
                    texts.append(f" {text}")
                row += "".join(texts) + " |"

            print(row)
        print("")

Differences
Test/Test AUROC
| | Coord. |  **0.1005** **0.1226** **0.2578** | 0.0463 -0.0889 0.0071 |
| | Angles |  **0.2641** 0.0075 **0.2546** | **0.1511** **-0.1146** **-0.1625** |
| | Both |  **0.2122** **0.1091** **0.2383** | 0.0735 **-0.1737** 0.0342 |

Test/Test AUPR
| | Coord. |  **0.0890** **0.0954** **0.1865** | -0.0452 **-0.1412** 0.0008 |
| | Angles |  **0.1623** -0.0290 **0.1500** | 0.0878 **-0.1529** **-0.2629** |
| | Both |  **0.1587** **0.0857** **0.1591** | -0.0005 **-0.2397** -0.0207 |

Test/Test accuracy
| | Coord. |  0.0671 0.0540 **0.1980** | **0.0939** -0.0122 -0.0277 |
| | Angles |  **0.1912** -0.0235 **0.2000** | **0.0504** **-0.0693** **-0.0876** |
| | Both |  **0.1271** **0.0987** **0.1863** | 0.0426 0.0143 0.0098 |


p-values
Test/Test AUROC
| | Coord. |  **0.0213** **0.0016** **0.0000** | 0.3859 0.0908 0.8785 |
| | Angles |  **0.0000** 0.8431 **0.0000** | **0.0027** **0.0192** **0.0030** |
| | Both |  **0.0000** **0.0069** **0.0000** | 0.1381 **0.0004** 0.518