In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from utils import get_data, save_data, get_table, get_predictions, other_stats, add_intervals_to_test, all_tests
from plots import plot_error, plot_finish_groups, plot_interval_checks, plot_finish_age_gender
# Seed NumPy's global RNG before any data is drawn.
# NOTE(review): presumably get_data's subsampling relies on this seed — confirm in utils.
np.random.seed(2025)

# Run configuration: race code, train/test sample sizes, and the year split.
race = "chi"
size1 = 2000
size2 = 20000
train_yr = [2021, 2022, 2023]
test_yr = [2024]

# Disabled call to save_data covering all three races, kept for reference:
# save_data(race_list=["bos", "nyc", "chi"], size_train=size1, size_test=size2, train_lis=train_yr, test_lis=test_yr)

# Train/test frames for the selected race, fetched with save=False.
train, test = get_data(
    racename=race,
    size_train=size1,
    size_test=size2,
    train_lis=train_yr,
    test_lis=test_yr,
    save=False,
)

In [2]:
# Fetch the data a second time with size_test=None, then compare the "finish"
# column of the earlier `test` frame against this one with a two-sample
# Kolmogorov-Smirnov test.
# NOTE(review): size_test=None presumably means "no subsampling of the test
# set" — confirm in utils.get_data.
train_full, test_full = get_data(
    racename=race,
    size_train=size1,
    size_test=None,
    train_lis=train_yr,
    test_lis=test_yr,
    save=False,
)
ks_result = stats.ks_2samp(test["finish"], test_full["finish"])
ks_result.pvalue

0.9585246815750921

In [3]:
# One entry per fitted model: (label, parameter CSV for this race, feature list).
model_info = [
    ("M1", f"stan_results/model1/params_{race}.csv", ["alpha", "total_pace"]),
    ("M2", f"stan_results/model2/params_{race}.csv", ["alpha", "total_pace", "curr_pace"]),
    ("M3", f"stan_results/model3/params_{race}.csv", ["alpha", "total_pace", "curr_pace", "male", "age"]),
]

# Predictions on the test frame for every model, keyed by model label.
mpreds = {
    label: get_predictions(test, params_csv, feats_lis=feature_names, full=False)
    for label, params_csv, feature_names in model_info
}

# Labels used by the downstream cells; `models` follows the order of model_info.
baseline = "BL"
models = [label for label, _, _ in model_info]

# Table built by get_table from the test frame and the predictions; displayed below.
test2 = get_table(test, mpreds, baseline_name=baseline)
test2

Unnamed: 0,id,dist,curr_pace,total_pace,finish,age,gender,year,male,malexage,alpha,lvl,BL,M1,M2,M3
23464,138676,5K,2.981515,2.981515,2.907794,39,M,2024,1,39,1,1,-5.979950,8.912917,8.907331,11.792266
41746,156958,5K,2.364066,2.364066,2.264895,44,W,2024,0,0,1,1,-13.025250,14.142419,14.143349,10.379038
8633,123845,5K,3.750938,3.750938,3.492385,49,M,2024,1,49,1,1,-13.880217,-6.005302,-6.013336,-4.465483
3865,119077,5K,3.918495,3.918495,3.835909,39,M,2024,1,39,1,1,-3.863933,3.060184,3.051949,3.929655
26977,142189,5K,2.932551,2.932551,2.773250,34,W,2024,0,0,1,1,-13.775083,1.788097,1.782809,-2.063114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388017,137374,40K,2.369668,2.952030,2.949049,59,M,2024,1,59,1,8,-0.240729,0.361903,1.326438,1.547082
417494,166851,40K,1.319958,1.684636,1.660763,24,W,2024,0,0,1,8,-6.000800,-4.878744,-4.014440,-4.846559
390499,139856,40K,2.530364,2.865124,2.860290,59,W,2024,0,0,1,8,-0.414835,0.207642,0.415563,0.464957
370866,120223,40K,3.548616,3.750938,3.727473,29,M,2024,1,29,1,8,-1.180217,-0.714335,-0.842911,-0.862899


In [4]:
# Statistical tests between consecutive predictors (baseline -> M1 -> M2 -> M3);
# the result is written to the race-specific CSV and printed.
comparison_pairs = [["BL", "M1"], ["M1", "M2"], ["M2", "M3"]]
stattest_csv = f"analysis/tables/{race}_stattest.csv"
tbl2 = all_tests(test2, comparison_pairs, savename=stattest_csv)
print(tbl2)

# Error plot/tables across all models and the baseline.
tbl, other_tbl = plot_error(
    test2,
    models,
    baseline,
    save_name=race,
    bar=True,
    other=True,
)

# Group-level plots for a single chosen model against the baseline.
c_model = "M2"
shared_kwargs = dict(model=c_model, baseline=baseline, save_name=race)
a = plot_finish_groups(test2, **shared_kwargs)
plot_finish_age_gender(test2, **shared_kwargs, grouping="age")

# Show the second table returned by plot_error as the cell output.
other_tbl

  value = stats.anderson_ksamp([arr1, arr2]).pvalue


            KS Wilcoxon      CVM      AD
BL-M1  <0.0001  <0.0001  <0.0001  0.0010
M1-M2  <0.0001  <0.0001  <0.0001  0.0010
M2-M3   0.0189  <0.0001   0.0366  0.0573
File saved: analysis/plots/chi_error_bar.png
File saved: analysis/tables/chi_error.csv
File saved: analysis/tables/chi_error2.csv
File saved: analysis/plots/chi_error_groups.png
File saved: analysis/plots/chi_error_gender_age.png


Unnamed: 0,BL,M1,M2,M3,pcnt_BL,pcnt_M1,pcnt_M2,pcnt_M3
OVRL MAE,12.60397,9.467745,8.551597,8.422857,-,-,-,-
OVRL R^2,0.887863,0.935675,0.941011,0.94129,-,-,-,-


In [5]:
# Factor applied to the full=True predictions: scale / prediction.
# NOTE(review): 42195 / 60 looks like the marathon distance in metres divided by
# 60, i.e. converting a pace in m/s into a finish time in minutes — confirm units.
finish_scale = 42195 / 60

# Rescaled full=True predictions for each model, keyed by model label.
mpreds2 = {
    label: finish_scale / get_predictions(test, params_csv, feats_lis=feature_names, full=True)
    for label, params_csv, feature_names in model_info
}

# Attach intervals to the test table, then plot/check them per model.
intervals_tbl = add_intervals_to_test(test2, mpreds2, models)
i_check, i_sizes = plot_interval_checks(intervals_tbl, models, save_name=race)

File saved: analysis/plots/chi_intervals
analysis/tables/chi_intsizes.csv
analysis/tables/chi_intcheck.csv


In [6]:
# Build a table of the mean and standard deviation of the model-2 parameter
# columns and write it to CSV. Rows are the positions 0..7 after the index is
# reset; columns are <parameter>_mean / <parameter>_std.
rnd = 4  # decimal places kept in the exported table
n_levels = 8  # column names in the parameter CSV are indexed 1..8

# NOTE(review): the race is hard-coded to "bos" in both paths below, while the
# rest of the notebook is driven by `race` — confirm this is intentional.
d = pd.read_csv("stan_results/model2/params_bos.csv", index_col="Unnamed: 0")

# Describe once and reuse the result for both statistics.
summary = d.describe().T
params1 = summary["mean"]
params2 = summary["std"]

# Output-column prefix -> template of the matching column names in the CSV.
# The third beta index follows the order of model 2's feature list
# (alpha, total_pace, curr_pace).
column_templates = {
    "alpha": "beta.{}.1",
    "total_pace": "beta.{}.2",
    "curr_pace": "beta.{}.3",
    "sigma": "sigma.{}",
}

param_lists = []
for label, template in column_templates.items():
    cols = [template.format(i) for i in range(1, n_levels + 1)]
    # Mean first, then std, so columns come out as <label>_mean, <label>_std.
    for stat_name, stat_values in (("mean", params1), ("std", params2)):
        param_lists.append(
            stat_values[cols]
            .round(rnd)
            .reset_index(drop=True)  # drop the column-name labels so the series align by position
            .rename(f"{label}_{stat_name}")
        )

pd.concat(param_lists, axis=1).to_csv("analysis/tables/params_bos.csv")