In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from utils import get_data, save_data, get_table, get_predictions, add_intervals_to_test, all_tests, save_param_values, get_race_info
from plots import plot_error, plot_finish_groups, plot_interval_checks, plot_finish_age_gender
# Seed NumPy's global random number generator before any data is drawn.
np.random.seed(2025)

# Run configuration: race key, sample sizes, and the train/test year split.
race = "bos"
size1 = 2000
size2 = 20000
train_yr = [2021, 2022, 2023]
test_yr = [2024]

# Disabled one-off step (presumably rebuilds the saved samples for every race; confirm in utils):
# save_data(race_list=["bos", "nyc", "chi"], size_train=size1, size_test=size2, train_lis=train_yr, test_lis=test_yr)
train, test = get_data(
    racename=race,
    size_train=size1,
    size_test=size2,
    train_lis=train_yr,
    test_lis=test_yr,
    save=False,
)
params = save_param_values(race)

# Bare last expression so the returned race table is rendered as the cell output.
get_race_info()

Unnamed: 0,Name,Age,M/F,5K,10K,15K,20K,HALF,25K,30K,35K,40K,Finish Net,Year,Marathon
0,"Kipruto, Benson",30,M,929,1870,2805,3773,3982,4696,5637,6566,7411,7791,2021,Boston
1,"Berhanu, Lemi",27,M,931,1872,2806,3774,3983,4697,5639,6566,7448,7837,2021,Boston
2,"Yimer, Jemal",38,M,928,1869,2804,3772,3981,4695,5638,6565,7448,7838,2021,Boston
3,"Ayana, Tsedat",25,M,930,1872,2805,3773,3982,4696,5638,6565,7448,7847,2021,Boston
4,"Barsoton, Leonard",26,M,931,1871,2805,3772,3982,4696,5638,6566,7448,7871,2021,Boston
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167472,"Zarate, Laura (USA)",59,W,3997,7994,11991,15988,16865,19985,23981,27978,31975,33730,2024,Chicago
167473,"Kalp, Celena (USA)",54,W,3998,7995,11992,15989,16866,19986,23983,27981,31978,33733,2024,Chicago
167474,"Jimenez, Sandra (MEX)",49,W,3329,7571,11813,16054,16985,20296,24538,28780,33022,34884,2024,Chicago
167475,"Barton, Tom (USA)",69,M,7997,11226,14945,18663,19479,22382,26101,29820,33538,35171,2024,Chicago


In [2]:
# Disabled sanity check: load train/test with size_train=None / size_test=None
# (presumably the full, unsampled sets; confirm in utils.get_data) and compare each
# subsample against its full counterpart with two-sample KS and Anderson-Darling tests.
# train_full, test_full = get_data(racename=race, size_train=None, size_test=None, train_lis=train_yr, test_lis=test_yr, save=False)
# print(stats.ks_2samp(train["finish"], train_full["finish"]).pvalue)
# print(stats.anderson_ksamp([train["finish"], train_full["finish"]]).pvalue)
# print(stats.ks_2samp(test["finish"], test_full["finish"]).pvalue)
# print(stats.anderson_ksamp([test["finish"], test_full["finish"]]).pvalue)

# Same KS comparison on the test set, column by column (doubly commented out in the original).
# # print(stats.ks_2samp(test["finish"], test_full["finish"]).pvalue)
# # print(stats.ks_2samp(test["curr_pace"], test_full["curr_pace"]).pvalue)
# # print(stats.ks_2samp(test["total_pace"], test_full["total_pace"]).pvalue)
# # print(stats.ks_2samp(test["age"], test_full["age"]).pvalue)
# # print(stats.ks_2samp(test["male"], test_full["male"]).pvalue)

In [2]:
# One entry per fitted model: (label, path to its parameter CSV, feature names it uses).
model_info = [
    ("M1", f"stan_results/model1/params_{race}.csv", ["alpha", "total_pace"]),
    ("M2", f"stan_results/model2/params_{race}.csv", ["alpha", "total_pace", "curr_pace"]),
    ("M3", f"stan_results/model3/params_{race}.csv", ["alpha", "total_pace", "curr_pace", "male", "age"]),
]

# Predictions on the test set, keyed by model label.
mpreds = {
    label: get_predictions(test, params_path, feats_lis=feature_cols, full=False)
    for label, params_path, feature_cols in model_info
}

models = ["M1", "M2", "M3"]
baseline = "BL"

# Combine the test rows with the baseline and model columns, then display the result.
test2 = get_table(test, mpreds, baseline_name=baseline)
test2

Unnamed: 0,id,dist,curr_pace,total_pace,finish,age,gender,year,male,malexage,alpha,lvl,BL,M1,M2,M3
11773,77411,5K,3.238342,3.238342,3.201199,46,F,2024,0,0,1,1,-2.519733,12.056735,12.056501,13.326336
14516,80154,5K,3.649635,3.649635,3.030162,56,F,2024,0,0,1,1,-39.392833,-28.381198,-28.384137,-25.420826
4385,70023,5K,4.111842,4.111842,3.720571,36,M,2024,1,36,1,1,-17.986267,-9.696028,-9.700707,-10.102270
17258,82896,5K,3.373819,3.373819,2.833591,49,M,2024,1,49,1,1,-39.740033,-26.494270,-26.495560,-26.511159
23673,89311,5K,2.554931,2.554931,2.180507,30,F,2024,0,0,1,1,-47.264617,-22.059761,-22.050185,-26.712398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190181,78985,40K,2.645503,3.111872,3.103030,37,F,2024,0,0,1,8,-0.643946,0.107741,1.362726,1.237928
192506,81310,40K,2.862049,2.957705,2.954004,58,F,2024,0,0,1,8,-0.297842,0.500792,0.055633,0.121552
186340,75144,40K,2.930832,3.383808,3.345889,37,M,2024,1,37,1,8,-2.355377,-1.674565,-0.649451,-0.669213
190297,79101,40K,3.158560,3.088326,3.096199,57,M,2024,1,57,1,8,0.579017,1.337522,0.197157,0.348782


In [3]:
# Pairwise statistical tests between the baseline and model columns of test2
# (the printed table has columns KS, Wilcoxon, CVM, AD).
# A single call with all six pairs builds the table and writes the CSV exactly once.
# NOTE: the AD column comes from scipy.stats.anderson_ksamp (see the warning output);
# per the SciPy docs its default p-value is floored at 0.001 and capped at 0.25, so
# 0.001 / 0.250 in that column are bounds rather than exact p-values.
tbl2 = all_tests(
    test2,
    [["BL", "M1"], ["BL", "M2"], ["BL", "M3"], ["M1", "M2"], ["M1", "M3"], ["M2", "M3"]],
    savename=f"analysis/tables/{race}_stattest.csv",
)
print(tbl2)

# Error summary per checkpoint for every model against the baseline; the returned
# table is displayed at the end of the cell.
tbl = plot_error(test2, models, baseline, save_name=race, bar=True, other=True)

# Model used for the grouped error plots below.
c_model = "M2"
a = plot_finish_groups(test2, model=c_model, baseline=baseline, save_name=race)
plot_finish_age_gender(test2, model=c_model, baseline=baseline, save_name=race, grouping="age", bins=[0, 30, 40, 50])
tbl

  value = stats.anderson_ksamp([arr1, arr2]).pvalue
  value = stats.anderson_ksamp([arr1, arr2]).pvalue


            KS Wilcoxon      CVM     AD
BL-M1  <0.0001  <0.0001  <0.0001  0.001
BL-M2  <0.0001  <0.0001  <0.0001  0.001
BL-M3  <0.0001  <0.0001  <0.0001  0.001
M1-M2  <0.0001  <0.0001  <0.0001  0.001
M1-M3  <0.0001  <0.0001  <0.0001  0.001
M2-M3   0.1925   0.1045    0.181  0.250
File saved: analysis/plots/bos_error_bar.png
File saved: analysis/tables/bos_error.csv
File saved: analysis/plots/bos_error_groups.png
File saved: analysis/plots/bos_error_gender_age.png


Unnamed: 0,BL,M1,M2,M3,pcnt_M1,pcnt_M2,pcnt_M3
5K,24.431269,15.391344,15.39154,15.408137,0.370015,0.370007,0.369327
10K,22.890636,14.078874,13.046438,12.940036,0.38495,0.430053,0.434702
15K,20.833577,12.073174,10.878845,10.820338,0.420494,0.477822,0.48063
20K,18.622429,10.731013,9.363291,9.214942,0.423759,0.497204,0.50517
25K,14.777327,8.37763,6.939267,6.808557,0.433075,0.530411,0.539257
30K,10.392143,6.508241,4.445924,4.411519,0.373734,0.572184,0.575495
35K,5.2298,3.758383,2.573513,2.65226,0.281352,0.507914,0.492856
40K,1.252623,1.041888,0.765009,0.744646,0.168235,0.389274,0.405531
Overall MAE,14.828,9.003,7.929,7.878,,,
Overall $R^2$,0.787,0.897,0.903,0.904,,,


In [15]:
# Disabled interval analysis: full=True predictions are transformed, attached to test2
# as intervals, and checked/plotted per model.
# NOTE(review): (42195 / 60) / x presumably converts a pace in m/s into a marathon
# finish time in minutes (42195 m is the marathon distance); confirm units in utils.
# mpreds2 = {name: (42195 / 60) / get_predictions(test, path, feats_lis=feats, full=True) for (name, path, feats) in model_info}
# intervals_tbl = add_intervals_to_test(test2, mpreds2, models)
# i_check, i_sizes = plot_interval_checks(intervals_tbl, models, save_name=race)

In [None]:
# Disabled: summary statistics (mean, std, quartiles) of the columns in the model 3
# parameter CSV, rounded to 4 decimals. pandas is already imported in the first cell,
# so the import below is redundant if this is re-enabled.
# import pandas as pd
# params = pd.read_csv(f"stan_results/model3/params_{race}.csv", index_col="Unnamed: 0")
# params.describe().T[["mean", "std", "25%", "50%", "75%"]].round(4)