In [17]:
import numpy as np
import pandas as pd
from utils import get_data, get_preds, get_table, plot_rmse, other_stats, add_intervals_to_test, plot_interval_check, plot_interval_sizes, plot_finish_groups
# Seed NumPy's global RNG before any data is drawn.
# NOTE(review): presumably get_data samples through NumPy's global state, which
# is what makes the subsets repeatable -- confirm in utils.
np.random.seed(2024)

# Settings shared by every race.
size = 500                            # used as both size_train and size_test
save_val = False                      # forwarded to get_data(save=...)
train_yr, test_yr = [2022], [2023]    # year lists for the train / test splits

# One get_data call per race, issued in the order bos -> nyc -> chi so the
# sequence of calls after seeding stays the same.
data = {
    code: tuple(
        get_data(racename=code, size_train=size, size_test=size,
                 train_lis=train_yr, test_lis=test_yr, save=save_val)
    )
    for code in ("bos", "nyc", "chi")
}

# Per-race names are still bound individually for cells that use them directly.
(train_bos, test_bos), (train_nyc, test_nyc), (train_chi, test_chi) = (
    data["bos"], data["nyc"], data["chi"]
)
# test_nyc = pd.read_csv("processed_data/test_nyc.csv")

In [18]:
# Race analysed in this cell; must be one of the keys of `data`.
race = "nyc"
train, test = data[race]

# One tuple per fitted model, in the order:
#   (label, parameter CSV path, feature columns, coefficient columns, propleft flag)
# Commented-out entries are alternative specifications that are currently disabled.
model_info = [
    # ("rstan2d", "stan_results/params_bos0d.csv", ["total_pace", "curr_pace", "prop"], ["beta[1]", "beta[2]", "beta[3]"], True),
    ("model1", f"stan_results/model1/params_{race}.csv", ["total_pace", "prop"], ["beta.1", "beta.2"], True),
    ("model2", f"stan_results/model2/params_{race}.csv", ["total_pace", "curr_pace", "prop"], ["beta.1", "beta.2", "beta.3"], True),
    # ("model3", f"stan_results/model3/params_{race}.csv", ['total_pace', 'prop', 'male', 'age', 'malexage'], ['beta.1', 'beta.2', 'beta.3', 'beta.4', 'beta.5'], True),
]

# models = {name: pd.read_csv(path) for (name, path, feats, betas, pleft) in model_info}
# Read each model's parameter file and pass it to get_preds together with the
# test split; results are keyed by model label.
mpreds = dict(
    (
        label,
        get_preds(test, pd.read_csv(csv_path), feats_lis=cols, beta_lis=coefs,
                  propleft=flag, name=label),
    )
    for (label, csv_path, cols, coefs, flag) in model_info
)

# Combine the test split with the per-model predictions and display the result.
test2 = get_table(test, mpreds)
test2

Unnamed: 0,id,dist,curr_pace,total_pace,finish,age,gender,year,prop,propleft,male,propxcurr,malexage,extrap,model1,model2
0,95478,5K,2.626050,2.626050,2.602541,47,M,2023,0.118497,0.881503,1,0.311180,47,-2.419067,47.061604,46.826006
1,99637,5K,2.790179,2.790179,2.483520,38,M,2023,0.118497,0.881503,1,0.330629,38,-31.121867,12.576561,12.237279
2,114605,5K,2.630195,2.630195,2.009094,30,M,2023,0.118497,0.881503,1,0.311671,30,-82.657683,-33.337344,-33.575975
3,84118,5K,3.615329,3.615329,2.975250,28,W,2023,0.118497,0.881503,0,0.428407,0,-41.847717,-15.866232,-16.430386
4,75343,5K,3.543586,3.543586,3.506898,29,W,2023,0.118497,0.881503,0,0.419906,0,-2.076183,24.955027,24.399179
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,108921,40K,2.067825,2.218525,2.215891,50,W,2023,0.947980,0.052020,0,1.960256,0,-0.376729,0.528712,-0.600634
3996,112954,40K,1.795977,2.098746,2.075606,60,W,2023,0.947980,0.052020,0,1.702550,0,-3.735623,-3.043349,-2.595987
3997,98566,40K,1.774938,2.536622,2.512355,40,M,2023,0.947980,0.052020,1,1.682605,40,-2.677935,-1.403344,1.913742
3998,72006,40K,3.620565,4.218074,4.164117,38,M,2023,0.947980,0.052020,1,3.432222,38,-2.160340,-0.581490,-0.260908


In [19]:
# s_data =pd.DataFrame([[4, 0.2, .5], [3, 0.5, .75]], columns=["total_pace", "prop", "propleft"])
# get_preds(s_data, models['rstan2c'], feats_lis = ["total_pace", "prop"], beta_lis = ["beta[1]", "beta[2]"], full=True)

In [20]:
# Columns of `test2` that are scored against each other. "extrap" is the
# reference column every other label is compared with below.
labels = ["model1", "model2", "extrap"]
baseline_lbl = "extrap"

# The bar variant is called only for the figure it saves; its return value is
# not kept because the table used below comes from the line variant.
plot_rmse(test2, labels, save_name=f"{race}", bar=True)
rmse_table = plot_rmse(test2, labels, save_name=f"{race}", bar=False)
plot_finish_groups(test2, label_pair=["extrap", "model2"], num=4, overall=True, save_name=race, palette="inferno")

# Relative improvement over the baseline: 1 - value(model) / value(baseline).
# The baseline is excluded by name rather than by its position in `labels`, so
# reordering or extending `labels` cannot produce a bogus "pcnt_extrap" column
# or silently skip a model.
for lbl in [candidate for candidate in labels if candidate != baseline_lbl]:
    rmse_table[f"pcnt_{lbl}"] = 1 - (rmse_table[lbl] / rmse_table[baseline_lbl])

rmse_table

File saved: analysis/nyc_rmse_bar.png
File saved: analysis/nyc_rmse_line.png
File saved: analysis/nyc_rmse_groups.png


Unnamed: 0_level_0,extrap,model1,model2,pcnt_model1,pcnt_model2
dist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5K,35.478472,33.651218,33.678365,0.051503,0.050738
10K,34.168541,26.113734,25.87338,0.235738,0.242772
15K,31.941435,21.485546,21.189766,0.327346,0.336606
20K,27.321324,18.082037,17.842246,0.338171,0.346948
25K,19.868263,14.033234,13.771944,0.293686,0.306837
30K,14.597126,9.840593,9.222936,0.325854,0.368168
35K,7.689932,5.86695,5.336842,0.237061,0.305996
40K,2.033884,1.922466,1.702436,0.054781,0.162963


In [21]:
# group.plot(label=table_group.columns, style=styles, linewidth=2, grid=True, alpha=0.8, color=colors)
# group.swaplevel(0,1).stack(0)#.loc["F"]

In [22]:
# Summary statistics of every prediction column in `labels` against the
# observed "finish" column.
# NOTE(review): the meaning of the two returned rows is defined inside
# utils.other_stats and is not visible here -- confirm before quoting them.
stat_preds = test2[labels]
stat_actual = test2["finish"]
other_stats(stat_preds, stat_actual)

Unnamed: 0,model1,model2,extrap
0,19.153349,18.964194,24.657699
1,0.898999,0.900984,0.832605


In [23]:
# Same model specifications as `model_info`, but get_preds is called with
# full=True and its result is used as the divisor of the constant 42195 / 60.
# NOTE(review): 42195 looks like the marathon distance in metres, so dividing
# 42195 / 60 by a speed in m/s would give a finish time in minutes -- confirm
# the units returned by get_preds.
mpreds2 = dict(
    (
        label,
        (42195 / 60) / get_preds(test, pd.read_csv(csv_path), feats_lis=cols, beta_lis=coefs,
                                 propleft=flag, name=label, full=True),
    )
    for (label, csv_path, cols, coefs, flag) in model_info
)
# model_preds2["new1"] = (42195 / 60) / get_preds(test, pd.read_csv("stan_results/params_bos1.csv"), feats_lis=['total_pace', 'curr_pace', 'prop'], beta_lis=['beta.1', 'beta.2', 'beta.3'], propleft=True, full=True)

In [24]:
# Interval diagnostics use only the first two entries of `labels`.
pred_names = labels[:2]

# Attach the intervals derived from `mpreds2` to the prediction table.
intervals_tbl = add_intervals_to_test(test2, mpreds2, pred_names)

# Both plotting helpers take the same arguments; they run in the order
# check -> sizes and each result is bound to its own name.
i_check, i_sizes = (
    plot_fn(intervals_tbl, pred_names, save_name=f"{race}")
    for plot_fn in (plot_interval_check, plot_interval_sizes)
)

File saved: analysis/nyc_interval_check.png
File saved: analysis/nyc_interval_sizes.png
