In [1]:
import pandas as pd
import numpy as np
from scipy.stats import lognorm
import shared

# Selects which relay's files are processed: the value is interpolated into every
# data file name below and passed on to the `shared` helpers.
# Presumably "ve" = Venla and "ju" = Jukola — TODO confirm.
ve_or_ju = "ve"

In [2]:
# Load the running order for the selected relay; the file is tab-separated.
running_order = pd.read_csv(
    f"data/running_order_j2019_{ve_or_ju}.tsv",
    sep="\t",
)

In [3]:
# Duplicate the leg number under the name leg_nro.
running_order["leg_nro"] = running_order["leg"]
# Keep the name exactly as read from the file. The guard makes the cell safe to
# re-run: without it a second run would overwrite orig_name with the already
# lower-cased name and the original casing would be lost.
if "orig_name" not in running_order.columns:
    running_order["orig_name"] = running_order["name"]
# Lower-cased name; always derived from the preserved original.
running_order["name"] = running_order["orig_name"].str.lower()

In [4]:
# Preview the first rows to check that the file parsed and the derived
# leg_nro / orig_name / lower-cased name columns are present.
running_order.head()

Unnamed: 0,team_id,team,team_base_name,team_country,leg,leg_dist,name,leg_nro,orig_name
0,8,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering,SWE,1,6.0,fiona bunn,1,Fiona Bunn
1,8,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering,SWE,2,5.7,joana wälti,2,Joana Wälti
2,8,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering,SWE,3,7.3,anna kindlundh,3,Anna Kindlundh
3,8,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering,SWE,4,7.9,jeanette jönsson hellstadius,4,Jeanette Jönsson Hellstadius
4,30,Suunta Jyväskylä 1,Suunta Jyväskylä,FIN,1,6.0,veera klemettinen,1,Veera Klemettinen


In [5]:
# Per-runner predictions built from earlier results. Despite the .csv extension
# the file is read as tab-separated.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a saved
# index read back as data — confirm whether it is needed.
predictions_and_history = pd.read_csv(
    f"data/preds_for_runners_with_history_2_{ve_or_ju}.csv",
    sep="\t",
)
predictions_and_history.head()


Unnamed: 0.1,Unnamed: 0,mean_team_id,num_valid_times,mean_pace,stdev,prior_mean,prior_log_std,predicted_pace,pred_pace_error,predicted_log_pace_mean,predicted_log_pace_std,name,teams
0,0,7.0,3,6.649,0.404,7.002,0.102,6.981,0.332,1.943,0.099,anna mårsell,STORA TUNA OK
1,1,27.5,6,7.096,0.609,6.54,0.124,6.658,0.438,1.896,0.108,magdalena olsson,IFK MORAS OK;STORA TUNA OK
2,2,3.0,1,5.944,0.0,6.871,0.137,6.838,0.894,1.922,0.138,julia jakob,STORA TUNA OK
3,3,8.5,6,6.077,0.362,6.624,0.129,6.47,0.393,1.867,0.111,tove alexandersson,STORA TUNA OK
4,4,61.0,3,7.328,0.845,7.742,0.115,7.704,0.376,2.042,0.111,lisa holer,EKSJÖ SOK;GÖTEBORG MAJORNA OK


In [6]:

# Expose the count of valid earlier times under the name num_runs as well.
predictions_and_history["num_runs"] = predictions_and_history["num_valid_times"]
# One all-zero row that is handed to the matching helper as the no-history row.
no_history_row = pd.DataFrame(
    {
        "predicted_log_pace_mean": [0],
        "predicted_log_pace_std": [0],
        "num_valid_times": [0],
    }
)
def get_history_and_preds(running_order_row):
    """Return the history-based estimate for one running-order row.

    Delegates the lookup to ``shared.get_matching_history_row_for_runner``,
    passing the row, the module-level ``predictions_and_history`` frame and
    ``no_history_row``. Presumably the helper returns ``no_history_row`` when
    the runner has no match — confirm in ``shared``.

    Returns a ``pd.Series`` with ``pred_log_mean``, ``pred_log_std`` and
    ``num_valid_times``, each taken from the first row of the helper's result.
    """
    matched = shared.get_matching_history_row_for_runner(
        running_order_row, predictions_and_history, no_history_row
    )
    return pd.Series(
        {
            "pred_log_mean": matched.predicted_log_pace_mean.values[0],
            "pred_log_std": matched.predicted_log_pace_std.values[0],
            "num_valid_times": matched.num_valid_times.values[0],
        }
    )

# Look up the history-based estimate for every row, then attach the three values
# to the running order (num_valid_times is stored under the name num_runs).
history_and_preds = running_order.apply(get_history_and_preds, axis=1)
running_order = running_order.assign(
    num_runs=history_and_preds.num_valid_times,
    pred_log_mean=history_and_preds.pred_log_mean,
    pred_log_std=history_and_preds.pred_log_std,
)



In [7]:

# Load the persisted dummy-column values for the selected relay.
# top_first_names is not used by the cells visible in this part of the notebook.
top_countries, top_first_names = shared.read_persisted_dummy_column_values(ve_or_ju)

In [8]:
# Build the model feature frame from the running order.
# NOTE(review): the saved cell output suggests the helper also prints
# DataFrame.info() summaries as a side effect — confirm in `shared`.
features = shared.preprocess_features(running_order, top_countries, ve_or_ju)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2173 entries, 0 to 2172
Data columns (total 12 columns):
team_id           2173 non-null int64
team              2173 non-null object
team_base_name    2173 non-null object
team_country      2173 non-null object
leg               2173 non-null int64
leg_dist          2173 non-null float64
name              2173 non-null object
leg_nro           2173 non-null int64
orig_name         2173 non-null object
num_runs          2173 non-null float64
pred_log_mean     2173 non-null float64
pred_log_std      2173 non-null float64
dtypes: float64(4), int64(3), object(5)
memory usage: 203.8+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 931 entries, 0 to 930
Data columns (total 3 columns):
first_name           930 non-null object
fn_pace_class        931 non-null int64
fn_pace_std_class    931 non-null int64
dtypes: int64(2), object(1)
memory usage: 21.9+ KB


None

[]

In [9]:
# List the feature columns and their dtypes to confirm what the model receives.
features.info()

<class 'pandas.core.sparse.frame.SparseDataFrame'>
RangeIndex: 2173 entries, 0 to 2172
Data columns (total 40 columns):
team_id                  2173 non-null Sparse[int64, nan]
team_id_log10            2173 non-null Sparse[float64, nan]
team_id_square           2173 non-null Sparse[int64, nan]
leg_1                    2173 non-null Sparse[uint8, 0]
leg_2                    2173 non-null Sparse[uint8, 0]
leg_3                    2173 non-null Sparse[uint8, 0]
leg_4                    2173 non-null Sparse[uint8, 0]
c_EST                    2173 non-null Sparse[uint8, 0]
c_FIN                    2173 non-null Sparse[uint8, 0]
c_GBR                    2173 non-null Sparse[uint8, 0]
c_LAT                    2173 non-null Sparse[uint8, 0]
c_NOR                    2173 non-null Sparse[uint8, 0]
c_OTHER                  2173 non-null Sparse[uint8, 0]
c_RUS                    2173 non-null Sparse[uint8, 0]
c_SUI                    2173 non-null Sparse[uint8, 0]
c_SWE                    2173 no

In [10]:

# Model-based estimate for every row of `features`. The returned frame carries
# the columns predicted, log_q_low, log_q_high and log_std (see the cell output).
gbr_sd_estimate = shared.predict_without_history(features, ve_or_ju)

Unnamed: 0,log_q_low,predicted,log_q_high,log_std
0,1.864,6.744,2.034,0.085
1,1.809,6.883,2.054,0.122
2,1.727,6.108,2.057,0.165
3,1.845,7.475,2.09,0.122
4,1.873,7.199,2.05,0.088
5,1.858,7.124,2.08,0.111
6,1.858,6.516,2.14,0.141
7,1.902,7.467,2.099,0.098
8,1.894,7.139,2.085,0.095
9,1.884,6.769,2.11,0.113


0.16642944843589602

In [11]:
# Copy the model estimate columns onto the running order (aligned on the index).
for estimate_column in ("predicted", "log_q_low", "log_q_high", "log_std"):
    running_order[estimate_column] = gbr_sd_estimate[estimate_column]


In [12]:
# Spot-check that the model estimates now sit next to the history-based ones.
running_order.head()

Unnamed: 0,team_id,team,team_base_name,team_country,leg,leg_dist,name,leg_nro,orig_name,num_runs,pred_log_mean,pred_log_std,predicted,log_q_low,log_q_high,log_std
0,8,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering,SWE,1,6.0,fiona bunn,1,Fiona Bunn,1.0,2.11,0.172,6.743671,1.86384,2.033659,0.084909
1,8,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering,SWE,2,5.7,joana wälti,2,Joana Wälti,2.0,2.102,0.124,6.882701,1.809328,2.05361,0.122141
2,8,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering,SWE,3,7.3,anna kindlundh,3,Anna Kindlundh,6.0,2.077,0.12,6.107582,1.726928,2.056589,0.16483
3,8,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering,SWE,4,7.9,jeanette jönsson hellstadius,4,Jeanette Jönsson Hellstadius,3.0,2.041,0.134,7.474857,1.845341,2.089668,0.122164
4,30,Suunta Jyväskylä 1,Suunta Jyväskylä,FIN,1,6.0,veera klemettinen,1,Veera Klemettinen,6.0,2.064,0.106,7.198831,1.873431,2.050207,0.088388


In [13]:
# Distribution of the model's log-std estimate, with extra tail percentiles.
# In the saved output the minimum is negative and the 1% value is close to zero.
running_order["log_std"].describe(percentiles=[0.01, 0.05, .25, .5, .75, .95, .99])

count    2173.000000
mean        0.166429
std         0.053790
min        -0.057098
1%          0.006605
5%          0.033826
25%         0.150815
50%         0.176335
75%         0.198851
95%         0.232640
99%         0.256546
max         0.319118
Name: log_std, dtype: float64

In [14]:
# Bound the model's log-std estimate to the range [0.1, 0.5] so that negative or
# near-zero raw values cannot be used as a standard deviation.
running_order["log_std_fixed"] = running_order["log_std"].clip(lower=0.1, upper=0.5)

In [15]:
# Choose the final per-runner estimate:
#   * final_pace_mean: the history-based pred_log_mean when the runner has at
#     least one earlier run, otherwise the log of the model's `predicted` value;
#   * final_pace_std: the history-based pred_log_std only with at least three
#     earlier runs, otherwise the clipped model estimate log_std_fixed.
# Each column is built in one step with np.where. The earlier form assigned
# through `running_order[col].values[mask] = ...`, which only changes the frame
# while `.values` happens to be a writable view of it; under pandas
# Copy-on-Write that array is read-only and the write fails.
use_predicted_mean = running_order["num_runs"].values >= 1
use_predicted_std = running_order["num_runs"].values >= 3
running_order["final_pace_mean"] = np.where(
    use_predicted_mean,
    running_order["pred_log_mean"].values,
    np.log(running_order["predicted"].values),
)
running_order["final_pace_std"] = np.where(
    use_predicted_std,
    running_order["pred_log_std"].values,
    running_order["log_std_fixed"].values,
)
running_order.head().round(3)

Unnamed: 0,team_id,team,team_base_name,team_country,leg,leg_dist,name,leg_nro,orig_name,num_runs,pred_log_mean,pred_log_std,predicted,log_q_low,log_q_high,log_std,log_std_fixed,final_pace_mean,final_pace_std
0,8,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering,SWE,1,6.0,fiona bunn,1,Fiona Bunn,1.0,2.11,0.172,6.744,1.864,2.034,0.085,0.1,2.11,0.1
1,8,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering,SWE,2,5.7,joana wälti,2,Joana Wälti,2.0,2.102,0.124,6.883,1.809,2.054,0.122,0.122,2.102,0.122
2,8,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering,SWE,3,7.3,anna kindlundh,3,Anna Kindlundh,6.0,2.077,0.12,6.108,1.727,2.057,0.165,0.165,2.077,0.12
3,8,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering,SWE,4,7.9,jeanette jönsson hellstadius,4,Jeanette Jönsson Hellstadius,3.0,2.041,0.134,7.475,1.845,2.09,0.122,0.122,2.041,0.134
4,30,Suunta Jyväskylä 1,Suunta Jyväskylä,FIN,1,6.0,veera klemettinen,1,Veera Klemettinen,6.0,2.064,0.106,7.199,1.873,2.05,0.088,0.1,2.064,0.106


In [16]:
#running_order.tail(15).round(3)

In [17]:
# Write the running order with all estimate columns as a tab-separated file.
# The separator is passed by keyword: giving to_csv arguments positionally after
# the path is deprecated in pandas 2.1+ and due to become keyword-only in pandas 3.0.
running_order.to_csv(f"data/running_order_2019_with_estimates_{ve_or_ju}.tsv", sep="\t")

In [18]:
# Inspect the last rows; in the saved output most of them have num_runs == 0,
# so final_pace_mean there comes from the model's `predicted` value.
running_order.tail().round(3)

Unnamed: 0,team_id,team,team_base_name,team_country,leg,leg_dist,name,leg_nro,orig_name,num_runs,pred_log_mean,pred_log_std,predicted,log_q_low,log_q_high,log_std,log_std_fixed,final_pace_mean,final_pace_std
2168,1719,SisuHirvi 1,SisuHirvi,FIN,4,7.9,hirvikoski julia,4,Hirvikoski Julia,0.0,0.0,0.0,13.986,2.669,2.749,0.04,0.1,2.638,0.1
2169,1721,Fana IL 1,Fana IL,NOR,1,6.0,mari fjellbirkeland,1,Mari Fjellbirkeland,0.0,0.0,0.0,13.41,2.538,2.635,0.048,0.1,2.596,0.1
2170,1721,Fana IL 1,Fana IL,NOR,2,5.7,kristine bog vikane,2,Kristine Bog Vikane,0.0,0.0,0.0,11.649,2.543,2.558,0.008,0.1,2.455,0.1
2171,1721,Fana IL 1,Fana IL,NOR,3,7.3,lise christensen,3,Lise Christensen,1.0,2.373,0.18,13.753,2.581,2.61,0.014,0.1,2.373,0.1
2172,1721,Fana IL 1,Fana IL,NOR,4,7.9,rannveig nordhagen,4,Rannveig Nordhagen,0.0,0.0,0.0,10.453,2.477,2.651,0.087,0.1,2.347,0.1


In [19]:
# Mean of every estimate column, grouped by the number of earlier runs.
(
    running_order
    .groupby("num_runs")[
        ["pred_log_mean", "pred_log_std", "predicted", "log_std_fixed", "final_pace_mean", "final_pace_std"]
    ]
    .agg(["mean"])
    .round(2)
)



Unnamed: 0_level_0,pred_log_mean,pred_log_std,predicted,log_std_fixed,final_pace_mean,final_pace_std
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean
num_runs,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0.0,0.0,0.0,13.16,0.17,2.56,0.17
1.0,2.55,0.19,13.24,0.19,2.55,0.19
2.0,2.54,0.18,12.81,0.18,2.54,0.18
3.0,2.5,0.17,12.46,0.17,2.5,0.17
4.0,2.45,0.16,11.95,0.17,2.45,0.16
5.0,2.44,0.15,11.88,0.17,2.44,0.15
6.0,2.42,0.14,11.35,0.18,2.42,0.14
7.0,2.41,0.13,11.34,0.16,2.41,0.13
9.0,2.08,0.13,8.16,0.18,2.08,0.13
