In [228]:
import pandas as pd
import numpy as np
from scipy.stats import lognorm
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error

In [186]:
# Tab-separated input file; later cells read its pace_1..pace_6 and mean_team_id columns.
in_file_name = 'data/grouped_paces_ju.tsv'
df_all = pd.read_csv(in_file_name, sep="\t")

In [187]:
# Alternative kept for reference: restrict the frame to rows whose pace_1 is finite.
#df = df_all[np.isfinite(df_all.pace_1)]
# As written, every row is kept, including rows without a pace_1 value.
df = df_all

In [188]:
# Per-row mean and standard deviation of the log of the historical paces.
# pace_1 is deliberately left out: later cells compare the estimates against it.
history_cols = ["pace_2", "pace_3", "pace_4", "pace_5", "pace_6"]
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
# selecting the columns and taking .values gives the same 2-D array.
paces = df[history_cols].values
logs = np.log(paces)
# The nan-aware reductions ignore missing paces. Rows with no history at all
# come out as NaN (numpy emits a RuntimeWarning for those rows).
means = np.nanmean(logs, axis=1)
stdevs = np.nanstd(logs, axis=1)
df = df.assign(log_means=means, log_stdevs=stdevs)

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until
  keepdims=keepdims)


In [189]:
# Estimate values for all rows, but only use them if no other value is available.
# The models are fitted on rows with a finite pace_3 and then applied to every row.
# The mask is taken from `df` itself so it always lines up with the frame being filtered.
with_history = df[np.isfinite(df.pace_3)]
# Double brackets keep the (n, 1) shape scikit-learn expects; as_matrix() no longer exists in pandas.
x = with_history[["mean_team_id"]].values
all_team_ids = df[["mean_team_id"]].values

log_means = with_history.log_means.values.reshape(-1, 1)
log_means_model = linear_model.LinearRegression()
log_means_model.fit(x, log_means)
estimated_log_means = log_means_model.predict(all_team_ids)
# predict() returns shape (n, 1) here; flatten it so the column is assigned as a plain 1-D array.
df = df.assign(estimated_log_means=estimated_log_means.ravel())

log_stdevs = with_history.log_stdevs.values.reshape(-1, 1)
log_stdevs_model = linear_model.LinearRegression()
log_stdevs_model.fit(x, log_stdevs)
estimated_log_stdevs = log_stdevs_model.predict(all_team_ids)
df = df.assign(estimated_log_stdevs=estimated_log_stdevs.ravel())



  This is separate from the ipykernel package so we can avoid doing imports until


In [190]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,mean_team_id,teams,name,num_runs,num_valid_times,mean_pace,stdev,pace_1,pace_2,pace_3,pace_4,pace_5,pace_6,log_means,log_stdevs,estimated_log_means,estimated_log_stdevs
0,52.7,BROMMA-VÄLLINGBY SOK/JÄRFÄL;IFK GÖTEBORG,max peter bejmer,3,3,6.337,1.050,5.582,5.608,7.822,,,,1.890567,0.166373,1.947046,0.087765
1,91.0,MARKBYGDENS OK;IFK GÖTEBORG,johan högstrand,4,4,6.198,0.468,5.591,5.902,6.571,6.728,,,1.854745,0.057003,1.964758,0.088575
2,13.2,IFK GÖTEBORG,vetle ruud bråten,4,4,5.750,0.181,5.688,5.636,5.615,6.060,,,1.752109,0.035106,1.928778,0.086929
3,33.2,IFK GÖTEBORG,jonas pilblad,5,5,6.083,0.389,5.978,5.772,6.819,5.756,6.089,,1.807365,0.068628,1.938028,0.087352
4,17.5,OK KÅRE;IFK GÖTEBORG,fredrik edn,2,2,6.333,0.472,5.861,6.805,,,,,1.917658,0.000000,1.930767,0.087020
5,10.5,IFK LIDINGÖ SOK;IFK GÖTEBORG,fredrik bakkman,2,2,5.483,0.028,5.511,5.455,,,,,1.696533,0.000000,1.927530,0.086872
6,16.8,IFK GÖTEBORG,eskil kinneberg,4,4,5.623,0.084,5.611,5.494,5.719,5.668,,,1.727429,0.017203,1.930443,0.087005
7,21.3,KOOVEE,topi anjala,6,6,6.080,0.516,5.621,6.100,5.977,7.164,5.642,5.974,1.816586,0.080558,1.932524,0.087101
8,1.0,KOOVEE,olexander kratov,1,1,5.874,0.000,5.874,,,,,,,,1.923136,0.086671
9,6.7,KOOVEE;VAAJAKOSKEN TERÄ,jani lakanen,6,6,6.044,0.456,5.874,5.734,6.208,6.717,5.326,6.403,1.801252,0.082382,1.925772,0.086792


In [191]:
# scipy's lognorm is parameterised with s = sigma and scale = exp(mu).
# Use a row's own log-statistic when it is finite and positive, otherwise the regression estimate.
# Each comparison carries its own parentheses because `&` binds tighter than `>`:
# without them `a & b > 0` parses as `(a & b) > 0`.
own_mean_ok = np.isfinite(df["log_means"]) & (df["log_means"] > 0)
df['final_log_means'] = np.where(own_mean_ok, df["log_means"], df["estimated_log_means"])
# NOTE: despite its name, this variable holds exp(mu), i.e. the lognorm `scale`.
final_log_means = np.exp(df['final_log_means'])
own_stdev_ok = np.isfinite(df["log_stdevs"]) & (df["log_stdevs"] > 0)
df['final_log_stdevs'] = np.where(own_stdev_ok, df["log_stdevs"], df["estimated_log_stdevs"])
final_log_stdevs = df['final_log_stdevs']

intervals95 = lognorm.interval(0.95, s = final_log_stdevs, scale = final_log_means)
means = lognorm.mean(s = final_log_stdevs, scale = final_log_means)
medians = lognorm.median(s = final_log_stdevs, scale = final_log_means)

In [192]:

# Attach the 95% interval bounds, whether pace_1 falls inside them, and the
# log-normal mean/median. Values are assigned positionally as plain arrays.
df = df.assign(
    interval95_start=np.asarray(intervals95[0]),
    interval95_end=np.asarray(intervals95[1]),
    # Evaluated after the two bounds above have been added to the frame.
    p1_in_interval=lambda d: ((d.interval95_start <= d.pace_1) & (d.interval95_end >= d.pace_1)).values,
    mean_ln=np.asarray(means),
    med_ln=np.asarray(medians),
)

In [194]:
def rmse(predictions, targets):
    """Return the root-mean-square difference between predictions and targets.

    Both arguments must support element-wise subtraction and the result must
    expose ``.mean()`` (numpy arrays or pandas Series).
    """
    residuals = predictions - targets
    mean_squared = (residuals ** 2).mean()
    return np.sqrt(mean_squared)
# RMSE of the log-normal mean and median estimates against pace_1.
mean_err, med_err = (rmse(df[estimate], df.pace_1) for estimate in ("mean_ln", "med_ln"))
(mean_err, med_err)

(2.743955611857693, 2.672648533105014)

In [195]:
# Fraction of rows whose pace_1 lies inside the row's 95% interval.
df.p1_in_interval.mean()

0.6903853837492281

In [196]:
# Persist the per-runner estimates as a tab-separated file (the index is written too, pandas' default).
df.to_csv('data/log_normal_estimates_ju.tsv', sep="\t")

In [197]:
# Tab-separated results file; later cells use its competitor-name, team-name, team-id, leg-nro and leg-time columns.
runs17 = pd.read_csv('data/csv-results_j2017_ju.tsv', sep="\t")


In [198]:
def get_estimate_row(row):
    """Find the row of the global ``df`` that matches one competitor.

    Candidates are rows whose ``name`` equals the lower-cased competitor name,
    plus rows whose ``name`` contains ``"<name>:"``. If that does not yield
    exactly one row, candidates are narrowed to those whose ``teams`` contains
    the upper-cased team name. If the result is still not a single row, a
    diagnostic is printed and the candidate with the most ``num_runs`` is used.

    Parameters
    ----------
    row : mapping with "competitor-name" and "team-name" string entries.

    Returns
    -------
    pandas.DataFrame
        At most one row of ``df``; empty when nothing matched.
    """
    name = row["competitor-name"].lower()

    by_name = df[df['name'] == name]
    # NOTE(review): this is a substring match, so "<name>:" also matches inside a
    # longer name that merely ends with it. Confirm this is intended.
    by_name_and_colon = df[df['name'].str.contains(name +":", regex=False)]

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the two selections the same way.
    runners = pd.concat([by_name, by_name_and_colon])
    if len(runners) == 1:
        return runners
    team_name = row["team-name"].upper()
    runners = runners[runners['teams'].str.contains(team_name, regex=False)]
    if len(runners) == 1:
        return runners
    # Reached for several remaining candidates and also for none at all.
    print(f"name '{name}' team_name '{team_name}'")
    print(f"by_name {len(by_name)} by_name_and_colon {len(by_name_and_colon)} runners {len(runners)}")
    print(f"Duplicate runner {runners}")
    #print(f"TEAMS by_name_and_colon {by_name_and_colon['teams']}")
    return runners.sort_values("num_runs", ascending = False).head(1)

def get_estimate_params(row):
    """Return the log-normal parameters of the competitor described by ``row``.

    The matching row comes from ``get_estimate_row``; the first entry of its
    ``final_log_means`` and ``final_log_stdevs`` columns is packed into a
    Series with those two labels.
    """
    match = get_estimate_row(row)
    wanted = ("final_log_means", "final_log_stdevs")
    return pd.Series({column: getattr(match, column).values[0] for column in wanted})

# Look up the log-normal parameters for every result row and attach them as columns.
#runs17 = runs17[runs17['team-name'].str.contains("Reak")]
estimate_params = runs17.apply(get_estimate_params, axis=1)
runs17 = runs17.assign(
    final_log_means=estimate_params["final_log_means"],
    final_log_stdevs=estimate_params["final_log_stdevs"],
)

name 'panu kärkkäinen' team_name 'JÄMSÄN RETKI-VEIKOT'
by_name 0 by_name_and_colon 4 runners 2
Duplicate runner       mean_team_id                        teams  \
1841         570.0          JÄMSÄN RETKI-VEIKOT   
1843         954.0  JÄMSÄN RETKI-VEIKOT/JUKOLA2   

                                             name  num_runs  num_valid_times  \
1841          panu kärkkäinen:JÄMSÄN RETKI-VEIKOT         4                3   
1843  panu kärkkäinen:JÄMSÄN RETKI-VEIKOT/JUKOLA2         1                1   

      mean_pace  stdev  pace_1  pace_2  pace_3    ...      log_stdevs  \
1841      9.109  1.401   7.938    8.31  11.079    ...        0.143796   
1843      8.686  0.000   8.686     NaN     NaN    ...             NaN   

      estimated_log_means  estimated_log_stdevs  final_log_means  \
1841             2.186281              0.098710         2.261256   
1843             2.363868              0.106835         2.363868   

      final_log_stdevs  interval95_start  interval95_end  p1_in_inte

In [199]:
def leg_dist(leg):
    """Return the distance of relay leg ``leg`` (1-based; legs 1 to 7).

    Raises
    ------
    IndexError
        If ``leg`` is outside 1..7. Legs below 1 are rejected explicitly;
        otherwise negative indexing would silently wrap around and return the
        distance of a different leg.
    """
    # Distance per leg, presumably in kilometres (TODO confirm the unit).
    dist = (12.7, 14.2, 12.3, 7.6, 7.9, 10.9, 13.8)
    if leg < 1:
        raise IndexError(f"leg must be between 1 and {len(dist)}, got {leg!r}")
    return dist[leg - 1]

# Add each run's leg distance, then the expected leg time: median pace times distance.
runs17 = runs17.assign(leg_dist=runs17["leg-nro"].apply(leg_dist))

final_means = np.exp(runs17.final_log_means)
#intervals95 = lognorm.interval(0.95, s = runs17.final_log_stdevs, scale = final_means)

#runs17 = runs17.assign(start95 = intervals95[0] * runs17["leg_dist"])
#runs17 = runs17.assign(end95 = intervals95[1] * runs17["leg_dist"])

median_pace = lognorm.median(s=runs17.final_log_stdevs, scale=final_means)
runs17["est_median"] = median_pace * runs17["leg_dist"]


In [200]:
# Preview only the first rows instead of rendering the whole frame.
runs17.head(10)

Unnamed: 0,team-id,placement,team-time,team-name,team-nro,leg-nro,emit,leg-time,competitor-name,control-times,final_log_means,final_log_stdevs,leg_dist,est_median
0,5,1.0,27105.0,IFK Göteborg,1,1,1217511.0,4287.0,Max Peter Bejmer,927;1080;1144;1193;1334;1436;1548;1612;1725;19...,1.890567,0.166373,12.7,84.113679
1,5,1.0,27105.0,IFK Göteborg,1,2,1217440.0,4797.0,Johan Högstrand,905;981;1103;1148;1314;1380;1451;1542;1605;167...,1.854745,0.057003,14.2,90.738981
2,5,1.0,27105.0,IFK Göteborg,1,3,1221058.0,4198.0,Vetle Ruud Bråten,897;1025;1118;1165;1293;1386;1475;1537;1616;18...,1.752109,0.035106,12.3,70.931025
3,5,1.0,27105.0,IFK Göteborg,1,4,1217402.0,2762.0,Jonas Pilblad,325;457;659;917;989;1107;1574;1809;1897;1971;2...,1.807365,0.068628,7.6,46.317172
4,5,1.0,27105.0,IFK Göteborg,1,5,1217509.0,2743.0,Fredrik Edn,285;442;603;802;926;979;1074;1531;1711;1799;18...,1.917658,0.087020,7.9,53.759500
5,5,1.0,27105.0,IFK Göteborg,1,6,1216571.0,3670.0,Fredrik Bakkman,260;390;642;1101;1322;1404;1591;1729;1807;1988...,1.696533,0.086872,10.9,59.459500
6,5,1.0,27105.0,IFK Göteborg,1,7,1217441.0,4646.0,Eskil Kinneberg,304;524;717;1131;1189;1309;1411;1862;2154;2231...,1.727429,0.017203,13.8,77.641148
7,1,2.0,27171.0,Koovee,1,1,1201850.0,4317.0,Topi Anjala,933;1087;1150;1198;1338;1438;1537;1605;1720;19...,1.816586,0.080558,12.7,78.115482
8,1,2.0,27171.0,Koovee,1,2,1230611.0,5040.0,Olexander Kratov,896;1036;1151;1197;1375;1441;1508;1599;1666;17...,1.923136,0.086671,14.2,97.161867
9,1,2.0,27171.0,Koovee,1,3,1214773.0,4335.0,Jani Lakanen,949;1096;1193;1241;1375;1495;1598;1660;1754;20...,1.801252,0.082382,12.3,74.503898


In [251]:
# One row per team; every remaining column is spread into one sub-column per leg number.
by_teams = (
    runs17
    .set_index(["team-id", "leg-nro"])
    .unstack(level="leg-nro")
)
by_teams.head()

Unnamed: 0_level_0,placement,placement,placement,placement,placement,placement,placement,team-time,team-time,team-time,...,leg_dist,leg_dist,leg_dist,est_median,est_median,est_median,est_median,est_median,est_median,est_median
leg-nro,1,2,3,4,5,6,7,1,2,3,...,5,6,7,1,2,3,4,5,6,7
team-id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,27171.0,27171.0,27171.0,...,7.9,10.9,13.8,78.115482,97.161867,74.503898,45.243955,50.675913,62.267656,78.987342
2,21.0,21.0,21.0,21.0,21.0,21.0,21.0,29186.0,29186.0,29186.0,...,7.9,10.9,13.8,79.403068,91.433455,76.263438,49.412052,44.186301,74.616496,78.25843
3,24.0,24.0,24.0,24.0,24.0,24.0,24.0,29299.0,29299.0,29299.0,...,7.9,10.9,13.8,82.681744,88.40427,77.526052,52.050247,55.2052,65.510583,80.069319
4,5.0,5.0,5.0,5.0,5.0,5.0,5.0,27842.0,27842.0,27842.0,...,7.9,10.9,13.8,74.788233,91.738116,80.29404,46.180695,51.260091,65.659463,76.896217
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,27105.0,27105.0,27105.0,...,7.9,10.9,13.8,84.113679,90.738981,70.931025,46.317172,53.7595,59.4595,77.641148


In [252]:
# Cumulative actual time after each leg (leg-time / 60), kept as a running total.
# A missing leg time makes that leg and every later one NaN for the team.
elapsed = 0
for leg in range(1, 8):
    elapsed = elapsed + by_teams["leg-time"][leg] / 60
    by_teams["fin_real", leg] = elapsed
by_teams.fin_real.head()

leg-nro,1,2,3,4,5,6,7
team-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,71.95,155.95,228.2,273.25,317.416667,375.933333,452.8
2,77.383333,170.7,241.633333,296.166667,341.0,405.933333,486.383333
3,83.8,173.333333,245.283333,292.85,346.2,411.166667,488.283333
4,71.7,162.183333,232.483333,278.366667,323.85,385.6,463.966667
5,71.45,151.4,221.366667,267.4,313.116667,374.283333,451.716667


In [253]:
# Cumulative sum of the per-leg estimated medians, kept as a running total.
estimated_total = 0
for leg in range(1, 8):
    estimated_total = estimated_total + by_teams["est_median"][leg]
    by_teams["fin_med", leg] = estimated_total
by_teams.fin_med.head()

leg-nro,1,2,3,4,5,6,7
team-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,78.115482,175.277349,249.781247,295.025202,345.701115,407.968771,486.956114
2,79.403068,170.836523,247.099961,296.512012,340.698314,415.31481,493.57324
3,82.681744,171.086014,248.612066,300.662313,355.867513,421.378096,501.447415
4,74.788233,166.52635,246.82039,293.001085,344.261176,409.920639,486.816855
5,84.113679,174.85266,245.783685,292.100857,345.860357,405.319857,482.961005


In [255]:

def get_simulated_medians(row, n_samples=10000, random_state=None):
    """Simulate a team's cumulative time after each of the 7 legs.

    For every leg, ``n_samples`` paces are drawn from a log-normal distribution
    (``s = final_log_stdevs``, ``scale = exp(final_log_means)``) and multiplied
    by the leg distance. Legs are drawn independently and accumulated, so the
    values for leg ``i`` describe the simulated total after leg ``i``.

    Parameters
    ----------
    row : Series-like
        ``row[column][leg]`` must work for the columns "leg_dist",
        "final_log_stdevs" and "final_log_means" and for legs 1..7.
    n_samples : int, optional
        Number of simulated races (default 10000).
    random_state : None, int or numpy random generator, optional
        ``None`` keeps scipy's default (numpy's global random state). An int
        seeds one generator that is shared by all legs. Any other object is
        passed to scipy unchanged.

    Returns
    -------
    pandas.Series
        ``fin_start95_<leg>`` (2.5% quantile), ``fin_end95_<leg>`` (97.5%
        quantile), ``fin_median_<leg>`` and ``fin_mean_<leg>`` for legs 1..7.
    """
    legs = range(1, 8)

    # A single generator is shared by all legs: giving every rvs call the same
    # integer seed would make each leg draw the identical random stream.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    # Shape (n_samples, 7): one column of simulated leg times per leg.
    leg_samples = np.column_stack([
        row["leg_dist"][leg] * lognorm.rvs(
            s=row["final_log_stdevs"][leg],
            scale=np.exp(row["final_log_means"][leg]),
            size=n_samples,
            random_state=rng,
        )
        for leg in legs
    ])
    # Running total across legs in a single pass.
    samples_sums = pd.DataFrame(np.cumsum(leg_samples, axis=1), columns=list(legs))

    summaries = (
        ("fin_start95", samples_sums.quantile(0.025)),
        ("fin_end95", samples_sums.quantile(0.975)),
        ("fin_median", samples_sums.median()),
        ("fin_mean", samples_sums.mean()),
    )
    new_cols = {
        f"{prefix}_{leg}": stat.loc[leg]
        for prefix, stat in summaries
        for leg in legs
    }
    return pd.Series(new_cols)

# Seed numpy's global random state, which scipy's rvs draws from by default,
# so the simulated columns are reproducible from run to run.
np.random.seed(2017)
simulated = by_teams.apply(get_simulated_medians, axis=1)


simulated.head()

Unnamed: 0_level_0,fin_start95_1,fin_start95_2,fin_start95_3,fin_start95_4,fin_start95_5,fin_start95_6,fin_start95_7,fin_end95_1,fin_end95_2,fin_end95_3,...,fin_median_5,fin_median_6,fin_median_7,fin_mean_1,fin_mean_2,fin_mean_3,fin_mean_4,fin_mean_5,fin_mean_6,fin_mean_7
team-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,66.905457,156.234946,227.730467,272.3378,321.436005,382.818688,458.035404,91.815887,197.313214,274.432018,...,346.942879,409.334623,488.478337,78.469771,175.930789,250.713055,296.055482,347.060441,409.441781,488.759595
2,71.312195,158.602855,231.450748,276.544916,320.421497,391.872513,467.405147,88.280253,183.969099,264.3576,...,341.302011,416.384756,494.80051,79.494706,170.999353,247.499467,297.442157,341.678381,416.658896,495.172158
3,64.023656,149.962873,226.090703,277.134535,330.775295,394.414529,473.0239,106.267093,196.561279,275.687954,...,356.788724,422.606863,502.75008,83.412613,171.932852,249.658319,301.873484,357.300557,423.063068,503.322294
4,72.287039,150.965597,226.257852,270.713245,319.902555,384.416852,460.335786,77.499162,186.154298,270.719502,...,345.437212,411.099889,488.068897,74.813851,167.081999,247.68178,294.164746,345.738565,411.486894,488.4119
5,61.002038,149.550523,220.312167,265.914871,318.510149,376.277997,453.993932,116.115869,208.799962,280.403959,...,346.834822,406.682071,484.291469,85.293676,176.212843,247.232104,293.669892,347.659773,407.39541,485.027302


In [256]:
# Flatten the two-level (column, leg) header into single names such as "fin_real_7".
# The team-id index is kept on purpose: the next cell aligns on it when concatenating.
by_teams_2 = by_teams.copy()
by_teams_2.columns = [f'{column}_{leg}' for column, leg in by_teams_2.columns]
by_teams_2.head()

Unnamed: 0_level_0,placement_1,placement_2,placement_3,placement_4,placement_5,placement_6,placement_7,team-time_1,team-time_2,team-time_3,...,fin_real_5,fin_real_6,fin_real_7,fin_med_1,fin_med_2,fin_med_3,fin_med_4,fin_med_5,fin_med_6,fin_med_7
team-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,27171.0,27171.0,27171.0,...,317.416667,375.933333,452.8,78.115482,175.277349,249.781247,295.025202,345.701115,407.968771,486.956114
2,21.0,21.0,21.0,21.0,21.0,21.0,21.0,29186.0,29186.0,29186.0,...,341.0,405.933333,486.383333,79.403068,170.836523,247.099961,296.512012,340.698314,415.31481,493.57324
3,24.0,24.0,24.0,24.0,24.0,24.0,24.0,29299.0,29299.0,29299.0,...,346.2,411.166667,488.283333,82.681744,171.086014,248.612066,300.662313,355.867513,421.378096,501.447415
4,5.0,5.0,5.0,5.0,5.0,5.0,5.0,27842.0,27842.0,27842.0,...,323.85,385.6,463.966667,74.788233,166.52635,246.82039,293.001085,344.261176,409.920639,486.816855
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,27105.0,27105.0,27105.0,...,313.116667,374.283333,451.716667,84.113679,174.85266,245.783685,292.100857,345.860357,405.319857,482.961005


In [257]:
# Put the flattened team columns and the simulated summaries side by side, aligned on the index;
# join='inner' keeps only index values present in both frames.
result = pd.concat([by_teams_2, simulated], axis=1, join='inner')
result.head()

Unnamed: 0_level_0,placement_1,placement_2,placement_3,placement_4,placement_5,placement_6,placement_7,team-time_1,team-time_2,team-time_3,...,fin_median_5,fin_median_6,fin_median_7,fin_mean_1,fin_mean_2,fin_mean_3,fin_mean_4,fin_mean_5,fin_mean_6,fin_mean_7
team-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,27171.0,27171.0,27171.0,...,346.942879,409.334623,488.478337,78.469771,175.930789,250.713055,296.055482,347.060441,409.441781,488.759595
2,21.0,21.0,21.0,21.0,21.0,21.0,21.0,29186.0,29186.0,29186.0,...,341.302011,416.384756,494.80051,79.494706,170.999353,247.499467,297.442157,341.678381,416.658896,495.172158
3,24.0,24.0,24.0,24.0,24.0,24.0,24.0,29299.0,29299.0,29299.0,...,356.788724,422.606863,502.75008,83.412613,171.932852,249.658319,301.873484,357.300557,423.063068,503.322294
4,5.0,5.0,5.0,5.0,5.0,5.0,5.0,27842.0,27842.0,27842.0,...,345.437212,411.099889,488.068897,74.813851,167.081999,247.68178,294.164746,345.738565,411.486894,488.4119
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,27105.0,27105.0,27105.0,...,346.834822,406.682071,484.291469,85.293676,176.212843,247.232104,293.669892,347.659773,407.39541,485.027302


In [249]:
# Turn cumulative minutes into wall-clock timestamps counted from start_timestamp
# (presumably the race start; confirm against the event schedule).
start_timestamp = pd.Timestamp(year = 2017, month = 6, day = 17, hour = 23)

for leg in range(1,8):
    for kind in ("real", "median", "start95", "end95"):
        # The fin_* columns hold minutes; to_datetime is given seconds since the origin.
        result[f"fint_{kind}_{leg}"] = pd.to_datetime(result[f"fin_{kind}_{leg}"] * 60, unit = "s", origin= start_timestamp)

result["fint_end95_2"]  
    

team-id
1      2017-06-18 02:17:27.078352000
2      2017-06-18 02:03:44.163662000
3      2017-06-18 02:16:33.689827000
4      2017-06-18 02:05:52.144775000
5      2017-06-18 02:29:48.491857000
6      2017-06-18 01:56:39.098433000
7      2017-06-18 02:26:44.299537000
8      2017-06-18 02:13:23.785512000
9      2017-06-18 02:00:30.050533000
10     2017-06-18 02:13:40.279052000
11     2017-06-18 02:05:18.023825000
12     2017-06-18 02:14:56.858873000
13     2017-06-18 02:22:15.466467000
14     2017-06-18 02:09:49.017318000
15     2017-06-18 02:28:36.711822000
16     2017-06-18 02:05:04.373186000
17     2017-06-18 02:05:59.454779000
18     2017-06-18 02:20:01.953818000
19     2017-06-18 02:08:07.408825000
20     2017-06-18 02:31:35.528746000
21     2017-06-18 01:57:02.214922000
22     2017-06-18 02:11:22.389501000
23     2017-06-18 02:14:30.744625000
24     2017-06-18 02:43:50.516985999
25     2017-06-18 02:48:19.722852000
26     2017-06-18 02:05:16.712261000
27     2017-06-18 02:10:32.128

In [250]:
# Persist the combined per-team table as a tab-separated file (the index is written too, pandas' default).
result.to_csv('data/team_estimates_ju2017.tsv', sep="\t")

In [218]:
# Per leg: (RMSE of simulated mean, RMSE of simulated median) against the actual cumulative time, all teams.
[
    (
        rmse(result[f"fin_mean_{leg}"], result[f"fin_real_{leg}"]),
        rmse(result[f"fin_median_{leg}"], result[f"fin_real_{leg}"]),
    )
    for leg in range(1, 8)
]

[(22.371055825752933, 22.029401142480737),
 (34.66291127402661, 34.09477541475599),
 (46.70714393481617, 46.12721122874638),
 (51.76751341137128, 50.611218364399),
 (59.09781637725244, 58.32093211410342),
 (68.13835213124383, 67.30695068573549),
 (76.94961123035401, 76.49842282042769)]

In [236]:
# First few values of the actual cumulative time after leg 7 (leg-time / 60 summed over legs 1-7).
result["fin_real_7"].head()

team-id
1    452.800000
2    486.383333
3    488.283333
4    463.966667
5    451.716667
Name: fin_real_7, dtype: float64

In [215]:
# Keep only teams with a finite cumulative time after leg 7.
qualified = result.loc[np.isfinite(result["fin_real_7"])]

In [231]:
# Per leg: (RMSE of simulated mean, RMSE of simulated median) against the actual cumulative time, qualified teams only.
[
    (
        rmse(qualified[f"fin_mean_{leg}"], qualified[f"fin_real_{leg}"]),
        rmse(qualified[f"fin_median_{leg}"], qualified[f"fin_real_{leg}"]),
    )
    for leg in range(1, 8)
]

[(22.21349045049956, 21.85374470685025),
 (34.346580616921216, 33.7370553647828),
 (46.6657892107734, 46.03357847019537),
 (50.94226105856988, 50.49261291630068),
 (58.26950660607369, 58.00094225465912),
 (67.4454790771967, 67.10991465883217),
 (76.94961123035401, 76.49842282042769)]

In [232]:
# Per leg: (MAE of simulated mean, MAE of simulated median) against the actual cumulative time, qualified teams only.
[
    (
        mean_absolute_error(qualified[f"fin_real_{leg}"], qualified[f"fin_mean_{leg}"]),
        mean_absolute_error(qualified[f"fin_real_{leg}"], qualified[f"fin_median_{leg}"]),
    )
    for leg in range(1, 8)
]

[(14.281530656724978, 13.946578846650764),
 (21.928703068801827, 21.527993764737992),
 (30.925430933505346, 30.39071894491211),
 (33.48524098492939, 33.04195992228056),
 (38.028652415377216, 37.717968920506415),
 (43.10059452537612, 42.78697017778889),
 (48.996112478694265, 48.67674808204247)]

In [225]:
# Share of all teams whose actual leg-1 time lies strictly inside the simulated 95% interval.
(result["fin_real_1"].gt(result["fin_start95_1"]) & result["fin_real_1"].lt(result["fin_end95_1"])).mean()

0.7948717948717948

In [226]:
# Share of qualified teams whose actual leg-1 time lies strictly inside the simulated 95% interval.
(qualified["fin_real_1"].gt(qualified["fin_start95_1"]) & qualified["fin_real_1"].lt(qualified["fin_end95_1"])).mean()

0.816146540027137

### 