In [831]:
import pandas as pd
import numpy as np
from scipy.stats import lognorm
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error

In [832]:
# Load the grouped pace history (tab-separated). `history` starts out as the
# very same object as `df_all`; no copy is made here.
in_file_name = "data/grouped_paces_ju.tsv"
history = df_all = pd.read_csv(in_file_name, sep="\t")

In [833]:
# Read the 2018 running order and keep only the rows with a finite team_id.
order18 = (
    pd.read_csv("data/running_order_j2018_ju.tsv", sep="\t")
    .loc[lambda frame: np.isfinite(frame["team_id"])]
)

In [834]:
# Row-wise mean and standard deviation of log(pace) over pace_1..pace_6.
# The nan-aware reductions skip missing paces; a row with no pace at all
# yields NaN (numpy emits a RuntimeWarning for such rows).
paces = history[[f"pace_{i}" for i in range(1, 7)]]
logs = np.log(paces)
means = np.nanmean(logs, axis=1)
stdevs = np.nanstd(logs, axis=1)
history = history.assign(
    log_means=np.asarray(means),
    log_stdevs=np.asarray(stdevs),
)

  after removing the cwd from sys.path.
  keepdims=keepdims)


In [835]:
# Fallback estimates: fit two linear regressions (log-pace mean and log-pace
# stdev as a function of mean_team_id) on runners that have a second pace
# recorded, then predict both values for every 2018 row from its team_id.
# The predictions are computed for everybody but are only meant to be used
# when no personal history is available.
with_history = history[np.isfinite(history["pace_2"])]
x = with_history["mean_team_id"].values.reshape(-1, 1)

log_means = with_history["log_means"].values.reshape(-1, 1)
log_means_model = linear_model.LinearRegression().fit(x, log_means)

estimated_log_means = log_means_model.predict(order18["team_id"].values.reshape(-1, 1))
order18 = order18.assign(estimated_log_means=estimated_log_means)

log_stdevs = with_history["log_stdevs"].values.reshape(-1, 1)
log_stdevs_model = linear_model.LinearRegression().fit(x, log_stdevs)

estimated_log_stdevs = log_stdevs_model.predict(order18["team_id"].values.reshape(-1, 1))
order18 = order18.assign(estimated_log_stdevs=estimated_log_stdevs)
order18.head()

Unnamed: 0,team_id,team,team_base_name,leg,leg_dist,name,estimated_log_means,estimated_log_stdevs
0,1,IFK Göteborg 1,IFK Göteborg,1,11.0,Max Peter Bejmer,1.943138,0.084941
1,1,IFK Göteborg 1,IFK Göteborg,2,11.9,Johan Högstarnd,1.943138,0.084941
2,1,IFK Göteborg 1,IFK Göteborg,3,12.8,Vetle Ruud Bråten,1.943138,0.084941
3,1,IFK Göteborg 1,IFK Göteborg,4,8.7,Jonas Pilblad,1.943138,0.084941
4,1,IFK Göteborg 1,IFK Göteborg,5,8.7,Jens Wängdahl,1.943138,0.084941


In [836]:
# Combine history with 2018 runners.
# Placeholder returned when a runner has no usable history: a one-row frame
# whose log_means and log_stdevs are both 0.
no_history_row = pd.DataFrame({"log_means": [0], "log_stdevs": [0]})
def get_history_row(running_order_row):
    """Find the history row that matches one 2018 running-order row.

    Candidates are rows of the global ``history`` frame whose ``name`` equals
    the lower-cased runner name, or contains ``"<name>:"`` (history names can
    carry a ``:<team>`` suffix).

    Resolution order:
      1. exactly one candidate -> return it;
      2. otherwise keep candidates whose ``teams`` contains the upper-cased
         ``team_base_name``; exactly one left -> return it;
      3. none left -> return the global ``no_history_row`` placeholder;
      4. several left -> print diagnostics and return the candidate with the
         largest ``num_runs``.

    Parameters
    ----------
    running_order_row : mapping / pandas.Series
        Must provide string values under ``"name"`` and ``"team_base_name"``.

    Returns
    -------
    pandas.DataFrame
        A one-row frame exposing ``log_means`` and ``log_stdevs``.
    """
    name = running_order_row["name"].lower()

    history_names = history['name']
    by_name = history[history_names == name]
    # na=False: a missing name/team simply does not match, instead of putting
    # NaN into the mask (which boolean indexing rejects).
    by_name_and_colon = history[history_names.str.contains(name + ":", regex=False, na=False)]

    # DataFrame.append was removed in pandas 2.0; concat gives the same result.
    runners = pd.concat([by_name, by_name_and_colon])
    if len(runners) == 1:
        return runners

    # Zero or several candidates: disambiguate by the team's base name.
    team_name = running_order_row["team_base_name"].upper()
    runners = runners[runners['teams'].str.contains(team_name, regex=False, na=False)]
    if len(runners) == 1:
        return runners
    if len(runners) == 0:
        return no_history_row

    # Still ambiguous: report it and prefer the entry with the most runs.
    print(f"name '{name}' team_name '{team_name}'")
    print(f"by_name {len(by_name)} by_name_and_colon {len(by_name_and_colon)} runners {len(runners)}")
    print(f"Duplicate runner {runners}")
    return runners.sort_values("num_runs", ascending=False).head(1)

def get_estimate_params(running_order_row):
    """Return the matched history's log-pace parameters for one runner.

    Looks the runner up with ``get_history_row`` and packs the first row's
    ``log_means`` / ``log_stdevs`` into a Series with the keys
    ``history_log_means`` and ``history_log_stdevs``.
    """
    matched = get_history_row(running_order_row)
    return pd.Series({
        "history_log_means": matched["log_means"].values[0],
        "history_log_stdevs": matched["log_stdevs"].values[0],
    })

# Debugging aid: uncomment to restrict the run to a few teams.
#order18 = order18[order18['team'].str.contains("Reak") | order18['team'].str.contains("Puskasil") | order18['team'].str.contains("Rastihaukat")]

# Look up history parameters row by row and attach them as two new columns.
estimate_params = order18.apply(get_estimate_params, axis=1)
order18 = order18.assign(
    history_log_means=estimate_params["history_log_means"],
    history_log_stdevs=estimate_params["history_log_stdevs"],
)

name 'anssi koirikivi' team_name 'RASTIIMI'
by_name 0 by_name_and_colon 3 runners 2
Duplicate runner       mean_team_id              teams                               name  \
1127         231.0  RASTIIMI - NONAME  anssi koirikivi:RASTIIMI - NONAME   
1129         234.8           RASTIIMI           anssi koirikivi:RASTIIMI   

      num_runs  num_valid_times  mean_pace  stdev  pace_1  pace_2  pace_3  \
1127         1                1      5.943  0.000   5.943     NaN     NaN   
1129         5                5      7.469  0.585   6.846   7.222   8.536   

      pace_4  pace_5  pace_6  log_means  log_stdevs  
1127     NaN     NaN     NaN   1.782214    0.000000  
1129   7.143   7.596     NaN   2.007769    0.075866  
name 'petri laaksonen' team_name 'UNO SK'
by_name 0 by_name_and_colon 6 runners 2
Duplicate runner       mean_team_id            teams                             name  \
3977         611.0  UNO SK-HARRASTE  petri laaksonen:UNO SK-HARRASTE   
3979         698.0           UNO 

In [837]:
# Use the runner's own history when it is usable (finite and strictly
# positive; 0 is the "no history" placeholder), otherwise fall back to the
# regression estimate.
# `&` binds tighter than `>`, so the comparison must be parenthesised:
# `isfinite(x) & x > 0` would be evaluated as `(isfinite(x) & x) > 0`.
order18['log_means'] = np.where(
    np.isfinite(order18["history_log_means"]) & (order18["history_log_means"] > 0),
    order18["history_log_means"],
    order18["estimated_log_means"],
)
order18['log_stdevs'] = np.where(
    np.isfinite(order18["history_log_stdevs"]) & (order18["history_log_stdevs"] > 0),
    order18["history_log_stdevs"],
    order18["estimated_log_stdevs"],
)
order18.head()


Unnamed: 0,team_id,team,team_base_name,leg,leg_dist,name,estimated_log_means,estimated_log_stdevs,history_log_means,history_log_stdevs,log_means,log_stdevs
0,1,IFK Göteborg 1,IFK Göteborg,1,11.0,Max Peter Bejmer,1.943138,0.084941,1.833561,0.157965,1.833561,0.157965
1,1,IFK Göteborg 1,IFK Göteborg,2,11.9,Johan Högstarnd,1.943138,0.084941,0.0,0.0,1.943138,0.084941
2,1,IFK Göteborg 1,IFK Göteborg,3,12.8,Vetle Ruud Bråten,1.943138,0.084941,1.748671,0.030981,1.748671,0.030981
3,1,IFK Göteborg 1,IFK Göteborg,4,8.7,Jonas Pilblad,1.943138,0.084941,1.803509,0.061865,1.803509,0.061865
4,1,IFK Göteborg 1,IFK Göteborg,5,8.7,Jens Wängdahl,1.943138,0.084941,1.802502,0.079207,1.802502,0.079207


In [838]:
# Calculate personal estimates.
# scipy parameterises the log-normal with s = sigma and scale = exp(mu).
# NOTE: despite its name, `log_means` below holds exp(mu), i.e. the scale.
log_stdevs = order18["log_stdevs"]
log_means = np.exp(order18["log_means"])

# Freeze the distribution once and query it for each statistic.
pace_distribution = lognorm(s=log_stdevs, scale=log_means)
intervals95 = pace_distribution.interval(0.95)
means = pace_distribution.mean()
medians = pace_distribution.median()

In [839]:
# Multiply the per-runner pace statistics by the leg distance to get the
# individual leg estimates (95% interval bounds, mean and median).
order18 = order18.assign(
    ind_95_start=(intervals95[0] * order18["leg_dist"]).values,
    ind_95_end=(intervals95[1] * order18["leg_dist"]).values,
    ind_mean=(means * order18["leg_dist"]).values,
    ind_median=(medians * order18["leg_dist"]).values,
)

In [840]:
# Pivot to one row per team: the leg level moves into the columns, giving a
# (field, leg) column MultiIndex.
by_teams = order18.set_index(["team_id", "leg"]).unstack(level="leg")
by_teams.head()

Unnamed: 0_level_0,team,team,team,team,team,team,team,team_base_name,team_base_name,team_base_name,...,ind_mean,ind_mean,ind_mean,ind_median,ind_median,ind_median,ind_median,ind_median,ind_median,ind_median
leg,1,2,3,4,5,6,7,1,2,3,...,5,6,7,1,2,3,4,5,6,7
team_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,IFK Göteborg 1,IFK Göteborg 1,IFK Göteborg 1,IFK Göteborg 1,IFK Göteborg 1,IFK Göteborg 1,IFK Göteborg 1,IFK Göteborg,IFK Göteborg,IFK Göteborg,...,52.929556,59.2164,86.031925,68.817343,83.069436,73.561099,52.816934,52.763784,59.215628,86.022319
2,Koovee 1,Koovee 1,Koovee 1,Koovee 1,Koovee 1,Koovee 1,Koovee 1,Koovee,Koovee,Koovee,...,54.949686,60.995212,87.455533,67.486865,72.104171,75.980847,54.32049,54.658743,60.8765,87.097402
3,Tampereen Pyrintö 1,Tampereen Pyrintö 1,Tampereen Pyrintö 1,Tampereen Pyrintö 1,Tampereen Pyrintö 1,Tampereen Pyrintö 1,Tampereen Pyrintö 1,Tampereen Pyrintö,Tampereen Pyrintö,Tampereen Pyrintö,...,53.411364,61.058061,87.339752,67.715767,73.400057,75.555759,55.021214,53.34751,61.026089,87.281419
4,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering,Södertälje Nykvarn Orientering,Södertälje Nykvarn Orientering,...,53.156447,75.761993,87.753882,65.922895,73.799486,74.831947,51.467591,53.000857,75.488853,87.492446
5,Halden SK 1,Halden SK 1,Halden SK 1,Halden SK 1,Halden SK 1,Halden SK 1,Halden SK 1,Halden SK,Halden SK,Halden SK,...,62.097304,75.79495,85.633899,64.133501,75.142332,77.273814,59.881118,61.654866,75.521582,85.578863


In [841]:
# Remove teams missing some runners: keep a team only when log_means is
# finite for every one of legs 1..7. The team count is printed before and
# after the filter.
print(len(by_teams))
all_legs_present = np.isfinite(by_teams.log_means[list(range(1, 8))]).all(axis=1)
by_teams = by_teams[all_legs_present]
print(len(by_teams))

1858
1843


In [842]:
# The sum of log-normal variables has no closed-form distribution, so we
# simulate many runs for each runner, add them up leg by leg and compute the
# statistics on the simulated cumulative times.
def simulate_relay_estimates(row, n_samples=10000, random_state=None):
    """Simulate a team's cumulative time after each of the 7 legs.

    Parameters
    ----------
    row : pandas.Series
        One row of the unstacked team frame, indexed by (field, leg). Reads
        ``log_means``, ``log_stdevs`` and ``leg_dist`` for legs 1..7 (and
        ``name`` for the diagnostic print when a log mean is NaN).
    n_samples : int, default 10000
        Number of simulated runs per leg.
    random_state : None, int, numpy.random.Generator or RandomState
        ``None`` keeps scipy's default (global numpy random state). Any other
        value makes the simulation reproducible.

    Returns
    -------
    pandas.Series
        For each leg 1..7, in this order: ``fin_start95_{leg}`` (2.5%
        quantile), ``fin_end95_{leg}`` (97.5% quantile), ``fin_median_{leg}``,
        ``fin_mean_{leg}``, ``fin_sum_log_mean_{leg}`` and
        ``fin_sum_log_std_{leg}`` (mean and population standard deviation of
        the log of the cumulative time).
    """
    legs = list(range(1, 8))

    # A plain seed must become ONE generator shared by all legs: handing the
    # same integer to every rvs call would replay the same random stream and
    # make the legs perfectly correlated.
    if random_state is not None and not isinstance(
        random_state, (np.random.RandomState, np.random.Generator)
    ):
        random_state = np.random.default_rng(random_state)

    leg_samples = []
    for leg in legs:
        if np.isnan(row["log_means"][leg]):
            # Diagnostic only; the NaN propagates into the results below.
            print(row["log_means"])
            print(row["name"])
        leg_samples.append(
            row["leg_dist"][leg]
            * lognorm.rvs(
                s=row["log_stdevs"][leg],
                scale=np.exp(row["log_means"][leg]),
                size=n_samples,
                random_state=random_state,
            )
        )

    # Column k holds leg_1 + ... + leg_k for every simulated run. A plain
    # numpy cumsum propagates NaN exactly like an explicit sum would.
    cumulative = np.cumsum(np.column_stack(leg_samples), axis=1)
    samples_sums = pd.DataFrame(cumulative, columns=legs)

    sum_logs = np.log(samples_sums)
    # Explicit column-wise reductions: np.mean / np.std on a DataFrame rely on
    # pandas' axis=None behaviour, which differs between pandas versions.
    stats_by_prefix = {
        "fin_start95": samples_sums.quantile(0.025),
        "fin_end95": samples_sums.quantile(0.975),
        "fin_median": samples_sums.median(),
        "fin_mean": samples_sums.mean(),
        "fin_sum_log_mean": sum_logs.mean(axis=0),
        "fin_sum_log_std": sum_logs.std(axis=0, ddof=0),
    }

    new_cols = {
        f"{prefix}_{leg}": per_leg.values[leg - 1]
        for prefix, per_leg in stats_by_prefix.items()
        for leg in legs
    }
    return pd.Series(new_cols)

# Run the simulation once per team (row-wise); the result has one row per
# team and one column per statistic and leg. The simulation draws random
# samples, so the numbers vary between runs.
relay_estimates = by_teams.apply(simulate_relay_estimates, axis=1)
relay_estimates.head()

Unnamed: 0_level_0,fin_start95_1,fin_start95_2,fin_start95_3,fin_start95_4,fin_start95_5,fin_start95_6,fin_start95_7,fin_end95_1,fin_end95_2,fin_end95_3,...,fin_sum_log_mean_5,fin_sum_log_mean_6,fin_sum_log_mean_7,fin_sum_log_std_1,fin_sum_log_std_2,fin_sum_log_std_3,fin_sum_log_std_4,fin_sum_log_std_5,fin_sum_log_std_6,fin_sum_log_std_7
team_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,50.67545,129.453546,202.677491,254.324642,305.889288,365.071826,451.140539,93.449436,180.665287,254.497714,...,5.805922,5.970047,6.168793,0.156181,0.084908,0.058246,0.048863,0.043095,0.036602,0.030141
2,57.648165,125.118938,198.325044,251.890383,304.319323,363.896899,446.510564,78.841542,156.825002,236.008962,...,5.785125,5.956755,6.160633,0.080114,0.0573,0.043972,0.036564,0.034748,0.030935,0.03063
3,55.806404,122.051896,194.175708,246.03484,299.334724,360.152129,446.387433,82.505734,164.084386,243.328373,...,5.787038,5.958873,6.162339,0.100813,0.075779,0.058167,0.051999,0.044154,0.037523,0.03133
4,56.052528,127.755193,198.372198,247.816556,299.529829,372.093848,456.689274,77.688166,153.697514,232.861103,...,5.767635,5.980091,6.180765,0.082897,0.04689,0.040111,0.037092,0.033472,0.031607,0.02912
5,60.915666,126.402314,196.951123,255.450802,313.519421,386.609211,471.731699,67.450683,154.885046,239.361172,...,5.826406,6.028229,6.215854,0.025651,0.051836,0.049759,0.041816,0.040918,0.036694,0.031097


In [843]:
# Number of teams that received simulated estimates.
len(relay_estimates)
# Optional inspection of the first legs' log-sum parameters:
#relay_estimates[["fin_sum_log_mean_1", "fin_sum_log_std_1", "fin_sum_log_mean_2", "fin_sum_log_std_2", "fin_sum_log_mean_3", "fin_sum_log_std_3"]]

1843

In [844]:
# Flatten the two-level (field, leg) column index into plain "field_leg"
# names on a copy of by_teams.
by_teams_flat = by_teams.copy()
by_teams_flat.columns = [f"{field}_{leg}" for field, leg in by_teams_flat.columns]
# Display only: the reset_index() result is not assigned, so by_teams_flat
# keeps team_id as its index.
by_teams_flat.reset_index()


Unnamed: 0,team_id,team_1,team_2,team_3,team_4,team_5,team_6,team_7,team_base_name_1,team_base_name_2,...,ind_mean_5,ind_mean_6,ind_mean_7,ind_median_1,ind_median_2,ind_median_3,ind_median_4,ind_median_5,ind_median_6,ind_median_7
0,1,IFK Göteborg 1,IFK Göteborg 1,IFK Göteborg 1,IFK Göteborg 1,IFK Göteborg 1,IFK Göteborg 1,IFK Göteborg 1,IFK Göteborg,IFK Göteborg,...,52.929556,59.216400,86.031925,68.817343,83.069436,73.561099,52.816934,52.763784,59.215628,86.022319
1,2,Koovee 1,Koovee 1,Koovee 1,Koovee 1,Koovee 1,Koovee 1,Koovee 1,Koovee,Koovee,...,54.949686,60.995212,87.455533,67.486865,72.104171,75.980847,54.320490,54.658743,60.876500,87.097402
2,3,Tampereen Pyrintö 1,Tampereen Pyrintö 1,Tampereen Pyrintö 1,Tampereen Pyrintö 1,Tampereen Pyrintö 1,Tampereen Pyrintö 1,Tampereen Pyrintö 1,Tampereen Pyrintö,Tampereen Pyrintö,...,53.411364,61.058061,87.339752,67.715767,73.400057,75.555759,55.021214,53.347510,61.026089,87.281419
3,4,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering 1,Södertälje Nykvarn Orientering,Södertälje Nykvarn Orientering,...,53.156447,75.761993,87.753882,65.922895,73.799486,74.831947,51.467591,53.000857,75.488853,87.492446
4,5,Halden SK 1,Halden SK 1,Halden SK 1,Halden SK 1,Halden SK 1,Halden SK 1,Halden SK 1,Halden SK,Halden SK,...,62.097304,75.794950,85.633899,64.133501,75.142332,77.273814,59.881118,61.654866,75.521582,85.578863
5,6,Helsingin Suunnistajat 1,Helsingin Suunnistajat 1,Helsingin Suunnistajat 1,Helsingin Suunnistajat 1,Helsingin Suunnistajat 1,Helsingin Suunnistajat 1,Helsingin Suunnistajat 1,Helsingin Suunnistajat,Helsingin Suunnistajat,...,56.917205,73.143192,95.454671,71.067530,81.893692,79.259454,56.622836,56.567705,72.479759,95.225971
6,7,Paimion Rasti 1,Paimion Rasti 1,Paimion Rasti 1,Paimion Rasti 1,Paimion Rasti 1,Paimion Rasti 1,Paimion Rasti 1,Paimion Rasti,Paimion Rasti,...,52.844717,75.860905,89.062427,67.872451,72.090295,76.532249,52.029271,52.287983,75.587084,88.886257
7,8,Hiidenkiertäjät 1,Hiidenkiertäjät 1,Hiidenkiertäjät 1,Hiidenkiertäjät 1,Hiidenkiertäjät 1,Hiidenkiertäjät 1,Hiidenkiertäjät 1,Hiidenkiertäjät,Hiidenkiertäjät,...,55.117340,67.175768,89.509771,64.842688,81.656738,74.557289,54.048333,55.029472,66.579191,89.286199
8,9,Frol IL 1,Frol IL 1,Frol IL 1,Frol IL 1,Frol IL 1,Frol IL 1,Frol IL 1,Frol IL,Frol IL,...,53.811411,65.225565,95.034567,72.038454,73.169619,82.662063,52.883030,53.695706,65.052225,94.431474
9,10,OK Ravinen 1,OK Ravinen 1,OK Ravinen 1,OK Ravinen 1,OK Ravinen 1,OK Ravinen 1,OK Ravinen 1,OK Ravinen,OK Ravinen,...,62.767118,72.069080,91.075649,71.783890,82.680350,74.387573,57.043640,62.340784,71.992580,90.607479


In [845]:
# Put the per-team inputs and the simulated relay statistics side by side,
# aligned on the index; the inner join keeps only index values present in both.
estimates = pd.concat([by_teams_flat, relay_estimates], axis=1, join='inner')

In [846]:
# Convert minutes to date and times: every cumulative estimate (minutes from
# the start) becomes a timestamp offset from start_timestamp.
start_timestamp = pd.Timestamp(year=2018, month=6, day=16, hour=23)

for leg in range(1, 8):
    for stat in ("median", "start95", "end95"):
        minutes_from_start = estimates[f"fin_{stat}_{leg}"]
        estimates[f"fint_{stat}_{leg}"] = pd.to_datetime(
            minutes_from_start * 60, unit="s", origin=start_timestamp
        )


In [847]:
# Sort teams by estimated total time (median after leg 7), smallest first,
# and save the result as a tab-separated file.
estimates = estimates.sort_values(by="fin_median_7", ascending=True)

estimates.to_csv("data/team_estimates_ju2018.tsv", sep="\t")

In [848]:
# First ten teams of the sorted frame with their leg-7 median.
estimates[["team_1", "fin_median_7"]].head(10)

Unnamed: 0_level_0,team_1,fin_median_7
team_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,Koovee 1,473.687114
3,Tampereen Pyrintö 1,474.383754
17,IL Tyrving 1,474.914994
1,IFK Göteborg 1,477.294272
35,OLV Baselland 1,481.824733
22,Turun Metsänkävijät 1,482.546375
4,Södertälje Nykvarn Orientering 1,483.388516
21,IFK Lidingö SOK 1,483.593697
7,Paimion Rasti 1,486.728707
11,OK Linne 1,486.997335


In [849]:
# Select the teams whose name contains any of the fragments below and show
# their leg-7 median together with the simulated 95% interval.
followed_fragments = ["Reak", "Puskasil", "Rastihaukat"]
is_followed = estimates["team_1"].str.contains(followed_fragments[0])
for fragment in followed_fragments[1:]:
    is_followed = is_followed | estimates["team_1"].str.contains(fragment)
teams_to_follow = estimates[is_followed]
teams_to_follow[["team_1", "fin_median_7", "fin_start95_7", "fin_end95_7"]]

Unnamed: 0_level_0,team_1,fin_median_7,fin_start95_7,fin_end95_7
team_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
429,Reaktor Innovations 1,744.897317,698.264057,795.846271
568,Rastihaukat 1,751.536919,698.863921,809.785723
1270,Reaktor Innovations 2,882.548471,794.073074,993.109824
1246,Rastihaukat 2,909.867254,833.055264,993.343548
1131,Puskasilimät OK 1,912.824808,855.824923,977.891618


In [850]:
# Reshape from one row per team (columns "<field>_<leg>") to one row per
# (team_id, leg). The stub names are derived from the leg-1 columns.
# Use a suffix test: a substring test ('"_1" in c') would also pick up a
# column with "_1" in the middle of its name, and stripping the last two
# characters from such a name would produce a wrong stub.
leg_1_suffix = "_1"
leg_1_cols = [c for c in estimates.columns.values if c.endswith(leg_1_suffix)]
column_base_names = [c[:-len(leg_1_suffix)] for c in leg_1_cols]
runner_estimates = pd.wide_to_long(estimates.reset_index(), stubnames=column_base_names, i="team_id", j="leg", sep="_")
runner_estimates = runner_estimates.sort_values(by=['team_id', 'leg'])
runner_estimates = runner_estimates.drop(['team_base_name', 'estimated_log_means', 'estimated_log_stdevs'], axis=1)
runner_estimates.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,team,leg_dist,name,history_log_means,history_log_stdevs,log_means,log_stdevs,ind_95_start,ind_95_end,ind_mean,ind_median,fin_start95,fin_end95,fin_median,fin_mean,fin_sum_log_mean,fin_sum_log_std,fint_median,fint_start95,fint_end95
team_id,leg,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,IFK Göteborg 1,11.0,Max Peter Bejmer,1.833561,0.157965,1.833561,0.157965,50.493803,93.790257,69.681316,68.817343,50.67545,93.449436,68.872218,69.666324,4.231524,0.156181,2018-06-17 00:08:52.333102,2018-06-16 23:50:40.526991,2018-06-17 00:33:26.966159
1,2,IFK Göteborg 1,11.9,Johan Högstarnd,0.0,0.0,1.943138,0.084941,70.329796,98.116751,83.369649,83.069436,129.453546,180.665287,152.644836,153.139793,5.027743,0.084908,2018-06-17 01:32:38.690171,2018-06-17 01:09:27.212763,2018-06-17 02:00:39.917233
1,3,IFK Göteborg 1,12.8,Vetle Ruud Bråten,1.748671,0.030981,1.748671,0.030981,69.227322,78.166179,73.596409,73.561099,202.677491,254.497714,226.275233,226.702027,5.421937,0.058246,2018-06-17 02:46:16.513961,2018-06-17 02:22:40.649446,2018-06-17 03:14:29.862866
1,4,IFK Göteborg 1,8.7,Jonas Pilblad,1.803509,0.061865,1.803509,0.061865,46.785718,59.625643,52.918105,52.816934,254.324642,308.209131,279.188774,279.625613,5.632256,0.048863,2018-06-17 03:39:11.326460,2018-06-17 03:14:19.478501,2018-06-17 04:08:12.547835
1,5,IFK Göteborg 1,8.7,Jens Wängdahl,1.802502,0.079207,1.802502,0.079207,45.176752,61.62499,52.929556,52.763784,305.889288,362.115189,332.037501,332.570468,5.805922,0.043095,2018-06-17 04:32:02.250049,2018-06-17 04:05:53.357275,2018-06-17 05:02:06.911367


In [851]:
# Save the per-runner (team_id, leg) estimates as a tab-separated file.
runner_estimates.to_csv('data/runner_estimates_ju2018.tsv', sep="\t")

In [852]:
# Inspect the pace parameters and individual medians of teams whose name
# contains "Reak".
runner_estimates[runner_estimates['team'].str.contains("Reak")][["name", "log_means", "log_stdevs", "ind_median"]]


Unnamed: 0_level_0,Unnamed: 1_level_0,name,log_means,log_stdevs,ind_median
team_id,leg,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
429,1,Mikko Peltonen,2.184951,0.081299,97.792383
429,2,Pasi Huhtiniemi,2.222239,0.072482,109.812846
429,3,Joni Freeman,2.130879,0.113846,107.805019
429,4,Tuomas Kareinen,2.29324,0.16007,86.19078
429,5,Janne Vaittinen,2.324384,0.039861,88.917301
429,6,Olavi Kanerva,2.128667,0.092121,90.759473
429,7,Jyri Kytömäki,2.355319,0.048494,161.284847
1270,1,Antti-Ville Jokela,2.363116,0.10623,116.864
1270,2,Karri-Pekka Laakso,2.414677,0.081841,133.115223
1270,3,Oskari Pirttikoski,2.637533,0.194072,178.927107


In [853]:
# Printable table: indexed by team_id, numbers rounded to 2 decimals and the
# estimated clock times shortened to HH:MM.
for_print = (
    runner_estimates.copy()
    .reset_index()
    .set_index("team_id")
    .round(2)
)
for time_col in ("fint_median", "fint_start95", "fint_end95"):
    for_print[time_col] = for_print[time_col].dt.strftime("%H:%M")

print_columns = [
    "team",
    "leg",
    "name",
    "ind_median",
    "ind_95_start",
    "ind_95_end",
    "fin_median",
    "fint_median",
    "fint_start95",
    "fint_end95",
]
for_print = for_print[print_columns]
for_print.head()

Unnamed: 0_level_0,team,leg,name,ind_median,ind_95_start,ind_95_end,fin_median,fint_median,fint_start95,fint_end95
team_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,IFK Göteborg 1,1,Max Peter Bejmer,68.82,50.49,93.79,68.87,00:08,23:50,00:33
1,IFK Göteborg 1,2,Johan Högstarnd,83.07,70.33,98.12,152.64,01:32,01:09,02:00
1,IFK Göteborg 1,3,Vetle Ruud Bråten,73.56,69.23,78.17,226.28,02:46,02:22,03:14
1,IFK Göteborg 1,4,Jonas Pilblad,52.82,46.79,59.63,279.19,03:39,03:14,04:08
1,IFK Göteborg 1,5,Jens Wängdahl,52.76,45.18,61.62,332.04,04:32,04:05,05:02


In [854]:
# Save the printable table as a tab-separated file in the working directory.
for_print.to_csv('for_print_ju2018_after.tsv', sep="\t")

In [855]:
# Show the printable rows for the followed teams (name contains "Reak",
# "Puskasil" or "Rastihaukat 2").
for_print[for_print['team'].str.contains("Reak") | for_print['team'].str.contains("Puskasil") | for_print['team'].str.contains("Rastihaukat 2")]


Unnamed: 0_level_0,team,leg,name,ind_median,ind_95_start,ind_95_end,fin_median,fint_median,fint_start95,fint_end95
team_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
429,Reaktor Innovations 1,1,Mikko Peltonen,97.79,83.39,114.68,97.77,00:37,00:23,00:54
429,Reaktor Innovations 1,2,Pasi Huhtiniemi,109.81,95.27,126.58,207.9,02:27,02:07,02:51
429,Reaktor Innovations 1,3,Joni Freeman,107.81,86.24,134.75,315.86,04:15,03:45,04:51
429,Reaktor Innovations 1,4,Tuomas Kareinen,86.19,62.98,117.95,402.74,05:42,05:03,06:28
429,Reaktor Innovations 1,5,Janne Vaittinen,88.92,82.24,96.14,491.89,07:11,06:31,07:57
429,Reaktor Innovations 1,6,Olavi Kanerva,90.76,75.77,108.72,583.03,08:43,07:59,09:32
429,Reaktor Innovations 1,7,Jyri Kytömäki,161.28,146.66,177.37,744.9,11:24,10:38,12:15
1131,Puskasilimät OK 1,1,Tuomas Ketonen,124.33,101.43,152.41,124.44,01:04,00:41,01:33
1131,Puskasilimät OK 1,2,Petri Miettinen,131.06,122.12,140.64,255.72,03:15,02:50,03:45
1131,Puskasilimät OK 1,3,Jaakko Havola,146.75,135.82,158.57,402.65,05:42,05:15,06:14


In [856]:
# Web export table: flat index, clock times as HH:MM strings, and only the
# columns listed below.
for_web = runner_estimates.copy().reset_index()
for time_col in ("fint_median", "fint_start95", "fint_end95"):
    for_web[time_col] = for_web[time_col].dt.strftime("%H:%M")

web_columns = [
    "team_id",
    "leg",
    "team",
    "name",
    "fin_sum_log_mean",
    "fin_sum_log_std",
    "fin_median",
    "fint_median",
    "fint_start95",
    "fint_end95",
]
for_web = for_web[web_columns]
for_web

Unnamed: 0,team_id,leg,team,name,fin_sum_log_mean,fin_sum_log_std,fin_median,fint_median,fint_start95,fint_end95
0,1,1,IFK Göteborg 1,Max Peter Bejmer,4.231524,0.156181,68.872218,00:08,23:50,00:33
1,1,2,IFK Göteborg 1,Johan Högstarnd,5.027743,0.084908,152.644836,01:32,01:09,02:00
2,1,3,IFK Göteborg 1,Vetle Ruud Bråten,5.421937,0.058246,226.275233,02:46,02:22,03:14
3,1,4,IFK Göteborg 1,Jonas Pilblad,5.632256,0.048863,279.188774,03:39,03:14,04:08
4,1,5,IFK Göteborg 1,Jens Wängdahl,5.805922,0.043095,332.037501,04:32,04:05,05:02
5,1,6,IFK Göteborg 1,Fredrik Bakkman,5.970047,0.036602,391.235543,05:31,05:05,06:01
6,1,7,IFK Göteborg 1,Eskil Kinneberg,6.168793,0.030141,477.294272,06:57,06:31,07:27
7,2,1,Koovee 1,Joni Hirvikallio,4.212205,0.080114,67.529805,00:07,23:57,00:18
8,2,2,Koovee 1,Topi Anjala,4.939871,0.057300,139.710263,01:19,01:05,01:36
9,2,3,Koovee 1,Lauri Sild,5.375380,0.043972,215.917583,02:35,02:18,02:56


In [857]:
# Write the web table as a JSON array with one object per row.
for_web.to_json('web-lib/for_web_ju2018_after.json', orient="records")

In [889]:
# Read the actual times after the race (tab-separated) so that the estimates
# can be compared against them.
results18 = pd.read_csv('data/csv-results_j2018_ju.tsv', delimiter="\t")

In [890]:
# Keep only the join keys and the leg time, and divide the leg time by 60
# (presumably seconds -> minutes; confirm against the results file).
# Done as a single expression so that no column is assigned on a slice of
# the original frame, which would trigger pandas' chained-assignment warning.
# NOTE: re-running this cell without re-reading the file divides by 60 again.
results18 = results18[["team-id", "leg-nro", "leg-time"]].assign(
    **{"leg-time": lambda frame: frame["leg-time"] / 60}
)
results18.head()

Unnamed: 0,team-id,leg-nro,leg-time
0,2,1,64.6
1,2,2,70.0
2,2,3,74.116667
3,2,4,50.183333
4,2,5,48.75


In [949]:
# Rename the key columns to team_id / leg. `index=str` additionally
# converts the row labels to strings.
results18 = results18.rename(index=str, columns={"team-id": "team_id", "leg-nro": "leg"})
runner_estimates.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,team,leg_dist,name,history_log_means,history_log_stdevs,log_means,log_stdevs,ind_95_start,ind_95_end,ind_mean,ind_median,fin_start95,fin_end95,fin_median,fin_mean,fin_sum_log_mean,fin_sum_log_std,fint_median,fint_start95,fint_end95
team_id,leg,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,IFK Göteborg 1,11.0,Max Peter Bejmer,1.833561,0.157965,1.833561,0.157965,50.493803,93.790257,69.681316,68.817343,50.67545,93.449436,68.872218,69.666324,4.231524,0.156181,2018-06-17 00:08:52.333102,2018-06-16 23:50:40.526991,2018-06-17 00:33:26.966159
1,2,IFK Göteborg 1,11.9,Johan Högstarnd,0.0,0.0,1.943138,0.084941,70.329796,98.116751,83.369649,83.069436,129.453546,180.665287,152.644836,153.139793,5.027743,0.084908,2018-06-17 01:32:38.690171,2018-06-17 01:09:27.212763,2018-06-17 02:00:39.917233
1,3,IFK Göteborg 1,12.8,Vetle Ruud Bråten,1.748671,0.030981,1.748671,0.030981,69.227322,78.166179,73.596409,73.561099,202.677491,254.497714,226.275233,226.702027,5.421937,0.058246,2018-06-17 02:46:16.513961,2018-06-17 02:22:40.649446,2018-06-17 03:14:29.862866
1,4,IFK Göteborg 1,8.7,Jonas Pilblad,1.803509,0.061865,1.803509,0.061865,46.785718,59.625643,52.918105,52.816934,254.324642,308.209131,279.188774,279.625613,5.632256,0.048863,2018-06-17 03:39:11.326460,2018-06-17 03:14:19.478501,2018-06-17 04:08:12.547835
1,5,IFK Göteborg 1,8.7,Jens Wängdahl,1.802502,0.079207,1.802502,0.079207,45.176752,61.62499,52.929556,52.763784,305.889288,362.115189,332.037501,332.570468,5.805922,0.043095,2018-06-17 04:32:02.250049,2018-06-17 04:05:53.357275,2018-06-17 05:02:06.911367


In [950]:
# Attach the actual leg time to every estimate (left join on team_id and
# leg), then keep only the rows that have a finite actual time.
with_result = pd.merge(runner_estimates, results18, how='left', on=['team_id', 'leg'])
# .copy(): later cells add columns to with_result; an explicit copy keeps
# those assignments from raising SettingWithCopyWarning on a filtered frame.
with_result = with_result[np.isfinite(with_result["leg-time"])].copy()

In [951]:
# Individual-leg accuracy: signed error of the median estimate, the same
# error relative to the actual time, and whether the actual time lies
# strictly inside the individual 95% interval.
actual_leg_time = with_result["leg-time"]
with_result["ind_error"] = with_result["ind_median"] - actual_leg_time
with_result["ind_error_perc"] = with_result["ind_error"] / actual_leg_time
with_result["ind_in_int"] = (actual_leg_time > with_result["ind_95_start"]) & (actual_leg_time < with_result["ind_95_end"])
# Optional inspection of selected teams, worst relative errors first:
#viewable = with_result[['team_id', "team", 'leg', "name", "ind_median", "leg-time", "ind_error", "ind_error_perc", "ind_in_int"]]
#viewable[viewable.team_id.isin([1270, 429, 1131, 1178, 1089])].sort_values(by=['ind_error_perc', 'ind_error'],ascending=False)

Unnamed: 0,team_id,team,leg,name,ind_median,leg-time,ind_error,ind_error_perc,ind_in_int
7482,1089,IC-Electrofit 1,7,Henri Äijö,160.391118,121.383333,39.007785,0.32136,False
7756,1131,Puskasilimät OK 1,1,Tuomas Ketonen,124.333,102.4,21.933,0.214189,True
8079,1178,ÄHPM! - Älä hitossa peesaa meitä! 1,2,Anniina Erkkilä,189.9716,158.366667,31.604933,0.199568,True
7478,1089,IC-Electrofit 1,3,Simo-Viljami Ojanen,143.194939,124.15,19.044939,0.153403,True
7762,1131,Puskasilimät OK 1,7,Anssi Pesonen,175.5981,153.6,21.9981,0.143217,True
2949,429,Reaktor Innovations 1,3,Joni Freeman,107.805019,94.366667,13.438352,0.142406,True
7761,1131,Puskasilimät OK 1,6,Esko Harjama,123.040544,107.95,15.090544,0.139792,True
8083,1178,ÄHPM! - Älä hitossa peesaa meitä! 1,6,Linda Wiksten,156.352701,137.183333,19.169367,0.139735,True
2952,429,Reaktor Innovations 1,6,Olavi Kanerva,90.759473,79.983333,10.77614,0.13473,True
8084,1178,ÄHPM! - Älä hitossa peesaa meitä! 1,7,Timo Waltari,177.8951,161.233333,16.661767,0.103339,True


In [952]:
def rmse(predictions, targets):
    """Return the root-mean-square error between predictions and targets.

    Both arguments must support element-wise subtraction and the result must
    offer a ``.mean()`` method (numpy arrays and pandas Series do).
    """
    residuals = predictions - targets
    mean_squared_error = (residuals ** 2).mean()
    return np.sqrt(mean_squared_error)
# RMSE of the individual median estimates against the actual leg times.
rmse(with_result.ind_median, with_result["leg-time"])

26.195444841518498

In [953]:
# Mean absolute error of the individual median estimates.
# mean_absolute_error is already imported in the notebook's import cell, so
# it is not imported again here.
mean_absolute_error(with_result["leg-time"], with_result.ind_median)

16.784668681626297

In [954]:
# Mean absolute relative error of the individual estimates.
with_result["ind_error_perc"].abs().mean()

0.13543036139562353

In [955]:
# Mean signed relative error (bias); positive means the median estimates were
# on average larger than the actual leg times.
with_result["ind_error_perc"].mean()

0.02208268145488205

In [956]:
# Share of legs whose actual time fell inside the individual 95% interval.
with_result["ind_in_int"].mean()

0.7499410052701959

In [967]:
def get_estimate_params(result_row):
    """Return the team's cumulative actual time up to and including this leg.

    Adds up "leg-time" over the rows of the global ``with_result`` that belong
    to ``result_row.team_id`` and whose leg is in 1..``result_row.leg``.

    NOTE: this reuses the name of the history-lookup helper defined earlier in
    the notebook and shadows it once this cell has run.
    """
    same_team = with_result.team_id == result_row.team_id
    up_to_this_leg = with_result.leg.isin(list(range(1, result_row.leg + 1)))
    return np.sum(with_result.loc[same_team & up_to_this_leg, "leg-time"])

# Debugging aid: uncomment to restrict the calculation to a few teams.
#with_result = with_result[with_result.team_id.isin([1270, 429, 1131, 1178, 1089])]

# Actual cumulative time for every (team, leg) row, computed row by row.
with_result["real_mins"] = with_result.apply(get_estimate_params, axis=1)
with_result.head()

Unnamed: 0,team_id,leg,team,leg_dist,name,history_log_means,history_log_stdevs,log_means,log_stdevs,ind_95_start,...,fint_start95,fint_end95,leg-time,ind_error,ind_error_perc,ind_in_int,real_mins,team_error,team_error_perc,team_in_int
0,1,1,IFK Göteborg 1,11.0,Max Peter Bejmer,1.833561,0.157965,1.833561,0.157965,50.493803,...,2018-06-16 23:50:40.526991,2018-06-17 00:33:26.966159,63.816667,5.000676,0.07836,True,63.816667,5.055552,0.07922,True
1,1,2,IFK Göteborg 1,11.9,Johan Högstarnd,0.0,0.0,1.943138,0.084941,70.329796,...,2018-06-17 01:09:27.212763,2018-06-17 02:00:39.917233,68.95,14.119436,0.204778,False,132.766667,19.87817,0.149723,True
2,1,3,IFK Göteborg 1,12.8,Vetle Ruud Bråten,1.748671,0.030981,1.748671,0.030981,69.227322,...,2018-06-17 02:22:40.649446,2018-06-17 03:14:29.862866,70.166667,3.394432,0.048377,True,202.933333,23.341899,0.115023,True
3,1,4,IFK Göteborg 1,8.7,Jonas Pilblad,1.803509,0.061865,1.803509,0.061865,46.785718,...,2018-06-17 03:14:19.478501,2018-06-17 04:08:12.547835,57.283333,-4.466399,-0.07797,True,260.216667,18.972108,0.072909,True
4,1,5,IFK Göteborg 1,8.7,Jens Wängdahl,1.802502,0.079207,1.802502,0.079207,45.176752,...,2018-06-17 04:05:53.357275,2018-06-17 05:02:06.911367,50.683333,2.08045,0.041048,True,310.9,21.137501,0.067988,True


In [968]:
# RMSE of the cumulative median estimates against the actual cumulative times.
rmse(with_result.fin_median, with_result["real_mins"])

67.22883501822652

In [969]:
# Mean absolute error of the cumulative median estimates.
mean_absolute_error(with_result["real_mins"], with_result.fin_median)

40.199956090003596

In [970]:
# Cumulative (team-level) accuracy: signed error of the cumulative median,
# the same error relative to the actual cumulative time, and whether the
# actual cumulative time lies strictly inside the simulated 95% interval.
actual_cumulative = with_result["real_mins"]
with_result["team_error"] = with_result["fin_median"] - actual_cumulative
with_result["team_error_perc"] = with_result["team_error"] / actual_cumulative
with_result["team_in_int"] = (actual_cumulative > with_result["fin_start95"]) & (actual_cumulative < with_result["fin_end95"])


In [971]:
# Mean absolute relative error of the cumulative estimates.
with_result["team_error_perc"].abs().mean()

0.09468762805661657

In [972]:
# Share of rows whose actual cumulative time fell inside the simulated 95%
# interval.
with_result["team_in_int"].mean()

0.7316133092110438

In [973]:
# Spot check: interval bounds next to the actual cumulative time.
with_result[['team_id', "team", 'leg',  'leg_dist', "name", "fin_start95", "real_mins", "fin_end95", "team_in_int"]].head()

Unnamed: 0,team_id,team,leg,leg_dist,name,fin_start95,real_mins,fin_end95,team_in_int
0,1,IFK Göteborg 1,1,11.0,Max Peter Bejmer,50.67545,63.816667,93.449436,True
1,1,IFK Göteborg 1,2,11.9,Johan Högstarnd,129.453546,132.766667,180.665287,True
2,1,IFK Göteborg 1,3,12.8,Vetle Ruud Bråten,202.677491,202.933333,254.497714,True
3,1,IFK Göteborg 1,4,8.7,Jonas Pilblad,254.324642,260.216667,308.209131,True
4,1,IFK Göteborg 1,5,8.7,Jens Wängdahl,305.889288,310.9,362.115189,True


In [986]:
# Web export of estimates together with the actual cumulative times. The
# naive estimate timestamps are localised to the 'EET' zone and converted to
# UTC before the selected columns are written as JSON records.
with_result_web = with_result.copy().reset_index()
for time_col in ("fint_median", "fint_start95", "fint_end95"):
    with_result_web[time_col] = (
        with_result_web[time_col].dt.tz_localize("EET").dt.tz_convert("UTC")
    )

result_web_columns = [
    "team_id",
    "leg",
    "team",
    "name",
    "fin_sum_log_mean",
    "fin_sum_log_std",
    "fin_median",
    "real_mins",
    "fint_median",
    "fint_start95",
    "fint_end95",
]
with_result_web = with_result_web[result_web_columns]
with_result_web.to_json("web-lib/with_result_ju2018.json", orient="records")