In [None]:
import pandas as pd
import numpy as np
from scipy.stats import lognorm
import matplotlib.pyplot as plt
import shared

# Race type and forecast year come from the project's `shared` module; below they
# select the results file name and the race start timestamp.
ve_or_ju = shared.race_type()
year = shared.forecast_year()
import time
# Wall-clock start of this notebook run; stored in the JSON report and used for
# the runtime log line in the last cell.
startTime = time.time()

In [None]:
# Load the pre-race forecasts written for the web UI; the three finish-time
# columns are parsed as datetimes so they can be compared with timestamps later.
finish_time_columns = ["fin_time_median", "fin_time_start95", "fin_time_end95"]
estimates = pd.read_json(
    f'web-lib/for_web_{shared.race_id_str()}.json',
    orient="records",
    convert_dates=finish_time_columns,
)



In [None]:
# Quick look at the first forecast rows, rounded for readability.
estimates.head().round(3)

In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# display() only adds a stray "None" to the cell output.
estimates.info()


In [None]:
# Load the actual post-race results so the forecasts can be scored against them.
results = pd.read_csv(f'data/results_with_dist_j{year}_{ve_or_ju}.tsv', sep="\t")

# Spot check on one team before reshaping.
display(results[results["team-id"] == 9].round(2))

# Keep only the columns needed for scoring, divide leg time by 60
# (presumably seconds -> minutes; the report texts use minutes), and rename the
# keys to match the column names used in `estimates`.
results = (
    results[["team-id", "leg-nro", "leg-time", "leg_distance"]]
    .assign(**{"leg-time": lambda frame: frame["leg-time"] / 60})
    .rename(index=str, columns={"team-id": "team_id", "leg-nro": "leg"})
)

# Row count of the results file, reported as the number of runners.
num_runners = len(results)
results[results["team_id"] == 9].round(3)

In [None]:

# Attach the realised leg time to every forecast row, then drop rows without a
# finite leg time (no matching result, NaN or infinite value).
with_result = (
    estimates
    .merge(results, how='left', on=['team_id', 'leg'])
    .loc[lambda merged: np.isfinite(merged["leg-time"])]
)

In [None]:
# Spot check: merged forecast + result rows for team 9.
with_result[with_result["team_id"] == 9].round(3)


In [None]:
# Individual point forecast for the leg: exp(ind_log_mean) scaled by the leg distance.
# NOTE(review): presumably ind_log_mean is a log pace per distance unit, making this
# a leg time in the same unit as "leg-time" -- confirm against the forecasting notebook.
with_result["ind_leg_time_estimate"] = np.exp(with_result["ind_log_mean"]) * with_result["leg_distance"]

In [None]:
# Score the individual forecasts against the realised leg times.
actual_leg_time = with_result["leg-time"]
estimated_leg_time = with_result["ind_leg_time_estimate"]

# Positive signed error means the runner took longer than forecast.
with_result["ind_error_signed"] = actual_leg_time - estimated_leg_time
with_result["ind_error"] = np.abs(estimated_leg_time - actual_leg_time)
with_result["ind_error_ratio"] = with_result["ind_error"] / actual_leg_time

# Interval misses: the realised time fell outside [personal_start95, personal_end95].
finished_before_interval = with_result["personal_start95"] > actual_leg_time
finished_after_interval = with_result["personal_end95"] < actual_leg_time
with_result["ind_interval_error"] = finished_before_interval | finished_after_interval
with_result["ind_interval_error_fast"] = finished_before_interval
with_result["ind_interval_error_slow"] = finished_after_interval


In [None]:
# The ten rows with the largest absolute individual forecast error.
with_result.sort_values(by=['ind_error']).tail(10).round(2)

In [None]:
import os

# Timestamp handed in by the surrounding run through the RUN_TS environment
# variable; falls back to "unknown" when it is not set.
execution_timestamp = os.getenv("RUN_TS",  "unknown")

# Short aliases for the scored columns.
signed_error = with_result["ind_error_signed"]
abs_error = with_result["ind_error"]
error_ratio = with_result["ind_error_ratio"]
interval_miss = with_result["ind_interval_error"]
interval_miss_fast = with_result["ind_interval_error_fast"]
interval_miss_slow = with_result["ind_interval_error_slow"]

# Summary statistics shared by the JSON report and the text report.
mean_signed_error = np.mean(signed_error)
mean_abs_error = np.mean(abs_error)
median_abs_error = np.median(abs_error)

# Machine-readable report: key -> {"value", "desc"}.
json_reports = {}
json_reports["signed_average_error_of_individual_forecast_ie_runner_was_late_minutes"] = {
    "value": mean_signed_error,
    "desc": "Yksilöennusteen etumerkillinen keskivirhe (juoksija myöhästyi ennusteesta)",
}
json_reports["average_error_of_individual_prediction_minutes"] = {
    "value": mean_abs_error,
    "desc": "Yksilöennusteen keskivirhe",
}
json_reports["median_error_of_individual_prediction_minutes"] = {
    "value": median_abs_error,
    "desc": "Yksilöennusteen mediaanivirhe",
}
json_reports["ratio_of_error_to_final_time"] = {
    "value": np.mean(error_ratio),
    "desc": "Virheen suhde loppuaikaan",
}
json_reports["individual_interval_prediction_is_wrong"] = {
    "value": np.mean(interval_miss),
    "desc": "Yksilön aikaväliennuste väärin",
}
json_reports["individual_faster_than_interval_prediction"] = {
    "value": np.mean(interval_miss_fast),
    "desc": "Yksilö nopeampi kuin aikaväliennuste",
}
json_reports["individual_slower_than_interval_forecast"] = {
    "value": np.mean(interval_miss_slow),
    "desc": "Yksilö hitaampi kuin aikaväliennuste",
}
json_reports["post_analysis_start_time"] = {
    "value": startTime,
    "desc": "Time of starting to run post race analysis",
}
json_reports["execution_timestamp"] = {
    "value": execution_timestamp,
    "desc": "Time of starting processing of multiple years",
}
json_reports["race_id"] = {
    "value": shared.race_id_str(),
    "desc": "Race type and year.",
}
json_reports["num_runners"] = {
    "value": num_runners,
    "desc": "Number of runners in final results.",
}

# Human-readable report lines (Finnish); shares are shown as percentages.
reports = [
    f'Yksilöennusteen etumerkillinen keskivirhe (juoksija myöhästyi ennusteesta): {mean_signed_error.round(1)} minuuttia',
    f'Yksilöennusteen keskivirhe: {mean_abs_error.round(1)} minuuttia',
    f'Yksilöennusteen mediaanivirhe: {median_abs_error.round(1)} minuuttia',
    f'Virheen suhde loppuaikaan: {np.mean(error_ratio * 100).round(1)} %',
    f'Yksilön aikaväliennuste väärin: {np.mean(interval_miss * 100).round(1)} %',
    f'Yksilö nopeampi kuin aikaväliennuste: {np.mean(interval_miss_fast * 100).round(1)} %',
    f'Yksilö hitaampi kuin aikaväliennuste: {np.mean(interval_miss_slow * 100).round(1)} %',
]

display(reports)

In [None]:
# Width of each runner's personal 95 % interval.
with_result["ind_interval"] = with_result["personal_end95"] - with_result["personal_start95"]

# Mean width, and mean width weighted by the share of interval misses.
mean_ind_interval = np.mean(with_result["ind_interval"])
mean_ind_interval_error_size = mean_ind_interval * np.mean(with_result["ind_interval_error"])

reports.extend([
    f"Yksilön aikavälin keskikoko: {mean_ind_interval.round(1)} minuuttia",
    f"Yksilön aikavälin keskikoko kertaa virhe: {mean_ind_interval_error_size.round(1)} minuuttia",
])
json_reports.update({
    "average_size_of_an_individual_interval_minutes": {
        "value": mean_ind_interval,
        "desc": "Yksilön aikavälin keskikoko",
    },
    "mean_ind_interval_error_size": {
        "value": mean_ind_interval_error_size,
        "desc": "Yksilön aikavälin keskikoko kertaa virhe",
    },
})
mean_ind_interval_error_size

In [None]:
# Median interval width, and that width weighted by the share of interval misses.
median_ind_interval = np.median(with_result["ind_interval"])
median_ind_interval_error_size = median_ind_interval * np.mean(with_result["ind_interval_error"])

reports.extend([
    f"Yksilön aikavälin mediaani koko: {median_ind_interval.round(1)} minuuttia",
    f"Yksilön aikavälin mediaani koko kertaa virhe: {median_ind_interval_error_size.round(1)} minuuttia",
])
json_reports.update({
    "median_ind_interval": {
        "value": median_ind_interval,
        "desc": "Yksilön aikavälin mediaani koko",
    },
    "median_ind_interval_times_error_size": {
        "value": median_ind_interval_error_size,
        "desc": "Yksilön aikavälin mediaani koko kertaa virhe",
    },
})
median_ind_interval_error_size

In [None]:
# Mean interval width and interval-miss share for each value of num_runs.
mean_ind_interval_error_by_num_runs = (
    with_result[["num_runs", "ind_interval", 'ind_interval_error']]
    .groupby('num_runs')
    .agg("mean")
)

# Width weighted by miss share, computed per num_runs group.
mean_ind_interval_error_by_num_runs["ind_interval_error_size"] = (
    mean_ind_interval_error_by_num_runs["ind_interval_error"]
    * mean_ind_interval_error_by_num_runs["ind_interval"]
)

rounded_by_num_runs = mean_ind_interval_error_by_num_runs.round(2)
reports.append(rounded_by_num_runs.to_string())
rounded_by_num_runs

In [None]:
# Per-num_runs summary of interval width, absolute error, interval-miss share
# (with group size) and signed error.
ind_columns = ["num_runs", "ind_interval", 'ind_error', 'ind_interval_error', 'ind_error_signed']
ind_aggregations = {
    'ind_interval': ["mean", "median"],
    'ind_error': ["mean", "median"],
    'ind_interval_error': ["mean", "count"],
    'ind_error_signed': ["mean", "median"],
}
ind_by_num_runs = (
    with_result[ind_columns]
    .groupby('num_runs')
    .agg(ind_aggregations)
    .round(2)
)
reports.append(ind_by_num_runs.to_string())
ind_by_num_runs

In [None]:
# Per-leg mean / median / count of absolute error, interval-miss flag and num_runs.
with_result[["leg", 'ind_error', 'ind_interval_error', "num_runs"]].groupby('leg').agg(["mean", "median", "count"]).round(2)

In [None]:
%%javascript
// Replace the output area's scroll heuristic with one that always answers
// "do not scroll", so long outputs are shown in full.
// NOTE(review): relies on the `IPython.OutputArea` global -- presumably only
// available in the classic Notebook front end; confirm for other front ends.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}
// To disable auto-scrolling, execute this JavaScript in a notebook cell before the other cells are executed.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Signed individual error per team, coloured by num_runs.
plt.figure(figsize=(16, 10))
plot = sns.scatterplot(x="team_id", y="ind_error_signed", hue="num_runs", data=with_result)
# Clip the y-axis to the 5th-95th percentile so outliers do not flatten the plot.
plot.axes.set_ylim(with_result.ind_error_signed.quantile(0.05), with_result.ind_error_signed.quantile(0.95))


In [None]:
# Realised leg time per team, coloured by whether the personal interval missed.
leg_times = with_result["leg-time"]
# Show from just below the fastest leg up to the 95th percentile.
leg_time_ylim = (leg_times.min() - 5, leg_times.quantile(0.95))

plt.figure(figsize=(16, 10))
plot = sns.scatterplot(
    x="team_id",
    y="leg-time",
    hue="ind_interval_error",
    alpha=0.4,
    data=with_result,
)
plot.axes.set_ylim(*leg_time_ylim)

In [None]:
# Signed individual error against realised leg time, coloured by interval miss.
signed_errors = with_result.ind_error_signed
leg_times = with_result["leg-time"]

plt.figure(figsize=(16, 10))
plot = sns.scatterplot(
    x="leg-time",
    y="ind_error_signed",
    hue="ind_interval_error",
    data=with_result,
)
# Trim both axes to percentile ranges so outliers do not dominate the view.
plot.axes.set_ylim(signed_errors.quantile(0.02), signed_errors.quantile(0.98))
plot.axes.set_xlim(leg_times.min() - 1, leg_times.quantile(0.95))

In [None]:
# Distribution of the signed individual error. The FacetGrid is created only to get
# a large figure with fixed x-limits; histplot is then called directly (not via g.map).
g = sns.FacetGrid(with_result, height=8, aspect=2, xlim=(-80,80), margin_titles=True, despine=True)
# Earlier distplot-based variant, kept for reference:
#g.map(sns.distplot, "ind_error", hist_kws={'alpha':0.8}).add_legend()
sns.histplot(with_result.ind_error_signed, bins=250, kde=True)

In [None]:
# Clamp num_runs into [1, shared.num_pace_years + 1]; the capped value is used as
# the hue in the per-history density plot.
with_result["capped_num_runs"] = np.clip(with_result.num_runs, 1, shared.num_pace_years + 1)


In [None]:
# Density of the signed individual error, one curve per capped_num_runs value.
g = sns.FacetGrid(with_result, hue="capped_num_runs", height=8, aspect=2, xlim=(-80,80), margin_titles=True, despine=True)
# sns.distplot is deprecated and scheduled for removal from seaborn; with
# hist=False it only drew the KDE curve, which sns.kdeplot provides directly
# (hist_kws had no effect without a histogram).
g.map(sns.kdeplot, "ind_error_signed").add_legend()

In [None]:
# Ad-hoc inspection of one hard-coded team; yields an empty frame when that team
# name is not present in the selected race.
with_result[with_result.team == "Malungs OK Skogsmårdarna 1"]

In [None]:

# Running (expanding) sum of leg times inside each team, in the current row order.
relay_times = (
    with_result[['team_id', "leg-time"]]
    .groupby('team_id')
    .expanding()
    .sum()
    .add_prefix('cumulative-')
    .reset_index()
)

# Spot check on the first few teams.
relay_times.loc[relay_times["team_id"] <= 11, ["team_id", "cumulative-leg-time"]].tail(20)


In [None]:
# Running (expanding) mean of num_runs inside each team, in the current row order.
rolling_num_runs = (
    with_result[['team_id', "num_runs"]]
    .groupby('team_id')
    .expanding()
    .mean()
    .add_prefix('average_')
    .reset_index()
)

# Spot check on the first few teams.
rolling_num_runs.loc[rolling_num_runs["team_id"] <= 11, ["team_id", "average_num_runs"]].tail(20)


In [None]:
# Sanity check: the row count here should equal that of with_result (next cell).
relay_times.shape

In [None]:
# Sanity check: compare the row count with relay_times.shape from the previous cell.
with_result.shape

In [None]:
# Give every row a unique positional label so the per-team results computed
# below can be assigned back by index.
with_result = with_result.reset_index()

# Cumulative relay time after each leg: per-team running sum of leg times taken
# in leg order. Assignment aligns on the index, so the result no longer depends
# on with_result already being ordered by team and leg, which the earlier
# positional copy from relay_times silently assumed.
with_result["fast_relay_time"] = (
    with_result.sort_values("leg", kind="stable")
    .groupby("team_id")["leg-time"]
    .cumsum()
)

# Spot check on the first few teams.
with_result.loc[with_result["team_id"] <= 11, ['team_id', 'leg', 'leg-time', "fast_relay_time"]].tail(20)


In [None]:

# A previous row-wise implementation (DataFrame.apply that, for every row,
# re-filtered with_result to the same team and legs <= the current leg, then
# summed "leg-time" and averaged "num_runs") dominated the runtime of this
# notebook. The vectorised per-team expanding aggregates below replace it.

# Realised cumulative relay time after each leg.
with_result["real_relay_time"] = with_result["fast_relay_time"]

# Mean num_runs over the team's legs up to and including the current one,
# rounded to an integer-valued bucket. Computed in leg order and assigned by
# index, so it does not rely on with_result being pre-sorted by team and leg
# (the earlier positional copy from rolling_num_runs did).
with_result["mean_num_runs"] = (
    with_result.sort_values("leg", kind="stable")
    .groupby("team_id")["num_runs"]
    .expanding()
    .mean()
    .droplevel(0)  # drop the team_id level, keep the original row label
    .round(0)
)


In [None]:
# Absolute error of the cumulative relay forecast, exp(fin_sum_log_mean), against
# the realised cumulative relay time, plus the same error relative to that time.
relay_time_estimate = np.exp(with_result["fin_sum_log_mean"])
realised_relay_time = with_result["real_relay_time"]

with_result["relay_error"] = np.abs(relay_time_estimate - realised_relay_time)
with_result["relay_error_ratio"] = with_result["relay_error"] / realised_relay_time

In [None]:
# Mean absolute relay error, followed by the mean relative relay error.
display(
    np.mean(with_result["relay_error"]),
    np.mean(with_result["relay_error_ratio"]),
)

In [None]:
# Add mean and median absolute relay error to both report formats.
relay_errors = with_result["relay_error"]
mean_relay_error = np.mean(relay_errors)
median_relay_error = np.median(relay_errors)

reports.append(f'Viestiennusteen keskivirhe: {mean_relay_error.round(1)} minuuttia')
reports.append(f'Viestiennusteen virheen mediaani: {median_relay_error.round(1)} minuuttia')

json_reports["average_error_of_relay_prediction_minutes"] = {
    "value": mean_relay_error,
    "desc": "Viestiennusteen keskivirhe",
}
json_reports["median_relay_prediction_error_minutes"] = {
    "value": median_relay_error,
    "desc": "Viestiennusteen virheen mediaani",
}

display(reports)

In [None]:
# Per-leg mean / median / count of the absolute relay error, added to the text report.
relay_error_by_leg = (
    with_result[["leg", 'relay_error']]
    .groupby('leg')
    .agg(["mean", "median", "count"])
    .round(1)
)
reports.append(relay_error_by_leg.to_string())
display(reports)

In [None]:
# Absolute relay error per team, coloured by leg.
relay_error_cap = with_result.relay_error.quantile(0.95)  # hide the top 5 % outliers

plt.figure(figsize=(16, 10))
plot = sns.scatterplot(
    x="team_id",
    y="relay_error",
    hue="leg",
    data=with_result,
)
plot.axes.set_ylim(0, relay_error_cap)


In [None]:
# Absolute relay error per team, coloured by the team's running mean of num_runs.
relay_error_cap = with_result.relay_error.quantile(0.95)  # hide the top 5 % outliers

plt.figure(figsize=(16, 10))
plot = sns.scatterplot(
    x="team_id",
    y="relay_error",
    hue="mean_num_runs",
    data=with_result,
)
plot.axes.set_ylim(0, relay_error_cap)

In [None]:
# Overall distribution of the absolute relay error. The FacetGrid only provides a
# large figure with fixed x-limits; the histogram is drawn directly on it.
g = sns.FacetGrid(with_result, hue="leg", height=8, aspect=2, xlim=(0,180), margin_titles=True, despine=True)
# sns.distplot is deprecated and scheduled for removal from seaborn. histplot with
# stat="density" keeps the density-normalised histogram + KDE that distplot drew,
# and matches the histplot call already used for the individual-error histogram.
sns.histplot(with_result.relay_error, bins=250, kde=True, stat="density")

In [None]:
# Density of the absolute relay error, one curve per leg.
g = sns.FacetGrid(with_result, hue="leg", height=8, aspect=2, xlim=(0,180), margin_titles=True, despine=True)
# sns.distplot is deprecated and scheduled for removal from seaborn; with
# hist=False it only drew the KDE curve, which sns.kdeplot provides directly
# (hist_kws had no effect without a histogram).
g.map(sns.kdeplot, "relay_error").add_legend()

In [None]:
# Turn the realised cumulative relay time (minutes) into a wall-clock timestamp
# by adding it to the race start time.
race_start = shared.start_timestamp[ve_or_ju][year]
with_result["real_finish_time"] = pd.to_timedelta(with_result["real_relay_time"], unit="min") + race_start

# Interval misses of the relay forecast: the finish fell before or after the
# forecast [fin_time_start95, fin_time_end95] window.
finished_early = with_result["real_finish_time"] < with_result["fin_time_start95"]
finished_late = with_result["real_finish_time"] > with_result["fin_time_end95"]

with_result["real_interval_error"] = finished_early | finished_late
with_result["real_interval_error_fast"] = finished_early
with_result["real_interval_error_slow"] = finished_late

# Categorical label used as plot hue; "slow" is applied last, so it wins if both flags are set.
with_result["real_interval_error_type"] = "No error"
with_result.loc[with_result["real_interval_error_fast"], "real_interval_error_type"] = "Fast runner"
with_result.loc[with_result["real_interval_error_slow"], "real_interval_error_type"] = "Slow runner"

# Text report lines (percentages).
reports.append(f'Viestin aikaväliennuste väärin: {np.mean(with_result["real_interval_error"] * 100).round(1)} %')
reports.append(f'Juoksija nopeampi kuin viestin aikaväliennuste: {np.mean(with_result["real_interval_error_fast"] * 100).round(1)} %')
reports.append(f'Juoksija hitaampi kuin viestin aikaväliennuste: {np.mean(with_result["real_interval_error_slow"] * 100).round(1)} %')

# JSON report entries (fractions).
json_reports["relay_interval_prediction_wrong"] = {
    "value": np.mean(with_result["real_interval_error"]),
    "desc": "Viestin aikaväliennuste väärin",
}
json_reports["runner_faster_than_relay_time_prediction"] = {
    "value": np.mean(with_result["real_interval_error_fast"]),
    "desc": "Juoksija nopeampi kuin viestin aikaväliennuste",
}
json_reports["runner_slower_than_post_interval_prediction"] = {
    "value": np.mean(with_result["real_interval_error_slow"]),
    "desc": "Juoksija hitaampi kuin viestin aikaväliennuste",
}

display(reports)

In [None]:
# Width of the relay finish-time interval, converted from a timedelta to minutes.
finish_window = with_result["fin_time_end95"] - with_result["fin_time_start95"]
with_result["real_interval"] = pd.to_timedelta(finish_window.values).total_seconds() / 60


In [None]:




# Per-leg means of relay interval width and the three interval-miss flags.
relay_interval_columns = [
    "leg",
    'real_interval',
    'real_interval_error',
    'real_interval_error_fast',
    'real_interval_error_slow',
]
relay_interval_by_leg = (
    with_result[relay_interval_columns]
    .groupby('leg')
    .agg(["mean"])
    .round(3)
)
reports.append(relay_interval_by_leg.to_string())
display(reports)

In [None]:
# Same relay-interval means, grouped by the team's running mean of num_runs.
by_num_runs_columns = [
    "mean_num_runs",
    "real_interval",
    'real_interval_error',
    'real_interval_error_fast',
    'real_interval_error_slow',
]
by_num_runs = (
    with_result[by_num_runs_columns]
    .groupby('mean_num_runs')
    .agg(["mean"])
    .round(2)
)
reports.append(by_num_runs.to_string())
by_num_runs

In [None]:
# Hand both report formats to the shared writer helpers; the file names carry the race id.
shared.write_simple_text_report(reports, f'post_race_analysis_{shared.race_id_str()}.txt')
shared.write_json_report(json_reports, f'post_race_analysis_{shared.race_id_str()}.json')

In [None]:
# Random spot check of ten scored rows (unseeded, so the rows differ between runs).
with_result.sample(10)

In [None]:
# Realised cumulative relay time per team; colour shows the interval outcome
# ("No error" / "Fast runner" / "Slow runner"), marker style shows the leg.
relay_time = with_result.real_relay_time
# From just below the fastest time up to the 99.5th percentile.
relay_time_ylim = (relay_time.min() - 5, relay_time.quantile(0.995))

plt.figure(figsize=(20, 15))
plot = sns.scatterplot(
    x="team_id",
    y="real_relay_time",
    hue="real_interval_error_type",
    palette="bright",
    style="leg",
    alpha=0.5,
    data=with_result,
)
plot.axes.set_ylim(*relay_time_ylim)

In [None]:
# One facet row per leg: fin_sum_log_std against team_id with a second-order
# regression fit, split by whether the relay interval missed.
g = sns.FacetGrid(with_result, row="leg", hue="real_interval_error", xlim=(0,with_result.team_id.max()), height=6, aspect=1, legend_out=False)
g.map(sns.regplot, "team_id", "fin_sum_log_std", scatter_kws={'alpha':0.1}, order=2).add_legend()


In [None]:
# Log the total notebook runtime in minutes, measured from startTime set in the first cell.
endTime = time.time()
shared.log_df(f"{shared.race_id_str()} runtime {round(((endTime - startTime)/ 60), 2)} mins")