In [None]:
# Notebook setup: imports first, then the run parameters.
import logging
import time

import numpy as np
import pandas as pd

import shared

# Race type and forecast year are provided by the project-local `shared` module.
race_type = shared.race_type()
year = shared.forecast_year()

# Wall-clock timestamp taken when the notebook starts.
startTime = time.time()

In [None]:
# Load the tab-separated ideal-pace table for the selected race type.
cleaned = pd.read_csv(
    f'Jukola-terrain/ideal-paces-{race_type}.tsv',
    sep='\t',
)

In [None]:
# Preview: newest years first, legs ascending within each year.
(
    cleaned
    .sort_values(by=["year", "leg"], ascending=[False, True])
    .head(10)
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the default figure size (width, height in inches) via matplotlib rc params.
sns.set(rc={"figure.figsize": (16, 9)})

# Ideal pace against leg distance, one colour per year.
plot = sns.scatterplot(
    data=cleaned,
    x="leg_distance",
    y="ideal_pace",
    hue="year",
    palette="bright",
)



In [None]:

# Distribution of ideal pace per year.
ax = sns.boxplot(
    data=cleaned,
    x="year",
    y="ideal_pace",
)

In [None]:

# Distribution of ideal pace per leg.
ax = sns.boxplot(
    data=cleaned,
    x="leg",
    y="ideal_pace",
)

In [None]:
# Distribution of the `marking` column per year.
ax = sns.boxplot(
    data=cleaned,
    x="year",
    y="marking",
)

In [None]:
# Vertical metres per km by year, restricted to rows with year >= 2015.
ax = sns.boxplot(
    data=cleaned.loc[cleaned["year"].ge(2015)],
    x="year",
    y="vertical_per_km",
)

In [None]:
# Ideal pace against vertical_per_km for rows with year >= 2015, coloured by year.
plot = sns.scatterplot(
    data=cleaned.loc[cleaned["year"].ge(2015)],
    x="vertical_per_km",
    y="ideal_pace",
    hue="year",
    palette="bright",
)


In [None]:
# Display the loaded ideal-pace table.
cleaned

In [None]:
# Mean ideal pace per leg, indexed by leg.
leg_means = cleaned.groupby("leg")[["ideal_pace"]].mean()
leg_means

In [None]:
# Terrain coefficient per year, coloured by leg.
ax = sns.scatterplot(
    data=cleaned,
    x="year",
    y="terrain_coefficient",
    hue="leg",
)

In [None]:
# Terrain coefficient per leg, coloured by year.
ax = sns.scatterplot(
    data=cleaned,
    x="leg",
    y="terrain_coefficient",
    hue="year",
)

In [None]:
# Load the per-run table for this race and keep only rows with num_runs > 1.
runs = (
    pd.read_csv(f'data/runs_{shared.race_id_str()}.tsv', sep='\t')
    .query("num_runs > 1")
)

In [None]:
# Mean pace per runner name, indexed by name.
runner_means = runs.groupby("name")[["pace"]].mean()
runner_means


In [None]:
# Attach each runner's mean pace to every one of their runs. `transform`
# returns a Series aligned to the index of `runs`, so no label lookup into a
# separate means table is needed, and a row with a missing name gets NaN
# instead of raising KeyError.
runs["pace_mean"] = runs.groupby("name")["pace"].transform("mean")

# Ratio of this run's pace to the runner's own mean pace.
runs["personal_coefficient"] = runs["pace"] / runs["pace_mean"]

runs.head()

In [None]:
# Attach the terrain columns of the ideal-pace table to every run, matched on (year, leg).
runs = runs.merge(
    cleaned[["year", "leg", "terrain_coefficient", "marking", "vertical", "vertical_per_km"]],
    how="left",
    on=["year", "leg"],
)
# Personal coefficient minus terrain coefficient; the column name (spelling
# included) is referenced again by a later cell.
runs["personal_terain_diff"] = runs["personal_coefficient"].sub(runs["terrain_coefficient"])
runs.round(decimals=4)

In [None]:
# Regression of personal coefficient on terrain coefficient over all runs.
ax = sns.lmplot(
    data=runs,
    x="terrain_coefficient",
    y="personal_coefficient",
    height=8,
    aspect=1.7,
    ci=50,
    scatter_kws=dict(alpha=0.07),
    x_jitter=0.0004,
)
ax.set(ylim=(0.6, 1.6))

In [None]:
# Residuals of the same regression, with a lowess smoother overlaid.
ax = sns.residplot(
    data=runs,
    x="terrain_coefficient",
    y="personal_coefficient",
    lowess=True,
    scatter_kws=dict(alpha=0.07),
)

In [None]:
# Same regression with a separate fit per leg.
ax = sns.lmplot(
    data=runs,
    x="terrain_coefficient",
    y="personal_coefficient",
    hue="leg",
    height=8,
    aspect=1.7,
    ci=50,
    scatter_kws=dict(alpha=0.07),
    x_jitter=0.0003,
)
ax.set(ylim=(0.6, 1.6))

In [None]:
# Same regression with a separate fit per year.
ax = sns.lmplot(
    data=runs,
    x="terrain_coefficient",
    y="personal_coefficient",
    hue="year",
    height=8,
    aspect=1.7,
    ci=50,
    scatter_kws=dict(alpha=0.07),
    x_jitter=0.0003,
)
ax.set(ylim=(0.6, 1.6))

In [None]:
# Regression of personal coefficient on the `marking` column.
ax = sns.lmplot(
    data=runs,
    x="marking",
    y="personal_coefficient",
    height=8,
    aspect=1.7,
    ci=50,
    scatter_kws=dict(alpha=0.07),
    x_jitter=3,
)
ax.set(ylim=(0.6, 1.6))

In [None]:
# Regression of personal coefficient on the `vertical` column.
ax = sns.lmplot(
    data=runs,
    x="vertical",
    y="personal_coefficient",
    height=8,
    aspect=1.7,
    ci=50,
    scatter_kws=dict(alpha=0.07),
    x_jitter=1,
)
ax.set(ylim=(0.6, 1.6))

In [None]:
# Regression of personal coefficient on the `vertical_per_km` column.
ax = sns.lmplot(
    data=runs,
    x="vertical_per_km",
    y="personal_coefficient",
    height=8,
    aspect=1.7,
    ci=50,
    scatter_kws=dict(alpha=0.07),
    x_jitter=0.1,
)
ax.set(ylim=(0.6, 1.6))

In [None]:
# Personal-minus-terrain difference per year, after trimming extreme
# differences and keeping rows with num_runs < 10.
ax = sns.boxplot(
    data=runs.query("personal_terain_diff < 0.4 & personal_terain_diff > -0.3 & num_runs < 10"),
    x="year",
    y="personal_terain_diff",
)


In [None]:
# One regression panel per leg, stacked as rows.
ax = sns.lmplot(
    data=runs,
    x="terrain_coefficient",
    y="personal_coefficient",
    row="leg",
)

In [None]:
# One regression panel per year, stacked as rows.
ax = sns.lmplot(
    data=runs,
    x="terrain_coefficient",
    y="personal_coefficient",
    row="year",
)

In [None]:
# Natural log of team_id, used as the x variable of the regression plot below.
runs["log_team_id"] = np.log(runs["team_id"])
ax = sns.lmplot(
    data=runs,
    x="log_team_id",
    y="personal_coefficient",
)

In [None]:
# Example runner to inspect: one name for the "ju" race type, another otherwise.
runner_name = "topi anjala" if race_type == "ju" else "nelly carlsson"
runner = runs.loc[runs["name"] == runner_name]
ax = sns.lmplot(
    data=runner,
    x="terrain_coefficient",
    y="personal_coefficient",
)

In [None]:
# Show the selected runner's rows rounded to three decimals.
runner.round(decimals=3)

In [None]:
# Terrain coefficients of the selected runner's rows as a plain NumPy array.
runner["terrain_coefficient"].to_numpy()

In [None]:
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import LinearRegression


In [None]:
import json

# Pooled linear fit over all runs: personal_coefficient ~ terrain_coefficient.
terrain_coefficients = runs["terrain_coefficient"]
personal_coefficients = runs["personal_coefficient"]
# scikit-learn expects 2-D inputs: one row per run, a single column.
X = np.array(terrain_coefficients).reshape(-1, 1)
y = np.array(personal_coefficients).reshape(-1, 1)
lr = LinearRegression()
lr.fit(X, y)

defaults = dict(
    default_intercept=lr.intercept_[0],
    default_coef=lr.coef_[0][0],
    score=lr.score(X, y),
)
logging.info(f"defaults: {defaults}")

# Write the fitted defaults to a per-race JSON file.
with open(f"data/default_personal_coefficients_{shared.race_id_str()}.json", 'w') as fp:
    json.dump(defaults, fp)

In [None]:
# Two-feature design matrix: terrain coefficient and marking.
X2 = runs[["terrain_coefficient", "marking"]].to_numpy()
X2.shape

In [None]:
# Refit with both features and show the fitted coefficients.
lr = LinearRegression()
lr.fit(X2, y)
lr.coef_[0]

In [None]:
# Alternative subsets, kept for reference:
#subset = runs[runs["num_runs"] > 2]
#subset = runs[runs["team"] == "REAKTOR INNOVATIONS"]
subset = runs

# One row per runner name, holding the list of that runner's terrain coefficients.
by_name = (
    subset.groupby("name")["terrain_coefficient"]
    .apply(list)
    .rename("terrain_coefficients")
    .reset_index()
)
# Same grouping for the personal coefficients; both frames come out in the
# same sorted-by-name order with a fresh 0..n-1 index, so the column can be
# copied across directly.
personal = (
    subset.groupby("name")["personal_coefficient"]
    .apply(list)
    .rename("personal_coefficients")
    .reset_index()
)
by_name["personal_coefficients"] = personal["personal_coefficients"]
by_name["num_runs"] = by_name["terrain_coefficients"].map(len)

# Keep only runners with more than two runs.
by_name = by_name.loc[by_name["num_runs"] > 2]
by_name

In [None]:
def calculate_coef(row):
    """Fit a line of personal coefficient against terrain coefficient for one row.

    Parameters
    ----------
    row : mapping
        Must provide ``"terrain_coefficients"`` and ``"personal_coefficients"``,
        two sequences of equal length (one entry per run).

    Returns
    -------
    list
        ``[slope, intercept, score]`` where ``score`` is
        ``LinearRegression.score`` evaluated on the same points used for the fit.
    """
    # scikit-learn expects 2-D inputs: one row per run, a single column.
    X = np.asarray(row["terrain_coefficients"]).reshape(-1, 1)
    y = np.asarray(row["personal_coefficients"]).reshape(-1, 1)
    lr = LinearRegression().fit(X, y)
    return [lr.coef_[0][0], lr.intercept_[0], lr.score(X, y)]

# Expand the [coef, intercept, score] list returned for each row into three columns.
by_name[["coef", "intercept", "score"]] = by_name.apply(
    calculate_coef,
    axis="columns",
    result_type="expand",
)
by_name.sample(n=10).round(decimals=4)

In [None]:
# Flag fits whose slope or score is not positive, then show the flagged share.
by_name["bad_prediction"] = by_name["coef"].le(0) | by_name["score"].le(0)
by_name["bad_prediction"].mean()

In [None]:
# Mean and count of the bad-prediction flag and of the score, per number of runs.
(
    by_name
    .groupby("num_runs")[["bad_prediction", "score"]]
    .agg(["mean", "count"])
)

In [None]:
# Slope distribution per number of runs, with extreme slopes trimmed.
ax = sns.boxplot(
    data=by_name.query("coef < 6 & coef > -6 & num_runs <= 9 "),
    x="num_runs",
    y="coef",
)

In [None]:
# Histogram of scores for fits with a positive slope and score above 0.01.
ax = sns.histplot(
    data=by_name.query("coef > 0 & score > 0.01"),
    x="score",
    bins=100,
)

In [None]:
# Fits with a positive slope and score above 0.99, most runs first.
(
    by_name
    .query("coef > 0 & score > 0.99")
    .sort_values("num_runs", ascending=False)
)

In [None]:
# Slope against score, coloured by number of runs, with extreme slopes trimmed.
ax = sns.scatterplot(
    data=by_name.query("coef < 4 & coef > -4 & num_runs < 10"),
    x="score",
    y="coef",
    hue="num_runs",
)

In [None]:
def percentile(n):
    """Return a callable that computes the ``n``-th percentile of its argument.

    The returned function is named ``p_<n>`` so it can be told apart from
    other aggregations built by this factory.
    """
    def nth_percentile(values):
        return np.percentile(values, n)

    nth_percentile.__name__ = 'p_%s' % n
    return nth_percentile

# Pace statistics per (year, leg). Selecting with a list keeps the result
# columns two-level: ("pace", <statistic>).
leg_stats = (
    runs
    .groupby(["year", "leg"])[["pace"]]
    .agg(["min", percentile(1), percentile(5), "median", "mean", "std", "count"])
)

In [None]:
# Flatten the two-level column index into single strings by concatenating
# both levels. Guarded by the MultiIndex check so that re-running this cell
# on already-flattened columns does not add a stray "index" column or
# truncate the names to their first two characters.
if isinstance(leg_stats.columns, pd.MultiIndex):
    leg_stats = leg_stats.reset_index()
    leg_stats.columns = [f"{pair[0]}{pair[1]}" for pair in leg_stats.columns]
leg_stats

In [None]:
# Ideal-pace table with the per-(year, leg) pace statistics attached.
with_runs = cleaned.merge(
    leg_stats,
    how="left",
    on=["year", "leg"],
)
with_runs

In [None]:
# Attach the ideal-pace table to every run, matched on (year, leg).
# `runs` may already hold some columns of `cleaned` (the earlier merge added
# terrain_coefficient, marking, vertical and vertical_per_km). Merging them
# again would yield duplicated `_x`/`_y` columns, so only the keys and the
# columns that `runs` lacks are taken from `cleaned`.
with_ideal = pd.merge(
    runs,
    cleaned[[col for col in cleaned.columns if col in ("year", "leg") or col not in runs.columns]],
    how="left",
    on=["year", "leg"],
)
# Ratio of the actual pace to the ideal pace of the same (year, leg).
with_ideal["ideal_ratio"] = with_ideal["pace"] / with_ideal["ideal_pace"]
with_ideal

In [None]:
# Regression of actual pace on ideal pace. The bare `with_ideal.columns`
# expression that preceded this call was removed: it was not the last
# expression of the cell, so it displayed nothing and had no effect.
ax = sns.lmplot(
    data=with_ideal,
    x="ideal_pace",
    y="pace",
)

In [None]:
# Histogram of personal coefficients in a grid: one row per year, one column per leg.
g = sns.FacetGrid(data=with_ideal, row="year", col="leg")
g.map(sns.histplot, "personal_coefficient", alpha=.7)
g.add_legend()
# Same axis limits on every panel.
g.set(xlim=(0.7, 1.35), ylim=(0, 180))
# NOTE(review): x=2.8 lies outside the xlim set above and plt.axvline only
# targets the current axes, so this line is presumably not visible — confirm intent.
plt.axvline(x=2.8, ymin=0, ymax=0.17)

In [None]:
# NOTE(review): this rebinds `year`, which the first cell set from
# shared.forecast_year(); confirm the override is intentional.
year, ve_or_ju = 2022, "ve"
