In [None]:
import pandas as pd
import numpy as np
import logging
import shared
import os
import xmltodict

# Select the race to analyse. The variable is set before shared.race_type() is
# called; presumably that helper reads RACE_TYPE from the environment — verify
# in shared.py.
os.environ['RACE_TYPE'] = "ju"
race_type = shared.race_type()
#year = shared.forecast_year()
import time
# Wall-clock timestamp taken when this cell runs; none of the visible cells read it.
startTime = time.time()

In [None]:
# Results export for the selected race type (the year 2022 is fixed in the name).
results_path = f'data/results_j2022_{race_type}.xml'
#results_path = 'small-res.xml'  # small sample file for quick experiments
with open(results_path, 'r', encoding='utf-8') as file:
    my_xml = file.read()

# Parse the whole document into nested dict/list structures.
my_dict = xmltodict.parse(my_xml)

In [None]:
# Size of the parsed team collection under event -> class -> team.
len(my_dict['event']['class']['team'])

In [None]:
# First look at the team records flattened to one row per team.
pd.json_normalize(my_dict['event']['class']['team'])

In [None]:
teams_list = my_dict['event']['class']['team']
display(len(teams_list))

# Team-level fields copied onto every leg row; they end up prefixed with "team.".
team_meta_fields = ["teamid", "teamname", "teamnro", "result", "tsecs", "placement"]

# One row per leg: the records come from each team's "leg" entries.
teams_df = pd.json_normalize(
    teams_list,
    record_path=["leg"],
    meta=team_meta_fields,
    meta_prefix="team.",
    errors="ignore",
)
teams_df.head()

In [None]:
# Legs whose `control` field is missing entirely.
#no_control_legs = teams_df[teams_df.control.isna()][["team.teamid","legnro"]].to_dict(orient="records")
teams_df[teams_df.control.isna()]

In [None]:
# Legs that have a leg time (`tsecs`) recorded but no control data.
teams_df[(teams_df.control.isna()) & (teams_df.tsecs.notna())]

In [None]:
def _clean_team(team):
    clean_legs = [leg for leg in team["leg"] if "control" in leg and isinstance(leg["control"], list)]
    team["leg"] = clean_legs
    return team

# Filter the legs of every team; the list keeps one entry per input team.
clean_team_list = [_clean_team(team) for team in teams_list]

len(clean_team_list)

In [None]:
# One row per control: records come from team -> leg -> control.
# Team-level and leg-level fields are repeated on each row with a "team."
# prefix (leg fields become "team.leg.<name>"); control fields get "ctrl.".
team_meta_fields = ["teamid", "teamname", "teamnro", "result", "tsecs", "placement"]
leg_meta_fields = [
    ['leg', leg_field]
    for leg_field in ('legnro', 'nm', 'crs', 'emit', 'result', 'tsecs')
]

controls_df = pd.json_normalize(
    clean_team_list,
    record_path=["leg", "control"],
    meta=team_meta_fields + leg_meta_fields,
    meta_prefix="team.",
    record_prefix="ctrl.",
    errors="ignore",
)
controls_df

In [None]:
# Column dtypes and non-null counts of the flattened control table.
controls_df.info()

In [None]:
# Summary statistics of the flattened control table.
controls_df.describe()

In [None]:
# Five random rows; no random_state is given, so the rows differ between runs.
controls_df.sample(5)

In [None]:
# Spot check: first 60 control rows of team id "409" (ids are compared as strings here).
controls_df[controls_df["team.teamid"] == "409"].head(60)

In [None]:
# Rows whose `ctrl.cd` value is the literal "-".
controls_df[controls_df["ctrl.cd"] == "-"]

In [None]:
# Dtypes and non-null counts of the raw control columns that get renamed later
# (cn, cc, cl, ct, cd).
controls_df[["ctrl.cn", "ctrl.cc", "ctrl.cl", "ctrl.ct", "ctrl.cd"]].info()
#controls_df[controls_df["team.leg.nm"] == "Antti Parjanne"]

In [None]:
from datetime import datetime, timedelta
#from datetime import time as dt_time


def parse_time_with_hours(time_str):
    """Convert an ``H:M:S`` string into a datetime offset from 1900-01-01.

    The hour field may be 24 or larger; the excess rolls over into the date,
    so ``"25:00:00"`` becomes 1900-01-02 01:00:00.

    Args:
        time_str: Text with exactly three colon-separated integer fields.

    Returns:
        ``datetime(1900, 1, 1)`` advanced by the given hours, minutes and seconds.

    Raises:
        ValueError: If the text does not split into three integer fields.
    """
    fields = time_str.split(':')
    hours, minutes, seconds = (int(field) for field in fields)

    # timedelta normalises hours >= 24 into days on its own.
    elapsed = timedelta(hours=hours, minutes=minutes, seconds=seconds)
    return datetime(1900, 1, 1) + elapsed

def parse_time(control_time_text):
    """Parse a control time given as ``S``, ``M:S`` or ``H:M:S`` text.

    Args:
        control_time_text: Time text, or a missing value (None/NaN).

    Returns:
        A datetime on or after 1900-01-01 (the default date used by
        ``strptime``), or the input unchanged when it is missing.

    Raises:
        ValueError: If none of the three layouts matches; the offending text
            is printed before the error is re-raised.
    """
    if pd.isna(control_time_text):
        return control_time_text

    # Try the short layouts first, in the same order as before.
    for layout in ('%S', '%M:%S'):
        try:
            return datetime.strptime(control_time_text, layout)
        except ValueError:
            continue

    # Last resort: hours may exceed 23, which strptime cannot express.
    try:
        return parse_time_with_hours(control_time_text)
    except ValueError:
        print("Cannot parse " + control_time_text)
        raise

# Keep only controls that belong to a leg with a runner name.
has_runner_name = controls_df["team.leg.nm"].notna()
clean_controls_df = controls_df[has_runner_name]

# Shorten the json_normalize prefixes: "team.leg.legnro" -> "leg.nro",
# "team.teamid" -> "team.id", and so on.
clean_controls_df.columns = [
    col.replace("team.leg", "leg").replace("team.team", "team.").replace("leg.leg", "leg.")
    for col in clean_controls_df.columns
]

# Give the terse XML field names readable column names.
readable_names = {
    'leg.crs': 'leg.hajonta',
    'leg.nm': 'leg.runnername',
    'ctrl.cn': 'ctrl.num',
    'ctrl.cc': 'ctrl.code',
    'ctrl.cl': 'ctrl.distance',
    'ctrl.ct': 'ctrl.time',
    'ctrl.cd': 'ctrl.duration',
}
clean_controls_df = clean_controls_df.rename(columns=readable_names)

# Control time as seconds: parse the text to a datetime on the 1900-01-01
# epoch and subtract that epoch.
epoc_date = pd.to_datetime('1900-01-01')
parsed_times = clean_controls_df["ctrl.time"].apply(parse_time)
time_secs = (parsed_times - epoc_date).dt.total_seconds()
clean_controls_df.insert(5, "ctrl.time_secs", time_secs)

# ctrl.duration and team.result are left as text; earlier attempts to parse
# them were disabled.

number_cols = [
    'ctrl.num', 'ctrl.code', 'ctrl.distance', 'ctrl.time_secs',
    'team.id', 'team.nro', 'team.tsecs', 'team.placement',
    'leg.nro', 'leg.tsecs',
]
clean_controls_df[number_cols] = clean_controls_df[number_cols].apply(pd.to_numeric)
clean_controls_df = clean_controls_df.sort_values(["team.id", "leg.nro", "ctrl.num"])

# Drop controls timed at or beyond 80 % of the longest leg time in the data;
# rows with a missing control time fail the comparison and are dropped too.
max_cleg_time = 0.8 * clean_controls_df["leg.tsecs"].max()
within_time_limit = clean_controls_df["ctrl.time_secs"] < max_cleg_time
clean_controls_df = clean_controls_df[within_time_limit]
clean_controls_df.head()

In [None]:
# Spot check: cleaned control rows of three hard-coded runners.
clean_controls_df[clean_controls_df["leg.runnername"].isin(["Anu Kovanen", "Antti Jokinen", "Joose Nurmela"])]
#clean_controls_df.sort_values("ctrl.time_secs").tail(20)
#clean_controls_df[clean_controls_df["leg.tsecs"].notna()].sort_values("leg.tsecs").tail(20)
#clean_controls_df["leg.tsecs"].max()


In [None]:
# Dtypes after the numeric conversion, and non-null counts after filtering.
clean_controls_df.info()

In [None]:
def add_cleg_id(single):
    """Add control-leg columns to the control rows of one runner's leg.

    A control leg ("cleg") is the stretch from the previous control to the
    current one. The rows are expected to be in punching order.

    Args:
        single: DataFrame with at least ``ctrl.code`` and ``ctrl.time_secs``
            and at least six columns in total (the fixed insert positions
            require that).

    Returns:
        A new DataFrame with four extra columns: ``ctrl.cleg_id``
        ("<previous code>-<code>"), ``ctrl.previous_code``,
        ``ctrl.previous_time_secs`` and ``ctrl.cleg_duration_secs``. The first
        row uses 0 as previous code and previous time. The input frame is not
        modified.
    """
    # groupby.apply does not support functions that mutate the group they
    # receive, so all inserts go into a private copy.
    single = single.copy()

    previous_code = single["ctrl.code"].shift(1, fill_value=0)
    previous_time_secs = single["ctrl.time_secs"].shift(1, fill_value=0)

    # Insert order and positions are kept as-is so the column layout seen by
    # later cells does not change.
    single.insert(1, "ctrl.previous_code", previous_code)
    single.insert(6, "ctrl.previous_time_secs", previous_time_secs)
    cleg_ids = single["ctrl.previous_code"].astype(str) + "-" + single["ctrl.code"].astype(str)
    single.insert(1, "ctrl.cleg_id", cleg_ids)
    cleg_durations = single["ctrl.time_secs"] - single["ctrl.previous_time_secs"]
    single.insert(9, "ctrl.cleg_duration_secs", cleg_durations)
    return single
# Compute control-leg ids and durations separately for every (team, leg), so
# that the "previous control" never comes from another runner.
df2 = clean_controls_df.groupby(["team.id", "leg.nro"]).apply(add_cleg_id).reset_index(drop=True)
df2 

In [None]:
# Spot check: control legs of one hard-coded runner.
df2[df2["leg.runnername"].str.contains("Joose Nurmela")]

In [None]:
# Per control leg: median distance, number of distinct runners and legs, and
# mean/median duration, ordered by how many runners ran it.
cleg_agg_spec = {
    "ctrl.distance": "median",
    "leg.runnername": "nunique",
    "leg.nro": "nunique",
    "ctrl.cleg_duration_secs": ["mean", "median"],
}
runner_count_col = ("leg.runnername", "nunique")
cleg_id_stats = df2.groupby("ctrl.cleg_id").agg(cleg_agg_spec).sort_values(runner_count_col)

# Show only control legs run by more than 14 distinct runners.
run_by_many = cleg_id_stats[runner_count_col] > 14
cleg_id_stats[run_by_many].round(1)


In [None]:
# One row per (team, leg) with that leg's own running time.
leg_times_df = df2[["team.id", "leg.nro", "leg.tsecs"]].drop_duplicates().reset_index(drop=True)
# We could calculate the relay time from controls but....
leg_times_df = leg_times_df[leg_times_df["leg.tsecs"].notna()]
# The cumulative sums below depend on leg order within each team, so make the
# ordering explicit instead of relying on how df2 happened to be built.
leg_times_df = leg_times_df.sort_values(["team.id", "leg.nro"]).reset_index(drop=True)

# Relay clock (seconds since the relay start) at the end and start of each leg.
# groupby().cumsum() keeps the row index, so values line up by label rather
# than by the positional re-attachment the expanding().sum() form needed.
leg_times_df["end_relay_tsecs"] = leg_times_df.groupby("team.id")["leg.tsecs"].cumsum()
leg_times_df["leg.start_relay_tsecs"] = leg_times_df.groupby("team.id")["end_relay_tsecs"].shift(1, fill_value=0)

# Venla relay: the relay-style changeover closes on Sat 18 June 2022 at 18:30.
# Teams that have not reached the changeover by then get a mass restart for
# legs 2-4 from the map racks at 18:45.

# Jukola relay: the relay-style changeover closes on Sunday 19 June 2022 at 08:45.
# The mass restart for anchors (leg 7) is at 09:00.
# The constants below are hours after the relay start converted to seconds;
# 9.75 h matching 08:45 presumably means a 23:00 start — confirm against
# shared.start_timestamp.
relay_closes_secs = 9.75 * 3600
logging.info(f"{relay_closes_secs=}")
leg_7_mass_start_secs = 10 * 3600
logging.info(f"{leg_7_mass_start_secs=}")
#leg_times_df["leg_7_runners_in_mass_start"] = (leg_times_df["leg.nro"] == 7) & (leg_times_df["leg.start_relay_tsecs"] >= relay_closes_secs)
# Leg-7 runners whose computed start is at or after the changeover closing
# time start in the mass restart instead.
leg_7_runners_in_mass_start = (leg_times_df["leg.nro"] == 7) & (leg_times_df["leg.start_relay_tsecs"] >= relay_closes_secs)
leg_times_df.loc[leg_7_runners_in_mass_start, "leg.start_relay_tsecs"] = leg_7_mass_start_secs
# Mass restarts for legs 2-6 are at 09:30.
leg_2_6_mass_start_secs = 10.5 * 3600
logging.info(f"{leg_2_6_mass_start_secs=}")

leg_2_6_runners_in_mass_start = (leg_times_df["leg.nro"] >= 2) & (leg_times_df["leg.nro"] <= 6) & (leg_times_df["leg.start_relay_tsecs"] >= relay_closes_secs)
leg_times_df.loc[leg_2_6_runners_in_mass_start, "leg.start_relay_tsecs"] = leg_2_6_mass_start_secs

# Rank of each leg among the team's remaining legs; it equals the leg number
# only while no earlier leg is missing.
leg_times_df["leg_rank"] = leg_times_df.groupby(["team.id"])["leg.nro"].rank()
# Unfortunately the legs after a gap have to be dropped, because the relay
# time cannot be calculated when a leg in between is missing.
leg_times_df = leg_times_df[leg_times_df["leg_rank"] == leg_times_df["leg.nro"]]

# Spot checks on three hard-coded teams.
display(leg_times_df[leg_times_df["team.id"] == 22])
display(leg_times_df[leg_times_df["team.id"] == 150])
display(leg_times_df[leg_times_df["team.id"] == 36])

# Drop leg.tsecs so the later merge with df2 does not duplicate that column.
leg_times_df = leg_times_df.drop(columns=["leg.tsecs", "end_relay_tsecs"])

leg_times_df

In [None]:
# Per leg number: how many teams remain and the spread of leg start offsets.
#8352.0 + 5993.0 + 14125.0
leg_times_df.groupby("leg.nro").agg({"team.id": "count", "leg.start_relay_tsecs": ["mean", "median", "min", "max"]})

In [None]:
# NOTE(review): this describe() result is discarded; a notebook cell shows
# only its last expression.
leg_times_df.describe()
# Debug lookup: legs after the first whose start offset is exactly 4723.0 s
# (hard-coded value, presumably taken from an earlier output).
leg_times_df[(leg_times_df["leg.start_relay_tsecs"] == 4723.0) & (leg_times_df["leg.nro"] > 1)]

In [None]:
# Scratch conversion: 37800 s is 10.5 h, the value of leg_2_6_mass_start_secs.
37800.000000 / 3600
#leg_times_df[leg_times_df["leg.start_relay_tsecs"] > leg_2_6_mass_start_secs]
#leg_times_df.info()

In [None]:
# Attach each leg's relay start offset to its control-leg rows. The inner join
# drops control rows of legs that are not present in leg_times_df.
relay_df = pd.merge(df2, leg_times_df, how='inner', on=['team.id', 'leg.nro'])
# Copy the filtered result so that later column assignments write to an
# independent frame instead of a slice (avoids SettingWithCopyWarning).
relay_df = relay_df[relay_df["leg.start_relay_tsecs"].notna()].copy()

In [None]:
# Put every control leg on the relay clock (seconds since the relay start):
# leg start offset plus the time within the leg. assign() builds a new frame,
# so nothing is written into the filtered slice from the previous cell, and
# later entries may use the columns created by earlier ones.
relay_df = relay_df.assign(
    cleg_start_relay_time=lambda df: df["ctrl.previous_time_secs"] + df["leg.start_relay_tsecs"],
    cleg_end_relay_time=lambda df: df["ctrl.time_secs"] + df["leg.start_relay_tsecs"],
    cleg_middle_relay_time=lambda df: df["cleg_start_relay_time"] + (df["ctrl.cleg_duration_secs"] / 2),
    # Whole minutes since the relay start, used later as a grouping key.
    cleg_middle_relay_time_minute=lambda df: (df["cleg_middle_relay_time"] / 60).astype(int),
)
relay_df.info()

In [None]:
# Summary statistics including the relay-clock columns.
relay_df.describe()

In [None]:
# Sanity check: control legs that start more than 16 hours into the relay.
relay_df[relay_df["cleg_start_relay_time"] > 16 * 3600]

In [None]:
def _add_overlaps_count(cleg_df):
    # convert start and end columns to numpy arrays
    start = cleg_df['cleg_start_relay_time'].values
    end = cleg_df['cleg_end_relay_time'].values
    # start[:, None] is a numpy indexing operation that adds a new axis to the start array.
    # In the comparison start[:, None] < end, numpy automatically broadcasts the two arrays together
    # so that they have the same shape. This is done by “stretching” the start array along its new axis 
    # to match the shape of the end array. The result is a 2D boolean array with shape (n, n) where 
    # each element indicates whether the corresponding element in the start array is less than 
    # the corresponding element in the end array.
    cleg_df['num_runners_on_cleg'] =  np.sum((start[:, None] < end) & (end[:, None] > start), axis=1) - 1
    return cleg_df

    
    
# Count simultaneous runners separately within every control leg.
overlaps_df = relay_df.groupby(["ctrl.cleg_id"]).apply(_add_overlaps_count).reset_index(drop=True)
# ctrl.distance is missing for the finish and is wrong on unplanned control legs.
# A zero or negative distance would turn the density into +/-inf, which passes
# later notna() filters and distorts the means, so treat it as missing.
valid_distance = overlaps_df['ctrl.distance'].where(overlaps_df['ctrl.distance'] > 0)
overlaps_df['num_runners_on_cleg_per_100m'] = overlaps_df['num_runners_on_cleg'] * 100 / valid_distance
overlaps_df = overlaps_df.sort_values(["team.id", "leg.nro", "ctrl.num"])
overlaps_df

In [None]:
# Dtypes and non-null counts after adding the overlap and density columns.
overlaps_df.info()

In [None]:
import seaborn as sns
# Spot check: overlap counts along one hard-coded runner's control legs.
overlaps_df[overlaps_df["leg.runnername"].str.contains("Oskari Pirttikoski")]

In [None]:
# All runs of control leg "0-119". In a cleg id the previous code 0 is the
# fill value used for the first control of a leg, so this is the stretch from
# the leg start to control code 119.
cleg2_df = overlaps_df[overlaps_df["ctrl.cleg_id"] == "0-119"].copy()
# Flag one hard-coded runner so the rows stand out in the scatter plot.
cleg2_df["debug_individual"] = cleg2_df["leg.runnername"].str.contains("Oskari Pirttikoski")
cleg2_df

In [None]:
# Runner density on the selected control leg against the relay minute,
# coloured by leg number; the flagged runner gets a different marker style.
sns.set(rc={"figure.figsize":(16, 9)}) 
sns.scatterplot(cleg2_df,x="cleg_middle_relay_time_minute", y="num_runners_on_cleg_per_100m", 
                style="debug_individual", hue="leg.nro", palette="bright")



In [None]:
#overlaps_df.nunique()
cleg_minutes = overlaps_df.groupby(["leg.nro", "cleg_middle_relay_time_minute"]).agg({"num_runners_on_cleg_per_100m": ["mean", "median", "count"], "ctrl.cleg_id": ["count", "nunique"]}).reset_index()
#cleg_minutes.columns = ['_'.join(tup).rstrip('_') for tup in cleg_minutes.columns.values]                                                                            
cleg_minutes.columns = ["_".join(col).rstrip('_') for col in cleg_minutes.columns.to_flat_index()]
cleg_minutes["cleg_middle_time"] = shared.start_timestamp[race_type][2022] + pd.to_timedelta(cleg_minutes["cleg_middle_relay_time_minute"], unit="minutes")
cleg_minutes



In [None]:
# mdates is only used by the disabled axis-formatting lines below.
import matplotlib.dates as mdates
# Mean density over time, with timestamps rounded to 10-minute bins, one line per leg.
ax = sns.lineplot(cleg_minutes,x=cleg_minutes["cleg_middle_time"].dt.round(freq='10min'), y="num_runners_on_cleg_per_100m_mean", hue="leg.nro", palette="bright")
# Show only the times on the x-axis
# Set the date format and tick frequency
#date_format = '%H:%M'  # Show only the time (hours and minutes)
#tick_frequency = 2  # Show a tick every 2 hours
#ax.xaxis.set_major_formatter(mdates.DateFormatter(date_format))
#ax.xaxis.set_major_locator(mdates.HourLocator(interval=tick_frequency))


In [None]:
# Column reference for the per-leg aggregation in the next cell.
overlaps_df.info()

In [None]:

# Per-runner summary: one row per (team, leg) with density statistics over the
# control legs that have a density value.
leg_key_cols = [
    "team.id", "team.name", "leg.nro", "leg.runnername",
    "leg.tsecs", "leg.start_relay_tsecs", "leg.result", "team.tsecs",
]
leg_agg_spec = {
    "num_runners_on_cleg_per_100m": ["mean", "median", "count"],
    "ctrl.cleg_id": ["count", "nunique"],
}
rows_with_density = overlaps_df[overlaps_df["num_runners_on_cleg_per_100m"].notna()]
legs_df = rows_with_density.groupby(leg_key_cols).agg(leg_agg_spec).reset_index()

# Flatten the two-level column index: ("col", "mean") -> "col_mean"; key
# columns have an empty second level and keep their plain name.
legs_df.columns = ["_".join(col).rstrip('_') for col in legs_df.columns.to_flat_index()]

# Leg start and end as timestamps, based on the 2022 start time from shared.
race_start_timestamp = shared.start_timestamp[race_type][2022]
leg_start_ts = race_start_timestamp + pd.to_timedelta(legs_df["leg.start_relay_tsecs"], unit="seconds")
legs_df.insert(5, "leg_start_ts", leg_start_ts)
leg_end_ts = legs_df["leg_start_ts"] + pd.to_timedelta(legs_df["leg.tsecs"], unit="seconds")
legs_df.insert(6, "leg_end_ts", leg_end_ts)
legs_df


In [None]:
# Legs whose mean density exceeds 40 runners per 100 m.
selected_legs = legs_df[legs_df["num_runners_on_cleg_per_100m_mean"] > 40]
# Show all legs of one randomly picked team from that selection. sample(1)
# raises ValueError on an empty selection, so ask for 0 rows in that case and
# display an empty frame instead.
sampled_team_ids = selected_legs["team.id"].sample(min(1, len(selected_legs)))
legs_df[legs_df["team.id"].isin(sampled_team_ids)]

In [None]:
# Legs without a leg time. NOTE(review): leg.tsecs is one of the groupby keys
# used to build legs_df, and groupby drops NaN keys by default, so this is
# expected to be empty — confirm.
legs_df[legs_df["leg.tsecs"].isna()]

In [None]:
# Mean density per runner against the leg finish time (10-minute bins), one line per leg.
ax = sns.lineplot(legs_df,x=legs_df["leg_end_ts"].dt.round(freq='10min'), y="num_runners_on_cleg_per_100m_mean", hue="leg.nro", palette="bright")


In [None]:
# Export the per-leg summary; the file name carries the id from shared.race_id_str().
# The row index is written as the first column because index is not disabled.
legs_df.to_csv(f"data/control_legs_congestion-{ shared.race_id_str()}.csv")

In [None]:
# Mean density per runner against the team's total time, one line per leg.
ax = sns.lineplot(legs_df,x=legs_df["team.tsecs"], y="num_runners_on_cleg_per_100m_mean", hue="leg.nro", palette="bright")
