# Athletics 100m next race time predictor

## Imports

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4.element import ResultSet
import lxml
import json
from typing import Tuple, List, Dict, Any
import pickle
from scipy.optimize import minimize

# Data Acquisition

## Scraper

Here we use the requests lib and BeautifulSoup with a html parser to extract the athletes profile id for later use.

In [3]:
def extract_hidden_table_rows(URL: str) -> ResultSet:
    """
    Returns all html table rows that contain the class name 'table-row--hover' as its a clickable element not initially displayed.
    This is to extract an athletes profile url.

    Params:
        URL: World athletics Men's 100m ranking page.

    Returns:
        results: All table rows containing data-athlete-url.
    """
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find_all("tr", class_="table-row--hover")
    return results

def extract_data_athlete_urls(results: ResultSet) -> List[str]:
    """
    Returns a list of all the extracted data-athlete-urls.

    Params:
        results: The parsed html results.

    Returns:
        data_athlete_urls: A list of data-athlete-urls.
    """
    data_athlete_urls = []
    for result in results:
        attrs = result.attrs
        data_athlete_url = attrs['data-athlete-url']
        data_athlete_urls.append(data_athlete_url)
    return data_athlete_urls

def extract_athlete_name_and_id(data_athlete_url: str) -> Tuple[int, str]:
    """
    Returns the athlete name and unique identifiction.

    Params:
        data_athlete_url: The athletes profile page ~ /athletes/united-states/trayvon-bromell-14519911.
    
    Returns:
        athlete_id, athlete_name: Athlete's name, id ~ 14633823, amaury-golitin.
    """
    url_split = data_athlete_url.split('/')[-1].split('-')
    athlete_id = int(url_split[-1])
    athlete_name = '-'.join(url_split[:-1])
    return athlete_id, athlete_name

def create_athlete_id_to_athlete(data_athlete_urls: List[str]) -> Dict[int, str]:
    """
    Returns a dictionary mapping an athletes id to their name.

    Params:
        data_athlete_urls: List of data-athlete-urls.

    Returns:
        athlete_id_to_name: dict of athlete id to name.
    """
    athlete_id_to_name = {}
    for data_athlete_url in data_athlete_urls:
        athlete_id, athlete_name = extract_athlete_name_and_id(data_athlete_url=data_athlete_url)
        athlete_id_to_name[athlete_id] = athlete_name
    return athlete_id_to_name

def print_athlete_id_to_name(athlete_id_to_name: Dict[int, str]) -> None:
    """
    Utility printing function to check scraping successful.

    Params:
        athlete_id_to_name: dict of athlete id to name

    Returns:
        None
    """
    for athlete_id, athlete_name in athlete_id_to_name.items():
        print(f"{athlete_id}: {athlete_name}")



URL = 'https://www.worldathletics.org/world-rankings/100m/men?regionType=world&page=1&rankDate=2022-10-04&limitByCountry=0'
results = extract_hidden_table_rows(URL=URL)
data_athlete_urls = extract_data_athlete_urls(results=results)
athlete_id_to_name = create_athlete_id_to_athlete(data_athlete_urls=data_athlete_urls)

print_athlete_id_to_name(athlete_id_to_name=athlete_id_to_name)





14519911: trayvon-bromell
14504382: fred-kerley
14425680: marvin-bracy
14541956: christian-coleman
14417763: akani-simbine
14453864: lamont-marcell-jacobs
14737998: oblique-seville
14201842: yohan-blake
14747153: ferdinand-omanyala
14366482: aaron-brown
14638971: ackeem-blake
14522622: reece-prescod
14432013: elijah-hall
14671546: abdul-hakim-sani-brown
14536762: noah-lyles
14466007: brandon-carnes
14476000: kyree-king
14414524: zharnel-hughes
14715873: micah-williams
14715661: yupun-abeykoon
14636943: arthur-cisse
14771648: jeremiah-azu
14629201: cravont-charleston
14465376: kendal-williams
14888403: favour-oghene-tejiri-ashe
14657140: felipe-bardi
14883897: letsile-tebogo
14375111: emmanuel-matadi
14535607: andre-de-grasse
14714099: raymond-ekevwo
14873268: benjamin-azamati
14702316: mouhamadou-fall
14334964: jimmy-vicaut
14654737: jerome-blake
14249856: michael-rodgers
14731617: jake-doran
14413736: henricho-bruintjies
14701305: chituru-ali
14469945: cejhae-greene
14417680: emile-er

In [69]:
page = requests.get("https://worldathletics.org/athletes/united-states/trayvon-bromell-14519911")
print(page.headers)

{'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Date': 'Thu, 13 Oct 2022 16:49:26 GMT', 'X-Powered-By': 'Next.js', 'ETag': '"2708f-9Kp7cv1dhZlcZogtii4mIt9lexA"', 'Content-Encoding': 'gzip', 'Vary': 'Accept-Encoding', 'X-Cache': 'Miss from cloudfront', 'Via': '1.1 9a4946b43dbf1005ebaa0c93701f16ec.cloudfront.net (CloudFront)', 'X-Amz-Cf-Pop': 'LHR61-P3', 'X-Amz-Cf-Id': '8lY1ONZijjvXnmMH8lWZLZCsb59yPg6Jrv5xjpEWNO-c-Zp-CmzYKQ==', 'X-XSS-Protection': '1; mode=block', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'strict-origin-when-cross-origin', 'X-Content-Type-Options': 'nosniff', 'Strict-Transport-Security': 'max-age=31536000'}


## GraphQL Queries



In [65]:
def create_headers() -> Dict[str, str]:
    """
    Returns a dict containing the required HTTP headers for the graphql request.
    """
    headers = {
        "Host": "x2iza5bwnneavfz3tdx4osjp6e.appsync-api.eu-west-1.amazonaws.com",
        "Accept": "*/*",
        "Accept-Language": "en-GB,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://worldathletics.org/",
        "content-type": "application/json",
        "x-api-key": "da2-i7akoctidrg5xi7atduiyjebym",
        "x-amz-user-agent": "aws-amplify/3.0.2",
        "Origin": "https://worldathletics.org",
        "Connection": "keep-alive",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "cross-site",
        "Sec-GPC": "1",
        "TE": "trailers"
    }
    return headers

def create_season_payload(athlete_id: int) -> Dict:
    """
    Create the payload for the athletes season graphql request.

    Params:
        athlete_id: Identifier for an athlete.
    
    Returns:
        payload: graphql request payload.
    """
    payload={
        "operationName":"GetSingleCompetitorResultsDiscipline",
        "variables":{
            "resultsByYearOrderBy":"discipline",
            "id":athlete_id,
            "resultsByYear": "2021"
        },
        "query":"query GetSingleCompetitorResultsDiscipline($id: Int, $resultsByYearOrderBy: String, $resultsByYear: Int) {\n  getSingleCompetitorResultsDiscipline(id: $id, resultsByYear: $resultsByYear, resultsByYearOrderBy: $resultsByYearOrderBy) {\n    parameters {\n      resultsByYear\n      resultsByYearOrderBy\n      __typename\n    }\n    activeYears\n    resultsByEvent {\n      indoor\n      disciplineCode\n      disciplineNameUrlSlug\n      typeNameUrlSlug\n      discipline\n      withWind\n      results {\n        date\n        competition\n        venue\n        country\n        category\n        race\n        place\n        mark\n        wind\n        notLegal\n        resultScore\n        remark\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n}\n"
    }
    return payload

def create_all_time_payload(athlete_id: int) -> Dict:
    """
    Create the payload for the athletes all-time graphql request.

    Params:
        athlete_id: Identifier for an athlete.
    
    Returns:
        payload: graphql request payload.
    """
    payload={
        "operationName": "GetSingleCompetitorAllTimePersonalTop10",
        "variables":{
            "allTimePersonalTop10Discipline": 10229630, #fixed id for 100m
            "id":athlete_id,
        },
        "query":"query GetSingleCompetitorAllTimePersonalTop10($id: Int, $urlSlug: String, $allTimePersonalTop10Discipline: Int) {\n  getSingleCompetitorAllTimePersonalTop10(id: $id, urlSlug: $urlSlug, allTimePersonalTop10Discipline: $allTimePersonalTop10Discipline) {\n    parameters {\n      allTimePersonalTop10Discipline\n      __typename\n    }\n    disciplines {\n      id\n      name\n      __typename\n    }\n    results {\n      discipline\n      date\n      competition\n      country\n      category\n      race\n      place\n      result\n      wind\n      drop\n      withWind\n      withDrop\n      score\n      records\n      remark\n      __typename\n    }\n    __typename\n  }\n}\n"
    }
    return payload

def create_basic_df_from_event_results(event_results: Dict) -> pd.DataFrame:
    df = pd.DataFrame.from_records(event_results)
    df = df.drop(['remark', '__typename'], axis=1)
    return df


def filter_season_results(data: Dict) -> pd.DataFrame:
    events = data['data']['getSingleCompetitorResultsDiscipline']['resultsByEvent']
    event_results = None
    for event in events:
        if event["disciplineCode"] == "100":
            event_results = event['results']

    df = create_basic_df_from_event_results(event_results)
    return df

def filter_all_time_results(data: Dict) -> pd.DataFrame:
    event_results = data["data"]["getSingleCompetitorAllTimePersonalTop10"]["results"]
    df = create_basic_df_from_event_results(event_results)
    return df

def init_athlete_to_results(athlete_id_to_name: Dict[int, str]) -> Dict[str, Dict[str, pd.DataFrame]]:
    athlete_to_results = {}
    for athlete_name in athlete_id_to_name.values():
        athlete_to_results[athlete_name] = dict.fromkeys(["season", "all_time"], None)
    return athlete_to_results

def make_graphql_request(url: str, headers: Dict, payload: Dict) -> Dict:
    response = requests.post(url=url, json=payload, headers=headers)
    data = response.json()
    return data

def get_results(athlete_id, payload_func, filter_func) -> pd.DataFrame:
    payload = payload_func(athlete_id=athlete_id)
    data = make_graphql_request(url=url, headers=headers, payload=payload)
    df = filter_func(data) 
    return df

def get_athlete_results(athlete_id: int) -> pd.DataFrame:
    season_df = get_results(athlete_id, create_season_payload, filter_season_results)
    all_time_df = get_results(athlete_id, create_all_time_payload, filter_all_time_results)

    return season_df, all_time_df

def create_initial_dataset(athlete_id_to_name: Dict[int, str], athlete_to_results: Dict[str, Dict[str, None]]) -> Dict[str, Dict[str, pd.DataFrame]]:
    for athlete_id, athlete_name in athlete_id_to_name.items():
        season_df, all_time_df = get_athlete_results(athlete_id)
        athlete_to_results[athlete_name]["season"] = season_df
        athlete_to_results[athlete_name]["all_time"] = all_time_df
    
    return athlete_to_results


url = "https://x2iza5bwnneavfz3tdx4osjp6e.appsync-api.eu-west-1.amazonaws.com/graphql"
headers = create_headers()
athlete_to_results = init_athlete_to_results(athlete_id_to_name)
athlete_to_results = create_initial_dataset(athlete_id_to_name=athlete_id_to_name, athlete_to_results=athlete_to_results)


TypeError: 'NoneType' object is not subscriptable

In [5]:
pickle.dump(athlete_to_results, open('data.pickle', 'wb'))

In [9]:
season = pickle.load(open('data.pickle', 'rb'))
season

{'trayvon-bromell': {'season':            date                                        competition  \
  0   30 APR 2022  UNF Invitational, Jax Track at Hodges Stadium,...   
  1   12 MAY 2022  Puerto Rico International Athletics Classic, E...   
  2   12 MAY 2022  Puerto Rico International Athletics Classic, E...   
  3   21 MAY 2022  Diamond League Meeting, Alexander Stadium, Bir...   
  4   28 MAY 2022  Prefontaine Classic, Hayward Field, Eugene, OR...   
  5   23 JUN 2022  Toyota USATF Outdoor Championships, Hayward Fi...   
  6   24 JUN 2022  Toyota USATF Outdoor Championships, Hayward Fi...   
  7   24 JUN 2022  Toyota USATF Outdoor Championships, Hayward Fi...   
  8   15 JUL 2022  World Athletics Championships, Oregon 2022, Ha...   
  9   16 JUL 2022  World Athletics Championships, Oregon 2022, Ha...   
  10  16 JUL 2022  World Athletics Championships, Oregon 2022, Ha...   
  11  06 AUG 2022  Kamila Skolimowska Memorial, Stadion Śląski, C...   
  12  08 AUG 2022  Gyulai István Me

In [7]:
test['trayvon-bromell']['season']

Unnamed: 0,date,competition,venue,country,category,race,place,mark,wind,notLegal,resultScore
0,30 APR 2022,"UNF Invitational, Jax Track at Hodges Stadium,...","Jax Track at Hodges Stadium, Jacksonville, FL ...",USA,F,F1,1.0,9.75,2.1,True,1282
1,12 MAY 2022,"Puerto Rico International Athletics Classic, E...","Estadio Francisco Montaner, Ponce (PUR)",PUR,B,H2,1.0,10.06,0.6,False,1186
2,12 MAY 2022,"Puerto Rico International Athletics Classic, E...","Estadio Francisco Montaner, Ponce (PUR)",PUR,B,F,1.0,9.92,-0.2,False,1235
3,21 MAY 2022,"Diamond League Meeting, Alexander Stadium, Bir...","Alexander Stadium, Birmingham (GBR)",GBR,GW,F,,DQ,-0.2,False,0
4,28 MAY 2022,"Prefontaine Classic, Hayward Field, Eugene, OR...","Hayward Field, Eugene, OR (USA)",USA,GW,F,1.0,9.93,-0.2,False,1232
5,23 JUN 2022,"Toyota USATF Outdoor Championships, Hayward Fi...","Hayward Field, Eugene, OR (USA)",USA,B,H1,1.0,10.10,-0.4,False,1174
6,24 JUN 2022,"Toyota USATF Outdoor Championships, Hayward Fi...","Hayward Field, Eugene, OR (USA)",USA,B,SF2,1.0,9.81,1.5,False,1273
7,24 JUN 2022,"Toyota USATF Outdoor Championships, Hayward Fi...","Hayward Field, Eugene, OR (USA)",USA,B,F,3.0,9.88,1.8,False,1248
8,15 JUL 2022,"World Athletics Championships, Oregon 2022, Ha...","Hayward Field, Eugene, OR (USA)",USA,OW,H3,1.0,9.89,0.6,False,1245
9,16 JUL 2022,"World Athletics Championships, Oregon 2022, Ha...","Hayward Field, Eugene, OR (USA)",USA,OW,SF1,2.0,9.97,0.3,False,1217


In [8]:
athlete_to_results['trayvon-bromell']['all_time']

Unnamed: 0,discipline,date,competition,country,category,race,place,result,wind,drop,withWind,withDrop,score,records
0,100 Metres,18 SEP 2021,"Kip Keino Classic, Moi International Sports Ce...",KEN,A,F,1.0,9.76,1.2,,True,False,1291,[]
1,100 Metres,05 JUN 2021,"NACAC New Life Invitational, Ansin Sports Comp...",USA,B,F2,1.0,9.77,1.5,,True,False,1287,[]
2,100 Metres,20 JUN 2021,"U.S. Olympic Trials, Hayward Field, Eugene, OR",USA,B,F,1.0,9.8,0.8,,True,False,1276,[]
3,100 Metres,24 JUN 2022,"Toyota USATF Outdoor Championships, Hayward Fi...",USA,B,SF2,1.0,9.81,1.5,,True,False,1273,[]
4,100 Metres,25 JUN 2015,"Eugene USA Ch., Hayward Field, Eugene, OR",USA,B,H4,1.0,9.84,1.3,,True,False,1262,[]
5,100 Metres,03 JUL 2016,"Eugene U.S. Olympic Trials, Hayward Field, Eug...",USA,B,F,2.0,9.84,1.6,,True,False,1262,[]
6,100 Metres,03 JUL 2016,"Eugene U.S. Olympic Trials, Hayward Field, Eug...",USA,B,SF2,1.0,9.86,2.0,,True,False,1255,[]
7,100 Metres,30 APR 2021,"North Florida Invitational, Jax Track at Hodge...",USA,F,F,1.0,9.88,1.5,,True,False,1248,[]
8,100 Metres,24 JUN 2022,"Toyota USATF Outdoor Championships, Hayward Fi...",USA,B,F,3.0,9.88,1.8,,True,False,1248,[]
9,100 Metres,16 JUL 2022,"World Athletics Championships, Oregon 2022, Ha...",USA,OW,F,3.0,9.88,-0.1,,True,False,1248,[]


In [2]:
def wind_adjusted_time(time, wind):
    # https://www.tandfonline.com/doi/full/10.1080/17461391.2018.1480062
    a = 0.009459
    B = 0.0449
    b = 0.0042
    adjusted = time - (B*wind) + (a*time*wind) - (b*wind*wind) 
    return round(adjusted, 2)

# Feature Engineering

In [28]:
athlete_to_results = pickle.load(open('data.pickle', 'rb'))
# https://www.worldathletics.org/world-ranking-rules/track-field-events

event_precedence_map = {
    "OW": 1,
    "DF": 2,
    "GW": 3,
    "GL": 4,
    "A": 4,
    "B": 5,
    "C": 6,
    "D": 7,
    "E": 8,
    "F": 9,
}

def wind_adjusted_time(time, wind):
    # https://www.tandfonline.com/doi/full/10.1080/17461391.2018.1480062
    a = 0.009459
    B = 0.0449
    b = 0.0042
    adjusted = time - (B*wind) + (a*time*wind) - (b*wind*wind) 
    return round(adjusted, 2)

def ensure_df_types(df: pd.DataFrame) -> pd.DataFrame:
    df['date'] = pd.to_datetime(df['date'])
    df['time'] = pd.to_numeric(df['time'], errors='coerce')
    df['place'] = pd.to_numeric(df['place'], errors='coerce')
    df['wind'] = pd.to_numeric(df['wind'], errors='coerce')
    df = df.dropna()
    df['time'] = df['time'].astype(float)
    df['place'] = df['place'].astype(float).astype(int)
    df['wind'] = df['wind'].astype(float)
    df['wind_adjusted_time'] = df.apply(lambda x: wind_adjusted_time(time=x['time'], wind=x['wind']), axis=1)
    df['category'] = df['category'].map(event_precedence_map)
    return df



def clean_season_df(season: pd.DataFrame) -> pd.DataFrame:
    season = season.rename(columns={'mark': 'time', 'resultScore': 'score'})
    # able to drop race as its information is contained within score
    season = season.drop(columns=['competition', 'country', 'venue', 'notLegal', 'race'], axis=1)
    season = season[season['score'] != 0]
    season = ensure_df_types(season)
    return season

# def clean_all_time_frame(all_time: pd.DataFrame) -> pd.DataFrame:
def clean_all_time_df(all_time: pd.DataFrame) -> pd.DataFrame:
    all_time = all_time.rename(columns={'result': 'time'})
    all_time = all_time.drop(columns=['competition', 'country', 'race', 'discipline', 'drop', 'withWind', 'withDrop', 'records'], axis=1)
    all_time = ensure_df_types(all_time)
    return all_time
    


failed_athletes = []
new_dict = {}
for athlete, results in athlete_to_results.items():
    try:
        new_dict[athlete] = clean_season_df(results['season'])

    except:
        failed_athletes.append(athlete)
        pass
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time'] = df['time'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['place'] = df['place'].astype(float).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['wind'] = df['wind'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using 

# Kalman Filter

In [3]:

def calc_kalman_gain(noise, uncertainty):
    kalman_gain = noise/(noise + uncertainty)
    return kalman_gain

def calc_update(mean, kalman_gain, measurement):
    new_mean = mean + kalman_gain * (measurement - mean)
    return new_mean

def calc_drift(uncertainty, time, drift):
    uncertainty += time * drift
    return uncertainty

def extract_x0(x0):
    noise = x0[0]
    uncertainty = x0[1]
    mean = x0[2]
    drift = x0[3]
    return noise, uncertainty, mean, drift
    


def kalman_filter(x0, args):
    season = args
    noise, uncertainty, mean, drift = extract_x0(x0)
    error = 0
    # iterate
    for row in season.itertuples():
        kg = calc_kalman_gain(noise=noise, uncertainty=uncertainty)
        error += abs(mean - row.wind_adjusted_time)
        mean = calc_update(mean=mean, kalman_gain=kg, measurement=row.wind_adjusted_time)
        uncertainty = calc_drift(uncertainty=uncertainty, time=mean, drift=drift)

    return error

def calc_params(season: pd.DataFrame):
    initial_params = [2, 2, 10, 0.01]
    bounds = ((0, None), (0, None), (0, None), (0, None)) 
    return minimize(fun=kalman_filter, x0=initial_params, args=season, bounds=bounds, options={"disp": False})

def calc_estimated_next_run(x0, previous_run):
    noise, uncertainty, mean, drift = extract_x0(x0)
    kg = calc_kalman_gain(noise=noise, uncertainty=uncertainty)
    mean = calc_update(mean=mean, kalman_gain=kg, measurement=previous_run)
    return mean

def kalman_filter_prediction(season: pd.DataFrame):
    x = calc_params(season)
    previous_run = season.tail(1)['wind_adjusted_time']
    result = calc_estimated_next_run(x.x, previous_run=previous_run)
    return result


In [89]:
athlete_season

Unnamed: 0,date,category,place,time,wind,score,wind_adjusted_time
0,2022-04-30,9,1,9.75,2.1,1282,9.83
1,2022-05-12,5,1,10.06,0.6,1186,10.09
2,2022-05-12,5,1,9.92,-0.2,1235,9.91
4,2022-05-28,3,1,9.93,-0.2,1232,9.92
5,2022-06-23,5,1,10.1,-0.4,1174,10.08
6,2022-06-24,5,1,9.81,1.5,1273,9.87
7,2022-06-24,5,3,9.88,1.8,1248,9.95
8,2022-07-15,1,1,9.89,0.6,1245,9.92
9,2022-07-16,1,2,9.97,0.3,1217,9.98
10,2022-07-16,1,3,9.88,-0.1,1248,9.88


In [90]:
athlete_season[:-1]

Unnamed: 0,date,category,place,time,wind,score,wind_adjusted_time
0,2022-04-30,9,1,9.75,2.1,1282,9.83
1,2022-05-12,5,1,10.06,0.6,1186,10.09
2,2022-05-12,5,1,9.92,-0.2,1235,9.91
4,2022-05-28,3,1,9.93,-0.2,1232,9.92
5,2022-06-23,5,1,10.1,-0.4,1174,10.08
6,2022-06-24,5,1,9.81,1.5,1273,9.87
7,2022-06-24,5,3,9.88,1.8,1248,9.95
8,2022-07-15,1,1,9.89,0.6,1245,9.92
9,2022-07-16,1,2,9.97,0.3,1217,9.98
10,2022-07-16,1,3,9.88,-0.1,1248,9.88


In [87]:
next_run

13    9.92
Name: wind_adjusted_time, dtype: float64

In [34]:
# athlete_season = athlete_to_results['trayvon-bromell']['season']
# athlete_all_time = athlete_to_results['trayvon-bromell']['all_time']

# athlete_season = clean_season_df(athlete_season)
# athlete_all_time = clean_all_time_df(athlete_all_time)


def create_athlete_final_dict(season: pd.DataFrame, all_time: pd.DataFrame, athlete: str) -> Dict:
    athlete_season = clean_season_df(season)
    athlete_all_time = clean_all_time_df(all_time)

    athlete_all_time = athlete_all_time.sort_values(by='time', ascending=True)
    athlete_season = athlete_season.sort_values(by='date', ascending=True)

    athlete_final_dict = {}
    athlete_final_dict['athlete'] = athlete

    # separate target now to avoid information 'leakage' into feature creation
    athlete_final_dict['next_run'] = float(athlete_season.tail(1)['wind_adjusted_time'])
    athlete_season = athlete_season[:-1].copy()

    athlete_final_dict['season_time_best'] = athlete_season['wind_adjusted_time'].min()
    athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()
    athlete_final_dict['season_time_most_recent_3_avg'] =athlete_season.tail(3)['wind_adjusted_time'].mean()
    athlete_final_dict['season_time_kfp'] = float(kalman_filter_prediction(athlete_season))
    athlete_final_dict['season_time_avg'] = athlete_season['wind_adjusted_time'].mean()
    athlete_final_dict['season_score_best'] = athlete_season['score'].max()
    athlete_final_dict['season_score_avg'] = athlete_season['score'].mean()
    athlete_final_dict['years_since_pb'] = 2022 - athlete_all_time['date'][0].year
    athlete_final_dict['all_time_time_best'] = athlete_all_time['wind_adjusted_time'][0]
    athlete_final_dict['all_time_score_best'] = athlete_all_time['score'][0]
    athlete_final_dict['all_time_time_top_3_avg'] = athlete_all_time.head(3)['wind_adjusted_time'].mean()
    print(athlete_final_dict)

    return athlete_final_dict


In [35]:
final_athlete_dict_list = []
for athlete, results in athlete_to_results.items():
    final_athlete_dict = create_athlete_final_dict(season=results['season'], all_time=results['all_time'], athlete=athlete)
    final_athlete_dict_list.append(final_athlete_dict)

print(len(final_athlete_dict_list))
    

  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()


{'athlete': 'trayvon-bromell', 'next_run': 9.92, 'season_time_best': 9.83, 'season_time_top_3_avg': 9.86, 'season_time_most_recent_3_avg': 9.953333333333333, 'season_time_kfp': 9.948907820199418, 'season_time_avg': 9.950833333333334, 'season_score_best': 1282, 'season_score_avg': 1230.9166666666667, 'years_since_pb': 1, 'all_time_time_best': 9.81, 'all_time_score_best': 1291, 'all_time_time_top_3_avg': 9.826666666666666}


  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()
  kalman_gain = noise/(noise + uncertainty)


{'athlete': 'fred-kerley', 'next_run': 9.86, 'season_time_best': 9.79, 'season_time_top_3_avg': 9.816666666666666, 'season_time_most_recent_3_avg': 9.883333333333333, 'season_time_kfp': 10.019999339366578, 'season_time_avg': 9.911111111111111, 'season_score_best': 1291, 'season_score_avg': 1246.5555555555557, 'years_since_pb': 0, 'all_time_time_best': 9.82, 'all_time_score_best': 1291, 'all_time_time_top_3_avg': 9.816666666666666}


  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()


{'athlete': 'marvin-bracy', 'next_run': 9.97, 'season_time_best': 9.88, 'season_time_top_3_avg': 9.9, 'season_time_most_recent_3_avg': 10.046666666666667, 'season_time_kfp': 10.029999997162408, 'season_time_avg': 10.062000000000001, 'season_score_best': 1259, 'season_score_avg': 1190.8666666666666, 'years_since_pb': 1, 'all_time_time_best': 9.91, 'all_time_score_best': 1259, 'all_time_time_top_3_avg': 9.886666666666668}


  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()


{'athlete': 'christian-coleman', 'next_run': 10.23, 'season_time_best': 9.93, 'season_time_top_3_avg': 9.97, 'season_time_most_recent_3_avg': 10.046666666666667, 'season_time_kfp': 10.090099363333383, 'season_time_avg': 10.043000000000001, 'season_score_best': 1252, 'season_score_avg': 1193.9, 'years_since_pb': 3, 'all_time_time_best': 9.79, 'all_time_score_best': 1291, 'all_time_time_top_3_avg': 9.793333333333335}


  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()


{'athlete': 'akani-simbine', 'next_run': 10.2, 'season_time_best': 9.98, 'season_time_top_3_avg': 9.986666666666666, 'season_time_most_recent_3_avg': 10.1, 'season_time_kfp': 10.194268743410749, 'season_time_avg': 10.139, 'season_score_best': 1217, 'season_score_avg': 1160.35, 'years_since_pb': 1, 'all_time_time_best': 9.89, 'all_time_score_best': 1262, 'all_time_time_top_3_avg': 9.933333333333332}
{'athlete': 'lamont-marcell-jacobs', 'next_run': 9.95, 'season_time_best': 10.01, 'season_time_top_3_avg': 10.036666666666667, 'season_time_most_recent_3_avg': 10.043333333333335, 'season_time_kfp': 10.062883235322209, 'season_time_avg': 10.075, 'season_score_best': 1206, 'season_score_avg': 1184.5, 'years_since_pb': 1, 'all_time_time_best': 9.8, 'all_time_score_best': 1276, 'all_time_time_top_3_avg': 9.873333333333333}


  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()
  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()


{'athlete': 'oblique-seville', 'next_run': 9.97, 'season_time_best': 9.87, 'season_time_top_3_avg': 9.896666666666667, 'season_time_most_recent_3_avg': 9.92, 'season_time_kfp': 10.067638886740058, 'season_time_avg': 10.061428571428573, 'season_score_best': 1255, 'season_score_avg': 1181.4285714285713, 'years_since_pb': 0, 'all_time_time_best': 9.87, 'all_time_score_best': 1255, 'all_time_time_top_3_avg': 9.896666666666667}
{'athlete': 'yohan-blake', 'next_run': 10.03, 'season_time_best': 9.89, 'season_time_top_3_avg': 9.923333333333334, 'season_time_most_recent_3_avg': 10.016666666666666, 'season_time_kfp': 10.114216252096453, 'season_time_avg': 10.095263157894735, 'season_score_best': 1259, 'season_score_avg': 1163.2105263157894, 'years_since_pb': 10, 'all_time_time_best': 9.69, 'all_time_score_best': 1316, 'all_time_time_top_3_avg': 9.766666666666667}


  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time'] = df['time'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['place'] = df['place'].astype(float).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['w

{'athlete': 'ferdinand-omanyala', 'next_run': 10.04, 'season_time_best': 9.93, 'season_time_top_3_avg': 9.936666666666666, 'season_time_most_recent_3_avg': 10.13, 'season_time_kfp': 10.179560637733408, 'season_time_avg': 10.084347826086956, 'season_score_best': 1259, 'season_score_avg': 1180.391304347826, 'years_since_pb': 1, 'all_time_time_best': 9.82, 'all_time_score_best': 1287, 'all_time_time_top_3_avg': 9.88}


  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()


{'athlete': 'aaron-brown', 'next_run': 10.04, 'season_time_best': 10.01, 'season_time_top_3_avg': 10.020000000000001, 'season_time_most_recent_3_avg': 10.116666666666667, 'season_time_kfp': 10.142288623233124, 'season_time_avg': 10.124117647058823, 'season_score_best': 1221, 'season_score_avg': 1156.2941176470588, 'years_since_pb': 6, 'all_time_time_best': 10.04, 'all_time_score_best': 1220, 'all_time_time_top_3_avg': 10.053333333333333}


  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()
  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()


{'athlete': 'ackeem-blake', 'next_run': 10.08, 'season_time_best': 9.96, 'season_time_top_3_avg': 9.963333333333333, 'season_time_most_recent_3_avg': 10.053333333333333, 'season_time_kfp': 10.125019283423422, 'season_time_avg': 10.122857142857143, 'season_score_best': 1231, 'season_score_avg': 1167.0, 'years_since_pb': 0, 'all_time_time_best': 9.97, 'all_time_score_best': 1231, 'all_time_time_top_3_avg': 9.98}
{'athlete': 'reece-prescod', 'next_run': 10.14, 'season_time_best': 9.87, 'season_time_top_3_avg': 9.996666666666664, 'season_time_most_recent_3_avg': 10.153333333333334, 'season_time_kfp': 10.30987528029026, 'season_time_avg': 10.193333333333333, 'season_score_best': 1238, 'season_score_avg': 1146.0833333333333, 'years_since_pb': 0, 'all_time_time_best': 9.87, 'all_time_score_best': 1238, 'all_time_time_top_3_avg': 9.913333333333334}


  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()


{'athlete': 'elijah-hall', 'next_run': 10.07, 'season_time_best': 9.97, 'season_time_top_3_avg': 9.993333333333332, 'season_time_most_recent_3_avg': 10.036666666666667, 'season_time_kfp': 10.049999998516059, 'season_time_avg': 10.08111111111111, 'season_score_best': 1241, 'season_score_avg': 1181.6666666666667, 'years_since_pb': 0, 'all_time_time_best': 9.97, 'all_time_score_best': 1241, 'all_time_time_top_3_avg': 10.026666666666666}


  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()
  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()


{'athlete': 'abdul-hakim-sani-brown', 'next_run': 10.17, 'season_time_best': 9.96, 'season_time_top_3_avg': 10.023333333333333, 'season_time_most_recent_3_avg': 10.073333333333332, 'season_time_kfp': 10.139897611658473, 'season_time_avg': 10.121666666666668, 'season_score_best': 1214, 'season_score_avg': 1167.5, 'years_since_pb': 3, 'all_time_time_best': 10.01, 'all_time_score_best': 1217, 'all_time_time_top_3_avg': 10.013333333333334}
{'athlete': 'noah-lyles', 'next_run': 9.96, 'season_time_best': 9.96, 'season_time_top_3_avg': 10.01, 'season_time_most_recent_3_avg': 10.036666666666667, 'season_time_kfp': 10.065067888999955, 'season_time_avg': 10.032, 'season_score_best': 1218, 'season_score_avg': 1159.8, 'years_since_pb': 3, 'all_time_time_best': 9.9, 'all_time_score_best': 1255, 'all_time_time_top_3_avg': 9.916666666666666}


  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()
  kalman_gain = noise/(noise + uncertainty)


{'athlete': 'brandon-carnes', 'next_run': 10.03, 'season_time_best': 10.01, 'season_time_top_3_avg': 10.036666666666667, 'season_time_most_recent_3_avg': 10.12, 'season_time_kfp': 10.119999997505204, 'season_time_avg': 10.132105263157895, 'season_score_best': 1216, 'season_score_avg': 1153.9473684210527, 'years_since_pb': 0, 'all_time_time_best': 10.01, 'all_time_score_best': 1200, 'all_time_time_top_3_avg': 10.07}
{'athlete': 'kyree-king', 'next_run': 10.35, 'season_time_best': 10.01, 'season_time_top_3_avg': 10.026666666666666, 'season_time_most_recent_3_avg': 10.26, 'season_time_kfp': 10.190054935033684, 'season_time_avg': 10.142, 'season_score_best': 1220, 'season_score_avg': 1158.2, 'years_since_pb': 0, 'all_time_time_best': 10.04, 'all_time_score_best': 1220, 'all_time_time_top_3_avg': 10.043333333333333}


  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()
  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()


{'athlete': 'zharnel-hughes', 'next_run': 9.99, 'season_time_best': 9.97, 'season_time_top_3_avg': 9.993333333333334, 'season_time_most_recent_3_avg': 10.046666666666667, 'season_time_kfp': 10.03, 'season_time_avg': 10.1, 'season_score_best': 1217, 'season_score_avg': 1171.0, 'years_since_pb': 4, 'all_time_time_best': 9.93, 'all_time_score_best': 1238, 'all_time_time_top_3_avg': 9.936666666666666}


  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()


{'athlete': 'micah-williams', 'next_run': 9.97, 'season_time_best': 9.89, 'season_time_top_3_avg': 9.923333333333334, 'season_time_most_recent_3_avg': 10.116666666666667, 'season_time_kfp': 9.986584119912342, 'season_time_avg': 10.066666666666666, 'season_score_best': 1255, 'season_score_avg': 1192.1666666666667, 'years_since_pb': 0, 'all_time_time_best': 9.89, 'all_time_score_best': 1255, 'all_time_time_top_3_avg': 9.936666666666666}


  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()


{'athlete': 'yupun-abeykoon', 'next_run': 10.12, 'season_time_best': 10.01, 'season_time_top_3_avg': 10.023333333333333, 'season_time_most_recent_3_avg': 10.139999999999999, 'season_time_kfp': 10.129999999999935, 'season_time_avg': 10.129999999999999, 'season_score_best': 1220, 'season_score_avg': 1163.8461538461538, 'years_since_pb': 0, 'all_time_time_best': 10.03, 'all_time_score_best': 1220, 'all_time_time_top_3_avg': 10.063333333333333}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time'] = df['time'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['place'] = df['place'].astype(float).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['wind'] = df['wind'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using 

{'athlete': 'arthur-cisse', 'next_run': 10.31, 'season_time_best': 10.05, 'season_time_top_3_avg': 10.07, 'season_time_most_recent_3_avg': 10.17, 'season_time_kfp': 10.169999994549327, 'season_time_avg': 10.207368421052632, 'season_score_best': 1199, 'season_score_avg': 1139.7368421052631, 'years_since_pb': 3, 'all_time_time_best': 10.01, 'all_time_score_best': 1231, 'all_time_time_top_3_avg': 9.993333333333332}
{'athlete': 'jeremiah-azu', 'next_run': 10.14, 'season_time_best': 10.0, 'season_time_top_3_avg': 10.066666666666665, 'season_time_most_recent_3_avg': 10.143333333333333, 'season_time_kfp': 10.22507340281185, 'season_time_avg': 10.192941176470587, 'season_score_best': 1226, 'season_score_avg': 1144.5294117647059, 'years_since_pb': 0, 'all_time_time_best': 10.14, 'all_time_score_best': 1162, 'all_time_time_top_3_avg': 10.14}


  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()
  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()


{'athlete': 'cravont-charleston', 'next_run': 10.3, 'season_time_best': 10.04, 'season_time_top_3_avg': 10.069999999999999, 'season_time_most_recent_3_avg': 10.159999999999998, 'season_time_kfp': 10.169999998413692, 'season_time_avg': 10.197333333333335, 'season_score_best': 1213, 'season_score_avg': 1150.4666666666667, 'years_since_pb': 0, 'all_time_time_best': 10.04, 'all_time_score_best': 1213, 'all_time_time_top_3_avg': 10.096666666666666}


  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()
  kalman_gain = noise/(noise + uncertainty)


{'athlete': 'kendal-williams', 'next_run': 10.04, 'season_time_best': 9.98, 'season_time_top_3_avg': 10.016666666666666, 'season_time_most_recent_3_avg': 10.146666666666667, 'season_time_kfp': 10.17, 'season_time_avg': 10.135, 'season_score_best': 1213, 'season_score_avg': 1167.111111111111, 'years_since_pb': 4, 'all_time_time_best': 10.04, 'all_time_score_best': 1210, 'all_time_time_top_3_avg': 10.040000000000001}


  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()


{'athlete': 'favour-oghene-tejiri-ashe', 'next_run': 10.17, 'season_time_best': 9.9, 'season_time_top_3_avg': 9.963333333333333, 'season_time_most_recent_3_avg': 10.066666666666666, 'season_time_kfp': 10.190752628467104, 'season_time_avg': 10.087857142857143, 'season_score_best': 1262, 'season_score_avg': 1180.0714285714287, 'years_since_pb': 0, 'all_time_time_best': 9.99, 'all_time_score_best': 1210, 'all_time_time_top_3_avg': 10.020000000000001}


  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()


{'athlete': 'felipe-bardi', 'next_run': 10.09, 'season_time_best': 10.02, 'season_time_top_3_avg': 10.073333333333332, 'season_time_most_recent_3_avg': 10.263333333333334, 'season_time_kfp': 10.26999998334731, 'season_time_avg': 10.267894736842106, 'season_score_best': 1182, 'season_score_avg': 1120.7894736842106, 'years_since_pb': 0, 'all_time_time_best': 10.09, 'all_time_score_best': 1182, 'all_time_time_top_3_avg': 10.113333333333333}


  athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()


{'athlete': 'letsile-tebogo', 'next_run': 9.95, 'season_time_best': 9.97, 'season_time_top_3_avg': 10.0, 'season_time_most_recent_3_avg': 10.100000000000001, 'season_time_kfp': 10.156091360598172, 'season_time_avg': 10.082857142857142, 'season_score_best': 1227, 'season_score_avg': 1186.857142857143, 'years_since_pb': 0, 'all_time_time_best': 9.95, 'all_time_score_best': 1238, 'all_time_time_top_3_avg': 9.993333333333332}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time'] = df['time'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['place'] = df['place'].astype(float).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['wind'] = df['wind'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using 

KeyError: 0

In [21]:
# season SB, TOP 3 avg, Most recent 3 avg, Kalman, season avg time, season best score, season avg score, category avg, next run as target
# all_time PB, TOP 3, Years since PB
athlete_season = athlete_to_results['trayvon-bromell']['season']
athlete_all_time = athlete_to_results['trayvon-bromell']['all_time']

athlete_season = clean_season_df(athlete_season)
athlete_all_time = clean_all_time_df(athlete_all_time)

athlete_all_time = athlete_all_time.sort_values(by='time', ascending=True)
athlete_season = athlete_season.sort_values(by='date', ascending=True)

final_df_dict = {}

# next_run = athlete_season.tail(1)['wind_adjusted_time']
final_df_dict['next_run'] = float(athlete_season.tail(1)['wind_adjusted_time'])


athlete_season = athlete_season[:-1].copy()
# season_time_best = athlete_season['time'].min()
final_df_dict['season_time_best'] = athlete_season['wind_adjusted_time'].min()

# season_time_top_3_avg = athlete_season['time'].sort_values()[:3].mean()
# print(season_top_3_avg)
final_df_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()

# season_time_most_recent_3_avg = athlete_season.tail(3)['wind_adjusted_time'].mean()
final_df_dict['season_time_most_recent_3_avg'] =athlete_season.tail(3)['wind_adjusted_time'].mean()

# print(season_most_recent_3_avg)

# season_time_kfp = kalman_filter_prediction(athlete_season)
final_df_dict['season_time_kfp'] = float(kalman_filter_prediction(athlete_season))

# print(season_kfp)
season_time_avg = athlete_season['wind_adjusted_time'].mean()

final_df_dict['season_time_avg'] = athlete_season['wind_adjusted_time'].mean()
# print(season_time_avg)
# season_score_best = athlete_season['score'].max()
final_df_dict['season_score_best'] = athlete_season['score'].max()

# season_score_avg = athlete_season['score'].mean()
final_df_dict['season_score_avg'] = athlete_season['score'].mean()

final_df_dict['years_since_pb'] = 2022 - athlete_all_time['date'][0].year

final_df_dict['all_time_time_best'] = athlete_all_time['wind_adjusted_time'][0]

final_df_dict['all_time_score_best'] = athlete_all_time['score'][0]

final_df_dict['all_time_time_top_3_avg'] = athlete_all_time.head(3)['wind_adjusted_time'].mean()

final_df_dict

  final_df_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()


{'next_run': 9.92,
 'season_time_best': 9.83,
 'season_time_top_3_avg': 9.86,
 'season_time_most_recent_3_avg': 9.953333333333333,
 'season_time_kfp': 9.948907820199418,
 'season_time_avg': 9.950833333333334,
 'season_score_best': 1282,
 'season_score_avg': 1230.9166666666667,
 'years_since_pb': 1,
 'all_time_time_best': 9.81,
 'all_time_score_best': 1291,
 'all_time_time_top_3_avg': 9.826666666666666}

In [23]:
df = pd.DataFrame([final_df_dict])
df

Unnamed: 0,next_run,season_time_best,season_time_top_3_avg,season_time_most_recent_3_avg,season_time_kfp,season_time_avg,season_score_best,season_score_avg,years_since_pb,all_time_time_best,all_time_score_best,all_time_time_top_3_avg
0,9.92,9.83,9.86,9.953333,9.948908,9.950833,1282,1230.916667,1,9.81,1291,9.826667


In [54]:
athlete_all_time = athlete_to_results['trayvon-bromell']['season']
athlete_all_time

Unnamed: 0,date,competition,venue,country,category,race,place,mark,wind,notLegal,resultScore
0,30 APR 2022,"UNF Invitational, Jax Track at Hodges Stadium,...","Jax Track at Hodges Stadium, Jacksonville, FL ...",USA,F,F1,1.0,9.75,2.1,True,1282
1,12 MAY 2022,"Puerto Rico International Athletics Classic, E...","Estadio Francisco Montaner, Ponce (PUR)",PUR,B,H2,1.0,10.06,0.6,False,1186
2,12 MAY 2022,"Puerto Rico International Athletics Classic, E...","Estadio Francisco Montaner, Ponce (PUR)",PUR,B,F,1.0,9.92,-0.2,False,1235
3,21 MAY 2022,"Diamond League Meeting, Alexander Stadium, Bir...","Alexander Stadium, Birmingham (GBR)",GBR,GW,F,,DQ,-0.2,False,0
4,28 MAY 2022,"Prefontaine Classic, Hayward Field, Eugene, OR...","Hayward Field, Eugene, OR (USA)",USA,GW,F,1.0,9.93,-0.2,False,1232
5,23 JUN 2022,"Toyota USATF Outdoor Championships, Hayward Fi...","Hayward Field, Eugene, OR (USA)",USA,B,H1,1.0,10.10,-0.4,False,1174
6,24 JUN 2022,"Toyota USATF Outdoor Championships, Hayward Fi...","Hayward Field, Eugene, OR (USA)",USA,B,SF2,1.0,9.81,1.5,False,1273
7,24 JUN 2022,"Toyota USATF Outdoor Championships, Hayward Fi...","Hayward Field, Eugene, OR (USA)",USA,B,F,3.0,9.88,1.8,False,1248
8,15 JUL 2022,"World Athletics Championships, Oregon 2022, Ha...","Hayward Field, Eugene, OR (USA)",USA,OW,H3,1.0,9.89,0.6,False,1245
9,16 JUL 2022,"World Athletics Championships, Oregon 2022, Ha...","Hayward Field, Eugene, OR (USA)",USA,OW,SF1,2.0,9.97,0.3,False,1217


In [60]:
athlete_all_time = athlete_to_results['trayvon-bromell']['all_time']

clean_all_time_frame(athlete_all_time)


Unnamed: 0,date,category,place,time,wind,score,wind_adjusted_time
0,2021-09-18,4,1,9.76,1.2,1291,9.81
1,2021-06-05,5,1,9.77,1.5,1287,9.83
2,2021-06-20,5,1,9.8,0.8,1276,9.84
3,2022-06-24,5,1,9.81,1.5,1273,9.87
4,2015-06-25,5,1,9.84,1.3,1262,9.9
5,2016-07-03,5,2,9.84,1.6,1262,9.91
6,2016-07-03,5,1,9.86,2.0,1255,9.94
7,2021-04-30,9,1,9.88,1.5,1248,9.94
8,2022-06-24,5,3,9.88,1.8,1248,9.95
9,2022-07-16,1,3,9.88,-0.1,1248,9.88


In [56]:
athlete_all_time

Unnamed: 0,discipline,date,competition,country,category,race,place,result,wind,drop,withWind,withDrop,score,records
0,100 Metres,18 SEP 2021,"Kip Keino Classic, Moi International Sports Ce...",KEN,A,F,1.0,9.76,1.2,,True,False,1291,[]
1,100 Metres,05 JUN 2021,"NACAC New Life Invitational, Ansin Sports Comp...",USA,B,F2,1.0,9.77,1.5,,True,False,1287,[]
2,100 Metres,20 JUN 2021,"U.S. Olympic Trials, Hayward Field, Eugene, OR",USA,B,F,1.0,9.8,0.8,,True,False,1276,[]
3,100 Metres,24 JUN 2022,"Toyota USATF Outdoor Championships, Hayward Fi...",USA,B,SF2,1.0,9.81,1.5,,True,False,1273,[]
4,100 Metres,25 JUN 2015,"Eugene USA Ch., Hayward Field, Eugene, OR",USA,B,H4,1.0,9.84,1.3,,True,False,1262,[]
5,100 Metres,03 JUL 2016,"Eugene U.S. Olympic Trials, Hayward Field, Eug...",USA,B,F,2.0,9.84,1.6,,True,False,1262,[]
6,100 Metres,03 JUL 2016,"Eugene U.S. Olympic Trials, Hayward Field, Eug...",USA,B,SF2,1.0,9.86,2.0,,True,False,1255,[]
7,100 Metres,30 APR 2021,"North Florida Invitational, Jax Track at Hodge...",USA,F,F,1.0,9.88,1.5,,True,False,1248,[]
8,100 Metres,24 JUN 2022,"Toyota USATF Outdoor Championships, Hayward Fi...",USA,B,F,3.0,9.88,1.8,,True,False,1248,[]
9,100 Metres,16 JUL 2022,"World Athletics Championships, Oregon 2022, Ha...",USA,OW,F,3.0,9.88,-0.1,,True,False,1248,[]


In [28]:
x = athlete_to_results['ferdinand-omanyala']['season']


df = clean_season_frame(x)
df

NameError: name 'clean_season_frame' is not defined

In [53]:
from scipy.optimize import minimize
season = athlete_season
def calc_kalman_gain(noise, uncertainty):
    kalman_gain = noise/(noise + uncertainty)
    return kalman_gain

def calc_update(mean, kalman_gain, measurement):
    new_mean = mean + kalman_gain * (measurement - mean)
    return new_mean

def calc_drift(uncertainty, time, drift):
    uncertainty += time * drift
    return uncertainty

def extract_x0(x0):
    noise = x0[0]
    uncertainty = x0[1]
    mean = x0[2]
    drift = x0[3]
    return noise, uncertainty, mean, drift
    


def kalman_filter(x0):
    noise, uncertainty, mean, drift = extract_x0(x0)
    # print(f"n{noise}, u{uncertainty}, m{mean}, d{drift}")
    
    error = 0
    # iterate
    for row in season.itertuples():
        kg = calc_kalman_gain(noise=noise, uncertainty=uncertainty)
        error += abs(mean - row.wind_adjusted_time)
        mean = calc_update(mean=mean, kalman_gain=kg, measurement=row.wind_adjusted_time)
        uncertainty = calc_drift(uncertainty=uncertainty, time=mean, drift=drift)

        # print(f"kalmain_gain: {kg}, mean: {mean}, uncertainty: {uncertainty}")

    return error

def calc_params():
    initial_params = [2, 2, 10, 0.01]
    bounds = ((0, None), (0.01, None), (0, None), (0, None)) 
    return minimize(kalman_filter, initial_params, bounds=bounds, options={"disp": False})

def calc_estimated_next_run(x0, previous_run):
    noise, uncertainty, mean, drift = extract_x0(x0)
    kg = calc_kalman_gain(noise=noise, uncertainty=uncertainty)
    mean = calc_update(mean=mean, kalman_gain=kg, measurement=previous_run)
    return mean



x = calc_params()
print(x.x)
previous_run = season.tail(1)['wind_adjusted_time']
result = calc_estimated_next_run(x.x, previous_run=previous_run)
result




[5.42019754 0.01000004 9.82999468 1.2396178 ]


13    9.919834
Name: wind_adjusted_time, dtype: float64

In [None]:
def final_estimation()

In [20]:
x

      fun: 0.995431568054828
 hess_inv: array([[ 1.09702215,  0.02857958, -0.16450414, -0.41707842],
       [ 0.02857958,  0.93947237,  0.09140534,  0.10658595],
       [-0.16450414,  0.09140534,  0.09498394,  0.2282192 ],
       [-0.41707842,  0.10658595,  0.2282192 ,  1.0987979 ]])
      jac: array([ 0.03370368, -0.03246617,  0.74462211,  0.19582748])
  message: 'Desired error not necessarily achieved due to precision loss.'
     nfev: 642
      nit: 4
     njev: 126
   status: 2
  success: False
        x: array([2.09334464, 0.99131048, 9.77892698, 0.08528734])

In [None]:

event_precedence_map = {
    "OW": 1,
    "DF": 2,
    ""
}