In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4.element import ResultSet
import lxml
import json
from typing import Tuple, List, Dict, Any
import pickle

# Data Acquisition

## Scraper

Here we use the requests lib and BeautifulSoup with a html parser to extract the athletes profile id for later use.

In [3]:
def extract_hidden_table_rows(URL: str) -> ResultSet:
    """
    Returns all html table rows that contain the class name 'table-row--hover' as its a clickable element not initially displayed.
    This is to extract an athletes profile url.

    Params:
        URL: World athletics Men's 100m ranking page.

    Returns:
        results: All table rows containing data-athlete-url.
    """
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find_all("tr", class_="table-row--hover")
    return results

def extract_data_athlete_urls(results: ResultSet) -> List[str]:
    """
    Returns a list of all the extracted data-athlete-urls.

    Params:
        results: The parsed html results.

    Returns:
        data_athlete_urls: A list of data-athlete-urls.
    """
    data_athlete_urls = []
    for result in results:
        attrs = result.attrs
        data_athlete_url = attrs['data-athlete-url']
        data_athlete_urls.append(data_athlete_url)
    return data_athlete_urls

def extract_athlete_name_and_id(data_athlete_url: str) -> Tuple[int, str]:
    """
    Returns the athlete name and unique identifiction.

    Params:
        data_athlete_url: The athletes profile page ~ /athletes/united-states/trayvon-bromell-14519911.
    
    Returns:
        athlete_id, athlete_name: Athlete's name, id ~ 14633823, amaury-golitin.
    """
    url_split = data_athlete_url.split('/')[-1].split('-')
    athlete_id = int(url_split[-1])
    athlete_name = '-'.join(url_split[:-1])
    return athlete_id, athlete_name

def create_athlete_id_to_athlete(data_athlete_urls: List[str]) -> Dict[int, str]:
    """
    Returns a dictionary mapping an athletes id to their name.

    Params:
        data_athlete_urls: List of data-athlete-urls.

    Returns:
        athlete_id_to_name: dict of athlete id to name.
    """
    athlete_id_to_name = {}
    for data_athlete_url in data_athlete_urls:
        athlete_id, athlete_name = extract_athlete_name_and_id(data_athlete_url=data_athlete_url)
        athlete_id_to_name[athlete_id] = athlete_name
    return athlete_id_to_name

def print_athlete_id_to_name(athlete_id_to_name: Dict[int, str]) -> None:
    """
    Utility printing function to check scraping successful.

    Params:
        athlete_id_to_name: dict of athlete id to name

    Returns:
        None
    """
    for athlete_id, athlete_name in athlete_id_to_name.items():
        print(f"{athlete_id}: {athlete_name}")



URL = 'https://www.worldathletics.org/world-rankings/100m/men?regionType=world&page=1&rankDate=2022-10-04&limitByCountry=0'
results = extract_hidden_table_rows(URL=URL)
data_athlete_urls = extract_data_athlete_urls(results=results)
athlete_id_to_name = create_athlete_id_to_athlete(data_athlete_urls=data_athlete_urls)

print_athlete_id_to_name(athlete_id_to_name=athlete_id_to_name)





14519911: trayvon-bromell
14504382: fred-kerley
14425680: marvin-bracy
14541956: christian-coleman
14417763: akani-simbine
14453864: lamont-marcell-jacobs
14737998: oblique-seville
14201842: yohan-blake
14747153: ferdinand-omanyala
14366482: aaron-brown
14638971: ackeem-blake
14522622: reece-prescod
14432013: elijah-hall
14671546: abdul-hakim-sani-brown
14536762: noah-lyles
14466007: brandon-carnes
14476000: kyree-king
14414524: zharnel-hughes
14715873: micah-williams
14715661: yupun-abeykoon
14636943: arthur-cisse
14771648: jeremiah-azu
14629201: cravont-charleston
14465376: kendal-williams
14888403: favour-oghene-tejiri-ashe
14657140: felipe-bardi
14883897: letsile-tebogo
14375111: emmanuel-matadi
14535607: andre-de-grasse
14714099: raymond-ekevwo
14873268: benjamin-azamati
14702316: mouhamadou-fall
14334964: jimmy-vicaut
14654737: jerome-blake
14249856: michael-rodgers
14731617: jake-doran
14413736: henricho-bruintjies
14701305: chituru-ali
14469945: cejhae-greene
14417680: emile-er

## GraphQL Queries



In [65]:
def create_headers() -> Dict[str, str]:
    """
    Returns a dict containing the required HTTP headers for the graphql request.
    """
    headers = {
        "Host": "x2iza5bwnneavfz3tdx4osjp6e.appsync-api.eu-west-1.amazonaws.com",
        "Accept": "*/*",
        "Accept-Language": "en-GB,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://worldathletics.org/",
        "content-type": "application/json",
        "x-api-key": "da2-i7akoctidrg5xi7atduiyjebym",
        "x-amz-user-agent": "aws-amplify/3.0.2",
        "Origin": "https://worldathletics.org",
        "Connection": "keep-alive",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "cross-site",
        "Sec-GPC": "1",
        "TE": "trailers"
    }
    return headers

def create_season_payload(athlete_id: int) -> Dict:
    """
    Create the payload for the athletes season graphql request.

    Params:
        athlete_id: Identifier for an athlete.
    
    Returns:
        payload: graphql request payload.
    """
    payload={
        "operationName":"GetSingleCompetitorResultsDiscipline",
        "variables":{
            "resultsByYearOrderBy":"discipline",
            "id":athlete_id,
            "resultsByYear": "2021"
        },
        "query":"query GetSingleCompetitorResultsDiscipline($id: Int, $resultsByYearOrderBy: String, $resultsByYear: Int) {\n  getSingleCompetitorResultsDiscipline(id: $id, resultsByYear: $resultsByYear, resultsByYearOrderBy: $resultsByYearOrderBy) {\n    parameters {\n      resultsByYear\n      resultsByYearOrderBy\n      __typename\n    }\n    activeYears\n    resultsByEvent {\n      indoor\n      disciplineCode\n      disciplineNameUrlSlug\n      typeNameUrlSlug\n      discipline\n      withWind\n      results {\n        date\n        competition\n        venue\n        country\n        category\n        race\n        place\n        mark\n        wind\n        notLegal\n        resultScore\n        remark\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n}\n"
    }
    return payload

def create_all_time_payload(athlete_id: int) -> Dict:
    """
    Create the payload for the athletes all-time graphql request.

    Params:
        athlete_id: Identifier for an athlete.
    
    Returns:
        payload: graphql request payload.
    """
    payload={
        "operationName": "GetSingleCompetitorAllTimePersonalTop10",
        "variables":{
            "allTimePersonalTop10Discipline": 10229630, #fixed id for 100m
            "id":athlete_id,
        },
        "query":"query GetSingleCompetitorAllTimePersonalTop10($id: Int, $urlSlug: String, $allTimePersonalTop10Discipline: Int) {\n  getSingleCompetitorAllTimePersonalTop10(id: $id, urlSlug: $urlSlug, allTimePersonalTop10Discipline: $allTimePersonalTop10Discipline) {\n    parameters {\n      allTimePersonalTop10Discipline\n      __typename\n    }\n    disciplines {\n      id\n      name\n      __typename\n    }\n    results {\n      discipline\n      date\n      competition\n      country\n      category\n      race\n      place\n      result\n      wind\n      drop\n      withWind\n      withDrop\n      score\n      records\n      remark\n      __typename\n    }\n    __typename\n  }\n}\n"
    }
    return payload

def create_basic_df_from_event_results(event_results: Dict) -> pd.DataFrame:
    df = pd.DataFrame.from_records(event_results)
    df = df.drop(['remark', '__typename'], axis=1)
    return df


def filter_season_results(data: Dict) -> pd.DataFrame:
    events = data['data']['getSingleCompetitorResultsDiscipline']['resultsByEvent']
    event_results = None
    for event in events:
        if event["disciplineCode"] == "100":
            event_results = event['results']

    df = create_basic_df_from_event_results(event_results)
    return df

def filter_all_time_results(data: Dict) -> pd.DataFrame:
    event_results = data["data"]["getSingleCompetitorAllTimePersonalTop10"]["results"]
    df = create_basic_df_from_event_results(event_results)
    return df

def init_athlete_to_results(athlete_id_to_name: Dict[int, str]) -> Dict[str, Dict[str, pd.DataFrame]]:
    athlete_to_results = {}
    for athlete_name in athlete_id_to_name.values():
        athlete_to_results[athlete_name] = dict.fromkeys(["season", "all_time"], None)
    return athlete_to_results

def make_graphql_request(url: str, headers: Dict, payload: Dict) -> Dict:
    response = requests.post(url=url, json=payload, headers=headers)
    data = response.json()
    return data

def get_results(athlete_id, payload_func, filter_func) -> pd.DataFrame:
    payload = payload_func(athlete_id=athlete_id)
    data = make_graphql_request(url=url, headers=headers, payload=payload)
    df = filter_func(data) 
    return df

def get_athlete_results(athlete_id: int) -> pd.DataFrame:
    season_df = get_results(athlete_id, create_season_payload, filter_season_results)
    all_time_df = get_results(athlete_id, create_all_time_payload, filter_all_time_results)

    return season_df, all_time_df

def create_initial_dataset(athlete_id_to_name: Dict[int, str], athlete_to_results: Dict[str, Dict[str, None]]) -> Dict[str, Dict[str, pd.DataFrame]]:
    for athlete_id, athlete_name in athlete_id_to_name.items():
        season_df, all_time_df = get_athlete_results(athlete_id)
        athlete_to_results[athlete_name]["season"] = season_df
        athlete_to_results[athlete_name]["all_time"] = all_time_df
    
    return athlete_to_results


url = "https://x2iza5bwnneavfz3tdx4osjp6e.appsync-api.eu-west-1.amazonaws.com/graphql"
headers = create_headers()
athlete_to_results = init_athlete_to_results(athlete_id_to_name)
athlete_to_results = create_initial_dataset(athlete_id_to_name=athlete_id_to_name, athlete_to_results=athlete_to_results)


TypeError: 'NoneType' object is not subscriptable

In [5]:
pickle.dump(athlete_to_results, open('data.pickle', 'wb'))

In [6]:
season = pickle.load(open('data.pickle', 'rb'))
season

{'trayvon-bromell': {'season':            date                                        competition  \
  0   30 APR 2022  UNF Invitational, Jax Track at Hodges Stadium,...   
  1   12 MAY 2022  Puerto Rico International Athletics Classic, E...   
  2   12 MAY 2022  Puerto Rico International Athletics Classic, E...   
  3   21 MAY 2022  Diamond League Meeting, Alexander Stadium, Bir...   
  4   28 MAY 2022  Prefontaine Classic, Hayward Field, Eugene, OR...   
  5   23 JUN 2022  Toyota USATF Outdoor Championships, Hayward Fi...   
  6   24 JUN 2022  Toyota USATF Outdoor Championships, Hayward Fi...   
  7   24 JUN 2022  Toyota USATF Outdoor Championships, Hayward Fi...   
  8   15 JUL 2022  World Athletics Championships, Oregon 2022, Ha...   
  9   16 JUL 2022  World Athletics Championships, Oregon 2022, Ha...   
  10  16 JUL 2022  World Athletics Championships, Oregon 2022, Ha...   
  11  06 AUG 2022  Kamila Skolimowska Memorial, Stadion Śląski, C...   
  12  08 AUG 2022  Gyulai István Me

In [7]:
test['trayvon-bromell']['season']

Unnamed: 0,date,competition,venue,country,category,race,place,mark,wind,notLegal,resultScore
0,30 APR 2022,"UNF Invitational, Jax Track at Hodges Stadium,...","Jax Track at Hodges Stadium, Jacksonville, FL ...",USA,F,F1,1.0,9.75,2.1,True,1282
1,12 MAY 2022,"Puerto Rico International Athletics Classic, E...","Estadio Francisco Montaner, Ponce (PUR)",PUR,B,H2,1.0,10.06,0.6,False,1186
2,12 MAY 2022,"Puerto Rico International Athletics Classic, E...","Estadio Francisco Montaner, Ponce (PUR)",PUR,B,F,1.0,9.92,-0.2,False,1235
3,21 MAY 2022,"Diamond League Meeting, Alexander Stadium, Bir...","Alexander Stadium, Birmingham (GBR)",GBR,GW,F,,DQ,-0.2,False,0
4,28 MAY 2022,"Prefontaine Classic, Hayward Field, Eugene, OR...","Hayward Field, Eugene, OR (USA)",USA,GW,F,1.0,9.93,-0.2,False,1232
5,23 JUN 2022,"Toyota USATF Outdoor Championships, Hayward Fi...","Hayward Field, Eugene, OR (USA)",USA,B,H1,1.0,10.10,-0.4,False,1174
6,24 JUN 2022,"Toyota USATF Outdoor Championships, Hayward Fi...","Hayward Field, Eugene, OR (USA)",USA,B,SF2,1.0,9.81,1.5,False,1273
7,24 JUN 2022,"Toyota USATF Outdoor Championships, Hayward Fi...","Hayward Field, Eugene, OR (USA)",USA,B,F,3.0,9.88,1.8,False,1248
8,15 JUL 2022,"World Athletics Championships, Oregon 2022, Ha...","Hayward Field, Eugene, OR (USA)",USA,OW,H3,1.0,9.89,0.6,False,1245
9,16 JUL 2022,"World Athletics Championships, Oregon 2022, Ha...","Hayward Field, Eugene, OR (USA)",USA,OW,SF1,2.0,9.97,0.3,False,1217


In [8]:
athlete_to_results['trayvon-bromell']['all_time']

Unnamed: 0,discipline,date,competition,country,category,race,place,result,wind,drop,withWind,withDrop,score,records
0,100 Metres,18 SEP 2021,"Kip Keino Classic, Moi International Sports Ce...",KEN,A,F,1.0,9.76,1.2,,True,False,1291,[]
1,100 Metres,05 JUN 2021,"NACAC New Life Invitational, Ansin Sports Comp...",USA,B,F2,1.0,9.77,1.5,,True,False,1287,[]
2,100 Metres,20 JUN 2021,"U.S. Olympic Trials, Hayward Field, Eugene, OR",USA,B,F,1.0,9.8,0.8,,True,False,1276,[]
3,100 Metres,24 JUN 2022,"Toyota USATF Outdoor Championships, Hayward Fi...",USA,B,SF2,1.0,9.81,1.5,,True,False,1273,[]
4,100 Metres,25 JUN 2015,"Eugene USA Ch., Hayward Field, Eugene, OR",USA,B,H4,1.0,9.84,1.3,,True,False,1262,[]
5,100 Metres,03 JUL 2016,"Eugene U.S. Olympic Trials, Hayward Field, Eug...",USA,B,F,2.0,9.84,1.6,,True,False,1262,[]
6,100 Metres,03 JUL 2016,"Eugene U.S. Olympic Trials, Hayward Field, Eug...",USA,B,SF2,1.0,9.86,2.0,,True,False,1255,[]
7,100 Metres,30 APR 2021,"North Florida Invitational, Jax Track at Hodge...",USA,F,F,1.0,9.88,1.5,,True,False,1248,[]
8,100 Metres,24 JUN 2022,"Toyota USATF Outdoor Championships, Hayward Fi...",USA,B,F,3.0,9.88,1.8,,True,False,1248,[]
9,100 Metres,16 JUL 2022,"World Athletics Championships, Oregon 2022, Ha...",USA,OW,F,3.0,9.88,-0.1,,True,False,1248,[]


In [5]:
def wind_adjusted_time(time, wind):
    # https://www.tandfonline.com/doi/full/10.1080/17461391.2018.1480062
    a = 0.009459
    B = 0.0449
    b = 0.0042
    adjusted = time - (B*wind) + (a*time*wind) - (b*wind*wind) 
    return round(adjusted, 2)

# Feature Engineering

In [7]:
season = season['trayvon-bromell']['season']
season['date'] = pd.to_datetime(season['date'])
season = season.rename(columns={'mark': 'time', 'resultScore': 'score'})
season = season.drop(columns=['competition', 'country', 'venue', 'notLegal'], axis=1)
season = season[season['score'] != 0]
season['time'] = season['time'].astype(float)
season['place'] = season['place'].astype(float).astype(int)
season['wind'] = season['wind'].astype(float)
season['wind_adjusted_time'] = season.apply(lambda x: wind_adjusted_time(time=x['time'], wind=x['wind']), axis=1)


season

Unnamed: 0,date,category,race,place,time,wind,score,wind_adjusted_time
0,2022-04-30,F,F1,1,9.75,2.1,1282,9.83
1,2022-05-12,B,H2,1,10.06,0.6,1186,10.09
2,2022-05-12,B,F,1,9.92,-0.2,1235,9.91
4,2022-05-28,GW,F,1,9.93,-0.2,1232,9.92
5,2022-06-23,B,H1,1,10.1,-0.4,1174,10.08
6,2022-06-24,B,SF2,1,9.81,1.5,1273,9.87
7,2022-06-24,B,F,3,9.88,1.8,1248,9.95
8,2022-07-15,OW,H3,1,9.89,0.6,1245,9.92
9,2022-07-16,OW,SF1,2,9.97,0.3,1217,9.98
10,2022-07-16,OW,F,3,9.88,-0.1,1248,9.88


In [44]:
from scipy.optimize import minimize
def calc_kalman_gain(noise, uncertainty):
    kalman_gain = noise/(noise + uncertainty)
    return kalman_gain

def calc_update(mean, kalman_gain, measurement):
    new_mean = mean + kalman_gain * (measurement - mean)
    return new_mean

def calc_drift(uncertainty, time, drift):
    uncertainty += time * drift
    return uncertainty


def fun(x0):
    noise = x0[0]
    uncertainty = x0[1]
    mean = x0[2]
    drift = x0[3]
    # noise, uncertainty, mean, drift = args
    
    error = 0
    # iterate
    for row in season.itertuples():
        kg = calc_kalman_gain(noise=noise, uncertainty=uncertainty)
        mean = calc_update(mean=mean, kalman_gain=kg, measurement=row.time)
        uncertainty = calc_drift(uncertainty=uncertainty, time=mean, drift=drift)
        # error = uncertainty

        print(f"kalmain_gain: {kg}, mean: {mean}, uncertainty: {uncertainty}")



    return uncertainty

initial_params = [2, 1, 10, 0.01]
x = minimize(fun, initial_params)

result = fun(x.x)

print(result)

kalmain_gain: 0.6666666666666666, mean: 9.833333333333334, uncertainty: 1.0983333333333334
kalmain_gain: 0.6455083378160301, mean: 9.979648556571634, uncertainty: 1.1981298188990497
kalmain_gain: 0.625365483346294, mean: 9.942346408160306, uncertainty: 1.2975532829806526
kalmain_gain: 0.6065102906213553, mean: 9.934858184558868, uncertainty: 1.3969018648262412
kalmain_gain: 0.5887717925293389, mean: 10.032089027257692, uncertainty: 1.4972227550988182
kalmain_gain: 0.5718823592475132, mean: 9.905080230386579, uncertainty: 1.596273557402684
kalmain_gain: 0.5561312197408165, mean: 9.891132331270311, uncertainty: 1.695184880715387
kalmain_gain: 0.5412449077819349, mean: 9.890519462736334, uncertainty: 1.7940900753427504
kalmain_gain: 0.5271356136212249, mean: 9.932416484517761, uncertainty: 1.8934142401879281
kalmain_gain: 0.5136879552542716, mean: 9.905490767764215, uncertainty: 1.9924691478655703
kalmain_gain: 0.5009431321640213, mean: 9.927787361970625, uncertainty: 2.0917470214852765
k

In [None]:

event_precedence_map = {
    "OW": 1,
    "DF": 2,
    ""
}