# Athletics 100m next race time predictor

## Imports

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4.element import ResultSet
import lxml
import json
from typing import Tuple, List, Dict, Any
import pickle
from scipy.optimize import minimize
import warnings
import tensorflow as tf
import datetime
import keras


%load_ext tensorboard
warnings.filterwarnings("ignore")

2022-10-15 17:25:19.326847: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-15 17:25:19.481606: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-15 17:25:19.481630: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-10-15 17:25:19.507224: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-15 17:25:20.359147: W tensorflow/stream_executor/platform/de

# Data Acquisition

## Scraper

Here we use the requests lib and BeautifulSoup with a html parser to extract the athletes profile id for later use.

In [2]:
def extract_hidden_table_rows(URL: str) -> ResultSet:
    """
    Returns all html table rows that contain the class name 'table-row--hover' as its a clickable element not initially displayed.
    This is to extract an athletes profile url.

    Params:
        URL: World athletics Men's 100m ranking page.

    Returns:
        results: All table rows containing data-athlete-url.
    """
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find_all("tr", class_="table-row--hover")
    return results

def extract_data_athlete_urls(results: ResultSet) -> List[str]:
    """
    Returns a list of all the extracted data-athlete-urls.

    Params:
        results: The parsed html results.

    Returns:
        data_athlete_urls: A list of data-athlete-urls.
    """
    data_athlete_urls = []
    for result in results:
        attrs = result.attrs
        data_athlete_url = attrs['data-athlete-url']
        data_athlete_urls.append(data_athlete_url)
    return data_athlete_urls

def extract_athlete_name_and_id(data_athlete_url: str) -> Tuple[int, str]:
    """
    Returns the athlete name and unique identifiction.

    Params:
        data_athlete_url: The athletes profile page ~ /athletes/united-states/trayvon-bromell-14519911.
    
    Returns:
        athlete_id, athlete_name: Athlete's name, id ~ 14633823, amaury-golitin.
    """
    url_split = data_athlete_url.split('/')[-1].split('-')
    athlete_id = int(url_split[-1])
    athlete_name = '-'.join(url_split[:-1])
    return athlete_id, athlete_name

def create_athlete_id_to_athlete(data_athlete_urls: List[str]) -> Dict[int, str]:
    """
    Returns a dictionary mapping an athletes id to their name.

    Params:
        data_athlete_urls: List of data-athlete-urls.

    Returns:
        athlete_id_to_name: dict of athlete id to name.
    """
    athlete_id_to_name = {}
    for data_athlete_url in data_athlete_urls:
        athlete_id, athlete_name = extract_athlete_name_and_id(data_athlete_url=data_athlete_url)
        athlete_id_to_name[athlete_id] = athlete_name
    return athlete_id_to_name

def print_athlete_id_to_name(athlete_id_to_name: Dict[int, str]) -> None:
    """
    Utility printing function to check scraping successful.

    Params:
        athlete_id_to_name: dict of athlete id to name

    Returns:
        None
    """
    for athlete_id, athlete_name in athlete_id_to_name.items():
        print(f"{athlete_id}: {athlete_name}")



URL = 'https://www.worldathletics.org/world-rankings/100m/men?regionType=world&page=1&rankDate=2022-10-04&limitByCountry=0'
results = extract_hidden_table_rows(URL=URL)
data_athlete_urls = extract_data_athlete_urls(results=results)
athlete_id_to_name = create_athlete_id_to_athlete(data_athlete_urls=data_athlete_urls)

print_athlete_id_to_name(athlete_id_to_name=athlete_id_to_name)





14519911: trayvon-bromell
14504382: fred-kerley
14425680: marvin-bracy
14541956: christian-coleman
14417763: akani-simbine
14453864: lamont-marcell-jacobs
14737998: oblique-seville
14201842: yohan-blake
14747153: ferdinand-omanyala
14366482: aaron-brown
14638971: ackeem-blake
14522622: reece-prescod
14432013: elijah-hall
14671546: abdul-hakim-sani-brown
14536762: noah-lyles
14466007: brandon-carnes
14476000: kyree-king
14414524: zharnel-hughes
14715873: micah-williams
14715661: yupun-abeykoon
14636943: arthur-cisse
14771648: jeremiah-azu
14629201: cravont-charleston
14465376: kendal-williams
14888403: favour-oghene-tejiri-ashe
14657140: felipe-bardi
14883897: letsile-tebogo
14375111: emmanuel-matadi
14535607: andre-de-grasse
14714099: raymond-ekevwo
14873268: benjamin-azamati
14702316: mouhamadou-fall
14334964: jimmy-vicaut
14654737: jerome-blake
14249856: michael-rodgers
14731617: jake-doran
14413736: henricho-bruintjies
14701305: chituru-ali
14469945: cejhae-greene
14417680: emile-er

## GraphQL Queries and Obtaining Initial Data

Note: The host name and api key are also dynamic so this may not work without updating the api key. To avoid having to run this again, I have just
saved the data to pickle file. It is possible with selenium to always grab these and update the query but the season is over and the data won't be changing so not neccessary for now. In a production environment, this would be crucial.

The code has been kept to demonstrate how this was achieved.


In [3]:
def create_headers() -> Dict[str, str]:
    """
    Returns a dict containing the required HTTP headers for the graphql request.
    """
    headers = {
        "Host": "cyxcgiyvwfcg3hxgiozfwhicee.appsync-api.eu-west-1.amazonaws.com",
        "Accept": "*/*",
        "Accept-Language": "en-GB,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://worldathletics.org/",
        "content-type": "application/json",
        "x-api-key": "da2-6e2ufs7vkffhdnowuanorycpia",
        "x-amz-user-agent": "aws-amplify/3.0.2",
        "Origin": "https://worldathletics.org",
        "Connection": "keep-alive",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "cross-site",
        "Sec-GPC": "1",
        "TE": "trailers"
    }
    return headers

def create_season_payload(athlete_id: int) -> Dict:
    """
    Create the payload for the athletes season graphql request.

    Params:
        athlete_id: Identifier for an athlete.
    
    Returns:
        payload: graphql request payload.
    """
    payload={
        "operationName":"GetSingleCompetitorResultsDiscipline",
        "variables":{
            "id":athlete_id,
            "resultsByYearOrderBy":"discipline",
        },
        "query":"query GetSingleCompetitorResultsDiscipline($id: Int, $resultsByYearOrderBy: String, $resultsByYear: Int) {\n  getSingleCompetitorResultsDiscipline(id: $id, resultsByYear: $resultsByYear, resultsByYearOrderBy: $resultsByYearOrderBy) {\n    parameters {\n      resultsByYear\n      resultsByYearOrderBy\n      __typename\n    }\n    activeYears\n    resultsByEvent {\n      indoor\n      disciplineCode\n      disciplineNameUrlSlug\n      typeNameUrlSlug\n      discipline\n      withWind\n      results {\n        date\n        competition\n        venue\n        country\n        category\n        race\n        place\n        mark\n        wind\n        notLegal\n        resultScore\n        remark\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n}\n"
    }
    return payload

def create_all_time_payload(athlete_id: int) -> Dict:
    """
    Create the payload for the athletes all-time graphql request.

    Params:
        athlete_id: Identifier for an athlete.
    
    Returns:
        payload: graphql request payload.
    """
    payload={
        "operationName": "GetSingleCompetitorAllTimePersonalTop10",
        "variables":{
            "allTimePersonalTop10Discipline": 10229630, #fixed id for 100m
            "id":athlete_id,
        },
        "query":"query GetSingleCompetitorAllTimePersonalTop10($id: Int, $urlSlug: String, $allTimePersonalTop10Discipline: Int) {\n  getSingleCompetitorAllTimePersonalTop10(id: $id, urlSlug: $urlSlug, allTimePersonalTop10Discipline: $allTimePersonalTop10Discipline) {\n    parameters {\n      allTimePersonalTop10Discipline\n      __typename\n    }\n    disciplines {\n      id\n      name\n      __typename\n    }\n    results {\n      discipline\n      date\n      competition\n      country\n      category\n      race\n      place\n      result\n      wind\n      drop\n      withWind\n      withDrop\n      score\n      records\n      remark\n      __typename\n    }\n    __typename\n  }\n}\n"
    }
    return payload

def create_basic_df_from_event_results(event_results: Dict) -> pd.DataFrame:
    df = pd.DataFrame.from_records(event_results)
    df = df.drop(['remark', '__typename'], axis=1)
    return df


def filter_season_results(data: Dict) -> pd.DataFrame:
    events = data['data']['getSingleCompetitorResultsDiscipline']['resultsByEvent']
    event_results = None
    for event in events:
        if event["disciplineCode"] == "100":
            event_results = event['results']

    df = create_basic_df_from_event_results(event_results)
    return df

def filter_all_time_results(data: Dict) -> pd.DataFrame:
    event_results = data["data"]["getSingleCompetitorAllTimePersonalTop10"]["results"]
    df = create_basic_df_from_event_results(event_results)
    return df

def init_athlete_to_results(athlete_id_to_name: Dict[int, str]) -> Dict[str, Dict[str, pd.DataFrame]]:
    athlete_to_results = {}
    for athlete_name in athlete_id_to_name.values():
        athlete_to_results[athlete_name] = dict.fromkeys(["season", "all_time"], None)
    return athlete_to_results

def make_graphql_request(url: str, headers: Dict, payload: Dict) -> Dict:
    response = requests.post(url=url, json=payload, headers=headers)
    data = response.json()
    return data

def get_results(athlete_id, payload_func, filter_func) -> pd.DataFrame:
    payload = payload_func(athlete_id=athlete_id)
    data = make_graphql_request(url=url, headers=headers, payload=payload)
    df = filter_func(data) 
    return df

def get_athlete_results(athlete_id: int) -> pd.DataFrame:
    season_df = get_results(athlete_id, create_season_payload, filter_season_results)
    all_time_df = get_results(athlete_id, create_all_time_payload, filter_all_time_results)

    return season_df, all_time_df

def create_initial_dataset(athlete_id_to_name: Dict[int, str], athlete_to_results: Dict[str, Dict[str, None]]) -> Dict[str, Dict[str, pd.DataFrame]]:
    for athlete_id, athlete_name in athlete_id_to_name.items():
        season_df, all_time_df = get_athlete_results(athlete_id)
        athlete_to_results[athlete_name]["season"] = season_df
        athlete_to_results[athlete_name]["all_time"] = all_time_df
    
    return athlete_to_results


url = "https://cyxcgiyvwfcg3hxgiozfwhicee.appsync-api.eu-west-1.amazonaws.com/graphql"
headers = create_headers()
athlete_to_results = init_athlete_to_results(athlete_id_to_name)
athlete_to_results = create_initial_dataset(athlete_id_to_name=athlete_id_to_name, athlete_to_results=athlete_to_results)


In [4]:
# pickle.dump(athlete_to_results, open('data2.pickle', 'wb'))

# Feature Engineering Part 1

## Data Cleaning and Sorting

In [5]:
athlete_to_results = pickle.load(open('data.pickle', 'rb'))
# https://www.worldathletics.org/world-ranking-rules/track-field-events

event_precedence_map = {
    "OW": 1,
    "DF": 2,
    "GW": 3,
    "GL": 4,
    "A": 4,
    "B": 5,
    "C": 6,
    "D": 7,
    "E": 8,
    "F": 9,
}

def wind_adjusted_time(time, wind):
    # https://www.tandfonline.com/doi/full/10.1080/17461391.2018.1480062
    a = 0.009459
    B = 0.0449
    b = 0.0042
    adjusted = time - (B*wind) + (a*time*wind) - (b*wind*wind) 
    return round(adjusted, 2)

def ensure_df_types(df: pd.DataFrame) -> pd.DataFrame:
    df['date'] = pd.to_datetime(df['date'])
    df['time'] = pd.to_numeric(df['time'], errors='coerce')
    df['place'] = pd.to_numeric(df['place'], errors='coerce')
    df['wind'] = pd.to_numeric(df['wind'], errors='coerce')
    df = df.dropna()
    df['time'] = df['time'].astype(float)
    df['place'] = df['place'].astype(float).astype(int)
    df['wind'] = df['wind'].astype(float)
    df['wind_adjusted_time'] = df.apply(lambda x: wind_adjusted_time(time=x['time'], wind=x['wind']), axis=1)
    df['category'] = df['category'].map(event_precedence_map)
    return df



def clean_season_df(season: pd.DataFrame) -> pd.DataFrame:
    season = season.rename(columns={'mark': 'time', 'resultScore': 'score'})
    # able to drop race as its information is contained within score
    season = season.drop(columns=['competition', 'country', 'venue', 'notLegal', 'race'], axis=1)
    season = season[season['score'] != 0]
    season = ensure_df_types(season)
    return season

# def clean_all_time_frame(all_time: pd.DataFrame) -> pd.DataFrame:
def clean_all_time_df(all_time: pd.DataFrame) -> pd.DataFrame:
    all_time = all_time.rename(columns={'result': 'time'})
    all_time = all_time.drop(columns=['competition', 'country', 'race', 'discipline', 'drop', 'withWind', 'withDrop', 'records'], axis=1)
    all_time = ensure_df_types(all_time)
    return all_time
    

# Kalman Filter

In [6]:

def calc_kalman_gain(noise, uncertainty):
    kalman_gain = noise/(noise + uncertainty)
    return kalman_gain

def calc_update(mean, kalman_gain, measurement):
    new_mean = mean + kalman_gain * (measurement - mean)
    return new_mean

def calc_drift(uncertainty, time, drift):
    uncertainty += time * drift
    return uncertainty

def extract_x0(x0):
    noise = x0[0]
    uncertainty = x0[1]
    mean = x0[2]
    drift = x0[3]
    return noise, uncertainty, mean, drift
    


def kalman_filter(x0, args):
    season = args
    noise, uncertainty, mean, drift = extract_x0(x0)
    error = 0
    # iterate
    for row in season.itertuples():
        kg = calc_kalman_gain(noise=noise, uncertainty=uncertainty)
        error += abs(mean - row.wind_adjusted_time)
        mean = calc_update(mean=mean, kalman_gain=kg, measurement=row.wind_adjusted_time)
        uncertainty = calc_drift(uncertainty=uncertainty, time=mean, drift=drift)

    return error

def calc_params(season: pd.DataFrame):
    initial_params = [2, 2, 10, 0.01]
    bounds = ((0, None), (0, None), (0, None), (0, None)) 
    return minimize(fun=kalman_filter, x0=initial_params, args=season, bounds=bounds, options={"disp": False})

def calc_estimated_next_run(x0, previous_run):
    noise, uncertainty, mean, drift = extract_x0(x0)
    kg = calc_kalman_gain(noise=noise, uncertainty=uncertainty)
    mean = calc_update(mean=mean, kalman_gain=kg, measurement=previous_run)
    return mean

def kalman_filter_prediction(season: pd.DataFrame):
    x = calc_params(season)
    previous_run = season.tail(1)['wind_adjusted_time']
    result = calc_estimated_next_run(x.x, previous_run=previous_run)
    return result


In [62]:
def create_athlete_final_dict(season: pd.DataFrame, all_time: pd.DataFrame, athlete: str) -> Dict:
    athlete_season = clean_season_df(season)
    athlete_all_time = clean_all_time_df(all_time)

    athlete_all_time = athlete_all_time.sort_values(by='time', ascending=True)
    athlete_season = athlete_season.sort_values(by='date', ascending=True)

    athlete_final_dict = {}
    athlete_final_dict['athlete'] = athlete

    # separate target now to avoid information 'leakage' into feature creation
    athlete_final_dict['next_run'] = float(athlete_season.tail(1)['wind_adjusted_time'])
    athlete_season = athlete_season[:-1].copy()

    athlete_final_dict['season_time_best'] = athlete_season['wind_adjusted_time'].min()
    athlete_final_dict['season_time_top_3_avg'] = athlete_season['wind_adjusted_time'].sort_values()[:3].mean()
    athlete_final_dict['season_time_most_recent_3_avg'] =athlete_season.tail(3)['wind_adjusted_time'].mean()

    ## This below steps utilises the kalman filter 
    athlete_final_dict['season_time_kfp'] = float(kalman_filter_prediction(athlete_season))

    athlete_final_dict['season_time_avg'] = athlete_season['wind_adjusted_time'].mean()
    athlete_final_dict['season_score_best'] = athlete_season['score'].max()
    athlete_final_dict['season_score_avg'] = athlete_season['score'].mean()
    athlete_final_dict['years_since_pb'] = 2022 - athlete_all_time['date'].iloc[0].year
    athlete_final_dict['all_time_time_best'] = athlete_all_time['wind_adjusted_time'].iloc[0]
    athlete_final_dict['all_time_score_best'] = athlete_all_time['score'].iloc[0]
    athlete_final_dict['all_time_time_top_3_avg'] = athlete_all_time.head(3)['wind_adjusted_time'].mean()

    return athlete_final_dict

def create_final_dataset(athlete_to_results: Dict) -> pd.DataFrame:
    final_athlete_dict_list = []
    failed_athletes = []
    for athlete, results in athlete_to_results.items():
        try:
            final_athlete_dict = create_athlete_final_dict(season=results['season'], all_time=results['all_time'], athlete=athlete)
            final_athlete_dict_list.append(final_athlete_dict)
        except Exception as e:
            print(e)
            failed_athletes.append(athlete)
            pass
    
    print(f"unsuccessful athlete data processed: {len(failed_athletes)}")

    final_df = pd.DataFrame(final_athlete_dict_list)
    return final_df

df = create_final_dataset(athlete_to_results=athlete_to_results)

unsuccessful athlete data processed: 0


# Model

from sklearn.model_selection import train_test_split

In [63]:
from sklearn.model_selection import train_test_split

athlete_index = df.pop('athlete') 
df = df.astype('float32')
y = df.pop('next_run')
X = df

In [111]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

scaler = MinMaxScaler()
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9)

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)


print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(90, 11) (10, 11) (90,) (10,)


In [113]:
def create_model():
  return tf.keras.models.Sequential([
    tf.keras.layers.Dense(11, input_dim=(11), activation='relu'),
    tf.keras.layers.Dense(11, activation='relu'),
    # tf.keras.layers.LayerNormalization(),
    tf.keras.layers.Dense(1)
  ])

model = create_model()
model.compile(optimizer='adam',
              loss='mae',
              metrics=['mae'])


log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='mae', patience=50)

model.fit(x=X_train, 
          y=y_train, 
          epochs=10000, 
          validation_split=0.2, 
          callbacks=[tensorboard_callback, early_stopping_callback])


Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
Epoch 60/10000
Epoch 61/10000
Epoch 62/10000
Epoch 63/10000
Epoch 64/10000
Epoch 65/10000
Epoch 66/10000
Epoch 67/10000
Epoc

<keras.callbacks.History at 0x7fd263551bb0>

In [114]:
model.evaluate(X_test, y_test)



[0.26093778014183044, 0.26093778014183044]

In [115]:

model.predict(X_test)



array([[ 9.645921],
       [10.361409],
       [10.100461],
       [10.146962],
       [10.74464 ],
       [10.552723],
       [10.37155 ],
       [ 9.981534],
       [10.323005],
       [10.774028]], dtype=float32)

In [101]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets



regr.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = regr.predict(X_test)

# The coefficients
print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean absolute error: %.2f" % mean_absolute_error(y_test, y_pred))

# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

Coefficients: 
 [ 7.8963953e-01 -5.6810373e-01  2.7756453e-01 -1.2067894e-01
  3.7670076e-01 -4.0882826e-04  1.6521513e-03  1.8400429e-02
 -1.3131101e+00 -2.6454628e-03  1.0316370e+00]
Mean absolute error: 0.13
Coefficient of determination: 0.02


In [91]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, shuffle=True)

def compare_cols_with_regression(col):
    regr = linear_model.LinearRegression()
    regr.fit(X_train[col].values.reshape(-1, 1), y_train)
    y_pred = regr.predict(X_test[col].values.reshape(-1, 1))
    print(f"col:{col} Mean absolute error: {mean_absolute_error(y_test, y_pred)}")

for col in X_train.columns:
    compare_cols_with_regression(col=col)

col:season_time_best Mean absolute error: 0.0747532844543457
col:season_time_top_3_avg Mean absolute error: 0.07520818710327148
col:season_time_most_recent_3_avg Mean absolute error: 0.0693659782409668
col:season_time_kfp Mean absolute error: 0.09777774661779404
col:season_time_avg Mean absolute error: 0.08429326862096786
col:season_score_best Mean absolute error: 0.08030281215906143
col:season_score_avg Mean absolute error: 0.0800803154706955
col:years_since_pb Mean absolute error: 0.11100683361291885
col:all_time_time_best Mean absolute error: 0.09207334369421005
col:all_time_score_best Mean absolute error: 0.10538091510534286
col:all_time_time_top_3_avg Mean absolute error: 0.10703583061695099


In [34]:
X_train

Unnamed: 0,season_time_best,season_time_top_3_avg,season_time_most_recent_3_avg,season_time_kfp,season_time_avg,season_score_best,season_score_avg,years_since_pb,all_time_time_best,all_time_score_best,all_time_time_top_3_avg
28,10.07,10.123333,10.220000,10.175022,10.182500,1189.0,1144.375000,1.0,9.89,1245.0,9.896667
80,10.21,10.230000,10.336667,10.389869,10.399500,1145.0,1079.449951,0.0,10.23,1145.0,10.230000
92,10.06,10.116667,10.356667,10.462999,10.357500,1208.0,1093.875000,0.0,10.09,1161.0,10.166667
79,10.07,10.116667,10.206667,10.240000,10.610769,1199.0,1048.923096,0.0,10.07,1199.0,10.130000
23,9.98,10.016666,10.146667,10.170000,10.135000,1213.0,1167.111084,4.0,10.04,1210.0,10.040000
...,...,...,...,...,...,...,...,...,...,...,...
81,10.20,10.223333,10.260000,10.449679,10.426316,1142.0,1060.052612,0.0,10.18,1148.0,10.196667
89,10.17,10.246667,10.283334,10.338379,10.344000,1165.0,1105.800049,0.0,10.17,1165.0,10.270000
29,10.08,10.106667,10.153334,10.120488,10.206000,1193.0,1145.666626,3.0,10.03,1220.0,10.083333
25,10.02,10.073334,10.263333,10.270000,10.267895,1182.0,1120.789429,0.0,10.09,1182.0,10.113334


In [None]:
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
estimator = KerasRegressor(model=model, epochs=1000, batch_size=1, verbose=0)
kfold = KFold(n_splits=4)
results = cross_val_score(estimator, X_train, y_train, cv=kfold, scoring='neg_mean_absolute_error')
print("Baseline: %.2f (%.2f) MAE" % (results.mean(), results.std()))

INFO:tensorflow:Assets written to: ram:///tmp/tmp1g4ppgxm/assets
INFO:tensorflow:Assets written to: ram:///tmp/tmpvq_89x9a/assets
INFO:tensorflow:Assets written to: ram:///tmp/tmp8wuwqpzl/assets
INFO:tensorflow:Assets written to: ram:///tmp/tmpdssu83g9/assets
Baseline: -0.32 (0.14) MAE


In [None]:
model.summary()

Model: "sequential_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_88 (Dense)            (None, 11)                132       
                                                                 
 dense_89 (Dense)            (None, 1)                 12        
                                                                 
Total params: 144
Trainable params: 144
Non-trainable params: 0
_________________________________________________________________


In [None]:
keras.utils.plot_model(model, "model.png")

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [None]:
%tensorboard --logdir logs/fit

In [None]:
X_test

Unnamed: 0,season_time_best,season_time_top_3_avg,season_time_most_recent_3_avg,season_time_kfp,season_time_avg,season_score_best,season_score_avg,years_since_pb,all_time_time_best,all_time_score_best,all_time_time_top_3_avg
9,10.01,10.02,10.116667,10.142288,10.124118,1221.0,1156.294067,6.0,10.04,1220.0,10.053333
87,10.16,10.19,10.263333,10.2,10.344666,1179.0,1103.333374,0.0,10.16,1179.0,10.186666
6,9.87,9.896667,9.92,10.067639,10.061429,1255.0,1181.428589,0.0,9.87,1255.0,9.896667
28,10.07,10.123333,10.22,10.175022,10.1825,1189.0,1144.375,1.0,9.89,1245.0,9.896667
86,10.12,10.133333,10.153334,10.169958,10.172857,1189.0,1163.0,1.0,10.01,1227.0,10.086667
65,10.13,10.15,10.196667,10.22,10.227143,1165.0,1134.571411,1.0,10.11,1169.0,10.143333
45,10.03,10.09,10.306666,10.325055,10.236667,1197.0,1139.333374,0.0,10.03,1196.0,10.153334
61,10.05,10.07,10.146667,10.130004,10.1425,1213.0,1165.0,0.0,10.05,1213.0,10.07
78,10.14,10.166667,10.2,10.334027,10.305294,1176.0,1107.0,5.0,10.11,1182.0,10.116667
75,10.06,10.106667,10.13,10.27476,10.256,1193.0,1122.533325,0.0,10.06,1193.0,10.113334


In [None]:
model.predict(X_test)



array([[13.146487 ],
       [10.575032 ],
       [10.000123 ],
       [ 9.998298 ],
       [10.754752 ],
       [10.801645 ],
       [10.450546 ],
       [10.665063 ],
       [11.228992 ],
       [10.3666725],
       [10.264056 ],
       [ 9.438043 ],
       [10.173447 ],
       [10.306195 ],
       [ 9.937534 ],
       [10.281022 ],
       [ 8.052893 ],
       [10.254697 ],
       [ 9.949453 ],
       [ 9.816809 ]], dtype=float32)

In [None]:
model.evaluate(X_test)



[0.0, 0.0]