# Predicting running performance using personal data
# Data preprocessing

In [None]:
import pickle
import numpy as np
import pandas as pd
from datetime import datetime
from random import sample
from itertools import combinations
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the raw dataset from the local pickle file.
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load "data_raw.pckl" when it comes from a trusted source.
# The context manager closes the file even if unpickling fails.
with open("data_raw.pckl", "rb") as f:
    data = pickle.load(f)
data

Unnamed: 0,ID,Birthdate,Height,Weight,Gender,Distance,Time,Date
0,0,1997,-,-,M,800m,1'45''94,19/06/2021
1,0,1997,-,-,M,800m,1'45''03,17/07/2019
2,0,1997,-,-,M,800m,1'45''35,22/08/2018
3,0,1997,-,-,M,800m,1'45''76,05/05/2017
4,0,1997,-,-,M,800m,1'45''05,04/06/2016
...,...,...,...,...,...,...,...,...
11103,106948,1957,-,-,M,100 Km Route,19h14'50'',24/09/2016
11104,106948,1957,-,-,M,100 Km Route,18h38'43'',26/09/2015
11105,106948,1957,-,-,M,24 Heures,109km824m,04/07/2021
11106,106948,1957,-,-,M,24 Heures,103km420m,10/06/2018


In [None]:
# Keep only datapoints with non-null height and weight.
# Missing values are written as "-", possibly padded with spaces, so both
# columns are stripped before the comparison and treated the same way.
height_text = data["Height"].astype(str).str.strip()
weight_text = data["Weight"].astype(str).str.strip()
data = data[(height_text != "-") & (weight_text != "-")].copy()

# Extract the integer in front of the unit ("cm" / "kg") after removing inner
# spaces. Values that do not follow that pattern become NaN and are removed by
# the dropna() below instead of being silently truncated.
# The parse is vectorised, so no per-row progress printout is needed.
data["Height"] = pd.to_numeric(
    data["Height"].astype(str)
    .str.replace(" ", "", regex=False)
    .str.extract(r"^(\d+)cm", expand=False),
    errors="coerce",
)
data["Weight"] = pd.to_numeric(
    data["Weight"].astype(str)
    .str.replace(" ", "", regex=False)
    .str.extract(r"^(\d+)kg", expand=False),
    errors="coerce",
)

data = data.dropna().reset_index(drop=True)

# No NaN is left at this point, so the columns can be stored as integers.
data["Height"] = data["Height"].astype(int)
data["Weight"] = data["Weight"].astype(int)

10% done
20% done
30% done
40% done
50% done
60% done
70% done
80% done
90% done
100% done


In [None]:
# Encode gender as a binary indicator: 1 where the value is "M", 0 otherwise
data["Gender"] = (data["Gender"] == "M").astype(int)

In [None]:
# Convert time to seconds
def _time_to_seconds(raw):
    """Convert a raw result string to a duration in seconds.

    Two layouts are parsed:
      * hours, minutes, seconds, e.g. "19h14'50''"
      * minutes, seconds, fraction, e.g. "1'45''94" (the digits after the
        double apostrophe are the fractional part of the seconds)

    Returns NaN for anything that does not parse, e.g. a distance such as
    "109km824m" or a non-string value.
    """
    try:
        hour_mark = raw.find("h")
        minute_mark = raw.find("'")
        if hour_mark != -1:
            hours = int(raw[:hour_mark])
            minutes = int(raw[hour_mark + 1:minute_mark])
        else:
            hours = 0
            minutes = int(raw[:minute_mark])
        # "45''94" -> "45.94"; a trailing "''" just leaves "50." which is valid
        seconds = float(raw[minute_mark + 1:].replace("''", "."))
    except (AttributeError, TypeError, ValueError):
        # Only parsing failures are swallowed; anything else should surface.
        return np.nan
    return hours * 3600 + minutes * 60 + seconds


# Build the whole column at once instead of writing element by element into
# the frame's underlying array.
data["Time"] = [_time_to_seconds(raw) for raw in data["Time"]]

# Rows whose time could not be parsed are discarded here.
data = data.dropna().reset_index(drop=True)

10% done
20% done
30% done
40% done
50% done
60% done
70% done
80% done
90% done
100% done


In [None]:
# Harmonize distance format
# Lookup table: event label -> distance in km. Because labels and numbers are
# mixed, numpy stores every entry of this array as a string.
harm_df = np.array([["800m", 0.8], ["800m - Salle", 0.8], ["1 000m", 1], 
                    ["1 000m - Salle", 1], ["1 500m", 1.5], ["1 500m - Salle", 1.5],
                    ["3 000m", 3], ["3 000m - Salle", 3], ["5 000m", 5], 
                    ["5 000m - Salle", 5], ["5 Km Route", 5], ["10 000m", 10],
                    ["10 Km Route", 10], ["15 Km Route", 15], ["20 000m", 21.097],
                    ["20 Km Route", 21.097], ["1/2 Marathon", 21.097],
                    ["Marathon", 42.195], ["100 Km Route", 100]])

# Same table as a dict with real float values, for direct label lookup.
distance_km = {str(label): float(km) for label, km in harm_df}

# Fail with an explicit message rather than an opaque IndexError when a label
# is missing from the table.
unknown_labels = data.loc[~data["Distance"].isin(list(distance_km)), "Distance"].unique()
if len(unknown_labels) > 0:
    raise ValueError("Unsupported distance label(s): " + ", ".join(map(str, unknown_labels)))

# 20 km results are mapped to the half-marathon distance, so their times are
# scaled linearly from 20 km to 21.097 km. This must happen before the labels
# are replaced by numbers.
is_20km = data["Distance"].isin(["20 Km Route", "20 000m"])
data.loc[is_20km, "Time"] = data.loc[is_20km, "Time"] / 20 * 21.097

data["Distance"] = data["Distance"].map(distance_km)

10% done
20% done
30% done
40% done
50% done
60% done
70% done
80% done
90% done
100% done


In [None]:
# Date formatting
# The raw dates are written day first (e.g. "19/06/2021"). Without dayfirst,
# ambiguous dates such as "04/06/2016" are read as month/day/year.
data["Date"] = pd.to_datetime(data["Date"], dayfirst=True)

In [None]:
# Create age column (performance year minus birth year)
def _birth_year(raw):
    """Return the birth year encoded in a raw Birthdate value, or NaN.

    The value is first read as a plain year (e.g. 1997). If that fails it is
    parsed as a date with pandas and the year of that date is used. NaN is
    returned when neither interpretation works.
    """
    try:
        return int(raw)
    except (TypeError, ValueError, OverflowError):
        pass
    try:
        return pd.to_datetime(raw).year
    except (AttributeError, TypeError, ValueError, OverflowError):
        return np.nan


# Requires data["Date"] to already be a datetime column.
data["Age"] = data["Date"].dt.year - data["Birthdate"].map(_birth_year)

# Keep only datapoints for which Age >= 18
# (an unknown age is NaN, compares False, and is therefore removed as well)
data = data[data["Age"] >= 18]
data = data.dropna().reset_index(drop=True)
data["Age"] = data["Age"].astype(int)
data["Distance"] = data["Distance"].astype(float)

10% done
20% done
30% done
40% done
50% done
60% done
70% done
80% done
90% done
100% done


In [None]:
# Identify best times and explanatory performances (no older than 3 years)
#
# For every athlete and every distance they ran, the personal best (PB) on that
# distance is the target "y". Each explanatory row is the athlete's best time on
# one *other* distance, taken from performances dated before the PB and run at
# an age of at least (PB age - 3). All rows describing the same target PB share
# one index label k.
pbs_columns = ["y", "PredDist", "AgePred", "PerfTime", "PerfDist", "DeltaAgePerf", "Height", "Weight", "Gender"]
pbs_rows = []   # one list of values per output row
pbs_keys = []   # index label k of each output row
k = 0
ids = data.ID.unique()
n_ids = len(ids)

for i, athlete_id in enumerate(ids):
    print(str(i + 1) + "/" + str(n_ids))
    df = data[data.ID == athlete_id].sort_values(by="Date").reset_index(drop=True)
    height = df.Height.values[0]
    weight = df.Weight.values[0]
    gender = df.Gender.values[0]

    for d in df.Distance.unique():
        # Look for the PB among the performances on distance d only, so that an
        # identical time on another distance cannot be picked by mistake.
        at_distance = df[df.Distance == d]
        pb = at_distance[at_distance.Time == at_distance.Time.min()]
        time_pb = pb.Time.values[0]
        age_pb = pb.Age.values[0]
        past_perfs = df[(df.Date < pb.Date.values[0]) & (df.Distance != d) & (df.Age >= age_pb - 3)]
        # k advances for every (athlete, distance) pair, including pairs that
        # end up with no explanatory performance.
        k += 1

        for past_dist, past_at_dist in past_perfs.groupby("Distance"):
            # Best earlier time on this other distance, and the age at which it
            # was run (earliest occurrence in case of ties, since df is sorted
            # by date).
            best_past = past_at_dist[past_at_dist.Time == past_at_dist.Time.min()]
            past_time = best_past.Time.values[0]
            age_past_pb = best_past.Age.values[0]
            pbs_rows.append([time_pb, d, age_pb, past_time, past_dist,
                             age_pb - age_past_pb, height, weight, gender])
            pbs_keys.append(k)

# Build the frame once: concatenating inside the loop copies the whole frame
# at every step.
pbs = pd.DataFrame(pbs_rows, index=pbs_keys, columns=pbs_columns)

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
10513/15512
10514/15512
10515/15512
10516/15512
10517/15512
10518/15512
10519/15512
10520/15512
10521/15512
10522/15512
10523/15512
10524/15512
10525/15512
10526/15512
10527/15512
10528/15512
10529/15512
10530/15512
10531/15512
10532/15512
10533/15512
10534/15512
10535/15512
10536/15512
10537/15512
10538/15512
10539/15512
10540/15512
10541/15512
10542/15512
10543/15512
10544/15512
10545/15512
10546/15512
10547/15512
10548/15512
10549/15512
10550/15512
10551/15512
10552/15512
10553/15512
10554/15512
10555/15512
10556/15512
10557/15512
10558/15512
10559/15512
10560/15512
10561/15512
10562/15512
10563/15512
10564/15512
10565/15512
10566/15512
10567/15512
10568/15512
10569/15512
10570/15512
10571/15512
10572/15512
10573/15512
10574/15512
10575/15512
10576/15512
10577/15512
10578/15512
10579/15512
10580/15512
10581/15512
10582/15512
10583/15512
10584/15512
10585/15512
10586/15512
10587/15512
10588/15

In [None]:
# Compute alpha and beta coefficients
#
# For every index label k of pbs that has at least two rows, fit the linear
# regression  log(PerfTime) = Alpha + Beta * log(PerfDist)  on those rows and
# store the intercept (Alpha) and slope (Beta) on each of them. Labels with a
# single row keep NaN in both columns.
alpha_by_key = {}
beta_by_key = {}

# One pass over the groups instead of re-scanning the whole index per label.
# sort=False keeps the labels in order of first appearance.
groups = pbs.groupby(level=0, sort=False)
n_groups = groups.ngroups

for counter, (k, df) in enumerate(groups, start=1):
    print(str(counter) + "/" + str(n_groups))

    if len(df) < 2:
        continue

    X = np.log(df.PerfDist.values.reshape(-1, 1).astype(float))
    y = np.log(df.PerfTime.values.reshape(-1, 1).astype(float))
    reg = LinearRegression()
    reg.fit(X, y)
    alpha_by_key[k] = reg.intercept_[0]
    beta_by_key[k] = reg.coef_[0][0]

# Broadcast the per-label coefficients back onto the rows. Labels absent from
# the dicts map to NaN.
row_keys = pbs.index.to_series()
pbs["Alpha"] = row_keys.map(alpha_by_key).to_numpy(dtype=float)
pbs["Beta"] = row_keys.map(beta_by_key).to_numpy(dtype=float)

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
42352/47351
42353/47351
42354/47351
42355/47351
42356/47351
42357/47351
42358/47351
42359/47351
42360/47351
42361/47351
42362/47351
42363/47351
42364/47351
42365/47351
42366/47351
42367/47351
42368/47351
42369/47351
42370/47351
42371/47351
42372/47351
42373/47351
42374/47351
42375/47351
42376/47351
42377/47351
42378/47351
42379/47351
42380/47351
42381/47351
42382/47351
42383/47351
42384/47351
42385/47351
42386/47351
42387/47351
42388/47351
42389/47351
42390/47351
42391/47351
42392/47351
42393/47351
42394/47351
42395/47351
42396/47351
42397/47351
42398/47351
42399/47351
42400/47351
42401/47351
42402/47351
42403/47351
42404/47351
42405/47351
42406/47351
42407/47351
42408/47351
42409/47351
42410/47351
42411/47351
42412/47351
42413/47351
42414/47351
42415/47351
42416/47351
42417/47351
42418/47351
42419/47351
42420/47351
42421/47351
42422/47351
42423/47351
42424/47351
42425/47351
42426/47351
42427/47

In [None]:
# Replace NaNs by 0s and indicate with a dummy whether alpha and beta could have been computed
# The flag is derived from the NaN mask *before* filling, so a coefficient that
# is genuinely equal to 0 is not mistaken for a missing one.
has_alpha_beta = pbs["Alpha"].notna() & pbs["Beta"].notna()
pbs["IsAlphaBeta"] = has_alpha_beta.astype(int)

# Assign whole columns rather than using chained indexing (pbs.Alpha[mask] = 0),
# which is not guaranteed to write back into pbs.
pbs["Alpha"] = pbs["Alpha"].fillna(0)
pbs["Beta"] = pbs["Beta"].fillna(0)

In [None]:
# Split into training/test samples
# The split is made on the distinct index labels (75% train / 25% test), so all
# rows sharing a label end up in the same sample.
data = pbs
index = list(data.index.unique())
np.random.shuffle(index)
N = len(index)
n_train = round(N * 0.75)

# The labels are already in random order, so slicing the shuffled list yields a
# random partition directly. This avoids a second random draw and the quadratic
# "k not in train_index" scan.
# The split is unseeded; call np.random.seed(...) beforehand for a reproducible one.
train_index = index[:n_train]
test_index = index[n_train:]
data_train = data.loc[train_index, :]
data_test = data.loc[test_index, :]

In [None]:
# Write the training and test samples to CSV files in the working directory
for frame, csv_path in ((data_train, "data_train.csv"), (data_test, "data_test.csv")):
    frame.to_csv(csv_path)

In [None]:
# Display training set
# (a bare expression on the last line of a notebook cell is rendered as the cell output)
data_train

Unnamed: 0,y,PredDist,AgePred,PerfTime,PerfDist,DeltaAgePerf,Height,Weight,Gender,Alpha,Beta,IsAlphaBeta
29749,2301.0,10.0,51,1076.4,5.0,2,164,57,1,0.000000,0.000000,0
52283,2326.0,10.0,36,11146.0,42.195,1,183,75,1,0.000000,0.000000,0
28369,117.91,0.8,20,159.51,1.0,1,185,65,1,5.067339,1.122975,1
28369,117.91,0.8,20,242.26,1.5,0,185,65,1,5.067339,1.122975,1
28369,117.91,0.8,20,551.73,3.0,1,185,65,1,5.067339,1.122975,1
...,...,...,...,...,...,...,...,...,...,...,...,...
42507,2265.0,10.0,41,5186.69745,21.097,2,170,52,1,5.376283,1.042123,1
42507,2265.0,10.0,41,10681.0,42.195,2,170,52,1,5.376283,1.042123,1
11559,131.34,0.8,19,173.61,1.0,1,163,50,0,5.146454,1.116322,1
11559,131.34,0.8,19,265.78,1.5,0,163,50,0,5.146454,1.116322,1
