In [1]:
# imports
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import itertools

## Functions

In [2]:
# functions
def num_age(x):
    '''Convert an age-bracket label into an ordinal integer code.

    The brackets '18-39' through '65-69' map to 0 through 6 in order;
    every other value maps to 7.
    '''
    brackets = ('18-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69')
    for code, label in enumerate(brackets):
        if x == label:
            return code
    # Anything that is not one of the listed brackets lands in the last bucket.
    return 7


def cat_clean(x):
    '''Cast one value of a binary indicator column (stored as float) to int.'''
    as_int = int(x)
    return as_int


def prep(df):
    '''Prepare a results+weather dataframe for EDA and split it by year.

    Steps: recode `age` brackets to ordinal ints via num_age(), cast the
    weather indicator columns to int via cat_clean(), coerce the numeric
    columns with pd.to_numeric (downcasting where possible), and drop the
    minimum/maximum temperature columns.

    The input frame is NOT modified: all work happens on a copy, so calling
    prep() again on the same frame gives the same result instead of recoding
    every age to 7 and failing on the already-dropped columns.

    Args:
        df: dataframe containing at least the columns listed in `cols` below.

    Returns:
        dict mapping each distinct value of `year` to the rows for that year.
    '''
    # Copy first so the caller's frame is left untouched.
    df = df.copy()

    df['age'] = df['age'].apply(num_age)
    for flag in ['clear', 'overcast', 'partially_cloudy', 'rain']:
        df[flag] = df[flag].apply(cat_clean)

    cols = ['year', 'age', 'male', 'time_seconds', 'minimum_temperature',
           'maximum_temperature', 'temperature', 'relative_humidity', 'wind_speed',
           'precipitation', 'precipitation_cover', 'cloud_cover', 'clear',
           'overcast', 'partially_cloudy', 'rain']

    for col in cols:
        df[col] = pd.to_numeric(df[col], downcast='unsigned')

    df = df.drop(columns=['maximum_temperature', 'minimum_temperature'])

    event_dict = {year: df[df['year'] == year] for year in df['year'].unique()}

    return event_dict


def usable(event_dict, num):
    '''Keep only the years that have more than `num` participants.

    Returns a dict of the years whose entry has length strictly greater
    than `num`, each mapped to its corresponding df.
    '''
    return {year: frame for year, frame in event_dict.items() if len(frame) > num}


def top_n(event_dict, num):
    '''Return the `num` fastest finishers of every year as one dataframe.

    Intended to be used in conjunction with usable().

    Args:
        event_dict: dict mapping year -> dataframe with a `time_seconds` column.
        num: number of rows to keep per year after sorting by `time_seconds`
            ascending.

    Returns:
        A single dataframe with a fresh 0..n-1 index, years in dict order.
        An empty dataframe if `event_dict` is empty.
    '''
    fastest = [
        year_df.sort_values(by='time_seconds').iloc[:num]
        for year_df in event_dict.values()
    ]
    # pd.concat raises on an empty list; keep returning an empty frame instead.
    if not fastest:
        return pd.DataFrame()
    # Concatenate once rather than growing a frame inside the loop, which
    # re-copies all previously accumulated rows on every iteration.
    return pd.concat(fastest, ignore_index=True)
        
    
def n_range(event_dict, low, high):
    '''Return a slice of each year's finishers, by position in the sorted field.

    Each year's rows are sorted by `time_seconds` ascending and the rows from
    position int(n * low) up to (not including) int(n * high) are kept, where
    n is the number of rows for that year.

    Args:
        event_dict: dictionary of dfs from prep() (year -> dataframe).
        low: lower bound as a fraction of the field, as float (0 = fastest).
        high: upper bound as a fraction of the field, as float (1 = slowest).

    Returns:
        A single dataframe with a fresh 0..n-1 index, years in dict order.
        An empty dataframe if `event_dict` is empty.
    '''
    slices = []
    for year_df in event_dict.values():
        n_rows = len(year_df)
        ranked = year_df.sort_values(by='time_seconds')
        slices.append(ranked.iloc[int(n_rows * low): int(n_rows * high)])
    # pd.concat raises on an empty list; keep returning an empty frame instead.
    if not slices:
        return pd.DataFrame()
    # Concatenate once rather than growing a frame inside the loop, which
    # re-copies all previously accumulated rows on every iteration.
    return pd.concat(slices, ignore_index=True)

def model_scores(X_test, y_test, model):
    '''Print R2, MAE, MSE and RMSE of a fitted model on held-out data.

    Args:
        X_test: feature matrix passed to model.predict() and model.score().
        y_test: true target values for X_test.
        model: fitted estimator exposing predict() and score().

    Returns:
        None; the four metrics are printed, one per line.
    '''
    y_preds = model.predict(X_test)
    mse = mean_squared_error(y_test, y_preds)
    # RMSE is taken as sqrt(MSE) instead of mean_squared_error(..., squared=False):
    # that keyword was deprecated in scikit-learn 1.4 and later removed, so the
    # old call raises TypeError on current releases. For a single target the
    # two are the same value.
    rmse = np.sqrt(mse)
    report = '\n'.join([
        f'R2: {model.score(X_test, y_test)}',
        f'MAE: {mean_absolute_error(y_test, y_preds)}',
        f'MSE: {mse}',
        f'RMSE: {rmse}',
    ])
    print(report)

## All Participants

### Grouped

In [45]:
# Read each event's cleaned results+weather table and split it into per-year frames.
london_dict = prep(pd.read_csv('../data/London_Data/Clean/Clean_London_Results_Weather.csv'))
nyc_dict = prep(pd.read_csv('../data/NYC_Data/Clean/Clean_NYC_Results_Weather.csv'))
boston_dict = prep(pd.read_csv('../data/Boston_Data/Clean/Clean_Boston_Results_Weather.csv'))
berlin_dict = prep(pd.read_csv('../data/Berlin_Data/Clean/Clean_Berlin_Results_Weather.csv'))
chicago_dict = prep(pd.read_csv('../data/Chicago_Data/Clean/Clean_Chicago_Results_Weather.csv'))

# Keep only years with more than 9,900 rows; low=0 / high=1 keeps the whole field.
berlin = n_range(usable(berlin_dict, 9_900), 0, 1)
boston = n_range(usable(boston_dict, 9_900), 0, 1)
chicago = n_range(usable(chicago_dict, 9_900), 0, 1)
london = n_range(usable(london_dict, 9_900), 0, 1)
nyc = n_range(usable(nyc_dict, 9_900), 0, 1)

# Every column except 'year' is averaged; the spec is built from London's
# columns and reused for all five events.
aggregate = dict.fromkeys(london.columns.drop('year'), 'mean')

# One row per (year, age, male) group holding the group means.
berlin = berlin.groupby(['year', 'age', 'male']).agg(aggregate)
boston = boston.groupby(['year', 'age', 'male']).agg(aggregate)
chicago = chicago.groupby(['year', 'age', 'male']).agg(aggregate)
london = london.groupby(['year', 'age', 'male']).agg(aggregate)
nyc = nyc.groupby(['year', 'age', 'male']).agg(aggregate)

# Tag each event with an integer code (its position in this list). The column
# is added to the per-event frames themselves, not only to `combined`.
events = [boston, berlin, chicago, london, nyc]
count = 0
for event in events:
    event['event'] = count
    count += 1

combined = pd.concat(events)

# Round every column to 2 decimal places.
berlin = berlin.round(2)
boston = boston.round(2)
chicago = chicago.round(2)
london = london.round(2)
nyc = nyc.round(2)
combined = combined.round(2)

In [47]:
# Split on index level 0 (year): fit on 2000-2015, evaluate on 2016-2018.
train = london.loc[london.index.isin(range(2000, 2016), level=0)]
test = london.loc[london.index.isin(range(2016, 2019), level=0)]

# Features are everything except the target and the 'clear' indicator.
X_train, y_train = train.drop(columns=['time_seconds', 'clear']), train['time_seconds']
X_test, y_test = test.drop(columns=['time_seconds', 'clear']), test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.85966375473755
MAE: 471.21778128881806
MSE: 344379.4622846739
RMSE: 586.8385316973263


In [49]:
# Feature name next to its fitted coefficient, rounded to 2 decimals.
pd.DataFrame(list(zip(X_test.columns, lr.coef_.round(2))))

Unnamed: 0,0,1
0,age,455.02
1,male,-1920.4
2,temperature,40.28
3,relative_humidity,-7.71
4,wind_speed,32.02
5,precipitation,2123.48
6,precipitation_cover,9.38
7,cloud_cover,-9.96
8,overcast,0.0
9,partially_cloudy,592.85


In [50]:
# Split on index level 0 (year): fit on 2000-2015, evaluate on 2016-2018.
train = nyc.loc[nyc.index.isin(range(2000, 2016), level=0)]
test = nyc.loc[nyc.index.isin(range(2016, 2019), level=0)]

# Features are everything except the target and the 'clear' indicator.
X_train, y_train = train.drop(columns=['time_seconds', 'clear']), train['time_seconds']
X_test, y_test = test.drop(columns=['time_seconds', 'clear']), test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.8819033115459372
MAE: 579.2856025678539
MSE: 456835.4378788669
RMSE: 675.8960259380632


In [52]:
# Feature name next to its fitted coefficient, rounded to 2 decimals.
pd.DataFrame(list(zip(X_test.columns, lr.coef_.round(2))))

Unnamed: 0,0,1
0,age,759.08
1,male,-1914.75
2,temperature,35.53
3,relative_humidity,-4.36
4,wind_speed,4.79
5,precipitation,-0.0
6,precipitation_cover,0.0
7,cloud_cover,14.49
8,overcast,-924.92
9,partially_cloudy,-597.19


In [53]:
# Split on index level 0 (year): fit on 2000-2015, evaluate on 2016-2018.
train = boston.loc[boston.index.isin(range(2000, 2016), level=0)]
test = boston.loc[boston.index.isin(range(2016, 2019), level=0)]

# Unlike the other events, Boston also leaves out 'overcast' and
# 'partially_cloudy' (reason not recorded here -- TODO confirm).
X_train, y_train = (
    train.drop(columns=['time_seconds', 'clear', 'overcast', 'partially_cloudy']),
    train['time_seconds'],
)
X_test, y_test = (
    test.drop(columns=['time_seconds', 'clear', 'overcast', 'partially_cloudy']),
    test['time_seconds'],
)

lr = LinearRegression().fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.5829732351280517
MAE: 1035.7978856697525
MSE: 1402562.0151735803
RMSE: 1184.2981107700798


In [54]:
# Feature name next to its fitted coefficient, rounded to 2 decimals.
pd.DataFrame(list(zip(X_test.columns, lr.coef_.round(2))))

Unnamed: 0,0,1
0,age,594.13
1,male,-1432.82
2,temperature,85.97
3,relative_humidity,10.36
4,wind_speed,2.56
5,precipitation,1788.77
6,precipitation_cover,0.43
7,cloud_cover,-14.97
8,rain,132.72
9,event,0.0


In [55]:
# Split on index level 0 (year): fit on 2000-2015, evaluate on 2016-2018.
train = berlin.loc[berlin.index.isin(range(2000, 2016), level=0)]
test = berlin.loc[berlin.index.isin(range(2016, 2019), level=0)]

# Features are everything except the target and the 'clear' indicator.
X_train, y_train = train.drop(columns=['time_seconds', 'clear']), train['time_seconds']
X_test, y_test = test.drop(columns=['time_seconds', 'clear']), test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.8871951058903067
MAE: 380.1910822955753
MSE: 268259.85357276205
RMSE: 517.9380788982039


In [56]:
# Feature name next to its fitted coefficient, rounded to 2 decimals.
pd.DataFrame(list(zip(X_test.columns, lr.coef_.round(2))))

Unnamed: 0,0,1
0,age,488.07
1,male,-1480.04
2,temperature,24.06
3,relative_humidity,-6.57
4,wind_speed,7.43
5,precipitation,-0.0
6,precipitation_cover,0.0
7,cloud_cover,-5.13
8,overcast,525.23
9,partially_cloudy,41.47


In [57]:
# Split on index level 0 (year): fit on 2000-2015, evaluate on 2016-2018.
train = chicago.loc[chicago.index.isin(range(2000, 2016), level=0)]
test = chicago.loc[chicago.index.isin(range(2016, 2019), level=0)]

# Features are everything except the target and the 'clear' indicator.
X_train, y_train = train.drop(columns=['time_seconds', 'clear']), train['time_seconds']
X_test, y_test = test.drop(columns=['time_seconds', 'clear']), test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.790093329362709
MAE: 567.7840008295069
MSE: 551230.963176453
RMSE: 742.4493000713604


In [58]:
# Feature name next to its fitted coefficient, rounded to 2 decimals.
pd.DataFrame(list(zip(X_test.columns, lr.coef_.round(2))))

Unnamed: 0,0,1
0,age,462.65
1,male,-1488.85
2,temperature,44.95
3,relative_humidity,-8.33
4,wind_speed,-0.43
5,precipitation,-0.03
6,precipitation_cover,-34.35
7,cloud_cover,11.78
8,overcast,270.85
9,partially_cloudy,-109.76


In [59]:
# All five events pooled. Split on index level 0 (year): fit on 2000-2015,
# evaluate on 2016-2018.
train = combined.loc[combined.index.isin(range(2000, 2016), level=0)]
test = combined.loc[combined.index.isin(range(2016, 2019), level=0)]

# Features are everything except the target and the 'clear' indicator.
X_train, y_train = train.drop(columns=['time_seconds', 'clear']), train['time_seconds']
X_test, y_test = test.drop(columns=['time_seconds', 'clear']), test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.774800257258571
MAE: 658.9347657639383
MSE: 816270.1770034891
RMSE: 903.4767163593588


In [60]:
# Feature name next to its fitted coefficient, rounded to 2 decimals.
pd.DataFrame(list(zip(X_test.columns, lr.coef_.round(2))))

Unnamed: 0,0,1
0,age,551.42
1,male,-1644.46
2,temperature,50.82
3,relative_humidity,-7.42
4,wind_speed,13.26
5,precipitation,-736.76
6,precipitation_cover,-1.41
7,cloud_cover,2.16
8,overcast,64.83
9,partially_cloudy,89.85
