In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import GammaRegressor
from sklearn.model_selection import GridSearchCV
import statsmodels.api as sm

## Functions

In [2]:
# functions
def num_age(x):
    '''make age categories numeric'''
    if x == '18-39':
        return 0
    elif x == '40-44':
        return 1
    elif x == '45-49':
        return 2
    elif x == '50-54':
        return 3
    elif x == '55-59':
        return 4
    elif x == '60-64':
        return 5
    elif x == '65-69':
        return 6
    return 7


def cat_clean(x):
    '''binary columns were floats, make them ints'''
    return int(x)


def prep(df):
    '''prepare dataframe for EDA'''
    df['age'] = df['age'].apply(lambda x: num_age(x))
    df['clear'] = df['clear'].apply(lambda x: cat_clean(x))
    df['overcast'] = df['overcast'].apply(lambda x: cat_clean(x))
    df['partially_cloudy'] = df['partially_cloudy'].apply(lambda x: cat_clean(x))
    df['rain'] = df['rain'].apply(lambda x: cat_clean(x))

    cols = ['year', 'age', 'male', 'time_seconds', 'minimum_temperature',
           'maximum_temperature', 'temperature', 'relative_humidity', 'wind_speed',
           'precipitation', 'precipitation_cover', 'cloud_cover', 'clear',
           'overcast', 'partially_cloudy', 'rain']

    for col in cols:
        df[col] = pd.to_numeric(df[col], downcast='unsigned')

    df.drop(columns=['maximum_temperature', 'minimum_temperature'], inplace=True)

    event_dict = {year: df[df['year'] == year] for year in df['year'].unique()}
    
    return event_dict


def usable(event_dict, num):
    '''specify which years are usable based on minimum number of participants
    returns dict with years that exceed minimum values are correspoding df'''
    can_use = []
    for year in event_dict.keys():
        if len(event_dict[year]) > num:
            can_use.append(year)
    return {year: event_dict[year] for year in can_use}


def top_n(event_dict, num):
    '''take in event dict and first num of finishers that are desired,
    return df of top n racers for all years, use in conjuction with usable()'''
    df = pd.DataFrame()
    for year in event_dict.keys():
        df = pd.concat([df, event_dict[year].sort_values(by='time_seconds').iloc[:num]], ignore_index=True)
    return df
        
    
def n_range(event_dict, low, high):
    '''take in dictionary of dfs from prep(), upper percentage as float, lower percentage as float
    return df'''
    df = pd.DataFrame()
    for year in event_dict.keys():
        df = pd.concat([df, event_dict[year].sort_values(by='time_seconds').iloc[int(len(event_dict[year]) * low): int(len(event_dict[year]) * high)]], ignore_index=True)
    return df

def model_scores(X_test, y_test, model):
    nl = "\n"
    y_preds = model.predict(X_test)
    return print(f'R2: {model.score(X_test, y_test)}{nl}MAE: {mean_absolute_error(y_test, y_preds)}{nl}MSE: {mean_squared_error(y_test, y_preds)}{nl}RMSE: {mean_squared_error(y_test, y_preds, squared=False)}')

def undo_age(x):
    '''make age categories their division again'''
    if x == 0.0:
        return '18-39'
    elif x == 1.0:
        return '40-44'
    elif x == 2.0:
        return '45-49'
    elif x == 3.0:
        return '50-54'
    elif x == 4.0:
        return '55-59'
    elif x == 5.0:
        return '60-64'
    elif x == 6.0:
        return '65-69'
    return '70+'

## prep

In [3]:
london = pd.read_csv('../data/London_Data/Clean/Clean_London_Results_Weather.csv')
nyc = pd.read_csv('../data/NYC_Data/Clean/Clean_NYC_Results_Weather.csv')
boston = pd.read_csv('../data/Boston_Data/Clean/Clean_Boston_Results_Weather.csv')
berlin = pd.read_csv('../data/Berlin_Data/Clean/Clean_Berlin_Results_Weather.csv')
chicago = pd.read_csv('../data/Chicago_Data/Clean/Clean_Chicago_Results_Weather.csv')

london_dict = prep(london)
nyc_dict = prep(nyc)
boston_dict = prep(boston)
berlin_dict = prep(berlin)
chicago_dict = prep(chicago)

london = n_range(usable(london_dict, 9_900), 0, 1)
nyc = n_range(usable(nyc_dict, 9_900), 0, 1)
boston = n_range(usable(boston_dict, 9_900), 0, 1)
berlin = n_range(usable(berlin_dict, 9_900), 0, 1)
chicago = n_range(usable(chicago_dict, 9_900), 0, 1)

aggregate = {col: 'mean' for col in london.drop(columns=['year'])}

london = london.groupby(['year', 'age', 'male']).agg(aggregate)
nyc = nyc.groupby(['year', 'age', 'male']).agg(aggregate)
boston = boston.groupby(['year', 'age', 'male']).agg(aggregate)
berlin = berlin.groupby(['year', 'age', 'male']).agg(aggregate)
chicago = chicago.groupby(['year', 'age', 'male']).agg(aggregate)

events = [boston, berlin, chicago, london, nyc]
count = 0
for event in events:
    event['event'] = count
    count += 1
    
combined = pd.concat(events)
combined = combined.merge(pd.get_dummies(combined['event'], drop_first=True), left_index=True, right_index=True).drop(columns='event')
combined.drop(columns=['clear', 'overcast', 'partially_cloudy', 'rain'], inplace=True)
combined.columns = ['age', 'male', 'time_seconds', 'temperature', 'relative_humidity', 'wind_speed', 'precipitation', 'precipitation_cover', 'cloud_cover', 'boston', 'chicago', 'london', 'nyc']


berlin['age'] = berlin['age'].apply(lambda x: undo_age(x))
berlin = berlin.merge(pd.get_dummies(berlin['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')
berlin.drop(columns=['clear', 'overcast', 'partially_cloudy', 'rain'], inplace=True)

boston['age'] = boston['age'].apply(lambda x: undo_age(x))
boston = boston.merge(pd.get_dummies(boston['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')
boston.drop(columns=['clear', 'overcast', 'partially_cloudy', 'rain'], inplace=True)

chicago['age'] = chicago['age'].apply(lambda x: undo_age(x))
chicago = chicago.merge(pd.get_dummies(chicago['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')
chicago.drop(columns=['clear', 'overcast', 'partially_cloudy', 'rain'], inplace=True)

london['age'] = london['age'].apply(lambda x: undo_age(x))
london = london.merge(pd.get_dummies(london['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')
london.drop(columns=['clear', 'overcast', 'partially_cloudy', 'rain'], inplace=True)

nyc['age'] = nyc['age'].apply(lambda x: undo_age(x))
nyc = nyc.merge(pd.get_dummies(nyc['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')
nyc.drop(columns=['clear', 'overcast', 'partially_cloudy', 'rain'], inplace=True)

combined['age'] = combined['age'].apply(lambda x: undo_age(x))
combined = combined.merge(pd.get_dummies(combined['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')

berlin = berlin.apply(lambda x: round(x,2))
boston = boston.apply(lambda x: round(x,2))
chicago = chicago.apply(lambda x: round(x,2))
london = london.apply(lambda x: round(x,2))
nyc = nyc.apply(lambda x: round(x,2))
combined = combined.apply(lambda x: round(x,2))

## model

In [4]:
gparams = {
    'max_iter': [10_000],
    'alpha': np.linspace(0,1,20)
}
ggs = GridSearchCV(
    GammaRegressor(),
    gparams,
    n_jobs = -1
)

In [24]:
train = berlin[berlin.index.isin(range(2000,2016), level=0)]
test = berlin[berlin.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']


ggs.fit(X_train, y_train)
model_scores(X_test, y_test, ggs)

R2: 0.9225400441265754
MAE: 366.3702824568545
MSE: 194242.8628146723
RMSE: 440.72992048949015


In [25]:
ggs.best_params_

{'alpha': 0.0, 'max_iter': 10000}

In [26]:
pd.DataFrame(list(zip(X_test.columns,ggs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-0.092596
1,temperature,0.001998
2,relative_humidity,-0.000328
3,wind_speed,0.000691
4,precipitation,0.0
5,precipitation_cover,0.0
6,cloud_cover,4.5e-05
7,event,-0.067273
8,40-44,0.001198
9,45-49,0.017276


In [27]:
train = boston[boston.index.isin(range(2000,2016), level=0)]
test = boston[boston.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']


ggs.fit(X_train, y_train)
model_scores(X_test, y_test, ggs)

R2: 0.603147235524466
MAE: 1024.8972469386226
MSE: 1407793.9863534663
RMSE: 1186.5049457770779


In [28]:
ggs.best_params_

{'alpha': 0.0, 'max_iter': 10000}

In [29]:
pd.DataFrame(list(zip(X_test.columns,ggs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-0.093418
1,temperature,0.005323
2,relative_humidity,0.000708
3,wind_speed,-3.7e-05
4,precipitation,0.130934
5,precipitation_cover,9.9e-05
6,cloud_cover,-0.000979
7,event,0.0
8,40-44,0.016453
9,45-49,0.043384


In [30]:
train = chicago[chicago.index.isin(range(2000,2016), level=0)]
test = chicago[chicago.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']


ggs.fit(X_train, y_train)
model_scores(X_test, y_test, ggs)

R2: 0.7997846219525226
MAE: 578.5347141308977
MSE: 548000.7433896643
RMSE: 740.2707230396622


In [31]:
ggs.best_params_

{'alpha': 0.0, 'max_iter': 10000}

In [32]:
pd.DataFrame(list(zip(X_test.columns,ggs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-0.0871101
1,temperature,0.002390405
2,relative_humidity,-0.0005967037
3,wind_speed,-0.000134685
4,precipitation,-2.504695e-07
5,precipitation_cover,-0.0002504695
6,cloud_cover,0.0006549046
7,event,-0.07003906
8,40-44,0.004603901
9,45-49,0.02361484


In [33]:
train = london[london.index.isin(range(2000,2016), level=0)]
test = london[london.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']


ggs.fit(X_train, y_train)
model_scores(X_test, y_test, ggs)

R2: 0.898866939625553
MAE: 423.9639843345805
MSE: 267657.3701377324
RMSE: 517.3561347251353


In [34]:
ggs.best_params_

{'alpha': 0.0, 'max_iter': 10000}

In [35]:
pd.DataFrame(list(zip(X_test.columns,ggs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-0.110965
1,temperature,0.001484
2,relative_humidity,-0.00062
3,wind_speed,0.002249
4,precipitation,-0.001841
5,precipitation_cover,-0.000132
6,cloud_cover,-0.000155
7,event,-0.018216
8,40-44,-0.023772
9,45-49,-0.010382


In [36]:
train = nyc[nyc.index.isin(range(2000,2016), level=0)]
test = nyc[nyc.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']


ggs.fit(X_train, y_train)
model_scores(X_test, y_test, ggs)

R2: 0.9582956168825464
MAE: 331.09177692053
MSE: 166828.02177019147
RMSE: 408.4458614923053


In [37]:
ggs.best_params_

{'alpha': 0.0, 'max_iter': 10000}

In [38]:
pd.DataFrame(list(zip(X_test.columns,ggs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-0.107145
1,temperature,0.001558
2,relative_humidity,0.000122
3,wind_speed,0.000685
4,precipitation,0.0
5,precipitation_cover,0.0
6,cloud_cover,0.000156
7,event,-0.039516
8,40-44,0.007728
9,45-49,0.031661


In [39]:
train = combined[combined.index.isin(range(2000,2016), level=0)]
test = combined[combined.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']


ggs.fit(X_train, y_train)
model_scores(X_test, y_test, ggs)

R2: 0.39326488429028017
MAE: 1156.8566862010039
MSE: 2258568.589481047
RMSE: 1502.853482373131


In [40]:
ggs.best_params_

{'alpha': 0.0, 'max_iter': 10000}

In [41]:
pd.DataFrame(list(zip(X_test.columns,ggs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-0.09893
1,temperature,0.001577
2,relative_humidity,-0.001176
3,wind_speed,-0.001663
4,precipitation,0.344263
5,precipitation_cover,-0.001309
6,cloud_cover,-0.000189
7,boston,0.000324
8,chicago,0.000553
9,london,0.000263


In [42]:
pd.DataFrame(list(zip(X_test.columns,np.exp(ggs.best_estimator_.coef_))))

Unnamed: 0,0,1
0,male,0.905806
1,temperature,1.001578
2,relative_humidity,0.998825
3,wind_speed,0.998338
4,precipitation,1.410949
5,precipitation_cover,0.998692
6,cloud_cover,0.999811
7,boston,1.000324
8,chicago,1.000553
9,london,1.000263
