In [1]:
# imports
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import itertools

## Functions

In [2]:
# functions
def num_age(x):
    '''Encode an age-bracket label as an ordinal integer.

    The seven known brackets map to 0-6 in ascending order; any other value
    (older brackets, missing values, unexpected labels) maps to 7.
    '''
    brackets = ('18-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69')
    for code, label in enumerate(brackets):
        if x == label:
            return code
    # Catch-all bucket for everything that is not one of the listed brackets.
    return 7


def cat_clean(x):
    '''Coerce one flag value (the binary columns are stored as floats) to int.'''
    as_int = int(x)
    return as_int


def prep(df):
    '''Clean one event's results/weather frame and split it by year.

    Works on a copy, so the caller's frame is left untouched and the function
    can safely be called again on the same input.  (The earlier in-place
    version re-encoded every already-numeric age as 7 on a second call and
    then raised KeyError when dropping the temperature columns again.)

    Steps: encode `age` with num_age(), cast the weather-condition flags to
    int with cat_clean(), pass the numeric columns through pd.to_numeric
    (downcasting where possible), and drop the min/max temperature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column listed in `cols` below.

    Returns
    -------
    dict
        {year: rows of the cleaned frame for that year}, one entry per
        distinct value in the `year` column.
    '''
    df = df.copy()  # never mutate the caller's frame

    df['age'] = df['age'].apply(num_age)
    for flag in ('clear', 'overcast', 'partially_cloudy', 'rain'):
        df[flag] = df[flag].apply(cat_clean)

    cols = ['year', 'age', 'male', 'time_seconds', 'minimum_temperature',
            'maximum_temperature', 'temperature', 'relative_humidity', 'wind_speed',
            'precipitation', 'precipitation_cover', 'cloud_cover', 'clear',
            'overcast', 'partially_cloudy', 'rain']

    # Also raises if any of these columns holds non-numeric data.
    for col in cols:
        df[col] = pd.to_numeric(df[col], downcast='unsigned')

    df = df.drop(columns=['maximum_temperature', 'minimum_temperature'])

    event_dict = {year: df[df['year'] == year] for year in df['year'].unique()}

    return event_dict


def usable(event_dict, num):
    '''Keep only the years with enough participants.

    Returns a dict of the years whose frame has strictly more than `num`
    rows, each mapped to its corresponding df.'''
    return {
        year: frame
        for year, frame in event_dict.items()
        if len(frame) > num
    }


def top_n(event_dict, num):
    '''Stack the `num` fastest finishers of every year into one frame.

    event_dict maps year -> DataFrame with a `time_seconds` column (use in
    conjunction with usable()).  Each year's frame is sorted ascending by
    `time_seconds` and its first `num` rows are kept; the per-year pieces are
    concatenated in dict order with a fresh 0..n-1 index.

    Returns an empty DataFrame when event_dict is empty.
    '''
    # Collect the pieces and concatenate once: calling pd.concat inside the
    # loop re-copies the accumulated rows on every iteration.
    fastest = [
        frame.sort_values(by='time_seconds').iloc[:num]
        for frame in event_dict.values()
    ]
    if not fastest:
        # pd.concat raises on an empty list; keep the original empty result.
        return pd.DataFrame()
    return pd.concat(fastest, ignore_index=True)
        
    
def n_range(event_dict, low, high):
    '''Stack a finishing-position band of every year into one frame.

    event_dict is the dict of per-year dfs from prep() (optionally filtered by
    usable()).  `low` and `high` are fractions of the field as floats, lower
    bound first: each year's frame is sorted ascending by `time_seconds` and
    rows int(len * low) up to (not including) int(len * high) are kept, so
    (0, 1) keeps everyone and (0, 0.1) keeps the fastest tenth.

    Returns one df with a fresh 0..n-1 index, or an empty DataFrame when
    event_dict is empty.
    '''
    # Collect the pieces and concatenate once: calling pd.concat inside the
    # loop re-copies the accumulated rows on every iteration.
    bands = []
    for frame in event_dict.values():
        size = len(frame)
        ranked = frame.sort_values(by='time_seconds')
        bands.append(ranked.iloc[int(size * low): int(size * high)])
    if not bands:
        # pd.concat raises on an empty list; keep the original empty result.
        return pd.DataFrame()
    return pd.concat(bands, ignore_index=True)

def model_scores(X_test, y_test, model):
    '''Print R2, MAE, MSE and RMSE of a fitted model on a hold-out set.

    X_test / y_test are the hold-out features and target; `model` must expose
    predict() and score().  Nothing is returned (the metrics are only printed,
    one per line).

    RMSE is computed as sqrt(MSE) instead of mean_squared_error(squared=False):
    that keyword was deprecated in scikit-learn 1.4 and removed in 1.6.  For a
    single target column (as used in this notebook) the value is the same.
    '''
    y_preds = model.predict(X_test)
    mse = mean_squared_error(y_test, y_preds)
    report = [
        f'R2: {model.score(X_test, y_test)}',
        f'MAE: {mean_absolute_error(y_test, y_preds)}',
        f'MSE: {mse}',
        f'RMSE: {np.sqrt(mse)}',
    ]
    print('\n'.join(report))

## Starting Point From Initial_Modeling

In [3]:
# Load the cleaned results + weather table of each marathon.
london = pd.read_csv('./data/London_Data/Clean/Clean_London_Results_Weather.csv')
nyc = pd.read_csv('./data/NYC_Data/Clean/Clean_NYC_Results_Weather.csv')
boston = pd.read_csv('./data/Boston_Data/Clean/Clean_Boston_Results_Weather.csv')
berlin = pd.read_csv('./data/Berlin_Data/Clean/Clean_Berlin_Results_Weather.csv')
chicago = pd.read_csv('./data/Chicago_Data/Clean/Clean_Chicago_Results_Weather.csv')

# Clean each table and split it into {year: frame}.
london_dict, nyc_dict, boston_dict, berlin_dict, chicago_dict = (
    prep(raw) for raw in (london, nyc, boston, berlin, chicago)
)

# Keep only years with more than 9,900 finishers and take the whole field
# (fractions 0 to 1) of each remaining year.
MIN_FINISHERS = 9_900
london, nyc, boston, berlin, chicago = (
    n_range(usable(by_year, MIN_FINISHERS), 0, 1)
    for by_year in (london_dict, nyc_dict, boston_dict, berlin_dict, chicago_dict)
)

# Average every non-year column; London's column set is the template for all events.
aggregate = dict.fromkeys(london.drop(columns=['year']), 'mean')

# One row per (year, age bracket, sex) group.
group_keys = ['year', 'age', 'male']
london, nyc, boston, berlin, chicago = (
    field.groupby(group_keys).agg(aggregate)
    for field in (london, nyc, boston, berlin, chicago)
)

# Tag each event with an integer code following the order of this list.
events = [boston, berlin, chicago, london, nyc]
count = 0
for event in events:
    event['event'] = count
    count += 1

combined = pd.concat(events)

In [4]:
# London: fit on every season before 2017 (index level 0 is the year),
# evaluate on the 2017 and 2018 seasons.
target = 'time_seconds'
excluded = [target, 'clear']

train = london[london.index.isin(range(2017), level=0)]
test = london[london.index.isin([2017, 2018], level=0)]

X_train, y_train = train.drop(columns=excluded), train[target]
X_test, y_test = test.drop(columns=excluded), test[target]

lr = LinearRegression().fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.8486382770895685
MAE: 500.5378739349461
MSE: 363822.3823038144
RMSE: 603.1769079663233


In [5]:
# Column 0: feature name, column 1: fitted coefficient.
coef_pairs = list(zip(X_test.columns, lr.coef_))
pd.DataFrame(coef_pairs)

Unnamed: 0,0,1
0,age,452.511953
1,male,-1915.789567
2,temperature,39.860797
3,relative_humidity,-8.202356
4,wind_speed,32.583386
5,precipitation,1669.587288
6,precipitation_cover,10.751469
7,cloud_cover,-9.79622
8,overcast,0.0
9,partially_cloudy,580.134974


In [6]:
# NYC: fit on every season before 2017 (index level 0 is the year),
# evaluate on the 2017 and 2018 seasons.
target = 'time_seconds'
excluded = [target, 'clear']

train = nyc[nyc.index.isin(range(2017), level=0)]
test = nyc[nyc.index.isin([2017, 2018], level=0)]

X_train, y_train = train.drop(columns=excluded), train[target]
X_test, y_test = test.drop(columns=excluded), test[target]

lr = LinearRegression().fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.26346539052707507
MAE: 272.47329606419436
MSE: 120235.10134171456
RMSE: 346.74933502706904


In [7]:
# Column 0: feature name, column 1: fitted coefficient.
coef_pairs = list(zip(X_test.columns, lr.coef_))
pd.DataFrame(coef_pairs)

Unnamed: 0,0,1
0,age,109.580496
1,male,-398.981823
2,temperature,29.026549
3,relative_humidity,0.034438
4,wind_speed,16.439797
5,precipitation,76.260346
6,precipitation_cover,10.287346
7,cloud_cover,5.525568
8,overcast,-541.590089
9,partially_cloudy,-193.718247


In [8]:
# Boston: fit on every season before 2017 (index level 0 is the year),
# evaluate on the 2017 and 2018 seasons.  Unlike the other events, the
# `overcast` and `partially_cloudy` flags are excluded here as well.
target = 'time_seconds'
excluded = [target, 'clear', 'overcast', 'partially_cloudy']

train = boston[boston.index.isin(range(2017), level=0)]
test = boston[boston.index.isin([2017, 2018], level=0)]

X_train, y_train = train.drop(columns=excluded), train[target]
X_test, y_test = test.drop(columns=excluded), test[target]

lr = LinearRegression().fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.47207035106013917
MAE: 1318.2329814451143
MSE: 1953173.294080445
RMSE: 1397.5597640460478


In [9]:
# Column 0: feature name, column 1: fitted coefficient.
coef_pairs = list(zip(X_test.columns, lr.coef_))
pd.DataFrame(coef_pairs)

Unnamed: 0,0,1
0,age,593.763749
1,male,-1431.211599
2,temperature,87.334866
3,relative_humidity,11.906754
4,wind_speed,-0.085594
5,precipitation,1967.982287
6,precipitation_cover,1.077872
7,cloud_cover,-14.533504
8,rain,64.285743
9,event,0.0


In [10]:
# Berlin: fit on every season before 2017 (index level 0 is the year),
# evaluate on the 2017 and 2018 seasons.
target = 'time_seconds'
excluded = [target, 'clear']

train = berlin[berlin.index.isin(range(2017), level=0)]
test = berlin[berlin.index.isin([2017, 2018], level=0)]

X_train, y_train = train.drop(columns=excluded), train[target]
X_test, y_test = test.drop(columns=excluded), test[target]

lr = LinearRegression().fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.5841806815774391
MAE: 829.7206237258863
MSE: 923687.0762982728
RMSE: 961.0864041792876


In [11]:
# Column 0: feature name, column 1: fitted coefficient.
coef_pairs = list(zip(X_test.columns, lr.coef_))
pd.DataFrame(coef_pairs)

Unnamed: 0,0,1
0,age,467.0261
1,male,-1455.646
2,temperature,69.20877
3,relative_humidity,5.048345
4,wind_speed,-12.74645
5,precipitation,-6.536993e-13
6,precipitation_cover,2.273737e-13
7,cloud_cover,-13.35731
8,overcast,1103.646
9,partially_cloudy,768.5363


In [12]:
# Chicago: fit on every season before 2017 (index level 0 is the year),
# evaluate on the 2017 and 2018 seasons.
target = 'time_seconds'
excluded = [target, 'clear']

train = chicago[chicago.index.isin(range(2017), level=0)]
test = chicago[chicago.index.isin([2017, 2018], level=0)]

X_train, y_train = train.drop(columns=excluded), train[target]
X_test, y_test = test.drop(columns=excluded), test[target]

lr = LinearRegression().fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.8727267283684006
MAE: 476.75164008732094
MSE: 355137.6957471701
RMSE: 595.9343048920493


In [13]:
# Column 0: feature name, column 1: fitted coefficient.
coef_pairs = list(zip(X_test.columns, lr.coef_))
pd.DataFrame(coef_pairs)

Unnamed: 0,0,1
0,age,447.221232
1,male,-1535.602797
2,temperature,45.002944
3,relative_humidity,-8.631163
4,wind_speed,-58.098825
5,precipitation,-0.009502
6,precipitation_cover,-9.501577
7,cloud_cover,16.083448
8,overcast,-458.174109
9,partially_cloudy,-795.303901


In [14]:
# All five events together: fit on every season before 2017 (index level 0
# is the year), evaluate on the 2017 and 2018 seasons.
target = 'time_seconds'
excluded = [target, 'clear']

train = combined[combined.index.isin(range(2017), level=0)]
test = combined[combined.index.isin([2017, 2018], level=0)]

X_train, y_train = train.drop(columns=excluded), train[target]
X_test, y_test = test.drop(columns=excluded), test[target]

lr = LinearRegression().fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.3025721670668655
MAE: 1721.7268557546429
MSE: 3811689.766792224
RMSE: 1952.3549284882151


In [15]:
# Column 0: feature name, column 1: fitted coefficient.
coef_pairs = list(zip(X_test.columns, lr.coef_))
pd.DataFrame(coef_pairs)

Unnamed: 0,0,1
0,age,411.122336
1,male,-1334.361499
2,temperature,44.435123
3,relative_humidity,-16.293453
4,wind_speed,-51.240033
5,precipitation,-31.918961
6,precipitation_cover,-21.109005
7,cloud_cover,-2.994308
8,overcast,-824.905476
9,partially_cloudy,59.542532


## Continuation

In [79]:
# Columns the subset search is allowed to leave out of a model
# (`age` and `male` are not listed, so they are always kept as features).
cols = [
    'temperature',
    'relative_humidity',
    'wind_speed',
    'precipitation',
    'precipitation_cover',
    'cloud_cover',
    'clear',
    'overcast',
    'partially_cloudy',
    'rain',
    'event',
]
# First hold-out season: years before `yr` train, `yr` through 2018 test.
yr = 2015

In [80]:
# Exhaustive search over which columns of `cols` to leave out of the London
# model: train on seasons before `yr`, score on `yr` through 2018.
# NOTE(review): the subset is picked by its RMSE on the same hold-out seasons
# that are later reported as the test score, so that score is optimistic --
# consider selecting on a separate validation split.
london_train = london[london.index.isin(range(yr), level=0)]
london_test = london[london.index.isin(range(yr, 2019), level=0)]

# The targets do not depend on the dropped columns, so build them once.
y_train = london_train['time_seconds']
y_test = london_test['time_seconds']

results = {}      # hold-out RMSE -> dropped columns (an equal RMSE overwrites the earlier entry)
best_london = {}  # lowest RMSE seen so far after each subset size -> its dropped columns

# Subset sizes 0 .. len(cols) - 1: dropping every candidate at once is never tried.
for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')

        X_train = london_train.drop(columns=drop_cols)
        X_test = london_test.drop(columns=drop_cols)

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
        # scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        results[rmse] = drop_cols

    best_rmse = min(results)
    best_london[best_rmse] = results[best_rmse]

In [81]:
# Lowest hold-out RMSE found for London.
min(best_london)

567.4712723177524

In [82]:
# Columns dropped by the best London model (includes the target column).
best_london[min(best_london)]

['relative_humidity',
 'precipitation',
 'precipitation_cover',
 'cloud_cover',
 'clear',
 'rain',
 'event',
 'time_seconds']

In [83]:
# Exhaustive search over which columns of `cols` to leave out of the Chicago
# model: train on seasons before `yr`, score on `yr` through 2018.
# NOTE(review): the subset is picked by its RMSE on the same hold-out seasons
# that are later reported as the test score, so that score is optimistic --
# consider selecting on a separate validation split.
chicago_train = chicago[chicago.index.isin(range(yr), level=0)]
chicago_test = chicago[chicago.index.isin(range(yr, 2019), level=0)]

# The targets do not depend on the dropped columns, so build them once.
y_train = chicago_train['time_seconds']
y_test = chicago_test['time_seconds']

results = {}       # hold-out RMSE -> dropped columns (an equal RMSE overwrites the earlier entry)
best_chicago = {}  # lowest RMSE seen so far after each subset size -> its dropped columns

# Subset sizes 0 .. len(cols) - 1: dropping every candidate at once is never tried.
for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')

        X_train = chicago_train.drop(columns=drop_cols)
        X_test = chicago_test.drop(columns=drop_cols)

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
        # scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        results[rmse] = drop_cols

    best_rmse = min(results)
    best_chicago[best_rmse] = results[best_rmse]

In [84]:
# Lowest hold-out RMSE found for Chicago.
min(best_chicago)

584.0171471028609

In [85]:
# Columns dropped by the best Chicago model (includes the target column).
best_chicago[min(best_chicago)]

['relative_humidity',
 'wind_speed',
 'precipitation',
 'overcast',
 'partially_cloudy',
 'time_seconds']

In [86]:
# Exhaustive search over which columns of `cols` to leave out of the NYC
# model: train on seasons before `yr`, score on `yr` through 2018.
# NOTE(review): the subset is picked by its RMSE on the same hold-out seasons
# that are later reported as the test score, so that score is optimistic --
# consider selecting on a separate validation split.
nyc_train = nyc[nyc.index.isin(range(yr), level=0)]
nyc_test = nyc[nyc.index.isin(range(yr, 2019), level=0)]

# The targets do not depend on the dropped columns, so build them once.
y_train = nyc_train['time_seconds']
y_test = nyc_test['time_seconds']

results = {}   # hold-out RMSE -> dropped columns (an equal RMSE overwrites the earlier entry)
best_nyc = {}  # lowest RMSE seen so far after each subset size -> its dropped columns

# Subset sizes 0 .. len(cols) - 1: dropping every candidate at once is never tried.
for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')

        X_train = nyc_train.drop(columns=drop_cols)
        X_test = nyc_test.drop(columns=drop_cols)

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
        # scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        results[rmse] = drop_cols

    best_rmse = min(results)
    best_nyc[best_rmse] = results[best_rmse]

In [87]:
# Lowest hold-out RMSE found for NYC.
min(best_nyc)

322.0221033970261

In [88]:
# Columns dropped by the best NYC model (includes the target column).
best_nyc[min(best_nyc)]

['temperature',
 'precipitation',
 'cloud_cover',
 'clear',
 'overcast',
 'time_seconds']

In [89]:
# Exhaustive search over which columns of `cols` to leave out of the Berlin
# model: train on seasons before `yr`, score on `yr` through 2018.
# NOTE(review): the subset is picked by its RMSE on the same hold-out seasons
# that are later reported as the test score, so that score is optimistic --
# consider selecting on a separate validation split.
berlin_train = berlin[berlin.index.isin(range(yr), level=0)]
berlin_test = berlin[berlin.index.isin(range(yr, 2019), level=0)]

# The targets do not depend on the dropped columns, so build them once.
y_train = berlin_train['time_seconds']
y_test = berlin_test['time_seconds']

results = {}      # hold-out RMSE -> dropped columns (an equal RMSE overwrites the earlier entry)
best_berlin = {}  # lowest RMSE seen so far after each subset size -> its dropped columns

# Subset sizes 0 .. len(cols) - 1: dropping every candidate at once is never tried.
for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')

        X_train = berlin_train.drop(columns=drop_cols)
        X_test = berlin_test.drop(columns=drop_cols)

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
        # scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        results[rmse] = drop_cols

    best_rmse = min(results)
    best_berlin[best_rmse] = results[best_rmse]

In [90]:
# Lowest hold-out RMSE found for Berlin.
min(best_berlin)

841.3942830629142

In [91]:
# Columns dropped by the best Berlin model (includes the target column).
best_berlin[min(best_berlin)]

['relative_humidity',
 'cloud_cover',
 'overcast',
 'partially_cloudy',
 'time_seconds']

In [92]:
# Exhaustive search over which columns of `cols` to leave out of the Boston
# model: train on seasons before `yr`, score on `yr` through 2018.
# NOTE(review): the subset is picked by its RMSE on the same hold-out seasons
# that are later reported as the test score, so that score is optimistic --
# consider selecting on a separate validation split.
boston_train = boston[boston.index.isin(range(yr), level=0)]
boston_test = boston[boston.index.isin(range(yr, 2019), level=0)]

# The targets do not depend on the dropped columns, so build them once.
y_train = boston_train['time_seconds']
y_test = boston_test['time_seconds']

results = {}      # hold-out RMSE -> dropped columns (an equal RMSE overwrites the earlier entry)
best_boston = {}  # lowest RMSE seen so far after each subset size -> its dropped columns

# Subset sizes 0 .. len(cols) - 1: dropping every candidate at once is never tried.
for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')

        X_train = boston_train.drop(columns=drop_cols)
        X_test = boston_test.drop(columns=drop_cols)

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
        # scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        results[rmse] = drop_cols

    best_rmse = min(results)
    best_boston[best_rmse] = results[best_rmse]

In [93]:
# Lowest hold-out RMSE found for Boston.
min(best_boston)

522.171829854125

In [94]:
# Columns dropped by the best Boston model (includes the target column).
best_boston[min(best_boston)]

['temperature',
 'relative_humidity',
 'wind_speed',
 'cloud_cover',
 'clear',
 'overcast',
 'partially_cloudy',
 'rain',
 'event',
 'time_seconds']

In [95]:
# Exhaustive search over which columns of `cols` to leave out of the
# all-events model: train on seasons before `yr`, score on `yr` through 2018.
# NOTE(review): the subset is picked by its RMSE on the same hold-out seasons
# that are later reported as the test score, so that score is optimistic --
# consider selecting on a separate validation split.
combined_train = combined[combined.index.isin(range(yr), level=0)]
combined_test = combined[combined.index.isin(range(yr, 2019), level=0)]

# The targets do not depend on the dropped columns, so build them once.
y_train = combined_train['time_seconds']
y_test = combined_test['time_seconds']

results = {}        # hold-out RMSE -> dropped columns (an equal RMSE overwrites the earlier entry)
best_combined = {}  # lowest RMSE seen so far after each subset size -> its dropped columns

# Subset sizes 0 .. len(cols) - 1: dropping every candidate at once is never tried.
for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')

        X_train = combined_train.drop(columns=drop_cols)
        X_test = combined_test.drop(columns=drop_cols)

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
        # scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        results[rmse] = drop_cols

    best_rmse = min(results)
    best_combined[best_rmse] = results[best_rmse]

In [96]:
# Lowest hold-out RMSE found for the all-events model.
min(best_combined)

1477.9707092980796

In [97]:
# Columns dropped by the best all-events model (includes the target column).
best_combined[min(best_combined)]

['relative_humidity',
 'precipitation',
 'precipitation_cover',
 'clear',
 'overcast',
 'partially_cloudy',
 'time_seconds']

### Best Scores/Coefs

In [98]:
# Refit Berlin without the columns chosen by the subset search, then report
# the fit on the training seasons and on the hold-out seasons.
berlin_drop = best_berlin[min(best_berlin)]

X_train, y_train = berlin_train.drop(columns=berlin_drop), berlin_train['time_seconds']
X_test, y_test = berlin_test.drop(columns=berlin_drop), berlin_test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)

print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.6860704822668877
R2: 0.6993935836691532
MAE: 704.4130294238205
MSE: 707944.3395709554
RMSE: 841.3942830629142


In [99]:
# Column 0: feature name, column 1: fitted coefficient.
coef_pairs = list(zip(X_test.columns, lr.coef_))
pd.DataFrame(coef_pairs)

Unnamed: 0,0,1
0,age,459.248855
1,male,-1447.364734
2,temperature,60.564108
3,wind_speed,-17.68781
4,precipitation,0.0
5,precipitation_cover,0.0
6,clear,-131.698396
7,rain,0.0
8,event,0.0


In [100]:
# Refit Boston without the columns chosen by the subset search, then report
# the fit on the training seasons and on the hold-out seasons.
boston_drop = best_boston[min(best_boston)]

X_train, y_train = boston_train.drop(columns=boston_drop), boston_train['time_seconds']
X_test, y_test = boston_test.drop(columns=boston_drop), boston_test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)

print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.6654762189610279
R2: 0.9171493942450878
MAE: 419.71308700026304
MSE: 272663.4198932053
RMSE: 522.171829854125


In [101]:
# Column 0: feature name, column 1: fitted coefficient.
coef_pairs = list(zip(X_test.columns, lr.coef_))
pd.DataFrame(coef_pairs)

Unnamed: 0,0,1
0,age,596.311923
1,male,-1442.261933
2,precipitation,1827.722856
3,precipitation_cover,-13.493588


In [102]:
# Refit Chicago without the columns chosen by the subset search, then report
# the fit on the training seasons and on the hold-out seasons.
chicago_drop = best_chicago[min(best_chicago)]

X_train, y_train = chicago_train.drop(columns=chicago_drop), chicago_train['time_seconds']
X_test, y_test = chicago_test.drop(columns=chicago_drop), chicago_test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)

print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.742981520801216
R2: 0.8632554596342361
MAE: 427.86580937953454
MSE: 341076.0281101646
RMSE: 584.0171471028609


In [103]:
# Column 0: feature name, column 1: fitted coefficient.
coef_pairs = list(zip(X_test.columns, lr.coef_))
pd.DataFrame(coef_pairs)

Unnamed: 0,0,1
0,age,437.896173
1,male,-1540.085387
2,temperature,41.137493
3,precipitation_cover,-36.429305
4,cloud_cover,20.04555
5,clear,871.455417
6,rain,-3.642931
7,event,0.0


In [104]:
# Refit London without the columns chosen by the subset search, then report
# the fit on the training seasons and on the hold-out seasons.
london_drop = best_london[min(best_london)]

X_train, y_train = london_train.drop(columns=london_drop), london_train['time_seconds']
X_test, y_test = london_test.drop(columns=london_drop), london_test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)

print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.803865223889333
R2: 0.8712712264888957
MAE: 479.6718084946103
MSE: 322023.64490592876
RMSE: 567.4712723177524


In [105]:
# Column 0: feature name, column 1: fitted coefficient.
coef_pairs = list(zip(X_test.columns, lr.coef_))
pd.DataFrame(coef_pairs)

Unnamed: 0,0,1
0,age,454.840436
1,male,-1918.37598
2,temperature,53.225935
3,wind_speed,29.719154
4,overcast,0.0
5,partially_cloudy,82.970964


In [106]:
# Refit NYC without the columns chosen by the subset search, then report
# the fit on the training seasons and on the hold-out seasons.
nyc_drop = best_nyc[min(best_nyc)]

X_train, y_train = nyc_train.drop(columns=nyc_drop), nyc_train['time_seconds']
X_test, y_test = nyc_test.drop(columns=nyc_drop), nyc_test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)

print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.33672554342767
R2: 0.1805314887164069
MAE: 283.0013953923912
MSE: 103698.23507624498
RMSE: 322.0221033970261


In [107]:
# Column 0: feature name, column 1: fitted coefficient.
coef_pairs = list(zip(X_test.columns, lr.coef_))
pd.DataFrame(coef_pairs)

Unnamed: 0,0,1
0,age,110.116709
1,male,-395.296192
2,relative_humidity,7.484337
3,wind_speed,15.232434
4,precipitation_cover,2.493454
5,partially_cloudy,31.192121
6,rain,-330.437723
7,event,0.0


In [108]:
# Refit the all-events model without the columns chosen by the subset search,
# then report the fit on the training seasons and on the hold-out seasons.
combined_drop = best_combined[min(best_combined)]

X_train, y_train = combined_train.drop(columns=combined_drop), combined_train['time_seconds']
X_test, y_test = combined_test.drop(columns=combined_drop), combined_test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)

print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.5069477484746175
R2: 0.5646975650653245
MAE: 1237.216875640227
MSE: 2184397.4175430685
RMSE: 1477.9707092980796


In [109]:
# Column 0: feature name, column 1: fitted coefficient.
coef_pairs = list(zip(X_test.columns, lr.coef_))
pd.DataFrame(coef_pairs)

Unnamed: 0,0,1
0,age,410.273781
1,male,-1343.626928
2,temperature,47.458927
3,wind_speed,-25.358463
4,cloud_cover,-10.844291
5,rain,981.26444
6,event,-434.577584
