In [1]:
# imports
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import itertools

## Functions

In [2]:
# functions
def num_age(x):
    '''Convert an age-bracket label into an ordinal code.

    The seven known brackets, '18-39' through '65-69', map to 0-6 in
    ascending order. Any other value falls through to 7.
    '''
    brackets = ('18-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69')
    for code, label in enumerate(brackets):
        if x == label:
            return code
    # Not one of the listed brackets: use the catch-all top code.
    return 7


def cat_clean(x):
    '''Cast a binary flag that was stored as a float to a plain int.'''
    as_int = int(x)
    return as_int


def prep(df):
    '''Prepare a results/weather dataframe for EDA and split it by year.

    Steps, in order:
      1. recode the ``age`` bracket labels to ordinal ints via num_age()
      2. cast the four sky-condition flag columns to int via cat_clean()
      3. coerce every modelling column to a numeric dtype, downcasting
         where pandas can do so
      4. drop ``minimum_temperature`` and ``maximum_temperature``

    The input frame is NOT modified: all work happens on a copy, so the
    function can safely be called again on the same raw frame.

    Returns a dict ``{year: DataFrame of that year's rows}``, keyed in
    order of first appearance of each year.
    '''
    # Copy first: the previous in-place version recoded/dropped columns on the
    # caller's frame, so a second call re-mapped every age to 7 and then
    # raised KeyError on the already-dropped temperature columns.
    df = df.copy()

    df['age'] = df['age'].apply(num_age)
    for flag in ['clear', 'overcast', 'partially_cloudy', 'rain']:
        df[flag] = df[flag].apply(cat_clean)

    cols = ['year', 'age', 'male', 'time_seconds', 'minimum_temperature',
           'maximum_temperature', 'temperature', 'relative_humidity', 'wind_speed',
           'precipitation', 'precipitation_cover', 'cloud_cover', 'clear',
           'overcast', 'partially_cloudy', 'rain']

    for col in cols:
        df[col] = pd.to_numeric(df[col], downcast='unsigned')

    df = df.drop(columns=['maximum_temperature', 'minimum_temperature'])

    event_dict = {year: df[df['year'] == year] for year in df['year'].unique()}

    return event_dict


def usable(event_dict, num):
    '''Filter an event dict down to the years with enough participants.

    A year is kept when its dataframe has strictly more than ``num`` rows.
    Returns a dict of the surviving years and their corresponding df,
    in the same order as ``event_dict``.
    '''
    return {
        year: results
        for year, results in event_dict.items()
        if len(results) > num
    }


def top_n(event_dict, num):
    '''Collect the fastest ``num`` finishers of every year into one df.

    event_dict : {year: DataFrame} with a ``time_seconds`` column; use in
                 conjunction with usable()
    num        : how many rows to keep per year after sorting ascending by
                 ``time_seconds``

    Returns a single DataFrame with a fresh 0..n-1 index, years stacked in
    dict order. An empty ``event_dict`` yields an empty DataFrame.
    '''
    fastest = [
        results.sort_values(by='time_seconds').iloc[:num]
        for results in event_dict.values()
    ]
    # pd.concat raises on an empty list, so keep the old empty-frame result.
    if not fastest:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the growing frame per year.
    return pd.concat(fastest, ignore_index=True)
        
    
def n_range(event_dict, low, high):
    '''Collect a finishing-position band of every year into one df.

    event_dict : dictionary of dfs from prep() ({year: DataFrame})
    low        : lower bound of the band as a fraction of the field (float)
    high       : upper bound of the band as a fraction of the field (float)

    Each year is sorted ascending by ``time_seconds`` and rows
    ``int(n * low)`` up to (not including) ``int(n * high)`` are kept, where
    ``n`` is that year's row count. ``low=0, high=1`` keeps everyone.

    Returns a single DataFrame with a fresh 0..n-1 index, years stacked in
    dict order. An empty ``event_dict`` yields an empty DataFrame.
    '''
    bands = []
    for results in event_dict.values():
        n_rows = len(results)
        ranked = results.sort_values(by='time_seconds')
        bands.append(ranked.iloc[int(n_rows * low): int(n_rows * high)])
    # pd.concat raises on an empty list, so keep the old empty-frame result.
    if not bands:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the growing frame per year.
    return pd.concat(bands, ignore_index=True)

def model_scores(X_test, y_test, model):
    '''Print hold-out metrics for a fitted model.

    X_test : features passed to ``model.predict`` and ``model.score``
    y_test : true target values
    model  : fitted estimator exposing ``predict`` and ``score``

    Prints four lines: ``R2`` (the value of ``model.score``), ``MAE``,
    ``MSE`` and ``RMSE``. Returns None.
    '''
    y_preds = model.predict(X_test)
    mse = mean_squared_error(y_test, y_preds)
    # RMSE is sqrt(MSE) rather than mean_squared_error(..., squared=False):
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mse)
    print(f'R2: {model.score(X_test, y_test)}\nMAE: {mean_absolute_error(y_test, y_preds)}\nMSE: {mse}\nRMSE: {rmse}')

## Starting Point From Initial_Linear_Modeling

In [3]:
# Load the cleaned results-plus-weather table of each marathon.
london = pd.read_csv('./data/London_Data/Clean/Clean_London_Results_Weather.csv')
nyc = pd.read_csv('./data/NYC_Data/Clean/Clean_NYC_Results_Weather.csv')
boston = pd.read_csv('./data/Boston_Data/Clean/Clean_Boston_Results_Weather.csv')
berlin = pd.read_csv('./data/Berlin_Data/Clean/Clean_Berlin_Results_Weather.csv')
chicago = pd.read_csv('./data/Chicago_Data/Clean/Clean_Chicago_Results_Weather.csv')

# prep() cleans a table and returns it split into a {year: DataFrame} dict.
london_dict, nyc_dict, boston_dict, berlin_dict, chicago_dict = [
    prep(raw) for raw in (london, nyc, boston, berlin, chicago)
]

# Keep only years with more than 9,900 rows, then stack those years back
# into one frame per event (the 0-1 range keeps every row of a year).
london, nyc, boston, berlin, chicago = [
    n_range(usable(by_year, 9_900), 0, 1)
    for by_year in (london_dict, nyc_dict, boston_dict, berlin_dict, chicago_dict)
]

# Mean of every column except `year`; the column list is taken from London.
aggregate = dict.fromkeys(london.drop(columns=['year']).columns, 'mean')

# Collapse each event to one row per (year, age, male) group.
group_keys = ['year', 'age', 'male']
london, nyc, boston, berlin, chicago = [
    finishers.groupby(group_keys).agg(aggregate)
    for finishers in (london, nyc, boston, berlin, chicago)
]

# Tag each aggregated frame in place with an integer event id (list order),
# so the per-event frames themselves also gain the `event` column.
events = [boston, berlin, chicago, london, nyc]
count = 0
for event in events:
    event['event'] = count
    count += 1

combined = pd.concat(events)

In [4]:
# Split London on index level 0: 2000-2015 for training, 2016 and 2018 held out.
# NOTE(review): 2017 falls in neither split - confirm that is intended.
in_train = london.index.isin(range(2000, 2016), level=0)
in_test = london.index.isin([2016, 2018], level=0)
train, test = london[in_train], london[in_test]

# Everything except the target and the `clear` flag is used as a feature.
non_features = ['time_seconds', 'clear']
X_train, X_test = train.drop(columns=non_features), test.drop(columns=non_features)
y_train, y_test = train['time_seconds'], test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.8784395204389115
MAE: 414.91953232851773
MSE: 294385.3600606244
RMSE: 542.5729075991765


In [5]:
# Feature names (column 0) beside the coefficients of the current `lr` (column 1).
pd.DataFrame([*zip(X_test.columns, lr.coef_)])

Unnamed: 0,0,1
0,age,455.0172
1,male,-1920.404
2,temperature,40.28425
3,relative_humidity,-7.705318
4,wind_speed,32.01593
5,precipitation,2123.47
6,precipitation_cover,9.382286
7,cloud_cover,-9.958525
8,overcast,9.094947e-13
9,partially_cloudy,592.845


In [6]:
# Split NYC on index level 0: 2000-2015 for training, 2016 and 2018 held out.
# NOTE(review): 2017 falls in neither split - confirm that is intended.
in_train = nyc.index.isin(range(2000, 2016), level=0)
in_test = nyc.index.isin([2016, 2018], level=0)
train, test = nyc[in_train], nyc[in_test]

# Everything except the target and the `clear` flag is used as a feature.
non_features = ['time_seconds', 'clear']
X_train, X_test = train.drop(columns=non_features), test.drop(columns=non_features)
y_train, y_test = train['time_seconds'], test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.8805622266691774
MAE: 568.78638943414
MSE: 469570.8602143672
RMSE: 685.2524062083746


In [7]:
# Feature names (column 0) beside the coefficients of the current `lr` (column 1).
pd.DataFrame([*zip(X_test.columns, lr.coef_)])

Unnamed: 0,0,1
0,age,759.0829
1,male,-1914.746
2,temperature,35.52777
3,relative_humidity,-4.356455
4,wind_speed,4.787671
5,precipitation,-9.379164e-12
6,precipitation_cover,-2.046363e-12
7,cloud_cover,14.48621
8,overcast,-924.9209
9,partially_cloudy,-597.1917


In [8]:
# Split Boston on index level 0: 2000-2015 for training, 2016 and 2018 held out.
# NOTE(review): 2017 falls in neither split - confirm that is intended.
in_train = boston.index.isin(range(2000, 2016), level=0)
in_test = boston.index.isin([2016, 2018], level=0)
train, test = boston[in_train], boston[in_test]

# Boston additionally leaves out the `overcast` and `partially_cloudy` flags.
non_features = ['time_seconds', 'clear', 'overcast', 'partially_cloudy']
X_train, X_test = train.drop(columns=non_features), test.drop(columns=non_features)
y_train, y_test = train['time_seconds'], test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.8318848553406181
MAE: 713.6786734180324
MSE: 646553.5428095255
RMSE: 804.0855320235065


In [9]:
# Feature names (column 0) beside the coefficients of the current `lr` (column 1).
pd.DataFrame([*zip(X_test.columns, lr.coef_)])

Unnamed: 0,0,1
0,age,594.126246
1,male,-1432.820716
2,temperature,85.969153
3,relative_humidity,10.363134
4,wind_speed,2.563545
5,precipitation,1788.779558
6,precipitation_cover,0.429013
7,cloud_cover,-14.96967
8,rain,132.715657
9,event,0.0


In [10]:
# Split Berlin on index level 0: 2000-2015 for training, 2016 and 2018 held out.
# NOTE(review): 2017 falls in neither split - confirm that is intended.
in_train = berlin.index.isin(range(2000, 2016), level=0)
in_test = berlin.index.isin([2016, 2018], level=0)
train, test = berlin[in_train], berlin[in_test]

# Everything except the target and the `clear` flag is used as a feature.
non_features = ['time_seconds', 'clear']
X_train, X_test = train.drop(columns=non_features), test.drop(columns=non_features)
y_train, y_test = train['time_seconds'], test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.874236818411501
MAE: 431.09406468844344
MSE: 320012.1270363163
RMSE: 565.6961437347053


In [11]:
# Feature names (column 0) beside the coefficients of the current `lr` (column 1).
pd.DataFrame([*zip(X_test.columns, lr.coef_)])

Unnamed: 0,0,1
0,age,488.0671
1,male,-1480.039
2,temperature,24.0552
3,relative_humidity,-6.568969
4,wind_speed,7.426664
5,precipitation,4.547474e-13
6,precipitation_cover,-1.477929e-12
7,cloud_cover,-5.133045
8,overcast,525.2323
9,partially_cloudy,41.46611


In [12]:
# Split Chicago on index level 0: 2000-2015 for training, 2016 and 2018 held out.
# NOTE(review): 2017 falls in neither split - confirm that is intended.
in_train = chicago.index.isin(range(2000, 2016), level=0)
in_test = chicago.index.isin([2016, 2018], level=0)
train, test = chicago[in_train], chicago[in_test]

# Everything except the target and the `clear` flag is used as a feature.
non_features = ['time_seconds', 'clear']
X_train, X_test = train.drop(columns=non_features), test.drop(columns=non_features)
y_train, y_test = train['time_seconds'], test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.7817110765678504
MAE: 530.3884576679438
MSE: 513930.6920879206
RMSE: 716.8895954663595


In [13]:
# Feature names (column 0) beside the coefficients of the current `lr` (column 1).
pd.DataFrame([*zip(X_test.columns, lr.coef_)])

Unnamed: 0,0,1
0,age,462.649627
1,male,-1488.849172
2,temperature,44.951399
3,relative_humidity,-8.328303
4,wind_speed,-0.426446
5,precipitation,-0.034352
6,precipitation_cover,-34.352144
7,cloud_cover,11.779944
8,overcast,270.846286
9,partially_cloudy,-109.762611


In [14]:
# Split the stacked events on index level 0: 2000-2015 train, 2016 and 2018 held out.
# NOTE(review): 2017 falls in neither split - confirm that is intended.
in_train = combined.index.isin(range(2000, 2016), level=0)
in_test = combined.index.isin([2016, 2018], level=0)
train, test = combined[in_train], combined[in_test]

# Everything except the target and the `clear` flag is used as a feature.
non_features = ['time_seconds', 'clear']
X_train, X_test = train.drop(columns=non_features), test.drop(columns=non_features)
y_train, y_test = train['time_seconds'], test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.7456148185774321
MAE: 667.9324620249184
MSE: 920772.7357014156
RMSE: 959.5690364436608


In [15]:
# Feature names (column 0) beside the coefficients of the current `lr` (column 1).
pd.DataFrame([*zip(X_test.columns, lr.coef_)])

Unnamed: 0,0,1
0,age,551.41658
1,male,-1644.454915
2,temperature,50.824448
3,relative_humidity,-7.418479
4,wind_speed,13.2587
5,precipitation,-736.759393
6,precipitation_cover,-1.408614
7,cloud_cover,2.164248
8,overcast,64.828307
9,partially_cloudy,89.848002


## Linear Modeling Continuation

In [16]:
# Columns the subset searches below are allowed to drop.
cols = [
    'temperature',
    'relative_humidity',
    'wind_speed',
    'precipitation',
    'precipitation_cover',
    'cloud_cover',
    'clear',
    'overcast',
    'partially_cloudy',
    'rain',
    'event',
]
# First hold-out year: earlier years train, `yr` onward is the test split.
yr = 2016

In [17]:
# Chronological split: years before `yr` train, `yr` through 2018 are held out.
london_train = london[london.index.isin(range(yr), level=0)]
london_test = london[london.index.isin(range(yr, 2019), level=0)]

# The target is the same for every candidate, so extract it once.
y_train = london_train['time_seconds']
y_test = london_test['time_seconds']

results = {}      # hold-out RMSE -> columns dropped to obtain it
best_london = {}  # running best (RMSE -> dropped columns) after each subset size

# Exhaustive search: try dropping every combination of 0 .. len(cols) - 1 of
# the optional columns, so at least one of them always stays in the model.
# NOTE(review): subsets are ranked on the hold-out years themselves, so the
# winning RMSE is an optimistic estimate.
for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')

        X_train = london_train.drop(columns=drop_cols)
        X_test = london_test.drop(columns=drop_cols)

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        # sqrt(MSE) instead of mean_squared_error(..., squared=False): that
        # keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        results[rmse] = drop_cols

    best_rmse = min(results)
    best_london[best_rmse] = results[best_rmse]

In [18]:
# Lowest hold-out RMSE recorded by the London search.
min(best_london)

576.6643491631934

In [19]:
# Columns that were dropped to reach that lowest RMSE (target included).
best_london[min(best_london)]

['relative_humidity',
 'wind_speed',
 'precipitation',
 'precipitation_cover',
 'clear',
 'rain',
 'time_seconds']

In [20]:
# Chronological split: years before `yr` train, `yr` through 2018 are held out.
chicago_train = chicago[chicago.index.isin(range(yr), level=0)]
chicago_test = chicago[chicago.index.isin(range(yr, 2019), level=0)]

# The target is the same for every candidate, so extract it once.
y_train = chicago_train['time_seconds']
y_test = chicago_test['time_seconds']

results = {}       # hold-out RMSE -> columns dropped to obtain it
best_chicago = {}  # running best (RMSE -> dropped columns) after each subset size

# Exhaustive search: try dropping every combination of 0 .. len(cols) - 1 of
# the optional columns, so at least one of them always stays in the model.
# NOTE(review): subsets are ranked on the hold-out years themselves, so the
# winning RMSE is an optimistic estimate.
for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')

        X_train = chicago_train.drop(columns=drop_cols)
        X_test = chicago_test.drop(columns=drop_cols)

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        # sqrt(MSE) instead of mean_squared_error(..., squared=False): that
        # keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        results[rmse] = drop_cols

    best_rmse = min(results)
    best_chicago[best_rmse] = results[best_rmse]

In [21]:
# Lowest hold-out RMSE recorded by the Chicago search.
min(best_chicago)

604.3384698278334

In [22]:
# Columns that were dropped to reach that lowest RMSE (target included).
best_chicago[min(best_chicago)]

['relative_humidity',
 'precipitation_cover',
 'overcast',
 'partially_cloudy',
 'rain',
 'event',
 'time_seconds']

In [23]:
# Chronological split: years before `yr` train, `yr` through 2018 are held out.
nyc_train = nyc[nyc.index.isin(range(yr), level=0)]
nyc_test = nyc[nyc.index.isin(range(yr, 2019), level=0)]

# The target is the same for every candidate, so extract it once.
y_train = nyc_train['time_seconds']
y_test = nyc_test['time_seconds']

results = {}   # hold-out RMSE -> columns dropped to obtain it
best_nyc = {}  # running best (RMSE -> dropped columns) after each subset size

# Exhaustive search: try dropping every combination of 0 .. len(cols) - 1 of
# the optional columns, so at least one of them always stays in the model.
# NOTE(review): subsets are ranked on the hold-out years themselves, so the
# winning RMSE is an optimistic estimate.
for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')

        X_train = nyc_train.drop(columns=drop_cols)
        X_test = nyc_test.drop(columns=drop_cols)

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        # sqrt(MSE) instead of mean_squared_error(..., squared=False): that
        # keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        results[rmse] = drop_cols

    best_rmse = min(results)
    best_nyc[best_rmse] = results[best_rmse]

In [24]:
# Lowest hold-out RMSE recorded by the NYC search.
min(best_nyc)

660.5526845800417

In [25]:
# Columns that were dropped to reach that lowest RMSE (target included).
best_nyc[min(best_nyc)]

['temperature',
 'relative_humidity',
 'precipitation',
 'precipitation_cover',
 'cloud_cover',
 'clear',
 'overcast',
 'rain',
 'event',
 'time_seconds']

In [26]:
# Chronological split: years before `yr` train, `yr` through 2018 are held out.
berlin_train = berlin[berlin.index.isin(range(yr), level=0)]
berlin_test = berlin[berlin.index.isin(range(yr, 2019), level=0)]

# The target is the same for every candidate, so extract it once.
y_train = berlin_train['time_seconds']
y_test = berlin_test['time_seconds']

results = {}      # hold-out RMSE -> columns dropped to obtain it
best_berlin = {}  # running best (RMSE -> dropped columns) after each subset size

# Exhaustive search: try dropping every combination of 0 .. len(cols) - 1 of
# the optional columns, so at least one of them always stays in the model.
# NOTE(review): subsets are ranked on the hold-out years themselves, so the
# winning RMSE is an optimistic estimate.
for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')

        X_train = berlin_train.drop(columns=drop_cols)
        X_test = berlin_test.drop(columns=drop_cols)

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        # sqrt(MSE) instead of mean_squared_error(..., squared=False): that
        # keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        results[rmse] = drop_cols

    best_rmse = min(results)
    best_berlin[best_rmse] = results[best_rmse]

In [27]:
# Lowest hold-out RMSE recorded by the Berlin search.
min(best_berlin)

854.113176815812

In [28]:
# Columns that were dropped to reach that lowest RMSE (target included).
best_berlin[min(best_berlin)]

['relative_humidity',
 'precipitation',
 'precipitation_cover',
 'clear',
 'overcast',
 'partially_cloudy',
 'rain',
 'event',
 'time_seconds']

In [29]:
# Chronological split: years before `yr` train, `yr` through 2018 are held out.
boston_train = boston[boston.index.isin(range(yr), level=0)]
boston_test = boston[boston.index.isin(range(yr, 2019), level=0)]

# The target is the same for every candidate, so extract it once.
y_train = boston_train['time_seconds']
y_test = boston_test['time_seconds']

results = {}      # hold-out RMSE -> columns dropped to obtain it
best_boston = {}  # running best (RMSE -> dropped columns) after each subset size

# Exhaustive search: try dropping every combination of 0 .. len(cols) - 1 of
# the optional columns, so at least one of them always stays in the model.
# NOTE(review): subsets are ranked on the hold-out years themselves, so the
# winning RMSE is an optimistic estimate.
for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')

        X_train = boston_train.drop(columns=drop_cols)
        X_test = boston_test.drop(columns=drop_cols)

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        # sqrt(MSE) instead of mean_squared_error(..., squared=False): that
        # keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        results[rmse] = drop_cols

    best_rmse = min(results)
    best_boston[best_rmse] = results[best_rmse]

In [30]:
# Lowest hold-out RMSE recorded by the Boston search.
min(best_boston)

563.9786705087007

In [31]:
# Columns that were dropped to reach that lowest RMSE (target included).
best_boston[min(best_boston)]

['temperature',
 'relative_humidity',
 'wind_speed',
 'cloud_cover',
 'overcast',
 'partially_cloudy',
 'event',
 'time_seconds']

In [32]:
# Chronological split: years before `yr` train, `yr` through 2018 are held out.
combined_train = combined[combined.index.isin(range(yr), level=0)]
combined_test = combined[combined.index.isin(range(yr, 2019), level=0)]

# The target is the same for every candidate, so extract it once.
y_train = combined_train['time_seconds']
y_test = combined_test['time_seconds']

results = {}        # hold-out RMSE -> columns dropped to obtain it
best_combined = {}  # running best (RMSE -> dropped columns) after each subset size

# Exhaustive search: try dropping every combination of 0 .. len(cols) - 1 of
# the optional columns, so at least one of them always stays in the model.
# NOTE(review): subsets are ranked on the hold-out years themselves, so the
# winning RMSE is an optimistic estimate.
for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')

        X_train = combined_train.drop(columns=drop_cols)
        X_test = combined_test.drop(columns=drop_cols)

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        # sqrt(MSE) instead of mean_squared_error(..., squared=False): that
        # keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        results[rmse] = drop_cols

    best_rmse = min(results)
    best_combined[best_rmse] = results[best_rmse]

In [33]:
# Lowest hold-out RMSE recorded by the combined-events search.
min(best_combined)

688.5253385874158

In [34]:
# Columns that were dropped to reach that lowest RMSE (target included).
best_combined[min(best_combined)]

['relative_humidity',
 'wind_speed',
 'precipitation_cover',
 'cloud_cover',
 'clear',
 'partially_cloudy',
 'rain',
 'time_seconds']

### Best Scores/Coefs

In [35]:
# Refit Berlin on the column subset with the lowest hold-out RMSE from the search.
berlin_drop = best_berlin[min(best_berlin)]

X_train = berlin_train.drop(columns=berlin_drop)
X_test = berlin_test.drop(columns=berlin_drop)
y_train, y_test = berlin_train['time_seconds'], berlin_test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)

# Train score first, then the hold-out metrics, to compare the two.
print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.6882267710313108
R2: 0.6932365526300843
MAE: 724.0948961155018
MSE: 729509.3188103986
RMSE: 854.113176815812


In [36]:
# Feature names (column 0) beside the coefficients of the current `lr` (column 1).
pd.DataFrame([*zip(X_test.columns, lr.coef_)])

Unnamed: 0,0,1
0,age,462.829293
1,male,-1448.887874
2,temperature,54.933423
3,wind_speed,-22.263758
4,cloud_cover,0.312984


In [37]:
# Refit Boston on the column subset with the lowest hold-out RMSE from the search.
boston_drop = best_boston[min(best_boston)]

X_train = boston_train.drop(columns=boston_drop)
X_test = boston_test.drop(columns=boston_drop)
y_train, y_test = boston_train['time_seconds'], boston_test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)

# Train score first, then the hold-out metrics, to compare the two.
print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.6821554757822392
R2: 0.9054269907098701
MAE: 453.7491648827442
MSE: 318071.94078876165
RMSE: 563.9786705087007


In [38]:
# Feature names (column 0) beside the coefficients of the current `lr` (column 1).
pd.DataFrame([*zip(X_test.columns, lr.coef_)])

Unnamed: 0,0,1
0,age,596.848277
1,male,-1440.98681
2,precipitation,1676.595615
3,precipitation_cover,-12.444363
4,clear,0.0
5,rain,-10.152106


In [39]:
# Refit Chicago on the column subset with the lowest hold-out RMSE from the search.
chicago_drop = best_chicago[min(best_chicago)]

X_train = chicago_train.drop(columns=chicago_drop)
X_test = chicago_test.drop(columns=chicago_drop)
y_train, y_test = chicago_train['time_seconds'], chicago_test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)

# Train score first, then the hold-out metrics, to compare the two.
print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.7665625196052492
R2: 0.8609236684421164
MAE: 422.2007785698438
MSE: 365224.986113847
RMSE: 604.3384698278334


In [40]:
# Feature names (column 0) beside the coefficients of the current `lr` (column 1).
pd.DataFrame([*zip(X_test.columns, lr.coef_)])

Unnamed: 0,0,1
0,age,441.528919
1,male,-1546.577626
2,temperature,40.873137
3,wind_speed,-65.509346
4,precipitation,-13862.1303
5,cloud_cover,18.653145
6,clear,898.705585


In [41]:
# Refit London on the column subset with the lowest hold-out RMSE from the search.
london_drop = best_london[min(best_london)]

X_train = london_train.drop(columns=london_drop)
X_test = london_test.drop(columns=london_drop)
y_train, y_test = london_train['time_seconds'], london_test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)

# Train score first, then the hold-out metrics, to compare the two.
print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.8059482928769438
R2: 0.8644876358484527
MAE: 469.22528923809233
MSE: 332541.77159580943
RMSE: 576.6643491631934


In [42]:
# Feature names (column 0) beside the coefficients of the current `lr` (column 1).
pd.DataFrame([*zip(X_test.columns, lr.coef_)])

Unnamed: 0,0,1
0,age,455.017239
1,male,-1920.404065
2,temperature,60.472343
3,cloud_cover,-8.440907
4,overcast,0.0
5,partially_cloudy,652.346717
6,event,0.0


In [43]:
# Refit NYC on the column subset with the lowest hold-out RMSE from the search.
nyc_drop = best_nyc[min(best_nyc)]

X_train = nyc_train.drop(columns=nyc_drop)
X_test = nyc_test.drop(columns=nyc_drop)
y_train, y_test = nyc_train['time_seconds'], nyc_test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)

# Train score first, then the hold-out metrics, to compare the two.
print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.9050579131212888
R2: 0.8872042500686859
MAE: 569.6177404271069
MSE: 436329.84910590004
RMSE: 660.5526845800417


In [44]:
# Feature names (column 0) beside the coefficients of the current `lr` (column 1).
pd.DataFrame([*zip(X_test.columns, lr.coef_)])

Unnamed: 0,0,1
0,age,759.082858
1,male,-1914.746188
2,wind_speed,10.614954
3,partially_cloudy,-201.655078


In [45]:
# Refit the stacked events on the column subset with the lowest hold-out RMSE.
combined_drop = best_combined[min(best_combined)]

X_train = combined_train.drop(columns=combined_drop)
X_test = combined_test.drop(columns=combined_drop)
y_train, y_test = combined_train['time_seconds'], combined_test['time_seconds']

lr = LinearRegression().fit(X_train, y_train)

# Train score first, then the hold-out metrics, to compare the two.
print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.7938377999954352
R2: 0.8692102114227132
MAE: 530.9511464469608
MSE: 474067.14187691564
RMSE: 688.5253385874158


In [46]:
# Feature names (column 0) beside the coefficients of the current `lr` (column 1).
pd.DataFrame([*zip(X_test.columns, lr.coef_)])

Unnamed: 0,0,1
0,age,531.390344
1,male,-1624.894221
2,temperature,48.063201
3,precipitation,1665.599937
4,overcast,119.521165
5,event,750.024306
