In [1]:
# imports
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
import itertools

In [2]:
# added once everything was completed to reduce scrolling for outputs due to convergence warnings
import warnings
warnings.filterwarnings("ignore")

## Functions

In [3]:
# functions
def num_age(x):
    '''make age categories numeric'''
    if x == '18-39':
        return 0
    elif x == '40-44':
        return 1
    elif x == '45-49':
        return 2
    elif x == '50-54':
        return 3
    elif x == '55-59':
        return 4
    elif x == '60-64':
        return 5
    elif x == '65-69':
        return 6
    return 7


def cat_clean(x):
    '''binary columns were floats, make them ints'''
    return int(x)


def prep(df):
    '''prepare dataframe for EDA'''
    df['age'] = df['age'].apply(lambda x: num_age(x))
    df['clear'] = df['clear'].apply(lambda x: cat_clean(x))
    df['overcast'] = df['overcast'].apply(lambda x: cat_clean(x))
    df['partially_cloudy'] = df['partially_cloudy'].apply(lambda x: cat_clean(x))
    df['rain'] = df['rain'].apply(lambda x: cat_clean(x))

    cols = ['year', 'age', 'male', 'time_seconds', 'minimum_temperature',
           'maximum_temperature', 'temperature', 'relative_humidity', 'wind_speed',
           'precipitation', 'precipitation_cover', 'cloud_cover', 'clear',
           'overcast', 'partially_cloudy', 'rain']

    for col in cols:
        df[col] = pd.to_numeric(df[col], downcast='unsigned')

    df.drop(columns=['maximum_temperature', 'minimum_temperature'], inplace=True)

    event_dict = {year: df[df['year'] == year] for year in df['year'].unique()}
    
    return event_dict


def usable(event_dict, num):
    '''specify which years are usable based on minimum number of participants
    returns dict with years that exceed minimum values are correspoding df'''
    can_use = []
    for year in event_dict.keys():
        if len(event_dict[year]) > num:
            can_use.append(year)
    return {year: event_dict[year] for year in can_use}


def top_n(event_dict, num):
    '''take in event dict and first num of finishers that are desired,
    return df of top n racers for all years, use in conjuction with usable()'''
    df = pd.DataFrame()
    for year in event_dict.keys():
        df = pd.concat([df, event_dict[year].sort_values(by='time_seconds').iloc[:num]], ignore_index=True)
    return df
        
    
def n_range(event_dict, low, high):
    '''take in dictionary of dfs from prep(), upper percentage as float, lower percentage as float
    return df'''
    df = pd.DataFrame()
    for year in event_dict.keys():
        df = pd.concat([df, event_dict[year].sort_values(by='time_seconds').iloc[int(len(event_dict[year]) * low): int(len(event_dict[year]) * high)]], ignore_index=True)
    return df

def model_scores(X_test, y_test, model):
    nl = "\n"
    y_preds = model.predict(X_test)
    return print(f'R2: {model.score(X_test, y_test)}{nl}MAE: {mean_absolute_error(y_test, y_preds)}{nl}MSE: {mean_squared_error(y_test, y_preds)}{nl}RMSE: {mean_squared_error(y_test, y_preds, squared=False)}')

def undo_age(x):
    '''make age categories their division again'''
    if x == 0.0:
        return '18-39'
    elif x == 1.0:
        return '40-44'
    elif x == 2.0:
        return '45-49'
    elif x == 3.0:
        return '50-54'
    elif x == 4.0:
        return '55-59'
    elif x == 5.0:
        return '60-64'
    elif x == 6.0:
        return '65-69'
    return '70+'

## All Participants

### Relative Ages only numeric conditions

In [4]:
london = pd.read_csv('./data/London_Data/Clean/Clean_London_Results_Weather.csv')
nyc = pd.read_csv('./data/NYC_Data/Clean/Clean_NYC_Results_Weather.csv')
boston = pd.read_csv('./data/Boston_Data/Clean/Clean_Boston_Results_Weather.csv')
berlin = pd.read_csv('./data/Berlin_Data/Clean/Clean_Berlin_Results_Weather.csv')
chicago = pd.read_csv('./data/Chicago_Data/Clean/Clean_Chicago_Results_Weather.csv')

london_dict = prep(london)
nyc_dict = prep(nyc)
boston_dict = prep(boston)
berlin_dict = prep(berlin)
chicago_dict = prep(chicago)

london = n_range(usable(london_dict, 9_900), 0, 1)
nyc = n_range(usable(nyc_dict, 9_900), 0, 1)
boston = n_range(usable(boston_dict, 9_900), 0, 1)
berlin = n_range(usable(berlin_dict, 9_900), 0, 1)
chicago = n_range(usable(chicago_dict, 9_900), 0, 1)


events = [boston, berlin, chicago, london, nyc]
count = 0
for event in events:
    event['event'] = count
    count += 1

combined = pd.concat(events, ignore_index=True)

aggregate = {col: 'mean' for col in london.drop(columns='year')}
    
london = london.groupby(['year', 'age', 'male']).agg(aggregate)
nyc = nyc.groupby(['year', 'age', 'male']).agg(aggregate)
boston = boston.groupby(['year', 'age', 'male']).agg(aggregate)
berlin = berlin.groupby(['year', 'age', 'male']).agg(aggregate)
chicago = chicago.groupby(['year', 'age', 'male']).agg(aggregate)

aggregate = {col: 'mean' for col in london}

combined = combined.groupby(['year','event', 'age', 'male']).agg(aggregate)
combined = combined.merge(pd.get_dummies(combined['event'], drop_first=True), left_index=True, right_index=True).drop(columns='event')
combined.columns = ['age', 'male','time_seconds', 'temperature', 'relative_humidity', 'wind_speed', 'precipitation', 'precipitation_cover', 'cloud_cover', 'clear', 'overcast', 
                   'partially_cloudy', 'rain', 'boston', 'chicago', 'london', 'nyc']
combined.drop(columns=['clear', 'overcast', 'partially_cloudy', 'rain'], inplace=True)


berlin['age'] = berlin['age'].apply(lambda x: undo_age(x))
berlin = berlin.merge(pd.get_dummies(berlin['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')
berlin.drop(columns=['clear', 'overcast', 'partially_cloudy', 'rain'], inplace=True)

boston['age'] = boston['age'].apply(lambda x: undo_age(x))
boston = boston.merge(pd.get_dummies(boston['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')
boston.drop(columns=['clear', 'overcast', 'partially_cloudy', 'rain'], inplace=True)

chicago['age'] = chicago['age'].apply(lambda x: undo_age(x))
chicago = chicago.merge(pd.get_dummies(chicago['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')
chicago.drop(columns=['clear', 'overcast', 'partially_cloudy', 'rain'], inplace=True)

london['age'] = london['age'].apply(lambda x: undo_age(x))
london = london.merge(pd.get_dummies(london['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')
london.drop(columns=['clear', 'overcast', 'partially_cloudy', 'rain'], inplace=True)

nyc['age'] = nyc['age'].apply(lambda x: undo_age(x))
nyc = nyc.merge(pd.get_dummies(nyc['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')
nyc.drop(columns=['clear', 'overcast', 'partially_cloudy', 'rain'], inplace=True)

combined['age'] = combined['age'].apply(lambda x: undo_age(x))
combined = combined.merge(pd.get_dummies(combined['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')

In [5]:
train = berlin[berlin.index.isin(range(2016), level=0)]
test = berlin[berlin.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.7401391292190441
MAE: 725.3419318377643
MSE: 617971.040729292
RMSE: 786.1113411783931


In [6]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1496.339
1,temperature,54.5064
2,relative_humidity,0.8244
3,wind_speed,-19.9604
4,precipitation,-0.0
5,precipitation_cover,0.0
6,cloud_cover,-0.0341
7,event,0.0
8,40-44,76.3013
9,45-49,277.4475


In [7]:
train = boston[boston.index.isin(range(2016), level=0)]
test = boston[boston.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.5940271677327711
MAE: 1019.9841052492193
MSE: 1365384.982840177
RMSE: 1168.4968903853262


In [8]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1436.3639
1,temperature,87.0489
2,relative_humidity,11.311
3,wind_speed,0.3845
4,precipitation,1975.3024
5,precipitation_cover,1.8445
6,cloud_cover,-14.7215
7,event,-0.0
8,40-44,244.9844
9,45-49,633.9419


In [9]:
train = chicago[chicago.index.isin(range(2016), level=0)]
test = chicago[chicago.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.27254617236911993
MAE: 1203.1327325819864
MSE: 1910348.8790569305
RMSE: 1382.153710358197


In [10]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1541.0973
1,temperature,43.5592
2,relative_humidity,-22.9555
3,wind_speed,-50.6158
4,precipitation,0.082
5,precipitation_cover,82.0163
6,cloud_cover,6.5565
7,event,-0.0
8,40-44,98.4425
9,45-49,432.8619


In [11]:
train = london[london.index.isin(range(2016), level=0)]
test = london[london.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.891867048647816
MAE: 424.71146533380187
MSE: 265353.81797579065
RMSE: 515.1250508136744


In [12]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1920.4041
1,temperature,28.3701
2,relative_humidity,-12.9644
3,wind_speed,42.8826
4,precipitation,-1300.711
5,precipitation_cover,13.9915
6,cloud_cover,-2.1418
7,event,-0.0
8,40-44,-369.7451
9,45-49,-148.8873


In [13]:
train = nyc[nyc.index.isin(range(2016), level=0)]
test = nyc[nyc.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.9633424136949514
MAE: 309.7327949029726
MSE: 141803.20722020295
RMSE: 376.56766619055725


In [14]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1914.7462
1,temperature,28.0
2,relative_humidity,1.4719
3,wind_speed,12.5439
4,precipitation,-0.0
5,precipitation_cover,0.0
6,cloud_cover,2.9354
7,event,0.0
8,40-44,134.9726
9,45-49,531.3897


In [15]:
train = combined[combined.index.isin(range(2016), level=0)]
test = combined[combined.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.8739059167982461
MAE: 545.4800298418432
MSE: 457046.85573156783
RMSE: 676.0524060541222


In [16]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1632.1046
1,temperature,49.5529
2,relative_humidity,-7.256
3,wind_speed,3.6119
4,precipitation,519.2366
5,precipitation_cover,1.7451
6,cloud_cover,2.2639
7,boston,102.3149
8,chicago,1282.3407
9,london,1990.9027


## Elastic Net

In [17]:
enparams = {
    'alpha': [.01, .1, 1, 10, 100, 1000],
    'max_iter': [1_000_000],
    'l1_ratio': np.linspace(0,1,100)
}

engs = GridSearchCV(
    estimator=ElasticNet(),
    param_grid=enparams
)

In [18]:
train = london[london.index.isin(range(2016), level=0)]
test = london[london.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

R2: 0.9005835316415086
MAE: 412.1469050926691
MSE: 243963.92698720377
RMSE: 493.9270462195847


In [19]:
engs.best_params_

{'alpha': 1, 'l1_ratio': 1.0, 'max_iter': 1000000}

In [20]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1916.404065
1,temperature,26.031918
2,relative_humidity,-13.002548
3,wind_speed,39.17506
4,precipitation,-0.0
5,precipitation_cover,1.692231
6,cloud_cover,-2.546304
7,event,0.0
8,40-44,-386.054914
9,45-49,-165.195089


In [21]:
train = nyc[nyc.index.isin(range(2016), level=0)]
test = nyc[nyc.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

R2: 0.9639687451126008
MAE: 305.55281149171276
MSE: 139380.35801604754
RMSE: 373.33678899359427


In [22]:
engs.best_params_

{'alpha': 0.01, 'l1_ratio': 0.9696969696969697, 'max_iter': 1000000}

In [23]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1912.389353
1,temperature,27.99935
2,relative_humidity,1.472011
3,wind_speed,12.543449
4,precipitation,0.0
5,precipitation_cover,0.0
6,cloud_cover,2.93538
7,event,0.0
8,40-44,96.954277
9,45-49,492.413699


In [24]:
train = boston[boston.index.isin(range(2016), level=0)]
test = boston[boston.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

R2: 0.5481230025612659
MAE: 1044.9660582663723
MSE: 1519771.8107098101
RMSE: 1232.7902541429382


In [25]:
engs.best_params_

{'alpha': 10, 'l1_ratio': 1.0, 'max_iter': 1000000}

In [26]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1395.757055
1,temperature,86.241669
2,relative_humidity,11.505064
3,wind_speed,9.264444
4,precipitation,0.0
5,precipitation_cover,5.295134
6,cloud_cover,-13.507521
7,event,0.0
8,40-44,-74.671994
9,45-49,154.994487


In [27]:
train = berlin[berlin.index.isin(range(2016), level=0)]
test = berlin[berlin.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

R2: 0.7401184511356553
MAE: 725.3402301300863
MSE: 618020.214953459
RMSE: 786.1426174387565


In [28]:
engs.best_params_

{'alpha': 0.01, 'l1_ratio': 1.0, 'max_iter': 1000000}

In [29]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1496.293465
1,temperature,54.506318
2,relative_humidity,0.824251
3,wind_speed,-19.960231
4,precipitation,0.0
5,precipitation_cover,0.0
6,cloud_cover,-0.034044
7,event,0.0
8,40-44,75.668425
9,45-49,276.815035


In [30]:
train = chicago[chicago.index.isin(range(2016), level=0)]
test = chicago[chicago.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

R2: 0.27259487899101875
MAE: 1203.0928554134834
MSE: 1910220.971776753
RMSE: 1382.107438579488


In [31]:
engs.best_params_

{'alpha': 0.01, 'l1_ratio': 1.0, 'max_iter': 1000000}

In [32]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1541.056142
1,temperature,43.558914
2,relative_humidity,-22.954928
3,wind_speed,-50.614326
4,precipitation,0.0
5,precipitation_cover,82.011512
6,cloud_cover,6.556652
7,event,0.0
8,40-44,97.800669
9,45-49,432.220294


In [33]:
enparams = {
    'alpha': [.01, .1, 1, 10, 100, 1000],
    'max_iter': [100_000],
    'l1_ratio': np.linspace(0,1,100)
}

engs = GridSearchCV(
    estimator=ElasticNet(),
    param_grid=enparams
)

train = combined[combined.index.isin(range(2016), level=0)]
test = combined[combined.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

R2: 0.8541765712122814
MAE: 579.8398649889872
MSE: 528558.8183609232
RMSE: 727.0205075243223


In [34]:
engs.best_estimator_

ElasticNet(alpha=1, l1_ratio=1.0, max_iter=100000)

In [35]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1627.956568
1,temperature,49.479001
2,relative_humidity,-6.740398
3,wind_speed,3.449685
4,precipitation,0.0
5,precipitation_cover,2.605448
6,cloud_cover,2.002913
7,boston,48.344241
8,chicago,1242.336494
9,london,1948.715728


## all

In [36]:
london = pd.read_csv('./data/London_Data/Clean/Clean_London_Results_Weather.csv')
nyc = pd.read_csv('./data/NYC_Data/Clean/Clean_NYC_Results_Weather.csv')
boston = pd.read_csv('./data/Boston_Data/Clean/Clean_Boston_Results_Weather.csv')
berlin = pd.read_csv('./data/Berlin_Data/Clean/Clean_Berlin_Results_Weather.csv')
chicago = pd.read_csv('./data/Chicago_Data/Clean/Clean_Chicago_Results_Weather.csv')

london_dict = prep(london)
nyc_dict = prep(nyc)
boston_dict = prep(boston)
berlin_dict = prep(berlin)
chicago_dict = prep(chicago)

london = n_range(usable(london_dict, 9_900), 0, 1)
nyc = n_range(usable(nyc_dict, 9_900), 0, 1)
boston = n_range(usable(boston_dict, 9_900), 0, 1)
berlin = n_range(usable(berlin_dict, 9_900), 0, 1)
chicago = n_range(usable(chicago_dict, 9_900), 0, 1)

events = [boston, berlin, chicago, london, nyc]
count = 0
for event in events:
    event['event'] = count
    count += 1

combined = pd.concat(events, ignore_index=True)

aggregate = {col: 'mean' for col in london.drop(columns='year')}
    
london = london.groupby(['year', 'age', 'male']).agg(aggregate)
nyc = nyc.groupby(['year', 'age', 'male']).agg(aggregate)
boston = boston.groupby(['year', 'age', 'male']).agg(aggregate)
berlin = berlin.groupby(['year', 'age', 'male']).agg(aggregate)
chicago = chicago.groupby(['year', 'age', 'male']).agg(aggregate)

aggregate = {col: 'mean' for col in london}

combined = combined.groupby(['year','event', 'age', 'male']).agg(aggregate)
combined = combined.merge(pd.get_dummies(combined['event'], drop_first=True), left_index=True, right_index=True).drop(columns='event')
combined.columns = ['age', 'male','time_seconds', 'temperature', 'relative_humidity', 'wind_speed', 'precipitation', 'precipitation_cover', 'cloud_cover', 'clear', 'overcast', 
                   'partially_cloudy', 'rain', 'boston', 'chicago', 'london', 'nyc']

berlin['age'] = berlin['age'].apply(lambda x: undo_age(x))
berlin = berlin.merge(pd.get_dummies(berlin['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')

boston['age'] = boston['age'].apply(lambda x: undo_age(x))
boston = boston.merge(pd.get_dummies(boston['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')

chicago['age'] = chicago['age'].apply(lambda x: undo_age(x))
chicago = chicago.merge(pd.get_dummies(chicago['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')

london['age'] = london['age'].apply(lambda x: undo_age(x))
london = london.merge(pd.get_dummies(london['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')

nyc['age'] = nyc['age'].apply(lambda x: undo_age(x))
nyc = nyc.merge(pd.get_dummies(nyc['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')

combined['age'] = combined['age'].apply(lambda x: undo_age(x))
combined = combined.merge(pd.get_dummies(combined['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')
combined.columns = ['male','time_seconds', 'temperature', 'relative_humidity', 'wind_speed', 'precipitation', 'precipitation_cover', 'cloud_cover', 'clear', 'overcast', 
                   'partially_cloudy', 'rain', 'boston', 'chicago', 'london', 'nyc', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70+']

In [37]:
train = berlin[berlin.index.isin(range(2000,2016), level=0)]
test = berlin[berlin.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.927331508024147
MAE: 334.0957839064749
MSE: 172811.79532565898
RMSE: 415.7063811461871


In [38]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1480.0387
1,temperature,24.0552
2,relative_humidity,-6.569
3,wind_speed,7.4267
4,precipitation,-0.0
5,precipitation_cover,0.0
6,cloud_cover,-5.133
7,overcast,525.2323
8,partially_cloudy,41.4661
9,rain,-0.0


In [39]:
train = boston[boston.index.isin(range(2000,2016), level=0)]
test = boston[boston.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.468662892360622
MAE: 1097.2488716462176
MSE: 1787015.4107233442
RMSE: 1336.792957313639


In [40]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1435.9908
1,temperature,87.1349
2,relative_humidity,4.7512
3,wind_speed,28.9957
4,precipitation,230.7458
5,precipitation_cover,-3.2355
6,cloud_cover,-26.7305
7,overcast,339.6197
8,partially_cloudy,-339.6197
9,rain,632.792


In [41]:
train = chicago[chicago.index.isin(range(2000,2016), level=0)]
test = chicago[chicago.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.8106573523866958
MAE: 540.0185298998327
MSE: 497228.141342991
RMSE: 705.1440571564019


In [42]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1489.6779
1,temperature,44.9011
2,relative_humidity,-8.3388
3,wind_speed,-0.3737
4,precipitation,-0.0344
5,precipitation_cover,-34.3579
6,cloud_cover,11.7818
7,overcast,268.5146
8,partially_cloudy,-110.9353
9,rain,-3.4358


In [43]:
train = london[london.index.isin(range(2000,2016), level=0)]
test = london[london.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.8777088517901251
MAE: 439.9203121143426
MSE: 300097.45111315825
RMSE: 547.8115105701579


In [44]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1920.4041
1,temperature,35.4165
2,relative_humidity,-8.9395
3,wind_speed,33.4177
4,precipitation,8471.3516
5,precipitation_cover,9.1585
6,cloud_cover,-14.4042
7,overcast,-0.0
8,partially_cloudy,791.336
9,rain,-1149.8421


In [45]:
train = nyc[nyc.index.isin(range(2000,2016), level=0)]
test = nyc[nyc.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.9724582604176039
MAE: 265.8629681994924
MSE: 106540.21169608526
RMSE: 326.40498111408357


In [46]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1914.7462
1,temperature,35.5278
2,relative_humidity,-4.3565
3,wind_speed,4.7877
4,precipitation,0.0
5,precipitation_cover,0.0
6,cloud_cover,14.4862
7,overcast,-924.9209
8,partially_cloudy,-597.1917
9,rain,0.0


In [47]:
train = combined[combined.index.isin(range(2000,2016), level=0)]
test = combined[combined.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.827700749662807
MAE: 556.1675860042683
MSE: 624524.3917235996
RMSE: 790.2685567094262


In [48]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1647.2265
1,temperature,49.9369
2,relative_humidity,-8.0156
3,wind_speed,19.1728
4,precipitation,-806.7048
5,precipitation_cover,0.319
6,cloud_cover,2.2252
7,overcast,169.3861
8,partially_cloudy,112.2735
9,rain,171.7319


## Elastic Net

In [49]:
enparams = {
    'alpha': [.01, .1, 1, 10, 100, 1000],
    'max_iter': [100_000],
    'l1_ratio': np.linspace(0,1,100)
}

engs = GridSearchCV(
    estimator=ElasticNet(),
    param_grid=enparams
)

In [50]:
train = london[london.index.isin(range(2000,2016), level=0)]
test = london[london.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

R2: 0.9102733768598846
MAE: 399.7680868924779
MSE: 220185.44510783546
RMSE: 469.23921949026754


In [51]:
engs.best_params_

{'alpha': 10, 'l1_ratio': 1.0, 'max_iter': 100000}

In [52]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1880.404065
1,temperature,26.939149
2,relative_humidity,-12.460085
3,wind_speed,38.303857
4,precipitation,-0.0
5,precipitation_cover,-0.0
6,cloud_cover,-6.422871
7,overcast,0.0
8,partially_cloudy,228.53754
9,rain,-0.0


In [53]:
train = nyc[nyc.index.isin(range(2000,2016), level=0)]
test = nyc[nyc.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

R2: 0.9738438910890641
MAE: 256.4421396183765
MSE: 101180.15139095034
RMSE: 318.0882760979259


In [54]:
engs.best_params_

{'alpha': 0.1, 'l1_ratio': 0.9797979797979799, 'max_iter': 100000}

In [55]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1899.008744
1,temperature,34.036573
2,relative_humidity,-3.380317
3,wind_speed,6.176529
4,precipitation,0.0
5,precipitation_cover,0.0
6,cloud_cover,12.739158
7,overcast,-783.347042
8,partially_cloudy,-518.0935
9,rain,0.0


In [76]:
train = boston[boston.index.isin(range(2000,2016), level=0)]
test = boston[boston.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

R2: 0.5259582822616911
MAE: 1048.3017902809477
MSE: 1594317.1345357446
RMSE: 1262.6627160630603


In [71]:
engs.best_params_

{'alpha': 10, 'l1_ratio': 1.0, 'max_iter': 10000}

In [72]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1427.71891
1,temperature,91.549097
2,relative_humidity,11.404519
3,wind_speed,32.194038
4,precipitation,0.0
5,precipitation_cover,-1.229207
6,cloud_cover,-15.737657
7,overcast,433.191762
8,partially_cloudy,-0.0
9,rain,0.0


In [59]:
train = berlin[berlin.index.isin(range(2000,2016), level=0)]
test = berlin[berlin.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

R2: 0.9278204410434209
MAE: 336.2702646803226
MSE: 171649.07141936422
RMSE: 414.3055290716794


In [60]:
engs.best_params_

{'alpha': 1, 'l1_ratio': 1.0, 'max_iter': 100000}

In [61]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1476.038744
1,temperature,23.323511
2,relative_humidity,-7.000157
3,wind_speed,6.993451
4,precipitation,0.0
5,precipitation_cover,0.0
6,cloud_cover,-4.24838
7,overcast,451.493705
8,partially_cloudy,0.269481
9,rain,0.0


In [62]:
train = chicago[chicago.index.isin(range(2000, 2016), level=0)]
test = chicago[chicago.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

R2: 0.8084882485714056
MAE: 522.7335697460418
MSE: 502924.3723403492
RMSE: 709.1716099367975


In [63]:
engs.best_params_

{'alpha': 10, 'l1_ratio': 1.0, 'max_iter': 100000}

In [64]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1449.337889
1,temperature,42.969215
2,relative_humidity,-8.483604
3,wind_speed,-2.614883
4,precipitation,-0.0
5,precipitation_cover,-20.02144
6,cloud_cover,12.953239
7,overcast,0.0
8,partially_cloudy,-143.204268
9,rain,-0.0


In [65]:
enparams = {
    'alpha': [.01, .1, 1, 10, 100, 1000],
    'max_iter': [10_000],
    'l1_ratio': np.linspace(0,1,100)
}

engs = GridSearchCV(
    estimator=ElasticNet(),
    param_grid=enparams
)

train = combined[combined.index.isin(range(2000,2016), level=0)]
test = combined[combined.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

R2: 0.8779218444548313
MAE: 493.33414383353625
MSE: 442490.52555586136
RMSE: 665.1996133160792


In [66]:
engs.best_estimator_

ElasticNet(alpha=1, l1_ratio=1.0, max_iter=10000)

In [67]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1643.492504
1,temperature,49.242097
2,relative_humidity,-7.43164
3,wind_speed,16.626574
4,precipitation,-0.0
5,precipitation_cover,-0.906497
6,cloud_cover,3.212562
7,overcast,47.523017
8,partially_cloudy,48.918958
9,rain,87.355454


In [68]:
boston

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,male,time_seconds,temperature,relative_humidity,wind_speed,precipitation,precipitation_cover,cloud_cover,clear,overcast,partially_cloudy,rain,event,40-44,45-49,50-54,55-59,60-64,65-69,70+
year,age,male,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2001,0,0,0.0,14616.932861,46.0,63.25,14.6,0.00,0.0,89.3,0.0,1.0,0.0,0.0,0.0,0,0,0,0,0,0,0
2001,0,1,1.0,13143.235949,46.0,63.25,14.6,0.00,0.0,89.3,0.0,1.0,0.0,0.0,0.0,0,0,0,0,0,0,0
2001,1,0,0.0,14944.315348,46.0,63.25,14.6,0.00,0.0,89.3,0.0,1.0,0.0,0.0,0.0,1,0,0,0,0,0,0
2001,1,1,1.0,13310.075263,46.0,63.25,14.6,0.00,0.0,89.3,0.0,1.0,0.0,0.0,0.0,1,0,0,0,0,0,0
2001,2,0,0.0,15154.285047,46.0,63.25,14.6,0.00,0.0,89.3,0.0,1.0,0.0,0.0,0.0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,5,1,1.0,16035.063477,42.8,99.11,29.5,1.16,100.0,100.0,0.0,1.0,0.0,1.0,0.0,0,0,0,0,1,0,0
2018,6,0,0.0,19645.315315,42.8,99.11,29.5,1.16,100.0,100.0,0.0,1.0,0.0,1.0,0.0,0,0,0,0,0,1,0
2018,6,1,1.0,17500.628440,42.8,99.11,29.5,1.16,100.0,100.0,0.0,1.0,0.0,1.0,0.0,0,0,0,0,0,1,0
2018,7,0,0.0,20704.861111,42.8,99.11,29.5,1.16,100.0,100.0,0.0,1.0,0.0,1.0,0.0,0,0,0,0,0,0,1
