In [51]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

In [2]:
# functions
def num_age(x):
    '''make age categories numeric'''
    if x == '18-39':
        return 0
    elif x == '40-44':
        return 1
    elif x == '45-49':
        return 2
    elif x == '50-54':
        return 3
    elif x == '55-59':
        return 4
    elif x == '60-64':
        return 5
    elif x == '65-69':
        return 6
    return 7


def cat_clean(x):
    '''binary columns were floats, make them ints'''
    return int(x)


def prep(df):
    '''prepare dataframe for EDA'''
    df['age'] = df['age'].apply(lambda x: num_age(x))
    df['clear'] = df['clear'].apply(lambda x: cat_clean(x))
    df['overcast'] = df['overcast'].apply(lambda x: cat_clean(x))
    df['partially_cloudy'] = df['partially_cloudy'].apply(lambda x: cat_clean(x))
    df['rain'] = df['rain'].apply(lambda x: cat_clean(x))

    cols = ['year', 'age', 'male', 'time_seconds', 'minimum_temperature',
           'maximum_temperature', 'temperature', 'relative_humidity', 'wind_speed',
           'precipitation', 'precipitation_cover', 'cloud_cover', 'clear',
           'overcast', 'partially_cloudy', 'rain']

    for col in cols:
        df[col] = pd.to_numeric(df[col], downcast='unsigned')

    df.drop(columns=['maximum_temperature', 'minimum_temperature'], inplace=True)

    event_dict = {year: df[df['year'] == year] for year in df['year'].unique()}
    
    return event_dict


def usable(event_dict, num):
    '''specify which years are usable based on minimum number of participants
    returns dict with years that exceed minimum values are correspoding df'''
    can_use = []
    for year in event_dict.keys():
        if len(event_dict[year]) > num:
            can_use.append(year)
    return {year: event_dict[year] for year in can_use}


def top_n(event_dict, num):
    '''take in event dict and first num of finishers that are desired,
    return df of top n racers for all years, use in conjuction with usable()'''
    df = pd.DataFrame()
    for year in event_dict.keys():
        df = pd.concat([df, event_dict[year].sort_values(by='time_seconds').iloc[:num]], ignore_index=True)
    return df
        
    
def n_range(event_dict, low, high):
    '''take in dictionary of dfs from prep(), upper percentage as float, lower percentage as float
    return df'''
    df = pd.DataFrame()
    for year in event_dict.keys():
        df = pd.concat([df, event_dict[year].sort_values(by='time_seconds').iloc[int(len(event_dict[year]) * low): int(len(event_dict[year]) * high)]], ignore_index=True)
    return df

def model_scores(X_test, y_test, model):
    nl = "\n"
    y_preds = model.predict(X_test)
    return print(f'R2: {model.score(X_test, y_test)}{nl}MAE: {mean_absolute_error(y_test, y_preds)}{nl}MSE: {mean_squared_error(y_test, y_preds)}{nl}RMSE: {mean_squared_error(y_test, y_preds, squared=False)}')

def undo_age(x):
    '''make age categories their division again'''
    if x == 0.0:
        return '18-39'
    elif x == 1.0:
        return '40-44'
    elif x == 2.0:
        return '45-49'
    elif x == 3.0:
        return '50-54'
    elif x == 4.0:
        return '55-59'
    elif x == 5.0:
        return '60-64'
    elif x == 6.0:
        return '65-69'
    return '70+'

In [3]:
london = pd.read_csv('./data/London_Data/Clean/Clean_London_Results_Weather.csv')
nyc = pd.read_csv('./data/NYC_Data/Clean/Clean_NYC_Results_Weather.csv')
boston = pd.read_csv('./data/Boston_Data/Clean/Clean_Boston_Results_Weather.csv')
berlin = pd.read_csv('./data/Berlin_Data/Clean/Clean_Berlin_Results_Weather.csv')
chicago = pd.read_csv('./data/Chicago_Data/Clean/Clean_Chicago_Results_Weather.csv')

london_dict = prep(london)
nyc_dict = prep(nyc)
boston_dict = prep(boston)
berlin_dict = prep(berlin)
chicago_dict = prep(chicago)

london = n_range(usable(london_dict, 9_900), 0, 1)
nyc = n_range(usable(nyc_dict, 9_900), 0, 1)
boston = n_range(usable(boston_dict, 9_900), 0, 1)
berlin = n_range(usable(berlin_dict, 9_900), 0, 1)
chicago = n_range(usable(chicago_dict, 9_900), 0, 1)


events = [boston, berlin, chicago, london, nyc]
count = 0
for event in events:
    event['event'] = count
    count += 1

combined = pd.concat(events, ignore_index=True)

aggregate = {col: 'mean' for col in london.drop(columns='year')}
    
london = london.groupby(['year', 'age', 'male']).agg(aggregate)
nyc = nyc.groupby(['year', 'age', 'male']).agg(aggregate)
boston = boston.groupby(['year', 'age', 'male']).agg(aggregate)
berlin = berlin.groupby(['year', 'age', 'male']).agg(aggregate)
chicago = chicago.groupby(['year', 'age', 'male']).agg(aggregate)

aggregate = {col: 'mean' for col in london}

combined = combined.groupby(['year','event', 'age', 'male']).agg(aggregate)
combined = combined.merge(pd.get_dummies(combined['event'], drop_first=True), left_index=True, right_index=True).drop(columns='event')
combined.columns = ['age', 'male','time_seconds', 'temperature', 'relative_humidity', 'wind_speed', 'precipitation', 'precipitation_cover', 'cloud_cover', 'clear', 'overcast', 
                   'partially_cloudy', 'rain', 'boston', 'chicago', 'london', 'nyc']

In [4]:
age_18 = pd.DataFrame(combined[combined['age'] == 0.0])
age_40 = pd.DataFrame(combined[combined['age'] == 1.0])
age_45 = pd.DataFrame(combined[combined['age'] == 2.0])
age_50 = pd.DataFrame(combined[combined['age'] == 3.0])
age_55 = pd.DataFrame(combined[combined['age'] == 4.0])
age_60 = pd.DataFrame(combined[combined['age'] == 5.0])
age_65 = pd.DataFrame(combined[combined['age'] == 6.0])
age_70 = pd.DataFrame(combined[combined['age'] == 7.0])

## Linear Regression

In [5]:
age_18.columns

Index(['age', 'male', 'time_seconds', 'temperature', 'relative_humidity',
       'wind_speed', 'precipitation', 'precipitation_cover', 'cloud_cover',
       'clear', 'overcast', 'partially_cloudy', 'rain', 'boston', 'chicago',
       'london', 'nyc'],
      dtype='object')

In [6]:
lr = LinearRegression()

In [7]:
train = age_18[age_18.index.isin(range(2000,2016), level=0)]
test = age_18[age_18.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr.fit(X_train, y_train)
print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.9282804148289096
R2: 0.925147071661293
MAE: 307.21163287042856
MSE: 148538.92152066948
RMSE: 385.4074746559406


In [8]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,0.0
1,male,-1607.634637
2,temperature,44.416947
3,relative_humidity,-3.867294
4,wind_speed,9.709133
5,precipitation,176.079878
6,precipitation_cover,-2.220309
7,cloud_cover,3.581985
8,boston,1158.802149
9,chicago,1929.782174


In [9]:
train = age_40[age_40.index.isin(range(2000,2016), level=0)]
test = age_40[age_40.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr.fit(X_train, y_train)
print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.930478707799073
R2: 0.8957594087938225
MAE: 362.8255133563218
MSE: 196728.33959076327
RMSE: 443.5406853838363


In [10]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,0.0
1,male,-1656.231489
2,temperature,46.846907
3,relative_humidity,-4.040626
4,wind_speed,12.432888
5,precipitation,109.125973
6,precipitation_cover,-1.092676
7,cloud_cover,4.650219
8,boston,998.359772
9,chicago,1797.143482


In [11]:
train = age_45[age_45.index.isin(range(2000,2016), level=0)]
test = age_45[age_45.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr.fit(X_train, y_train)
print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.928455143058903
R2: 0.893250706633198
MAE: 357.46576508128663
MSE: 189298.45300425275
RMSE: 435.08442054876286


In [12]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,0.0
1,male,-1772.360271
2,temperature,46.83142
3,relative_humidity,-4.317614
4,wind_speed,16.842376
5,precipitation,215.086489
6,precipitation_cover,-2.375684
7,cloud_cover,5.197874
8,boston,903.89665
9,chicago,1751.312442


In [13]:
train = age_50[age_50.index.isin(range(2000,2016), level=0)]
test = age_50[age_50.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr.fit(X_train, y_train)
print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.9142827175751065
R2: 0.8901223025630061
MAE: 316.54271741079634
MSE: 179398.30707215093
RMSE: 423.5543732180686


In [14]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,0.0
1,male,-1863.15894
2,temperature,48.430303
3,relative_humidity,-4.75001
4,wind_speed,19.427932
5,precipitation,236.194668
6,precipitation_cover,-4.032234
7,cloud_cover,5.585673
8,boston,823.759801
9,chicago,1752.670667


In [15]:
train = age_55[age_55.index.isin(range(2000,2016), level=0)]
test = age_55[age_55.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr.fit(X_train, y_train)
print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.8971062408941027
R2: 0.8531390408382961
MAE: 377.3847832176296
MSE: 237377.0583309819
RMSE: 487.21356542175823


In [16]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,0.0
1,male,-1789.441558
2,temperature,49.32511
3,relative_humidity,-5.674693
4,wind_speed,18.366868
5,precipitation,427.837384
6,precipitation_cover,-4.540464
7,cloud_cover,4.512966
8,boston,628.29667
9,chicago,1633.57035


In [17]:
train = age_60[age_60.index.isin(range(2000,2016), level=0)]
test = age_60[age_60.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr.fit(X_train, y_train)
print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.8719906588290356
R2: 0.7920113479319728
MAE: 459.46343186114115
MSE: 292682.73950588674
RMSE: 541.0016076740316


In [18]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,0.0
1,male,-1682.707933
2,temperature,50.664916
3,relative_humidity,-9.419866
4,wind_speed,22.379501
5,precipitation,543.56534
6,precipitation_cover,0.563021
7,cloud_cover,4.388978
8,boston,623.959062
9,chicago,1568.681821


In [19]:
train = age_65[age_65.index.isin(range(2000,2016), level=0)]
test = age_65[age_65.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr.fit(X_train, y_train)
print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.8026360651070458
R2: 0.7372236743278597
MAE: 504.4826008582353
MSE: 414855.73004706856
RMSE: 644.092951403032


In [20]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,0.0
1,male,-1522.480224
2,temperature,52.446155
3,relative_humidity,-10.337802
4,wind_speed,10.775072
5,precipitation,350.707063
6,precipitation_cover,5.678976
7,cloud_cover,2.905454
8,boston,330.630514
9,chicago,1241.292185


In [23]:
train = age_70[age_70.index.isin(range(2000,2016), level=0)]
test = age_70[age_70.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr.fit(X_train, y_train)
print(f'Train R2: {lr.score(X_train, y_train)}')
model_scores(X_test, y_test, lr)

Train R2: 0.6627485900917435
R2: -0.6864428325867005
MAE: 966.6536875761104
MSE: 3178989.570019767
RMSE: 1782.9721170056944


In [22]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,0.0
1,male,-1275.355785
2,temperature,58.068619
3,relative_humidity,-16.334654
4,wind_speed,32.920975
5,precipitation,-3355.459909
6,precipitation_cover,16.635216
7,cloud_cover,3.215925
8,boston,886.834453
9,chicago,749.773341


In [24]:
age_70

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,age,male,time_seconds,temperature,relative_humidity,wind_speed,precipitation,precipitation_cover,cloud_cover,clear,overcast,partially_cloudy,rain,boston,chicago,london,nyc
year,event,age,male,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1986,1,7,1,7.0,1.0,15796.666667,56.5,71.72,8.1,0.00,0.0,44.5,0.0,0.0,1.0,0.0,1,0,0,0
1987,1,7,1,7.0,1.0,16250.000000,54.3,57.31,15.7,0.00,0.0,1.7,1.0,0.0,0.0,0.0,1,0,0,0
1988,1,7,1,7.0,1.0,15497.142857,52.6,83.32,16.5,0.00,0.0,87.8,0.0,1.0,0.0,0.0,1,0,0,0
1989,1,7,1,7.0,1.0,16003.636364,53.7,68.52,10.2,0.00,0.0,62.5,0.0,0.0,1.0,0.0,1,0,0,0
1990,1,7,1,7.0,1.0,15043.636364,60.0,84.14,8.2,0.00,0.0,92.3,0.0,1.0,0.0,0.0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,3,7,0,7.0,0.0,20113.208333,69.7,58.77,15.0,0.00,0.0,0.1,1.0,0.0,0.0,0.0,0,0,1,0
2018,3,7,1,7.0,1.0,19904.152941,69.7,58.77,15.0,0.00,0.0,0.1,1.0,0.0,0.0,0.0,0,0,1,0
2018,4,7,0,7.0,0.0,22764.760563,51.4,44.59,9.9,0.00,0.0,19.9,1.0,0.0,0.0,0.0,0,0,0,1
2018,4,7,1,7.0,1.0,21424.022654,51.4,44.59,9.9,0.00,0.0,19.9,1.0,0.0,0.0,0.0,0,0,0,1


## Elastic Net

In [25]:
enparams = {
    'alpha': [.01, .1, 1, 10, 100, 1000],
    'max_iter': [100_000],
    'l1_ratio': np.linspace(0,1,100)
}

engs = GridSearchCV(
    estimator=ElasticNet(),
    param_grid=enparams
)

In [26]:
train = age_18[age_18.index.isin(range(2000,2016), level=0)]
test = age_18[age_18.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.9145581343517478
MAE: 327.0311367519388
MSE: 169551.7177187371
RMSE: 411.7665815953707


In [27]:
engs.best_params_

{'alpha': 0.01, 'l1_ratio': 0.8484848484848485, 'max_iter': 100000}

In [28]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,0.0
1,male,-1598.187146
2,temperature,43.761112
3,relative_humidity,-2.920922
4,wind_speed,7.347642
5,precipitation,109.702
6,precipitation_cover,-3.085065
7,cloud_cover,2.72949
8,boston,1018.332925
9,chicago,1820.18312


In [29]:
train = age_40[age_40.index.isin(range(2000,2016), level=0)]
test = age_40[age_40.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.8862063223993926
MAE: 378.3888480499251
MSE: 214757.4279007684
RMSE: 463.4192787323035


In [30]:
engs.best_params_

{'alpha': 0.1, 'l1_ratio': 0.98989898989899, 'max_iter': 100000}

In [31]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,0.0
1,male,-1649.33723
2,temperature,46.434243
3,relative_humidity,-3.454041
4,wind_speed,11.090324
5,precipitation,27.965683
6,precipitation_cover,-1.557083
7,cloud_cover,4.110406
8,boston,910.391775
9,chicago,1727.814293


In [32]:
train = age_45[age_45.index.isin(range(2000,2016), level=0)]
test = age_45[age_45.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.8782979761912431
MAE: 374.9154348123888
MSE: 215814.12024268304
RMSE: 464.55798372504916


In [33]:
engs.best_params_

{'alpha': 1, 'l1_ratio': 1.0, 'max_iter': 100000}

In [34]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,0.0
1,male,-1768.436125
2,temperature,46.662688
3,relative_humidity,-3.866304
4,wind_speed,16.34073
5,precipitation,0.0
6,precipitation_cover,-2.216218
7,cloud_cover,4.892463
8,boston,853.027756
9,chicago,1714.983071


In [35]:
train = age_50[age_50.index.isin(range(2000,2016), level=0)]
test = age_50[age_50.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.8711859959752235
MAE: 333.49844419884516
MSE: 210315.78553492454
RMSE: 458.601990330313


In [36]:
engs.best_params_

{'alpha': 1, 'l1_ratio': 1.0, 'max_iter': 100000}

In [37]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,0.0
1,male,-1859.23312
2,temperature,48.263986
3,relative_humidity,-4.28942
4,wind_speed,18.95927
5,precipitation,0.0
6,precipitation_cover,-3.820247
7,cloud_cover,5.279385
8,boston,772.908142
9,chicago,1716.470618


In [38]:
train = age_55[age_55.index.isin(range(2000,2016), level=0)]
test = age_55[age_55.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.8168318401800619
MAE: 408.21354956128124
MSE: 296061.7934551395
RMSE: 544.1156067005793


In [39]:
engs.best_params_

{'alpha': 1, 'l1_ratio': 1.0, 'max_iter': 100000}

In [40]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,0.0
1,male,-1785.500549
2,temperature,49.180718
3,relative_humidity,-5.129851
4,wind_speed,18.197672
5,precipitation,0.0
6,precipitation_cover,-3.851633
7,cloud_cover,4.198723
8,boston,577.60146
9,chicago,1598.544417


In [41]:
train = age_60[age_60.index.isin(range(2000,2016), level=0)]
test = age_60[age_60.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.7371095756256187
MAE: 498.356579673593
MSE: 369940.80605221196
RMSE: 608.2275939582255


In [42]:
engs.best_params_

{'alpha': 1, 'l1_ratio': 1.0, 'max_iter': 100000}

In [43]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,0.0
1,male,-1678.757453
2,temperature,50.529225
3,relative_humidity,-8.813697
4,wind_speed,22.404904
5,precipitation,0.0
6,precipitation_cover,1.508738
7,cloud_cover,4.06852
8,boston,573.024362
9,chicago,1534.263886


In [44]:
train = age_65[age_65.index.isin(range(2000,2016), level=0)]
test = age_65[age_65.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.6926725821439453
MAE: 530.2511874668633
MSE: 485190.36093543837
RMSE: 696.5560716377672


In [45]:
engs.best_params_

{'alpha': 1, 'l1_ratio': 1.0, 'max_iter': 100000}

In [46]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,0.0
1,male,-1518.545025
2,temperature,52.288407
3,relative_humidity,-9.816443
4,wind_speed,10.499117
5,precipitation,0.0
6,precipitation_cover,6.14484
7,cloud_cover,2.593014
8,boston,279.539836
9,chicago,1205.693385


In [47]:
train = age_70[age_70.index.isin(range(2000,2016), level=0)]
test = age_70[age_70.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.41687626837946445
MAE: 724.7635674014018
MSE: 1099203.735242764
RMSE: 1048.4291751199812


In [48]:
engs.best_params_

{'alpha': 0.1, 'l1_ratio': 0.9494949494949496, 'max_iter': 100000}

In [49]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,0.0
1,male,-1255.842424
2,temperature,57.832593
3,relative_humidity,-17.125122
4,wind_speed,27.683323
5,precipitation,-288.391032
6,precipitation_cover,5.251257
7,cloud_cover,-5.000034
8,overcast,718.430367
9,partially_cloudy,234.584987


## XGBoost

In [52]:
boost = XGBRegressor()

xparams = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:squarederror'],
              'learning_rate': [.03, 0.05, .07, .09, .11], #so called `eta` value
              'max_depth': [3, 4, 5, 6],
              'min_child_weight': [3, 4, 5],
              'subsample': [.5, .7, 1],
              'colsample_bytree': [.5, .7, 1],
              'n_estimators': [100, 200, 300, 400, 500]}

xgs = GridSearchCV(
    boost,
    xparams,
    cv=2,
    verbose=True
)

In [53]:
train = age_18[age_18.index.isin(range(2000,2016), level=0)]
test = age_18[age_18.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

xgs.fit(X_train, y_train)
model_scores(X_test, y_test, xgs)

Fitting 2 folds for each of 2700 candidates, totalling 5400 fits
R2: 0.7768073260440792
MAE: 546.3797140983532
MSE: 442905.8397115948
RMSE: 665.5117126779925


In [54]:
xgs.best_params_

{'colsample_bytree': 0.5,
 'learning_rate': 0.07,
 'max_depth': 3,
 'min_child_weight': 5,
 'n_estimators': 500,
 'nthread': 4,
 'objective': 'reg:squarederror',
 'subsample': 1}

In [55]:
train = age_40[age_40.index.isin(range(2000,2016), level=0)]
test = age_40[age_40.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

xgs.fit(X_train, y_train)
model_scores(X_test, y_test, xgs)

Fitting 2 folds for each of 2700 candidates, totalling 5400 fits
R2: 0.7519310819243563
MAE: 583.8536245691464
MSE: 468168.741105589
RMSE: 684.228573727807


In [56]:
xgs.best_params_

{'colsample_bytree': 0.7,
 'learning_rate': 0.05,
 'max_depth': 4,
 'min_child_weight': 5,
 'n_estimators': 200,
 'nthread': 4,
 'objective': 'reg:squarederror',
 'subsample': 1}

In [57]:
train = age_45[age_45.index.isin(range(2000,2016), level=0)]
test = age_45[age_45.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

xgs.fit(X_train, y_train)
model_scores(X_test, y_test, xgs)

Fitting 2 folds for each of 2700 candidates, totalling 5400 fits
R2: 0.7548034557734152
MAE: 557.216785334193
MSE: 434806.8735648989
RMSE: 659.3988728871918


In [58]:
xgs.best_params_

{'colsample_bytree': 1,
 'learning_rate': 0.11,
 'max_depth': 6,
 'min_child_weight': 5,
 'n_estimators': 100,
 'nthread': 4,
 'objective': 'reg:squarederror',
 'subsample': 1}

In [59]:
train = age_50[age_50.index.isin(range(2000,2016), level=0)]
test = age_50[age_50.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

xgs.fit(X_train, y_train)
model_scores(X_test, y_test, xgs)

Fitting 2 folds for each of 2700 candidates, totalling 5400 fits
R2: 0.8015609798056782
MAE: 466.7427626995573
MSE: 323993.17705334374
RMSE: 569.2039854510365


In [60]:
xgs.best_params_

{'colsample_bytree': 1,
 'learning_rate': 0.03,
 'max_depth': 6,
 'min_child_weight': 5,
 'n_estimators': 300,
 'nthread': 4,
 'objective': 'reg:squarederror',
 'subsample': 1}

In [61]:
train = age_55[age_55.index.isin(range(2000,2016), level=0)]
test = age_55[age_55.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

xgs.fit(X_train, y_train)
model_scores(X_test, y_test, xgs)

Fitting 2 folds for each of 2700 candidates, totalling 5400 fits
R2: 0.7116922344430728
MAE: 498.5005997800301
MSE: 466003.011777468
RMSE: 682.6441326031214


In [62]:
xgs.best_params_

{'colsample_bytree': 1,
 'learning_rate': 0.03,
 'max_depth': 6,
 'min_child_weight': 5,
 'n_estimators': 400,
 'nthread': 4,
 'objective': 'reg:squarederror',
 'subsample': 1}

In [63]:
train = age_60[age_60.index.isin(range(2000,2016), level=0)]
test = age_60[age_60.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

xgs.fit(X_train, y_train)
model_scores(X_test, y_test, xgs)

Fitting 2 folds for each of 2700 candidates, totalling 5400 fits
R2: 0.6243231786857657
MAE: 601.8590951926402
MSE: 528654.424834441
RMSE: 727.0862568048175


In [64]:
xgs.best_params_

{'colsample_bytree': 0.7,
 'learning_rate': 0.09,
 'max_depth': 5,
 'min_child_weight': 5,
 'n_estimators': 100,
 'nthread': 4,
 'objective': 'reg:squarederror',
 'subsample': 1}

In [65]:
train = age_65[age_65.index.isin(range(2000,2016), level=0)]
test = age_65[age_65.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy', 'rain'])

y_train = train['time_seconds']
y_test = test['time_seconds']

xgs.fit(X_train, y_train)
model_scores(X_test, y_test, xgs)

Fitting 2 folds for each of 2700 candidates, totalling 5400 fits
R2: 0.44616476324337084
MAE: 752.2186590882446
MSE: 874362.3341363355
RMSE: 935.0734378305992


In [66]:
xgs.best_params_

{'colsample_bytree': 0.7,
 'learning_rate': 0.07,
 'max_depth': 5,
 'min_child_weight': 5,
 'n_estimators': 100,
 'nthread': 4,
 'objective': 'reg:squarederror',
 'subsample': 1}

In [67]:
train = age_70[age_70.index.isin(range(2000,2016), level=0)]
test = age_70[age_70.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

xgs.fit(X_train, y_train)
model_scores(X_test, y_test, xgs)

Fitting 2 folds for each of 2700 candidates, totalling 5400 fits
R2: 0.3109012520830836
MAE: 884.7330692203367
MSE: 1298969.4580880117
RMSE: 1139.7234129770309


In [68]:
xgs.best_params_

{'colsample_bytree': 0.7,
 'learning_rate': 0.05,
 'max_depth': 3,
 'min_child_weight': 5,
 'n_estimators': 100,
 'nthread': 4,
 'objective': 'reg:squarederror',
 'subsample': 0.5}