In [1]:
# imports
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
import itertools

## Functions

In [2]:
# functions
def num_age(x):
    '''make age categories numeric'''
    if x == '18-39':
        return 0
    elif x == '40-44':
        return 1
    elif x == '45-49':
        return 2
    elif x == '50-54':
        return 3
    elif x == '55-59':
        return 4
    elif x == '60-64':
        return 5
    elif x == '65-69':
        return 6
    return 7


def cat_clean(x):
    '''binary columns were floats, make them ints'''
    return int(x)


def prep(df):
    '''prepare dataframe for EDA'''
    df['age'] = df['age'].apply(lambda x: num_age(x))
    df['clear'] = df['clear'].apply(lambda x: cat_clean(x))
    df['overcast'] = df['overcast'].apply(lambda x: cat_clean(x))
    df['partially_cloudy'] = df['partially_cloudy'].apply(lambda x: cat_clean(x))
    df['rain'] = df['rain'].apply(lambda x: cat_clean(x))

    cols = ['year', 'age', 'male', 'time_seconds', 'minimum_temperature',
           'maximum_temperature', 'temperature', 'relative_humidity', 'wind_speed',
           'precipitation', 'precipitation_cover', 'cloud_cover', 'clear',
           'overcast', 'partially_cloudy', 'rain']

    for col in cols:
        df[col] = pd.to_numeric(df[col], downcast='unsigned')

    df.drop(columns=['maximum_temperature', 'minimum_temperature'], inplace=True)

    event_dict = {year: df[df['year'] == year] for year in df['year'].unique()}
    
    return event_dict


def usable(event_dict, num):
    '''specify which years are usable based on minimum number of participants
    returns dict with years that exceed minimum values are correspoding df'''
    can_use = []
    for year in event_dict.keys():
        if len(event_dict[year]) > num:
            can_use.append(year)
    return {year: event_dict[year] for year in can_use}


def top_n(event_dict, num):
    '''take in event dict and first num of finishers that are desired,
    return df of top n racers for all years, use in conjuction with usable()'''
    df = pd.DataFrame()
    for year in event_dict.keys():
        df = pd.concat([df, event_dict[year].sort_values(by='time_seconds').iloc[:num]], ignore_index=True)
    return df
        
    
def n_range(event_dict, low, high):
    '''take in dictionary of dfs from prep(), upper percentage as float, lower percentage as float
    return df'''
    df = pd.DataFrame()
    for year in event_dict.keys():
        df = pd.concat([df, event_dict[year].sort_values(by='time_seconds').iloc[int(len(event_dict[year]) * low): int(len(event_dict[year]) * high)]], ignore_index=True)
    return df

def model_scores(X_test, y_test, model):
    nl = "\n"
    y_preds = model.predict(X_test)
    return print(f'R2: {model.score(X_test, y_test)}{nl}MAE: {mean_absolute_error(y_test, y_preds)}{nl}MSE: {mean_squared_error(y_test, y_preds)}{nl}RMSE: {mean_squared_error(y_test, y_preds, squared=False)}')

## Modeling Ridge

In [144]:
london = pd.read_csv('../data/London_Data/Clean/Clean_London_Results_Weather.csv')
nyc = pd.read_csv('../data/NYC_Data/Clean/Clean_NYC_Results_Weather.csv')
boston = pd.read_csv('../data/Boston_Data/Clean/Clean_Boston_Results_Weather.csv')
berlin = pd.read_csv('../data/Berlin_Data/Clean/Clean_Berlin_Results_Weather.csv')
chicago = pd.read_csv('../data/Chicago_Data/Clean/Clean_Chicago_Results_Weather.csv')

london_dict = prep(london)
nyc_dict = prep(nyc)
boston_dict = prep(boston)
berlin_dict = prep(berlin)
chicago_dict = prep(chicago)

london = n_range(usable(london_dict, 9_900), 0, 1)
nyc = n_range(usable(nyc_dict, 9_900), 0, 1)
boston = n_range(usable(boston_dict, 9_900), 0, 1)
berlin = n_range(usable(berlin_dict, 9_900), 0, 1)
chicago = n_range(usable(chicago_dict, 9_900), 0, 1)

aggregate = {col: 'mean' for col in london.drop(columns=['year'])}

london = london.groupby(['year', 'age', 'male']).agg(aggregate)
nyc = nyc.groupby(['year', 'age', 'male']).agg(aggregate)
boston = boston.groupby(['year', 'age', 'male']).agg(aggregate)
berlin = berlin.groupby(['year', 'age', 'male']).agg(aggregate)
chicago = chicago.groupby(['year', 'age', 'male']).agg(aggregate)

events = [boston, berlin, chicago, london, nyc]
count = 0
for event in events:
    event['event'] = count
    count += 1
    
combined = pd.concat(events)
combined = combined.merge(pd.get_dummies(combined['event'], drop_first=True), left_index=True, right_index=True).drop(columns='event')
combined.columns = ['age', 'male','time_seconds', 'temperature', 'relative_humidity','wind_speed','precipitation',
                    'precipitation_cover', 'cloud_cover', 'clear', 'overcast', 'partially_cloudy', 'rain', 'berlin', 'chicago', 'london', 'nyc']

berlin = berlin.apply(lambda x: round(x,2))
boston = boston.apply(lambda x: round(x,2))
chicago = chicago.apply(lambda x: round(x,2))
london = london.apply(lambda x: round(x,2))
nyc = nyc.apply(lambda x: round(x,2))
combined = combined.apply(lambda x: round(x,2))

In [145]:
rparams = {
    'alpha': [.01, .1, 1, 10, 100, 1000],
    'tol': [1, .1, .01, .001, .0001, .00001],
}

rgs = GridSearchCV(
    estimator=Ridge(),
    param_grid=rparams
)

In [177]:
train = london[london.index.isin(range(2000, 2016), level=0)]
test = london[london.index.isin(range(2016, 2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


rgs.fit(X_train, y_train)
model_scores(X_test, y_test, rgs)

R2: 0.8555552941877106
MAE: 503.63268928270827
MSE: 354461.4580821635
RMSE: 595.3666585241094


In [178]:
pd.DataFrame(list(zip(X_test.columns,rgs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,454.646093
1,male,-1887.293448
2,temperature,39.18724
3,relative_humidity,-8.389509
4,wind_speed,33.946702
5,precipitation,7.816519
6,precipitation_cover,10.373727
7,cloud_cover,-8.130361
8,overcast,0.0
9,partially_cloudy,472.330685


In [179]:
train = nyc[nyc.index.isin(range(2000, 2016), level=0)]
test = nyc[nyc.index.isin(range(2016, 2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


rgs.fit(X_train, y_train)
model_scores(X_test, y_test, rgs)

R2: 0.8812315863147291
MAE: 579.4647422358795
MSE: 459433.88406859705
RMSE: 677.8155236261538


In [180]:
rgs.best_params_

{'alpha': 1, 'tol': 1}

In [181]:
pd.DataFrame(list(zip(X_test.columns,rgs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,758.480952
1,male,-1883.357541
2,temperature,33.012551
3,relative_humidity,-2.710734
4,wind_speed,7.144553
5,precipitation,0.0
6,precipitation_cover,0.0
7,cloud_cover,11.527949
8,overcast,-685.104384
9,partially_cloudy,-463.237818


In [182]:
train = boston[boston.index.isin(range(2000, 2016), level=0)]
test = boston[boston.index.isin(range(2016, 2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy'])

y_train = train['time_seconds']
y_test = test['time_seconds']


rgs.fit(X_train, y_train)
model_scores(X_test, y_test, rgs)

R2: 0.5111992539872295
MAE: 1072.016838404473
MSE: 1643955.2975851127
RMSE: 1282.1682017524506


In [183]:
pd.DataFrame(list(zip(X_test.columns,rgs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,589.141956
1,male,-1227.338824
2,temperature,85.511654
3,relative_humidity,10.789998
4,wind_speed,10.342593
5,precipitation,66.985599
6,precipitation_cover,3.726613
7,cloud_cover,-13.928252
8,rain,110.259787
9,event,0.0


In [184]:
train = berlin[berlin.index.isin(range(2000, 2016), level=0)]
test = berlin[berlin.index.isin(range(2016, 2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


rgs.fit(X_train, y_train)
model_scores(X_test, y_test, rgs)

R2: 0.8886600514820724
MAE: 376.313650245317
MSE: 264776.0854876916
RMSE: 514.5639760881942


In [185]:
pd.DataFrame(list(zip(X_test.columns,rgs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,487.70413
1,male,-1457.269308
2,temperature,23.655451
3,relative_humidity,-6.938078
4,wind_speed,7.237404
5,precipitation,0.0
6,precipitation_cover,0.0
7,cloud_cover,-4.124609
8,overcast,440.098615
9,partially_cloudy,0.862461


In [187]:
train = chicago[chicago.index.isin(range(2000, 2016), level=0)]
test = chicago[chicago.index.isin(range(2016, 2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


rgs.fit(X_train, y_train)
model_scores(X_test, y_test, rgs)

R2: 0.7757995001442428
MAE: 592.2492100927192
MSE: 588767.6513800869
RMSE: 767.311964835742


In [188]:
pd.DataFrame(list(zip(X_test.columns,rgs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,458.803067
1,male,-1280.372804
2,temperature,43.955509
3,relative_humidity,-8.216735
4,wind_speed,-0.459803
5,precipitation,-0.02909
6,precipitation_cover,-29.089795
7,cloud_cover,12.861371
8,overcast,111.779331
9,partially_cloudy,-135.1134


In [189]:
train = combined[combined.index.isin(range(2000, 2016), level=0)]
test = combined[combined.index.isin(range(2016, 2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


rgs.fit(X_train, y_train)
model_scores(X_test, y_test, rgs)

R2: 0.5839993439424789
MAE: 971.90707121301
MSE: 1511266.6115260876
RMSE: 1229.3358416340457


In [190]:
rgs.best_estimator_

Ridge(alpha=1, tol=1)

In [191]:
pd.DataFrame(list(zip(X_test.columns,rgs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,551.917228
1,male,-1644.40674
2,temperature,22.933032
3,relative_humidity,-19.758389
4,wind_speed,-28.067809
5,precipitation,1999.638545
6,precipitation_cover,-17.525313
7,cloud_cover,4.499716
8,overcast,-846.456907
9,partially_cloudy,-123.122259


## Modeling Lasso

In [192]:
lparams = {
    'alpha': [.01, .1, 1, 10, 100, 1000],
    'tol': [1, .1, .01, .001, .0001, .00001, .000001, .0000001],
    'max_iter': [100_000]
}

lgs = GridSearchCV(
    estimator=Lasso(),
    param_grid=lparams
)

In [193]:
train = london[london.index.isin(range(2000, 2016), level=0)]
test = london[london.index.isin(range(2016, 2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


lgs.fit(X_train, y_train)
model_scores(X_test, y_test, lgs)

R2: 0.852147766092602
MAE: 513.0161406056503
MSE: 362823.3940233654
RMSE: 602.3482331868878


In [194]:
pd.DataFrame(list(zip(X_test.columns,lgs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,453.156008
1,male,-1880.40386
2,temperature,42.097543
3,relative_humidity,-9.565041
4,wind_speed,28.165446
5,precipitation,-0.0
6,precipitation_cover,0.0
7,cloud_cover,-0.140307
8,overcast,0.0
9,partially_cloudy,51.719086


In [199]:
train = nyc[nyc.index.isin(range(2000, 2016), level=0)]
test = nyc[nyc.index.isin(range(2016, 2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


lgs.fit(X_train, y_train)
model_scores(X_test, y_test, lgs)

R2: 0.8740013020258991
MAE: 592.6248176867208
MSE: 487402.9163277967
RMSE: 698.1424756651014


In [200]:
lgs.best_params_

{'alpha': 10, 'max_iter': 100000, 'tol': 1}

In [201]:
pd.DataFrame(list(zip(X_test.columns,lgs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,757.178159
1,male,-1874.746833
2,temperature,26.03336
3,relative_humidity,2.094807
4,wind_speed,13.283674
5,precipitation,0.0
6,precipitation_cover,0.0
7,cloud_cover,2.998813
8,overcast,-0.0
9,partially_cloudy,-66.530958


In [202]:
train = boston[boston.index.isin(range(2000, 2016), level=0)]
test = boston[boston.index.isin(range(2016, 2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy'])

y_train = train['time_seconds']
y_test = test['time_seconds']


lgs.fit(X_train, y_train)
model_scores(X_test, y_test, lgs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


R2: 0.46959027460899794
MAE: 1143.316929618508
MSE: 1783896.3730313566
RMSE: 1335.6258357157353


In [203]:
pd.DataFrame(list(zip(X_test.columns,lgs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,592.336433
1,male,-1393.279344
2,temperature,86.542448
3,relative_humidity,6.919758
4,wind_speed,15.944973
5,precipitation,0.0
6,precipitation_cover,1.271519
7,cloud_cover,-9.410683
8,rain,198.624239
9,event,0.0


In [204]:
train = berlin[berlin.index.isin(range(2000, 2016), level=0)]
test = berlin[berlin.index.isin(range(2016, 2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


lgs.fit(X_train, y_train)
model_scores(X_test, y_test, lgs)

R2: 0.8975382854704247
MAE: 356.2838701934865
MSE: 243662.87255045737
RMSE: 493.62219616874745


In [205]:
pd.DataFrame(list(zip(X_test.columns,lgs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,488.0651
1,male,-1479.999141
2,temperature,37.495185
3,relative_humidity,-6.077217
4,wind_speed,12.432417
5,precipitation,0.0
6,precipitation_cover,0.0
7,cloud_cover,0.291072
8,overcast,216.840718
9,partially_cloudy,-59.351884


In [206]:
train = chicago[chicago.index.isin(range(2000, 2016), level=0)]
test = chicago[chicago.index.isin(range(2016, 2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


lgs.fit(X_train, y_train)
model_scores(X_test, y_test, lgs)

R2: 0.7126578703975802
MAE: 675.0041976237995
MSE: 754582.397886768
RMSE: 868.6670235980919


In [207]:
pd.DataFrame(list(zip(X_test.columns,lgs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,442.974883
1,male,-1085.746054
2,temperature,41.570598
3,relative_humidity,-9.026077
4,wind_speed,-0.0
5,precipitation,-0.0
6,precipitation_cover,-0.0
7,cloud_cover,11.412426
8,overcast,0.0
9,partially_cloudy,-0.0


In [208]:
train = combined[combined.index.isin(range(2000, 2016), level=0)]
test = combined[combined.index.isin(range(2016, 2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


lgs.fit(X_train, y_train)
model_scores(X_test, y_test, lgs)

R2: 0.5802097766119121
MAE: 978.9739017196835
MSE: 1525033.5287061986
RMSE: 1234.922478824561


In [209]:
lgs.best_estimator_

Lasso(alpha=0.1, max_iter=100000, tol=1)

In [210]:
pd.DataFrame(list(zip(X_test.columns,lgs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,551.626118
1,male,-1644.050042
2,temperature,25.311295
3,relative_humidity,-21.094504
4,wind_speed,-28.896936
5,precipitation,1226.744748
6,precipitation_cover,-12.236196
7,cloud_cover,-0.909586
8,overcast,-371.179555
9,partially_cloudy,146.714411


## Elastic Net

In [211]:
enparams = {
    'alpha': [.01, .1, 1, 10, 100, 1000],
    'max_iter': [10_000_000],
    'l1_ratio': np.linspace(0,1,100)
}

engs = GridSearchCV(
    estimator=ElasticNet(),
    param_grid=enparams
)

In [212]:
train = london[london.index.isin(range(2000, 2016), level=0)]
test = london[london.index.isin(range(2016, 2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.8394287968044412
MAE: 534.435803186084
MSE: 394035.229540844
RMSE: 627.7222550944358


In [130]:
engs.best_params_

{'alpha': 100, 'l1_ratio': 1.0, 'max_iter': 10000000}

In [131]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,433.87177
1,male,-1515.789567
2,temperature,38.115665
3,relative_humidity,-5.23415
4,wind_speed,25.627689
5,precipitation,-0.0
6,precipitation_cover,-12.290341
7,cloud_cover,-0.0
8,overcast,0.0
9,partially_cloudy,0.0


In [132]:
train = nyc[nyc.index.isin(range(2000, 2016), level=0)]
test = nyc[nyc.index.isin(range(2016, 2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.8772893572531737
MAE: 571.4053671524455
MSE: 477644.59571792564
RMSE: 691.1183659243369


In [133]:
engs.best_params_

{'alpha': 0.1, 'l1_ratio': 0.7676767676767677, 'max_iter': 10000000}

In [134]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,750.91427
1,male,-1743.383536
2,temperature,30.123849
3,relative_humidity,-0.831466
4,wind_speed,11.05007
5,precipitation,0.0
6,precipitation_cover,0.0
7,cloud_cover,6.527796
8,overcast,-280.802641
9,partially_cloudy,-226.17762


In [135]:
train = boston[boston.index.isin(range(2000, 2016), level=0)]
test = boston[boston.index.isin(range(2016, 2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.40224682180474836
MAE: 1297.471038186949
MSE: 2211498.3434765805
RMSE: 1487.1107367901627


In [136]:
engs.best_params_

{'alpha': 100, 'l1_ratio': 1.0, 'max_iter': 10000000}

In [137]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,574.176998
1,male,-1030.933494
2,temperature,85.986731
3,relative_humidity,11.581374
4,wind_speed,5.206532
5,precipitation,0.0
6,precipitation_cover,5.718157
7,cloud_cover,-12.764714
8,rain,0.0
9,event,0.0


In [138]:
train = berlin[berlin.index.isin(range(2000, 2016), level=0)]
test = berlin[berlin.index.isin(range(2016, 2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.6353785852966253
MAE: 761.3116729376164
MSE: 809957.7715166273
RMSE: 899.9765394256826


In [139]:
engs.best_params_

{'alpha': 100, 'l1_ratio': 1.0, 'max_iter': 10000000}

In [140]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,444.539436
1,male,-1053.900873
2,temperature,57.206784
3,relative_humidity,-0.0
4,wind_speed,-14.706931
5,precipitation,0.0
6,precipitation_cover,0.0
7,cloud_cover,-0.121331
8,overcast,-0.0
9,partially_cloudy,0.0


In [141]:
train = chicago[chicago.index.isin(range(2000, 2016), level=0)]
test = chicago[chicago.index.isin(range(2016, 2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.8301541678680803
MAE: 555.5786970598895
MSE: 473930.28152986313
RMSE: 688.4259448407382


In [142]:
engs.best_params_

{'alpha': 10, 'l1_ratio': 1.0, 'max_iter': 10000000}

In [143]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,445.266579
1,male,-1493.647453
2,temperature,45.998546
3,relative_humidity,-11.285113
4,wind_speed,-51.055517
5,precipitation,0.0
6,precipitation_cover,0.0
7,cloud_cover,11.149686
8,overcast,-0.0
9,partially_cloudy,-500.050616


In [125]:
train = combined[combined.index.isin(range(2000, 2016), level=0)]
test = combined[combined.index.isin(range(2016, 2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.5428052616946761
MAE: 1033.8972747621153
MSE: 1667999.6230478466
RMSE: 1291.510597342448


In [126]:
engs.best_estimator_

ElasticNet(alpha=0.01, l1_ratio=0.9696969696969697, max_iter=1000000)

In [127]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,age,546.967921
1,male,-1636.759598
2,temperature,23.92735
3,relative_humidity,-23.866213
4,wind_speed,-32.707802
5,precipitation,2082.901731
6,precipitation_cover,-15.715567
7,cloud_cover,3.879328
8,overcast,-641.804866
9,partially_cloudy,-13.780661
