In [1]:
# imports
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
import itertools

## Functions

In [2]:
# functions
def num_age(x):
    '''make age categories numeric'''
    if x == '18-39':
        return 0
    elif x == '40-44':
        return 1
    elif x == '45-49':
        return 2
    elif x == '50-54':
        return 3
    elif x == '55-59':
        return 4
    elif x == '60-64':
        return 5
    elif x == '65-69':
        return 6
    return 7


def cat_clean(x):
    '''binary columns were floats, make them ints'''
    return int(x)


def prep(df):
    '''prepare dataframe for EDA'''
    df['age'] = df['age'].apply(lambda x: num_age(x))
    df['clear'] = df['clear'].apply(lambda x: cat_clean(x))
    df['overcast'] = df['overcast'].apply(lambda x: cat_clean(x))
    df['partially_cloudy'] = df['partially_cloudy'].apply(lambda x: cat_clean(x))
    df['rain'] = df['rain'].apply(lambda x: cat_clean(x))

    cols = ['year', 'age', 'male', 'time_seconds', 'minimum_temperature',
           'maximum_temperature', 'temperature', 'relative_humidity', 'wind_speed',
           'precipitation', 'precipitation_cover', 'cloud_cover', 'clear',
           'overcast', 'partially_cloudy', 'rain']

    for col in cols:
        df[col] = pd.to_numeric(df[col], downcast='unsigned')

    df.drop(columns=['maximum_temperature', 'minimum_temperature'], inplace=True)

    event_dict = {year: df[df['year'] == year] for year in df['year'].unique()}
    
    return event_dict


def usable(event_dict, num):
    '''specify which years are usable based on minimum number of participants
    returns dict with years that exceed minimum values are correspoding df'''
    can_use = []
    for year in event_dict.keys():
        if len(event_dict[year]) > num:
            can_use.append(year)
    return {year: event_dict[year] for year in can_use}


def top_n(event_dict, num):
    '''take in event dict and first num of finishers that are desired,
    return df of top n racers for all years, use in conjuction with usable()'''
    df = pd.DataFrame()
    for year in event_dict.keys():
        df = pd.concat([df, event_dict[year].sort_values(by='time_seconds').iloc[:num]], ignore_index=True)
    return df
        
    
def n_range(event_dict, low, high):
    '''take in dictionary of dfs from prep(), upper percentage as float, lower percentage as float
    return df'''
    df = pd.DataFrame()
    for year in event_dict.keys():
        df = pd.concat([df, event_dict[year].sort_values(by='time_seconds').iloc[int(len(event_dict[year]) * low): int(len(event_dict[year]) * high)]], ignore_index=True)
    return df

def model_scores(X_test, y_test, model):
    nl = "\n"
    y_preds = model.predict(X_test)
    return print(f'R2: {model.score(X_test, y_test)}{nl}MAE: {mean_absolute_error(y_test, y_preds)}{nl}MSE: {mean_squared_error(y_test, y_preds)}{nl}RMSE: {mean_squared_error(y_test, y_preds, squared=False)}')

def undo_age(x):
    '''make age categories their division again'''
    if x == 0.0:
        return '18-39'
    elif x == 1.0:
        return '40-44'
    elif x == 2.0:
        return '45-49'
    elif x == 3.0:
        return '50-54'
    elif x == 4.0:
        return '55-59'
    elif x == 5.0:
        return '60-64'
    elif x == 6.0:
        return '65-69'
    return '70+'

## All Participants

### Relative Ages only numeric conditions

In [3]:
london = pd.read_csv('../data/London_Data/Clean/Clean_London_Results_Weather.csv')
nyc = pd.read_csv('../data/NYC_Data/Clean/Clean_NYC_Results_Weather.csv')
boston = pd.read_csv('../data/Boston_Data/Clean/Clean_Boston_Results_Weather.csv')
berlin = pd.read_csv('../data/Berlin_Data/Clean/Clean_Berlin_Results_Weather.csv')
chicago = pd.read_csv('../data/Chicago_Data/Clean/Clean_Chicago_Results_Weather.csv')

london_dict = prep(london)
nyc_dict = prep(nyc)
boston_dict = prep(boston)
berlin_dict = prep(berlin)
chicago_dict = prep(chicago)

london = n_range(usable(london_dict, 9_900), 0, 1)
nyc = n_range(usable(nyc_dict, 9_900), 0, 1)
boston = n_range(usable(boston_dict, 9_900), 0, 1)
berlin = n_range(usable(berlin_dict, 9_900), 0, 1)
chicago = n_range(usable(chicago_dict, 9_900), 0, 1)


events = [boston, berlin, chicago, london, nyc]
count = 0
for event in events:
    event['event'] = count
    count += 1

combined = pd.concat(events, ignore_index=True)

aggregate = {col: 'mean' for col in london.drop(columns='year')}
    
london = london.groupby(['year', 'age', 'male']).agg(aggregate)
nyc = nyc.groupby(['year', 'age', 'male']).agg(aggregate)
boston = boston.groupby(['year', 'age', 'male']).agg(aggregate)
berlin = berlin.groupby(['year', 'age', 'male']).agg(aggregate)
chicago = chicago.groupby(['year', 'age', 'male']).agg(aggregate)

aggregate = {col: 'mean' for col in london}

combined = combined.groupby(['year','event', 'age', 'male']).agg(aggregate)
combined = combined.merge(pd.get_dummies(combined['event'], drop_first=True), left_index=True, right_index=True).drop(columns='event')
combined.columns = ['age', 'male','time_seconds', 'temperature', 'relative_humidity', 'wind_speed', 'precipitation', 'precipitation_cover', 'cloud_cover', 'clear', 'overcast', 
                   'partially_cloudy', 'rain', 'boston', 'chicago', 'london', 'nyc']
combined.drop(columns=['clear', 'overcast', 'partially_cloudy', 'rain'], inplace=True)


berlin['age'] = berlin['age'].apply(lambda x: undo_age(x))
berlin = berlin.merge(pd.get_dummies(berlin['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')
berlin.drop(columns=['clear', 'overcast', 'partially_cloudy', 'rain'], inplace=True)

boston['age'] = boston['age'].apply(lambda x: undo_age(x))
boston = boston.merge(pd.get_dummies(boston['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')
boston.drop(columns=['clear', 'overcast', 'partially_cloudy', 'rain'], inplace=True)

chicago['age'] = chicago['age'].apply(lambda x: undo_age(x))
chicago = chicago.merge(pd.get_dummies(chicago['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')
chicago.drop(columns=['clear', 'overcast', 'partially_cloudy', 'rain'], inplace=True)

london['age'] = london['age'].apply(lambda x: undo_age(x))
london = london.merge(pd.get_dummies(london['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')
london.drop(columns=['clear', 'overcast', 'partially_cloudy', 'rain'], inplace=True)

nyc['age'] = nyc['age'].apply(lambda x: undo_age(x))
nyc = nyc.merge(pd.get_dummies(nyc['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')
nyc.drop(columns=['clear', 'overcast', 'partially_cloudy', 'rain'], inplace=True)

combined['age'] = combined['age'].apply(lambda x: undo_age(x))
combined = combined.merge(pd.get_dummies(combined['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')


berlin = berlin.apply(lambda x: round(x,2))
boston = boston.apply(lambda x: round(x,2))
chicago = chicago.apply(lambda x: round(x,2))
london = london.apply(lambda x: round(x,2))
nyc = nyc.apply(lambda x: round(x,2))
combined = combined.apply(lambda x: round(x,2))

In [4]:
train = berlin[berlin.index.isin(range(2016), level=0)]
test = berlin[berlin.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.7401391796371537
MAE: 725.3421537548029
MSE: 617971.6418336215
RMSE: 786.1117235060303


In [5]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1496.3392
1,temperature,54.5064
2,relative_humidity,0.8243
3,wind_speed,-19.9605
4,precipitation,0.0
5,precipitation_cover,-0.0
6,cloud_cover,-0.0341
7,event,0.0
8,40-44,76.301
9,45-49,277.4478


In [6]:
train = boston[boston.index.isin(range(2016), level=0)]
test = boston[boston.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.594028055620037
MAE: 1019.9828821994734
MSE: 1365381.976354292
RMSE: 1168.4956039088431


In [7]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1436.3641
1,temperature,87.0489
2,relative_humidity,11.3109
3,wind_speed,0.3845
4,precipitation,1975.2966
5,precipitation_cover,1.8445
6,cloud_cover,-14.7215
7,event,-0.0
8,40-44,244.984
9,45-49,633.9413


In [8]:
train = chicago[chicago.index.isin(range(2016), level=0)]
test = chicago[chicago.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.2725471851075677
MAE: 1203.131809653503
MSE: 1910346.701232175
RMSE: 1382.1529225205782


In [9]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1541.0975
1,temperature,43.5592
2,relative_humidity,-22.9555
3,wind_speed,-50.6158
4,precipitation,0.082
5,precipitation_cover,82.0162
6,cloud_cover,6.5565
7,event,-0.0
8,40-44,98.4427
9,45-49,432.8624


In [10]:
train = london[london.index.isin(range(2016), level=0)]
test = london[london.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.8918669606236133
MAE: 424.71171180760075
MSE: 265354.09926356043
RMSE: 515.1253238422281


In [11]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1920.4039
1,temperature,28.37
2,relative_humidity,-12.9644
3,wind_speed,42.8827
4,precipitation,-1300.7131
5,precipitation_cover,13.9915
6,cloud_cover,-2.1418
7,event,-0.0
8,40-44,-369.7455
9,45-49,-148.8873


In [12]:
train = nyc[nyc.index.isin(range(2016), level=0)]
test = nyc[nyc.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.9633423894706844
MAE: 309.73259672207115
MSE: 141803.26118345742
RMSE: 376.5677378420215


In [13]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1914.7468
1,temperature,28.0
2,relative_humidity,1.4718
3,wind_speed,12.5439
4,precipitation,0.0
5,precipitation_cover,-0.0
6,cloud_cover,2.9354
7,event,0.0
8,40-44,134.974
9,45-49,531.3913


In [14]:
train = combined[combined.index.isin(range(2016), level=0)]
test = combined[combined.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.8739058628892775
MAE: 545.4801248295083
MSE: 457047.07459035964
RMSE: 676.0525679193591


In [15]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1632.1048
1,temperature,49.5529
2,relative_humidity,-7.256
3,wind_speed,3.6119
4,precipitation,519.2358
5,precipitation_cover,1.7451
6,cloud_cover,2.2639
7,boston,102.315
8,chicago,1282.341
9,london,1990.9025


## Elastic Net

In [16]:
enparams = {
    'alpha': [.01, .1, 1, 10, 100, 1000],
    'max_iter': [1_000_000],
    'l1_ratio': np.linspace(0,1,100)
}

engs = GridSearchCV(
    estimator=ElasticNet(),
    param_grid=enparams
)

In [17]:
train = london[london.index.isin(range(2000,2016), level=0)]
test = london[london.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.9005834615772087
MAE: 412.14729720768355
MSE: 243964.15893995226
RMSE: 493.9272810241931


In [18]:
engs.best_params_

{'alpha': 1, 'l1_ratio': 1.0, 'max_iter': 1000000}

In [19]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1916.40386
1,temperature,26.031854
2,relative_humidity,-13.002544
3,wind_speed,39.175108
4,precipitation,-0.0
5,precipitation_cover,1.692222
6,cloud_cover,-2.546316
7,event,0.0
8,40-44,-386.055234
9,45-49,-165.195087


In [20]:
train = nyc[nyc.index.isin(range(2000,2016), level=0)]
test = nyc[nyc.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.9639687304404474
MAE: 305.55252621916975
MSE: 139380.3757077611
RMSE: 373.3368126876334


In [21]:
engs.best_params_

{'alpha': 0.01, 'l1_ratio': 0.9696969696969697, 'max_iter': 1000000}

In [22]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1912.389997
1,temperature,27.99939
2,relative_humidity,1.471979
3,wind_speed,12.543453
4,precipitation,0.0
5,precipitation_cover,0.0
6,cloud_cover,2.935374
7,event,0.0
8,40-44,96.955643
9,45-49,492.415349


In [23]:
train = boston[boston.index.isin(range(2000,2016), level=0)]
test = boston[boston.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.5481224451513053
MAE: 1044.966963396892
MSE: 1519773.662812517
RMSE: 1232.79100532593


In [24]:
engs.best_params_

{'alpha': 10, 'l1_ratio': 1.0, 'max_iter': 1000000}

In [25]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1395.757285
1,temperature,86.241631
2,relative_humidity,11.505052
3,wind_speed,9.264506
4,precipitation,0.0
5,precipitation_cover,5.295119
6,cloud_cover,-13.507532
7,event,0.0
8,40-44,-74.672404
9,45-49,154.993876


In [26]:
train = berlin[berlin.index.isin(range(2000,2016), level=0)]
test = berlin[berlin.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.9204939488918606
MAE: 363.0924118885407
MSE: 189072.3075160023
RMSE: 434.8244559773545


In [27]:
engs.best_params_

{'alpha': 0.1, 'l1_ratio': 0.98989898989899, 'max_iter': 1000000}

In [28]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1473.688883
1,temperature,32.486186
2,relative_humidity,-5.564317
3,wind_speed,11.494128
4,precipitation,0.0
5,precipitation_cover,0.0
6,cloud_cover,0.718488
7,event,0.0
8,40-44,-54.158599
9,45-49,182.856624


In [29]:
train = chicago[chicago.index.isin(range(2000,2016), level=0)]
test = chicago[chicago.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.7890852892619499
MAE: 577.5761061678155
MSE: 553878.1535395555
RMSE: 744.2299063727253


In [30]:
engs.best_params_

{'alpha': 0.01, 'l1_ratio': 0.8585858585858587, 'max_iter': 1000000}

In [31]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1480.21504
1,temperature,41.596851
2,relative_humidity,-9.859638
3,wind_speed,-3.133919
4,precipitation,-0.0
5,precipitation_cover,-3.328406
6,cloud_cover,11.294888
7,event,0.0
8,40-44,-33.403716
9,45-49,272.543282


In [32]:
enparams = {
    'alpha': [.01, .1, 1, 10, 100, 1000],
    'max_iter': [100_000],
    'l1_ratio': np.linspace(0,1,100)
}

engs = GridSearchCV(
    estimator=ElasticNet(),
    param_grid=enparams
)

train = combined[combined.index.isin(range(2000,2016), level=0)]
test = combined[combined.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.883816970694627
MAE: 479.77167693797935
MSE: 421122.7807874923
RMSE: 648.9397358672778


In [33]:
engs.best_estimator_

ElasticNet(alpha=1, l1_ratio=1.0, max_iter=100000)

In [34]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1643.671827
1,temperature,49.159334
2,relative_humidity,-7.03082
3,wind_speed,16.987278
4,precipitation,-0.0
5,precipitation_cover,0.175502
6,cloud_cover,3.880803
7,boston,739.897886
8,chicago,1515.823987
9,london,2136.300632


## all

In [109]:
london = pd.read_csv('../data/London_Data/Clean/Clean_London_Results_Weather.csv')
nyc = pd.read_csv('../data/NYC_Data/Clean/Clean_NYC_Results_Weather.csv')
boston = pd.read_csv('../data/Boston_Data/Clean/Clean_Boston_Results_Weather.csv')
berlin = pd.read_csv('../data/Berlin_Data/Clean/Clean_Berlin_Results_Weather.csv')
chicago = pd.read_csv('../data/Chicago_Data/Clean/Clean_Chicago_Results_Weather.csv')

london_dict = prep(london)
nyc_dict = prep(nyc)
boston_dict = prep(boston)
berlin_dict = prep(berlin)
chicago_dict = prep(chicago)

london = n_range(usable(london_dict, 9_900), 0, 1)
nyc = n_range(usable(nyc_dict, 9_900), 0, 1)
boston = n_range(usable(boston_dict, 9_900), 0, 1)
berlin = n_range(usable(berlin_dict, 9_900), 0, 1)
chicago = n_range(usable(chicago_dict, 9_900), 0, 1)

events = [boston, berlin, chicago, london, nyc]
count = 0
for event in events:
    event['event'] = count
    count += 1

combined = pd.concat(events, ignore_index=True)

aggregate = {col: 'mean' for col in london.drop(columns='year')}
    
london = london.groupby(['year', 'age', 'male']).agg(aggregate)
nyc = nyc.groupby(['year', 'age', 'male']).agg(aggregate)
boston = boston.groupby(['year', 'age', 'male']).agg(aggregate)
berlin = berlin.groupby(['year', 'age', 'male']).agg(aggregate)
chicago = chicago.groupby(['year', 'age', 'male']).agg(aggregate)

aggregate = {col: 'mean' for col in london}

combined = combined.groupby(['year','event', 'age', 'male']).agg(aggregate)
combined = combined.merge(pd.get_dummies(combined['event'], drop_first=True), left_index=True, right_index=True).drop(columns='event')
combined.columns = ['age', 'male','time_seconds', 'temperature', 'relative_humidity', 'wind_speed', 'precipitation', 'precipitation_cover', 'cloud_cover', 'clear', 'overcast', 
                   'partially_cloudy', 'rain', 'boston', 'chicago', 'london', 'nyc']

berlin['age'] = berlin['age'].apply(lambda x: undo_age(x))
berlin = berlin.merge(pd.get_dummies(berlin['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')

boston['age'] = boston['age'].apply(lambda x: undo_age(x))
boston = boston.merge(pd.get_dummies(boston['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')

chicago['age'] = chicago['age'].apply(lambda x: undo_age(x))
chicago = chicago.merge(pd.get_dummies(chicago['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')

london['age'] = london['age'].apply(lambda x: undo_age(x))
london = london.merge(pd.get_dummies(london['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')

nyc['age'] = nyc['age'].apply(lambda x: undo_age(x))
nyc = nyc.merge(pd.get_dummies(nyc['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')

combined['age'] = combined['age'].apply(lambda x: undo_age(x))
combined = combined.merge(pd.get_dummies(combined['age'], drop_first=True), left_index=True, right_index=True).drop(columns='age')
combined.columns = ['male','time_seconds', 'temperature', 'relative_humidity', 'wind_speed', 'precipitation', 'precipitation_cover', 'cloud_cover', 'clear', 'overcast', 
                   'partially_cloudy', 'rain', 'boston', 'chicago', 'london', 'nyc', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70+']



In [78]:
train = berlin[berlin.index.isin(range(2000,2016), level=0)]
test = berlin[berlin.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.9273315231056264
MAE: 334.0958304267431
MSE: 172811.96108463165
RMSE: 415.7065805163922


In [79]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1480.0391
1,temperature,24.0551
2,relative_humidity,-6.569
3,wind_speed,7.4266
4,precipitation,0.0
5,precipitation_cover,0.0
6,cloud_cover,-5.133
7,overcast,525.2319
8,partially_cloudy,41.4653
9,rain,0.0


In [110]:
train = boston[boston.index.isin(range(2000,2016), level=0)]
test = boston[boston.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.468662892360622
MAE: 1097.2488716462176
MSE: 1787015.4107233442
RMSE: 1336.792957313639


In [111]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1435.9908
1,temperature,87.1349
2,relative_humidity,4.7512
3,wind_speed,28.9957
4,precipitation,230.7458
5,precipitation_cover,-3.2355
6,cloud_cover,-26.7305
7,overcast,339.6197
8,partially_cloudy,-339.6197
9,rain,632.792


In [82]:
train = chicago[chicago.index.isin(range(2000,2016), level=0)]
test = chicago[chicago.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.8106575424757919
MAE: 540.0186227633428
MSE: 497227.7675330046
RMSE: 705.1437920970478


In [83]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1489.6782
1,temperature,44.9011
2,relative_humidity,-8.3388
3,wind_speed,-0.3737
4,precipitation,-0.0344
5,precipitation_cover,-34.3581
6,cloud_cover,11.7818
7,overcast,268.516
8,partially_cloudy,-110.9355
9,rain,-3.4358


In [84]:
train = london[london.index.isin(range(2000,2016), level=0)]
test = london[london.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.8777086274043285
MAE: 439.92065584237724
MSE: 300098.0755740706
RMSE: 547.8120805295102


In [85]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1920.4039
1,temperature,35.4164
2,relative_humidity,-8.9395
3,wind_speed,33.4177
4,precipitation,8471.366
5,precipitation_cover,9.1585
6,cloud_cover,-14.4042
7,overcast,-0.0
8,partially_cloudy,791.3369
9,rain,-1149.8441


In [86]:
train = nyc[nyc.index.isin(range(2000,2016), level=0)]
test = nyc[nyc.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.9724582941054528
MAE: 265.8630548911952
MSE: 106540.05152024716
RMSE: 326.4047357503367


In [87]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1914.7468
1,temperature,35.5278
2,relative_humidity,-4.3565
3,wind_speed,4.7877
4,precipitation,-0.0
5,precipitation_cover,0.0
6,cloud_cover,14.4862
7,overcast,-924.9219
8,partially_cloudy,-597.192
9,rain,-0.0


In [88]:
train = combined[combined.index.isin(range(2000,2016), level=0)]
test = combined[combined.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.827700690675113
MAE: 556.1676814422171
MSE: 624524.6375866798
RMSE: 790.2687122660745


In [89]:
pd.DataFrame(list(zip(X_test.columns,np.round(lr.coef_,4))))

Unnamed: 0,0,1
0,male,-1647.2267
1,temperature,49.9369
2,relative_humidity,-8.0156
3,wind_speed,19.1728
4,precipitation,-806.7046
5,precipitation_cover,0.319
6,cloud_cover,2.2252
7,overcast,169.3862
8,partially_cloudy,112.2736
9,rain,171.7314


## Elastic Net

In [90]:
enparams = {
    'alpha': [.01, .1, 1, 10, 100, 1000],
    'max_iter': [100_000],
    'l1_ratio': np.linspace(0,1,100)
}

engs = GridSearchCV(
    estimator=ElasticNet(),
    param_grid=enparams
)

In [91]:
train = london[london.index.isin(range(2000,2016), level=0)]
test = london[london.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.910273336026977
MAE: 399.7685791338167
MSE: 220185.59947816434
RMSE: 469.23938398024984


In [92]:
engs.best_params_

{'alpha': 10, 'l1_ratio': 1.0, 'max_iter': 100000}

In [93]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1880.40386
1,temperature,26.939088
2,relative_humidity,-12.460083
3,wind_speed,38.303904
4,precipitation,-0.0
5,precipitation_cover,-0.0
6,cloud_cover,-6.422892
7,overcast,0.0
8,partially_cloudy,228.538068
9,rain,-0.0


In [94]:
train = nyc[nyc.index.isin(range(2000,2016), level=0)]
test = nyc[nyc.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.9738439772024968
MAE: 256.4419860329996
MSE: 101179.78991862155
RMSE: 318.0877079024299


In [95]:
engs.best_params_

{'alpha': 0.1, 'l1_ratio': 0.9797979797979799, 'max_iter': 100000}

In [96]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1899.009384
1,temperature,34.036627
2,relative_humidity,-3.380356
3,wind_speed,6.176522
4,precipitation,0.0
5,precipitation_cover,0.0
6,cloud_cover,12.739162
7,overcast,-783.347839
8,partially_cloudy,-518.093744
9,rain,0.0


In [97]:
train = boston[boston.index.isin(range(2000,2016), level=0)]
test = boston[boston.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.5259578163846342
MAE: 1048.3025336075568
MSE: 1594318.6776824908
RMSE: 1262.6633271313817


In [98]:
engs.best_params_

{'alpha': 10, 'l1_ratio': 1.0, 'max_iter': 100000}

In [99]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1395.758069
1,temperature,88.334071
2,relative_humidity,10.283376
3,wind_speed,15.577666
4,precipitation,0.0
5,precipitation_cover,4.844221
6,cloud_cover,-18.415641
7,overcast,277.115216
8,partially_cloudy,-0.0
9,rain,51.154551


In [100]:
train = berlin[berlin.index.isin(range(2000,2016), level=0)]
test = berlin[berlin.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.9278204537674631
MAE: 336.27062359812453
MSE: 171649.24142794716
RMSE: 414.305734244588


In [101]:
engs.best_params_

{'alpha': 1, 'l1_ratio': 1.0, 'max_iter': 100000}

In [102]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1476.039141
1,temperature,23.323454
2,relative_humidity,-7.000169
3,wind_speed,6.993435
4,precipitation,0.0
5,precipitation_cover,0.0
6,cloud_cover,-4.248373
7,overcast,451.493388
8,partially_cloudy,0.268738
9,rain,0.0


In [103]:
train = chicago[chicago.index.isin(range(2000, 2016), level=0)]
test = chicago[chicago.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.8084887148642514
MAE: 522.7321710212303
MSE: 502923.2746345343
RMSE: 709.1708360011248


In [104]:
engs.best_params_

{'alpha': 10, 'l1_ratio': 1.0, 'max_iter': 100000}

In [105]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1449.338135
1,temperature,42.969226
2,relative_humidity,-8.483568
3,wind_speed,-2.614905
4,precipitation,-0.0
5,precipitation_cover,-20.021665
6,cloud_cover,12.953254
7,overcast,0.0
8,partially_cloudy,-143.20512
9,rain,-0.0


In [106]:
enparams = {
    'alpha': [.01, .1, 1, 10, 100, 1000],
    'max_iter': [10_000],
    'l1_ratio': np.linspace(0,1,100)
}

engs = GridSearchCV(
    estimator=ElasticNet(),
    param_grid=enparams
)

train = combined[combined.index.isin(range(2000,2016), level=0)]
test = combined[combined.index.isin(range(2016,2019), level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']


engs.fit(X_train, y_train)
model_scores(X_test, y_test, engs)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

R2: 0.8779218083628848
MAE: 493.3341679625669
MSE: 442490.6790870946
RMSE: 665.1997287184463


In [107]:
engs.best_estimator_

ElasticNet(alpha=1, l1_ratio=1.0, max_iter=10000)

In [108]:
pd.DataFrame(list(zip(X_test.columns,engs.best_estimator_.coef_)))

Unnamed: 0,0,1
0,male,-1643.492775
1,temperature,49.242093
2,relative_humidity,-7.431634
3,wind_speed,16.626597
4,precipitation,-0.0
5,precipitation_cover,-0.906507
6,cloud_cover,3.212559
7,overcast,47.523132
8,partially_cloudy,48.919046
9,rain,87.355006
