In [116]:
# imports
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import itertools

## Functions

In [335]:
# functions
def num_age(x):
    '''make age categories numeric'''
    if x == '18-39':
        return 0
    elif x == '40-44':
        return 1
    elif x == '45-49':
        return 2
    elif x == '50-54':
        return 3
    elif x == '55-59':
        return 4
    elif x == '60-64':
        return 5
    elif x == '65-69':
        return 6
    return 7


def cat_clean(x):
    '''binary columns were floats, make them ints'''
    return int(x)


def prep(df):
    '''prepare dataframe for EDA'''
    df['age'] = df['age'].apply(lambda x: num_age(x))
    df['clear'] = df['clear'].apply(lambda x: cat_clean(x))
    df['overcast'] = df['overcast'].apply(lambda x: cat_clean(x))
    df['partially_cloudy'] = df['partially_cloudy'].apply(lambda x: cat_clean(x))
    df['rain'] = df['rain'].apply(lambda x: cat_clean(x))

    cols = ['year', 'age', 'male', 'time_seconds', 'minimum_temperature',
           'maximum_temperature', 'temperature', 'relative_humidity', 'wind_speed',
           'precipitation', 'precipitation_cover', 'cloud_cover', 'clear',
           'overcast', 'partially_cloudy', 'rain']

    for col in cols:
        df[col] = pd.to_numeric(df[col], downcast='unsigned')

    df.drop(columns=['maximum_temperature', 'minimum_temperature'], inplace=True)

    event_dict = {year: df[df['year'] == year] for year in df['year'].unique()}
    
    return event_dict


def usable(event_dict, num):
    '''specify which years are usable based on minimum number of participants
    returns dict with years that exceed minimum values are correspoding df'''
    can_use = []
    for year in event_dict.keys():
        if len(event_dict[year]) > num:
            can_use.append(year)
    return {year: event_dict[year] for year in can_use}


def top_n(event_dict, num):
    '''take in event dict and first num of finishers that are desired,
    return df of top n racers for all years, use in conjuction with usable()'''
    df = pd.DataFrame()
    for year in event_dict.keys():
        df = pd.concat([df, event_dict[year].sort_values(by='time_seconds').iloc[:num]], ignore_index=True)
    return df
        
    
def n_range(event_dict, low, high):
    '''take in dictionary of dfs from prep(), upper percentage as float, lower percentage as float
    return df'''
    df = pd.DataFrame()
    for year in event_dict.keys():
        df = pd.concat([df, event_dict[year].sort_values(by='time_seconds').iloc[int(len(event_dict[year]) * low): int(len(event_dict[year]) * high)]], ignore_index=True)
    return df

def model_scores(X_test, y_test, model):
    nl = "\n"
    y_preds = model.predict(X_test)
    return print(f'R2: {model.score(X_test, y_test)}{nl}MAE: {mean_absolute_error(y_test, y_preds)}{nl}MSE: {mean_squared_error(y_test, y_preds)}{nl}RMSE: {mean_squared_error(y_test, y_preds, squared=False)}')

## All Participants

### Grouped

In [489]:
london = pd.read_csv('./data/London_Data/Clean/Clean_London_Results_Weather.csv')
nyc = pd.read_csv('./data/NYC_Data/Clean/Clean_NYC_Results_Weather.csv')
boston = pd.read_csv('./data/Boston_Data/Clean/Clean_Boston_Results_Weather.csv')
berlin = pd.read_csv('./data/Berlin_Data/Clean/Clean_Berlin_Results_Weather.csv')
chicago = pd.read_csv('./data/Chicago_Data/Clean/Clean_Chicago_Results_Weather.csv')

london_dict = prep(london)
nyc_dict = prep(nyc)
boston_dict = prep(boston)
berlin_dict = prep(berlin)
chicago_dict = prep(chicago)

london = n_range(usable(london_dict, 9_900), 0, 1)
nyc = n_range(usable(nyc_dict, 9_900), 0, 1)
boston = n_range(usable(boston_dict, 9_900), 0, 1)
berlin = n_range(usable(berlin_dict, 9_900), 0, 1)
chicago = n_range(usable(chicago_dict, 9_900), 0, 1)

aggregate = {col: 'mean' for col in london.drop(columns=['year'])}

london = london.groupby(['year', 'age', 'male']).agg(aggregate)
nyc = nyc.groupby(['year', 'age', 'male']).agg(aggregate)
boston = boston.groupby(['year', 'age', 'male']).agg(aggregate)
berlin = berlin.groupby(['year', 'age', 'male']).agg(aggregate)
chicago = chicago.groupby(['year', 'age', 'male']).agg(aggregate)

events = [boston, berlin, chicago, london, nyc]
count = 0
for event in events:
    event['event'] = count
    count += 1
    
combined = pd.concat(events)

In [490]:
train = london[london.index.isin(range(2017), level=0)]
test = london[london.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.8486382770895685
MAE: 500.5378739349461
MSE: 363822.3823038144
RMSE: 603.1769079663233


In [491]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,452.511953
1,male,-1915.789567
2,temperature,39.860797
3,relative_humidity,-8.202356
4,wind_speed,32.583386
5,precipitation,1669.587288
6,precipitation_cover,10.751469
7,cloud_cover,-9.79622
8,overcast,0.0
9,partially_cloudy,580.134974


In [492]:
train = nyc[nyc.index.isin(range(2017), level=0)]
test = nyc[nyc.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.26346539052707507
MAE: 272.47329606419436
MSE: 120235.10134171456
RMSE: 346.74933502706904


In [493]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,109.580496
1,male,-398.981823
2,temperature,29.026549
3,relative_humidity,0.034438
4,wind_speed,16.439797
5,precipitation,76.260346
6,precipitation_cover,10.287346
7,cloud_cover,5.525568
8,overcast,-541.590089
9,partially_cloudy,-193.718247


In [494]:
train = boston[boston.index.isin(range(2017), level=0)]
test = boston[boston.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.47207035106013917
MAE: 1318.2329814451143
MSE: 1953173.294080445
RMSE: 1397.5597640460478


In [495]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,593.763749
1,male,-1431.211599
2,temperature,87.334866
3,relative_humidity,11.906754
4,wind_speed,-0.085594
5,precipitation,1967.982287
6,precipitation_cover,1.077872
7,cloud_cover,-14.533504
8,rain,64.285743
9,event,0.0


In [496]:
train = berlin[berlin.index.isin(range(2017), level=0)]
test = berlin[berlin.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.5841806815774391
MAE: 829.7206237258863
MSE: 923687.0762982728
RMSE: 961.0864041792876


In [497]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,467.0261
1,male,-1455.646
2,temperature,69.20877
3,relative_humidity,5.048345
4,wind_speed,-12.74645
5,precipitation,-6.536993e-13
6,precipitation_cover,2.273737e-13
7,cloud_cover,-13.35731
8,overcast,1103.646
9,partially_cloudy,768.5363


In [498]:
train = chicago[chicago.index.isin(range(2017), level=0)]
test = chicago[chicago.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.8727267283684006
MAE: 476.75164008732094
MSE: 355137.6957471701
RMSE: 595.9343048920493


In [499]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,447.221232
1,male,-1535.602797
2,temperature,45.002944
3,relative_humidity,-8.631163
4,wind_speed,-58.098825
5,precipitation,-0.009502
6,precipitation_cover,-9.501577
7,cloud_cover,16.083448
8,overcast,-458.174109
9,partially_cloudy,-795.303901


In [500]:
train = combined[combined.index.isin(range(2017), level=0)]
test = combined[combined.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.3025721670668655
MAE: 1721.7268557546429
MSE: 3811689.766792224
RMSE: 1952.3549284882151


In [501]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,411.122336
1,male,-1334.361499
2,temperature,44.435123
3,relative_humidity,-16.293453
4,wind_speed,-51.240033
5,precipitation,-31.918961
6,precipitation_cover,-21.109005
7,cloud_cover,-2.994308
8,overcast,-824.905476
9,partially_cloudy,59.542532


## Model (on top 10k)

### Separated on Event

In [372]:
london = pd.read_csv('./data/London_Data/Clean/Clean_London_Results_Weather.csv')
nyc = pd.read_csv('./data/NYC_Data/Clean/Clean_NYC_Results_Weather.csv')
boston = pd.read_csv('./data/Boston_Data/Clean/Clean_Boston_Results_Weather.csv')
berlin = pd.read_csv('./data/Berlin_Data/Clean/Clean_Berlin_Results_Weather.csv')
chicago = pd.read_csv('./data/Chicago_Data/Clean/Clean_Chicago_Results_Weather.csv')

london_dict = prep(london)
nyc_dict = prep(nyc)
boston_dict = prep(boston)
berlin_dict = prep(berlin)
chicago_dict = prep(chicago)

london_10 = top_n(usable(london_dict, 10_000), 10_000)
nyc_10 = top_n(usable(nyc_dict, 9_000), 9_900)
boston_10 = top_n(usable(boston_dict, 10_000), 10_000)
berlin_10 = top_n(usable(berlin_dict, 10_000), 10_000)
chicago_10 = top_n(usable(chicago_dict, 10_000), 10_000)

aggregate = {col: 'mean' for col in london_10.drop(columns=['year'])}

london_10 = london_10.groupby(['year', 'age', 'male']).agg(aggregate)
nyc_10 = nyc_10.groupby(['year', 'age', 'male']).agg(aggregate)
boston_10 = boston_10.groupby(['year', 'age', 'male']).agg(aggregate)
berlin_10 = berlin_10.groupby(['year', 'age', 'male']).agg(aggregate)
chicago_10 = chicago_10.groupby(['year', 'age', 'male']).agg(aggregate)

combined_10 = pd.concat([london_10, nyc_10, boston_10, berlin_10, chicago_10])

In [120]:
london_train = london_10[london_10['year'] < 2016]
london_test = pd.concat([london_10[london_10['year'] == 2017], london_10[london_10['year'] == 2018]], ignore_index=False)

results = {}
best_london = {}

for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')
        drop_cols.append('clear')

        X_train = london_train.drop(columns=drop_cols)
        y_train = london_train['time_seconds']

        X_test = london_test.drop(columns=drop_cols)
        y_test = london_test['time_seconds']

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        results[mean_squared_error(y_test, preds, squared=False)] = drop_cols

    best_london[min(results.keys())] = results[min(results.keys())]

In [72]:
y_pred = y_train.mean()
y_preds = [y_pred] * len(y_test)
print('BASELINE')
model_scores(y_test, y_preds)

BASELINE
MAE: 1068.2269967306665
MSE: 1746534.112775161
RMSE: 1321.5650240435243


In [73]:
list(zip(X_test.columns,lr.coef_))

[('year', -39.05432762054361),
 ('age', 55.346388410271665),
 ('male', -496.3688073149521),
 ('temperature', 17.5986762335581),
 ('relative_humidity', 7.1410609844891475),
 ('wind_speed', 7.4238992350515405),
 ('precipitation', -1695.4840165409041),
 ('precipitation_cover', -37.31469637180682),
 ('cloud_cover', -4.330354224490609),
 ('overcast', 1.1368683772161603e-12),
 ('partially_cloudy', 7.339294166024199),
 ('rain', 406.8481694319067)]

In [121]:
chicago_train = chicago_10[chicago_10['year'] < 2016]
chicago_test = pd.concat([chicago_10[chicago_10['year'] == 2017], chicago_10[chicago_10['year'] == 2018]], ignore_index=False)

results = {}
best_chicago = {}

for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')
        drop_cols.append('clear')

        X_train = chicago_train.drop(columns=drop_cols)
        y_train = chicago_train['time_seconds']

        X_test = chicago_test.drop(columns=drop_cols)
        y_test = chicago_test['time_seconds']

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        results[mean_squared_error(y_test, preds, squared=False)] = drop_cols

    best_chicago[min(results.keys())] = results[min(results.keys())]

In [76]:
list(zip(X_test.columns,lr.coef_))

[('year', -37.26730458763445),
 ('age', 139.0583752065539),
 ('male', -512.1725733686169),
 ('temperature', 31.844489809845925),
 ('relative_humidity', 6.101147114256167),
 ('wind_speed', -40.93148375150596),
 ('precipitation', -0.039666790175204386),
 ('precipitation_cover', -39.66679017529224),
 ('cloud_cover', 15.915883695912953),
 ('overcast', -665.4862046848956),
 ('partially_cloudy', -497.89626680883106),
 ('rain', -3.966679017529332)]

In [122]:
nyc_train = nyc_10[nyc_10['year'] < 2016]
nyc_test = pd.concat([nyc_10[nyc_10['year'] == 2017], nyc_10[nyc_10['year'] == 2018]], ignore_index=False)


results = {}
best_nyc = {}

for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')
        drop_cols.append('clear')
        
        X_train = nyc_train.drop(columns=drop_cols)
        y_train = nyc_train['time_seconds']

        X_test = nyc_test.drop(columns=drop_cols)
        y_test = nyc_test['time_seconds']

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        results[mean_squared_error(y_test, preds, squared=False)] = drop_cols

    best_nyc[min(results.keys())] = results[min(results.keys())]

In [78]:
y_pred = y_train.mean()
y_preds = [y_pred] * len(y_test)
print('BASELINE')
model_scores(y_test, y_preds)

BASELINE
MAE: 878.0976351928733
MSE: 1366971.617349782
RMSE: 1169.1756144180317


In [79]:
list(zip(X_test.columns,lr.coef_))

[('year', -8.477338420793746),
 ('age', 145.95761340821588),
 ('male', -574.133109232959),
 ('temperature', 35.84827427731008),
 ('relative_humidity', -2.4675950263093225),
 ('wind_speed', 4.892343132987524),
 ('precipitation', 710.5146723600894),
 ('precipitation_cover', -1.3016188067533037),
 ('cloud_cover', 4.395722014327733),
 ('overcast', -524.5842476041114),
 ('partially_cloudy', -35.0993413943471),
 ('rain', -238.54072663580737)]

In [123]:
berlin_train = berlin_10[berlin_10['year'] < 2016]
berlin_test = pd.concat([berlin_10[berlin_10['year'] == 2017], berlin_10[berlin_10['year'] == 2018]], ignore_index=False)

results = {}
best_berlin = {}

for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')
        drop_cols.append('clear')
        
        X_train = berlin_train.drop(columns=drop_cols)
        y_train = berlin_train['time_seconds']

        X_test = berlin_test.drop(columns=drop_cols)
        y_test = berlin_test['time_seconds']

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        results[mean_squared_error(y_test, preds, squared=False)] = drop_cols

    best_berlin[min(results.keys())] = results[min(results.keys())]

In [81]:
y_pred = y_train.mean()
y_preds = [y_pred] * len(y_test)
print('BASELINE')
model_scores(y_test, y_preds)

BASELINE
MAE: 841.5498645782758
MSE: 1240767.3562776924
RMSE: 1113.8973724170878


In [82]:
list(zip(X_test.columns,lr.coef_))

[('year', -13.252516336610217),
 ('age', 140.95476002295132),
 ('male', -577.5213252822165),
 ('temperature', 28.974388358006703),
 ('relative_humidity', 12.926929439961112),
 ('wind_speed', 1.5138678387849842),
 ('precipitation', -1.1368683772161603e-13),
 ('precipitation_cover', -5.684341886080802e-14),
 ('cloud_cover', -8.275840813642398),
 ('overcast', 230.31434175789968),
 ('partially_cloudy', 191.6948751208017),
 ('rain', 0.0)]

In [124]:
boston_train = boston_10[boston_10['year'] < 2016]
boston_test = pd.concat([boston_10[boston_10['year'] == 2017], boston_10[boston_10['year'] == 2018]], ignore_index=False)

results = {}
best_boston = {}

for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')
        drop_cols.append('clear')

        X_train = boston_train.drop(columns=drop_cols)
        y_train = boston_train['time_seconds']

        X_test = boston_test.drop(columns=drop_cols)
        y_test = boston_test['time_seconds']

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        results[mean_squared_error(y_test, preds, squared=False)] = drop_cols

    best_boston[min(results.keys())] = results[min(results.keys())]

In [84]:
y_pred = y_train.mean()
y_preds = [y_pred] * len(y_test)
print('BASELINE')
model_scores(y_test, y_preds)

BASELINE
MAE: 813.7002298093333
MSE: 1118463.2479628962
RMSE: 1057.5742281102052


In [85]:
list(zip(X_test.columns,lr.coef_))

[('year', -109.59491433166193),
 ('age', 320.82232337755266),
 ('male', -1097.5503708532387),
 ('temperature', 54.763235379032054),
 ('relative_humidity', 3.543626642575523),
 ('wind_speed', 27.103351256777742),
 ('precipitation', -1352.667724410968),
 ('precipitation_cover', 11.812541315552076),
 ('cloud_cover', -2.531228755920584),
 ('overcast', 88.39453527965584),
 ('partially_cloudy', -88.39453527965686),
 ('rain', -442.13809802733346)]

### All Combined

In [125]:
combined = pd.concat([london_10, nyc_10, boston_10, berlin_10, chicago_10], ignore_index=True)

combined_train = combined[combined['year'] < 2016]
combined_test = pd.concat([combined[combined['year'] == 2017], combined[combined['year'] == 2018]], ignore_index=False)

results = {}
best_combined = {}

for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')
        drop_cols.append('clear')

        X_train = combined_train.drop(columns=drop_cols)
        y_train = combined_train['time_seconds']

        X_test = combined_test.drop(columns=drop_cols)
        y_test = combined_test['time_seconds']

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)

        results[mean_squared_error(y_test, preds, squared=False)] = drop_cols

    best_combined[min(results.keys())] = results[min(results.keys())]

In [114]:
best_combined[min(best_combined.keys())]

['temperature',
 'relative_humidity',
 'wind_speed',
 'precipitation',
 'cloud_cover',
 'partially_cloudy',
 'time_seconds',
 'clear']

In [88]:
y_pred = y_train.mean()
y_preds = [y_pred] * len(y_test)
print('BASELINE')
model_scores(y_test, y_preds)

BASELINE
MAE: 926.1824566007724
MSE: 1459087.2338588445
RMSE: 1207.9268329906595


In [89]:
list(zip(X_test.columns,lr.coef_))

[('year', -16.784870800272586),
 ('age', 143.1577326893614),
 ('male', -687.2976343117678),
 ('temperature', 27.578338549805245),
 ('relative_humidity', -6.7995593880080865),
 ('wind_speed', -0.4732307696202921),
 ('precipitation', 893.548685387145),
 ('precipitation_cover', -9.257356901172065),
 ('cloud_cover', 5.609337501768132),
 ('overcast', -529.9119646401199),
 ('partially_cloudy', -53.09701515164906),
 ('rain', 342.0751452445623)]

In [134]:
print((min(best_london.keys()) - 1321.565) / 1321.565)
print((min(best_chicago.keys()) - 1288.553) / 1288.553)
print((min(best_nyc.keys()) - 1169.176) / 1169.176)
print((min(best_berlin.keys()) - 1113.897) / 1113.897)
print((min(best_boston.keys()) - 1057.574) / 1057.574)
print((min(best_combined.keys()) - 1207.927) / 1207.927)

-0.0671435948565952
-0.15388928011059363
-0.09486860096837796
-0.07317991630328588
-0.21463807319337289
-0.0912679874792829


In [133]:
print(best_london[min(best_london.keys())][:-2])
print(best_chicago[min(best_chicago.keys())][:-2])
print(best_nyc[min(best_nyc.keys())][:-2])
print(best_berlin[min(best_berlin.keys())][:-2])
print(best_boston[min(best_boston.keys())][:-2])
print(best_combined[min(best_combined.keys())][:-2])

['wind_speed', 'precipitation', 'partially_cloudy', 'rain']
['temperature', 'relative_humidity', 'wind_speed', 'precipitation_cover', 'cloud_cover', 'overcast', 'rain']
['relative_humidity', 'cloud_cover', 'overcast', 'rain']
['temperature', 'wind_speed', 'precipitation', 'precipitation_cover', 'overcast', 'partially_cloudy', 'rain']
['temperature', 'relative_humidity', 'wind_speed', 'precipitation_cover', 'partially_cloudy']
['temperature', 'relative_humidity', 'wind_speed', 'precipitation', 'cloud_cover', 'partially_cloudy']


## Model on 20-80

In [448]:
london = pd.read_csv('./data/London_Data/Clean/Clean_London_Results_Weather.csv')
nyc = pd.read_csv('./data/NYC_Data/Clean/Clean_NYC_Results_Weather.csv')
boston = pd.read_csv('./data/Boston_Data/Clean/Clean_Boston_Results_Weather.csv')
berlin = pd.read_csv('./data/Berlin_Data/Clean/Clean_Berlin_Results_Weather.csv')
chicago = pd.read_csv('./data/Chicago_Data/Clean/Clean_Chicago_Results_Weather.csv')

london_dict = prep(london)
nyc_dict = prep(nyc)
boston_dict = prep(boston)
berlin_dict = prep(berlin)
chicago_dict = prep(chicago)



lo_20_80 = n_range(usable(london_dict, 9_900), .2, .8)
ny_20_80 = n_range(usable(nyc_dict, 9_900), .2, .8)
bo_20_80 = n_range(usable(boston_dict, 9_900), .2, .8)
be_20_80 = n_range(usable(berlin_dict, 9_900), .2, .8)
ch_20_80 = n_range(usable(chicago_dict, 9_900), .2, .8)

aggregate = {col: 'mean' for col in lo_20_80.drop(columns=['year'])}

lo_20_80 = lo_20_80.groupby(['year', 'age', 'male']).agg(aggregate)
ny_20_80 = ny_20_80.groupby(['year', 'age', 'male']).agg(aggregate)
bo_20_80 = bo_20_80.groupby(['year', 'age', 'male']).agg(aggregate)
be_20_80 = be_20_80.groupby(['year', 'age', 'male']).agg(aggregate)
ch_20_80 = ch_20_80.groupby(['year', 'age', 'male']).agg(aggregate)


events = [bo_20_80, be_20_80, ch_20_80, lo_20_80, ny_20_80]
count = 0
for event in events:
    event['event'] = count
    count += 1

combined_20_80 = pd.concat([lo_20_80, ny_20_80, bo_20_80, be_20_80, ch_20_80])

In [449]:
# https://stackoverflow.com/questions/25224545/filtering-multiple-items-in-a-multi-index-python-panda-dataframe
train = lo_20_80[lo_20_80.index.isin(range(2017), level=0)]
test = lo_20_80[lo_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.6386001004336139
MAE: 388.8662563771185
MSE: 193682.6597433724
RMSE: 440.0939215024134


In [450]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,80.1667
1,male,-366.3049
2,temperature,31.87898
3,relative_humidity,-5.692689
4,wind_speed,24.44962
5,precipitation,572.6152
6,precipitation_cover,-7.127266
7,cloud_cover,-6.058525
8,clear,444.5718
9,overcast,-1.733724e-12


In [451]:
train = ny_20_80[ny_20_80.index.isin(range(2017), level=0)]
test = ny_20_80[ny_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: -0.983068336178196
MAE: 254.69082092062334
MSE: 107443.12154139999
RMSE: 327.78517590244985


In [452]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,33.189951
1,male,-147.421062
2,temperature,32.186467
3,relative_humidity,-2.028269
4,wind_speed,11.644446
5,precipitation,558.644735
6,precipitation_cover,2.2625
7,cloud_cover,4.776309
8,clear,229.441707
9,overcast,-300.523059


In [453]:
train = be_20_80[be_20_80.index.isin(range(2017), level=0)]
test = be_20_80[be_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: -1.7217727594296521
MAE: 732.9959725556669
MSE: 644884.0013565978
RMSE: 803.0466993622462


In [454]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,125.9935
1,male,-545.0487
2,temperature,75.36104
3,relative_humidity,7.772999
4,wind_speed,-7.41623
5,precipitation,-1.705303e-13
6,precipitation_cover,2.273737e-13
7,cloud_cover,-10.5397
8,clear,-570.46
9,overcast,428.4546


In [455]:
train = bo_20_80[bo_20_80.index.isin(range(2017), level=0)]
test = bo_20_80[bo_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: -3.7354049492327057
MAE: 1623.4565524744876
MSE: 2754163.187314122
RMSE: 1659.5671686660116


In [456]:
y_test

year  age  male
2017  0    0       13980.800052
           1       13898.848268
      1    0       14205.328411
           1       13710.074236
      2    0       14585.428770
           1       13816.312169
      3    0       14825.976244
           1       14000.666165
      4    0       15058.543237
           1       14197.954426
      5    0       15391.638037
           1       14677.560563
      6    0       15479.085714
           1       14962.176471
      7    0       15663.400000
           1       15314.100000
2018  0    0       13953.931187
           1       14344.568827
      1    0       14195.407487
           1       13894.571267
      2    0       14785.074717
           1       13691.541457
      3    0       15077.007535
           1       13777.304049
      4    0       15503.311905
           1       14209.670455
      5    0       16021.187097
           1       14964.290038
      6    0       16277.764706
           1       15455.278970
      7    0       16041

In [457]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,197.5226
1,male,-610.2688
2,temperature,75.53706
3,relative_humidity,-1.089924
4,wind_speed,43.19515
5,precipitation,-925.1298
6,precipitation_cover,-0.8755343
7,cloud_cover,-22.58956
8,clear,1.136868e-13
9,overcast,408.581


In [458]:
bo_20_80

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age,male,time_seconds,temperature,relative_humidity,wind_speed,precipitation,precipitation_cover,cloud_cover,clear,overcast,partially_cloudy,rain,event
year,age,male,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2001,0,0,0.0,0.0,13846.053951,46.0,63.25,14.6,0.00,0.0,89.3,0.0,1.0,0.0,0.0,0
2001,0,1,0.0,1.0,13543.116687,46.0,63.25,14.6,0.00,0.0,89.3,0.0,1.0,0.0,0.0,0
2001,1,0,1.0,0.0,14190.927419,46.0,63.25,14.6,0.00,0.0,89.3,0.0,1.0,0.0,0.0,0
2001,1,1,1.0,1.0,13307.306599,46.0,63.25,14.6,0.00,0.0,89.3,0.0,1.0,0.0,0.0,0
2001,2,0,2.0,0.0,14401.854369,46.0,63.25,14.6,0.00,0.0,89.3,0.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,5,1,5.0,1.0,14964.290038,42.8,99.11,29.5,1.16,100.0,100.0,0.0,1.0,0.0,1.0,0
2018,6,0,6.0,0.0,16277.764706,42.8,99.11,29.5,1.16,100.0,100.0,0.0,1.0,0.0,1.0,0
2018,6,1,6.0,1.0,15455.278970,42.8,99.11,29.5,1.16,100.0,100.0,0.0,1.0,0.0,1.0,0
2018,7,0,7.0,0.0,16041.000000,42.8,99.11,29.5,1.16,100.0,100.0,0.0,1.0,0.0,1.0,0


In [459]:
train = ch_20_80[ch_20_80.index.isin(range(2017), level=0)]
test = ch_20_80[ch_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.0043981428879149576
MAE: 558.5243534759802
MSE: 351161.0437268433
RMSE: 592.588426926179


In [460]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,102.768475
1,male,-439.066836
2,temperature,53.869822
3,relative_humidity,-15.331411
4,wind_speed,-27.757072
5,precipitation,0.006238
6,precipitation_cover,6.238292
7,cloud_cover,20.259338
8,clear,416.266431
9,overcast,-57.819313


In [461]:
train = combined_20_80[combined_20_80.index.isin(range(2016), level=0)]
test = combined_20_80[combined_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: -0.04178311550272129
MAE: 1481.8674366993062
MSE: 2623035.171845001
RMSE: 1619.5787019607912


In [462]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,121.898402
1,male,-472.377502
2,temperature,44.990805
3,relative_humidity,-18.198668
4,wind_speed,-43.369499
5,precipitation,42.195577
6,precipitation_cover,-16.706898
7,cloud_cover,-2.03217
8,clear,-449.802522
9,overcast,-1012.014941


In [473]:
london = pd.read_csv('./data/London_Data/Clean/Clean_London_Results_Weather.csv')
nyc = pd.read_csv('./data/NYC_Data/Clean/Clean_NYC_Results_Weather.csv')
boston = pd.read_csv('./data/Boston_Data/Clean/Clean_Boston_Results_Weather.csv')
berlin = pd.read_csv('./data/Berlin_Data/Clean/Clean_Berlin_Results_Weather.csv')
chicago = pd.read_csv('./data/Chicago_Data/Clean/Clean_Chicago_Results_Weather.csv')

london_dict = prep(london)
nyc_dict = prep(nyc)
boston_dict = prep(boston)
berlin_dict = prep(berlin)
chicago_dict = prep(chicago)



lo_20_80 = n_range(usable(london_dict, 9_900), .2, .8)
ny_20_80 = n_range(usable(nyc_dict, 9_900), .2, .8)
bo_20_80 = n_range(usable(boston_dict, 9_900), .2, .8)
be_20_80 = n_range(usable(berlin_dict, 9_900), .2, .8)
ch_20_80 = n_range(usable(chicago_dict, 9_900), .2, .8)

aggregate = {col: 'mean' for col in lo_20_80.drop(columns=['year'])}

lo_20_80 = lo_20_80.groupby(['year', 'age', 'male']).agg(aggregate)
ny_20_80 = ny_20_80.groupby(['year', 'age', 'male']).agg(aggregate)
bo_20_80 = bo_20_80.groupby(['year', 'age', 'male']).agg(aggregate)
be_20_80 = be_20_80.groupby(['year', 'age', 'male']).agg(aggregate)
ch_20_80 = ch_20_80.groupby(['year', 'age', 'male']).agg(aggregate)


events = [bo_20_80, be_20_80, ch_20_80, lo_20_80, ny_20_80]
count = 0
for event in events:
    event['event'] = count
    count += 1

combined_20_80 = pd.concat([lo_20_80, ny_20_80, bo_20_80, be_20_80, ch_20_80])

In [474]:
# https://stackoverflow.com/questions/25224545/filtering-multiple-items-in-a-multi-index-python-panda-dataframe
train = lo_20_80[lo_20_80.index.isin(range(2017), level=0)]
test = lo_20_80[lo_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.6760901611202041
MAE: 364.16492074864925
MSE: 173590.85928512295
RMSE: 416.64236376672375


In [475]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,80.34725
1,male,-366.858414
2,temperature,32.962957
3,relative_humidity,7.480772
4,wind_speed,15.788958
5,precipitation,-149.26497
6,precipitation_cover,-44.48143
7,cloud_cover,-6.338675
8,overcast,0.0
9,partially_cloudy,109.867907


In [476]:
train = ny_20_80[ny_20_80.index.isin(range(2017), level=0)]
test = ny_20_80[ny_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: -0.9830683361781931
MAE: 254.69082092062357
MSE: 107443.12154139983
RMSE: 327.7851759024496


In [477]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,33.189951
1,male,-147.421062
2,temperature,32.186467
3,relative_humidity,-2.028269
4,wind_speed,11.644446
5,precipitation,558.644735
6,precipitation_cover,2.2625
7,cloud_cover,4.776309
8,overcast,-529.964765
9,partially_cloudy,-158.360355


In [478]:
train = be_20_80[be_20_80.index.isin(range(2017), level=0)]
test = be_20_80[be_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: -1.7217727594296286
MAE: 732.9959725556616
MSE: 644884.0013565922
RMSE: 803.0466993622426


In [479]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,125.9935
1,male,-545.0487
2,temperature,75.36104
3,relative_humidity,7.772999
4,wind_speed,-7.41623
5,precipitation,5.684342e-14
6,precipitation_cover,4.547474e-13
7,cloud_cover,-10.5397
8,overcast,998.9146
9,partially_cloudy,712.4655


In [480]:
train = bo_20_80[bo_20_80.index.isin(range(2017), level=0)]
test = bo_20_80[bo_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: -3.7354049492326507
MAE: 1623.4565524744776
MSE: 2754163.18731409
RMSE: 1659.5671686660019


In [481]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,197.522611
1,male,-610.268761
2,temperature,75.537058
3,relative_humidity,-1.089924
4,wind_speed,43.195149
5,precipitation,-925.129789
6,precipitation_cover,-0.875534
7,cloud_cover,-22.589562
8,overcast,408.581037
9,partially_cloudy,-408.581037


In [482]:
train = ch_20_80[ch_20_80.index.isin(range(2017), level=0)]
test = ch_20_80[ch_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.004398142887922396
MAE: 558.524353475979
MSE: 351161.0437268407
RMSE: 592.5884269261767


In [483]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,102.768475
1,male,-439.066836
2,temperature,53.869822
3,relative_humidity,-15.331411
4,wind_speed,-27.757072
5,precipitation,0.006238
6,precipitation_cover,6.238292
7,cloud_cover,20.259338
8,overcast,-474.085744
9,partially_cloudy,-774.713549


In [484]:
train = combined_20_80[combined_20_80.index.isin(range(2017), level=0)]
test = combined_20_80[combined_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: -0.021848336189582485
MAE: 1465.1801133684344
MSE: 2572842.7407111004
RMSE: 1604.008335611477


In [485]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,121.525334
1,male,-467.535484
2,temperature,45.078068
3,relative_humidity,-16.888761
4,wind_speed,-45.256534
5,precipitation,107.728527
6,precipitation_cover,-19.192224
7,cloud_cover,1.379705
8,overcast,-895.085704
9,partially_cloudy,-63.015853


In [486]:
combined_20_80

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age,male,time_seconds,temperature,relative_humidity,wind_speed,precipitation,precipitation_cover,cloud_cover,clear,overcast,partially_cloudy,rain,event
year,age,male,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2001,0,0,0.0,0.0,16267.674933,48.0,55.19,16.3,0.00,0.0,63.7,0.0,0.0,1.0,0.0,3
2001,0,1,0.0,1.0,15715.237228,48.0,55.19,16.3,0.00,0.0,63.7,0.0,0.0,1.0,0.0,3
2001,1,0,1.0,0.0,16127.081594,48.0,55.19,16.3,0.00,0.0,63.7,0.0,0.0,1.0,0.0,3
2001,1,1,1.0,1.0,15654.770597,48.0,55.19,16.3,0.00,0.0,63.7,0.0,0.0,1.0,0.0,3
2001,2,0,2.0,0.0,16290.245509,48.0,55.19,16.3,0.00,0.0,63.7,0.0,0.0,1.0,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,5,1,5.0,1.0,15878.586957,59.3,87.71,12.3,0.05,30.0,100.0,0.0,1.0,0.0,1.0,2
2018,6,0,6.0,0.0,16694.900000,59.3,87.71,12.3,0.05,30.0,100.0,0.0,1.0,0.0,1.0,2
2018,6,1,6.0,1.0,16171.053191,59.3,87.71,12.3,0.05,30.0,100.0,0.0,1.0,0.0,1.0,2
2018,7,0,7.0,0.0,17101.750000,59.3,87.71,12.3,0.05,30.0,100.0,0.0,1.0,0.0,1.0,2
