In [1]:
# imports
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import itertools

## Functions

In [2]:
# functions
def num_age(x):
    '''make age categories numeric'''
    if x == '18-39':
        return 0
    elif x == '40-44':
        return 1
    elif x == '45-49':
        return 2
    elif x == '50-54':
        return 3
    elif x == '55-59':
        return 4
    elif x == '60-64':
        return 5
    elif x == '65-69':
        return 6
    return 7


def cat_clean(x):
    '''binary columns were floats, make them ints'''
    return int(x)


def prep(df):
    '''prepare dataframe for EDA'''
    df['age'] = df['age'].apply(lambda x: num_age(x))
    df['clear'] = df['clear'].apply(lambda x: cat_clean(x))
    df['overcast'] = df['overcast'].apply(lambda x: cat_clean(x))
    df['partially_cloudy'] = df['partially_cloudy'].apply(lambda x: cat_clean(x))
    df['rain'] = df['rain'].apply(lambda x: cat_clean(x))

    cols = ['year', 'age', 'male', 'time_seconds', 'minimum_temperature',
           'maximum_temperature', 'temperature', 'relative_humidity', 'wind_speed',
           'precipitation', 'precipitation_cover', 'cloud_cover', 'clear',
           'overcast', 'partially_cloudy', 'rain']

    for col in cols:
        df[col] = pd.to_numeric(df[col], downcast='unsigned')

    df.drop(columns=['maximum_temperature', 'minimum_temperature'], inplace=True)

    event_dict = {year: df[df['year'] == year] for year in df['year'].unique()}
    
    return event_dict


def usable(event_dict, num):
    '''specify which years are usable based on minimum number of participants
    returns dict with years that exceed minimum values are correspoding df'''
    can_use = []
    for year in event_dict.keys():
        if len(event_dict[year]) > num:
            can_use.append(year)
    return {year: event_dict[year] for year in can_use}


def top_n(event_dict, num):
    '''take in event dict and first num of finishers that are desired,
    return df of top n racers for all years, use in conjuction with usable()'''
    df = pd.DataFrame()
    for year in event_dict.keys():
        df = pd.concat([df, event_dict[year].sort_values(by='time_seconds').iloc[:num]], ignore_index=True)
    return df
        
    
def n_range(event_dict, low, high):
    '''take in dictionary of dfs from prep(), upper percentage as float, lower percentage as float
    return df'''
    df = pd.DataFrame()
    for year in event_dict.keys():
        df = pd.concat([df, event_dict[year].sort_values(by='time_seconds').iloc[int(len(event_dict[year]) * low): int(len(event_dict[year]) * high)]], ignore_index=True)
    return df

def model_scores(X_test, y_test, model):
    nl = "\n"
    y_preds = model.predict(X_test)
    return print(f'R2: {model.score(X_test, y_test)}{nl}MAE: {mean_absolute_error(y_test, y_preds)}{nl}MSE: {mean_squared_error(y_test, y_preds)}{nl}RMSE: {mean_squared_error(y_test, y_preds, squared=False)}')

## All Participants

### Grouped

In [3]:
london = pd.read_csv('./data/London_Data/Clean/Clean_London_Results_Weather.csv')
nyc = pd.read_csv('./data/NYC_Data/Clean/Clean_NYC_Results_Weather.csv')
boston = pd.read_csv('./data/Boston_Data/Clean/Clean_Boston_Results_Weather.csv')
berlin = pd.read_csv('./data/Berlin_Data/Clean/Clean_Berlin_Results_Weather.csv')
chicago = pd.read_csv('./data/Chicago_Data/Clean/Clean_Chicago_Results_Weather.csv')

london_dict = prep(london)
nyc_dict = prep(nyc)
boston_dict = prep(boston)
berlin_dict = prep(berlin)
chicago_dict = prep(chicago)

london = n_range(usable(london_dict, 9_900), 0, 1)
nyc = n_range(usable(nyc_dict, 9_900), 0, 1)
boston = n_range(usable(boston_dict, 9_900), 0, 1)
berlin = n_range(usable(berlin_dict, 9_900), 0, 1)
chicago = n_range(usable(chicago_dict, 9_900), 0, 1)

aggregate = {col: 'mean' for col in london.drop(columns=['year'])}

london = london.groupby(['year', 'age', 'male']).agg(aggregate)
nyc = nyc.groupby(['year', 'age', 'male']).agg(aggregate)
boston = boston.groupby(['year', 'age', 'male']).agg(aggregate)
berlin = berlin.groupby(['year', 'age', 'male']).agg(aggregate)
chicago = chicago.groupby(['year', 'age', 'male']).agg(aggregate)

events = [boston, berlin, chicago, london, nyc]
count = 0
for event in events:
    event['event'] = count
    count += 1
    
combined = pd.concat(events)

In [4]:
train = london[london.index.isin(range(2017), level=0)]
test = london[london.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.8486382770895685
MAE: 500.5378739349461
MSE: 363822.3823038144
RMSE: 603.1769079663233


In [5]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,452.511953
1,male,-1915.789567
2,temperature,39.860797
3,relative_humidity,-8.202356
4,wind_speed,32.583386
5,precipitation,1669.587288
6,precipitation_cover,10.751469
7,cloud_cover,-9.79622
8,overcast,0.0
9,partially_cloudy,580.134974


In [6]:
train = nyc[nyc.index.isin(range(2017), level=0)]
test = nyc[nyc.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.8861914478833699
MAE: 565.6971374730781
MSE: 442993.6853736852
RMSE: 665.5777079903482


In [7]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,754.2211
1,male,-1905.192
2,temperature,37.73948
3,relative_humidity,-5.900199
4,wind_speed,5.303955
5,precipitation,-4.092726e-12
6,precipitation_cover,2.273737e-13
7,cloud_cover,13.81282
8,overcast,-871.3175
9,partially_cloudy,-550.1956


In [8]:
train = boston[boston.index.isin(range(2017), level=0)]
test = boston[boston.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy'])
X_test = test.drop(columns = ['time_seconds', 'clear', 'overcast', 'partially_cloudy'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.47207035106013917
MAE: 1318.2329814451143
MSE: 1953173.294080445
RMSE: 1397.5597640460478


In [9]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,593.763749
1,male,-1431.211599
2,temperature,87.334866
3,relative_humidity,11.906754
4,wind_speed,-0.085594
5,precipitation,1967.982287
6,precipitation_cover,1.077872
7,cloud_cover,-14.533504
8,rain,64.285743
9,event,0.0


In [10]:
train = berlin[berlin.index.isin(range(2017), level=0)]
test = berlin[berlin.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.5841806815774391
MAE: 829.7206237258863
MSE: 923687.0762982728
RMSE: 961.0864041792876


In [11]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,467.0261
1,male,-1455.646
2,temperature,69.20877
3,relative_humidity,5.048345
4,wind_speed,-12.74645
5,precipitation,-6.536993e-13
6,precipitation_cover,2.273737e-13
7,cloud_cover,-13.35731
8,overcast,1103.646
9,partially_cloudy,768.5363


In [12]:
train = chicago[chicago.index.isin(range(2017), level=0)]
test = chicago[chicago.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.8727267283684006
MAE: 476.75164008732094
MSE: 355137.6957471701
RMSE: 595.9343048920493


In [13]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,447.221232
1,male,-1535.602797
2,temperature,45.002944
3,relative_humidity,-8.631163
4,wind_speed,-58.098825
5,precipitation,-0.009502
6,precipitation_cover,-9.501577
7,cloud_cover,16.083448
8,overcast,-458.174109
9,partially_cloudy,-795.303901


In [14]:
train = combined[combined.index.isin(range(2017), level=0)]
test = combined[combined.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

R2: 0.7916902791307696
MAE: 686.2739802027909
MSE: 759983.6716734858
RMSE: 871.7704237203083


In [15]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

Unnamed: 0,0,1
0,age,531.804243
1,male,-1621.130457
2,temperature,51.010572
3,relative_humidity,-16.261533
4,wind_speed,5.01967
5,precipitation,86.059875
6,precipitation_cover,2.78698
7,cloud_cover,3.582419
8,overcast,58.31308
9,partially_cloudy,46.716189


## Model (on top 10k)

### Separated on Event

In [16]:
london = pd.read_csv('./data/London_Data/Clean/Clean_London_Results_Weather.csv')
nyc = pd.read_csv('./data/NYC_Data/Clean/Clean_NYC_Results_Weather.csv')
boston = pd.read_csv('./data/Boston_Data/Clean/Clean_Boston_Results_Weather.csv')
berlin = pd.read_csv('./data/Berlin_Data/Clean/Clean_Berlin_Results_Weather.csv')
chicago = pd.read_csv('./data/Chicago_Data/Clean/Clean_Chicago_Results_Weather.csv')

london_dict = prep(london)
nyc_dict = prep(nyc)
boston_dict = prep(boston)
berlin_dict = prep(berlin)
chicago_dict = prep(chicago)

london_10 = top_n(usable(london_dict, 10_000), 10_000)
nyc_10 = top_n(usable(nyc_dict, 9_000), 9_900)
boston_10 = top_n(usable(boston_dict, 10_000), 10_000)
berlin_10 = top_n(usable(berlin_dict, 10_000), 10_000)
chicago_10 = top_n(usable(chicago_dict, 10_000), 10_000)

aggregate = {col: 'mean' for col in london_10.drop(columns=['year'])}

london_10 = london_10.groupby(['year', 'age', 'male']).agg(aggregate)
nyc_10 = nyc_10.groupby(['year', 'age', 'male']).agg(aggregate)
boston_10 = boston_10.groupby(['year', 'age', 'male']).agg(aggregate)
berlin_10 = berlin_10.groupby(['year', 'age', 'male']).agg(aggregate)
chicago_10 = chicago_10.groupby(['year', 'age', 'male']).agg(aggregate)

combined_10 = pd.concat([london_10, nyc_10, boston_10, berlin_10, chicago_10])

In [17]:
london_train = london_10[london_10['year'] < 2016]
london_test = pd.concat([london_10[london_10['year'] == 2017], london_10[london_10['year'] == 2018]], ignore_index=False)

results = {}
best_london = {}

for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')
        drop_cols.append('clear')

        X_train = london_train.drop(columns=drop_cols)
        y_train = london_train['time_seconds']

        X_test = london_test.drop(columns=drop_cols)
        y_test = london_test['time_seconds']

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        results[mean_squared_error(y_test, preds, squared=False)] = drop_cols

    best_london[min(results.keys())] = results[min(results.keys())]

KeyError: 'year'

In [None]:
y_pred = y_train.mean()
y_preds = [y_pred] * len(y_test)
print('BASELINE')
model_scores(y_test, y_preds)

In [None]:
list(zip(X_test.columns,lr.coef_))

In [None]:
chicago_train = chicago_10[chicago_10['year'] < 2016]
chicago_test = pd.concat([chicago_10[chicago_10['year'] == 2017], chicago_10[chicago_10['year'] == 2018]], ignore_index=False)

results = {}
best_chicago = {}

for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')
        drop_cols.append('clear')

        X_train = chicago_train.drop(columns=drop_cols)
        y_train = chicago_train['time_seconds']

        X_test = chicago_test.drop(columns=drop_cols)
        y_test = chicago_test['time_seconds']

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        results[mean_squared_error(y_test, preds, squared=False)] = drop_cols

    best_chicago[min(results.keys())] = results[min(results.keys())]

In [None]:
list(zip(X_test.columns,lr.coef_))

In [None]:
nyc_train = nyc_10[nyc_10['year'] < 2016]
nyc_test = pd.concat([nyc_10[nyc_10['year'] == 2017], nyc_10[nyc_10['year'] == 2018]], ignore_index=False)


results = {}
best_nyc = {}

for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')
        drop_cols.append('clear')
        
        X_train = nyc_train.drop(columns=drop_cols)
        y_train = nyc_train['time_seconds']

        X_test = nyc_test.drop(columns=drop_cols)
        y_test = nyc_test['time_seconds']

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        results[mean_squared_error(y_test, preds, squared=False)] = drop_cols

    best_nyc[min(results.keys())] = results[min(results.keys())]

In [None]:
y_pred = y_train.mean()
y_preds = [y_pred] * len(y_test)
print('BASELINE')
model_scores(y_test, y_preds)

In [None]:
list(zip(X_test.columns,lr.coef_))

In [None]:
berlin_train = berlin_10[berlin_10['year'] < 2016]
berlin_test = pd.concat([berlin_10[berlin_10['year'] == 2017], berlin_10[berlin_10['year'] == 2018]], ignore_index=False)

results = {}
best_berlin = {}

for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')
        drop_cols.append('clear')
        
        X_train = berlin_train.drop(columns=drop_cols)
        y_train = berlin_train['time_seconds']

        X_test = berlin_test.drop(columns=drop_cols)
        y_test = berlin_test['time_seconds']

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        results[mean_squared_error(y_test, preds, squared=False)] = drop_cols

    best_berlin[min(results.keys())] = results[min(results.keys())]

In [None]:
y_pred = y_train.mean()
y_preds = [y_pred] * len(y_test)
print('BASELINE')
model_scores(y_test, y_preds)

In [None]:
list(zip(X_test.columns,lr.coef_))

In [None]:
boston_train = boston_10[boston_10['year'] < 2016]
boston_test = pd.concat([boston_10[boston_10['year'] == 2017], boston_10[boston_10['year'] == 2018]], ignore_index=False)

results = {}
best_boston = {}

for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')
        drop_cols.append('clear')

        X_train = boston_train.drop(columns=drop_cols)
        y_train = boston_train['time_seconds']

        X_test = boston_test.drop(columns=drop_cols)
        y_test = boston_test['time_seconds']

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)
        results[mean_squared_error(y_test, preds, squared=False)] = drop_cols

    best_boston[min(results.keys())] = results[min(results.keys())]

In [None]:
y_pred = y_train.mean()
y_preds = [y_pred] * len(y_test)
print('BASELINE')
model_scores(y_test, y_preds)

In [None]:
list(zip(X_test.columns,lr.coef_))

### All Combined

In [None]:
combined = pd.concat([london_10, nyc_10, boston_10, berlin_10, chicago_10], ignore_index=True)

combined_train = combined[combined['year'] < 2016]
combined_test = pd.concat([combined[combined['year'] == 2017], combined[combined['year'] == 2018]], ignore_index=False)

results = {}
best_combined = {}

for i in range(len(cols)):
    for combo in itertools.combinations(cols, i):
        drop_cols = list(combo)
        drop_cols.append('time_seconds')
        drop_cols.append('clear')

        X_train = combined_train.drop(columns=drop_cols)
        y_train = combined_train['time_seconds']

        X_test = combined_test.drop(columns=drop_cols)
        y_test = combined_test['time_seconds']

        lr = LinearRegression()
        lr.fit(X_train, y_train)

        preds = lr.predict(X_test)

        results[mean_squared_error(y_test, preds, squared=False)] = drop_cols

    best_combined[min(results.keys())] = results[min(results.keys())]

In [None]:
best_combined[min(best_combined.keys())]

In [None]:
y_pred = y_train.mean()
y_preds = [y_pred] * len(y_test)
print('BASELINE')
model_scores(y_test, y_preds)

In [None]:
list(zip(X_test.columns,lr.coef_))

In [None]:
print((min(best_london.keys()) - 1321.565) / 1321.565)
print((min(best_chicago.keys()) - 1288.553) / 1288.553)
print((min(best_nyc.keys()) - 1169.176) / 1169.176)
print((min(best_berlin.keys()) - 1113.897) / 1113.897)
print((min(best_boston.keys()) - 1057.574) / 1057.574)
print((min(best_combined.keys()) - 1207.927) / 1207.927)

In [None]:
print(best_london[min(best_london.keys())][:-2])
print(best_chicago[min(best_chicago.keys())][:-2])
print(best_nyc[min(best_nyc.keys())][:-2])
print(best_berlin[min(best_berlin.keys())][:-2])
print(best_boston[min(best_boston.keys())][:-2])
print(best_combined[min(best_combined.keys())][:-2])

## Model on 20-80

In [None]:
london = pd.read_csv('./data/London_Data/Clean/Clean_London_Results_Weather.csv')
nyc = pd.read_csv('./data/NYC_Data/Clean/Clean_NYC_Results_Weather.csv')
boston = pd.read_csv('./data/Boston_Data/Clean/Clean_Boston_Results_Weather.csv')
berlin = pd.read_csv('./data/Berlin_Data/Clean/Clean_Berlin_Results_Weather.csv')
chicago = pd.read_csv('./data/Chicago_Data/Clean/Clean_Chicago_Results_Weather.csv')

london_dict = prep(london)
nyc_dict = prep(nyc)
boston_dict = prep(boston)
berlin_dict = prep(berlin)
chicago_dict = prep(chicago)



lo_20_80 = n_range(usable(london_dict, 9_900), .2, .8)
ny_20_80 = n_range(usable(nyc_dict, 9_900), .2, .8)
bo_20_80 = n_range(usable(boston_dict, 9_900), .2, .8)
be_20_80 = n_range(usable(berlin_dict, 9_900), .2, .8)
ch_20_80 = n_range(usable(chicago_dict, 9_900), .2, .8)

aggregate = {col: 'mean' for col in lo_20_80.drop(columns=['year'])}

lo_20_80 = lo_20_80.groupby(['year', 'age', 'male']).agg(aggregate)
ny_20_80 = ny_20_80.groupby(['year', 'age', 'male']).agg(aggregate)
bo_20_80 = bo_20_80.groupby(['year', 'age', 'male']).agg(aggregate)
be_20_80 = be_20_80.groupby(['year', 'age', 'male']).agg(aggregate)
ch_20_80 = ch_20_80.groupby(['year', 'age', 'male']).agg(aggregate)


events = [bo_20_80, be_20_80, ch_20_80, lo_20_80, ny_20_80]
count = 0
for event in events:
    event['event'] = count
    count += 1

combined_20_80 = pd.concat([lo_20_80, ny_20_80, bo_20_80, be_20_80, ch_20_80])

In [None]:
# https://stackoverflow.com/questions/25224545/filtering-multiple-items-in-a-multi-index-python-panda-dataframe
train = lo_20_80[lo_20_80.index.isin(range(2017), level=0)]
test = lo_20_80[lo_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

In [None]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

In [None]:
train = ny_20_80[ny_20_80.index.isin(range(2017), level=0)]
test = ny_20_80[ny_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

In [None]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

In [None]:
train = be_20_80[be_20_80.index.isin(range(2017), level=0)]
test = be_20_80[be_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

In [None]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

In [None]:
train = bo_20_80[bo_20_80.index.isin(range(2017), level=0)]
test = bo_20_80[bo_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

In [None]:
y_test

In [None]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

In [None]:
bo_20_80

In [None]:
train = ch_20_80[ch_20_80.index.isin(range(2017), level=0)]
test = ch_20_80[ch_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

In [None]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

In [None]:
train = combined_20_80[combined_20_80.index.isin(range(2016), level=0)]
test = combined_20_80[combined_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds'])
X_test = test.drop(columns = ['time_seconds'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

In [None]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

In [None]:
london = pd.read_csv('./data/London_Data/Clean/Clean_London_Results_Weather.csv')
nyc = pd.read_csv('./data/NYC_Data/Clean/Clean_NYC_Results_Weather.csv')
boston = pd.read_csv('./data/Boston_Data/Clean/Clean_Boston_Results_Weather.csv')
berlin = pd.read_csv('./data/Berlin_Data/Clean/Clean_Berlin_Results_Weather.csv')
chicago = pd.read_csv('./data/Chicago_Data/Clean/Clean_Chicago_Results_Weather.csv')

london_dict = prep(london)
nyc_dict = prep(nyc)
boston_dict = prep(boston)
berlin_dict = prep(berlin)
chicago_dict = prep(chicago)



lo_20_80 = n_range(usable(london_dict, 9_900), .2, .8)
ny_20_80 = n_range(usable(nyc_dict, 9_900), .2, .8)
bo_20_80 = n_range(usable(boston_dict, 9_900), .2, .8)
be_20_80 = n_range(usable(berlin_dict, 9_900), .2, .8)
ch_20_80 = n_range(usable(chicago_dict, 9_900), .2, .8)

aggregate = {col: 'mean' for col in lo_20_80.drop(columns=['year'])}

lo_20_80 = lo_20_80.groupby(['year', 'age', 'male']).agg(aggregate)
ny_20_80 = ny_20_80.groupby(['year', 'age', 'male']).agg(aggregate)
bo_20_80 = bo_20_80.groupby(['year', 'age', 'male']).agg(aggregate)
be_20_80 = be_20_80.groupby(['year', 'age', 'male']).agg(aggregate)
ch_20_80 = ch_20_80.groupby(['year', 'age', 'male']).agg(aggregate)


events = [bo_20_80, be_20_80, ch_20_80, lo_20_80, ny_20_80]
count = 0
for event in events:
    event['event'] = count
    count += 1

combined_20_80 = pd.concat([lo_20_80, ny_20_80, bo_20_80, be_20_80, ch_20_80])

In [None]:
# https://stackoverflow.com/questions/25224545/filtering-multiple-items-in-a-multi-index-python-panda-dataframe
train = lo_20_80[lo_20_80.index.isin(range(2017), level=0)]
test = lo_20_80[lo_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

In [None]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

In [None]:
train = ny_20_80[ny_20_80.index.isin(range(2017), level=0)]
test = ny_20_80[ny_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

In [None]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

In [None]:
train = be_20_80[be_20_80.index.isin(range(2017), level=0)]
test = be_20_80[be_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

In [None]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

In [None]:
train = bo_20_80[bo_20_80.index.isin(range(2017), level=0)]
test = bo_20_80[bo_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

In [None]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

In [None]:
train = ch_20_80[ch_20_80.index.isin(range(2017), level=0)]
test = ch_20_80[ch_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

In [None]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

In [None]:
train = combined_20_80[combined_20_80.index.isin(range(2017), level=0)]
test = combined_20_80[combined_20_80.index.isin([2017, 2018], level=0)]

X_train = train.drop(columns = ['time_seconds', 'clear'])
X_test = test.drop(columns = ['time_seconds', 'clear'])

y_train = train['time_seconds']
y_test = test['time_seconds']

lr = LinearRegression()
lr.fit(X_train, y_train)
model_scores(X_test, y_test, lr)

In [None]:
pd.DataFrame(list(zip(X_test.columns,lr.coef_)))

In [None]:
combined_20_80