### TO DO

Use Jonathans Utils function
Or at least change my function so it only looks at validation and test scores, not training and validation scores

### Import libraries

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.ensemble import GradientBoostingRegressor
from datetime import datetime, timedelta
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, train_test_split
import plotly.express as px

# I had my own get_val_scores-type function but yours is better so I'm using it :)
from utils import extract_dates, get_val_scores

### Define Functions

__denote_null_values__ - Same function from class


In [2]:
def denote_null_values(df):
    #Denotes whether or not there are null values or not
    empty_cols_query = df.isnull().sum() > 0
    empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()
    for col in empty_df_cols:
        col_name = f"{col}_missing"
        df[col_name] = pd.isnull(df[col])
    return df


### Import Data

Import data and see what we're working with

In [3]:
df = pd.read_csv('/Users/cameronlefevre/Data Science/coding/GA-DS-Class/Homework/Unit3/data/bikeshare.csv')

df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 00:00:00,Spring,0,0,Clear Skies,9.84,14.395,81,0.0,16
1,2011-01-01 01:00:00,Spring,0,0,Clear Skies,9.02,13.635,80,0.0,40
2,2011-01-01 02:00:00,Spring,0,0,Clear Skies,9.02,13.635,80,0.0,32
3,2011-01-01 03:00:00,Spring,0,0,Clear Skies,9.84,14.395,75,0.0,13
4,2011-01-01 04:00:00,Spring,0,0,Clear Skies,9.84,14.395,75,0.0,1


In [4]:
df.describe()

Unnamed: 0,holiday,workingday,temp,atemp,humidity,windspeed,count
count,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0
mean,0.028569,0.680875,20.23086,23.655084,61.88646,12.799395,191.574132
std,0.166599,0.466159,7.79159,8.474601,19.245033,8.164537,181.144454
min,0.0,0.0,0.82,0.76,0.0,0.0,1.0
25%,0.0,0.0,13.94,16.665,47.0,7.0015,42.0
50%,0.0,1.0,20.5,24.24,62.0,12.998,145.0
75%,0.0,1.0,26.24,31.06,77.0,16.9979,284.0
max,1.0,1.0,41.0,45.455,100.0,56.9969,977.0


In [5]:
df.isnull().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
count         0
dtype: int64

In [6]:
df['season'].unique()

array(['Spring', 'Summer', 'Fall', 'Winter'], dtype=object)

In [7]:
df['weather'].unique()

array(['Clear Skies', 'Partly Cloudy', 'Light Storms/Rain',
       'Heavy Storms/Rain'], dtype=object)

In [8]:
df['workingday'].unique()

array([0, 1])

In [9]:
# Re-import data with better data types and sort by 'datetime'

dtypes = {
    'season': 'category',
    'holiday': np.int8,
    'workingday': np.int8,
    'weather': 'category',
    'temp': np.float32,
    'atemp': np.float32,
    'humidity': np.int8,
    'windspeed': np.float32,
    'count': np.int16
}

df = pd.read_csv('/Users/cameronlefevre/Data Science/coding/GA-DS-Class/Homework/Unit3/data/bikeshare.csv', dtype=dtypes, parse_dates=['datetime'])

df.sort_values(by=['datetime'], ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)

Do we have missing data that we'll need to work around?

Yes. In addition to only have data for the first 19 days of each month, many days are missing a couple of entries.

In [10]:
count_by_day = df.groupby([df['datetime'].dt.year, df['datetime'].dt.month, df['datetime'].dt.day])['count'].count()

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(count_by_day)

datetime  datetime  datetime
2011      1         1           24
                    2           23
                    3           22
                    4           23
                    5           23
                    6           23
                    7           23
                    8           24
                    9           24
                    10          24
                    11          22
                    12          22
                    13          24
                    14          23
                    15          24
                    16          24
                    17          24
                    18          12
                    19          23
          2         1           23
                    2           24
                    3           23
                    4           23
                    5           24
                    6           24
                    7           24
                    8           24
                    9     

### Initial model to see where we're starting from

In [11]:
# initial model fitting to see where we're starting

pipe = make_pipeline(ce.TargetEncoder(), GradientBoostingRegressor())

X = df.drop(['datetime'], axis=1)
y = df['count']

scores = get_val_scores(pipe, X, y, return_test_score=True, randomize=False, test_size=0.1, val_size=0.1, use_kfold=False)

scores

{'validation_score': 0.9993924836633843, 'test_score': 0.9998530661906342}

### Feature Engineering

First, convert the 'datetime' column into more usable columns

In [17]:
# Add column for hour of day
df['hour_of_day'] = df['datetime'].dt.hour

# Add column for day of week
df['day_of_week'] = df['datetime'].dt.dayofweek

# Add column for day of month
df['day_of_month'] = df['datetime'].dt.day

# Add column for month
df['month'] = df['datetime'].dt.month

# Let's see how that impacted the score
fit_and_score(df)

Validation Set Score: 0.7752723581777791


How are bike rentals impacted by the hour of the day and day of the week?

In [106]:
df['day'] = df['datetime'].dt.day_name()

data = df.groupby(['hour_of_day', 'day'])['count'].mean().reset_index()
fig = px.line(data, x='hour_of_day', y='count', color='day', title='Bike Rentals by Hour by Day of Week')
fig.update_xaxes(nticks=24)
fig.show()

df.drop(['day'], axis=1, inplace=True)

Looks like rush hour is a big driver in bike rentals so let's add that as a column in case it helps.

In [107]:
# Looks like rush hour is relevant so we'll add a column for that
df['rush_hour'] = 0

df.loc[(df['hour_of_day'] >= 7) & (df['hour_of_day'] <= 9) & (df['workingday'] == 1), 'rush_hour'] = 1
df.loc[(df['hour_of_day'] >= 17) & (df['hour_of_day'] <= 19) & (df['workingday'] == 1), 'rush_hour'] = 1

# Let's see how that impacted the score
fit_and_score(df)

Validation Set Score: 0.8166997464723182


How are bike rentals impacted by month?

In [108]:
data = df.groupby(['month'])['count'].mean().reset_index()
fig = px.line(data, x='month', y='count', title='Bike Rentals by Month')
fig.update_xaxes(nticks=12)
fig.show()

How does weather impact bike rentals over a day?

In [109]:
df['day'] = df['datetime'].dt.day_name()

data = df.groupby(['hour_of_day', 'weather'])['count'].mean().reset_index()
fig = px.line(data, x='hour_of_day', y='count', color='weather', title='Bike Rentals by Hour by Weather')
fig.update_xaxes(nticks=24)
fig.show()

df.drop(['day'], axis=1, inplace=True)

In [110]:
# Why no line for Heavy Storms? Because there's only one record

df.groupby(['weather'])['count'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
weather,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Clear Skies,7192.0,205.236791,187.959566,1.0,48.0,161.0,305.0,977.0
Heavy Storms/Rain,1.0,164.0,,164.0,164.0,164.0,164.0,164.0
Light Storms/Rain,859.0,118.846333,138.581297,1.0,23.0,71.0,161.0,891.0
Partly Cloudy,2834.0,178.95554,168.366413,1.0,41.0,134.0,264.0,890.0


Aside from that one row for Heavy Storms, it does seem like storms will decrease rentals a bit.  Lets add column to flag whether the ride was during a storm or not in case that helps.

In [111]:
df['is_storming'] = 0

df.loc[df['weather'] == "Light Storms/Rain", 'is_storming'] = 1
df.loc[df['weather'] == "Heavy Storms/Rain", 'is_storming'] = 1

# Let's see how that impacted the score
fit_and_score(df)

Validation Set Score: 0.8176357415889302


#### How do rental counts from previous points in time impact rentals?

It feels like cheating to use the previous hour's rental count. It seems pretty unlikely that a real-world use case would have the previous hour's rental count available for modeling. I wrote this but am not including it.

In order to create a column with the previous hour's rental count, we have to check if the previous row is actually the previous hour.

In [None]:
"""
tmp_previous_hour_count = []

for i, row in df.iterrows():    
    if (row['datetime'] - timedelta(hours=1) == df.iloc[i-1]['datetime']):
        tmp_previous_hour_count.append(df.iloc[i-1]['count'])
    else:
        tmp_previous_hour_count.append(np.nan)

            
df['previous_hour_count'] = tmp_previous_hour_count

df = denote_null_values(df)
df = df.fillna(0)
fit_and_score(df, show_training_score=True)

"""

In real world use, we likely won't have the previous hour's rental count. Instead, we have this option to look at the rental count for the same hour from the previous day and previous week

There's probably a more effecient way to do this....

In [112]:
tmp_previous_day_count = []
tmp_previous_week_count = []

for i, row in df.iterrows():
    
    previous_day = row['datetime'] - timedelta(days=1)
    previous_week = row['datetime'] - timedelta(days=7)
    
    if previous_day in df['datetime'].values: 
        tmp_previous_day_count.append(df.loc[df['datetime'] == previous_day]['count'].values[0])
    else:
        tmp_previous_day_count.append(np.nan)
        
    if previous_week in df['datetime'].values: 
        tmp_previous_week_count.append(df.loc[df['datetime'] == previous_week]['count'].values[0])
    else:
        tmp_previous_week_count.append(np.nan)

df['previous_day_count'] = tmp_previous_day_count
df['previous_week_count'] = tmp_previous_week_count


In [113]:
# fit and score again
fit_and_score(df)

Validation Set Score: 0.8751140112912745


### Fine-tune the model

In [115]:
# Fine-tune the model

df = denote_null_values(df)
df = df.replace(np.nan, 0)

# create training & test sets
train, val, test = create_val_splits(df, return_val=True)

# split into X & y
X_train, y_train = train.drop('count', axis=1), train['count']
X_val, y_val = val.drop('count', axis=1), val['count']

n_estimators  = [100, 250, 500]
learning_rate = [.05, .1, .2]
max_depth     = [3, 4, 5, 6]

# and cycle through our model parameters
for estimators in n_estimators:
    for rate in learning_rate:
        for depth in max_depth:
            print(f"Fitting model with parameters:  n_estimators - {estimators}, learning_rate - {rate}, max_depth - {depth}")
            mod   = GradientBoostingRegressor(n_estimators=estimators, learning_rate=rate, max_depth=depth)
            pipe  = make_pipeline(ce.TargetEncoder(), mod)
            pipe.fit(X_train, y_train)
            training_score = pipe.score(X_train, y_train)
            val_score = pipe.score(X_val, y_val)
            print(f"Training score: {training_score}.\nOut-of-sample score: {val_score}.\nScore difference: {training_score - val_score}\n\n")

Fitting model with parameters:  n_estimators - 100, learning_rate - 0.05, max_depth - 3
Training score: 0.8598313856834803.
 Out-of-sample score: 0.8401775077911617.
Score difference: 0.01965387789231854


Fitting model with parameters:  n_estimators - 100, learning_rate - 0.05, max_depth - 4
Training score: 0.8976743824037086.
 Out-of-sample score: 0.8881949011509972.
Score difference: 0.009479481252711364


Fitting model with parameters:  n_estimators - 100, learning_rate - 0.05, max_depth - 5
Training score: 0.9266255368815445.
 Out-of-sample score: 0.9227202803724938.
Score difference: 0.003905256509050714


Fitting model with parameters:  n_estimators - 100, learning_rate - 0.05, max_depth - 6
Training score: 0.9483697245590572.
 Out-of-sample score: 0.9487995648165042.
Score difference: -0.0004298402574469673


Fitting model with parameters:  n_estimators - 100, learning_rate - 0.1, max_depth - 3
Training score: 0.8874876180993881.
 Out-of-sample score: 0.8751140112912745.
Score 

In [116]:
# best settings are:
# n_estimators - 500
# learning_rate - 0.2
# max_depth - 6

mod   = GradientBoostingRegressor(n_estimators=500, learning_rate=0.2, max_depth=6)
pipe  = make_pipeline(ce.TargetEncoder(), mod)

# KFold validation

TimeSplitter = TimeSeriesSplit(n_splits=10)

scores = cross_val_score(estimator=pipe, X=X_train, y=y_train, cv=TimeSplitter)

In [117]:
scores

array([0.63959028, 0.73425826, 0.82894208, 0.84707057, 0.8743428 ,
       0.86480393, 0.76913621, 0.69433203, 0.8098736 , 0.8645033 ])

In [118]:
scores.mean()

0.7926853052239933

In [123]:
mod   = GradientBoostingRegressor(n_estimators=500, learning_rate=0.2, max_depth=6)
pipe  = make_pipeline(ce.TargetEncoder(), mod)

# create training & test sets
train, test = create_val_splits(df)

# split into X & y
X_train, y_train = train.drop('count', axis=1), train['count']
X_test, y_test   = test.drop('count', axis=1), test['count']

pipe.fit(X_train, y_train)

pipe.score(X_test, y_test)

0.8610761720504639

### Define Functions

__create_val_splits__ - Similar function from class to split dataset into training, val, and test sets.  One change: User can specify percent size for the validation & test split, rather than having to specify an integer

__denote_null_values__ - Same function from class

__fit_and_score__ - Since I'll be fitting & scoring a lot, I put these into a function to make it quicker to call. Also has the ability to show a Feature Importance table

In [355]:
def create_val_splits(df, val_percent=.1, return_val=False):
    # Function that will take in a dataset and split it up into training, validation, and test sets
    # split into training, validation, and test sets
    df = df.drop('datetime', axis=1)
    split_size = int(len(df.index) * val_percent)
    
    train = df.iloc[:-split_size].copy().reset_index(drop=True)
    test  = df.iloc[-split_size:].copy().reset_index(drop=True)

    if return_val:
        train = train.iloc[:-split_size].copy().reset_index(drop=True)
        val   = train.iloc[-split_size:].copy().reset_index(drop=True)
        return train, val, test
    else:
        return train, test
    
    
def denote_null_values(df):
    """Denotes whether or not there are null values or not"""
    empty_cols_query = df.isnull().sum() > 0
    empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()
    for col in empty_df_cols:
        col_name = f"{col}_missing"
        df[col_name] = pd.isnull(df[col])
    return df


def fit_and_score(df, show_training_score=False, show_feature_importance=False):
    pipe = make_pipeline(ce.TargetEncoder(), GradientBoostingRegressor())
    train, val, test = create_val_splits(df, return_val=True)

    # split into X & y
    X_train, y_train = train.drop('count', axis=1), train['count']
    X_val, y_val = val.drop('count', axis=1), val['count']

    # fit & score
    pipe.fit(X_train, y_train)
    
    if show_training_score:
        print(f'Training Set Score: {pipe.score(X_train, y_train)}')
    
    print(f'Validation Set Score: {pipe.score(X_val, y_val)}')
    
    if show_feature_importance:
        feats = pd.DataFrame({
            'Columns': X_train.columns,
            'Importance': pipe[1].feature_importances_
        }).sort_values(by='Importance', ascending=False)

        print(f'\nFeature Importance:\n{feats}')


In [272]:
# initial model fitting to see where we're starting
fit_and_score(df, show_feature_importance=True)

Validation Set Score: 0.3150735578958531

Feature Importance:
      Columns  Importance
5       atemp    0.425596
6    humidity    0.335576
4        temp    0.126648
0      season    0.048030
2  workingday    0.034379
7   windspeed    0.021627
3     weather    0.006480
1     holiday    0.001664


## Exploratory Data Analysis

### Do we have missing data?
Yes. In addition to only have data for the first 19 days of each month, many days are missing a couple of entries.

In [297]:
count_by_day = df.groupby([df['datetime'].dt.year, df['datetime'].dt.month, df['datetime'].dt.day])['count'].count()

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(count_by_day)

datetime  datetime  datetime
2011      1         1           24
                    2           23
                    3           22
                    4           23
                    5           23
                    6           23
                    7           23
                    8           24
                    9           24
                    10          24
                    11          22
                    12          22
                    13          24
                    14          23
                    15          24
                    16          24
                    17          24
                    18          12
                    19          23
          2         1           23
                    2           24
                    3           23
                    4           23
                    5           24
                    6           24
                    7           24
                    8           24
                    9     

In [356]:
# Let's add a bunch of basic date/time columns to see if they will improve the score

# Add column for hour of day
df['hour_of_day'] = df['datetime'].dt.hour

# Add column for day of week
df['day_of_week'] = df['datetime'].dt.dayofweek

# Add column for day of month
df['day_of_month'] = df['datetime'].dt.day

# Add column for month
df['month'] = df['datetime'].dt.month

# fit and score again
fit_and_score(df, show_feature_importance=True)

Validation Set Score: -74.99556090208911

Feature Importance:
         Columns  Importance
4           temp    0.288723
11         month    0.198400
5          atemp    0.123168
6       humidity    0.102170
10  day_of_month    0.085599
0         season    0.079593
9    day_of_week    0.044977
7      windspeed    0.029627
8    hour_of_day    0.025554
2     workingday    0.012283
3        weather    0.006281
1        holiday    0.003625


### Do rentals go up during certain times of the day, like rush hour?

In [284]:
df.groupby([df['hour_of_day']])['count'].mean()

hour_of_day
0      55.138462
1      33.859031
2      22.899554
3      11.757506
4       6.407240
5      19.767699
6      76.259341
7     213.116484
8     362.769231
9     221.780220
10    175.092308
11    210.674725
12    256.508772
13    257.787281
14    243.442982
15    254.298246
16    316.372807
17    468.765351
18    430.859649
19    315.278509
20    228.517544
21    173.370614
22    133.576754
23     89.508772
Name: count, dtype: float64

In [285]:
# Looks like rush hour is relevant so we'll add a column for that
df['rush_hour'] = 0

df.loc[(df['hour_of_day'] >= 7) & (df['hour_of_day'] <= 9), 'rush_hour'] = 1
df.loc[(df['hour_of_day'] >= 17) & (df['hour_of_day'] <= 19), 'rush_hour'] = 1

# fit and score again
fit_and_score(df)

Validation Set Score: 0.799232502196229


### How does the history of rentals at this location impact future rentals?

In [305]:
# Feels like cheating to use the previous hour's rental count. It seems pretty unlikely that
# a real-world use case would have the previous hour's rental count available for modeling. 
# I wrote this but am not including it.

# In order to create a column with the previous hour's rental count, we have to check
# if the previous row is actually the previous hour.

"""
tmp_previous_hour_count = []

for i, row in df.iterrows():    
    if (row['datetime'] - timedelta(hours=1) == df.iloc[i-1]['datetime']):
        tmp_previous_hour_count.append(df.iloc[i-1]['count'])
    else:
        tmp_previous_hour_count.append(np.nan)

            
df['previous_hour_count'] = tmp_previous_hour_count

df = denote_null_values(df)
df = df.fillna(0)
fit_and_score(df, show_training_score=True)

"""

"\ntmp_previous_hour_count = []\n\nfor i, row in df.iterrows():    \n    if (row['datetime'] - timedelta(hours=1) == df.iloc[i-1]['datetime']):\n        tmp_previous_hour_count.append(df.iloc[i-1]['count'])\n    else:\n        tmp_previous_hour_count.append(np.nan)\n\n            \ndf['previous_hour_count'] = tmp_previous_hour_count\n\ndf = denote_null_values(df)\ndf = df.fillna(0)\nfit_and_score(df, show_training_score=True)\n\n"

In [None]:
# In real world use, we likely won't have the previous hour's rental count.
# Instead, we have this option to look at the rental count for the same hour from the 
# previous day and previous week

# There's probably a more effecient way to do this....

tmp_previous_day_count = []
tmp_previous_week_count = []

for i, row in df.iterrows():
    
    previous_day = row['datetime'] - timedelta(days=1)
    previous_week = row['datetime'] - timedelta(days=7)
    
    if previous_day in df['datetime'].values: 
        tmp_previous_day_count.append(df.loc[df['datetime'] == previous_day]['count'].values[0])
    else:
        tmp_previous_day_count.append(np.nan)
        
    if previous_week in df['datetime'].values: 
        tmp_previous_week_count.append(df.loc[df['datetime'] == previous_week]['count'].values[0])
    else:
        tmp_previous_week_count.append(np.nan)

df['previous_day_count'] = tmp_previous_day_count
df['previous_week_count'] = tmp_previous_week_count

### How does weather impact rentals?

In [278]:
# How does atemp impact rentals

df.groupby(['atemp'])['count'].mean()

atemp
0.760000       1.000000
1.515000       3.000000
2.275000      38.000000
3.030000      82.285714
3.790000      39.062500
4.545000      66.090909
5.305000      63.200000
6.060000      64.876712
6.820000      56.380952
7.575000      55.933333
8.335000      58.444444
9.090000      80.000000
9.850000      81.456693
10.605000     95.951807
11.365000     90.442804
12.120000    102.656410
12.880000     89.518219
13.635000     94.308017
14.395000    116.483271
15.150000    133.967456
15.910000    133.897638
16.665001    148.509186
17.424999    147.799363
18.180000    133.585366
18.940001    149.555556
19.695000    179.682353
20.455000    170.217500
21.209999    182.609551
21.969999    160.878049
22.725000    159.692118
23.485001    185.058824
24.240000    204.672783
25.000000    195.109589
25.760000    179.626478
26.514999    212.392405
27.275000    200.503546
28.030001    133.312500
28.790001    142.771429
29.545000    151.046693
30.305000    227.291429
31.059999    308.323398
31.820000 

In [303]:
# How does season impact rentals

df.groupby(['season'])['count'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Fall,2733.0,234.417124,197.151001,1.0,68.0,195.0,347.0,977.0
Spring,2686.0,116.343261,125.273974,1.0,24.0,78.0,164.0,801.0
Summer,2733.0,215.251372,192.007843,1.0,49.0,172.0,321.0,873.0
Winter,2734.0,198.988296,177.622409,1.0,51.0,161.0,294.0,948.0


In [302]:
# How does weather impact rentals

df.groupby(['weather'])['count'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
weather,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Clear Skies,7192.0,205.236791,187.959566,1.0,48.0,161.0,305.0,977.0
Heavy Storms/Rain,1.0,164.0,,164.0,164.0,164.0,164.0,164.0
Light Storms/Rain,859.0,118.846333,138.581297,1.0,23.0,71.0,161.0,891.0
Partly Cloudy,2834.0,178.95554,168.366413,1.0,41.0,134.0,264.0,890.0


In [304]:
# Aside from that one row for Heavy Storms, it does seem like storms will decrease rentals

# Add column to flag whether the ride was during a storm or not
df['is_storming'] = 0

df.loc[df['weather'] == "Light Storms/Rain", 'is_storming'] = 1
df.loc[df['weather'] == "Heavy Storms/Rain", 'is_storming'] = 1

In [288]:
# How does windspeed impact rentals

df.groupby(['windspeed'])['count'].mean()

windspeed
0.000000     161.101295
6.003200     147.864679
7.001500     169.852031
8.998100     175.645536
11.001400    202.262062
12.998000    202.249520
15.001300    210.833507
16.997900    214.847087
19.001200    218.051775
19.999500    225.235772
22.002800    185.053763
23.999399    220.010949
26.002701    228.744681
27.999300    219.363636
30.002600    217.171171
31.000900    208.955056
32.997501    184.075000
35.000801    230.155172
36.997398    197.045455
39.000702    176.888889
40.997299    189.363636
43.000599    137.916667
43.998901    192.375000
46.002201     67.333333
47.998798    140.500000
50.002102    171.000000
51.998699      5.000000
56.996899    269.500000
Name: count, dtype: float64

In [240]:
# In real world use, we likely won't have the previous hour's rental count
# so we have this option to look at the rental count for the same hour from the 
# previous day and previous week

# There's probably a more effecient way to do this....

tmp_previous_day_count = []
tmp_previous_week_count = []

for i, row in df.iterrows():
    
    previous_day = row['datetime'] - timedelta(days=1)
    previous_week = row['datetime'] - timedelta(days=7)
    
    if previous_day in df['datetime'].values: 
        tmp_previous_day_count.append(df.loc[df['datetime'] == previous_day]['count'].values[0])
    else:
        tmp_previous_day_count.append(np.nan)
        
    if previous_week in df['datetime'].values: 
        tmp_previous_week_count.append(df.loc[df['datetime'] == previous_week]['count'].values[0])
    else:
        tmp_previous_week_count.append(np.nan)

df['previous_day_count'] = tmp_previous_day_count
df['previous_week_count'] = tmp_previous_week_count

In [242]:
# We'll stop here on feature engineering and check our scores and feature importance
df = denote_null_values(df)
df = df.fillna(0)

fit_and_score(df, show_training_score=True, show_feature_importance=True)

Training Set Score: 0.8749104152716035
Validation Set Score: 0.8575537404314376

Feature Importance:
                        Columns  Importance
13           previous_day_count    0.565241
14          previous_week_count    0.139927
8                   hour_of_day    0.121000
5                         atemp    0.036642
2                    workingday    0.030732
12                    rush_hour    0.029629
6                      humidity    0.019855
3                       weather    0.018491
10                 day_of_month    0.014303
9                   day_of_week    0.012507
4                          temp    0.009132
7                     windspeed    0.001066
11                        month    0.000893
0                        season    0.000250
15   previous_day_count_missing    0.000243
1                       holiday    0.000088
16  previous_week_count_missing    0.000000


In [243]:
# Drop the columns that aren't helping
# This list would be different if we included previous_hour_count in our modeling

df.drop(['month', 
         'previous_day_count_missing',
         'season', 
         'holiday', 
         'previous_week_count_missing'], axis=1, inplace=True)



In [244]:
# We'll stop here on feature engineering and check our scores and feature importance
df = denote_null_values(df)
df = df.fillna(0)

fit_and_score(df, show_training_score=True, show_feature_importance=True)

Training Set Score: 0.8759040883351111
Validation Set Score: 0.8586461505474074

Feature Importance:
                Columns  Importance
10   previous_day_count    0.564280
11  previous_week_count    0.140284
6           hour_of_day    0.122178
3                 atemp    0.038452
0            workingday    0.032157
9             rush_hour    0.029612
4              humidity    0.019613
1               weather    0.018660
8          day_of_month    0.013369
7           day_of_week    0.011791
2                  temp    0.008613
5             windspeed    0.000989


In [245]:
# Fine-tune the model

# create training & test sets
train, val, test = create_val_splits(df, return_val=True)

# split into X & y
X_train, y_train = train.drop('count', axis=1), train['count']
X_val, y_val = val.drop('count', axis=1), val['count']

n_estimators  = [100, 250, 500]
learning_rate = [.05, .1, .2]
max_depth     = [3, 4, 5, 6]

# and cycle through our model parameters
for estimators in n_estimators:
    for rate in learning_rate:
        for depth in max_depth:
            print(f"Fitting model with parameters:  n_estimators - {estimators}, learning_rate - {rate}, max_depth - {depth}")
            mod   = GradientBoostingRegressor(n_estimators=estimators, learning_rate=rate, max_depth=depth)
            pipe  = make_pipeline(ce.TargetEncoder(), mod)
            pipe.fit(X_train, y_train)
            training_score = pipe.score(X_train, y_train)
            val_score = pipe.score(X_val, y_val)
            print(f"Training score: {training_score}.\n Out-of-sample score: {val_score}.\nScore difference: {training_score - val_score}\n\n")

Fitting model with parameters:  n_estimators - 100, learning_rate - 0.05, max_depth - 3
Training score: 0.8524796442662717.  Out-of-sample score: 0.8244698015540509. Score difference: 0.028009842712220734

Fitting model with parameters:  n_estimators - 100, learning_rate - 0.05, max_depth - 4
Training score: 0.8888918089410122.  Out-of-sample score: 0.8747206188472126. Score difference: 0.014171190093799524

Fitting model with parameters:  n_estimators - 100, learning_rate - 0.05, max_depth - 5
Training score: 0.9177031482658777.  Out-of-sample score: 0.9134335542027598. Score difference: 0.0042695940631178875

Fitting model with parameters:  n_estimators - 100, learning_rate - 0.05, max_depth - 6
Training score: 0.9395834060496114.  Out-of-sample score: 0.9417495028838953. Score difference: -0.002166096834283815

Fitting model with parameters:  n_estimators - 100, learning_rate - 0.1, max_depth - 3
Training score: 0.8759040883351111.  Out-of-sample score: 0.8586461505474074. Score dif

In [257]:
# best settings are:
# n_estimators - 500
# learning_rate - 0.2
# max_depth - 6

mod   = GradientBoostingRegressor(n_estimators=500, learning_rate=0.2, max_depth=6)
pipe  = make_pipeline(ce.TargetEncoder(), mod)

# KFold validation

# create training & test sets
train, val, test = create_val_splits(df, return_val=True)

# split into X & y
X_train, y_train = train.drop('count', axis=1), train['count']
X_val, y_val = val.drop('count', axis=1), val['count']
X_test, y_test = test.drop('count', axis=1), test['count']

TimeSplitter = TimeSeriesSplit(n_splits=10)

scores = cross_val_score(estimator=pipe, X=X_train, y=y_train, cv=TimeSplitter)

0.9980887797231751

In [249]:
scores

array([0.68214301, 0.70086594, 0.72910495, 0.72237737, 0.85523153,
       0.82652577, 0.88311853, 0.89350435, 0.7809867 , 0.84265843,
       0.84866671, 0.87338528, 0.77526483, 0.87611667, 0.8170452 ,
       0.83528697, 0.86535836, 0.85650642, 0.89429496, 0.83654006])

In [250]:
scores.mean()

0.8197491020147062

In [266]:
pipe.score(X_test, y_test)

0.8430434440420762

In [320]:
df.to_csv('/Users/cameronlefevre/Data Science/coding/GA-DS-Class/Homework/Unit3/data/bikeshare_check.csv')

In [97]:
feats['Importance'].cumsum()

10    0.687476
12    0.821739
9     0.880944
8     0.923227
14    0.962980
2     0.988269
3     0.993217
5     0.998028
4     0.998632
6     0.999142
13    0.999520
15    0.999792
7     0.999934
0     0.999981
1     1.000000
11    1.000000
Name: Importance, dtype: float64

In [98]:
feats.cumsum()

Unnamed: 0,Columns,Importance
10,previous_hour_count,0.687476
12,previous_hour_countprevious_day_count,0.821739
9,previous_hour_countprevious_day_countrush_hour,0.880944
8,previous_hour_countprevious_day_countrush_hour...,0.923227
14,previous_hour_countprevious_day_countrush_hour...,0.96298
2,previous_hour_countprevious_day_countrush_hour...,0.988269
3,previous_hour_countprevious_day_countrush_hour...,0.993217
5,previous_hour_countprevious_day_countrush_hour...,0.998028
4,previous_hour_countprevious_day_countrush_hour...,0.998632
6,previous_hour_countprevious_day_countrush_hour...,0.999142


In [98]:
# DO NOT USE THESE, THEY DID NOT HELP

# Add column to flag whether the ride was during a storm or not
df['is_storming'] = 0

df.loc[df['weather'] == "Light Storms/Rain", 'is_storming'] = 1
df.loc[df['weather'] == "Heavy Storms/Rain", 'is_storming'] = 1




0.804370252517772

In [88]:
pipe.score(X_train, y_train), pipe.score(X_val, y_val)

(0.8251892371442333, 0.804370252517772)

In [91]:
df.groupby(['weather','season'])['count'].mean()

weather            season
Clear Skies        Fall      243.583420
                   Spring    126.781694
                   Summer    236.729595
                   Winter    209.511163
Heavy Storms/Rain  Fall             NaN
                   Spring    164.000000
                   Summer           NaN
                   Winter           NaN
Light Storms/Rain  Fall      156.582915
                   Spring     61.227488
                   Summer    123.906250
                   Winter    134.466667
Partly Cloudy      Fall      230.771523
                   Spring    106.861538
                   Summer    189.515537
                   Winter    194.784387
Name: count, dtype: float64

In [None]:
df['7DayAvg_new']  = df.groupby('hour_of_day').apply(lambda x: x['count'].rolling(7).mean().shift()).values

In [None]:

#df['7DayAvg']  = df.groupby('hour_of_day').apply(lambda x: x['count'].rolling(7).mean().shift()).values

new_thing = df.groupby('hour_of_day').apply(lambda x: x['count'].rolling(7).mean().shift()).values
np.set_printoptions(threshold=np.inf)
print(new_thing)

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(new_thing.get_group(0)[['datetime', 'count']])

In [None]:
grouped_df = df.groupby('hour_of_day')[['datetime','hour_of_day', 'count','7DayAvg']]

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(grouped_df.get_group(0))

#for key, item in grouped_df:
#    print(grouped_df.get_group(key), "\n\n")

In [None]:
df.to_csv('/Users/cameronlefevre/Data Science/coding/GA-DS-Class/Homework/Unit3/data/bikeshare_check.csv')

In [None]:
# Help with rolling hourly average