This next model predicts how many bikes are taken out during a given hour under a given configuration of variables (month, hour of day, and starting station).

Process:

1. calculate number of bikes checked out at each station at each point of the day
2. use these data points to predict the number of bikes for a particular set of variables

(a lot of repeated code from training of the location model)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import pickle
%matplotlib inline

In [2]:
# Load the raw trip records; the path is relative to the notebook's working directory.
data = pd.read_csv('./data/metro-bike-share-trip-data.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,Trip ID,Duration,Start Time,End Time,Starting Station ID,Starting Station Latitude,Starting Station Longitude,Ending Station ID,Ending Station Latitude,Ending Station Longitude,Bike ID,Plan Duration,Trip Route Category,Passholder Type,Starting Lat-Long,Ending Lat-Long
0,1912818,180,2016-07-07T04:17:00,2016-07-07T04:20:00,3014.0,34.05661,-118.23721,3014.0,34.05661,-118.23721,6281.0,30.0,Round Trip,Monthly Pass,"{'longitude': '-118.23721', 'latitude': '34.05...","{'longitude': '-118.23721', 'latitude': '34.05..."
1,1919661,1980,2016-07-07T06:00:00,2016-07-07T06:33:00,3014.0,34.05661,-118.23721,3014.0,34.05661,-118.23721,6281.0,30.0,Round Trip,Monthly Pass,"{'longitude': '-118.23721', 'latitude': '34.05...","{'longitude': '-118.23721', 'latitude': '34.05..."
2,1933383,300,2016-07-07T10:32:00,2016-07-07T10:37:00,3016.0,34.052898,-118.24156,3016.0,34.052898,-118.24156,5861.0,365.0,Round Trip,Flex Pass,"{'longitude': '-118.24156', 'latitude': '34.05...","{'longitude': '-118.24156', 'latitude': '34.05..."
3,1944197,10860,2016-07-07T10:37:00,2016-07-07T13:38:00,3016.0,34.052898,-118.24156,3016.0,34.052898,-118.24156,5861.0,365.0,Round Trip,Flex Pass,"{'longitude': '-118.24156', 'latitude': '34.05...","{'longitude': '-118.24156', 'latitude': '34.05..."
4,1940317,420,2016-07-07T12:51:00,2016-07-07T12:58:00,3032.0,34.049889,-118.25588,3032.0,34.049889,-118.25588,6674.0,0.0,Round Trip,Walk-up,"{'longitude': '-118.25588', 'latitude': '34.04...","{'longitude': '-118.25588', 'latitude': '34.04..."


In [4]:
# Preps the dataframe with dummy variables and scaling.
# Returns a single dataframe that still contains the target column.
def prep_df(data, target_var, cat_vars=None, cont_vars=None):
    """Build a modelling frame from the selected columns of ``data``.

    Only ``cont_vars``, ``cat_vars`` and ``target_var`` are kept, and rows with
    a missing value in any of them are dropped. Each categorical column is
    replaced by its dummy columns (first level dropped), and each continuous
    column is standardised to zero mean and unit population standard deviation.

    Args:
        data: source DataFrame; it is not modified.
        target_var: name of the column to predict (kept as-is in the result).
        cat_vars: names of categorical columns to one-hot encode.
        cont_vars: names of continuous columns to standardise.

    Returns:
        A new DataFrame with the continuous columns, the target column and
        the generated dummy columns.

    Raises:
        KeyError: if any requested column is missing from ``data``.
    """
    cat_vars = [] if cat_vars is None else list(cat_vars)
    cont_vars = [] if cont_vars is None else list(cont_vars)
    total_vars = cont_vars + cat_vars + [target_var]

    # Explicit copy so the column assignments below never write into `data`.
    cleaned_df = data[total_vars].dropna().copy()

    # turns categorical vars into dummy vars
    for var in cat_vars:
        dummies = pd.get_dummies(cleaned_df[var], prefix=var, drop_first=True)
        cleaned_df = pd.concat([cleaned_df.drop(columns=[var]), dummies], axis=1)

    # normalize the data (same convention as sklearn.preprocessing.scale:
    # population std, and a constant column is only centred, not divided)
    for var in cont_vars:
        values = cleaned_df[var].astype(float)
        std = values.std(ddof=0)
        if not std > 0:  # zero variance, or NaN for an empty column
            std = 1.0
        cleaned_df[var] = (values - values.mean()) / std

    return cleaned_df

In [5]:
# 'Start Time' looks like 2016-07-07T04:17:00:
#   month   -> characters 5:7
#   hour    -> characters -8:-6
#   minutes -> characters -5:-3 (used to round the hour)
def add_time_var(df):
    """Return a copy of ``df`` with ``Month`` and ``Time_of_Day`` columns added.

    ``Month`` is the integer month taken from the 'Start Time' string.
    ``Time_of_Day`` is the start hour rounded to the nearest hour: minutes
    greater than 30 round up, and hour 24 wraps to 0. Only the hour wraps;
    ``Month`` is always read from the original timestamp. Seconds are ignored.

    Args:
        df: DataFrame with a 'Start Time' column of ISO-like timestamp strings.

    Returns:
        A new DataFrame with the same index as ``df`` plus the two columns.

    Raises:
        KeyError: if 'Start Time' is missing.
        ValueError: if a timestamp's month, hour or minute slice is not an
            integer (for example a missing value).
    """
    df = df.copy()
    start = df['Start Time'].astype(str)

    hour = start.str[-8:-6].astype(int)
    minutes = start.str[-5:-3].astype(int)
    month = start.str[5:7].astype(int)

    # These Series carry df's own index, so assignment lines up row by row
    # even when the index is not 0..n-1 (e.g. after filtering).
    df['Month'] = month
    # calculates time of entry to the nearest hour; % 24 maps 24 back to 0
    df['Time_of_Day'] = (hour + (minutes > 30).astype(int)) % 24
    return df

In [6]:
def train_frequency_model(df, target_var, random_seed=100, pickle_configure=None):
    '''
    Fit a linear regression predicting target_var from every other column.

    df: dataframe (after prep_df); all columns except target_var are features
    target_var: variable to predict
    random_seed: currently unused (no train/test split is performed here);
        kept so existing callers keep working
    pickle_configure: optional dict with keys
        'perform'  - whether to pickle the fitted model (default False)
        'filename' - where to write it (default 'default.p')

    Returns the fitted LinearRegression model.
    '''
    # None instead of a dict literal so no default object is shared between calls
    if pickle_configure is None:
        pickle_configure = {}

    training_vars = [v for v in df.columns if v != target_var]

    model = LinearRegression()
    model.fit(df[training_vars], df[target_var])

    # saves model for later use
    if pickle_configure.get('perform', False):
        # context manager guarantees the file is flushed and closed
        with open(pickle_configure.get('filename', 'default.p'), 'wb') as fh:
            pickle.dump(model, fh)

    return model

# Constructing Dataframe

The first step is to create a dataframe with the number of bikes rented at each station for each month and hour of the day. These rows will serve as the training examples.

In [7]:
# Add the Month and Time_of_Day (start hour, rounded) columns to a copy of the trips.
time_df = add_time_var(data)

In [8]:
# Check the two new columns at the right-hand side of the frame.
time_df.head(5)

Unnamed: 0,Trip ID,Duration,Start Time,End Time,Starting Station ID,Starting Station Latitude,Starting Station Longitude,Ending Station ID,Ending Station Latitude,Ending Station Longitude,Bike ID,Plan Duration,Trip Route Category,Passholder Type,Starting Lat-Long,Ending Lat-Long,Month,Time_of_Day
0,1912818,180,2016-07-07T04:17:00,2016-07-07T04:20:00,3014.0,34.05661,-118.23721,3014.0,34.05661,-118.23721,6281.0,30.0,Round Trip,Monthly Pass,"{'longitude': '-118.23721', 'latitude': '34.05...","{'longitude': '-118.23721', 'latitude': '34.05...",7,4
1,1919661,1980,2016-07-07T06:00:00,2016-07-07T06:33:00,3014.0,34.05661,-118.23721,3014.0,34.05661,-118.23721,6281.0,30.0,Round Trip,Monthly Pass,"{'longitude': '-118.23721', 'latitude': '34.05...","{'longitude': '-118.23721', 'latitude': '34.05...",7,6
2,1933383,300,2016-07-07T10:32:00,2016-07-07T10:37:00,3016.0,34.052898,-118.24156,3016.0,34.052898,-118.24156,5861.0,365.0,Round Trip,Flex Pass,"{'longitude': '-118.24156', 'latitude': '34.05...","{'longitude': '-118.24156', 'latitude': '34.05...",7,11
3,1944197,10860,2016-07-07T10:37:00,2016-07-07T13:38:00,3016.0,34.052898,-118.24156,3016.0,34.052898,-118.24156,5861.0,365.0,Round Trip,Flex Pass,"{'longitude': '-118.24156', 'latitude': '34.05...","{'longitude': '-118.24156', 'latitude': '34.05...",7,11
4,1940317,420,2016-07-07T12:51:00,2016-07-07T12:58:00,3032.0,34.049889,-118.25588,3032.0,34.049889,-118.25588,6674.0,0.0,Round Trip,Walk-up,"{'longitude': '-118.25588', 'latitude': '34.04...","{'longitude': '-118.25588', 'latitude': '34.04...",7,13


In [9]:
# One row per (month, rounded hour, starting station) with the number of trips.
# The count is a total over every day in the data that falls in that month and
# hour, not a per-day average. count() tallies non-null 'Bike ID' values, so
# trips with a missing Bike ID are not included.
group_keys = ['Month', 'Time_of_Day', 'Starting Station ID']
temp_df = (
    time_df.groupby(group_keys)['Bike ID']
    .count()
    .reset_index()
    .rename(columns={'Bike ID': 'Count'})
)

# All three keys are treated as categorical, so each becomes a set of dummy columns.
model_df = prep_df(temp_df, 'Count', cat_vars=group_keys)

In [10]:
# Number of training rows, one per (month, hour, station) group that survived prep_df.
len(model_df.index)

10518

In [11]:
# Aggregated counts before dummy encoding.
temp_df.head()

Unnamed: 0,Month,Time_of_Day,Starting Station ID,Count
0,1,0,3005.0,5
1,1,0,3006.0,1
2,1,0,3007.0,1
3,1,0,3008.0,10
4,1,0,3014.0,1


In [12]:
# The same rows after the categorical columns were replaced by dummy columns.
model_df.head()

Unnamed: 0,Count,Month_2,Month_3,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12,Time_of_Day_1,...,Starting Station ID_3074.0,Starting Station ID_3075.0,Starting Station ID_3076.0,Starting Station ID_3077.0,Starting Station ID_3078.0,Starting Station ID_3079.0,Starting Station ID_3080.0,Starting Station ID_3081.0,Starting Station ID_3082.0,Starting Station ID_4108.0
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Settings in the shape train_frequency_model's pickle_configure parameter
# expects: pickle the fitted model and write it to frequency.p.
pickle_configure = dict(perform=True, filename='frequency.p')

In [14]:
# Pass the pickle settings defined in the previous cell; without them the
# function falls back to its default and the fitted model is never saved.
frequency_model = train_frequency_model(model_df, 'Count', pickle_configure=pickle_configure)

In [15]:
# Spot-check a single training row. Despite its name, dep_vars holds the
# feature (independent) columns: every column except the 'Count' target.
dep_vars = list(model_df.columns[model_df.columns != 'Count'])
sample_row = model_df.iloc[9001:9002]
frequency_model.predict(sample_row[dep_vars])

array([16.73047569])

In [16]:
# Actual count for the same row, to compare with the prediction above.
model_df[9001:9002]['Count']

9001    10
Name: Count, dtype: int64

It's interesting that for this one example there is still a sizeable gap between the prediction (about 16.7) and the actual count (10), even though the example comes from the training set. But as a rough estimate it may be fine.

Another consideration is scaling the predictions. The simulation will not have the same number of bikes as the real world, so I will probably divide the number of bikes in the simulation by the number of bikes in the real world, and then use this ratio to scale down the predicted number of bikes rented out.

In [19]:
# pickle the model for the API's use
# pickle.dump(frequency_model, open('frequency.p', 'wb'))