<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Bike-Demand-Data" data-toc-modified-id="Bike-Demand-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Bike Demand Data</a></span></li></ul></div>

In [1]:
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# change default style figure and font size
plt.rcParams['figure.figsize'] = 8, 6
plt.rcParams['font.size'] = 12

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,sklearn,matplotlib

Ethen 2019-01-26 13:17:20 

CPython 3.6.4
IPython 6.4.0

numpy 1.14.2
pandas 0.23.4
sklearn 0.20.2
matplotlib 2.2.2


# Bike Demand Data

In [2]:
import os

# https://www.kaggle.com/c/bike-sharing-demand/data
data_dir = 'all'
data_path = os.path.join(data_dir, 'train.csv')
df = pd.read_csv(data_path)
print(df.shape)
df.head()

(10886, 12)


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [3]:
datetime = 'datetime'
year = 'year'
month = 'month'
dayofyear = 'dayofyear'
hour = 'hour'


def generate_time_features(df):
    df[datetime] = pd.to_datetime(df[datetime])
    df[year] = df[datetime].dt.year
    df[month] = df[datetime].dt.month
    df[hour] = df[datetime].dt.hour
    df[dayofyear] = df[datetime].dt.dayofyear
    df = df.drop(datetime, axis=1)
    return df

In [4]:
df = generate_time_features(df)
df.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,hour,dayofyear
0,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,0,1
1,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,1,1
2,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,2,1
3,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011,1,3,1
4,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011,1,4,1


In [5]:
cat_cols = [
    'season', 'holiday', 'workingday', 'weather',
    year, month
]
num_cols = [
    'temp', 'atemp', 'humidity', 'windspeed', 'casual'
]

for cat_col in cat_cols:
    df[cat_col] = df[cat_col].astype('category')

df.dtypes

season        category
holiday       category
workingday    category
weather       category
temp           float64
atemp          float64
humidity         int64
windspeed      float64
casual           int64
registered       int64
count            int64
year          category
month         category
hour             int64
dayofyear        int64
dtype: object

In [6]:
count = 'count'

label = df[count]
df = df.drop(count, axis=1)
df.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,year,month,hour,dayofyear
0,1,0,0,1,9.84,14.395,81,0.0,3,13,2011,1,0,1
1,1,0,0,1,9.02,13.635,80,0.0,8,32,2011,1,1,1
2,1,0,0,1,9.02,13.635,80,0.0,5,27,2011,1,2,1
3,1,0,0,1,9.84,14.395,75,0.0,3,10,2011,1,3,1
4,1,0,0,1,9.84,14.395,75,0.0,0,1,2011,1,4,1


In [7]:
from lightgbm import LGBMRegressor


model_lgb = LGBMRegressor(
    n_jobs = -1,
    max_depth = 5,
    min_data_in_leaf = 100,
    subsample = 0.9,
    n_estimators = 80,
    learning_rate = 0.1,
    colsample_bytree = 0.9,
    objective = 'regression',
    boosting_type = 'gbdt'
)
model_lgb.fit(df, label)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
       learning_rate=0.1, max_depth=5, min_child_samples=20,
       min_child_weight=0.001, min_data_in_leaf=100, min_split_gain=0.0,
       n_estimators=80, n_jobs=-1, num_leaves=31, objective='regression',
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=0.9, subsample_for_bin=200000, subsample_freq=1)

In [8]:
from sklearn.metrics import r2_score, mean_squared_error

y_pred = model_lgb.predict(df)

print('r2: ', r2_score(label, y_pred))
print('mean squared error: ', mean_squared_error(label, y_pred))

r2:  0.9987738776554763
mean squared error:  40.22944059491395


In [12]:
from sklearn.preprocessing import OneHotEncoder

workingday = 'workingday'

one_hot = OneHotEncoder(categories='auto', sparse=False)
X = one_hot.fit_transform(df[[workingday]])
print(one_hot.categories_)
X

[array([0, 1])]


array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [14]:
import statsmodels.api as sm

X = sm.add_constant(X) # adding a constant
y = label

model = sm.OLS(y, X[:, :-1]).fit()
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:                  count   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.463
Date:                Sat, 26 Jan 2019   Prob (F-statistic):              0.226
Time:                        13:18:20   Log-Likelihood:                -72045.
No. Observations:               10886   AIC:                         1.441e+05
Df Residuals:                   10884   BIC:                         1.441e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        193.0119      2.104     91.735      0.0

In [16]:
predictions = model.predict(X[:, :-1])
predictions[:5]

array([188.50662061, 188.50662061, 188.50662061, 188.50662061,
       188.50662061])