In [2]:
import pandas as pd
import numpy as np

# Seed shared by the train/test split and every model below, so runs are reproducible.
rand_state = 1000

# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('bikeshare.csv')

# Preview the first rows. A notebook only displays the value of a cell's LAST
# expression, so this has to come after the assignments above to be shown.
df.head()

In [3]:
# Binary target: 1 when the count column `cnt` is strictly greater than 500, else 0.
df['overload'] = df['cnt'].gt(500).astype(int)

In [4]:
# Share (in %) of each `overload` class. Cross-tabulating the column with itself
# puts the class percentages on the diagonal; the off-diagonal cells are always 0.
pd.crosstab(index=df['overload'], columns=df['overload'], normalize='all').mul(100)

overload,0,1
overload,Unnamed: 1_level_1,Unnamed: 2_level_1
0,92.640543,0.0
1,0.0,7.359457


In [5]:
# Fraction of missing values per column (0.0 means the column has no missing entries).
df.isna().sum().div(len(df))

season        0.0
yr            0.0
mnth          0.0
hr            0.0
holiday       0.0
weekday       0.0
notbizday     0.0
weathersit    0.0
temp          0.0
hum           0.0
windspeed     0.0
dteday        0.0
cnt           0.0
overload      0.0
dtype: float64

In [6]:
# Number of rows that are exact duplicates of an earlier row.
int(df.duplicated().sum())

0

In [7]:
# Number of distinct values in each column.
df.nunique(axis=0)

season          4
yr              2
mnth           12
hr             24
holiday         2
weekday         7
notbizday       2
weathersit      4
temp           50
hum            89
windspeed      30
dteday        731
cnt           869
overload        2
dtype: int64

In [8]:
from sklearn.preprocessing import LabelEncoder

# Columns whose values are replaced by integer label codes.
labeled_categoricals = ['dteday']

# Encode each listed column in place with its own encoder, and keep the fitted
# classes so a code can be mapped back to its original value through
# mappings[column][code].
# NOTE: this cell is not idempotent. Re-running it on an already-encoded `df`
# re-fits on the integer codes, so `mappings` would lose the original values.
mappings = {}
for column in labeled_categoricals:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    mappings[column] = le.classes_

In [9]:
# Mark the label-encoded columns as pandas categoricals (the XGBoost model further
# down is created with enable_categorical=True), then list the resulting dtypes.
for column in labeled_categoricals:
    df[column] = df[column].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   season      17379 non-null  int64   
 1   yr          17379 non-null  int64   
 2   mnth        17379 non-null  int64   
 3   hr          17379 non-null  int64   
 4   holiday     17379 non-null  int64   
 5   weekday     17379 non-null  int64   
 6   notbizday   17379 non-null  int64   
 7   weathersit  17379 non-null  int64   
 8   temp        17379 non-null  float64 
 9   hum         17379 non-null  float64 
 10  windspeed   17379 non-null  float64 
 11  dteday      17379 non-null  category
 12  cnt         17379 non-null  int64   
 13  overload    17379 non-null  int64   
dtypes: category(1), float64(3), int64(10)
memory usage: 1.8 MB


#### Boosting Random Forest Classification

In [10]:
from sklearn.model_selection import train_test_split

# Target vector.
y = df['overload']

# Feature matrix. `cnt` is removed together with the target because `overload`
# is derived directly from it. drop() returns a new frame; `df` itself is unchanged.
X = df.drop(columns=['overload', 'cnt'])

# Hold out 30% of the rows for testing, using the notebook-wide seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=rand_state,
)

In [11]:
from sklearn.ensemble import RandomForestClassifier # we will be using RF as our benchmark.
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [16]:
%%time
# Fitting RF classifier to the Training set
RF_classifier = RandomForestClassifier(random_state=rand_state)
RF_classifier.fit(X_train, y_train)

CPU times: user 419 ms, sys: 3.7 ms, total: 422 ms
Wall time: 423 ms


In [17]:
%%time
# Fitting AdaBoost classifier to the Training set
AdB_classifier = AdaBoostClassifier(random_state=rand_state)
AdB_classifier.fit(X_train, y_train)



CPU times: user 204 ms, sys: 2.58 ms, total: 207 ms
Wall time: 206 ms


In [18]:
%%time
# Fitting Gradient Boosting classifier to the Training set
GBM_classifier = GradientBoostingClassifier(random_state=rand_state, learning_rate=0.1, max_depth=6)
GBM_classifier.fit(X_train, y_train)

CPU times: user 1.53 s, sys: 2.13 ms, total: 1.54 s
Wall time: 1.55 s


In [19]:
%%time
# Fitting XGBoost classifier to the Training set
XGB_classifier = XGBClassifier(random_state=rand_state, eta=0.1, max_depth=6, enable_categorical=True)
XGB_classifier.fit(X_train, y_train)

CPU times: user 437 ms, sys: 154 ms, total: 591 ms
Wall time: 98.1 ms


In [20]:
# Predicting the Test set probabilities and classes
# Predict the class label for every test row with each fitted model
# (predict() returns hard labels, not probabilities).
y_hat_RF = RF_classifier.predict(X_test)
y_hat_AdB = AdB_classifier.predict(X_test)
y_hat_GBM = GBM_classifier.predict(X_test)
y_hat_XGB = XGB_classifier.predict(X_test)


In [21]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score

In [23]:
# Report the test-set F1 score of each model, one line per model.
# Labels are padded to three characters so the printed scores line up.
for label, y_hat in (
    ('RF ', y_hat_RF),
    ('AdB', y_hat_AdB),
    ('GBM', y_hat_GBM),
    ('XGB', y_hat_XGB),
):
    print(f'{label} f1 = {f1_score(y_test, y_hat)}')

RF  f1 = 0.7818696883852692
AdB f1 = 0.5912518853695324
GBM f1 = 0.8683181225554107
XGB f1 = 0.8579017264276229


The model with the best f1 score is the GBM model.  

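Ranked by total CPU time to fit, from fastest to slowest, the models are AdaBoost, Random Forest, XGBoost, and Gradient Boosting. By wall-clock time, XGBoost was actually the fastest (98.1 ms), while Gradient Boosting was still the slowest (1.55 s).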

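Therefore, the best model would be either XGB or GBM, because they had the best F1 scores (about 0.858 and 0.868) and acceptable training times. Note that GBM was the slowest model to fit, so XGB offers the better trade-off between speed and F1 score.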