# **Bike Sharing ML Design**

In [1]:
import numpy as np
import pandas as pd

In [2]:
import requests

content = requests.get("https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip").content

In [5]:
import zipfile
import os
import io

with zipfile.ZipFile(io.BytesIO(content)) as zf:
    raw_data = pd.read_csv(zf.open('hour.csv'), header=0, sep=',', parse_dates=['dteday'])

In [7]:
import datetime

raw_data.index =raw_data.apply(lambda x: datetime.datetime.combine(x['dteday'].date(), datetime.time(x['hr'])), axis=1)

In [8]:
raw_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
2011-01-01 00:00:00,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
2011-01-01 01:00:00,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2011-01-01 02:00:00,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
2011-01-01 03:00:00,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
2011-01-01 04:00:00,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [9]:
# Checking missing values
raw_data.isnull().sum()

instant       0
dteday        0
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64

In [10]:
# Split my data for drifting analysis
reference = raw_data.loc['2011-01-01 00:00:00': '2011-01-28 23:00:00']
current = raw_data.loc['2011-01-29 00:00:00': '2011-02-28 23:00:00']

In [11]:
raw_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
2011-01-01 00:00:00,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
2011-01-01 01:00:00,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2011-01-01 02:00:00,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
2011-01-01 03:00:00,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
2011-01-01 04:00:00,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


### **Statistical Analysis**

In [40]:
# Features according to data type
numerical_features = ['temp', 'atemp', 'hum', 'windspeed']
categorical_features = ['season','holiday', 'workingday', 'weekday', 'mnth',  'hr']

In [43]:
from scipy import stats

alpha = 0.05
rejected = 0

for col in numerical_features:
    test = stats.ks_2samp(reference[col], current[col])
    print(f'{col} : {test.pvalue:.2f}')

    if test.pvalue < alpha:
        print(f'{col} : Drift Detected')
        rejected += 1

temp : 0.00
temp : Drift Detected
atemp : 0.00
atemp : Drift Detected
hum : 0.00
hum : Drift Detected
windspeed : 0.04
windspeed : Drift Detected


In [44]:
for col in categorical_features:
    chi2_contingency = stats.chi2_contingency(reference[col].value_counts(), current[col].value_counts())
    print(col,":", chi2_contingency.pvalue)

    if chi2_contingency.pvalue < alpha:
        print(col, ": Drift detected")
        rejected += 1

season : 1.0
holiday : 1.0
workingday : 1.0
weekday : 1.0
mnth : 1.0
hr : 1.0


### **Model Evaluation Technique**

In [46]:
# Target Variable
target = 'cnt'

from sklearn.model_selection import train_test_split

# Data Split
X_train, X_test, y_train, y_test = train_test_split(reference[numerical_features + categorical_features], reference[target], test_size=0.2)

from sklearn import ensemble

# Model on Reference Set
regressor = ensemble.RandomForestRegressor(random_state=0)
regressor.fit(X_train, y_train)

preds = regressor.predict(X_test)

from sklearn.metrics import mean_squared_error, mean_absolute_error

# Metrics
print("MSE:", mean_squared_error(y_test, preds))
print("MAE:", mean_absolute_error(y_test, preds))

MSE: 179.7478709677419
MAE: 9.782741935483871


In [47]:
# Data Split on Current Set
X_train, X_test, y_train, y_test = train_test_split(current[numerical_features + categorical_features], current[target], test_size=0.2)

# Model on Current Set
regressor = ensemble.RandomForestRegressor(random_state=0)
regressor.fit(X_train, y_train)

preds = regressor.predict(X_test)

print("MSE:", mean_squared_error(y_test, preds))
print("MAE:", mean_absolute_error(y_test, preds))

MSE: 514.2982826388888
MAE: 14.840902777777778


### **Model Evaluation using MLFLOW**

In [50]:
import mlflow

mlflow.set_experiment('Bike-Sharing')

batches = [
    ('2011-01-29 00:00:00', '2011-02-07 23:00:00'),
    ('2011-02-08 00:00:00', '2011-02-14 23:00:00'),
    ('2011-02-15 00:00:00', '2011-02-21 23:00:00')
]

for batch in batches:
    with mlflow.start_run() as run:
        mlflow.set_tag("mlflow.runName", str(batch[0]) + "-" + str(batch[1]))

        mlflow.log_param("begin",batch[0])
        mlflow.log_param("end", batch[1])

        current_data= current.loc[batch[0]: batch[1]]
        current_x = current_data[numerical_features+categorical_features]
        current_y= current_data['cnt']

        current_preds = regressor.predict(current_x)

        mlflow.log_metric("MSE", mean_squared_error(current_y, current_preds))
        mlflow.log_metric("MAE", mean_absolute_error(current_y, current_preds))

        mlflow.sklearn.log_model(regressor, "model")

        print(run.info)

2024/05/17 13:46:59 INFO mlflow.tracking.fluent: Experiment with name 'Bike-Sharing' does not exist. Creating a new experiment.


<RunInfo: artifact_uri='file:///home/user/mojorojo-sys-design/mlruns/702114648973503494/378b1e0d76944c52b04cb67b21429c21/artifacts', end_time=None, experiment_id='702114648973503494', lifecycle_stage='active', run_id='378b1e0d76944c52b04cb67b21429c21', run_name='skittish-squid-84', run_uuid='378b1e0d76944c52b04cb67b21429c21', start_time=1715953619786, status='RUNNING', user_id='user'>




<RunInfo: artifact_uri='file:///home/user/mojorojo-sys-design/mlruns/702114648973503494/85bae901e3a2405c9962bfd4814acd05/artifacts', end_time=None, experiment_id='702114648973503494', lifecycle_stage='active', run_id='85bae901e3a2405c9962bfd4814acd05', run_name='bright-hare-324', run_uuid='85bae901e3a2405c9962bfd4814acd05', start_time=1715953624188, status='RUNNING', user_id='user'>
<RunInfo: artifact_uri='file:///home/user/mojorojo-sys-design/mlruns/702114648973503494/ca3955de029b4fd79336eec8cfb0ffe5/artifacts', end_time=None, experiment_id='702114648973503494', lifecycle_stage='active', run_id='ca3955de029b4fd79336eec8cfb0ffe5', run_name='youthful-gnat-185', run_uuid='ca3955de029b4fd79336eec8cfb0ffe5', start_time=1715953627964, status='RUNNING', user_id='user'>


