# Importing Libraries

In [9]:
import datetime
import pandas as pd
import numpy as np
import requests
import zipfile
import io
import json

from sklearn import datasets, ensemble, model_selection
from scipy.stats import anderson_ksamp

# Reading the dataset

In [10]:
#content = requests.get("https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip").content
#with zipfile.ZipFile(io.BytesIO(content)) as arc:
#    raw_data = pd.read_csv(arc.open("hour.csv"), header=0, sep=',', parse_dates=['dteday'])


#https://www.kaggle.com/datasets/lakshmi25npathi/bike-sharing-dataset

In [12]:
raw_data = pd.read_csv("archive/hour.csv", header=0, sep=',', parse_dates=['dteday'])

In [13]:
raw_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [14]:
raw_data.index = raw_data.apply(lambda row: datetime.datetime.combine(row.dteday.date(), datetime.time(row.hr)), axis=1)
raw_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
2011-01-01 00:00:00,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
2011-01-01 01:00:00,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2011-01-01 02:00:00,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
2011-01-01 03:00:00,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
2011-01-01 04:00:00,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [15]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17379 entries, 2011-01-01 00:00:00 to 2012-12-31 23:00:00
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   instant     17379 non-null  int64         
 1   dteday      17379 non-null  datetime64[ns]
 2   season      17379 non-null  int64         
 3   yr          17379 non-null  int64         
 4   mnth        17379 non-null  int64         
 5   hr          17379 non-null  int64         
 6   holiday     17379 non-null  int64         
 7   weekday     17379 non-null  int64         
 8   workingday  17379 non-null  int64         
 9   weathersit  17379 non-null  int64         
 10  temp        17379 non-null  float64       
 11  atemp       17379 non-null  float64       
 12  hum         17379 non-null  float64       
 13  windspeed   17379 non-null  float64       
 14  casual      17379 non-null  int64         
 15  registered  17379 non-null  int64  

# Testing Between Numerical Features

In [17]:
from scipy import stats
import random

#Significance level

p_value = 0.05
# p_value = 0.1
rejected = 0

numerical_features = ['temp', 'atemp', 'hum', 'windspeed', 'mnth', 'hr', 'weekday']
categorical_features = ['season', 'holiday', 'workingday']


reference = raw_data.loc['2011-01-01 00:00:00':'2011-01-28 23:00:00']
current = raw_data.loc['2011-01-29 00:00:00':'2011-02-28 23:00:00']

1. p_value = prob(observing the evidence in the data | assuming the null hypothesis is True)
2. p_value = prob(data | H0)

In [18]:
for col in numerical_features:
	
	test = stats.ks_2samp(reference[col], current[col])

	if test.pvalue < p_value:
		rejected += 1
		print(f"Numerical feature {col} is drifted with p-value {test.pvalue:.4f}")
	else:
		print(f"Numerical feature {col} is not drifted with p-value {test.pvalue:.4f}")

print(f"Total numerical features rejected: {rejected} out of {len(numerical_features)}")

Numerical feature temp is drifted with p-value 0.0000
Numerical feature atemp is drifted with p-value 0.0000
Numerical feature hum is drifted with p-value 0.0015
Numerical feature windspeed is drifted with p-value 0.0406
Numerical feature mnth is drifted with p-value 0.0000
Numerical feature hr is not drifted with p-value 1.0000
Numerical feature weekday is not drifted with p-value 0.9929
Total numerical features rejected: 5 out of 7


# Test between The Categorical Features

In [19]:
from scipy.stats import chi2_contingency

def drift_chisquare(sample1, sample2):
	return chi2_contingency(sample1, sample2)[1]

rejected = 0

for col in categorical_features:

	val = drift_chisquare(
		reference[col].value_counts(),
		current[col].value_counts()
	)

	if val < p_value:
		rejected += 1
		print(f"Categorical feature {col} is drifted with p-value {val:.4f}")
	else:
		print(f"Categorical feature {col} is not drifted with p-value {val:.4f}")	

print(f"Total categorical features rejected: {rejected} out of {len(categorical_features)}")
	


Categorical feature season is not drifted with p-value 1.0000
Categorical feature holiday is not drifted with p-value 1.0000
Categorical feature workingday is not drifted with p-value 1.0000
Total categorical features rejected: 0 out of 3


# Let's train model on the reference data and calculate metrics on the current data

In [20]:
target = 'cnt'
prediction = 'prediction'
numerical_features = ['temp', 'atemp', 'hum', 'windspeed', 'mnth', 'hr', 'weekday']
categorical_features = ['season', 'holiday', 'workingday', ]#'weathersit']

In [21]:
reference = raw_data.loc['2011-01-01 00:00:00':'2011-01-28 23:00:00']
current = raw_data.loc['2011-01-29 00:00:00':'2011-02-28 23:00:00']

In [22]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    reference[numerical_features + categorical_features],
    reference[target],
    test_size=0.3
)

In [23]:
regressor = ensemble.RandomForestRegressor(random_state = 0)

regressor.fit(X_train, y_train)

preds_test = regressor.predict(X_test)

In [24]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

ref_mae=mean_absolute_error(y_test,preds_test)
ref_mse=mean_squared_error(y_test,preds_test)
ref_r2 = r2_score(y_test,preds_test)


print("MAE",ref_mae)
print("MSE",ref_mse)
print("R2",ref_r2)

MAE 10.192634408602151
MSE 216.70043279569893
R2 0.9005021897176082


Check difference between reference data, test part and current data.

In [25]:
current_x=current[numerical_features + categorical_features]
current_y=current[target]

current_pred = regressor.predict(current_x)

In [26]:
print("MAE",mean_absolute_error(current_y,current_pred))
print("MSE",mean_squared_error(current_y,current_pred))

r2 = r2_score(current_y,current_pred)
print(r2)

MAE 19.382294853963838
MSE 1012.0630938803891
0.7382044216716022
