In [40]:
import pandas as pd
import evidently

import io
import requests
import zipfile
from sklearn import datasets , ensemble

from datetime import datetime,time
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset,TargetDriftPreset,RegressionPreset

In [50]:
import pdfkit
import os

In [29]:
# Download the UCI Bike Sharing archive and read the hourly table directly from memory.
DATA_URL = "https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip"

# A timeout keeps a stalled connection from hanging the kernel, and raise_for_status()
# reports an HTTP error here instead of as an opaque BadZipFile a few lines later.
response = requests.get(DATA_URL, timeout=60)
response.raise_for_status()
content = response.content

# Both the archive and the member file are closed when the block exits.
with zipfile.ZipFile(io.BytesIO(content)) as arc, arc.open("hour.csv") as hour_csv:
    data = pd.read_csv(hour_csv, header=0, sep=',', parse_dates=['dteday'], index_col='dteday')

In [30]:
# Rebuild the index as full hourly timestamps: calendar date (the 'dteday' index) plus
# the hour of day stored in 'hr'. This is a vectorised replacement for a row-wise
# apply(datetime.combine, ...). normalize() drops any time-of-day already present, so
# re-running the cell does not add the hours a second time.
data.index = (
    data.index.normalize() + pd.to_timedelta(data['hr'].to_numpy(), unit='h')
).rename(None)  # unnamed index, same as the result of the original row-wise apply

In [31]:
# Preview the first rows to check that the index now carries hourly timestamps.
data.head()

Unnamed: 0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
2011-01-01 00:00:00,1,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
2011-01-01 01:00:00,2,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2011-01-01 02:00:00,3,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
2011-01-01 03:00:00,4,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
2011-01-01 04:00:00,5,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


# Model Training

In [32]:

# Inspect the column dtypes (the listing also shows every column name) to decide which
# columns go into the numerical vs. categorical feature lists for Evidently's column mapping.
data.dtypes

instant         int64
season          int64
yr              int64
mnth            int64
hr              int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object

In [61]:
# Show the last ten rows to see where the hourly records end.
data.tail(10)

Unnamed: 0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
2012-12-31 14:00:00,17370,1,1,12,14,0,1,1,2,0.28,0.2727,0.45,0.2239,62,185,247
2012-12-31 15:00:00,17371,1,1,12,15,0,1,1,2,0.28,0.2879,0.45,0.1343,69,246,315
2012-12-31 16:00:00,17372,1,1,12,16,0,1,1,2,0.26,0.2576,0.48,0.194,30,184,214
2012-12-31 17:00:00,17373,1,1,12,17,0,1,1,2,0.26,0.2879,0.48,0.0896,14,150,164
2012-12-31 18:00:00,17374,1,1,12,18,0,1,1,2,0.26,0.2727,0.48,0.1343,10,112,122
2012-12-31 19:00:00,17375,1,1,12,19,0,1,1,2,0.26,0.2576,0.6,0.1642,11,108,119
2012-12-31 20:00:00,17376,1,1,12,20,0,1,1,2,0.26,0.2576,0.6,0.1642,8,81,89
2012-12-31 21:00:00,17377,1,1,12,21,0,1,1,1,0.26,0.2576,0.6,0.1642,7,83,90
2012-12-31 22:00:00,17378,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61
2012-12-31 23:00:00,17379,1,1,12,23,0,1,1,1,0.26,0.2727,0.65,0.1343,12,37,49


In [33]:
# Column roles shared by the regressor and by Evidently's column mapping.
target = "cnt"              # column the regressor is fitted against
prediction = "prediction"   # column name under which model output is stored

# Model inputs handled as numeric features.
numerical_features = [
    "temp",
    "atemp",
    "hum",
    "windspeed",
    "hr",
    "weekday",
]

# Model inputs handled as categorical features.
categorical_features = [
    "season",
    "holiday",
    "workingday",
]




In [34]:
# Time-based split: 1-28 January 2011 is the reference window (used to fit the model);
# 29 January to 28 February 2011 is the "current" window that gets monitored.
# .copy() makes each window an independent frame, so adding a prediction column later
# writes to the window itself instead of to a slice of `data` (the pattern behind
# pandas' SettingWithCopyWarning).
reference = data.loc["2011-01-01 00:00:00":"2011-01-28 23:00:00"].copy()
current = data.loc["2011-01-29 00:00:00":"2011-02-28 23:00:00"].copy()

In [35]:
# Preview the start of the reference window.
reference.head()

Unnamed: 0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
2011-01-01 00:00:00,1,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
2011-01-01 01:00:00,2,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2011-01-01 02:00:00,3,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
2011-01-01 03:00:00,4,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
2011-01-01 04:00:00,5,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [36]:
# Preview the start of the current window (it should begin on 2011-01-29).
current.head()

Unnamed: 0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
2011-01-29 00:00:00,619,1,0,1,0,0,6,0,1,0.22,0.197,0.64,0.3582,2,26,28
2011-01-29 01:00:00,620,1,0,1,1,0,6,0,1,0.22,0.2273,0.64,0.194,0,20,20
2011-01-29 02:00:00,621,1,0,1,2,0,6,0,1,0.22,0.2273,0.64,0.1642,0,15,15
2011-01-29 03:00:00,622,1,0,1,3,0,6,0,1,0.2,0.2121,0.64,0.1343,3,5,8
2011-01-29 04:00:00,623,1,0,1,4,0,6,0,1,0.16,0.1818,0.69,0.1045,1,2,3


# setting up the regressor model


In [37]:
# Fit a 50-tree random forest on the reference window; the fixed seed keeps runs repeatable.
feature_columns = numerical_features + categorical_features
regressor = ensemble.RandomForestRegressor(n_estimators=50, random_state=0)
regressor.fit(reference[feature_columns], reference[target])


In [38]:
# Score both windows with the fitted model, using the same feature column order as in training.
model_inputs = numerical_features + categorical_features
reference_prediction = regressor.predict(reference[model_inputs])
current_prediction = regressor.predict(current[model_inputs])


In [39]:
# Store the model output next to the observed target so reports can compare the two.
# NOTE(review): `reference` and `current` are produced by slicing `data`; assigning a
# column to such a slice may raise pandas' SettingWithCopyWarning, which the global
# warnings filter at the top of the notebook would hide. Confirm both are independent copies.
reference[prediction]= reference_prediction
current[prediction] = current_prediction

# Evaluating Model Performance


In [43]:
# Tell Evidently which columns hold the target, the prediction, and each feature type.
column_mapping = ColumnMapping(
    target=target,
    prediction=prediction,
    numerical_features=numerical_features,
    categorical_features=categorical_features,
)


In [None]:
# Baseline: regression quality on the reference (training) window itself, run without a
# reference dataset to compare against.
regression_performance = Report(metrics=[RegressionPreset()])
regression_performance.run(
    reference_data=None,
    current_data=reference,
    column_mapping=column_mapping,
)

regression_performance.show()

# saving the performance report

In [47]:
#regression_performance.save_html('regression_performance_report_at_training.html')

# evaluating results with week1 now

In [None]:
# Regression quality on the first slice of current data (29 Jan - 7 Feb 2011), compared
# against the reference window.
first_window = current.loc["2011-01-29 00:00:00":"2011-02-07 23:00:00"]

regression_performance = Report(metrics=[RegressionPreset()])
regression_performance.run(
    reference_data=reference,
    current_data=first_window,
    column_mapping=column_mapping,
)

regression_performance.show()

In [49]:
#regression_performance.save_html("Performance_report_after_1_week.html")

# Evaluating performance reports for the next 3 weeks

In [None]:
# Regression quality on the following slice of current data (7-27 Feb 2011), compared
# against the reference window.
# NOTE(review): this window starts on 2011-02-07, a day that the earlier
# 29 Jan - 7 Feb evaluation also includes. Confirm that the overlap is intended.
later_window = current.loc["2011-02-07 00:00:00":"2011-02-27 23:00:00"]

regression_performance = Report(metrics=[RegressionPreset()])
regression_performance.run(
    reference_data=reference,
    current_data=later_window,
    column_mapping=column_mapping,
)

regression_performance.show()

In [55]:
# Save the most recently run regression report as HTML (relative path, so it is written
# to the kernel's working directory).
regression_performance.save_html("Performance_report_next_3_weeks.html")

# Checking target drift for the same 3-week window evaluated above

In [None]:
# Compare target/prediction behaviour in the 7-27 Feb 2011 slice of current data with
# the reference window.
drift_window = current.loc["2011-02-07 00:00:00":"2011-02-27 23:00:00"]

target_drift = Report(metrics=[TargetDriftPreset()])
target_drift.run(
    reference_data=reference,
    current_data=drift_window,
    column_mapping=column_mapping,
)
target_drift.show()

In [59]:
# Save the target drift report as HTML (relative path, so it is written to the kernel's
# working directory).
target_drift.save_html("Target_Drift_for_next_3_weeks.html")

# Now let's retrain the model on newer data to improve accuracy

In [None]:

reference2 = data.
