# Data Quality Dashboard for Bike Sharing Dataset

In [1]:
import pandas as pd
import requests
import zipfile
import io

from sklearn.ensemble import RandomForestRegressor

from evidently import ColumnMapping
from evidently.dashboard import Dashboard
from evidently.dashboard.tabs import DataQualityTab

from evidently.model_profile import Profile
from evidently.model_profile.sections import DataQualityProfileSection

## Bike Sharing Demand Data

More information about the dataset can be found in the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset

Acknowledgement: Fanaee-T, Hadi, and Gama, Joao, 'Event labeling combining ensemble detectors and background knowledge', Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg

In [2]:
# Download the Bike Sharing archive and load the daily table (day.csv).
# raise_for_status() makes an HTTP error surface as such instead of being
# handed to zipfile as a non-zip payload; the timeout keeps the cell from
# hanging indefinitely if the server does not respond.
response = requests.get(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip",
    timeout=60,
)
response.raise_for_status()
content = response.content

# Open the archive member in a `with` block too, so its handle is closed.
with zipfile.ZipFile(io.BytesIO(content)) as arc, arc.open("day.csv") as day_csv:
    raw_data = pd.read_csv(day_csv, header=0, sep=',', parse_dates=['dteday'])

In [3]:
# Reference window: the first 120 rows; production window: the following 30.
# .copy() turns each slice into an independent frame, so columns assigned to
# them later are not written into a slice of raw_data (which would raise
# pandas' SettingWithCopyWarning).
ref_data = raw_data.iloc[:120].copy()
prod_data = raw_data.iloc[120:150].copy()

In [4]:
# Preview the reference window.
ref_data

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.200000,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.229270,0.436957,0.186900,82,1518,1600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,116,2011-04-26,2,0,4,0,2,1,1,0.631667,0.594083,0.729167,0.326500,678,3722,4400
116,117,2011-04-27,2,0,4,0,3,1,2,0.620000,0.575142,0.835417,0.312200,547,3325,3872
117,118,2011-04-28,2,0,4,0,4,1,2,0.617500,0.578929,0.700833,0.320908,569,3489,4058
118,119,2011-04-29,2,0,4,0,5,1,1,0.510000,0.497463,0.457083,0.240063,878,3717,4595


## Regression Model

### Model training

In [5]:
# Utility columns: the value being predicted, the column holding the model
# output, and the timestamp column.
target, prediction, datetime = 'cnt', 'prediction', 'dteday'

# Model inputs, split by how they are declared to the column mapping.
numerical_features = [
    'temp',
    'atemp',
    'hum',
    'windspeed',
    'weekday',
]
categorical_features = [
    'season',
    'holiday',
    'workingday',
    'weathersit',
]

# Full input list: numerical columns first, then categorical ones.
features = [*numerical_features, *categorical_features]

In [6]:
# Random forest regressor with a fixed seed so re-runs build the same model.
model = RandomForestRegressor(random_state=0)

In [7]:
# Train on the reference window only: feature columns in, target column out.
model.fit(X=ref_data[features], y=ref_data[target])

RandomForestRegressor(random_state=0)

In [8]:
# Attach the model output to both windows under the column name held in
# `prediction`. DataFrame.assign returns a new frame, so nothing is written
# into a slice of raw_data (no SettingWithCopyWarning) and the cell gives the
# same result when re-run.
ref_data = ref_data.assign(**{prediction: model.predict(ref_data[features])})
prod_data = prod_data.assign(**{prediction: model.predict(prod_data[features])})

## Data Quality Dashboard

In [9]:
# Tell evidently which role each column plays. Every argument is passed by
# keyword, and the prediction column comes from the `prediction` constant
# rather than a repeated string literal, so the names stay in one place.
column_mapping = ColumnMapping(
    target=target,
    prediction=prediction,
    datetime=datetime,
    task='regression',
    numerical_features=numerical_features,
    categorical_features=categorical_features,
)

In [11]:
# Build a dashboard containing a single Data Quality tab.
dashboard = Dashboard(tabs=[DataQualityTab()])

In [12]:
# Compute the dashboard contents for the reference and production windows.
dashboard.calculate(
    reference_data=ref_data,
    current_data=prod_data,
    column_mapping=column_mapping,
)

In [13]:
# Display the dashboard calculated in the previous cell.
dashboard.show()

In [14]:
# Optional: uncomment to write the dashboard to an HTML file
# (presumably the 'reports/' directory must already exist -- verify before use).
#dashboard.save('reports/bike_sharing_demand_data_quality.html')

## Data Quality Profile

In [15]:
# Build a profile containing only the data-quality section.
profile = Profile(sections=[DataQualityProfileSection()])

In [16]:
# Compute the profile for the reference and production windows.
profile.calculate(
    reference_data=ref_data,
    current_data=prod_data,
    column_mapping=column_mapping,
)

In [17]:
# Serialise the profile with .json().
# NOTE(review): this rebinds `profile` from the Profile object to the value
# returned by .json(), so re-running this cell on its own fails (the new value
# has no .json()); re-run the cell that builds and calculates the profile first.
profile = profile.json() 

In [18]:
# Show the serialised profile currently bound to `profile`.
profile

'{"data_quality": {"name": "data_quality", "datetime": "2022-02-22 12:39:35.954923", "data": {"utility_columns": {"date": "dteday", "id": null, "target": "cnt", "prediction": "prediction"}, "cat_feature_names": ["season", "holiday", "workingday", "weathersit"], "num_feature_names": ["temp", "atemp", "hum", "windspeed", "weekday"], "datetime_feature_names": [], "target_names": null, "metrics": {"reference": {"cnt": {"feature_type": "num", "count": 120, "infinite_count": 0, "infinite_percentage": 0.0, "missing_count": 0, "missing_percentage": 0.0, "unique_count": 118, "unique_percentage": 98.33, "percentile_25": 1458.25, "percentile_50": 1833.0, "percentile_75": 2432.5, "max": 5312, "min": 431, "mean": 2044.32, "most_common_value": 1685, "most_common_value_percentage": 1.67, "std": 961.15}, "dteday": {"feature_type": "datetime", "count": 120, "missing_count": 0, "missing_percentage": 0.0, "unique_count": 120, "unique_percentage": 100.0, "max": "2011-04-30 00:00:00", "min": "2011-01-01 00