# Using deepchecks to validate data and monitor models over time

In [2]:
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from deepchecks.tabular import Dataset
from deepchecks.tabular.suites import train_test_validation
from deepchecks.tabular.suites import model_evaluation

In [2]:
# Load the training split and the hold-out split from the parent directory.
train_csv, holdout_csv = '../train.csv', '../holdout.csv'
Xtrain = pd.read_csv(train_csv)
Xtest = pd.read_csv(holdout_csv)

In [3]:
# Continue the hold-out index right after the largest training index,
# so the two frames share no index values.
first_test_idx = max(Xtrain.index) + 1
Xtest.index = first_test_idx + np.arange(len(Xtest))

In [4]:
# Inspect the hold-out frame after re-indexing; its index now starts
# right after the last training index.
Xtest

Unnamed: 0,age,work_class,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
17099,19,Private,High School grad,Never-Married,Sales,Own-child,White,Female,0.0,0.0,7.0,United-States,<=50K
17100,49,Private,Masters,Married,White-Collar,Husband,White,Male,15024.0,0.0,80.0,United-States,>50K
17101,50,Private,High School grad,Married,Blue-Collar,Husband,White,Male,3103.0,0.0,40.0,United-States,>50K
17102,39,Local-gov,Masters,Never-Married,Professional,Not-in-family,White,Female,0.0,0.0,50.0,United-States,<=50K
17103,26,Private,High School grad,Married,Admin,Husband,White,Male,0.0,0.0,40.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30666,32,Private,Bachelors,Never-Married,Sales,Own-child,White,Male,13550.0,0.0,35.0,United-States,>50K
30667,35,Private,Bachelors,Married,Sales,Husband,White,Male,7298.0,0.0,48.0,United-States,>50K
30668,42,Private,High School grad,Married,Admin,Wife,White,Female,0.0,0.0,36.0,United-States,>50K
30669,41,Private,Bachelors,Separated,Admin,Unmarried,White,Female,0.0,0.0,33.0,United-States,<=50K


In [5]:
# Collect the names of all object-dtype columns; at this point the list
# still contains the label column 'income'.
cat_feat = [name for name, dtype in Xtrain.dtypes.items() if dtype == 'object']
cat_feat

['work_class',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native_country',
 'income']

In [6]:
# Train a CatBoost baseline on every column except the label 'income'.
#
# The label is dropped from the categorical-feature list by building a new
# list instead of calling cat_feat.remove('income'): remove() mutates the list
# in place and raises ValueError on the second run of this cell, because
# 'income' is already gone. The filtered rebuild is idempotent.
cat_feat = [col for col in cat_feat if col != 'income']
clf = CatBoostClassifier(cat_features=cat_feat,
                         n_estimators=100,
                         verbose=False).fit(Xtrain.drop(['income'],axis=1),
                                            Xtrain['income'])

In [7]:
# Score the classifier on the hold-out split: compare the true labels with
# predictions made from every column except 'income'.
holdout_labels = Xtest['income']
holdout_preds = clf.predict(Xtest.drop(['income'], axis=1))
print(classification_report(holdout_labels, holdout_preds))

              precision    recall  f1-score   support

       <=50K       0.89      0.94      0.91     10232
        >50K       0.78      0.64      0.70      3340

    accuracy                           0.87     13572
   macro avg       0.83      0.79      0.81     13572
weighted avg       0.86      0.87      0.86     13572



In [8]:
# Wrap both frames as deepchecks Datasets with identical settings: same label,
# same categorical features, and the DataFrame index used as the dataset index.
dataset_kwargs = dict(
    label='income',
    cat_features=cat_feat,
    set_index_from_dataframe_index=True,
)
train_ds1 = Dataset(Xtrain, **dataset_kwargs)
test_ds1 = Dataset(Xtest, **dataset_kwargs)

In [9]:
# Build the train/test validation suite and run it on the two datasets.
validation_suite1 = train_test_validation()
suite_result1 = validation_suite1.run(
    train_dataset=train_ds1,
    test_dataset=test_ds1,
)


In [10]:
# Display the train/test validation result as this cell's rich output.
suite_result1

Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_7IQSZ8HQ1Q65O8DEN2L8KOTSU">Train Test Validat…

In [11]:
# Export the validation report to HTML; the returned file name is shown as the cell output.
# NOTE(review): the recorded output is 'report (1).html', presumably because
# 'report.html' already existed from an earlier run — confirm.
suite_result1.save_as_html('report.html')

'report (1).html'

In [12]:
# Render the train/test validation report inline via its show() method.
suite_result1.show()

Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_KT12W554L1146DX8H90RWVCB9">Train Test Validat…

In [13]:
# Build the model-evaluation suite and run it against the trained classifier
# on the same train/test datasets.
evaluation_suite1 = model_evaluation()
result1 = evaluation_suite1.run(
    train_dataset=train_ds1,
    test_dataset=test_ds1,
    model=clf,
)

In [14]:
# Render the model-evaluation report inline via its show() method.
result1.show()

Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_TUIS51XMPUS90N1UB7JF5F0TP">Model Evaluation S…

In [15]:
# Export the model-evaluation report to HTML; the returned file name is shown as the cell output.
# NOTE(review): the recorded output is 'model_report (2).html', presumably because
# earlier copies of 'model_report.html' already existed — confirm.
result1.save_as_html('model_report.html')

'model_report (2).html'