## Imports and configurations

### Configure the directory path for demo datasets

In [1]:
import os

**Note:** Set `DATASETS_BASEDIR` to the local folder that contains all required datasets. <br>
The datasets can be found in the shared folder: <br> https://drive.google.com/drive/u/0/folders/1WIjlwoUdgwrQj1S9UmJLMbJT6NuKeX7t

In [49]:
# Root folder that holds every demo dataset; point this at your local copy.
DATASETS_BASEDIR = "../../../../Datasets"

Load dataset paths

In [2]:
# Listing the folder doubles as an existence check: os.listdir raises
# if DATASETS_BASEDIR does not exist.
dataset_names = os.listdir(path=DATASETS_BASEDIR)
# print(dataset_names)

In [3]:
# Registry of every dataset used by this notebook: display key -> folder path.
DATASET_PATHS = {
    'Lending_Club': os.path.join(DATASETS_BASEDIR, 'Lending Club'),
}

In [4]:
# errno is imported here because this cell runs before the general-imports
# cell; without it, a missing dataset on a fresh kernel surfaces as a
# NameError instead of the intended FileNotFoundError.
import errno

# Fail fast, naming the first dataset folder that cannot be found.
for dataset_name, dataset_path in DATASET_PATHS.items():
    if not os.path.exists(dataset_path):
        print("Verify that all required datasets are in your datasets folder!")
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), dataset_path)

### General Imports

In [5]:
import pandas as pd
import numpy as np
import os
import joblib
import errno

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier

### Imports for checks

In [7]:
import mlchecks
from mlchecks.base import Dataset

In [8]:
from mlchecks.checks.overview import dataset_info, model_info

from mlchecks.checks.integrity import mixed_nulls
from mlchecks.checks.integrity.is_single_value import is_single_value
from mlchecks.checks import string_mismatch
from mlchecks.checks.leakage import DataSampleLeakageReport, data_sample_leakage_report

from mlchecks.checks import IndexTrainValidationLeakage, index_train_validation_leakage

from mlchecks.checks.performance import ClassificationReport, ConfusionMatrixReport

In [9]:
# Checks that were in demo but aren't in master yet:
# from mlchecks.checks.integrity.rare_format_detection import RareFormatDetection, rare_format_detection
# from mlchecks.checks.overfit import train_validation_difference_overfit
# from mlchecks.checks.integrity.dataset_drift import dataset_drift

## Lending Club

### Load Data & Model

In [10]:
# Folder holding the Lending Club CSV splits and the trained model.
lending_club_path = DATASET_PATHS['Lending_Club']

# Training split: read the CSV and parse the issue date column into datetimes.
df_train = pd.read_csv(os.path.join(lending_club_path, 'train.csv')).assign(
    issue_d=lambda frame: pd.to_datetime(frame.issue_d)
)

# Validation split is read from test.csv and gets the same date parsing.
df_val = pd.read_csv(os.path.join(lending_club_path, 'test.csv')).assign(
    issue_d=lambda frame: pd.to_datetime(frame.issue_d)
)

# NOTE(review): joblib.load unpickles the file, so only load models that come
# from a trusted source.
lending_club_model = joblib.load(os.path.join(lending_club_path, 'model.joblib'))

#### Define Metadata for Dataset

In [11]:
# Dataset metadata (entered manually).

# Columns to be treated as categorical.
# NOTE(review): 'grade' and 'disbursement_method' are listed here but are not
# in all_features, while 'sub_grade' is in all_features but not listed here —
# confirm this is intended.
categorical_features = [
    'addr_state',
    'application_type',
    'disbursement_method',
    'grade',
    'home_ownership',
    'initial_list_status',
    'purpose',
    'term',
    'verification_status',
]

# Columns passed to Dataset as features.
all_features = [
    'sub_grade',
    'term',
    'home_ownership',
    'fico_range_low',
    'total_acc',
    'pub_rec',
    'revol_util',
    'annual_inc',
    'int_rate',
    'dti',
    'purpose',
    'mort_acc',
    'loan_amnt',
    'application_type',
    'installment',
    'verification_status',
    'pub_rec_bankruptcies',
    'addr_state',
    'initial_list_status',
    'fico_range_high',
    'revol_bal',
    'open_acc',
    'emp_length',
    'time_to_earliest_cr_line',
]

# Column roles: label, row index and date.
label_col_name = 'loan_status'
index_col_name = 'id'
date_col_name = 'issue_d'
# label_name_dict = {0: "Default", 1: "OK"}

In [12]:
# Both splits share the same column roles, so describe them once and reuse.
dataset_kwargs = dict(
    cat_features=categorical_features,
    features=all_features,
    label=label_col_name,
    index=index_col_name,
    date=date_col_name,
)
ds_train = Dataset(df_train, **dataset_kwargs)
ds_val = Dataset(df_val, **dataset_kwargs)

### Additional demo data for showing validation faults


#### Demo utility function

In [13]:
def dataset_from_dict(d: dict, index_name: str = None) -> Dataset:
    """Wrap a plain dict of columns in a Dataset.

    The dict is converted to a pandas DataFrame, and ``index_name`` (None by
    default) is forwarded to Dataset as its ``index`` argument.
    """
    return Dataset(pd.DataFrame(data=d), index=index_name)

#### demo data

In [14]:
# mixed nulls: two string columns mixing several null-like spellings, plus a
# plain numeric column
mixed_nulls_demo_data = {
    'col1': ['nan', None, 'null', 'Nan', '1', 'cat'],
    'col2': ['', '', 'None', 'a', 'b', 'c'],
    'col3': [1, 2, 3, 4, 5, 6],
}
df_mixed_nulls = pd.DataFrame(mixed_nulls_demo_data)

In [15]:
# single value: 'b' repeats one value and 'c' holds only None; 'a' and 'd' vary
df_single_value_demo = pd.DataFrame(
    data={
        'a': [3, 4, 1],
        'b': [2, 2, 2],
        'c': [None, None, None],
        'd': ['a', 4, 6],
    }
)

In [16]:
# synthetic index leakage: the values 3 and 4 appear in both the train and the
# validation index column
train_df_synthetic_leakage = dataset_from_dict(
    {'col1': [1, 2, 3, 4, 10, 11]},
    index_name='col1',
)
val_df_synthetic_leakage = dataset_from_dict(
    {'col1': [4, 3, 5, 6, 7]},
    index_name='col1',
)

In [17]:
# string mismatch data: 'deep' and 'foo' each appear in several spelling variants
data = {
    'col1': ['Deep', 'deep', 'deep!!!', '$deeP$', 'earth', 'foo', 'bar', 'foo?'],
}
df_string_mismatch = pd.DataFrame(data)

In [18]:
# Leakage demo data: split iris into train/test, then copy five training rows
# into the validation frame so that the two sets share samples.
iris = load_iris(as_frame=True)
frame = iris.frame
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=55)
train_ds_index_leakage = Dataset(pd.concat([X_train, y_train], axis=1),
            features=iris.feature_names,
            label='target')

test_df = pd.concat([X_test, y_test], axis=1)
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. ignore_index=True renumbers the rows from 0, exactly
# as the original append call did.
leaked_rows = train_ds_index_leakage.data.iloc[[0, 1, 2, 3, 4]]
bad_test = pd.concat([test_df, leaked_rows], ignore_index=True)

val_ds_index_leakage = Dataset(bad_test,
            features=iris.feature_names,
            label='target')

In [19]:
# rare format detection: 200 rows drawn from 'BIG'/'STILL_BIG', followed by
# three rows carrying rare values in x1 and x3.
# A local seeded generator makes reruns build the same frame without touching
# numpy's global random state.
rare_format_rng = np.random.default_rng(42)
rare_format_base = pd.DataFrame(
    rare_format_rng.choice(a=['BIG', 'STILL_BIG'], size=(200, 3)),
    columns=['x1', 'x2', 'x3'],
)
rare_format_extra = pd.DataFrame([
    {'x1': 'bla', 'x2': 'BIG', 'x3': 1},
    {'x1': 'bla', 'x2': 'BIG', 'x3': 1},
    {'x1': 'bla2', 'x2': 'BIG', 'x3': 2},
])
# pd.concat replaces the row-by-row DataFrame.append calls (append was
# deprecated in pandas 1.4 and removed in pandas 2.0).
# `df` keeps its original content: the base rows plus the two 'bla' rows.
df = pd.concat([rare_format_base, rare_format_extra.iloc[:2]], ignore_index=True)
rare_format_df = pd.concat([rare_format_base, rare_format_extra], ignore_index=True)

In [20]:
# overfit check: fit an AdaBoost classifier on an iris train split and keep
# the held-out split as the validation dataset
iris = load_iris(as_frame=True)
frame = iris.frame
X, Y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

# Features and label are joined column-wise before being wrapped in a Dataset.
overfit_train_ds = Dataset(
    pd.concat([X_train, y_train], axis=1),
    features=iris.feature_names,
    label='target',
)
overfit_val_ds = Dataset(
    pd.concat([X_test, y_test], axis=1),
    features=iris.feature_names,
    label='target',
)

overfit_clf = AdaBoostClassifier()
overfit_clf.fit(X_train, y_train)

AdaBoostClassifier()

##### Drift demo data

In [21]:
# All of the drift demo data below is commented out because the drift feature isn't in master yet

In [22]:
# df = pd.read_csv(os.path.join(KKBOX_DATASET_BASEDIR, 'train_clean.csv'))
# test_df = pd.read_csv(os.path.join(KKBOX_DATASET_BASEDIR, 'test_clean.csv'))

In [23]:
# test_df.date = pd.to_datetime(test_df.date*10**9)

In [24]:
# drift_org_dataset = Dataset(test_df,  
#                  features=['num_25', 'num_50', 'num_75', 'num_985', 'num_100', 'num_unq',
#        'total_secs', 'days_listened', 'plan_list_price', 'is_auto_renew',
#        'is_cancel', 'gender', 'registered_via', 'secs_per_song', 'num_days'],
#                   label='y_true49a0c676-35fd-11ea-978f-2e728ce88125',
#                   cat_features= ['gender', 'registered_via'],
#                   index='msno', date='date')

# drift_compared_dataset = Dataset(df,
#                           features=['num_25', 'num_50', 'num_75', 'num_985', 'num_100', 'num_unq',
#        'total_secs', 'days_listened', 'plan_list_price', 'is_auto_renew',
#        'is_cancel', 'gender', 'registered_via', 'secs_per_song', 'num_days'],
#                           label='y_true49a0c676-35fd-11ea-978f-2e728ce88125',
#                           cat_features= ['gender', 'registered_via'],
#                           index='msno')

## Run checks

### Overview

#### Dataset Info

In [25]:
# Overview report for the Lending Club training dataset.
dataset_info(ds_train)

Summarize dataset:   0%|          | 0/35 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

#### Model Info

In [26]:
# Show the parameters of the loaded Lending Club model.
model_info(lending_club_model)

Unnamed: 0,parameter,value
0,iterations,50
1,learning_rate,0.100141
2,depth,7
3,l2_leaf_reg,4.247331
4,random_seed,0
5,auto_class_weights,Balanced
6,eval_metric,AUC
7,allow_writing_files,False
8,cat_features,"['sub_grade', 'term', 'home_ownership', 'purpose', 'application_type', 'verification_status', 'addr_state', 'initial_list_status']"


### Integrity

#### Mixed Nulls

In [27]:
# Mixed-nulls check on the synthetic demo frame.
mixed_nulls(df_mixed_nulls)

Unnamed: 0_level_0,Unnamed: 1_level_0,Count,Fraction of data
Column Name,Value,Unnamed: 2_level_1,Unnamed: 3_level_1
col1,,1,0.06
col1,,1,0.06
col1,,1,0.06
col1,Nan,1,0.06
col2,,2,0.11
col2,,1,0.06


In [28]:
# Mixed-nulls check on the Lending Club training frame.
mixed_nulls(df_train)

In [29]:
# Mixed-nulls check on the Lending Club validation frame.
mixed_nulls(df_val)

#### Single Value

In [30]:
# Single-value check on the synthetic demo frame.
is_single_value(df_single_value_demo)

Unnamed: 0,b,c
Single unique value,2,


In [31]:
# Single-value check on the Lending Club training frame.
is_single_value(df_train)

In [32]:
# Single-value check on the Lending Club validation frame.
is_single_value(df_val)

#### String Mismatch

In [33]:
# Display the demo frame so the raw strings are visible before the check runs.
df_string_mismatch

Unnamed: 0,col1
0,Deep
1,deep
2,deep!!!
3,$deeP$
4,earth
5,foo
6,bar
7,foo?


In [34]:
# String-mismatch check on the synthetic demo frame.
string_mismatch(df_string_mismatch)

Unnamed: 0_level_0,Unnamed: 1_level_0,Value,Count,Fraction of data
Column Name,Base form,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
col1,deep,deep!!!,1,0.12
col1,deep,deep,1,0.12
col1,deep,$deeP$,1,0.12
col1,deep,Deep,1,0.12
col1,foo,foo?,1,0.12
col1,foo,foo,1,0.12


In [35]:
# String-mismatch check on the Lending Club training frame.
string_mismatch(df_train)

In [36]:
# String-mismatch check on the Lending Club validation frame.
string_mismatch(df_val)

#### Rare Format Detection

In [37]:
# rare_format_detection(rare_format_df)
# rare_format_df = df.append({'x1': 'bla2', 'x2': 'BIG', 'x3': 2}, ignore_index=True)

In [38]:
# rare_format_detection(df_train)

### Overfit

In [39]:
# commented out because not in master yet

In [40]:
# train_validation_difference_overfit(overfit_train_ds, overfit_val_ds, overfit_clf)

In [41]:
# train_validation_difference_overfit(ds_train, ds_val, lending_club_model)

### Drift

In [42]:
# commented out because not in master yet
# dataset_drift(drift_org_dataset, drift_compared_dataset,over_time=True)

### Leakage

#### Index Train-Validation Leakage

In [43]:
# Index-leakage check on the synthetic datasets whose index columns overlap.
index_train_validation_leakage(train_df_synthetic_leakage, val_df_synthetic_leakage)

Unnamed: 0,0
Sample of validation indexes in train:,"[3, 4]"


In [44]:
# Index-leakage check on the Lending Club train and validation datasets.
index_train_validation_leakage(ds_train, ds_val)

#### Data Sample Leakage

In [45]:
# Data-sample leakage check on the iris demo datasets, where five training
# rows were copied into the validation set.
# NOTE(review): the validation dataset is passed first here, whereas
# index_train_validation_leakage is called train-first — confirm against the
# check's signature.
data_sample_leakage_report(val_ds_index_leakage, train_ds_index_leakage)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
test indexes: 23,5.1,3.3,1.7,0.5,0
validation indexes: 48,5.1,3.3,1.7,0.5,0
"test indexes: 101, 142",5.8,2.7,5.1,1.9,2
validation indexes: 45,5.8,2.7,5.1,1.9,2
test indexes: 115,6.4,3.2,5.3,2.3,2
validation indexes: 49,6.4,3.2,5.3,2.3,2
test indexes: 110,6.5,3.2,5.1,2.0,2
validation indexes: 47,6.5,3.2,5.1,2.0,2
test indexes: 131,7.9,3.8,6.4,2.0,2
validation indexes: 46,7.9,3.8,6.4,2.0,2


In [46]:
# data_sample_leakage_report(ds_val, ds_train)

### Performance

#### Classification Report

In [47]:
# Instantiate the classification-report check; it is run in the following cell.
cls_report_check = ClassificationReport()

In [48]:
# Run the classification report on the Lending Club validation dataset.
# NOTE(review): the output saved with this cell is
# "TypeError: Cannot convert StringArray to numpy.ndarray", i.e. the cell
# fails as recorded; the root cause is not visible in this notebook.
cls_report_check.run(ds_val, lending_club_model)

TypeError: Cannot convert StringArray to numpy.ndarray