## Imports

### General Imports

In [1]:
import pandas as pd
import os
import joblib

In [2]:
import mlchecks
from mlchecks.base import Dataset

### Imports for checks

In [3]:
from mlchecks.checks.overview import dataset_info, model_info

from mlchecks.checks.integrity import mixed_nulls
from mlchecks.checks.integrity.is_single_value import is_single_value
from mlchecks.checks import string_mismatch

from mlchecks.checks import IndexTrainValidationLeakage, index_train_validation_leakage

## Lending Club

### Load Data & Model

In [4]:
# Directory that holds the Lending Club CSV splits and the saved model.
DATASET_BASEDIR = os.path.join('../datasets', 'Lending Club')
# Display the directory contents so the expected files can be confirmed.
os.listdir(DATASET_BASEDIR)

['test.csv', 'model.joblib', 'train.csv']

0        2014-01-01
1        2014-01-01
2        2014-01-01
3        2014-01-01
4        2014-01-01
            ...    
260575   2016-06-01
260576   2016-06-01
260577   2016-06-01
260578   2016-06-01
260579   2016-06-01
Name: issue_d, Length: 260580, dtype: datetime64[ns]

In [32]:
# Load the train and validation splits and convert the loan issue date to
# datetime64 so it can be used as the Dataset date column.
df_train = pd.read_csv(os.path.join(DATASET_BASEDIR, 'train.csv'))
# Columns are assigned with bracket indexing rather than attribute access:
# `df.name = value` only updates a column when one with that name already
# exists, otherwise it silently sets a plain Python attribute on the frame.
df_train['issue_d'] = pd.to_datetime(df_train['issue_d'])
df_val = pd.read_csv(os.path.join(DATASET_BASEDIR, 'test.csv'))
df_val['issue_d'] = pd.to_datetime(df_val['issue_d'])

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
lending_club_model = joblib.load(os.path.join(DATASET_BASEDIR, 'model.joblib'))

#### Define Metadata for Dataset

In [33]:
# Dataset metadata (maintained manually).
#
# NOTE(review): 'disbursement_method' and 'grade' are listed as categorical
# but do not appear in all_features, while 'sub_grade' appears in all_features
# but not in categorical_features. Confirm this is intended.

# Column roles
label_col_name = 'loan_status'
index_col_name = 'id'
date_col_name = 'issue_d'
# label_name_dict = {0: "Default", 1: "OK"}

# Columns to be treated as categorical
categorical_features = [
    'addr_state',
    'application_type',
    'disbursement_method',
    'grade',
    'home_ownership',
    'initial_list_status',
    'purpose',
    'term',
    'verification_status',
]

# Every column used as a feature
all_features = [
    'sub_grade',
    'term',
    'home_ownership',
    'fico_range_low',
    'total_acc',
    'pub_rec',
    'revol_util',
    'annual_inc',
    'int_rate',
    'dti',
    'purpose',
    'mort_acc',
    'loan_amnt',
    'application_type',
    'installment',
    'verification_status',
    'pub_rec_bankruptcies',
    'addr_state',
    'initial_list_status',
    'fico_range_high',
    'revol_bal',
    'open_acc',
    'emp_length',
    'time_to_earliest_cr_line',
    'pub_rec_percentiles',
]

In [34]:
# Wrap both splits in Dataset objects that share the same column metadata.
# The generator yields the train Dataset first, then the validation one.
ds_train, ds_val = (
    Dataset(
        split_df,
        cat_features=categorical_features,
        features=all_features,
        label=label_col_name,
        index=index_col_name,
        date=date_col_name,
    )
    for split_df in (df_train, df_val)
)

### Additional data for showing validation faults


#### demo util function

In [35]:
def dataset_from_dict(d: dict, index_name: str = None) -> Dataset:
    """Build a Dataset from a plain dict of column data.

    Args:
        d: Data handed to ``pd.DataFrame(data=d)``.
        index_name: Forwarded to ``Dataset`` as its ``index`` argument.
            Defaults to None.

    Returns:
        A Dataset wrapping the DataFrame created from ``d``.
    """
    return Dataset(pd.DataFrame(data=d), index=index_name)

#### demo data

In [36]:
# Mixed nulls: col1 and col2 contain several different null-like spellings,
# col3 holds plain integers.
mixed_nulls_demo_data = {
    'col1': ['nan', None, 'null', 'Nan', '1', 'cat'],
    'col2': ['', '', 'None', 'a', 'b', 'c'],
    'col3': [1, 2, 3, 4, 5, 6],
}
df_mixed_nulls = pd.DataFrame(mixed_nulls_demo_data)

In [37]:
# Single value: column 'b' is constant and column 'c' is entirely None,
# while 'a' and 'd' hold varying values.
df_single_value_demo = pd.DataFrame(
    {
        'a': [3, 4, 1],
        'b': [2, 2, 2],
        'c': [None, None, None],
        'd': ['a', 4, 6],
    }
)

In [38]:
# Synthetic index leakage: the index values 3 and 4 occur in both the train
# and the validation data. Despite the `_df_` in their names, both variables
# hold the Dataset objects returned by dataset_from_dict.
train_df_synthetic_leakage = dataset_from_dict(
    d={'col1': [1, 2, 3, 4, 10, 11]},
    index_name='col1',
)
val_df_synthetic_leakage = dataset_from_dict(
    d={'col1': [4, 3, 5, 6, 7]},
    index_name='col1',
)

In [39]:
# String mismatch data: 'deep' and 'foo' each show up in several variants
# that differ only in letter case or surrounding punctuation.
data = {
    'col1': [
        'Deep', 'deep', 'deep!!!', '$deeP$',
        'earth', 'foo', 'bar', 'foo?',
    ],
}
df_string_mismatch = pd.DataFrame(data)

## Run checks

### Overview

#### Dataset Info

In [40]:
# Overview report for the training Dataset.
dataset_info(ds_train)

Summarize dataset:   0%|          | 0/35 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

(260580, 27)

#### Model Info

In [41]:
# Show the parameters of the loaded Lending Club model.
model_info(lending_club_model)

parameter,value
iterations,50
learning_rate,0.100141
depth,7
l2_leaf_reg,4.247331
random_seed,0
auto_class_weights,Balanced
eval_metric,AUC
allow_writing_files,False
cat_features,"[sub_grade, term, home_ownership, purpose, application_type, verification_status, addr_state, initial_list_status]"


### Integrity

#### Mixed Nulls

In [42]:
# Demo: run the mixed-nulls check on the synthetic frame with several null spellings.
mixed_nulls(df_mixed_nulls)

Unnamed: 0_level_0,Unnamed: 1_level_0,Count,Fraction of data
Column Name,Value,Unnamed: 2_level_1,Unnamed: 3_level_1
col1,,1,0.06
col1,,1,0.06
col1,,1,0.06
col1,Nan,1,0.06
col2,,2,0.11
col2,,1,0.06


In [43]:
# Run the mixed-nulls check on the raw training DataFrame.
mixed_nulls(df_train)

In [44]:
# Run the mixed-nulls check on the raw validation DataFrame.
mixed_nulls(df_val)

#### Single Value

In [45]:
# Demo: run the single-value check on the synthetic frame, where 'b' is constant and 'c' is all None.
is_single_value(df_single_value_demo)

Unnamed: 0,b,c
Single unique value,2,


In [46]:
# Run the single-value check on the raw training DataFrame.
is_single_value(df_train)

In [47]:
# Run the single-value check on the raw validation DataFrame.
is_single_value(df_val)

#### Index Train-Validation Leakage

In [48]:
# Demo: the synthetic train and validation Datasets share the index values 3 and 4.
index_train_validation_leakage(train_df_synthetic_leakage, val_df_synthetic_leakage)

0,1
Sample of validation indexes in train:,"[3, 4]"


In [49]:
# Run the index leakage check on the Lending Club train and validation Datasets.
index_train_validation_leakage(ds_train, ds_val)

#### String Mismatch

In [50]:
# Demo: run the string-mismatch check on the synthetic frame with 'deep' and 'foo' variants.
string_mismatch(df_string_mismatch)

Unnamed: 0_level_0,Unnamed: 1_level_0,Value,Count,Fraction of data
Column Name,Base form,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
col1,deep,deep,1,0.12
col1,deep,deep!!!,1,0.12
col1,deep,Deep,1,0.12
col1,deep,$deeP$,1,0.12
col1,foo,foo,1,0.12
col1,foo,foo?,1,0.12


In [51]:
# Run the string-mismatch check on the raw training DataFrame.
string_mismatch(df_train)

# Using suites

In [54]:
from mlchecks import CheckSuite
from mlchecks.checks import MixedTypes, MixedNulls, DatasetDrift



# Group several checks under one named suite so they can be run with a single call.
suite = CheckSuite(
    "Basic Suite",
    MixedTypes(),
    MixedNulls(),
    DatasetDrift(over_time=True, column_names="total_acc")
)


# Run the suite with the model and both Datasets; the return value is kept in `a`.
a = suite.run(model=lending_club_model, train_dataset=ds_train, validation_dataset=ds_val)

In [56]:
# Run the drift check on its own, outside the suite.
# NOTE(review): the Datasets are passed positionally as (ds_val, ds_train), the
# reverse of the train/validation keywords used in suite.run — confirm this
# order against the DatasetDrift.run signature.
DatasetDrift(over_time=True, column_names="total_acc").run(ds_val, ds_train, lending_club_model)

In [58]:
# Run the MixedTypes check on its own against the training Dataset.
MixedTypes().run(ds_train)