## Imports and configurations

### Config directory path for demo datasets

In [1]:
import os

NOTE - Set DATASETS_BASEDIR to your local folder that contains all required datasets. <br>
Datasets can be found in shared folder: <br> https://drive.google.com/drive/u/0/folders/1WIjlwoUdgwrQj1S9UmJLMbJT6NuKeX7t

In [2]:
# Base folder that holds all demo datasets. Defaults to the repository-relative
# location; set the DATASETS_BASEDIR environment variable to point somewhere
# else without editing the notebook. An empty variable falls back to the default.
DATASETS_BASEDIR = os.environ.get('DATASETS_BASEDIR') or '../../../../Datasets'

Load dataset paths

In [3]:
# Verify that DATASETS_BASEDIR exists: os.listdir raises if the folder is missing.
dataset_names = os.listdir(DATASETS_BASEDIR)
# print(dataset_names)

In [4]:
# Registry of every dataset used in this demo, keyed by a short name.
DATASET_PATHS = {
    'Lending_Club': os.path.join(DATASETS_BASEDIR, 'Lending Club'),
}

In [5]:
# errno is imported here because the general-imports cell runs only after this
# one; without it a missing dataset raises NameError instead of the intended error.
import errno

# Fail fast if any required dataset folder is missing.
for dataset_path in DATASET_PATHS.values():
    if not os.path.exists(dataset_path):
        print("Verify that all required datasets are in your datasets folder!")
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), dataset_path)

### General Imports

In [6]:
import pandas as pd
import numpy as np
import os
import joblib
import errno

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

### Imports for checks

In [8]:
import mlchecks
from mlchecks.base import Dataset

In [19]:
# Note - all checks are also exposed in mlchecks.checks and can be imported directly from there.
# They are imported by category below only to keep this demonstration organized.

In [18]:
# Overview
from mlchecks.checks.overview import dataset_info, DatasetInfo
from mlchecks.checks.overview import model_info, ModelInfo
from mlchecks.checks.overview import feature_importance, FeatureImportance

In [29]:
# Integrity

from mlchecks.checks.integrity import data_duplicates, DataDuplicates
from mlchecks.checks.integrity import dominant_frequency_change, DominantFrequencyChange
from mlchecks.checks.integrity import is_single_value, IsSingleValue
from mlchecks.checks.integrity import mixed_nulls, MixedNulls
from mlchecks.checks.integrity import mixed_types, MixedTypes
from mlchecks.checks.integrity import new_category_train_validation, CategoryMismatchTrainTest
from mlchecks.checks.integrity import new_label_train_validation, NewLabelTrainTest
from mlchecks.checks.integrity import rare_format_detection, RareFormatDetection
from mlchecks.checks.integrity import special_characters, SpecialCharacters
from mlchecks.checks.integrity import string_length_outlier, StringLengthOutlier
from mlchecks.checks.integrity import string_mismatch, StringMismatch
from mlchecks.checks.integrity import string_mismatch_comparison, StringMismatchComparison

In [None]:
# Leakage

from mlchecks.checks.leakage import data_sample_leakage_report, DataSampleLeakageReport

from mlchecks.checks.leakage import date_train_validation_leakage_overlap, DateTrainTestLeakageOverlap
from mlchecks.checks.leakage import date_train_validation_leakage_duplicates, DateTrainTestLeakageDuplicates

# Single feature contribution checks (listed under Leakage in this notebook).
from mlchecks.checks.leakage import single_feature_contribution, SingleFeatureContribution
from mlchecks.checks.leakage import single_feature_contribution_train_validation, SingleFeatureContributionTrainTest

from mlchecks.checks.leakage import index_train_validation_leakage, IndexTrainTestLeakage

In [17]:
from mlchecks.checks.performance import performance_report, confusion_matrix_report, PerformanceReport, ConfusionMatrixReport

#### Leakage

#### 

In [13]:
# additional imports
from mlchecks.checks.integrity.rare_format_detection import RareFormatDetection, rare_format_detection
from mlchecks.checks.overfit import boosting_overfit, BoostingOverfit
from mlchecks.checks.overfit import performance_overfit, PerformanceOverfit
from mlchecks.checks.integrity.dataset_drift import dataset_drift

ImportError: cannot import name 'BoostingOverfit' from 'mlchecks.checks.overfit' (/mnt/c/Users/Shir/NoSync_Documents/Git/MLChecks/mlchecks/checks/overfit/__init__.py)

## Lending Club

### Load Data & Model

In [None]:
# Load the Lending Club train / validation splits and the pre-trained model.
lending_club_path = DATASET_PATHS['Lending_Club']

df_train_lending_club = pd.read_csv(os.path.join(lending_club_path, 'train.csv'))
# Parse the issue date so it can serve as the Dataset date column.
df_train_lending_club['issue_d'] = pd.to_datetime(df_train_lending_club['issue_d'])

df_val_lending_club = pd.read_csv(os.path.join(lending_club_path, 'test.csv'))
df_val_lending_club['issue_d'] = pd.to_datetime(df_val_lending_club['issue_d'])

# NOTE: joblib.load unpickles arbitrary objects - only load model files from a trusted source.
lending_club_catboost_clf = joblib.load(os.path.join(lending_club_path, 'model.joblib'))

#### Define Metadata for Dataset

In [None]:
# Dataset metadata (entered manually)

# Columns treated as categorical; the commented-out entries are currently excluded.
categorical_features = [
    'addr_state',
    'application_type',
    # 'disbursement_method',
    # 'grade',
    'home_ownership',
    'initial_list_status',
    'purpose',
    'term',
    'verification_status',
]

# Every column passed to Dataset as a feature.
all_features = [
    'sub_grade',
    'term',
    'home_ownership',
    'fico_range_low',
    'total_acc',
    'pub_rec',
    'revol_util',
    'annual_inc',
    'int_rate',
    'dti',
    'purpose',
    'mort_acc',
    'loan_amnt',
    'application_type',
    'installment',
    'verification_status',
    'pub_rec_bankruptcies',
    'addr_state',
    'initial_list_status',
    'fico_range_high',
    'revol_bal',
    'open_acc',
    'emp_length',
    'time_to_earliest_cr_line',
]

# Names of the label, index and date columns.
label_col_name, index_col_name, date_col_name = 'loan_status', 'id', 'issue_d'
# label_name_dict = {0: "Default", 1: "OK"}

In [None]:
# Inspect the columns available in the training frame.
df_train_lending_club.columns

In [None]:
# The train and validation frames share one schema, so the Dataset
# keyword arguments are spelled out once and reused for both.
lending_club_dataset_kwargs = dict(
    cat_features=categorical_features,
    features=all_features,
    label=label_col_name,
    index=index_col_name,
    date=date_col_name,
)
ds_train_lending_club = Dataset(df_train_lending_club, **lending_club_dataset_kwargs)
ds_val_lending_club = Dataset(df_val_lending_club, **lending_club_dataset_kwargs)

### Additional for showing validation faults


#### demo util function

In [None]:
def dataset_from_dict(d: dict, index_name: str = None) -> Dataset:
    """Wrap a plain dict of columns in a Dataset.

    Args:
        d: Data passed to ``pd.DataFrame(data=...)``.
        index_name: Forwarded to ``Dataset`` as ``index``; defaults to None.

    Returns:
        A Dataset built from the resulting DataFrame.
    """
    return Dataset(pd.DataFrame(data=d), index=index_name)

#### demo data

In [None]:
# mixed nulls: several different spellings of "missing" inside the same columns
mixed_nulls_demo_data = {
    'col1': ['nan', None, 'null', 'Nan', '1', 'cat'],
    'col2': ['', '', 'None', 'a', 'b', 'c'],
    'col3': [1, 2, 3, 4, 5, 6],
}
df_mixed_nulls = pd.DataFrame(mixed_nulls_demo_data)

In [None]:
# single value: column 'b' repeats one value and column 'c' holds only None
df_single_value_demo = pd.DataFrame(
    {
        'a': [3, 4, 1],
        'b': [2, 2, 2],
        'c': [None, None, None],
        'd': ['a', 4, 6],
    }
)

In [None]:
# synthetic index leakage: the index values 3 and 4 appear in both splits
train_df_synthetic_leakage = dataset_from_dict(
    d={'col1': [1, 2, 3, 4, 10, 11]},
    index_name='col1',
)
val_df_synthetic_leakage = dataset_from_dict(
    d={'col1': [4, 3, 5, 6, 7]},
    index_name='col1',
)

In [None]:
# string mismatch data: spellings that differ only by case or punctuation
data = {
    'col1': [
        'Deep', 'deep', 'deep!!!', '$deeP$',
        'earth',
        'foo', 'bar', 'foo?',
    ],
}
df_string_mismatch = pd.DataFrame(data)

In [None]:
# index leakage: build a validation set that contains rows copied from the training set
iris = load_iris(as_frame=True)
frame = iris.frame
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=55)
train_ds_index_leakage = Dataset(pd.concat([X_train, y_train], axis=1),
            features=iris.feature_names,
            label='target')

test_df = pd.concat([X_test, y_test], axis=1)
# Copy the first five training rows into the validation frame.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
leaked_rows = train_ds_index_leakage.data.iloc[[0, 1, 2, 3, 4]]
bad_test = pd.concat([test_df, leaked_rows], ignore_index=True)

val_ds_index_leakage = Dataset(bad_test,
            features=iris.feature_names,
            label='target')

In [None]:
# rare format detection: 200 rows of 'BIG' / 'STILL_BIG' plus three rows with rare values.
# A seeded generator keeps the demo data identical across runs.
df = pd.DataFrame(
    np.random.default_rng(42).choice(a=['BIG', 'STILL_BIG'], size=(200, 3)),
    columns=['x1', 'x2', 'x3'],
)
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
rare_rows = pd.DataFrame([
    {'x1': 'bla', 'x2': 'BIG', 'x3': 1},
    {'x1': 'bla', 'x2': 'BIG', 'x3': 1},
])
df = pd.concat([df, rare_rows], ignore_index=True)
rarest_row = pd.DataFrame([{'x1': 'bla2', 'x2': 'BIG', 'x3': 2}])
rare_format_df = pd.concat([df, rarest_row], ignore_index=True)

In [None]:
# multiclass models - adaboost, randomforest (e.g. for overfit check)
iris = load_iris(as_frame=True)
frame = iris.frame
X_iris = iris.data
Y_iris = iris.target
# Split the iris-specific X_iris / Y_iris so this cell does not depend on
# variables defined in other cells.
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
            X_iris, Y_iris, test_size=0.33, random_state=42)
ds_train_iris = Dataset(pd.concat([X_train_iris, y_train_iris], axis=1),
            features=iris.feature_names,
            label='target')
ds_val_iris = Dataset(pd.concat([X_test_iris, y_test_iris], axis=1),
            features=iris.feature_names,
            label='target')

# Both classifiers are fit on the same training features and label.
train_features_iris = ds_train_iris.data.drop(ds_train_iris.label_name(), axis=1)
train_label_iris = ds_train_iris.label_col()

iris_multiclass_adaboost_clf = AdaBoostClassifier()
iris_multiclass_adaboost_clf.fit(train_features_iris, train_label_iris)
iris_multiclass_rf_clf = RandomForestClassifier()
iris_multiclass_rf_clf.fit(train_features_iris, train_label_iris)

##### Drift demo data

In [None]:
# All of the drift demo cells below are commented out because the drift feature isn't in master yet

In [None]:
# df = pd.read_csv(os.path.join(KKBOX_DATASET_BASEDIR, 'train_clean.csv'))
# test_df = pd.read_csv(os.path.join(KKBOX_DATASET_BASEDIR, 'test_clean.csv'))

In [None]:
# test_df.date = pd.to_datetime(test_df.date*10**9)

In [None]:
# drift_org_dataset = Dataset(test_df,  
#                  features=['num_25', 'num_50', 'num_75', 'num_985', 'num_100', 'num_unq',
#        'total_secs', 'days_listened', 'plan_list_price', 'is_auto_renew',
#        'is_cancel', 'gender', 'registered_via', 'secs_per_song', 'num_days'],
#                   label='y_true49a0c676-35fd-11ea-978f-2e728ce88125',
#                   cat_features= ['gender', 'registered_via'],
#                   index='msno', date='date')

# drift_compared_dataset = Dataset(df,
#                           features=['num_25', 'num_50', 'num_75', 'num_985', 'num_100', 'num_unq',
#        'total_secs', 'days_listened', 'plan_list_price', 'is_auto_renew',
#        'is_cancel', 'gender', 'registered_via', 'secs_per_song', 'num_days'],
#                           label='y_true49a0c676-35fd-11ea-978f-2e728ce88125',
#                           cat_features= ['gender', 'registered_via'],
#                           index='msno')

## Run checks

### Overview

#### Dataset Info

In [None]:
# Summarize the Lending Club training dataset.
dataset_info(ds_train_lending_club)

#### Feature Importance (SHAP)

##### Binary Classifier

In [None]:
# Shape of the Lending Club training frame.
ds_train_lending_club.data.shape

In [None]:
# Feature importance for the Lending Club model; the result is kept in `res`
# so the following cell can display it.
res = feature_importance(ds_train_lending_club, lending_club_catboost_clf)

In [None]:
# Display `res`. NOTE(review): confirm an earlier cell assigns `res` before this one runs.
res

##### Multi-class Classifier

In [None]:
# Feature importance for the iris random-forest classifier.
feature_importance(ds_train_iris, iris_multiclass_rf_clf)

#### Model Info

In [None]:
# Model info for the Lending Club classifier loaded from model.joblib.
model_info(lending_club_catboost_clf)

In [None]:
# Model info for the iris AdaBoost classifier.
model_info(iris_multiclass_adaboost_clf)

### Integrity

#### Mixed Nulls

In [None]:
# Mixed-nulls check on the hand-built demo frame.
mixed_nulls(df_mixed_nulls)

In [None]:
# Mixed-nulls check on the Lending Club training frame.
mixed_nulls(df_train_lending_club)

In [None]:
# Mixed-nulls check on the Lending Club validation frame.
mixed_nulls(df_val_lending_club)

#### Single Value

In [None]:
# Single-value check on the hand-built demo frame.
is_single_value(df_single_value_demo)

In [None]:
# Single-value check on the Lending Club training frame.
is_single_value(df_train_lending_club)

In [None]:
# Single-value check on the Lending Club validation frame.
is_single_value(df_val_lending_club)

#### String Mismatch - till here done but not updated

In [None]:
# Show the string-mismatch demo frame before running the check on it.
df_string_mismatch

In [None]:
# String-mismatch check on the hand-built demo frame.
string_mismatch(df_string_mismatch)

In [None]:
# String-mismatch check on the Lending Club training frame.
string_mismatch(df_train_lending_club)

In [None]:
# String-mismatch check on the Lending Club validation frame.
string_mismatch(df_val_lending_club)

#### From here TODO

#### Data Duplicates

#### Dominant Frequency Change

#### Mixed Types

#### New Category

#### New Label

#### Special Characters

#### String Length Outlier

#### String Mismatch Comparison

#### Rare Format Detection

In [None]:
# rare_format_detection(rare_format_df)
# rare_format_df = df.append({'x1': 'bla2', 'x2': 'BIG', 'x3': 2}, ignore_index=True)

In [None]:
# rare_format_detection(df_train)

### Overfit

#### TODO - Boosting Overfit

#### TODO - Performance Overfit

### Drift

In [None]:
# TBD

### Leakage

#### Index Train-Validation Leakage

In [None]:
# Index leakage check on the synthetic train / validation datasets.
index_train_validation_leakage(train_df_synthetic_leakage, val_df_synthetic_leakage)

In [None]:
# Index leakage check on the Lending Club train / validation datasets.
index_train_validation_leakage(ds_train_lending_club, ds_val_lending_club)

#### Data Sample Leakage

In [None]:
# Data sample leakage report on the iris datasets; the validation dataset is passed first, then the training dataset.
data_sample_leakage_report(val_ds_index_leakage, train_ds_index_leakage)

In [None]:
# data_sample_leakage_report(ds_val, ds_train)

#### TODO - Single Feature Contribution

### Performance

#### Performance Report

In [None]:
# Instantiate the performance report check with no arguments.
cls_report_check = PerformanceReport()

In [None]:
# Run the performance report on the Lending Club validation dataset and model.
cls_report_check.run(ds_val_lending_club, lending_club_catboost_clf)

#### TODO - Confusion Matrix Report

#### TODO - Naive Comparison

#### TODO - ROC Report