## Binary Classifier
This notebook builds a binary classification model and performs data, performance, fairness, security, and privacy assessments for it.

The model predicts whether a loan applicant is unqualified or qualified based on their income, credit, etc.

The dataset preparation notebook is available [here](https://github.com/credo-ai/customer_demos/blob/prod/prod/d3_loan_approval/data_preparation.ipynb).

In [2]:
#Loading Libraries
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split
import pickle

In [3]:
# Name of the column holding the binary target.
label_key = 'loan_approved'
# Names of the columns treated as sensitive features.
sensitive_feature_keys = [
    'Gender',
    'Race',
]

In [4]:
# Load the processed loan dataset from the frozen-data directory.
# NOTE: unpickling can execute arbitrary code, so only read pickle files from a trusted source.
df = pd.read_pickle('../frozen_data/loan_binary/loan_processed.pkl')

In [5]:
# Everything except the sensitive-feature columns and the label is a model input.
non_feature_columns = sensitive_feature_keys + [label_key]
features = df.drop(columns=non_feature_columns)
# Label column and sensitive-feature columns are kept as separate objects.
target = df[label_key]
sf = df[sensitive_feature_keys]


In [6]:
# Split inputs, labels and sensitive features together so their rows stay aligned;
# 30% of the rows go to the test split and the seed is fixed.
(
    X_train, X_test,
    y_train, y_test,
    sf_train, sf_test,
) = train_test_split(
    features,
    target,
    sf,
    test_size=0.3,
    random_state=0,
)

In [7]:
# Logistic-regression classifier with a fixed random_state.
clf = LogisticRegression(random_state=0)
# Fit on the training split only; as the cell's last expression, the result is what the cell displays.
clf.fit(X_train, y_train)

In [8]:
# Score both splits first, then report them to two decimal places.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print('Accuracy on training set: {:.2f}'.format(train_accuracy))
print('Accuracy on test set: {:.2f}'.format(test_accuracy))

Accuracy on training set: 0.81
Accuracy on test set: 0.82


In [9]:
# Reassemble one frame per split: sensitive features, model inputs, then the label,
# placed side by side as columns.
df_training = pd.concat(
    [sf_train, X_train, y_train],
    axis='columns',
)
df_validation = pd.concat(
    [sf_test, X_test, y_test],
    axis='columns',
)

In [11]:
from credoai.lens import Lens
from credoai.artifacts import TabularData, ClassificationModel
from credoai.evaluators import *
from credoai.governance import Governance
import numpy as np

In [12]:
# Wrap the fitted classifier under a name for use with Lens.
credo_model = ClassificationModel('loan_default_classifier', clf)
# Wrap the held-out split (inputs, labels, sensitive features) as the assessment dataset.
test_data = TabularData(
    name='loan_val', X=X_test, y=y_test, sensitive_features=sf_test
)

In [28]:
# Bundle the held-out split and its metadata so it can be reloaded later.
validation_data = {
    'name': 'loan_val',
    'val_features': X_test,
    'val_labels': y_test,
    'label_name': 'loan_approved',
    'sensitive_features': sf_test,
}

# Freeze the bundle next to the processed dataset.
with open('../frozen_data/loan_binary/loan_validation.pkl', mode='wb') as validation_file:
    pickle.dump(validation_data, validation_file)

In [30]:
# Pipelines can be specified using an sklearn-like style:
# a list of (evaluator, step name) tuples.
metrics = ["false_negative_rate", "average_precision_score"]
pipeline = [
    (Performance(metrics), 'Performance Assessment'),
    (ModelFairness(metrics), "ModelFairness Assessment"),
]

# Record which metrics and assessments were configured, and freeze that record to disk.
pipeline_info = {
    'metrics': metrics,
    'assessments': ['Performance', 'ModelFairness'],
}

with open('../frozen_results/pipeline_info.pkl', mode='wb') as pipeline_info_file:
    pickle.dump(pipeline_info, pipeline_info_file)


In [31]:
# Assemble the Lens run from the wrapped model, the assessment dataset
# and the evaluator pipeline.
lens = Lens(
    model=credo_model,
    assessment_data=test_data,
    pipeline=pipeline
)

# Execute the pipeline; as the cell's last expression, the return value is what the cell displays.
lens.run()

2022-10-20 12:29:24,522 - lens - INFO - Evaluator Performance added to pipeline. 
2022-10-20 12:29:24,528 - lens - INFO - Evaluator ModelFairness added to pipeline. Dataset used: assessment_data. Sensitive feature: Gender
2022-10-20 12:29:24,534 - lens - INFO - Evaluator ModelFairness added to pipeline. Dataset used: assessment_data. Sensitive feature: Race
2022-10-20 12:29:24,535 - lens - INFO - Running evaluation for step: Performance Assessment
2022-10-20 12:29:24,536 - lens - INFO - Running evaluation for step: ModelFairness Assessment


<credoai.lens.lens.Lens at 0x28d5e7a60>

In [32]:
# Collect the output of the pipeline run and freeze it to disk.
results = lens.get_results()
with open('../frozen_results/binary_clf_results.pkl', mode='wb') as results_file:
    pickle.dump(results, results_file)
# Individual result tables are reachable by step name and position,
# e.g. results['ModelFairness Assessment'][1].

In [37]:
# Display the second result table of the ModelFairness step; in the recorded
# output this is the per-Race breakdown of each metric.
results['ModelFairness Assessment'][1]

Unnamed: 0,Race,type,value
0,Black,false_negative_rate,0.039216
1,NHPI,false_negative_rate,0.032258
2,White,false_negative_rate,0.033333
3,Black,average_precision_score,0.860209
4,NHPI,average_precision_score,0.803702
5,White,average_precision_score,0.869738
