# Sensitivity Analysis on a DAI Model

In [1]:
import requests
import math
import os
import pandas as pd
from h2oai_client import Client, ModelParameters, InterpretParameters
from sklearn.model_selection import train_test_split

# Download, explore, and prepare UCI credit card default data
UCI credit card default data: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients

The UCI credit card default data contains demographic and payment information about credit card customers in Taiwan in the year 2005. The data set contains 23 input variables:

* LIMIT_BAL: Amount of given credit (NT dollar)
* SEX: 1 = male; 2 = female
* EDUCATION: 1 = graduate school; 2 = university; 3 = high school; 4 = others
* MARRIAGE: 1 = married; 2 = single; 3 = others
* AGE: Age in years
* PAY_0, PAY_2 - PAY_6: History of past payment; PAY_0 = the repayment status in September, 2005; PAY_2 = the repayment status in August, 2005; ...; PAY_6 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; ...; 8 = payment delay for eight months; 9 = payment delay for nine months and above.
* BILL_AMT1 - BILL_AMT6: Amount of bill statement (NT dollar). BILL_AMT1 = amount of bill statement in September, 2005; BILL_AMT2 = amount of bill statement in August, 2005; ...; BILL_AMT6 = amount of bill statement in April, 2005.
* PAY_AMT1 - PAY_AMT6: Amount of previous payment (NT dollar). PAY_AMT1 = amount paid in September, 2005; PAY_AMT2 = amount paid in August, 2005; ...; PAY_AMT6 = amount paid in April, 2005.

These 23 input variables are used to predict the target variable, whether or not a customer defaulted on their credit card bill in late 2005.

# Import data and clean
The credit card default data is available as an .xls file. Pandas reads .xls files automatically, so it's used to load the credit card default data and give the prediction target a shorter name: DEFAULT_NEXT_MONTH.

In [2]:
#Location of the raw UCI spreadsheet
path = 'data/default_of_credit_card_clients.xls'

#Load the sheet (skipping its first row) and, in the same chain,
#give the target column a name without spaces
target_rename = {'default payment next month': 'DEFAULT_NEXT_MONTH'}
data = pd.read_excel(path, skiprows=1).rename(columns=target_rename)

# Helper function for recoding values in the UCI credit card default data
This simple function maps the original integer values of the input variables found in the dataset to the longer, more understandable character string values from the UCI credit card default data dictionary.

In [3]:
def recode_cc_data(frame):
    
    """ Recodes numeric categorical variables into categorical character variables
    with more transparent values. 
    
    The input frame is left untouched: the recoding is applied to a copy. Values
    that already carry a recoded label are passed through unchanged, so applying
    the function to its own output (e.g. when a notebook cell is re-run) is harmless.
    
    Args:
        frame: Pandas DataFrame version of UCI credit card default data. Must
            contain SEX, EDUCATION and MARRIAGE; any of PAY_0, PAY_2 - PAY_6
            that are present are recoded as well.
        
    Returns: 
        New Pandas DataFrame with recoded values.
        
    Raises:
        KeyError: If SEX, EDUCATION or MARRIAGE is missing, or if a recoded 
            column holds a value that is neither a known integer code nor an 
            already recoded label.
        
    """
    
    #Define recoded values
    sex_dict = {1:'male', 2:'female'}
    education_dict = {0:'other', 1:'graduate school', 2:'university', 3:'high school', 
                      4:'other', 5:'other', 6:'other'}
    marriage_dict = {0:'other', 1:'married', 2:'single', 3:'divorced'}
    pay_dict = {-2:'no consumption', -1:'pay duly', 0:'use of revolving credit', 1:'1 month delay', 
                2:'2 month delay', 3:'3 month delay', 4:'4 month delay', 5:'5 month delay', 6:'6 month delay', 
                7:'7 month delay', 8:'8 month delay', 9:'9+ month delay'}
    pay_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
    
    def recode_column(series, code_dict):
        
        #Translate one column; unknown values still fail loudly
        labels = set(code_dict.values())
        
        def recode_value(value):
            if value in code_dict:
                return code_dict[value]
            if value in labels: #Already recoded by an earlier call
                return value
            raise KeyError(value)
        
        return series.apply(recode_value)
    
    #Work on a copy so the caller's frame is never modified
    recoded = frame.copy()
    recoded['SEX'] = recode_column(recoded['SEX'], sex_dict)
    recoded['EDUCATION'] = recode_column(recoded['EDUCATION'], education_dict)
    recoded['MARRIAGE'] = recode_column(recoded['MARRIAGE'], marriage_dict)
    
    #Payment status columns are optional: recode only those that exist
    for name in pay_cols:
        if name in recoded.columns:
            recoded[name] = recode_column(recoded[name], pay_dict)
                
    return recoded

data = recode_cc_data(data)

# Display descriptive statistics for numeric variables

In [4]:
data.describe()

Unnamed: 0,ID,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT_NEXT_MONTH
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,15000.5,167484.322667,35.4855,51223.3309,49179.075167,47013.15,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.2212
std,8660.398374,129747.661567,9.217904,73635.860576,71173.768783,69349.39,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.415062
min,1.0,10000.0,21.0,-165580.0,-69777.0,-157264.0,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7500.75,50000.0,28.0,3558.75,2984.75,2666.25,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,15000.5,140000.0,34.0,22381.5,21200.0,20088.5,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,22500.25,240000.0,41.0,67091.0,64006.25,60164.75,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,0.0
max,30000.0,1000000.0,79.0,964511.0,983931.0,1664089.0,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


# Write train and test sets from client to DAI server disk for later use

In [5]:
train_path = "data/credit_train.csv"
test_path = "data/credit_test.csv"
if not (os.path.isfile(train_path) and os.path.isfile(test_path)):
    #No cached split on disk yet: split the credit card dataset 70/30 and cache both parts.
    #A fixed random_state makes the split, and every result derived from it, reproducible.
    train_pd, test_pd = train_test_split(data, test_size=0.3, random_state=1234)
    train_pd.to_csv(path_or_buf=train_path, index=False) #Write train set to csv
    test_pd.to_csv(path_or_buf=test_path, index=False) #Write test set to csv
else:
    #Reuse the split written by an earlier run
    train_pd = pd.read_csv(train_path)
    test_pd = pd.read_csv(test_path)

# Connect to DAI Server

In [6]:
#Connection settings are read from the environment when available, so real
#credentials never need to be typed into (and saved with) the notebook.
#The fallbacks are placeholder values: replace them or set DAI_IP,
#DAI_USERNAME and DAI_PASSWORD before running.
ip = os.environ.get('DAI_IP', 'localhost')
username = os.environ.get('DAI_USERNAME', 'username')
password = os.environ.get('DAI_PASSWORD', 'password')
h2oai = Client(address = 'http://' + ip + ':12345', username = username, password = password)

# Upload data to DAI Server

In [7]:
#DAI needs absolute paths, so both CSV locations are anchored at the working directory
cwd = os.getcwd()
train_path_dai, test_path_dai = [cwd + rel_path
                                 for rel_path in ("/data/credit_train.csv", "/data/credit_test.csv")]

#Register the train set first, then the test set, with the DAI server
train, test = [h2oai.create_dataset_sync(csv_path)
               for csv_path in (train_path_dai, test_path_dai)]

# Setup parameters for DAI experiment

In [8]:
#Set the parameters you want to pass to DAI 
#These are the same parameters you see in the DAI GUI

#--- Datasets ---
dataset_key=train.key #Key of the uploaded training dataset
validset_key='' #Validation set to use for DAI (Note, we are not using one for this experiment)
testset_key=test.key #Key of the uploaded test dataset

#--- Columns ---
target="DEFAULT_NEXT_MONTH" #Target column for DAI
dropped_cols=['ID'] #List of columns to drop. In this case we are dropping 'ID'
weight_col=None #The column that indicates the per row observation weights. 
                #If None, each row will have an observation weight of 1
fold_col=None #The column that indicates the fold. If None, the folds will be determined by DAI
time_col='[OFF]' #Time Column: The column that provides a time order, if applicable.
                  #if [AUTO], Driverless AI will auto-detect a potential time order
                  #if [OFF], auto-detection is disabled

#--- Problem type and execution ---
is_time_series=False #Whether or not the experiment is a time series problem
classification=True #Inform DAI if the problem type is a classification (binomial/multinomial) 
                    #or not (regression)
enable_gpus=True #Whether or not to enable GPUs
seed=1234 #Use seed for reproducibility
scorer_str='auc' #Set evaluation metric. In this case we are interested in optimizing AUC

#--- The 3 knobs you see in the DAI UI ---
#These values are only used for the first experiment preview below; the launched
#experiment uses the values suggested by DAI (params.accuracy, params.time,
#params.interpretability) instead.
accuracy=5 #Accuracy setting for experiment
time=5 #Time setting for experiment
interpretability=5 #Interpretability setting for experiment

#--- Expert settings ---
config_overrides=None #Extra parameters that can be passed in TOML format; handed to the
                      #preview and tuning-suggestion calls below

For information on the experiment settings, refer to the [Experiment Settings](http://docs.h2o.ai/driverless-ai/latest-stable/docs/userguide/launching.html#experimentsettings).

# Preview DAI experiment
For this example, we will be predicting default payment next month. The parameters that control the experiment process are:  accuracy, time, and interpretability. We can use the get_experiment_preview_sync function to get a sense of what will happen during the experiment.

We will start out by seeing what the experiment will look like with accuracy, time, and interpretability all set to 5.

In [9]:
#Preview the experiment plan for the manually chosen knob settings
exp_preview = h2oai.get_experiment_preview_sync(
    #Data and column roles
    dataset_key=train.key,
    validset_key=validset_key,
    target_col=target,
    dropped_cols=dropped_cols,
    time_col=time_col,

    #Problem type and hardware
    classification=classification,
    enable_gpus=enable_gpus,

    #Experiment knobs
    accuracy=accuracy,
    time=time,
    interpretability=interpretability,

    #Extra TOML settings
    config_overrides=config_overrides,
)
exp_preview

['ACCURACY [5/10]:',
 '- Training data size: *21,000 rows, 24 cols*',
 '- Feature evolution: *XGBoost*, *1/3 validation split*',
 '- Final pipeline: *Ensemble (1xXGBoost), 4-fold CV*',
 '',
 'TIME [5/10]:',
 '- Feature evolution: *4 individuals*, up to *58 iterations*',
 '- Early stopping: After *10* iterations of no improvement',
 '',
 'INTERPRETABILITY [5/10]:',
 '- Feature pre-pruning strategy: None',
 '- Monotonicity constraints: disabled',
 '- Feature engineering search space (where applicable): [Clustering, Date, FrequencyEncoding, Identity, Interactions, NumEncoding, TargetEncoding, Text, TruncatedSVD, WeightOfEvidence]',
 '',
 'XGBoost models to train:',
 '- Model and feature tuning: *16*',
 '- Feature evolution: *104*',
 '- Final pipeline: *4*',
 '',
 'Estimated max. total memory usage:',
 '- Feature engineering: *224.0MB*',
 '- GPU XGBoost: *24.0MB*',
 '',
 'Estimated runtime: *6 minutes*']

With these settings, the Driverless AI experiment should take around 6 minutes to run and will train about 124 models:

* 16 for model and feature tuning
* 104 for feature evolution
* 4 for the final pipeline

Driverless AI can suggest the parameters based on the dataset and target column. Below we will use the get_experiment_tuning_suggestion to see what settings Driverless AI suggests.

In [10]:
#Ask Driverless AI to propose experiment settings for this dataset and target
params = h2oai.get_experiment_tuning_suggestion(
    dataset_key=train.key,
    target_col=target,
    is_classification=classification,
    is_time_series=is_time_series,
    config_overrides=config_overrides,
)
params.dump()

{'dataset_key': 'suniduco',
 'target_col': 'DEFAULT_NEXT_MONTH',
 'weight_col': '',
 'fold_col': '',
 'orig_time_col': '',
 'time_col': '',
 'is_classification': True,
 'cols_to_drop': [],
 'validset_key': '',
 'testset_key': '',
 'enable_gpus': True,
 'seed': False,
 'accuracy': 6,
 'time': 3,
 'interpretability': 6,
 'scorer': 'AUC',
 'time_groups_columns': [],
 'time_period_in_seconds': None,
 'num_prediction_periods': None,
 'num_gap_periods': None,
 'is_timeseries': False,
 'config_overrides': None}

Driverless AI has found that the best parameters are to set accuracy = 6, time = 3, and interpretability = 6. It has selected AUC as the scorer (this is the default scorer for binomial problems).

We can see our experiment preview with the suggested settings below.

In [11]:
#Preview the experiment plan again, this time with the knob values DAI suggested
exp_preview = h2oai.get_experiment_preview_sync(
    #Data and column roles
    dataset_key=dataset_key,
    validset_key=validset_key,
    target_col=target,
    dropped_cols=dropped_cols,
    time_col=time_col,

    #Problem type and hardware
    classification=classification,
    enable_gpus=enable_gpus,

    #Experiment knobs, taken from the tuning suggestion
    accuracy=params.accuracy,
    time=params.time,
    interpretability=params.interpretability,

    #Extra TOML settings
    config_overrides=config_overrides,
)
exp_preview

['ACCURACY [6/10]:',
 '- Training data size: *21,000 rows, 24 cols*',
 '- Feature evolution: *XGBoost*, *1/3 validation split*',
 '- Final pipeline: *Ensemble (1xGLM, 1xXGBoost), 5-fold CV*',
 '',
 'TIME [3/10]:',
 '- Feature evolution: *4 individuals*, up to *42 iterations*',
 '- Early stopping: After *5* iterations of no improvement',
 '',
 'INTERPRETABILITY [6/10]:',
 '- Feature pre-pruning strategy: FS',
 '- Monotonicity constraints: disabled',
 '- Feature engineering search space (where applicable): [Date, FrequencyEncoding, Identity, Interactions, NumEncoding, TargetEncoding, Text, WeightOfEvidence]',
 '',
 'XGBoost models to train:',
 '- Model and feature tuning: *24*',
 '- Feature evolution: *64*',
 '- Final pipeline: *10*',
 '',
 'Estimated max. total memory usage:',
 '- Feature engineering: *224.0MB*',
 '- GPU XGBoost: *24.0MB*',
 '',
 'Estimated runtime: *5 minutes*']

# Launch experiment
Launch the experiment using the accuracy, time, and interpretability settings DAI suggested

In [12]:
#Launch the experiment; the three knob values are the ones suggested by DAI
experiment = h2oai.start_experiment_sync(

    #Datasets
    dataset_key=train.key,
    testset_key=testset_key,
    validset_key=validset_key,

    #Column roles
    target_col=target,
    weight_col=weight_col,
    fold_col=fold_col,
    time_col=time_col,
    orig_time_col=time_col,
    cols_to_drop=dropped_cols,

    #Problem definition
    is_classification=classification,
    is_timeseries=is_time_series,
    scorer=scorer_str,

    #Experiment knobs
    accuracy=params.accuracy,
    time=params.time,
    interpretability=params.interpretability,

    #Execution
    enable_gpus=enable_gpus,
    seed=seed,
)

# View the final model score for the validation and test datasets
When feature engineering is complete, an ensemble model can be built depending on the accuracy setting. The experiment object also contains the score on the validation and test data for this ensemble model. In this case, the validation score is the score on the training cross-validation predictions.

In [13]:
#Round both scores to three decimals before reporting them
valid_score_rounded = round(experiment.valid_score, 3)
test_score_rounded = round(experiment.test_score, 3)
print("Final Model Score on Validation Data: " + str(valid_score_rounded))
print("Final Model Score on Test Data: " + str(test_score_rounded))

Final Model Score on Validation Data: 0.784
Final Model Score on Test Data: 0.779


# Variable importance for DAI experiment
The table outputted below shows the feature name, its relative importance, and a description. Some features will be engineered by Driverless AI and some can be the original feature.

In [14]:
#Download the experiment summary archive from the DAI server
import subprocess
summary_path = h2oai.download(src_path=experiment.summary_path, dest_dir=".")
dir_path = "./h2oai_experiment_summary_" + experiment.key

#Unpack it, overwriting files from earlier runs. check_call raises
#CalledProcessError if unzip fails, instead of silently continuing
#and reading a missing or stale features.txt below.
subprocess.check_call(['unzip', '-o', summary_path, '-d', dir_path])

#View Features
features = pd.read_csv(dir_path + "/features.txt", skipinitialspace=True)
features.head(n=30)

Unnamed: 0,Relative Importance,Feature,Description
0,1.0,2_CVTE:PAY_0.0,Out-of-fold mean of the response grouped by: [...
1,0.3755,3_CVTE:PAY_2.0,Out-of-fold mean of the response grouped by: [...
2,0.09991,4_CVTE:PAY_3.0,Out-of-fold mean of the response grouped by: [...
3,0.07735,18_PAY_AMT2,PAY_AMT2 (original)
4,0.074373,5_CVTE:PAY_4.0,Out-of-fold mean of the response grouped by: [...
5,0.065621,7_CVTE:PAY_6.0,Out-of-fold mean of the response grouped by: [...
6,0.062677,16_LIMIT_BAL,LIMIT_BAL (original)
7,0.05977,26_NumCatTE:BILL_AMT1:BILL_AMT3:BILL_AMT4:LIMI...,Out-of-fold mean of the response grouped by: [...
8,0.055562,10_BILL_AMT1,BILL_AMT1 (original)
9,0.055515,17_PAY_AMT1,PAY_AMT1 (original)


# Setup scoring package from DAI experiment

In [15]:
h2oai.download(experiment.scoring_pipeline_path, '.')

'./scorer.zip'

#### Execute external processes to install scoring artifact

In [16]:
%%capture
%%bash
#Unzip the scoring package; -o overwrites files left by an earlier run so
#re-executing this cell always yields the freshly downloaded pipeline
unzip -o scorer.zip;

In [17]:
#Install the scoring module wheel found in the scoring-pipeline directory
!pip install scoring-pipeline/scoring_h2oai_experiment_*.whl

Processing ./scoring-pipeline/scoring_h2oai_experiment_koguduvo-1.0.0-py3-none-any.whl
Installing collected packages: scoring-h2oai-experiment-koguduvo
Successfully installed scoring-h2oai-experiment-koguduvo-1.0.0
[33mYou are using pip version 9.0.3, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


#### Import scoring package installed directly above

In [18]:
#The module name ends with the experiment name ('koguduvo' for the run shown here).
#When re-running with a new experiment, change the import to 
#scoring_h2oai_experiment_<your experiment name>, matching the wheel that pip installed.
from scoring_h2oai_experiment_koguduvo import Scorer

In [19]:
%%capture
#Create a singleton Scorer instance.
#For optimal performance, create a Scorer instance once, and call score() or score_batch() multiple times.
scorer = Scorer()

In [20]:
#Check the column names used by the scorer
scorer.get_column_names()

('LIMIT_BAL',
 'SEX',
 'EDUCATION',
 'AGE',
 'PAY_0',
 'PAY_2',
 'PAY_3',
 'PAY_4',
 'PAY_5',
 'PAY_6',
 'BILL_AMT1',
 'BILL_AMT2',
 'BILL_AMT3',
 'BILL_AMT4',
 'BILL_AMT5',
 'BILL_AMT6',
 'PAY_AMT1',
 'PAY_AMT2',
 'PAY_AMT3',
 'PAY_AMT4',
 'PAY_AMT5',
 'PAY_AMT6')

In [21]:
scores = scorer.score_batch(test_pd).drop('DEFAULT_NEXT_MONTH.0', axis=1)

In [22]:
#Give the remaining probability column a friendlier name
probability_rename = {"DEFAULT_NEXT_MONTH.1": "p_DEFAULT_NEXT_MONTH"}
scores = scores.rename(columns=probability_rename)

#Sanity check: exactly one score per test row
assert len(scores) == len(test_pd), "Test set rows and score rows should match!"
scores.head()

Unnamed: 0,p_DEFAULT_NEXT_MONTH
0,0.728522
1,0.22365
2,0.72366
3,0.120519
4,0.05963


# Perform sensitivity analysis to test model performance on unseen data
Sensitivity analysis investigates whether model behavior and outputs remain stable when data is intentionally perturbed or other changes are simulated in data. Beyond traditional assessment practices, sensitivity analysis of machine learning model predictions is perhaps the most important validation technique for machine learning models. Machine learning models can make drastically differing predictions for only minor changes in input variable values. In practice, many linear model validation techniques focus on the numerical instability of regression parameters due to correlation between input variables or between input variables and the dependent variable. It may be prudent for those switching from linear modeling techniques to machine learning techniques to focus less on numerical instability of model parameters and to focus more on the potential instability of model predictions.

Here sensitivity analysis is used to understand the impact of changing one of the most important input variables, PAY_0, and the impact of a sociologically sensitive variable, SEX, in the model. If the model changes in reasonable and expected ways when important variable values are changed this can enhance trust in the model. If the contribution of potentially sensitive variables, such as those related to gender, race, age, marital status, or disability status, can be shown to have minimal impact on the model, this is an indication of fairness in the model predictions and can also increase overall trust in the model.

### Bind new model predictions onto test data
Typically, a productive exercise in model debugging and validation is to investigate customers with very high or low predicted probabilities to determine if their predictions stay within reasonable bounds when important variables are changed. The predictions from the new, more accurate model are merged onto the test set to find these potentially interesting customers.

In [23]:
#Give both frames a fresh 0..n-1 index so rows line up by position, then bind
#them column-wise. New frames are built for this, so test_pd and scores are
#not modified in place and the cell can be re-run safely.
test_yhat = pd.concat([test_pd.reset_index(drop=True),
                       scores.reset_index(drop=True)], axis=1)
test_yhat.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT_NEXT_MONTH,p_DEFAULT_NEXT_MONTH
0,26648,40000,male,graduate school,single,26,2 month delay,use of revolving credit,use of revolving credit,2 month delay,...,39103,38945,2000,3900,1500,0,1600,1600,1,0.728522
1,1545,360000,female,university,single,34,1 month delay,no consumption,no consumption,pay duly,...,0,0,0,0,700,0,0,0,1,0.22365
2,13963,30000,female,high school,married,54,2 month delay,2 month delay,2 month delay,use of revolving credit,...,27972,27374,3600,0,1200,2400,0,2300,1,0.72366
3,354,80000,female,university,single,43,use of revolving credit,use of revolving credit,use of revolving credit,use of revolving credit,...,19036,19414,3177,2600,3000,1691,695,882,0,0.120519
4,2177,320000,male,university,single,35,use of revolving credit,use of revolving credit,use of revolving credit,use of revolving credit,...,18627,19167,7000,4100,1000,1000,1000,1000,0,0.05963


In [24]:
def get_percentile_dict(yhat, id_, frame):

    """ Returns the minimum, the maximum, and the deciles of a column, yhat, 
        as the indices based on another column id_.
    
    Args:
        yhat: Column in which to find percentiles.
        id_: Id column that stores indices for percentiles of yhat.
        frame: Pandas DataFrame containing yhat and id_. It is not modified.
    
    Returns:
        Dictionary of percentile values and index column values. The keys are 
        0 (row with the lowest yhat), 10, 20, ..., 90 and 99 (row with the 
        highest yhat); each value is the id_ of the row found at that position 
        after sorting by yhat.
    
    """
    
    #Sort by yhat; sort_values returns a new frame, so the input keeps its order
    sort_df = frame.sort_values(yhat)
    
    #Renumber rows 0..n-1 so positions can be looked up with .loc.
    #When id_ is a regular column the old index is dropped: inserting it as a 
    #column is pointless and fails if frame already has a column named 'index'.
    #Otherwise the index is kept as a column, so an id_ that names the index still works.
    sort_df = sort_df.reset_index(drop=(id_ in sort_df.columns))
    n_rows = sort_df.shape[0]
    
    #Find top and bottom percentiles
    percentiles_dict = {}
    percentiles_dict[0] = sort_df.loc[0, id_]
    percentiles_dict[99] = sort_df.loc[n_rows - 1, id_]

    #Find 10th-90th percentiles: step through the sorted rows in tenths
    inc = n_rows // 10
    for i in range(1, 10):
        percentiles_dict[i * 10] = sort_df.loc[i * inc, id_]

    return percentiles_dict

#Display percentiles dictionary
#ID values for rows
#from lowest prediction 
#to highest prediction
pred_percentile_dict = get_percentile_dict('p_DEFAULT_NEXT_MONTH', 'ID', test_yhat)
pred_percentile_dict

{0: 28743,
 99: 16555,
 10: 2778,
 20: 15643,
 30: 10260,
 40: 8375,
 50: 26256,
 60: 25387,
 70: 15318,
 80: 8399,
 90: 17764}

### Display test data prediction range
Below, we can see that the model produces both very low and very high predictions in test set, indicating that it is likely responsive to signal in new data and not simply predicting the majority class or an average value.

In [25]:
y = "DEFAULT_NEXT_MONTH"
yhat = "p_DEFAULT_NEXT_MONTH"

#IDs of the customers with the lowest and the highest predicted probability
lowest_id = int(pred_percentile_dict[0])
highest_id = int(pred_percentile_dict[99])

#Show actual outcome next to predicted probability for both customers
print('Lowest prediction:', test_yhat.loc[test_yhat['ID'] == lowest_id, [y, yhat]])
print('Highest prediction:', test_yhat.loc[test_yhat['ID'] == highest_id, [y, yhat]])

Lowest prediction:       DEFAULT_NEXT_MONTH  p_DEFAULT_NEXT_MONTH
2562                   0              0.027384
Highest prediction:       DEFAULT_NEXT_MONTH  p_DEFAULT_NEXT_MONTH
3156                   1              0.861882


### Use trained model to test predictions for interesting situations: customer least likely to default
As a starting point for further analysis, sensitivity analysis is performed for the customer least 
likely to default. This woman has a roughly 0.03 probability of defaulting according to the trained DAI model.

In [27]:
test_case = test_yhat[test_yhat['ID'] == int(pred_percentile_dict[0])]
test_case

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT_NEXT_MONTH,p_DEFAULT_NEXT_MONTH
2562,28743,400000,female,graduate school,single,29,pay duly,pay duly,use of revolving credit,use of revolving credit,...,70732,68674,62545,53704,4142,5010,66676,66660,0,0.027384


### Test effect of changing SEX
SEX should not have a large impact on predictions. A large impact could indicate unwanted sociological bias in the DAI model.

In [28]:
#Select the customer least likely to default and keep only the scorer's input columns.
#.copy() makes test_case an independent frame, so the edit below does not act on
#a slice of test_yhat (which triggers pandas' SettingWithCopyWarning).
test_case = test_yhat[test_yhat['ID'] == int(pred_percentile_dict[0])]
test_case = test_case[list(scorer.get_column_names())].copy()

#Score the unchanged record
score = scorer.score(test_case.values.tolist()[0])
print("***Prediction on the first row when SEX is female ", str(score[1]), "***\n")

#Flip SEX to 'male' and re-score
test_case['SEX'] = 'male'
score = scorer.score(test_case.values.tolist()[0])
print("***Prediction on the first row when SEX is male ", str(score[1]), "***")

***Prediction on the first row when SEX is female  0.027383523644371473 ***

***Prediction on the first row when SEX is male  0.027087264440276405 ***


It seems simulating this person as a male does have a very minimal impact on their probability of default but nothing alarming.

### Test effect of changing PAY_0
Variable importance indicates that the value of PAY_0 can have a strong effect on 
model predictions. Measuring the change in predicted probability when the value of PAY_0 is changed from a 
timely payment to late payment is probably a good test case for prediction stability.

In [29]:
#Get original test case and score.
#.copy() makes test_case an independent frame, so the edit below does not act on
#a slice of test_yhat (which triggers pandas' SettingWithCopyWarning).
test_case = test_yhat[test_yhat['ID'] == int(pred_percentile_dict[0])]
test_case = test_case[list(scorer.get_column_names())].copy()
score = scorer.score(test_case.values.tolist()[0])
print("***Prediction on the first row when PAY_0 remains unchanged ", str("%.4f" % score[1]), "***\n")

#Change PAY_0 to '2 month delay' and re-score
test_case['PAY_0'] = '2 month delay' 
score = scorer.score(test_case.values.tolist()[0])
print("***Prediction on the first row when PAY_0 goes from 'pay duly' to '2 month delay' ", str("%.4f" % score[1]), "***\n")

***Prediction on the first row when PAY_0 remains unchanged  0.0274 ***

***Prediction on the first row when PAY_0 goes from 'pay duly' to '2 month delay'  0.2427 ***



When the value is changed from pay duly to two month delay there is a roughly 8.8X increase in predicted probability. Assuming a 0.5 probability cutoff, the predicted outcome is still stable.

### Use trained model to test predictions for interesting situations: customer most likely to default
Now the same test will be performed on the customer most likely to default. This man has a roughly 0.86 probability of default according to the trained DAI model.

In [30]:
test_case = test_yhat[test_yhat['ID'] == int(pred_percentile_dict[99])]
test_case

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT_NEXT_MONTH,p_DEFAULT_NEXT_MONTH
3156,16555,10000,male,graduate school,single,30,3 month delay,2 month delay,2 month delay,7 month delay,...,2300,2300,0,0,0,0,0,0,1,0.861882


### Test effect of changing SEX
Changing the value for SEX from male to female for this customer slightly increases the predicted probability to about 0.8621, another small change but nothing alarming.

In [31]:
#Select the customer most likely to default and keep only the scorer's input columns.
#.copy() makes test_case an independent frame, so the edit below does not act on
#a slice of test_yhat (which triggers pandas' SettingWithCopyWarning).
test_case = test_yhat[test_yhat['ID'] == int(pred_percentile_dict[99])]
test_case = test_case[list(scorer.get_column_names())].copy()

#Score the unchanged record
score = scorer.score(test_case.values.tolist()[0])
print("***Prediction on the first row when SEX is Male ", str("%.4f" % score[1]), "***\n")

#Flip SEX to 'female' and re-score
test_case['SEX'] = 'female'
score = scorer.score(test_case.values.tolist()[0])
print("***Prediction on the first row when SEX is Female ", str("%.4f" % score[1]), "***")

***Prediction on the first row when SEX is Male  0.8619 ***

***Prediction on the first row when SEX is Female  0.8621 ***


### Test effect of changing PAY_0
Switching the riskiest customer's value for PAY_0 from 3 month delay to pay duly reduces their chance of defaulting from roughly 86% to roughly 67%, a noticeable swing in probability but still a high probability value, markedly greater than the typical 0.5 cutoff.

In [33]:
#Get original test case and score.
#.copy() makes test_case an independent frame, so the edit below does not act on
#a slice of test_yhat (which triggers pandas' SettingWithCopyWarning).
test_case = test_yhat[test_yhat['ID'] == int(pred_percentile_dict[99])]
test_case = test_case[list(scorer.get_column_names())].copy()
score = scorer.score(test_case.values.tolist()[0])
print("***Prediction on the first row when PAY_0 remains unchanged ", str("%.4f" % score[1]), "***\n")

#Change PAY_0 to 'pay duly' and re-score
test_case['PAY_0'] = 'pay duly' 
score = scorer.score(test_case.values.tolist()[0])
print("***Prediction on the first row when PAY_0 goes from '3 month delay' to 'pay duly' ", str("%.4f" % score[1]), "***\n")

***Prediction on the first row when PAY_0 remains unchanged  0.8619 ***

***Prediction on the first row when PAY_0 goes from '3 month delay' to 'pay duly'  0.6670 ***



# Summary
In this notebook, a DAI model was trained to predict credit card defaults. Sensitivity analysis was used to test the DAI model for trustworthiness and stability. In a small number of boundary test cases, the trained DAI model appeared somewhat stable. Sensitivity analysis is a powerful model debugging technique and can increase trust in complex models. This technique should generalize well for many types of business and research problems, enabling you to train a complex model and justify it to your colleagues, bosses, and potentially, external regulators.