In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Introduction
Our goal for this project is to develop a predictive model and a decision support system (DSS) that evaluates the risk of Home Equity Line of Credit (HELOC) applications. Ultimately, the decision support system, which is essentially an interactive interface, is able to give understandable explanations for sales representatives in a bank/credit card company. Even without coding experience, they can easily use the system to decide on accepting or rejecting applications.

The dataset and additional information can be found here: https://community.fico.com/s/explainable-machine-learning-challenge

# Part 1) Data Cleaning                                                                                                                                                        

In [2]:
# load the data
df = pd.read_csv('heloc_dataset_v1.csv')
df.describe()

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
count,10459.0,10459.0,10459.0,10459.0,10459.0,10459.0,10459.0,10459.0,10459.0,10459.0,...,10459.0,10459.0,10459.0,10459.0,10459.0,10459.0,10459.0,10459.0,10459.0,10459.0
mean,67.425758,184.205373,8.543455,73.843293,19.428052,0.042738,-0.142843,86.661536,6.762406,4.928291,...,32.16646,-0.325366,0.868152,0.812602,31.629888,39.158906,3.185008,0.976097,0.018071,62.079166
std,21.121621,109.683816,13.301745,38.782803,13.004327,2.51391,2.367397,25.999584,20.50125,3.756275,...,20.128634,6.067556,3.179304,3.143698,30.06014,42.101601,4.413173,4.060995,3.358135,27.711565
min,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,...,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0,-9.0
25%,63.0,118.0,3.0,52.0,12.0,0.0,0.0,87.0,-7.0,4.0,...,20.0,-7.0,0.0,0.0,5.0,-8.0,2.0,1.0,0.0,47.0
50%,71.0,178.0,5.0,74.0,19.0,0.0,0.0,96.0,-7.0,6.0,...,31.0,0.0,1.0,1.0,25.0,47.0,3.0,2.0,0.0,67.0
75%,79.0,249.5,11.0,95.0,27.0,1.0,0.0,100.0,14.0,7.0,...,44.0,1.0,2.0,2.0,54.0,79.0,5.0,3.0,1.0,82.0
max,94.0,803.0,383.0,383.0,79.0,19.0,19.0,100.0,83.0,9.0,...,100.0,24.0,66.0,66.0,232.0,471.0,32.0,23.0,18.0,100.0


In [3]:
# Before getting rid of NA values, first transform string into numerical variables.
# Get dummies for categorical column: Risk performance
category = ['RiskPerformance']
all_df = pd.get_dummies(df,columns=category,drop_first=True)
all_df

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,RiskPerformance_Good
0,55,144,4,84,20,3,0,83,2,3,...,0,0,0,33,-8,8,1,1,69,0
1,61,58,15,41,2,4,4,100,-7,0,...,0,0,0,0,-8,0,-8,-8,0,0
2,67,66,5,24,9,0,0,100,-7,7,...,0,4,4,53,66,4,2,1,86,0
3,66,169,1,73,28,1,1,93,76,6,...,0,5,4,72,83,6,4,3,91,0
4,81,333,27,132,12,0,0,100,-7,7,...,0,1,1,51,89,3,1,0,80,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10454,73,131,5,57,21,0,0,95,80,6,...,7,0,0,26,-8,5,2,0,100,1
10455,65,147,39,68,11,0,0,92,28,6,...,1,1,1,86,53,2,2,1,80,0
10456,74,129,6,64,18,1,1,100,-7,6,...,3,4,4,6,-8,5,-8,0,56,0
10457,72,234,12,113,42,2,2,96,35,6,...,6,0,0,19,-8,4,1,0,38,0


In [4]:
# Replace missing value with nan and drop
all_df.replace(-9, np.nan,inplace=True)
all_df.dropna(inplace=True)

In [5]:
# Get dummies for categorical column: MaxDelqever
category_2 = ['MaxDelqEver']
all_df2 = pd.get_dummies(all_df,columns=category_2,drop_first=False)
all_df2.head(3)

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,RiskPerformance_Good,MaxDelqEver_2.0,MaxDelqEver_3.0,MaxDelqEver_4.0,MaxDelqEver_5.0,MaxDelqEver_6.0,MaxDelqEver_7.0,MaxDelqEver_8.0
0,55.0,144.0,4.0,84.0,20.0,3.0,0.0,83.0,2.0,3.0,...,1.0,69.0,0,0,0,0,1,0,0,0
1,61.0,58.0,15.0,41.0,2.0,4.0,4.0,100.0,-7.0,0.0,...,-8.0,0.0,0,0,0,0,0,0,0,1
2,67.0,66.0,5.0,24.0,9.0,0.0,0.0,100.0,-7.0,7.0,...,1.0,86.0,0,0,0,0,0,0,0,1


In [6]:
# Get dummies for categorical column: MaxDelq2PublicRecLast12M
category_3 = ['MaxDelq2PublicRecLast12M']
all_df3 = pd.get_dummies(all_df2,columns=category_3,drop_first=False)
all_df3.head(3)

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,NumTotalTrades,...,MaxDelqEver_8.0,MaxDelq2PublicRecLast12M_0.0,MaxDelq2PublicRecLast12M_1.0,MaxDelq2PublicRecLast12M_2.0,MaxDelq2PublicRecLast12M_3.0,MaxDelq2PublicRecLast12M_4.0,MaxDelq2PublicRecLast12M_5.0,MaxDelq2PublicRecLast12M_6.0,MaxDelq2PublicRecLast12M_7.0,MaxDelq2PublicRecLast12M_9.0
0,55.0,144.0,4.0,84.0,20.0,3.0,0.0,83.0,2.0,23.0,...,0,0,0,0,1,0,0,0,0,0
1,61.0,58.0,15.0,41.0,2.0,4.0,4.0,100.0,-7.0,7.0,...,1,1,0,0,0,0,0,0,0,0
2,67.0,66.0,5.0,24.0,9.0,0.0,0.0,100.0,-7.0,9.0,...,1,0,0,0,0,0,0,0,1,0


In [7]:
# replace -7 and -9 with median value
all_df3.replace([-7,-8], np.nan,inplace=True)
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy="median")
imputer.fit(all_df3)
X=imputer.transform(all_df3)
df_new = pd.DataFrame(X, columns=all_df3.columns)



# Part 2) Train the Model

In [8]:
# Split the data into training and testing dataset
from sklearn.model_selection import train_test_split
X=df_new.drop('RiskPerformance_Good',axis=1)
y=df_new['RiskPerformance_Good']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=101)

In [9]:
# Normalize input data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_Xtrain = scaler.fit_transform(X_train)
scaled_Xtest = scaler.transform(X_test)

# Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
# Train the model
logmodel = LogisticRegression()
logmodel.fit(scaled_Xtrain,y_train)
# Make prediction
predictions = logmodel.predict(scaled_Xtest)
# Print confusion matrix
print(classification_report(y_test,predictions))
# print roc score
print(roc_auc_score(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.75      0.77      0.76      1519
         1.0       0.75      0.73      0.74      1440

    accuracy                           0.75      2959
   macro avg       0.75      0.75      0.75      2959
weighted avg       0.75      0.75      0.75      2959

0.7479690128739668




# Random Forest

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
param_grid = {'n_estimators': [50,100,500], 'max_features': [5,15,25],'max_depth':[5,20,50]} 
grid_search = GridSearchCV(rfc,param_grid,cv=5)
grid_search.fit(scaled_Xtrain,y_train)
grid_predictions = grid_search.predict(scaled_Xtest)
# print accuracy and roc score
print(classification_report(y_test,grid_predictions))
print(roc_auc_score(y_test, grid_predictions))

              precision    recall  f1-score   support

         0.0       0.74      0.79      0.76      1519
         1.0       0.76      0.70      0.73      1440

    accuracy                           0.75      2959
   macro avg       0.75      0.74      0.75      2959
weighted avg       0.75      0.75      0.75      2959

0.7449967083607636


In [12]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=50, max_features=5, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

# SVM (kernel=Poly)

In [13]:
from sklearn.svm import SVC
svc_poly = SVC(kernel='poly',degree=3,C=10,gamma=0.01)
param_grid = {'C': [0.1,1, 10], 'gamma': [0.1,0.01],'degree': [2,5]} 
svc_poly.fit(scaled_Xtrain,y_train)
svc_poly_predictions=svc_poly.predict(scaled_Xtest)
# print accuracy and roc score
print(classification_report(y_test,svc_poly_predictions))
print(roc_auc_score(y_test, svc_poly_predictions))

              precision    recall  f1-score   support

         0.0       0.75      0.73      0.74      1519
         1.0       0.72      0.74      0.73      1440

    accuracy                           0.73      2959
   macro avg       0.73      0.73      0.73      2959
weighted avg       0.73      0.73      0.73      2959

0.7331525217613927


# SVM (kernel=rbf)

In [14]:
svc_rbf = SVC(kernel='rbf',C=10,gamma=0.01)
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'degree': [2,5,10]} 
svc_rbf.fit(scaled_Xtrain,y_train)
svc_rbf_predictions=svc_rbf.predict(scaled_Xtest)
# print accuracy and roc score
print(classification_report(y_test,svc_rbf_predictions))
print(roc_auc_score(y_test, svc_rbf_predictions))

              precision    recall  f1-score   support

         0.0       0.74      0.76      0.75      1519
         1.0       0.74      0.72      0.73      1440

    accuracy                           0.74      2959
   macro avg       0.74      0.74      0.74      2959
weighted avg       0.74      0.74      0.74      2959

0.7406732316582546


# Naive Bayes

In [16]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix,accuracy_score, roc_curve, auc
NBmodel = GaussianNB()
NBmodel.fit(scaled_Xtrain,y_train)
y_train_predictions = NBmodel.predict(scaled_Xtrain)
y_test_predictions = NBmodel.predict(scaled_Xtest)
y_score = NBmodel.fit(scaled_Xtrain,y_train).predict_proba(scaled_Xtest)
from sklearn import metrics
print('Accuracy:',metrics.accuracy_score(y_test,y_test_predictions))
fpr,tpr,thresholds = roc_curve(y_test_predictions,y_score[:,1])
roc_auc = auc(fpr,tpr)
roc_auc

Accuracy: 0.6944913822237242


1.0

# SGDC Classifier

In [17]:
from sklearn.linear_model import SGDClassifier
SGDmodel = SGDClassifier(loss='hinge',penalty='l2',max_iter=5)
SGDmodel.fit(scaled_Xtrain,y_train)
y_train_predictions = SGDmodel.predict(scaled_Xtrain)
y_test_predictions = SGDmodel.predict(scaled_Xtest)
print('Accuracy:',metrics.accuracy_score(y_test_predictions,y_test))
fpr,tpr,thresholds = roc_curve(y_test_predictions,y_score[:,1])
roc_auc = auc(fpr,tpr)
roc_auc

Accuracy: 0.7029401824940859




0.8179331385126978

# Decision Tree

In [18]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
dt = DecisionTreeClassifier()
dt.fit(scaled_Xtrain, y_train)
y_train_predictions = dt.predict(scaled_Xtrain)
y_test_predictions = dt.predict(scaled_Xtest)
print('Accuracy:',metrics.accuracy_score(y_test,y_test_predictions))
print(confusion_matrix(y_test, y_test_predictions))
print(classification_report(y_test, y_test_predictions))
fpr,tpr,thresholds = roc_curve(y_test_predictions,y_score[:,1])
roc_auc = auc(fpr,tpr)
roc_auc

Accuracy: 0.6394052044609665
[[982 537]
 [530 910]]
              precision    recall  f1-score   support

         0.0       0.65      0.65      0.65      1519
         1.0       0.63      0.63      0.63      1440

    accuracy                           0.64      2959
   macro avg       0.64      0.64      0.64      2959
weighted avg       0.64      0.64      0.64      2959



0.733357740700519

# Bagging

In [19]:
from sklearn.ensemble import BaggingClassifier
dtc = DecisionTreeClassifier(criterion="entropy")
bag_model=BaggingClassifier(base_estimator=dtc, n_estimators=100, bootstrap=True)
bag_model=bag_model.fit(scaled_Xtrain,y_train)
y_test_predictions=bag_model.predict(scaled_Xtest)
print(bag_model.score(scaled_Xtest, y_test))
print(confusion_matrix(y_test, y_test_predictions)) 
fpr,tpr,thresholds = roc_curve(y_test_predictions,y_score[:,1])
roc_auc = auc(fpr,tpr)
roc_auc

0.7357215275430888
[[1179  340]
 [ 442  998]]


0.9166217129620664

# KNN

In [20]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score as kappa
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn import neighbors
clf=neighbors.KNeighborsClassifier()
clf.fit(X_train,y_train)
y_expect=y_test
y_pred=clf.predict(X_test)
print(metrics.classification_report(y_expect,y_pred))

              precision    recall  f1-score   support

         0.0       0.69      0.66      0.68      1519
         1.0       0.66      0.69      0.67      1440

    accuracy                           0.68      2959
   macro avg       0.68      0.68      0.68      2959
weighted avg       0.68      0.68      0.68      2959



# Gradient Boosting

In [12]:
import lightgbm as lgb
from sklearn.metrics import confusion_matrix,accuracy_score, roc_curve, auc
train_data=lgb.Dataset(scaled_Xtrain, label=y_train)
params = {'boosting_type': 'gbdt','max_depth' : -1,'objective': 'binary','nthread': 5,'num_leaves': 64,'learning_rate': 0.07,'max_bin': 512,'subsample_for_bin': 200,'subsample': 1,'subsample_freq': 1,'colsample_bytree': 0.8, 'reg_alpha': 1.2,'reg_lambda': 1.2,'min_split_gain': 0.5,'min_child_weight': 1,'min_child_samples': 5,'scale_pos_weight': 1,'num_class' : 1,'metric' : 'binary_error'}
gridParams = {'learning_rate': [0.07],'n_estimators': [8,16],'num_leaves': [20, 24, 27],'boosting_type' : ['gbdt'],'objective' : ['binary'],'random_state' : [501], 'colsample_bytree' : [0.64, 0.65],'subsample' : [0.7,0.75],}

from sklearn.model_selection import GridSearchCV
mdl = lgb.LGBMClassifier(boosting_type= 'gbdt',
          objective = 'binary',
          n_jobs = 5, 
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'],
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'],
          subsample_freq = params['subsample_freq'],
          min_split_gain = params['min_split_gain'],
          min_child_weight = params['min_child_weight'],
          min_child_samples = params['min_child_samples'],
          scale_pos_weight = params['scale_pos_weight'])
mdl.get_params().keys()
grid = GridSearchCV(mdl, gridParams, verbose=2, cv=10, n_jobs=-1)
grid.fit(X_train, y_train)
params['colsample_bytree'] = grid.best_params_['colsample_bytree']
params['learning_rate'] = grid.best_params_['learning_rate']
params['num_leaves'] = grid.best_params_['num_leaves']
params['subsample'] = grid.best_params_['subsample']

lgbm = lgb.train(params,train_data,280,verbose_eval= 4)

predictions_lgbm_prob = lgbm.predict(scaled_Xtest)
predictions_lgbm_01 = np.where(predictions_lgbm_prob > 0.5, 1, 0)

acc_lgbm = accuracy_score(y_test,predictions_lgbm_01)

false_positive_rate, recall, thresholds = roc_curve(y_test, predictions_lgbm_prob)
roc_auc = auc(false_positive_rate, recall)
print('Overall accuracy of Light GBM model:', acc_lgbm)
print('Overall roc of Light GBM model:', roc_auc)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   12.2s finished


Overall accuracy of Light GBM model: 0.7445082798242649
Overall roc of Light GBM model: 0.8118988186672519


# Interface

In [13]:
from sklearn.pipeline import Pipeline
import pickle
from sklearn.preprocessing import MinMaxScaler
#pipeline
pipe_logistic = Pipeline([('minmax', MinMaxScaler()), ('lr', LogisticRegression())])
pipe_logistic.fit(X_train, y_train)

#gb
import lightgbm as lgb
from sklearn.metrics import confusion_matrix,accuracy_score, roc_curve, auc
train_data=lgb.Dataset(scaled_Xtrain, label=y_train)
params = {'boosting_type': 'gbdt','max_depth' : -1,'objective': 'binary','nthread': 5,'num_leaves': 64,'learning_rate': 0.07,'max_bin': 512,'subsample_for_bin': 200,'subsample': 1,'subsample_freq': 1,'colsample_bytree': 0.8, 'reg_alpha': 1.2,'reg_lambda': 1.2,'min_split_gain': 0.5,'min_child_weight': 1,'min_child_samples': 5,'scale_pos_weight': 1,'num_class' : 1,'metric' : 'binary_error'}
gridParams = {'learning_rate': [0.07],'n_estimators': [8,16],'num_leaves': [20, 24, 27],'boosting_type' : ['gbdt'],'objective' : ['binary'],'random_state' : [501], 'colsample_bytree' : [0.64, 0.65],'subsample' : [0.7,0.75],}

from sklearn.model_selection import GridSearchCV
mdl = lgb.LGBMClassifier(boosting_type= 'gbdt',
          objective = 'binary',
          n_jobs = 5, 
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'],
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'],
          subsample_freq = params['subsample_freq'],
          min_split_gain = params['min_split_gain'],
          min_child_weight = params['min_child_weight'],
          min_child_samples = params['min_child_samples'],
          scale_pos_weight = params['scale_pos_weight'])
mdl.get_params().keys()
clf_gb = GridSearchCV(mdl, gridParams, verbose=2, cv=10, n_jobs=-1)
clf_gb.fit(X_train, y_train)

#rf
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
param_grid = {'n_estimators': [50,100,500], 'max_features': [5,15,25],'max_depth':[5,20,50]} 
clf_rf = GridSearchCV(rfc,param_grid,cv=5)
clf_rf.fit(scaled_Xtrain,y_train)


# print('Accuracy: ', pipe_logistic.score(X_test, y_test))
# Save the data and pipeline
pickle.dump(X_train, open('X_train.sav', 'wb'))
pickle.dump(pipe_logistic, open('pipe_logistic.sav', 'wb'))
pickle.dump(clf_gb, open('clf_gb.sav', 'wb'))
pickle.dump(clf_rf, open('clf_rf.sav', 'wb'))
pickle.dump(X_test, open('X_test.sav', 'wb'))
pickle.dump(y_test, open('y_test.sav', 'wb'))


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:    9.6s finished


In [18]:
import streamlit as st
import pickle
import numpy as np
from sklearn import metrics

# Load the pipeline and data
# pipe = pickle.load(open('pipe_logistic.sav', 'rb'))
X_test = pickle.load(open('X_test.sav', 'rb'))
y_test = pickle.load(open('y_test.sav', 'rb'))

dic = {0: 'Bad', 1: 'Good'}


#Function to test certain index of dataset
def test_demo(index):
    values = X_test.iloc[index]  # Input the value from dataset

    # Create four sliders in the sidebar
    a = st.sidebar.slider('ExternalRiskEstimate', 0, 100, int(values[0]), 1)
    b = st.sidebar.slider('MSinceOldestTradeOpen', 0, 810, int(values[1]), 1)
    c = st.sidebar.slider('MSinceMostRecentTradeOpen', 0, 400, int(values[2]), 1)
    d = st.sidebar.slider('AverageMInFile', 0, 400, int(values[3]), 1)
    e = st.sidebar.slider('NumSatisfactoryTrades', 0, 80, int(values[4]), 1)
    f = st.sidebar.slider('NumTrades60Ever2DerogPubRec', 0, 20, int(values[5]), 1)
    g = st.sidebar.slider('NumTrades90Ever2DerogPubRec', 0, 20, int(values[6]), 1)
    h = st.sidebar.slider('PercentTradesNeverDelq', 0, 100, int(values[7]), 1)
    i = st.sidebar.slider('MSinceMostRecentDelq', 0, 90, int(values[8]), 1)
    j = st.sidebar.slider('MaxDelq2PublicRecLast12M', 0, 7, int(values[9]), 1)
    k = st.sidebar.slider('MaxDelqEver', 0, 10, int(values[10]), 1)
    l = st.sidebar.slider('NumTotalTrades', 0, 110, int(values[11]), 1)
    m = st.sidebar.slider('NumTradesOpeninLast12M', 0, 20, int(values[12]), 1)
    n = st.sidebar.slider('MPercentInstallTrades', 0, 100, int(values[13]), 1)
    o = st.sidebar.slider('MSinceMostRecentInqexcl7days', 0, 30, int(values[14]), 1)
    p = st.sidebar.slider('MNumInqLast6M', 0, 70, int(values[15]), 1)
    q = st.sidebar.slider('MNumInqLast6Mexcl7days', 0, 70, int(values[16]), 1)
    r = st.sidebar.slider('NetFractionRevolvingBurden', 0, 240, int(values[17]), 1)
    s = st.sidebar.slider('NetFractionInstallBurden', 0, 240, int(values[18]), 1)
    t = st.sidebar.slider('NumRevolvingTradesWBalance', 0, 40, int(values[19]), 1)
    u = st.sidebar.slider('NumInstallTradesWBalance', 0, 40, int(values[20]), 1)    
    v = st.sidebar.slider('NumBank2NatlTradesWHighUtilization', 0, 20, int(values[21]), 1)   
    w = st.sidebar.slider('PercentTradesWBalance', 0, 100, int(values[22]), 1) 

    #Print the prediction result
    alg = [ 'Logistic Regression','Gradient Boosting','Random Forest']

    classifier = st.selectbox('Which algorithm?', alg)
    if classifier == 'Random Forest':
        pipe = pickle.load(open('clf_rf.sav', 'rb'))
        res = pipe.predict(np.array([a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w]).reshape(1, -1))[0]
        st.write('Prediction:  ', dic[res])
        pred = pipe.predict(X_test)
        score = pipe.score(X_test, y_test)
        cm = metrics.confusion_matrix(y_test, pred)
        st.write('Accuracy: ', score)
        st.write('Confusion Matrix: ', cm)

        st.text('Random Forest Chosen')

    elif classifier == 'Gradient Boosting':
        pipe = pickle.load(open('clf_gb.sav', 'rb'))
        res = pipe.predict(np.array([a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w]).reshape(1, -1))[0]
        st.write('Prediction:  ', dic[res])
        pred = pipe.predict(X_test)
        score = pipe.score(X_test, y_test)
        cm = metrics.confusion_matrix(y_test, pred)
        st.write('Accuracy: ', score)
        st.write('Confusion Matrix: ', cm)

        st.text('Gradient Boosting Chosen')

    else:
        pipe = pickle.load(open('pipe_logistic.sav', 'rb'))
        res = pipe.predict(np.array([a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w]).reshape(1, -1))[0]
        st.write('Prediction:  ', dic[res])
        pred = pipe.predict(X_test)
        score = pipe.score(X_test, y_test)
        cm = metrics.confusion_matrix(y_test, pred)
        st.write('Accuracy: ', score)
        st.write('Confusion Matrix: ', cm)


# title
st.title('Risk Perdiction')
# show data
if st.checkbox('Show dataframe'):
    st.write(X_test)
# st.write(X_train) # Show the dataset

number = st.text_input('Choose a row of information in the dataset (0~119):', 5)  # Input the index number

test_demo(int(number))  # Run the test function



ContextualVersionConflict: (tornado 6.0.3 (/opt/anaconda3/lib/python3.7/site-packages), Requirement.parse('tornado<6.0,>=5.0'), {'streamlit'})