In [None]:
%env OMP_NUM_THREADS = 4

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc, roc_auc_score, classification_report, confusion_matrix, make_scorer, f1_score,accuracy_score, cohen_kappa_score, log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
# Several cells below contain more than one bare expression and rely on this setting.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Read the raw dataset from the Excel workbook (path is relative to the working directory).
data = pd.read_excel(io="Bankruptcy_data_Final.xlsx")

In [None]:
# Quick look at the raw data: first rows, column dtypes / non-null counts, summary statistics.
# head() is shown instead of the bare frame so the cell output stays small.
data.head()
data.info()
data.describe()

In [None]:
# Column dtypes of the raw frame.
data.dtypes

In [None]:
# Remove the fiscal-year column, then replace every missing value with 0.
# Note: re-running this cell on the already-cleaned frame raises KeyError,
# because the column no longer exists.
data = data.drop(columns=['Data Year - Fiscal']).fillna(0)

In [None]:
# Work on an independent copy so the cleaned `data` frame stays unchanged.
df_bk = data.copy(deep=True)

In [None]:
# Feature engineering - standardise the numerical features
# (StandardScaler: zero mean, unit variance per column).

# The target column 'BK' is left out because it must not be scaled.
X = df_bk.drop(columns=['BK'])

# Standardise every numeric column in place.
# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split performed later in the notebook, so test rows influence the scaling.
scaler = StandardScaler()
features = X.select_dtypes(include=np.number).columns.tolist()
X[features] = scaler.fit_transform(X[features])

# Put the unscaled target variable 'BK' back next to the scaled features.
X['BK'] = df_bk['BK']

df_bk = X.copy()
df_bk

In [None]:
# Feature engineering - pairwise interaction features.
# Each "ratio" column is the element-wise product of two (already scaled) columns;
# any missing value in the result is replaced with 0.
df_bk = df_bk.assign(
    ratio1=df_bk['Liquidity'] * df_bk['Profitability'],
    ratio2=df_bk['Leverage Ratio'] * df_bk['Asset Turnover'],
    ratio3=df_bk['Profitability'] * df_bk['Productivity'],
    ratio4=df_bk['Operational Margin'] * df_bk['Asset Turnover'],
    ratio5=df_bk['Return on Equity'] * df_bk['EPS'],
    ratio6=df_bk['Return on Equity'] * df_bk['Operational Margin'],
).fillna(0)

In [None]:
# Train/test split followed by SMOTE oversampling of the training portion only.
y = df_bk.BK
X = df_bk.drop('BK', axis=1)

# The split is seeded so a fresh "Run All" reproduces the same partitions, and
# stratified so both partitions keep the class proportions of the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=27, stratify=y
)

# Oversample the minority class in the training data; the test set is left untouched.
# fit_resample is the current API: fit_sample was removed from imbalanced-learn.
sm = SMOTE(random_state=27)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [None]:
# Class counts and shape of the training data after oversampling.
y_train.value_counts()
X_train.shape

In [None]:
# Column list, dtypes and non-null counts of the resampled training features.
X_train.info()

In [None]:
# ROC curve
def plot_roc(y_test, y_pred):
    """Plot the ROC curve for binary scores and report the AUC in the legend.

    Parameters
    ----------
    y_test : array-like
        True labels; the value 1 is treated as the positive class.
    y_pred : array-like
        Scores for the positive class (e.g. predicted probabilities).

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=1, drop_intermediate=False)
    roc_auc = auc(fpr, tpr)
    line_width = 2

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=line_width,
             label='ROC curve (AUC = %0.2f)' % roc_auc)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')
    # Tiny margins so points lying exactly on the axes remain visible.
    plt.xlim([-0.001, 1.001])
    plt.ylim([-0.001, 1.001])
    # The x axis of a ROC curve is 1 - specificity, i.e. the false POSITIVE rate.
    plt.xlabel('1-Specificity (False Positive Rate)')
    plt.ylabel('Sensitivity (True Positive Rate)')
    plt.title('ROC curve')
    plt.legend(loc="lower right")
    plt.show()

# Confusion Matrix returns in the format: cm[0,0], cm[0,1], cm[1,0], cm[1,1]: tn, fp, fn, tp

# Sensitivity
def custom_sensitivity_score(y_test, y_pred):
    """Return sensitivity (recall / true positive rate): tp / (tp + fn).

    Labels are assumed to be encoded as 0 (negative) and 1 (positive).
    ``labels=[0, 1]`` keeps the confusion matrix 2x2 even when only one class
    occurs in the inputs. Returns NaN when there are no actual positives.
    """
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    actual_positives = tp + fn
    if actual_positives == 0:
        return float("nan")
    return tp / actual_positives

# Specificity
def custom_specificity_score(y_test, y_pred):
    """Return specificity (true negative rate): tn / (tn + fp).

    Labels are assumed to be encoded as 0 (negative) and 1 (positive).
    ``labels=[0, 1]`` keeps the confusion matrix 2x2 even when only one class
    occurs in the inputs. Returns NaN when there are no actual negatives.
    """
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    actual_negatives = tn + fp
    if actual_negatives == 0:
        return float("nan")
    return tn / actual_negatives

# Positive Predictive Value
def custom_ppv_score(y_test, y_pred):
    """Return the positive predictive value (precision): tp / (tp + fp).

    Labels are assumed to be encoded as 0 (negative) and 1 (positive).
    ``labels=[0, 1]`` keeps the confusion matrix 2x2 even when only one class
    occurs in the inputs. Returns NaN when nothing was predicted positive.
    """
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    predicted_positives = tp + fp
    if predicted_positives == 0:
        return float("nan")
    return tp / predicted_positives

# Negative Predictive Value
def custom_npv_score(y_test, y_pred):
    """Return the negative predictive value: tn / (tn + fn).

    Labels are assumed to be encoded as 0 (negative) and 1 (positive).
    ``labels=[0, 1]`` keeps the confusion matrix 2x2 even when only one class
    occurs in the inputs. Returns NaN when nothing was predicted negative.
    """
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    predicted_negatives = tn + fn
    if predicted_negatives == 0:
        return float("nan")
    return tn / predicted_negatives

# Accuracy
def custom_accuracy_score(y_test, y_pred):
    """Return accuracy: (tn + tp) / (tn + tp + fn + fp).

    Labels are assumed to be encoded as 0 (negative) and 1 (positive).
    ``labels=[0, 1]`` keeps the confusion matrix 2x2 even when only one class
    occurs in the inputs. Returns NaN when the matrix contains no observations.
    """
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    total = tn + tp + fn + fp
    if total == 0:
        return float("nan")
    return (tn + tp) / total

In [None]:
def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    """Render a confusion matrix as an annotated seaborn heatmap.

    Parameters
    ----------
    cf : numpy.ndarray
        Confusion matrix; rows are true labels, columns are predicted labels.
    group_names : list of str, optional
        One caption per cell in row-major order. Ignored unless its length
        equals ``cf.size``.
    categories : list of str or 'auto'
        Tick labels passed to seaborn for both axes.
    count : bool
        Show the raw count in each cell.
    percent : bool
        Show each cell as a percentage of the grand total.
    cbar : bool
        Draw the colour bar.
    xyticks : bool
        When false, tick labels are hidden (``categories`` is overridden).
    xyplotlabels : bool
        Label the axes 'True label' / 'Predicted label'.
    sum_stats : bool
        Append summary statistics below the plot: accuracy for any matrix, plus
        precision, recall and F1 for a 2x2 matrix.
    figsize : tuple, optional
        Figure size; defaults to matplotlib's ``figure.figsize`` rcParam.
    cmap : str
        Matplotlib colormap name.
    title : str, optional
        Figure title.

    Returns
    -------
    None
        The heatmap is drawn on a new matplotlib figure.
    """
    def _safe_ratio(numerator, denominator):
        # An undefined ratio (zero denominator) is reported as NaN instead of
        # triggering a NumPy divide warning.
        return float(numerator) / float(denominator) if denominator else float('nan')

    # TEXT INSIDE EACH SQUARE
    n_cells = cf.size
    blanks = [''] * n_cells

    if group_names and len(group_names) == n_cells:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        group_percentages = ["{0:.2%}".format(value) for value in cf.flatten() / np.sum(cf)]
    else:
        group_percentages = blanks

    box_labels = [f"{v1}{v2}{v3}".strip()
                  for v1, v2, v3 in zip(group_labels, group_counts, group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0], cf.shape[1])

    # SUMMARY STATISTICS TEXT
    if sum_stats:
        # Accuracy is the sum of the diagonal divided by the total observations.
        accuracy = _safe_ratio(np.trace(cf), np.sum(cf))

        if len(cf) == 2:
            # Binary matrix: class 1 (second row / column) is the positive class.
            precision = _safe_ratio(cf[1, 1], np.sum(cf[:, 1]))
            recall = _safe_ratio(cf[1, 1], np.sum(cf[1, :]))
            # Named f1 so the local does not shadow sklearn.metrics.f1_score.
            f1 = _safe_ratio(2 * precision * recall, precision + recall)
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy, precision, recall, f1)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""

    # FIGURE PARAMETERS
    if figsize is None:
        # Fall back to the default figure size.
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        # Do not show categories when tick labels are disabled.
        categories = False

    # HEATMAP
    plt.figure(figsize=figsize)
    sns.heatmap(cf, annot=box_labels, fmt="", cmap=cmap, cbar=cbar,
                xticklabels=categories, yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)

    if title:
        plt.title(title)

In [None]:
# Grid search with cross-validation for the Random Forest

# Scoring used by GridSearchCV. The built-in 'roc_auc' scorer computes AUC from
# the model's continuous scores (probabilities). make_scorer(roc_auc_score) would
# instead pass hard predict() labels to roc_auc_score, which is not the AUC
# reported on the test set later in this notebook.
score_func = 'roc_auc'

# Hyper-parameter values to try
param_grid_rf = {
    'max_depth': [80, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [50, 100, 300]
}

# Random Forest classifier; seeded so repeated runs select the same hyper-parameters.
classifier_RF = RandomForestClassifier(random_state=27)

# Exhaustive search over the grid with 5-fold cross-validation, maximising AUC.
# NOTE(review): X_train was oversampled with SMOTE before this search, so the
# validation folds contain synthetic samples; CV scores may be optimistic.
grid_search_rf = GridSearchCV(estimator=classifier_RF, param_grid=param_grid_rf,
                              cv=5, scoring=score_func, n_jobs=-1,
                              return_train_score=True, verbose=2)

In [None]:
# Run the grid search on the training data.
# Note the two names differ only by case: the result of fit() is bound to
# grid_search_RF, which is the name the following cells use.
grid_search_RF = grid_search_rf.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination with the highest cross-validated score
print(f"\nBest Hyper-Parameter values Random Forest:{grid_search_RF.best_params_!s}")
grid_search_RF.best_params_

# Estimator refitted with those hyper-parameters
best_grid_rf = grid_search_RF.best_estimator_
best_grid_rf

# Cross-validated score achieved by that estimator
best_result_rf = grid_search_RF.best_score_
print(f"\nBest Score Random Forest: {best_result_rf!s}")

In [None]:
# Score the test data with the best model found by the grid search.
class_threshold = 0.50
# Second column of predict_proba is taken as the probability of class 1.
y_pred_prob_rf = grid_search_RF.predict_proba(X_test)[:, 1]
# Hard labels: 1 when the probability is strictly above the threshold, else 0.
y_pred_rf = (y_pred_prob_rf > class_threshold).astype(int)

In [None]:
## Performance measures for the Random Forest on the test set

# Confusion matrix at the chosen classification threshold
cm_rf = confusion_matrix(y_test, y_pred_rf)
print("\nConfusion matrix Random Forest: \n" + str(cm_rf))

# AUC is computed from the predicted probabilities, not the thresholded labels
auc_rf = roc_auc_score(y_test, y_pred_prob_rf)
print("\nAUC Random Forest: " + str(auc_rf))

# ROC plot
plot_roc(y_test, y_pred_prob_rf)

# Threshold-dependent metrics; labels are right-aligned to a common width of 43.
# Specificity is the true negative rate (fall-out is its complement, the false positive rate).
print(f"{'Accuracy':>43} Random Forest: " + str(custom_accuracy_score(y_test, y_pred_rf)))
print(f"{'SENSITIVITY (aka RECALL)':>43} Random Forest: " + str(custom_sensitivity_score(y_test, y_pred_rf)))
print(f"{'SPECIFICITY (aka TRUE NEGATIVE RATE)':>43} Random Forest: " + str(custom_specificity_score(y_test, y_pred_rf)))
print(f"{'POSITIVE PREDICTIVE VALUE, (aka PRECISION)':>43} Random Forest: " + str(custom_ppv_score(y_test, y_pred_rf)))
print(f"{'NEGATIVE PREDICTIVE VALUE':>43} Random Forest: " + str(custom_npv_score(y_test, y_pred_rf)))

In [None]:
# Per-class precision / recall / F1 for the Random Forest predictions on the test set.
# classification_report is already imported in the import cell at the top of the notebook.
print(classification_report(y_test, y_pred_rf))

In [None]:
# Heatmap of the Random Forest confusion matrix.
# Cell captions are listed in row-major order of the 2x2 matrix.
labels = ["True Neg", "False Pos", "False Neg", "True Pos"]
categories = ["Zero", "One"]
make_confusion_matrix(cm_rf, group_names=labels, categories=categories, cmap='binary')

In [None]:
# Feature importance of the best Random Forest (impurity-based importances)
importances = grid_search_RF.best_estimator_.feature_importances_
feature_importances = pd.Series(importances, index=X_train.columns)

# Number of top-ranked features to show in the chart
N_TOP_FEATURES = 17

# Plot the variables according to their importance, most important at the top.
fig, ax = plt.subplots(figsize=(15, 5))
feature_importances.nlargest(N_TOP_FEATURES).sort_values().plot(kind='barh', align='center', ax=ax)
# Title and axis label are set after plotting, on the same axes, so the
# pandas plot call cannot replace them.
ax.set_title('Feature Importance Random Forest')
ax.set_xlabel('Mean decrease in Gini impurity')

In [None]:
# Grid search with cross-validation for Logistic Regression

# Scoring used by GridSearchCV. The built-in 'roc_auc' scorer computes AUC from
# the model's continuous scores. make_scorer(roc_auc_score) would instead pass
# hard predict() labels to roc_auc_score, which is not the AUC reported on the
# test set later in this notebook.
score_func = 'roc_auc'

# Hyper-parameter values to try: penalty type and 20 log-spaced values of C
param_grid_logistic = {'penalty': ['l1', 'l2'], 'C': np.logspace(-4, 4, 20)}

# Logistic model; the liblinear solver supports both penalties in the grid
logistic_model = LogisticRegression(solver='liblinear')

# Exhaustive search over the grid with 5-fold cross-validation, maximising AUC.
# NOTE(review): X_train was oversampled with SMOTE before this search, so the
# validation folds contain synthetic samples; CV scores may be optimistic.
grid_search_logistic = GridSearchCV(estimator=logistic_model, param_grid=param_grid_logistic,
                                    cv=5, scoring=score_func, n_jobs=-1,
                                    return_train_score=True, verbose=2)

In [None]:
# Run the grid search on the training data.
# The name grid_search_logistic is rebound to whatever fit() returns.
grid_search_logistic = grid_search_logistic.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination with the highest cross-validated score
print(f"\nBest Hyper-Parameter values Logistic:{grid_search_logistic.best_params_!s}")
grid_search_logistic.best_params_

# Estimator refitted with those hyper-parameters
best_grid_logistic = grid_search_logistic.best_estimator_
best_grid_logistic

# Cross-validated score achieved by that estimator
best_result_logistic = grid_search_logistic.best_score_
print(f"\nBest Score Logistic: {best_result_logistic!s}")

In [None]:
# Score the test data with the best model found by the grid search.
class_threshold = 0.50
# Second column of predict_proba is taken as the probability of class 1.
y_pred_prob_logistic = grid_search_logistic.predict_proba(X_test)[:, 1]
# Hard labels: 1 when the probability is strictly above the threshold, else 0.
y_pred_logistic = (y_pred_prob_logistic > class_threshold).astype(int)

In [None]:
## Performance measures for Logistic Regression on the test set

# Confusion matrix at the chosen classification threshold
cm_logistic = confusion_matrix(y_test, y_pred_logistic)
print("\nConfusion matrix Logistic: \n" + str(cm_logistic))

# AUC is computed from the predicted probabilities, not the thresholded labels
auc_logistic = roc_auc_score(y_test, y_pred_prob_logistic)
print("\nAUC Logistic:  " + str(auc_logistic))

# ROC plot
plot_roc(y_test, y_pred_prob_logistic)

# Threshold-dependent metrics; labels are right-aligned to a common width of 43.
# Specificity is the true negative rate (fall-out is its complement, the false positive rate).
print(f"{'Accuracy':>43} Logistic: " + str(custom_accuracy_score(y_test, y_pred_logistic)))
print(f"{'SENSITIVITY (aka RECALL)':>43} Logistic: " + str(custom_sensitivity_score(y_test, y_pred_logistic)))
print(f"{'SPECIFICITY (aka TRUE NEGATIVE RATE)':>43} Logistic: " + str(custom_specificity_score(y_test, y_pred_logistic)))
print(f"{'POSITIVE PREDICTIVE VALUE, (aka PRECISION)':>43} Logistic: " + str(custom_ppv_score(y_test, y_pred_logistic)))
print(f"{'NEGATIVE PREDICTIVE VALUE':>43} Logistic: " + str(custom_npv_score(y_test, y_pred_logistic)))

In [None]:
# Per-class precision / recall / F1 for the logistic predictions on the test set.
print(classification_report(y_test,y_pred_logistic))

In [None]:
# Heatmap of the logistic-regression confusion matrix.
# Cell captions are listed in row-major order of the 2x2 matrix.
labels = ["True Neg", "False Pos", "False Neg", "True Pos"]
categories = ["Zero", "One"]
make_confusion_matrix(cm_logistic, group_names=labels, categories=categories, cmap='binary')