# Module 17 Challenge - Assess Credit Risk

In [152]:
# Import dependencies

import pandas as pd
import numpy as np

from pathlib import Path
from collections import Counter

In [153]:
# Supress warnings
import warnings
warnings.filterwarnings('ignore')

# Read the CSV and Perform Basic Data Cleaning
Following is based on some initial sample code and template, filled out with working code and comments for clarity and adjusting to local enviroment.

In [154]:
columns = [
    "loan_amnt", "int_rate", "installment", "home_ownership",
    "annual_inc", "verification_status", "issue_d", "loan_status",
    "pymnt_plan", "dti", "delinq_2yrs", "inq_last_6mths",
    "open_acc", "pub_rec", "revol_bal", "total_acc",
    "initial_list_status", "out_prncp", "out_prncp_inv", "total_pymnt",
    "total_pymnt_inv", "total_rec_prncp", "total_rec_int", "total_rec_late_fee",
    "recoveries", "collection_recovery_fee", "last_pymnt_amnt", "next_pymnt_d",
    "collections_12_mths_ex_med", "policy_code", "application_type", "acc_now_delinq",
    "tot_coll_amt", "tot_cur_bal", "open_acc_6m", "open_act_il",
    "open_il_12m", "open_il_24m", "mths_since_rcnt_il", "total_bal_il",
    "il_util", "open_rv_12m", "open_rv_24m", "max_bal_bc",
    "all_util", "total_rev_hi_lim", "inq_fi", "total_cu_tl",
    "inq_last_12m", "acc_open_past_24mths", "avg_cur_bal", "bc_open_to_buy",
    "bc_util", "chargeoff_within_12_mths", "delinq_amnt", "mo_sin_old_il_acct",
    "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", "mo_sin_rcnt_tl", "mort_acc",
    "mths_since_recent_bc", "mths_since_recent_inq", "num_accts_ever_120_pd", "num_actv_bc_tl",
    "num_actv_rev_tl", "num_bc_sats", "num_bc_tl", "num_il_tl",
    "num_op_rev_tl", "num_rev_accts", "num_rev_tl_bal_gt_0",
    "num_sats", "num_tl_120dpd_2m", "num_tl_30dpd", "num_tl_90g_dpd_24m",
    "num_tl_op_past_12m", "pct_tl_nvr_dlq", "percent_bc_gt_75", "pub_rec_bankruptcies",
    "tax_liens", "tot_hi_cred_lim", "total_bal_ex_mort", "total_bc_limit",
    "total_il_high_credit_limit", "hardship_flag", "debt_settlement_flag"
]

target = ["loan_status"]

In [155]:
# Load the data
file_path = Path('Module-17-Challenge-Resources/LoanStats_2019Q1.csv')

# Drom the first row & last two rows of the CSV file
df = pd.read_csv(file_path, skiprows=1)[:-2]
df = df.loc[:, columns].copy()

# Drop the null columns, those columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows, those rows where all values are null
df = df.dropna()

# Remove the `Issued` loan status
issued_mask = df['loan_status'] != 'Issued'
df = df.loc[issued_mask]

# convert interest rate to numerical
df['int_rate'] = df['int_rate'].str.replace('%', '')
df['int_rate'] = df['int_rate'].astype('float') / 100


# Convert the target column values to low_risk and high_risk based on their values
x = {'Current': 'low_risk'}   
df = df.replace(x)

x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2000,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2000,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.1640,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68812,10000.0,0.1502,346.76,RENT,26000.0,Source Verified,Jan-2019,low_risk,n,9.60,...,80.0,0.0,0.0,0.0,20625.0,6798.0,11300.0,5425.0,N,N
68813,12000.0,0.2727,368.37,RENT,63000.0,Not Verified,Jan-2019,low_risk,n,29.07,...,96.2,0.0,0.0,0.0,87939.0,60350.0,13500.0,62939.0,N,N
68814,5000.0,0.1992,185.62,MORTGAGE,52000.0,Source Verified,Jan-2019,low_risk,n,14.86,...,100.0,0.0,1.0,0.0,30592.0,18611.0,3600.0,18492.0,N,N
68815,40000.0,0.0646,1225.24,MORTGAGE,520000.0,Verified,Jan-2019,low_risk,n,9.96,...,98.2,12.5,0.0,0.0,1033574.0,95958.0,100800.0,78634.0,N,N


# Split the Data into Training and Testing

In [156]:
# Create our features dataframe and inspect
# Drop the target column, leaving features assigned to X
X = df.drop(columns='loan_status')

# Create our target, describe the data
y = df['loan_status']

# Inspect the features
print(X.columns.values.tolist())
X

['loan_amnt', 'int_rate', 'installment', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'pymnt_plan', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_amnt', 'next_pymnt_d', 'collections_12_mths_ex_med', 'policy_code', 'application_type', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc', 'mths_since_recen

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,n,27.24,0.0,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2000,929.09,MORTGAGE,105000.0,Verified,Mar-2019,n,20.23,0.0,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2000,529.88,MORTGAGE,56000.0,Verified,Mar-2019,n,24.26,0.0,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.1640,353.55,RENT,92000.0,Verified,Mar-2019,n,31.44,0.0,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,n,18.76,0.0,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68812,10000.0,0.1502,346.76,RENT,26000.0,Source Verified,Jan-2019,n,9.60,0.0,...,80.0,0.0,0.0,0.0,20625.0,6798.0,11300.0,5425.0,N,N
68813,12000.0,0.2727,368.37,RENT,63000.0,Not Verified,Jan-2019,n,29.07,0.0,...,96.2,0.0,0.0,0.0,87939.0,60350.0,13500.0,62939.0,N,N
68814,5000.0,0.1992,185.62,MORTGAGE,52000.0,Source Verified,Jan-2019,n,14.86,0.0,...,100.0,0.0,1.0,0.0,30592.0,18611.0,3600.0,18492.0,N,N
68815,40000.0,0.0646,1225.24,MORTGAGE,520000.0,Verified,Jan-2019,n,9.96,0.0,...,98.2,12.5,0.0,0.0,1033574.0,95958.0,100800.0,78634.0,N,N


In [157]:
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,88213.71,21.778153,0.217766,0.497697,12.58734,0.12603,17604.142828,...,0.052138,2.219423,95.057627,30.626217,0.125972,0.0,210033.2,61338.43,29734.128558,55722.4
std,10277.34859,0.04813,288.062432,115580.0,20.199244,0.718367,0.758122,6.022869,0.336797,21835.8804,...,0.390633,1.897432,8.326426,33.631463,0.336732,0.0,192808.8,57387.98,26795.394232,50958.45
min,1000.0,0.06,30.89,40.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,20.0,0.0,0.0,0.0,3600.0,235.0,100.0,127.0
25%,9000.0,0.0881,265.73,50000.0,13.89,0.0,0.0,8.0,0.0,6293.0,...,0.0,1.0,93.0,0.0,0.0,0.0,66977.0,26503.0,11600.0,22880.0
50%,15000.0,0.118,404.56,73000.0,19.76,0.0,0.0,11.0,0.0,12068.0,...,0.0,2.0,100.0,20.0,0.0,0.0,146710.0,45357.0,22100.0,42000.0
75%,24000.0,0.1557,648.1,104000.0,26.66,0.0,1.0,16.0,0.0,21735.0,...,0.0,3.0,100.0,50.0,0.0,0.0,303640.0,76570.0,39300.0,72499.0
max,40000.0,0.3084,1676.23,8797500.0,999.0,18.0,5.0,72.0,4.0,587191.0,...,18.0,19.0,100.0,100.0,4.0,0.0,3292782.0,1295455.0,509400.0,1426964.0


In [158]:
# Check dataset objects looking for non-numerical columns
df_unlabeled = X.select_dtypes(include=['object']).copy()
print(df_unlabeled.columns)
df_unlabeled.head()

Index(['home_ownership', 'verification_status', 'issue_d', 'pymnt_plan',
       'initial_list_status', 'next_pymnt_d', 'application_type',
       'hardship_flag', 'debt_settlement_flag'],
      dtype='object')


Unnamed: 0,home_ownership,verification_status,issue_d,pymnt_plan,initial_list_status,next_pymnt_d,application_type,hardship_flag,debt_settlement_flag
0,RENT,Source Verified,Mar-2019,n,w,May-2019,Individual,N,N
1,MORTGAGE,Verified,Mar-2019,n,w,May-2019,Individual,N,N
2,MORTGAGE,Verified,Mar-2019,n,w,May-2019,Individual,N,N
3,RENT,Verified,Mar-2019,n,w,May-2019,Individual,N,N
4,MORTGAGE,Not Verified,Mar-2019,n,w,May-2019,Individual,N,N


In [159]:
# Import dependecy
from sklearn.preprocessing import LabelEncoder

# Transform the non-numeric objects including dates to numeric labels
le = LabelEncoder()
#df_labeled = df_unlabeled.copy()
for column in df_unlabeled.columns:
    X[column] = le.fit_transform(X[column])
#print(df_labeled.describe())
X

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,3,66000.0,1,2,0,27.24,0.0,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,0,0
1,25000.0,0.2000,929.09,1,105000.0,2,2,0,20.23,0.0,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,0,0
2,20000.0,0.2000,529.88,1,56000.0,2,2,0,24.26,0.0,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,0,0
3,10000.0,0.1640,353.55,3,92000.0,2,2,0,31.44,0.0,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,0,0
4,22000.0,0.1474,520.39,1,52000.0,0,2,0,18.76,0.0,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68812,10000.0,0.1502,346.76,3,26000.0,1,1,0,9.60,0.0,...,80.0,0.0,0.0,0.0,20625.0,6798.0,11300.0,5425.0,0,0
68813,12000.0,0.2727,368.37,3,63000.0,0,1,0,29.07,0.0,...,96.2,0.0,0.0,0.0,87939.0,60350.0,13500.0,62939.0,0,0
68814,5000.0,0.1992,185.62,1,52000.0,1,1,0,14.86,0.0,...,100.0,0.0,1.0,0.0,30592.0,18611.0,3600.0,18492.0,0,0
68815,40000.0,0.0646,1225.24,1,520000.0,2,1,0,9.96,0.0,...,98.2,12.5,0.0,0.0,1033574.0,95958.0,100800.0,78634.0,0,0


In [160]:
# Inspect the target
y.describe()

count        68817
unique           2
top       low_risk
freq         68470
Name: loan_status, dtype: object

In [161]:
# Count the number of unique values, note the large imbalance between the two target outcomes
y.value_counts()

low_risk     68470
high_risk      347
Name: loan_status, dtype: int64

In [162]:
# import dependencies
from sklearn.model_selection import train_test_split

# Create X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Target Counter before resampling
Counter(y_test)

Counter({'low_risk': 17104, 'high_risk': 101})

# Oversampling

In this section, you will compare two oversampling algorithms to determine which algorithm results in the best performance. 

You will oversample the data using the **naive random oversampling algorithm** and the **SMOTE algorithm**. 

For each algorithm, be sure to complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

### Naive Random Oversampling

In [163]:
# Import dependencies
from imblearn.over_sampling import RandomOverSampler
import imblearn

# Resample the training data with the RandomOversampler
# implement random oversampling
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'low_risk': 51366, 'high_risk': 51366})

In [164]:
# Import the dependency
from sklearn.linear_model import LogisticRegression

# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [165]:
# Import dependency
from sklearn.metrics import confusion_matrix

# Display the confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[   72,    29],
       [ 6628, 10476]])

In [166]:
#Import dependency
from sklearn.metrics import balanced_accuracy_score

# Calculate the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.6626797969787623

In [193]:
# import dependency
from imblearn.metrics import classification_report_imbalanced

# Display the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.09      0.92      0.94      0.16      0.93      0.87       101
   low_risk       1.00      0.94      0.92      0.97      0.93      0.87     17104

avg / total       0.99      0.94      0.92      0.97      0.93      0.87     17205



### SMOTE Oversampling

In [168]:
# Import dependency
from imblearn.over_sampling import SMOTE

# Resample the training data with SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,sampling_strategy='auto').fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'low_risk': 51366, 'high_risk': 51366})

In [169]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)


In [170]:
# Calculate the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6653275477220313

In [171]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[   65,    36],
       [ 5352, 11752]])

In [172]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.01      0.64      0.69      0.02      0.66      0.44       101
   low_risk       1.00      0.69      0.64      0.81      0.66      0.44     17104

avg / total       0.99      0.69      0.64      0.81      0.66      0.44     17205



# Undersampling

In this section, you will test an undersampling algorithms to determine which algorithm results in the best performance compared to the oversampling algorithms above. 

You will undersample the data using the **Cluster Centroids** algorithm and complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

In [173]:
#Import dependency
from imblearn.under_sampling import ClusterCentroids

# Resample the data using the ClusterCentroids resampler
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'high_risk': 246, 'low_risk': 246})

In [174]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=78)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=78)

In [175]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.6653275477220313

In [176]:
# Display the confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  66,   35],
       [9953, 7151]])

In [177]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.01      0.65      0.42      0.01      0.52      0.28       101
   low_risk       1.00      0.42      0.65      0.59      0.52      0.27     17104

avg / total       0.99      0.42      0.65      0.59      0.52      0.27     17205



# Combination (Over and Under) Sampling

In this section, you will test a combination over- and under-sampling algorithm to determine if the algorithm results in the best performance compared to the other sampling algorithms above. 

You will resample the data using the **SMOTEENN** algorithm and complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

In [178]:
# Import dependency
from imblearn.combine import SMOTEENN

# Resample the training data with SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({'high_risk': 68460, 'low_risk': 62011})

In [179]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [180]:
# Calculate the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.535777341181265

In [181]:
# Display the confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[   73,    28],
       [ 7067, 10037]])

In [182]:
# Display the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.01      0.72      0.59      0.02      0.65      0.43       101
   low_risk       1.00      0.59      0.72      0.74      0.65      0.42     17104

avg / total       0.99      0.59      0.72      0.73      0.65      0.42     17205



# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. 

You will train a **Balanced Random Forest Classifier** and an **Easy Ensemble AdaBoost classifier**. 

For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [183]:
# Import dependency
from imblearn.ensemble import BalancedRandomForestClassifier

# Resample the training data with the RandomOversampler
brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1) 
brf_model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [184]:
# Calculate the balanced accuracy score
y_pred = brf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7290249400290825

In [185]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[   62,    39],
       [ 2665, 14439]])

In [186]:
# Display the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.02      0.61      0.84      0.04      0.72      0.51       101
   low_risk       1.00      0.84      0.61      0.91      0.72      0.53     17104

avg / total       0.99      0.84      0.62      0.91      0.72      0.53     17205



In [187]:
# List the features sorted in descending order by feature importance
sorted( zip( brf_model.feature_importances_, X.columns), reverse=True)

[(0.07646711017201446, 'total_rec_prncp'),
 (0.0681731200022894, 'last_pymnt_amnt'),
 (0.05699252404641979, 'total_pymnt'),
 (0.05424778840861491, 'total_pymnt_inv'),
 (0.04735017423439608, 'total_rec_int'),
 (0.031245651401774252, 'int_rate'),
 (0.022619745828795626, 'issue_d'),
 (0.02244519942392171, 'mths_since_recent_inq'),
 (0.020409215004023765, 'installment'),
 (0.01808337089191728, 'tot_hi_cred_lim'),
 (0.017004195413593538, 'bc_util'),
 (0.01640559494238341, 'dti'),
 (0.016118934293676203, 'mths_since_rcnt_il'),
 (0.01581716497633785, 'il_util'),
 (0.015630310867867408, 'max_bal_bc'),
 (0.015494921031886601, 'out_prncp'),
 (0.015281199871082264, 'out_prncp_inv'),
 (0.01470961095035287, 'mo_sin_old_il_acct'),
 (0.014693851231416522, 'mo_sin_old_rev_tl_op'),
 (0.01425370693271944, 'total_bal_ex_mort'),
 (0.014094010852862895, 'total_bc_limit'),
 (0.013976327240119397, 'revol_bal'),
 (0.013793435235420952, 'avg_cur_bal'),
 (0.013416065136567646, 'tot_cur_bal'),
 (0.01339859137918

### Easy Ensemble AdaBoost Classifier

In [188]:
# Import dependency
from imblearn.ensemble import EasyEnsembleClassifier

# Train the Classifier
eeac_model = EasyEnsembleClassifier(n_estimators=100, random_state=1) 
eeac_model.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [189]:
# Calculate the balanced accuracy score
y_pred = eeac_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9316600714093861

In [190]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[   93,     8],
       [  983, 16121]])

In [191]:
# Display the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.09      0.92      0.94      0.16      0.93      0.87       101
   low_risk       1.00      0.94      0.92      0.97      0.93      0.87     17104

avg / total       0.99      0.94      0.92      0.97      0.93      0.87     17205

