# Part 03: Class Imbalance
Explore the imbalance between the two classes (default and non-default).

In [9]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.utils.class_weight import compute_class_weight

## Load the dataset

In [2]:
# read the credit-risk CSV into a DataFrame and report its (rows, columns)
data = pd.read_csv(filepath_or_buffer='credit_risk_dataset.csv')
print(data.shape)

(32581, 12)


## Standard Cleaning

- remove odd data (e.g., age ≥ 100 years, employment history length > age − 15 years)
- drop duplicates
- replace ordinal categorical variables or binary nominal variables with discrete numerical variables

In [3]:
# drop outlier and unreasonable data:
#   - keep a row when its employment length is missing, or when it is
#     consistent with the person's age (age >= 15 + employment length)
#   - keep only rows with age below 100
plausible_employment = data['person_emp_length'].isnull() | (data['person_age'] >= 15 + data['person_emp_length'])
plausible_age = data['person_age'] < 100
data = data[plausible_employment & plausible_age]

# drop duplicates
data = data.drop_duplicates()

# replace strings with integers
# The recoded columns are assigned back through `assign` instead of calling
# `data[col].replace(..., inplace=True)`: an in-place call on a selected column
# is a chained assignment, which raises warnings and does not update `data`
# when pandas Copy-on-Write is active. `assign` also returns a fresh frame, so
# later cells do not operate on a slice of the raw data.
DEFAULT_FLAG_CODES = {'Y': 1, 'N': 0}
LOAN_GRADE_CODES = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
data = data.assign(
    # infer_objects() turns a fully recoded object column into an integer column
    cb_person_default_on_file=data['cb_person_default_on_file'].replace(DEFAULT_FLAG_CODES).infer_objects(),
    loan_grade=data['loan_grade'].replace(LOAN_GRADE_CODES).infer_objects(),
)

## Handle Missing Values

In [4]:
# HANDLE MISSING VALUES (Notebook 01)
# columns used to predict loan_int_rate, and the subset that is log-transformed
RATE_FEATURE_COLS = ['person_age', 'person_income', 'loan_amnt', 'cb_person_cred_hist_length', 'loan_grade', 'cb_person_default_on_file', 'loan_percent_income']
RATE_LOG_COLS = ['person_income', 'loan_amnt']


def build_rate_features(frame):
    """Return the regression inputs for loan_int_rate as a new DataFrame.

    Selects RATE_FEATURE_COLS from `frame` and replaces every column listed in
    RATE_LOG_COLS by its natural logarithm. `frame` itself is not modified.
    """
    return frame[RATE_FEATURE_COLS].assign(**{col: np.log(frame[col]) for col in RATE_LOG_COLS})


# PART 1: person_emp_length
# replace the missing values with the median; the filled column is assigned
# back (a chained `fillna(..., inplace=True)` on data[col] is not guaranteed to
# update `data`)
data = data.assign(person_emp_length=data['person_emp_length'].fillna(data['person_emp_length'].median()))

# PART 2: loan_int_rate
# fit the imputation model on the rows that have no missing values
data_noNA = data.dropna()

X = build_rate_features(data_noNA)
y = data_noNA['loan_int_rate']

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 34)

# train a linear regression model
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# predict the test set results (kept for inspection; not used below)
y_pred = regressor.predict(X_test)

# rows whose interest rate is missing, and their model inputs
rate_missing = data['loan_int_rate'].isnull()
data_NA = data[rate_missing]
data_NA_X = build_rate_features(data_NA)

# Write back ONLY the predicted loan_int_rate. The log-transformed
# person_income / loan_amnt live only in the feature frames; copying them into
# `data` would leave the imputed rows on a log scale and all other rows raw.
if rate_missing.any():  # predict() cannot be called on zero rows
    data.loc[rate_missing, 'loan_int_rate'] = regressor.predict(data_NA_X)

## Evaluate Class Imbalance

In [8]:
# number of rows in each loan_status class
data['loan_status'].value_counts()

loan_status
0    25321
1     7088
Name: count, dtype: int64

There is a clear sign of **class imbalance**.

We are dealing with a classification problem to predict credit risk. Because of the imbalance between the two classes (positive: default, negative: non-default), any model may lean toward predicting the majority (negative) class. However, in credit risk, we want to identify as many of the truly positive cases as possible, i.e., we want a high **recall** (in addition to high accuracy, precision, F1 score, and AUC-ROC). We will have to balance the two classes to achieve this. Common techniques include:
- Class weight adjustment
- Undersampling the majority class (e.g., random undersampling)
- Oversampling the minority class (e.g., random oversampling, SMOTE-ENC for nominal categorical variables)

Due to its simplicity, we will first resort to class weight adjustment for the baseline (logistic regression) model. For other, more complicated models, we may combine undersampling of the majority class with random oversampling of the minority class, since creating synthetic data with nominal categorical, ordinal categorical, and discrete numeric features might be a bit challenging.

## Class Weights

In [11]:
# the target variable: the loan_status column
# (this rebinds `y`, which the missing-value cell used for loan_int_rate)
y = data['loan_status']

In [13]:
# compute the class weights
# 'balanced' weights are computed once per distinct label found in y
class_labels = np.unique(y)
balanced_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=y)

# map each label to its weight; building the dict from the computed labels
# avoids hard-coding 0/1, and tolist() yields native Python scalars so the
# printed dict looks the same regardless of the numpy version
class_weights = dict(zip(class_labels.tolist(), balanced_weights.tolist()))
print(class_weights)

{0: 0.6399628766636388, 1: 2.2861879232505644}


## Under- & Over-sampling

In [25]:
# number of rows drawn from each class
N_PER_CLASS = 10000


def sample_class(frame, label, n, seed, target='loan_status'):
    """Draw `n` rows of `frame` whose `target` column equals `label`.

    Rows are drawn without replacement when the class has at least `n` rows
    (undersampling) and with replacement otherwise (oversampling).

    Raises:
        ValueError: if no row of `frame` has the requested label.
    """
    members = frame[frame[target] == label]
    if members.empty:
        raise ValueError(f"no rows with {target} == {label!r} to sample from")
    return members.sample(n=n, replace=len(members) < n, random_state=seed)


# make a copy of the data frame
data_copy = data.copy()

# oversample the minority class and undersample the majority class
# NOTE(review): oversampled rows are exact duplicates. If this balanced frame is
# later split into train/test sets, the same record can land in both; consider
# resampling only the training portion.
positiveSample = sample_class(data_copy, 1, N_PER_CLASS, seed=1)
negativeSample = sample_class(data_copy, 0, N_PER_CLASS, seed=1)

# stack the two samples and shuffle the rows
data_balanced = pd.concat([positiveSample, negativeSample]).sample(frac=1, random_state=12)
data_balanced['loan_status'].value_counts()

loan_status
0    10000
1    10000
Name: count, dtype: int64