# Lending Club loan data

## import library

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

## load dataset

In [57]:
df_origin = pd.read_csv('data/LendingClub_loandata/2015.csv',
                        low_memory=False
                        )

df_origin.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421095 entries, 0 to 421094
Data columns (total 153 columns):
 #    Column                                      Non-Null Count   Dtype  
---   ------                                      --------------   -----  
 0    Unnamed: 0                                  421095 non-null  int64  
 1    id                                          421095 non-null  int64  
 2    member_id                                   0 non-null       float64
 3    loan_amnt                                   421095 non-null  float64
 4    funded_amnt                                 421095 non-null  float64
 5    funded_amnt_inv                             421095 non-null  float64
 6    term                                        421095 non-null  object 
 7    int_rate                                    421095 non-null  float64
 8    installment                                 421095 non-null  float64
 9    grade                                       421095 non-nu

## feature selecting: minimum

In [101]:
select_cols = ['loan_status',
               'annual_inc', 'dti', 'loan_amnt', 'revol_bal', 'revol_util', ## 수치형 변수
               'term', 'grade', 'home_ownership', ## 범주형 변수: 원핫 인코딩 대상
               'emp_length', 'sub_grade' ## 범주형 변수: 전처리 및 수치형 변수 변환
               ]

In [103]:
df = df_origin[ select_cols ]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421095 entries, 0 to 421094
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   loan_status     421095 non-null  object 
 1   annual_inc      421095 non-null  float64
 2   dti             421093 non-null  float64
 3   loan_amnt       421095 non-null  float64
 4   revol_bal       421095 non-null  float64
 5   revol_util      420933 non-null  float64
 6   term            421095 non-null  object 
 7   grade           421095 non-null  object 
 8   home_ownership  421095 non-null  object 
 9   emp_length      397278 non-null  object 
 10  sub_grade       421095 non-null  object 
dtypes: float64(5), object(6)
memory usage: 35.3+ MB


## drop NaN

In [104]:
df = df.dropna()
df = df.reset_index(drop=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397119 entries, 0 to 397118
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   loan_status     397119 non-null  object 
 1   annual_inc      397119 non-null  float64
 2   dti             397119 non-null  float64
 3   loan_amnt       397119 non-null  float64
 4   revol_bal       397119 non-null  float64
 5   revol_util      397119 non-null  float64
 6   term            397119 non-null  object 
 7   grade           397119 non-null  object 
 8   home_ownership  397119 non-null  object 
 9   emp_length      397119 non-null  object 
 10  sub_grade       397119 non-null  object 
dtypes: float64(5), object(6)
memory usage: 33.3+ MB


## one-hot encoding

In [105]:
df = pd.get_dummies(df, columns=['term', 'grade', 'home_ownership',],
                    drop_first=True, dtype=int)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397119 entries, 0 to 397118
Data columns (total 18 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   loan_status              397119 non-null  object 
 1   annual_inc               397119 non-null  float64
 2   dti                      397119 non-null  float64
 3   loan_amnt                397119 non-null  float64
 4   revol_bal                397119 non-null  float64
 5   revol_util               397119 non-null  float64
 6   emp_length               397119 non-null  object 
 7   sub_grade                397119 non-null  object 
 8   term_ 60 months          397119 non-null  int64  
 9   grade_B                  397119 non-null  int64  
 10  grade_C                  397119 non-null  int64  
 11  grade_D                  397119 non-null  int64  
 12  grade_E                  397119 non-null  int64  
 13  grade_F                  397119 non-null  int64  
 14  grad

## to numeric

In [106]:
grade_list = sorted(df['sub_grade'].unique().tolist())
score_list = [x for x in range(len(grade_list), 0, -1)]

grade_map = dict(zip(grade_list, score_list))
df['sub_grade_score'] = df['sub_grade'].map(grade_map)

df = df.drop('sub_grade', axis=1)
df = df.reset_index(drop=True)

In [107]:
emp_length_map = {'< 1 year': 0,
                  '1 year': 1,
                  '2 years': 2,
                  '3 years': 3,
                  '4 years': 4,
                  '5 years': 5,
                  '6 years': 6,
                  '7 years': 7,
                  '8 years': 8,
                  '9 years': 9,
                  '10+ years': 10
                  }

df['emp_length_score'] = df['emp_length'].map(emp_length_map)

df = df.drop('emp_length', axis=1)
df = df.reset_index(drop=True)

## target setting

In [108]:
df = df.loc[ df['loan_status']!='In Grace Period' , ]

status_map = {'Fully Paid': 0,
              'Current': 0,
              'Charged Off': 1,
            #   'In Grace Period': 1,
              'Late (31-120 days)': 1,
              'Late (16-30 days)': 1,
              'Default': 1,
              }

df['loan_status'] = df['loan_status'].map(status_map)

df = df.reset_index(drop=True)

## dataset split

In [109]:
y = df['loan_status']
x = df.drop('loan_status', axis=1)

In [110]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

## data scaling

In [111]:
from sklearn.preprocessing import MinMaxScaler

mm_scaler = MinMaxScaler()

x_train_mm = mm_scaler.fit_transform(x_train)
x_test_mm = mm_scaler.transform(x_test)

## modeling

In [118]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

model_lr = LogisticRegression()
model_rfc = RandomForestClassifier()
model_xgbc = XGBClassifier()

# model_lr.fit(x_train, y_train)
# model_rfc.fit(x_train, y_train)
# model_xgbc.fit(x_train, y_train)

model_lr.fit(x_train_mm, y_train)
model_rfc.fit(x_train_mm, y_train)
model_xgbc.fit(x_train_mm, y_train)


# y_pred_lr = model_lr.predict(x_test)
# y_pred_rfc = model_rfc.predict(x_test)
# y_pred_xgbc = model_xgbc.predict(x_test)

y_pred_lr = model_lr.predict(x_test_mm)
y_pred_rfc = model_rfc.predict(x_test_mm)
y_pred_xgbc = model_xgbc.predict(x_test_mm)

## metrics

In [15]:
from sklearn.metrics import classification_report

In [16]:
print('Results: Logistic Regression')
print('Scaling: X')
print(f'Using cols: {select_cols[1:]}')
print('---------------------------')

print(classification_report(y_test, y_pred_lr))

Results: Logistic Regression
Scaling: X
Using cols: ['annual_inc']
---------------------------
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     68609
           1       0.00      0.00      0.00     15488

    accuracy                           0.82     84097
   macro avg       0.41      0.50      0.45     84097
weighted avg       0.67      0.82      0.73     84097



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [18]:
print('Results: Logistic Regression')
print('Scaling: min-max')
print(f'Using cols: {select_cols[1:]}')
print('---------------------------')

print(classification_report(y_test, y_pred_lr))

Results: Logistic Regression
Scaling: min-max
Using cols: ['annual_inc']
---------------------------
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     68609
           1       0.00      0.00      0.00     15488

    accuracy                           0.82     84097
   macro avg       0.41      0.50      0.45     84097
weighted avg       0.67      0.82      0.73     84097



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [41]:
print('Results: Logistic Regression')
print('Scaling: X')
print(f'Using cols: {select_cols[1:]}')
print('---------------------------')

print(classification_report(y_test, y_pred_lr))

Results: Logistic Regression
Scaling: X
Using cols: ['annual_inc', 'dti']
---------------------------
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     68609
           1       0.00      0.00      0.00     15488

    accuracy                           0.82     84097
   macro avg       0.41      0.50      0.45     84097
weighted avg       0.67      0.82      0.73     84097



In [43]:
print('Results: Logistic Regression')
print('Scaling: min-max')
print(f'Using cols: {select_cols[1:]}')
print('---------------------------')

print(classification_report(y_test, y_pred_lr))

Results: Logistic Regression
Scaling: min-max
Using cols: ['annual_inc', 'dti']
---------------------------
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     68609
           1       0.00      0.00      0.00     15488

    accuracy                           0.82     84097
   macro avg       0.41      0.50      0.45     84097
weighted avg       0.67      0.82      0.73     84097



In [54]:
print('Results: Logistic Regression')
print('Scaling: X')
print(f'Using cols: {select_cols[1:]}')
print('---------------------------')

print(classification_report(y_test, y_pred_lr))

Results: Logistic Regression
Scaling: X
Using cols: ['annual_inc', 'dti', 'loan_amnt']
---------------------------
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     68609
           1       0.00      0.00      0.00     15488

    accuracy                           0.82     84097
   macro avg       0.41      0.50      0.45     84097
weighted avg       0.67      0.82      0.73     84097



In [56]:
print('Results: Logistic Regression')
print('Scaling: min-max')
print(f'Using cols: {select_cols[1:]}')
print('---------------------------')

print(classification_report(y_test, y_pred_lr))

Results: Logistic Regression
Scaling: min-max
Using cols: ['annual_inc', 'dti', 'loan_amnt']
---------------------------
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     68609
           1       0.00      0.00      0.00     15488

    accuracy                           0.82     84097
   macro avg       0.41      0.50      0.45     84097
weighted avg       0.67      0.82      0.73     84097



In [68]:
print('Results: Logistic Regression')
print('Scaling: X')
print(f'Using cols: {select_cols[1:]}')
print('---------------------------')

print(classification_report(y_test, y_pred_lr))

Results: Logistic Regression
Scaling: X
Using cols: ['annual_inc', 'dti', 'loan_amnt', 'revol_bal', 'revol_util']
---------------------------
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     68583
           1       0.00      0.00      0.00     15481

    accuracy                           0.82     84064
   macro avg       0.41      0.50      0.45     84064
weighted avg       0.67      0.82      0.73     84064



In [70]:
print('Results: Logistic Regression')
print('Scaling: min-max')
print(f'Using cols: {select_cols[1:]}')
print('---------------------------')

print(classification_report(y_test, y_pred_lr))

Results: Logistic Regression
Scaling: min-max
Using cols: ['annual_inc', 'dti', 'loan_amnt', 'revol_bal', 'revol_util']
---------------------------
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     68583
           1       0.00      0.00      0.00     15481

    accuracy                           0.82     84064
   macro avg       0.41      0.50      0.45     84064
weighted avg       0.67      0.82      0.73     84064



In [82]:
print('Results: Logistic Regression')
print('Scaling: X')
print(f'Using cols: {select_cols[1:]}')
print('---------------------------')

print(classification_report(y_test, y_pred_lr))

Results: Logistic Regression
Scaling: X
Using cols: ['annual_inc', 'dti', 'loan_amnt', 'revol_bal', 'revol_util', 'term', 'grade']
---------------------------
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     68583
           1       1.00      0.00      0.00     15481

    accuracy                           0.82     84064
   macro avg       0.91      0.50      0.45     84064
weighted avg       0.85      0.82      0.73     84064



In [84]:
print('Results: Logistic Regression')
print('Scaling: min-max')
print(f'Using cols: {select_cols[1:]}')
print('---------------------------')

print(classification_report(y_test, y_pred_lr))

Results: Logistic Regression
Scaling: min-max
Using cols: ['annual_inc', 'dti', 'loan_amnt', 'revol_bal', 'revol_util', 'term', 'grade']
---------------------------
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     68583
           1       0.53      0.01      0.01     15481

    accuracy                           0.82     84064
   macro avg       0.68      0.50      0.46     84064
weighted avg       0.76      0.82      0.74     84064



In [96]:
print('Results: Logistic Regression')
print('Scaling: X')
print(f'Using cols: {select_cols[1:]}')
print('---------------------------')

print(classification_report(y_test, y_pred_lr))

Results: Logistic Regression
Scaling: X
Using cols: ['annual_inc', 'dti', 'loan_amnt', 'revol_bal', 'revol_util', 'term', 'grade', 'emp_length', 'sub_grade']
---------------------------
              precision    recall  f1-score   support

           0       0.82      0.99      0.90     64944
           1       0.50      0.03      0.06     14361

    accuracy                           0.82     79305
   macro avg       0.66      0.51      0.48     79305
weighted avg       0.76      0.82      0.75     79305



In [98]:
print('Results: Logistic Regression')
print('Scaling: min-max')
print(f'Using cols: {select_cols[1:]}')
print('---------------------------')

print(classification_report(y_test, y_pred_lr))

Results: Logistic Regression
Scaling: min-max
Using cols: ['annual_inc', 'dti', 'loan_amnt', 'revol_bal', 'revol_util', 'term', 'grade', 'emp_length', 'sub_grade']
---------------------------
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     64944
           1       0.50      0.01      0.02     14361

    accuracy                           0.82     79305
   macro avg       0.66      0.50      0.46     79305
weighted avg       0.76      0.82      0.74     79305



In [113]:
print('Results: Logistic Regression')
print('Scaling: X')
print(f'Using cols: {select_cols[1:]}')
print('---------------------------')

print(classification_report(y_test, y_pred_lr))

Results: Logistic Regression
Scaling: X
Using cols: ['annual_inc', 'dti', 'loan_amnt', 'revol_bal', 'revol_util', 'term', 'grade', 'home_ownership', 'emp_length', 'sub_grade']
---------------------------
              precision    recall  f1-score   support

           0       0.82      0.99      0.90     64944
           1       0.51      0.03      0.06     14361

    accuracy                           0.82     79305
   macro avg       0.66      0.51      0.48     79305
weighted avg       0.77      0.82      0.75     79305



In [119]:
print('Results: Logistic Regression')
print('Scaling: min-max')
print(f'Using cols: {select_cols[1:]}')
print('---------------------------')

print(classification_report(y_test, y_pred_lr))

print('======================================')

print('Results: Random Forest Classifier')
print('Scaling: min-max')
print(f'Using cols: {select_cols[1:]}')
print('---------------------------')

print(classification_report(y_test, y_pred_rfc))

print('======================================')

print('Results: XGBoost Classifier')
print('Scaling: min-max')
print(f'Using cols: {select_cols[1:]}')
print('---------------------------')

print(classification_report(y_test, y_pred_xgbc))

Results: Logistic Regression
Scaling: min-max
Using cols: ['annual_inc', 'dti', 'loan_amnt', 'revol_bal', 'revol_util', 'term', 'grade', 'home_ownership', 'emp_length', 'sub_grade']
---------------------------
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     64944
           1       0.53      0.02      0.03     14361

    accuracy                           0.82     79305
   macro avg       0.68      0.51      0.47     79305
weighted avg       0.77      0.82      0.74     79305

Results: Random Forest Classifier
Scaling: min-max
Using cols: ['annual_inc', 'dti', 'loan_amnt', 'revol_bal', 'revol_util', 'term', 'grade', 'home_ownership', 'emp_length', 'sub_grade']
---------------------------
              precision    recall  f1-score   support

           0       0.82      0.99      0.90     64944
           1       0.46      0.05      0.09     14361

    accuracy                           0.82     79305
   macro avg       0.64      0