# Lending Club loan data

## import library

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

## load dataset

In [2]:
df_origin = pd.read_csv('data/LendingClub_loandata/2015.csv',
                        low_memory=False
                        )

df_origin.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421095 entries, 0 to 421094
Data columns (total 153 columns):
 #    Column                                      Non-Null Count   Dtype  
---   ------                                      --------------   -----  
 0    Unnamed: 0                                  421095 non-null  int64  
 1    id                                          421095 non-null  int64  
 2    member_id                                   0 non-null       float64
 3    loan_amnt                                   421095 non-null  float64
 4    funded_amnt                                 421095 non-null  float64
 5    funded_amnt_inv                             421095 non-null  float64
 6    term                                        421095 non-null  object 
 7    int_rate                                    421095 non-null  float64
 8    installment                                 421095 non-null  float64
 9    grade                                       421095 non-nu

## feature selecting: minimum

In [3]:
select_cols = ['loan_status', ## 타겟 변수: 대출 상태
               'annual_inc', ## 수치형 변수: 연수입
               'dti', ## 수치형 변수: 총부채 상환비율
               'loan_amnt', ## 수치형 변수: 신청 대출 금액
               'revol_bal', 'revol_util', ## 수치형 변수: 리볼빙 관련
               'fico_range_low', 'fico_range_high', ## 수치형 변수: 신용 점수
               'pub_rec', ## 수치형 변수: 공공 파산/압류 기록
               'delinq_2yrs', ## 수치형 변수: 지난 2년간 30일 이상 연체한 횟수
               
               'term', ## 범주형 변수: 상환 기간 (원핫 대상)
               'home_ownership', ## 범주형 변수: 거주 형태 (원핫 대상)
               'purpose', ## 범주형 변수: 대출 목적 (원핫 대상)
               
               'emp_length', ## 범주형 변수: 근속연수 (전처리 및 수치형 변수 변환)

            #    'grade', 'sub_grade', ## 범주형 변수: 원핫 인코딩 대상 ## 얘네는 심사 후에 결정되는 것...
               ]

In [4]:
df = df_origin[ select_cols ]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421095 entries, 0 to 421094
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   loan_status      421095 non-null  object 
 1   annual_inc       421095 non-null  float64
 2   dti              421093 non-null  float64
 3   loan_amnt        421095 non-null  float64
 4   revol_bal        421095 non-null  float64
 5   revol_util       420933 non-null  float64
 6   fico_range_low   421095 non-null  float64
 7   fico_range_high  421095 non-null  float64
 8   pub_rec          421095 non-null  float64
 9   delinq_2yrs      421095 non-null  float64
 10  term             421095 non-null  object 
 11  home_ownership   421095 non-null  object 
 12  purpose          421095 non-null  object 
 13  emp_length       397278 non-null  object 
dtypes: float64(9), object(5)
memory usage: 45.0+ MB


## drop NaN

In [5]:
df = df.dropna()
df = df.reset_index(drop=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397119 entries, 0 to 397118
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   loan_status      397119 non-null  object 
 1   annual_inc       397119 non-null  float64
 2   dti              397119 non-null  float64
 3   loan_amnt        397119 non-null  float64
 4   revol_bal        397119 non-null  float64
 5   revol_util       397119 non-null  float64
 6   fico_range_low   397119 non-null  float64
 7   fico_range_high  397119 non-null  float64
 8   pub_rec          397119 non-null  float64
 9   delinq_2yrs      397119 non-null  float64
 10  term             397119 non-null  object 
 11  home_ownership   397119 non-null  object 
 12  purpose          397119 non-null  object 
 13  emp_length       397119 non-null  object 
dtypes: float64(9), object(5)
memory usage: 42.4+ MB


## one-hot encoding

In [6]:
df = pd.get_dummies(df, columns=['term', 'home_ownership', 'purpose'],
                    drop_first=True, dtype=int)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397119 entries, 0 to 397118
Data columns (total 28 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   loan_status                 397119 non-null  object 
 1   annual_inc                  397119 non-null  float64
 2   dti                         397119 non-null  float64
 3   loan_amnt                   397119 non-null  float64
 4   revol_bal                   397119 non-null  float64
 5   revol_util                  397119 non-null  float64
 6   fico_range_low              397119 non-null  float64
 7   fico_range_high             397119 non-null  float64
 8   pub_rec                     397119 non-null  float64
 9   delinq_2yrs                 397119 non-null  float64
 10  emp_length                  397119 non-null  object 
 11  term_ 60 months             397119 non-null  int64  
 12  home_ownership_MORTGAGE     397119 non-null  int64  
 13  home_ownership

## to numeric

In [7]:
# grade_list = sorted(df['sub_grade'].unique().tolist())
# score_list = [x for x in range(len(grade_list), 0, -1)]

# grade_map = dict(zip(grade_list, score_list))
# df['sub_grade_score'] = df['sub_grade'].map(grade_map)

# df = df.drop('sub_grade', axis=1)
# df = df.reset_index(drop=True)

In [8]:
emp_length_map = {'< 1 year': 0,
                  '1 year': 1,
                  '2 years': 2,
                  '3 years': 3,
                  '4 years': 4,
                  '5 years': 5,
                  '6 years': 6,
                  '7 years': 7,
                  '8 years': 8,
                  '9 years': 9,
                  '10+ years': 10
                  }

df['emp_length_score'] = df['emp_length'].map(emp_length_map)

df = df.drop('emp_length', axis=1)
df = df.reset_index(drop=True)

## target setting

In [9]:
df = df.loc[ df['loan_status']!='In Grace Period' , ]

status_map = {'Fully Paid': 0,
              'Current': 0,
              'Charged Off': 1,
            #   'In Grace Period': 1,
              'Late (31-120 days)': 1,
              'Late (16-30 days)': 1,
              'Default': 1,
              }

df['loan_status'] = df['loan_status'].map(status_map)

df = df.reset_index(drop=True)

## dataset split

In [10]:
y = df['loan_status']
x = df.drop('loan_status', axis=1)

In [11]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

## data scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

mm_scaler = MinMaxScaler()
ss_scaler = StandardScaler()

x_train_mm = mm_scaler.fit_transform(x_train)
x_test_mm = mm_scaler.transform(x_test)

x_train_ss = ss_scaler.fit_transform(x_train)
x_test_ss = ss_scaler.transform(x_test)

## oversampling

In [24]:
from imblearn.over_sampling import SMOTE

In [23]:
y_train.value_counts()

loan_status
0    259776
1     57443
Name: count, dtype: int64

In [25]:
smote = SMOTE(random_state=42)

x_train_resampled, y_train_resampled = smote.fit_resample(x_train_ss, y_train)

In [27]:
y_train_resampled.value_counts()

loan_status
0    259776
1    259776
Name: count, dtype: int64

## modeling

In [55]:
from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression(random_state=42, max_iter=1000)

model_lr.fit(x_train_resampled, y_train_resampled)

y_pred_lr = model_lr.predict(x_test_ss)

## Grid Search

In [42]:
from sklearn.model_selection import GridSearchCV

model_lr = LogisticRegression(random_state=42, max_iter=1000)

param_grid = {'C': [0.01, 0.1, 1, 10, 100],
               'penalty': ['l1', 'l2'],
               'solver': ['liblinear']
               }

grid_search = GridSearchCV(estimator=model_lr,
                           param_grid=param_grid,
                           cv=5,
                           scoring='f1',
                           verbose=1,
                           n_jobs=-1,
                           )

grid_search.fit(x_train_resampled, y_train_resampled)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


0,1,2
,estimator,LogisticRegre...ndom_state=42)
,param_grid,"{'C': [0.01, 0.1, ...], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,0.01
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [43]:
print('Best Parameters:', grid_search.best_params_)
print('Best F1-score:', grid_search.best_score_)

Best Parameters: {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
Best F1-score: 0.6281371114019848


In [44]:
best_model = grid_search.best_estimator_

y_pred_gscv = best_model.predict(x_test_ss)

## Random Search

In [47]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

In [49]:
param_dist = {'C': uniform(0.01, 100),
              'penalty': ['l1', 'l2'],
              'solver': ['liblinear'],
              }

random_search = RandomizedSearchCV(estimator=model_lr,
                                   param_distributions=param_dist,
                                   n_iter=50,
                                   cv=5,
                                   scoring='f1',
                                   verbose=1,
                                   n_jobs=-1,
                                   random_state=42,
                                   )

random_search.fit(x_train_resampled, y_train_resampled)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


0,1,2
,estimator,LogisticRegre...ndom_state=42)
,param_distributions,"{'C': <scipy.stats....x76bac11525d0>, 'penalty': ['l1', 'l2'], 'solver': ['liblinear']}"
,n_iter,50
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,np.float64(0....7658410143283)
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [50]:
print('Best Parameters:', random_search.best_params_)
print('Best F1-score:', random_search.best_score_)

Best Parameters: {'C': np.float64(0.08787658410143283), 'penalty': 'l2', 'solver': 'liblinear'}
Best F1-score: 0.6279931463912021


In [51]:
best_model_rscv = random_search.best_estimator_

y_pred_rscv = best_model_rscv.predict(x_test_ss)

## metrics

In [32]:
from sklearn.metrics import classification_report

In [56]:
print('Results: Logistic Regression')
print('Scaling: Standardization')
print('SMOTE Oversampling')
print('---------------------------')

print(classification_report(y_test, y_pred_lr))

Results: Logistic Regression
Scaling: Standardization
SMOTE Oversampling
---------------------------
              precision    recall  f1-score   support

           0       0.88      0.61      0.72     64944
           1       0.26      0.62      0.37     14361

    accuracy                           0.61     79305
   macro avg       0.57      0.62      0.55     79305
weighted avg       0.77      0.61      0.66     79305



In [46]:
print('Results: Logistic Regression GridSearchCV')
print('Scaling: Standardization')
print('SMOTE Oversampling')
print('---------------------------')

print(classification_report(y_test, y_pred_gscv))

Results: Logistic Regression GridSearchCV
Scaling: Standardization
SMOTE Oversampling
---------------------------
              precision    recall  f1-score   support

           0       0.88      0.61      0.72     64944
           1       0.26      0.62      0.37     14361

    accuracy                           0.61     79305
   macro avg       0.57      0.62      0.55     79305
weighted avg       0.77      0.61      0.66     79305



In [52]:
print('Results: Logistic Regression RandomSearchCV')
print('Scaling: Standardization')
print('SMOTE Oversampling')
print('---------------------------')

print(classification_report(y_test, y_pred_rscv))

Results: Logistic Regression RandomSearchCV
Scaling: Standardization
SMOTE Oversampling
---------------------------
              precision    recall  f1-score   support

           0       0.88      0.61      0.72     64944
           1       0.26      0.62      0.37     14361

    accuracy                           0.61     79305
   macro avg       0.57      0.62      0.55     79305
weighted avg       0.77      0.61      0.66     79305



## feature importance

In [57]:
feature_names = x_train.columns

fi_df1 = pd.DataFrame({'feature': feature_names,
                       'coef': np.round(model_lr.coef_, 2)[0],
                       })

fi_df2 = pd.DataFrame({'feature': feature_names,
                       'coef': np.round(best_model.coef_, 2)[0],
                       })

fi_df3 = pd.DataFrame({'feature': feature_names,
                       'coef': np.round(best_model_rscv.coef_, 2)[0],
                       })

In [58]:
fi_df1

Unnamed: 0,feature,coef
0,annual_inc,-0.14
1,dti,0.25
2,loan_amnt,0.15
3,revol_bal,-0.17
4,revol_util,-0.08
5,fico_range_low,-0.23
6,fico_range_high,-0.23
7,pub_rec,-0.01
8,delinq_2yrs,-0.04
9,term_ 60 months,0.31


In [59]:
fi_df2

Unnamed: 0,feature,coef
0,annual_inc,-0.13
1,dti,0.25
2,loan_amnt,0.15
3,revol_bal,-0.16
4,revol_util,-0.08
5,fico_range_low,-0.37
6,fico_range_high,-0.08
7,pub_rec,-0.01
8,delinq_2yrs,-0.03
9,term_ 60 months,0.31


In [60]:
fi_df3

Unnamed: 0,feature,coef
0,annual_inc,-0.14
1,dti,0.25
2,loan_amnt,0.15
3,revol_bal,-0.17
4,revol_util,-0.08
5,fico_range_low,-0.23
6,fico_range_high,-0.23
7,pub_rec,-0.01
8,delinq_2yrs,-0.04
9,term_ 60 months,0.31
