## 1 - Introduction
This notebook cleans the dataset downloaded from the Lending Club website.

## 2 - Dataset

The dataset was downloaded from [Lending Club](https://www.lendingclub.com/statistics/additional-statistics?). It contains information from 2015 to 2019. 

**Data manipulation**: 

    - Concatenate data from 2015 to 2019
    - Remove columns with >20% missing values
    - Save to a new csv file



In [25]:
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Options for pandas: cap how many columns / rows are rendered when a frame is displayed
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30

import warnings
# NOTE(review): 'ignore' with no category silences every warning for the rest of
# the session, including pandas warnings about chained assignment — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Read the combined 2015-2019 Lending Club export.
# The data directory used to be fixed to one machine's absolute path. It can now
# be overridden with the LENDING_CLUB_DATA_DIR environment variable; the original
# location remains the default, so existing runs behave exactly as before.
# os.path.join(<dir>, '') guarantees that `path` ends with a path separator, so
# string concatenation such as `path + 'file.csv'` keeps working.
path = os.path.join(
    os.environ.get(
        'LENDING_CLUB_DATA_DIR',
        '/Users/cyuancheng/Documents/course/Springboard/Capstone1/data/',
    ),
    '',
)
data = pd.read_csv(path + 'data_2015_2019.csv', low_memory=True)

In [3]:
# Preview the first rows of the raw frame to sanity-check the load
data.head()

Unnamed: 0,id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,...,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,year
0,68367011,21000.0,21000.0,21000.0,60 months,13.99%,488.53,C,C4,Resident physician,< 1 year,RENT,52000.0,Source Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,credit_card,Credit card refinancing,441xx,OH,14.47,0.0,Jan-2005,...,0.0,3.0,3.0,4.0,4.0,11.0,4.0,4.0,3.0,5.0,0.0,0.0,0.0,1.0,100.0,75.0,0.0,0.0,370357.0,372407.0,23300.0,347057.0,N,N,2015
1,68537655,16800.0,16800.0,16800.0,60 months,12.88%,381.23,C,C2,CEO,10+ years,MORTGAGE,118000.0,Not Verified,Dec-2015,Current,n,https://lendingclub.com/browse/loanDetail.acti...,debt_consolidation,Debt consolidation,636xx,MO,34.29,0.0,Jun-1997,...,12.0,3.0,3.0,4.0,22.0,12.0,5.0,28.0,3.0,11.0,0.0,0.0,0.0,1.0,64.4,25.0,0.0,0.0,412771.0,75808.0,12600.0,115941.0,N,N,2015
2,68356421,22400.0,22400.0,22400.0,60 months,12.88%,508.3,C,C2,Executive Director,6 years,MORTGAGE,95000.0,Not Verified,Dec-2015,Current,n,https://lendingclub.com/browse/loanDetail.acti...,debt_consolidation,Debt consolidation,290xx,SC,22.98,0.0,Apr-1995,...,2.0,3.0,8.0,4.0,5.0,15.0,10.0,15.0,8.0,16.0,0.0,0.0,0.0,3.0,97.0,25.0,0.0,0.0,436841.0,184356.0,21000.0,191682.0,N,N,2015
3,68466926,10000.0,10000.0,10000.0,36 months,6.49%,306.45,A,A2,SERVICE MANAGER,6 years,RENT,85000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,credit_card,Credit card refinancing,160xx,PA,13.07,0.0,Apr-2002,...,0.0,6.0,9.0,7.0,10.0,3.0,13.0,19.0,9.0,14.0,0.0,0.0,0.0,2.0,95.7,28.6,1.0,0.0,61099.0,27957.0,16400.0,30799.0,N,N,2015
4,68616873,8000.0,8000.0,8000.0,36 months,11.48%,263.74,B,B5,Vendor liaison,10+ years,MORTGAGE,42000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,credit_card,Credit card refinancing,029xx,RI,34.8,0.0,Nov-1994,...,1.0,3.0,3.0,3.0,6.0,5.0,5.0,11.0,3.0,8.0,0.0,0.0,0.0,2.0,94.4,33.3,0.0,0.0,256513.0,113782.0,17000.0,135513.0,N,N,2015


In [4]:
# Summarise the raw frame; verbose=True asks for the full per-column listing
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2312464 entries, 0 to 2312463
Data columns (total 106 columns):
id                            object
loan_amnt                     float64
funded_amnt                   float64
funded_amnt_inv               float64
term                          object
int_rate                      object
installment                   float64
grade                         object
sub_grade                     object
emp_title                     object
emp_length                    object
home_ownership                object
annual_inc                    float64
verification_status           object
issue_d                       object
loan_status                   object
pymnt_plan                    object
url                           object
purpose                       object
title                         object
zip_code                      object
addr_state                    object
dti                           float64
delinq_2yrs                  

In [5]:
# check columns: print every column name of the raw frame in sorted order
print(sorted(data.columns))

['acc_now_delinq', 'acc_open_past_24mths', 'addr_state', 'all_util', 'annual_inc', 'application_type', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'collection_recovery_fee', 'collections_12_mths_ex_med', 'debt_settlement_flag', 'delinq_2yrs', 'delinq_amnt', 'dti', 'earliest_cr_line', 'emp_length', 'emp_title', 'fico_range_high', 'fico_range_low', 'funded_amnt', 'funded_amnt_inv', 'grade', 'hardship_flag', 'home_ownership', 'id', 'initial_list_status', 'inq_fi', 'inq_last_12m', 'inq_last_6mths', 'installment', 'int_rate', 'issue_d', 'last_credit_pull_d', 'last_fico_range_high', 'last_fico_range_low', 'last_pymnt_amnt', 'last_pymnt_d', 'loan_amnt', 'loan_status', 'max_bal_bc', 'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc', 'mths_since_rcnt_il', 'mths_since_recent_bc', 'mths_since_recent_inq', 'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_t

Identify the columns we'll be keeping from the original dataset

In [6]:
# Columns kept from the raw export. The order here is the column order of the
# frame built from this list, so it is preserved exactly.
col_pick = [
    'id',
    'loan_amnt',
    'funded_amnt',
    'term',
    'int_rate',
    'installment',
    'grade',
    'emp_length',
    'home_ownership',
    'annual_inc',
    'verification_status',
    'issue_d',
    'loan_status',
    'purpose',
    'dti',
    'delinq_2yrs',
    'earliest_cr_line',
    'open_acc',
    'pub_rec',
    'fico_range_high',
    'fico_range_low',
    'revol_bal',
    'revol_util',
    'total_pymnt',
    'last_pymnt_d',
    'recoveries',
]

In [7]:
# Number of columns selected
len(col_pick)

26

Identify the data type of these columns

In [8]:
# Group the selected columns by how they need to be handled downstream.

# Columns treated as floats
col_float = [
    'loan_amnt',
    'funded_amnt',
    'installment',
    'annual_inc',
    'dti',
    'revol_bal',
    'delinq_2yrs',
    'open_acc',
    'pub_rec',
    'fico_range_high',
    'fico_range_low',
    'total_pymnt',
    'recoveries',
]

# Columns treated as categorical
col_cat = [
    'term',
    'grade',
    'emp_length',
    'home_ownership',
    'verification_status',
    'loan_status',
    'purpose',
]

# Percentage columns (int_rate appears as text such as '13.99%' in the preview)
col_perc = ['int_rate', 'revol_util']

# Date columns (stored as text such as 'Dec-2015' in the raw data)
col_date = ['issue_d', 'earliest_cr_line', 'last_pymnt_d']

In [9]:
# Sanity check: the dtype groups must partition the selected columns, with "id"
# as the only column left ungrouped. The set-difference check alone would not
# notice a column listed in two groups, or a grouped column that is missing
# from col_pick, so those cases are asserted explicitly as well.
col_typed = col_float + col_cat + col_perc + col_date
assert len(col_typed) == len(set(col_typed)), \
    "a column appears more than once across the dtype groups"
assert set(col_typed) <= set(col_pick), \
    "dtype groups reference columns missing from col_pick: %s" % sorted(set(col_typed) - set(col_pick))
assert set(col_pick) - set(col_typed) == set(["id"])

### Prepare dataset

In [56]:
# Restrict to the selected columns; .copy() makes data_final independent of `data`
data_final = data[col_pick].copy()

In [44]:
# Report the size of the selected-column frame (shape is a (rows, columns) pair)
print("Dataset has {} rows and {} columns".format(*data_final.shape))

Dataset has 2312464 rows and 26 columns


### Target

In [57]:
# Distribution of the target column; dropna=False also counts missing statuses
data_final['loan_status'].value_counts(dropna=False)

Current               1018979
Fully Paid             995328
Charged Off            259485
Late (31-120 days)      22853
In Grace Period         10848
Late (16-30 days)        4683
Issued                    206
Default                    48
NaN                        34
Name: loan_status, dtype: int64

In [58]:
# Keep only rows whose loan_status is "Fully Paid" or "Charged Off"; every other
# status (and missing values) is dropped.
# The explicit .copy() makes data_final an independent frame. Later cells add a
# column to it and modify it in place, which on a filter-derived frame is the
# chained-assignment situation pandas warns about.
data_final = data_final.loc[
    data_final['loan_status'].isin(["Fully Paid", "Charged Off"])
].copy()

In [59]:
# Row / column count after filtering on loan_status
data_final.shape

(1254813, 26)

In [60]:
# check missing value: number of null loan_status entries left after the filter
data_final['loan_status'].isnull().sum()

0

In [61]:
# Class counts of the target after filtering
data_final['loan_status'].value_counts()

Fully Paid     995328
Charged Off    259485
Name: loan_status, dtype: int64

Encode the target: label 'Fully Paid' loans as 0 and 'Charged Off' loans as 1.

In [62]:
# Binary target: 0 for 'Fully Paid', 1 for anything else. This is the same rule
# as the previous row-wise lambda, expressed as a single vectorised comparison
# instead of one Python call per row.
# astype('int64') pins the integer dtype so it does not depend on the platform default.
data_final['labels'] = (data_final['loan_status'] != 'Fully Paid').astype('int64')
data_final['labels'].value_counts()

0    995328
1    259485
Name: labels, dtype: int64

In [18]:
# calculate mean default rate: 'labels' is 0/1, so its mean is the fraction of rows labelled 1
np.mean(data_final['labels'])

0.20679176897274734

#### Work on datetime data

In [63]:
# Preview the date columns before conversion
data_final[col_date].head()

Unnamed: 0,issue_d,earliest_cr_line,last_pymnt_d
0,Dec-2015,Jan-2005,Aug-2018
3,Dec-2015,Apr-2002,Aug-2018
4,Dec-2015,Nov-1994,Apr-2017
5,Dec-2015,Feb-1999,Jan-2017
6,Dec-2015,May-1984,May-2017


In [64]:
# Dtypes and non-null counts of the date columns before conversion
data_final[col_date].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1254813 entries, 0 to 2312457
Data columns (total 3 columns):
issue_d             1254813 non-null object
earliest_cr_line    1254813 non-null object
last_pymnt_d        1252139 non-null object
dtypes: object(3)
memory usage: 38.3+ MB


In [66]:
# Count missing values in each date column
data_final[col_date].isnull().sum()

issue_d                0
earliest_cr_line       0
last_pymnt_d        2674
dtype: int64

In [69]:
# Drop rows with no last_pymnt_d value.
# Rebinding to the returned frame replaces inplace=True: in-place mutation of a
# frame obtained by filtering is what triggers pandas' chained-assignment
# warning, and it offers no benefit over reassigning the result.
data_final = data_final.dropna(subset=['last_pymnt_d'])

In [70]:
# Re-check missing values in the date columns after dropping rows
data_final[col_date].isnull().sum()

issue_d             0
earliest_cr_line    0
last_pymnt_d        0
dtype: int64

In [72]:
# Convert the date columns from 'Mon-YYYY' text (e.g. 'Dec-2015') to datetime64.
# pd.to_datetime parses each column in one vectorised call instead of calling
# datetime.strptime once per row. It also makes the cell safe to re-run: a
# column that is already datetime64 is returned as-is, whereas strptime raises
# a TypeError when handed a Timestamp.
for col in col_date:
    data_final[col] = pd.to_datetime(data_final[col], format='%b-%Y')

In [73]:
# Preview the date columns after conversion
data_final[col_date].head()

Unnamed: 0,issue_d,earliest_cr_line,last_pymnt_d
0,2015-12-01,2005-01-01,2018-08-01
3,2015-12-01,2002-04-01,2018-08-01
4,2015-12-01,1994-11-01,2017-04-01
5,2015-12-01,1999-02-01,2017-01-01
6,2015-12-01,1984-05-01,2017-05-01


In [74]:
# Confirm the date columns now have a datetime dtype and no missing values
data_final[col_date].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1252139 entries, 0 to 2312457
Data columns (total 3 columns):
issue_d             1252139 non-null datetime64[ns]
earliest_cr_line    1252139 non-null datetime64[ns]
last_pymnt_d        1252139 non-null datetime64[ns]
dtypes: datetime64[ns](3)
memory usage: 38.2 MB
