# Get access to data

## 1 - Introduction
This notebook is to download and initially clean the Lending Club dataset

## 2 - Dataset

The dataset was download from [Lending Club](https://www.lendingclub.com/statistics/additional-statistics?). It contains information from 2015 to 2019. 

**Data manipulation**: 

    - Concatenate data from 2012 to 2019
    - Remove columns with >20% missing values
    - Save to a new csv file



In [1]:
from glob import glob
import pandas as pd

# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

%load_ext autoreload
%autoreload 2

In [2]:
# load 2015
path = '/Users/cyuancheng/Documents/course/Springboard/Capstone1/data/'              
lc2015 = 'LoanStats3d_securev1.csv.zip'
df_2015 = pd.read_csv(path + lc2015, sep=',',low_memory=True, skiprows=1)

In [3]:
df_2015.shape

(421097, 150)

In [4]:
# load 2014
path = '/Users/cyuancheng/Documents/course/Springboard/Capstone1/data/'              
lc2014 = 'LoanStats3c_securev1.csv.zip'
df_2014 = pd.read_csv(path + lc2014, sep=',',low_memory=True, skiprows=1)

In [5]:
df_2014.shape

(235631, 150)

In [6]:
# load 2012_13
path = '/Users/cyuancheng/Documents/course/Springboard/Capstone1/data/'              
lc2012_13 = 'LoanStats3b_securev1.csv.zip'
df_2012_13 = pd.read_csv(path + lc2012_13, sep=',',low_memory=True, skiprows=1)

In [7]:
df_2012_13.shape

(188183, 150)

In [8]:
def read_file(year):
    '''
    import zip files(Q1-Q4) in each year
    remove first row; add year column
    return a pd dataframe for each year
    '''
    
    filename = glob(path+'LoanStats_securev1_'+str(year)+'*.csv.zip')
    dataframe = pd.concat([pd.read_csv(f,low_memory=True, skiprows=1) for f in filename], \
                    ignore_index=True)
    #dataframe['year'] = year
    return dataframe

In [9]:
# combining multiple dataframe (2012-2019)
df_all = pd.concat(objs=[df_2012_13, df_2014, df_2015, read_file(2016), read_file(2017),\
                               read_file(2018), read_file(2019)], axis=0)

In [10]:
df_all.shape

(2736278, 150)

In [11]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2736278 entries, 0 to 518114
Columns: 150 entries, id to settlement_term
dtypes: float64(111), object(39)
memory usage: 3.1+ GB


In [12]:
df_all.sample(5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,...,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
365917,94096092,,9500.0,9500.0,9450.0,36 months,17.99%,343.41,D,D2,Administrative Assistant,< 1 year,OWN,27000.0,Source Verified,Dec-2016,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,802xx,CO,19.38,...,,,,N,,,,,,,,,,,,,,,N,,,,,,
210338,77079624,,25000.0,25000.0,25000.0,60 months,8.39%,511.59,B,B1,Operations Supervisor,10+ years,OWN,73000.0,Not Verified,Apr-2016,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,782xx,TX,25.33,...,,,,N,,,,,,,,,,,,,,,N,,,,,,
412528,121847169,,9500.0,9500.0,9500.0,36 months,5.32%,286.1,A,A1,Admin Asst,10+ years,MORTGAGE,41275.0,Not Verified,Oct-2017,Current,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,380xx,TN,20.82,...,0.0,0.0,,N,,,,,,,,,,,,,,,N,,,,,,
81119,96253004,,4200.0,4200.0,4200.0,36 months,16.99%,149.73,D,D1,Communications Dispatcher,10+ years,RENT,89000.0,Source Verified,Jan-2017,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,954xx,CA,29.97,...,,,,N,,,,,,,,,,,,,,,N,,,,,,
189264,13126937,,13000.0,13000.0,13000.0,36 months,14.64%,448.37,C,C3,Residential Habilitation Specialist,1 year,RENT,64548.0,Source Verified,Mar-2014,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,,major_purchase,Major purchase,108xx,NY,11.43,...,,,,N,,,,,,,,,,,,,,,N,,,,,,


### Initial processing

Deal with missing value

In [13]:
# check missing values
df_missing = df_all.isnull().sum()/len(df_all)

In [14]:
# list column name with >20% missing values 
df_missing = df_missing.sort_values(ascending=False)
display(df_missing.head(20))
list_col_del = list(df_missing[df_missing > 0.2].index)
print('columns with > 20% missing values: '+ str(len(list_col_del))+ ' columns')
print('----------------------------')
print(list_col_del)

member_id                                     1.000000
orig_projected_additional_accrued_interest    0.995900
hardship_amount                               0.994786
hardship_type                                 0.994786
hardship_reason                               0.994786
hardship_status                               0.994786
deferral_term                                 0.994786
hardship_last_payment_amount                  0.994786
hardship_payoff_balance_amount                0.994786
hardship_loan_status                          0.994786
hardship_dpd                                  0.994786
hardship_length                               0.994786
payment_plan_start_date                       0.994786
hardship_end_date                             0.994786
hardship_start_date                           0.994786
settlement_term                               0.982357
settlement_percentage                         0.982357
debt_settlement_flag_date                     0.982357
settlement

columns with > 20% missing values: 58 columns
----------------------------
['member_id', 'orig_projected_additional_accrued_interest', 'hardship_amount', 'hardship_type', 'hardship_reason', 'hardship_status', 'deferral_term', 'hardship_last_payment_amount', 'hardship_payoff_balance_amount', 'hardship_loan_status', 'hardship_dpd', 'hardship_length', 'payment_plan_start_date', 'hardship_end_date', 'hardship_start_date', 'settlement_term', 'settlement_percentage', 'debt_settlement_flag_date', 'settlement_status', 'settlement_date', 'settlement_amount', 'sec_app_mths_since_last_major_derog', 'desc', 'sec_app_revol_util', 'verification_status_joint', 'revol_bal_joint', 'sec_app_fico_range_low', 'sec_app_fico_range_high', 'sec_app_earliest_cr_line', 'sec_app_inq_last_6mths', 'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_open_act_il', 'sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths', 'sec_app_collections_12_mths_ex_med', 'dti_joint', 'annual_inc_joint', 'mths_since_last_record',

In [15]:
# remove the columns with >20% missing values
df_all.drop(labels =list_col_del, axis='columns', inplace=True)

# save to local disk
df_all.to_csv(path+'data_2012_2019.csv', index=False)

df_all.head()

Unnamed: 0,id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,...,mths_since_recent_inq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10224583,11100.0,11100.0,11100.0,36 months,14.98%,384.68,C,C3,Teacher,10+ years,MORTGAGE,90000.0,Not Verified,Dec-2013,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,other,Other,103xx,NY,3.73,1.0,Jun-2001,...,11.0,1.0,4.0,8.0,4.0,4.0,0.0,8.0,11.0,8.0,9.0,0.0,0.0,1.0,1.0,75.0,50.0,0.0,0.0,385000.0,6619.0,4000.0,0.0,N,N
1,10148122,12000.0,12000.0,12000.0,36 months,7.62%,373.94,A,A3,Systems Engineer,3 years,MORTGAGE,96500.0,Not Verified,Dec-2013,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,debt_consolidation,Debt Consolidation and Credit Transfer,782xx,TX,12.61,0.0,Sep-2003,...,10.0,0.0,4.0,5.0,4.0,10.0,15.0,8.0,14.0,5.0,17.0,0.0,0.0,0.0,3.0,100.0,100.0,0.0,0.0,233004.0,46738.0,14800.0,53404.0,N,N
2,10149342,27050.0,27050.0,27050.0,36 months,10.99%,885.46,B,B2,Team Leadern Customer Ops & Systems,10+ years,OWN,55000.0,Verified,Dec-2013,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,debt_consolidation,Debt Consolidation,481xx,MI,22.87,0.0,Oct-1986,...,8.0,0.0,2.0,4.0,4.0,8.0,8.0,10.0,15.0,4.0,14.0,0.0,0.0,0.0,1.0,100.0,25.0,0.0,0.0,138554.0,70186.0,35700.0,33054.0,N,N
3,10129454,12000.0,12000.0,12000.0,36 months,10.99%,392.81,B,B2,Project Manager,4 years,RENT,60000.0,Not Verified,Dec-2013,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,debt_consolidation,No Regrets,281xx,NC,4.62,0.0,Dec-2009,...,3.0,0.0,4.0,7.0,8.0,10.0,0.0,15.0,18.0,7.0,15.0,0.0,0.0,0.0,4.0,100.0,0.0,0.0,0.0,29700.0,7137.0,18100.0,0.0,N,N
4,10149488,4800.0,4800.0,4800.0,36 months,10.99%,157.13,B,B2,Surgical Technician,2 years,MORTGAGE,39600.0,Source Verified,Dec-2013,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,home_improvement,For The House,782xx,TX,2.49,0.0,Aug-1995,...,3.0,0.0,2.0,2.0,3.0,4.0,1.0,3.0,7.0,2.0,3.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,25700.0,4136.0,25700.0,0.0,N,N


In [16]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2736278 entries, 0 to 518114
Data columns (total 92 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   id                          object 
 1   loan_amnt                   float64
 2   funded_amnt                 float64
 3   funded_amnt_inv             float64
 4   term                        object 
 5   int_rate                    object 
 6   installment                 float64
 7   grade                       object 
 8   sub_grade                   object 
 9   emp_title                   object 
 10  emp_length                  object 
 11  home_ownership              object 
 12  annual_inc                  float64
 13  verification_status         object 
 14  issue_d                     object 
 15  loan_status                 object 
 16  pymnt_plan                  object 
 17  url                         object 
 18  purpose                     object 
 19  title                 

In [17]:
df_all['term'].value_counts(normalize=True, dropna=False)

 36 months    0.704473
 60 months    0.295513
NaN           0.000014
Name: term, dtype: float64

In [18]:
df_all['loan_status'].value_counts(dropna=False)

Fully Paid            1347641
Current               1019024
Charged Off            330886
Late (31-120 days)      22894
In Grace Period         10858
Late (16-30 days)        4683
Issued                    206
Default                    48
NaN                        38
Name: loan_status, dtype: int64

In [19]:
(df_all.isnull().sum()/len(df_all)).sort_values(ascending=False).head(30)

mths_since_recent_inq         0.115178
emp_title                     0.088408
emp_length                    0.070199
num_tl_120dpd_2m              0.043221
mo_sin_old_il_acct            0.040447
bc_util                       0.014655
percent_bc_gt_75              0.014321
bc_open_to_buy                0.014146
mths_since_recent_bc          0.013462
pct_tl_nvr_dlq                0.010209
avg_cur_bal                   0.010189
mo_sin_old_rev_tl_op          0.010152
mo_sin_rcnt_rev_tl_op         0.010152
num_rev_accts                 0.010152
num_actv_rev_tl               0.010152
mo_sin_rcnt_tl                0.010152
total_rev_hi_lim              0.010152
tot_cur_bal                   0.010152
num_accts_ever_120_pd         0.010152
num_actv_bc_tl                0.010152
tot_coll_amt                  0.010152
num_tl_op_past_12m            0.010152
num_il_tl                     0.010152
num_op_rev_tl                 0.010152
num_rev_tl_bal_gt_0           0.010152
total_il_high_credit_limi