# Lending Club Loan Data 

In [115]:
import os
import warnings

import numpy as np
import pandas as pd
import pandas_profiling
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

warnings.filterwarnings('ignore')

%matplotlib inline

## Part 1. Data Exploration and Evaluation

In this section, we load the loan data, fix the column data types, and identify the columns that are entirely null.

### Reading the data

In [83]:
# Columns that should be parsed as dates when the CSV is read
date_columns = [
    'issue_d',
    'last_pymnt_d',
    'next_pymnt_d',
    'last_credit_pull_d',
    'hardship_start_date',
    'hardship_end_date',
    'payment_plan_start_date',
    'debt_settlement_flag_date',
    'settlement_date',
]

In [84]:
# loading the data from csv file and parsing the date columns as dates
# The CSV location can be overridden with the LC_DATA_PATH environment variable so the
# notebook is not tied to a single machine; the original location stays the default.
data_path = os.environ.get('LC_DATA_PATH', '/Users/christina/Desktop/LC/data/loan.csv')
data = pd.read_csv(data_path, low_memory=False, parse_dates=date_columns)

In [85]:
# Preview the first rows of the raw frame to sanity-check the load
data.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,2500,2500,2500.0,36 months,13.56,84.92,C,C1,...,,,Cash,N,NaT,,NaT,,,
1,,,30000,30000,30000.0,60 months,18.94,777.23,D,D2,...,,,Cash,N,NaT,,NaT,,,
2,,,5000,5000,5000.0,36 months,17.97,180.69,D,D1,...,,,Cash,N,NaT,,NaT,,,
3,,,4000,4000,4000.0,36 months,18.94,146.51,D,D2,...,,,Cash,N,NaT,,NaT,,,
4,,,30000,30000,30000.0,60 months,16.14,731.78,C,C4,...,,,Cash,N,NaT,,NaT,,,


In [86]:
# (rows, columns) of the raw frame
data.shape

(2260668, 145)

### Fixing Data Types

In [87]:
# Number of columns per dtype as assigned at load time
data.dtypes.value_counts()

float64           105
object             27
datetime64[ns]      9
int64               4
dtype: int64

The dataset includes some numerical and some categorical variables. Let's group them.

We first group them based on the datatypes assigned during data loading (which included parsing the date variables). Then we manually examine the variables and move the incorrectly assigned fields to the correct categories (e.g. `url` and `policy_code` should be categorical variables, not numerical).

The data types are split into 3 categories: 
* numerical
* categorical
* datetime

In [102]:
# grouping variables based on existing datatypes
numerical = []
categorical = []
integers = []  # never populated by this cell; int64 columns are grouped with `numerical`
dates = []

for col in list(data.columns):
    col_dtype = data[col].dtype
    if col_dtype == np.float64 or col_dtype == np.int64:
        numerical.append(col)
    # Compare against the builtin `object`: the `np.object` alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    elif col_dtype == object:
        categorical.append(col)
    else:
        # catch-all: anything that is neither float64/int64 nor object
        # (the parsed datetime columns end up here)
        dates.append(col)

In [110]:
# Manually reviewed list of numerical columns.
# The order of this list is the column order used when the transformed frame is built.
numerical = [
    'loan_amnt',
    'funded_amnt',
    'revol_bal',
    'funded_amnt_inv',
    'int_rate',
    'installment',
    'annual_inc',
    'dti',
    'delinq_2yrs',
    'inq_last_6mths',
    'mths_since_last_delinq',
    'mths_since_last_record',
    'open_acc',
    'pub_rec',
    'revol_util',
    'total_acc',
    'out_prncp',
    'out_prncp_inv',
    'total_pymnt',
    'total_pymnt_inv',
    'total_rec_prncp',
    'total_rec_int',
    'total_rec_late_fee',
    'recoveries',
    'collection_recovery_fee',
    'last_pymnt_amnt',
    'collections_12_mths_ex_med',
    'mths_since_last_major_derog',
    'annual_inc_joint',
    'dti_joint',
    'acc_now_delinq',
    'tot_coll_amt',
    'tot_cur_bal',
    'open_acc_6m',
    'open_act_il',
    'open_il_12m',
    'open_il_24m',
    'mths_since_rcnt_il',
    'total_bal_il',
    'il_util',
    'open_rv_12m',
    'open_rv_24m',
    'max_bal_bc',
    'all_util',
    'total_rev_hi_lim',
    'inq_fi',
    'total_cu_tl',
    'inq_last_12m',
    'acc_open_past_24mths',
    'avg_cur_bal',
    'bc_open_to_buy',
    'bc_util',
    'chargeoff_within_12_mths',
    'delinq_amnt',
    'mo_sin_old_il_acct',
    'mo_sin_old_rev_tl_op',
    'mo_sin_rcnt_rev_tl_op',
    'mo_sin_rcnt_tl',
    'mort_acc',
    'mths_since_recent_bc',
    'mths_since_recent_bc_dlq',
    'mths_since_recent_inq',
    'mths_since_recent_revol_delinq',
    'num_accts_ever_120_pd',
    'num_actv_bc_tl',
    'num_actv_rev_tl',
    'num_bc_sats',
    'num_bc_tl',
    'num_il_tl',
    'num_op_rev_tl',
    'num_rev_accts',
    'num_rev_tl_bal_gt_0',
    'num_sats',
    'num_tl_120dpd_2m',
    'num_tl_30dpd',
    'num_tl_90g_dpd_24m',
    'num_tl_op_past_12m',
    'pct_tl_nvr_dlq',
    'percent_bc_gt_75',
    'pub_rec_bankruptcies',
    'tax_liens',
    'tot_hi_cred_lim',
    'total_bal_ex_mort',
    'total_bc_limit',
    'total_il_high_credit_limit',
    'revol_bal_joint',
    'sec_app_inq_last_6mths',
    'sec_app_mort_acc',
    'sec_app_open_acc',
    'sec_app_revol_util',
    'sec_app_open_act_il',
    'sec_app_num_rev_accts',
    'sec_app_chargeoff_within_12_mths',
    'sec_app_collections_12_mths_ex_med',
    'sec_app_mths_since_last_major_derog',
    'deferral_term',
    'hardship_amount',
    'hardship_length',
    'hardship_dpd',
    'orig_projected_additional_accrued_interest',
    'hardship_payoff_balance_amount',
    'hardship_last_payment_amount',
    'settlement_amount',
    'settlement_percentage',
    'settlement_term',
]

In [118]:
# Manually reviewed list of categorical columns (identifiers, free text and codes
# are treated as categories rather than numbers).
categorical = [
    'id',
    'member_id',
    'term',
    'grade',
    'url',
    'sub_grade',
    'emp_title',
    'emp_length',
    'home_ownership',
    'verification_status',
    'loan_status',
    'pymnt_plan',
    'desc',
    'purpose',
    'title',
    'zip_code',
    'addr_state',
    'earliest_cr_line',
    'initial_list_status',
    'application_type',
    'verification_status_joint',
    'sec_app_earliest_cr_line',
    'hardship_flag',
    'hardship_type',
    'hardship_reason',
    'hardship_status',
    'hardship_loan_status',
    'disbursement_method',
    'debt_settlement_flag',
    'settlement_status',
    'policy_code',
]

In [112]:
# Columns collected by the catch-all branch of the dtype grouping loop
dates 

['issue_d',
 'last_pymnt_d',
 'next_pymnt_d',
 'last_credit_pull_d',
 'hardship_start_date',
 'hardship_end_date',
 'payment_plan_start_date',
 'debt_settlement_flag_date',
 'settlement_date']

Now we will create two data pipelines to ensure the correct datatypes for numerical and categorical variables. 

In [142]:
def select_categorical_features(X):
    """Return the columns of ``X`` named in the module-level ``categorical`` list."""
    return X.loc[:, categorical]

def select_numerical_features(X):
    """Return the columns of ``X`` named in the module-level ``numerical`` list."""
    return X.loc[:, numerical]

def ensure_numerical_data_type(X):
    """Return a copy of ``X`` with every column cast to ``float``."""
    return X.astype(float)

def ensure_categorical_data_type(X):
    """Return a copy of ``X`` with every column cast to ``str``.

    NOTE(review): missing values are cast as well, so they may come out as
    text (e.g. 'nan') and stop registering as null; confirm this is intended.
    """
    return X.astype(str)

# Numeric branch: pick the numerical columns, then cast them to float
numeric_feature_pipeline = Pipeline(
    steps=[
        ('select', FunctionTransformer(func=select_numerical_features, validate=False)),
        ('datatype', FunctionTransformer(func=ensure_numerical_data_type, validate=False)),
    ]
)

# Categorical branch: pick the categorical columns, then cast them to str
categorical_pipeline = Pipeline(
    steps=[
        ('select', FunctionTransformer(func=select_categorical_features, validate=False)),
        ('datatype', FunctionTransformer(func=ensure_categorical_data_type, validate=False)),
    ]
)

# Union of both branches; categorical output comes first, numeric second
fu = FeatureUnion(
    transformer_list=[
        ('categorical', categorical_pipeline),
        ('numeric', numeric_feature_pipeline),
    ]
)

In [144]:
# Run each branch on its own and join the resulting DataFrames column-wise.
# Going through the FeatureUnion stacks the branches into a single NumPy array, which
# mixes strings and floats and so yields object-typed columns when the frame is rebuilt,
# undoing the float cast. pd.concat keeps each branch's dtypes and column names.
# Column order is unchanged: categorical, then numerical, then the date columns.
data_transformed = pd.concat(
    [
        categorical_pipeline.fit_transform(data),
        numeric_feature_pipeline.fit_transform(data),
        data[dates],
    ],
    axis=1,
)

In [153]:
# Preview the first rows of the transformed frame
data_transformed.head()

Unnamed: 0,id,member_id,term,grade,url,sub_grade,emp_title,emp_length,home_ownership,verification_status,...,settlement_term,issue_d,last_pymnt_d,next_pymnt_d,last_credit_pull_d,hardship_start_date,hardship_end_date,payment_plan_start_date,debt_settlement_flag_date,settlement_date
0,,,36 months,C,,C1,Chef,10+ years,RENT,Not Verified,...,,2018-12-01,2019-02-01,2019-03-01,2019-02-01,NaT,NaT,NaT,NaT,NaT
1,,,60 months,D,,D2,Postmaster,10+ years,MORTGAGE,Source Verified,...,,2018-12-01,2019-02-01,2019-03-01,2019-02-01,NaT,NaT,NaT,NaT,NaT
2,,,36 months,D,,D1,Administrative,6 years,MORTGAGE,Source Verified,...,,2018-12-01,2019-02-01,2019-03-01,2019-02-01,NaT,NaT,NaT,NaT,NaT
3,,,36 months,D,,D2,IT Supervisor,10+ years,MORTGAGE,Source Verified,...,,2018-12-01,2019-02-01,2019-03-01,2019-02-01,NaT,NaT,NaT,NaT,NaT
4,,,60 months,C,,C4,Mechanic,10+ years,MORTGAGE,Not Verified,...,,2018-12-01,2019-02-01,2019-03-01,2019-02-01,NaT,NaT,NaT,NaT,NaT


In [154]:
# Sanity check: the transformed frame must have the same number of rows and columns as the raw frame
assert data_transformed.shape == data.shape

### Removing Null Columns

In this section we will remove the columns that have 100% null values

In [None]:
# fraction of missing values per column, sorted from most to least missing
missing = data.isnull().sum().sort_values(ascending=False) / data.shape[0]
# collect the columns that are 100% missing (nothing is dropped in this cell)
drop_columns = missing[missing == 1].index.tolist()
drop_columns