<a href="https://colab.research.google.com/github/chiragpipalia/p2p-lending-prediction/blob/main/p2p_lending.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pylab as pl
import yaml

**REF**
- https://github.com/fiddler-labs/p2p-lending-data/tree/master
- https://gist.github.com/lukemerrick/af14f5b498ddf3900ba77c7bd840fc8c
- https://github.com/nateGeorge/preprocess_lending_club_data


In [3]:
import pathlib
# define paths to all the files
data_dir = pathlib.Path('/content/drive/My Drive/Data/p2p_lending/')

feature_schema_yaml = data_dir / 'feature_schema.yaml'
label_schema_yaml = data_dir / 'label_schema.yaml'

train_feature_csv = data_dir / 'train' / 'train_features.csv.gz'
train_label_csv = data_dir / 'train' / 'train_labels.csv.gz'
test_feature_csv = data_dir / 'test' / 'test_features.csv.gz'
test_label_csv = data_dir / 'test' / 'test_labels.csv.gz'

In [4]:
# load the schemas
with feature_schema_yaml.open() as yaml_file:
    feature_schema = yaml.safe_load(yaml_file)
with label_schema_yaml.open() as yaml_file:
    label_schema = yaml.safe_load(yaml_file)


train_features = pd.read_csv(train_feature_csv, **feature_schema)
train_labels = pd.read_csv(train_label_csv, **label_schema)

test_features = pd.read_csv(test_feature_csv, **feature_schema)
test_labels = pd.read_csv(test_label_csv, **label_schema)

In [5]:
train_features.head()

Unnamed: 0_level_0,loan_amnt,emp_title,emp_length,home_ownership,annual_inc,desc,purpose,title,addr_state,dti,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,fico_range_midpoint
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
36805548,10400.0,Truck Driver Delivery Personel,8 years,MORTGAGE,58000.0,,credit_card,Credit card refinancing,CA,14.92,...,0.0,4.0,83.3,0.0,0.0,179407.0,15030.0,13000.0,11325.0,712.0
37662224,7650.0,Technical Specialist,< 1 year,RENT,50000.0,,debt_consolidation,Debt consolidation,AZ,34.81,...,0.0,2.0,100.0,0.0,0.0,82331.0,64426.0,4900.0,64031.0,687.0
37822187,9600.0,Admin Specialist,10+ years,RENT,69000.0,,debt_consolidation,Debt consolidation,NJ,25.81,...,0.0,3.0,100.0,0.0,0.0,52490.0,38566.0,21100.0,24890.0,682.0
37701596,10000.0,Investment Consultant,8 years,RENT,90000.0,,debt_consolidation,Debt consolidation,MI,8.44,...,0.0,0.0,100.0,0.0,0.0,24200.0,23723.0,21200.0,0.0,677.0
37800722,12975.0,Sales,10+ years,RENT,60000.0,,house,Home buying,FL,22.42,...,0.0,4.0,89.5,0.0,0.0,42943.0,17281.0,5500.0,27243.0,682.0


In [6]:
train_labels.head()

Unnamed: 0_level_0,loan_status,issue_d,zip_code_prefix,grade,sub_grade,installment,int_rate,collection_recovery_fee,recoveries,debt_settlement_flag,settlement_amount,settlement_date,settlement_percentage,settlement_status,settlement_term,total_pymnt,total_pymnt_inv,total_rec_int,total_rec_late_fee,total_rec_prncp
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
36805548,Charged Off,2014-12-01,937,A,A3,321.08,6.99,93.8286,521.27,N,,NaT,,,,6611.69,6611.69,872.67,0.0,5217.75
37662224,Charged Off,2014-12-01,850,C,C3,260.2,13.66,222.8382,1237.99,N,,NaT,,,,2281.98,2281.98,339.61,0.0,704.38
37822187,Fully Paid,2014-12-01,77,C,C3,326.53,13.66,0.0,0.0,N,,NaT,,,,9973.43,9973.43,373.43,0.0,9600.0
37701596,Charged Off,2014-12-01,483,B,B5,332.1,11.99,0.0,0.0,N,,NaT,,,,6957.45,6957.45,1562.16,0.0,5395.29
37800722,Charged Off,2014-12-01,331,D,D5,468.17,17.86,272.799,1515.55,N,,NaT,,,,5746.89,5746.89,1603.2,0.0,2628.14


In [8]:
train_features.columns

Index(['loan_amnt', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc',
       'desc', 'purpose', 'title', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'collections_12_mths_ex_med',
       'mths_since_last_major_derog', 'acc_now_delinq', 'tot_coll_amt',
       'tot_cur_bal', 'total_rev_hi_lim', 'acc_open_past_24mths',
       'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths',
       'delinq_amnt', 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op',
       'mo_sin_rcnt_tl', 'mort_acc', 'mths_since_recent_bc',
       'mths_since_recent_inq', 'mths_since_recent_revol_delinq',
       'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl',
       'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
       'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m',
       'num_tl_30d

In [10]:
train_labels['loan_status'].value_counts(normalize=True)

Fully Paid     0.864994
Charged Off    0.135006
Name: loan_status, dtype: float64

### Missing Data
1. Dropping columns with more than 50% missing data
2. Columns with datatype object, string or datetime

In [11]:
most_missing_cols = train_features.isna().mean().sort_values(ascending=False)
most_missing_cols.loc[most_missing_cols > 0.5]
drop_most_missing_cols = most_missing_cols.index[:4]

In [12]:
train_features.drop(columns = drop_most_missing_cols, inplace = True)

In [13]:
drop_features = train_features.select_dtypes(include = ["object", "string", "datetime64"])
train_features.drop(columns = drop_features.columns, inplace = True)
print(f"{drop_features.columns}")

Index(['emp_title', 'title', 'earliest_cr_line'], dtype='object')


Dropping from test data frame

In [14]:
test_features.drop(columns = drop_most_missing_cols, inplace = True)
test_features.drop(columns = drop_features.columns, inplace = True)

In [15]:
train_y = train_labels['loan_status'].map({"Fully Paid": 0, "Charged Off": 1 })
test_y = test_labels['loan_status'].map({"Fully Paid": 0, "Charged Off": 1 })

In [16]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
from xgboost import XGBClassifier

Fitting model without feature selection and hyperparameter tuning as a baseline score

In [17]:
clf = XGBClassifier(enable_categorical=True)
clf.fit(train_features, train_y)

In [18]:
y_proba = clf.predict_proba(train_features)
print(f"Train AUC: {roc_auc_score(train_y, y_proba[:,1])}")

Train AUC: 0.7922626391028673


In [21]:
y_proba = clf.predict_proba(test_features)
print(f"Test AUC: {roc_auc_score(test_y, y_proba[:,1])}")

Test AUC: 0.6762464893714447


**Default Model Performance** <br>
Model is overfitting, we can tune hyperparameters to fix that.
Steps to final model.
1. Feature Selection using RFE
2. Hyperparameter tuning
3. Performance measure on OOV and digging more into output

In [None]:
train_y.value_counts(normalize = True), test_y.value_counts(normalize = True)

(0    0.864994
 1    0.135006
 Name: loan_status, dtype: float64,
 0    0.850456
 1    0.149544
 Name: loan_status, dtype: float64)

In [22]:
numeric_features = train_features.select_dtypes(include=[np.number]).columns
categorical_features = train_features.select_dtypes(exclude=[np.number]).columns

# Create transformers for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create the final pipeline with XGBoost classifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier())
])

In [27]:
transformed_x = preprocessor.fit_transform(train_features)
transformed_x_test = preprocessor.transform(test_features)

In [25]:
clf = XGBClassifier(enable_categorical=True, eval_metric = 'logloss', objective = 'binary:logistic')
clf.fit(transformed_x, train_y)

In [26]:
y_proba = clf.predict_proba(transformed_x)
print(f"Train AUC: {roc_auc_score(train_y, y_proba[:,1])}")

Train AUC: 0.7677462557077601


In [28]:
y_proba = clf.predict_proba(transformed_x_test)
print(f"Test AUC: {roc_auc_score(test_y, y_proba[:,1])}")

Test AUC: 0.6862229689179816


In [38]:
pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.3 colorama-0.4.6


In [64]:
train_cv_X, test_cv_X, train_cv_y, test_cv_y = train_test_split(train_features, train_y, test_size=0.33)

In [39]:
from bayes_opt import BayesianOptimization

In [71]:
def xgb_cv(max_depth, learning_rate, min_child_weight, n_estimators):
    params = {
    'objective': 'binary:logistic',
    'max_depth' : int(max_depth),
    'learning_rate' : np.round(learning_rate, 2),
    'min_child_weight' : np.round(min_child_weight, 0),
    'n_estimators' : int(n_estimators)
    }

    model = XGBClassifier(**params, n_jobs=15, tree_method= 'hist', use_label_encoder=False, eval_metric = 'logloss', enable_categorical=True)
    transformed_x_train = preprocessor.fit_transform(train_cv_X)
    transformed_x_test = preprocessor.transform(test_cv_X)
    model.fit(transformed_x_train, train_cv_y)
    roc_auc = roc_auc_score(test_cv_y, model.predict_proba(transformed_x_test)[:,1])
    return roc_auc


In [72]:
pbounds = {
      'max_depth': (2, 5),
      'learning_rate': (0.01, 0.1),
      'min_child_weight': (50, 100),
      'n_estimators': (100, 500)
  }

optimizer = BayesianOptimization(f=xgb_cv, pbounds=pbounds, random_state=0)
#optimizer.probe(opt_param, lazy = True)
optimizer.maximize(init_points=1, n_iter=5)
best_opt_params = optimizer.max['params']
print(best_opt_params)


|   iter    |  target   | learni... | max_depth | min_ch... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.6829   [0m | [0m0.05939  [0m | [0m4.146    [0m | [0m80.14    [0m | [0m318.0    [0m |
| [95m2        [0m | [95m0.683    [0m | [95m0.06367  [0m | [95m4.411    [0m | [95m83.37    [0m | [95m305.0    [0m |
| [0m3        [0m | [0m0.6667   [0m | [0m0.05021  [0m | [0m2.392    [0m | [0m81.97    [0m | [0m170.5    [0m |
| [95m4        [0m | [95m0.6845   [0m | [95m0.08023  [0m | [95m3.123    [0m | [95m99.8     [0m | [95m486.6    [0m |
| [0m5        [0m | [0m0.6731   [0m | [0m0.03066  [0m | [0m2.308    [0m | [0m99.87    [0m | [0m487.8    [0m |
| [0m6        [0m | [0m0.6841   [0m | [0m0.08614  [0m | [0m4.225    [0m | [0m67.22    [0m | [0m328.5    [0m |
{'learning_rate': 0.08023000150480938, 'max_depth': 3.1232030664249404, 'min_child_weight': 99.7962840402414

In [75]:
best_opt_params

{'learning_rate': 0.08023000150480938,
 'max_depth': 3.1232030664249404,
 'min_child_weight': 99.79628404024143,
 'n_estimators': 486.5749947491004}

In [77]:
best_opt_params_conv = dict()
best_opt_params_conv['max_depth'] = int(best_opt_params['max_depth'])
best_opt_params_conv['learning_rate'] = np.round(best_opt_params['learning_rate'], 2)
best_opt_params_conv['min_child_weight'] = int(best_opt_params['min_child_weight'])
best_opt_params_conv['n_estimators'] = int(best_opt_params['n_estimators'])
best_opt_params_conv

{'max_depth': 3,
 'learning_rate': 0.08,
 'min_child_weight': 99,
 'n_estimators': 486}

In [78]:
#best_opt_params
model = XGBClassifier(**best_opt_params_conv, n_jobs=15, tree_method= 'hist', use_label_encoder=False, eval_metric = 'logloss', enable_categorical=True)
transformed_x_train = preprocessor.fit_transform(train_cv_X)
transformed_x_test = preprocessor.transform(test_cv_X)
model.fit(transformed_x_train, train_cv_y)

roc_auc_train = roc_auc_score(train_cv_y, model.predict_proba(transformed_x_train)[:,1])
roc_auc_test = roc_auc_score(test_cv_y, model.predict_proba(transformed_x_test)[:,1])
print(f"Train AUC: {roc_auc_train} , Test AUC: {roc_auc_test}")

Train AUC: 0.7064321162800347 , Test AUC: 0.6842269052458483


In [80]:
transformed_x_test_oov = preprocessor.transform(test_features)
y_proba_oov = model.predict_proba(transformed_x_test_oov)
roc_auc_test_oov = roc_auc_score(test_y, y_proba_oov[:,1])
print(f"Test OOV AUC: {roc_auc_test_oov}")

Test OOV AUC: 0.6913803957516982
