## Lending Club Pipeline Example

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Data located at cicdmodeldeployment
d1 = pd.read_csv('lending_club_selected_features_train.csv')
print(d1.shape)

d2 = pd.read_csv('lending_club_selected_features_test.csv')
print(d2.shape)

data = pd.concat([d1, d2])
print(data.shape)

(31391, 24)
(7848, 24)
(39239, 24)


In [3]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39239 entries, 0 to 7847
Data columns (total 24 columns):
loan_amnt               39239 non-null float64
term                    39239 non-null object
installment             39239 non-null float64
grade                   39239 non-null object
emp_length              38182 non-null object
home_ownership          39239 non-null object
annual_inc              39239 non-null float64
verification_status     39239 non-null object
target                  39239 non-null int64
purpose                 39239 non-null object
title                   39228 non-null object
addr_state              39239 non-null object
dti                     39239 non-null float64
delinq_2yrs             39239 non-null float64
earliest_cr_line        39239 non-null object
inq_last_6mths          39239 non-null float64
open_acc                39239 non-null float64
pub_rec                 39239 non-null float64
revol_bal               39239 non-null float64
revol_util

In [4]:
# Separate target from predictors
y = data.target
X = data.drop(['target', 'purpose', 'title', 'addr_state'], axis=1)

In [5]:
# Divide data into training and validation subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

In [6]:
# Select categorical columns
categorical_cols = [cname for cname in X_train.columns if X_train[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]

In [7]:
# Prepare the pipeline

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])   

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

In [8]:
# Build the model
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=25)

In [9]:
# Build the pipeline
lendingclub_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model) ])

# Preprocessing of training data, fit model 
lendingclub_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                               

In [10]:
# Make predictions
predictions = lendingclub_pipeline.predict(X_test)

In [11]:
# Get some metrics
import numpy as np

# Evaluate the model
print('Mean:', np.mean(predictions == y_test))

Mean: 0.8673547400611621
