# Pipelines and exploring the lending club data

1. Cool new EDA tools
1. Proj set up - whys 
1. Setting up a pipeline (why?!?!?!?! :( )) to deal with data issues
1. **The question: Can we predict which loans will default? Credit modeling**



In [10]:
# import lots of functions
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline 
from sklearn.impute import SimpleImputer
from df_after_transform import df_after_transform
from sklearn.model_selection import KFold, cross_validate, GridSearchCV


In [3]:
# load data 
loans = pd.read_csv('lendingclub/2013_subsample.zip')


In [5]:
# !pip install pandas-profiling
# https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/introduction.html

######################################
# WARNING DO THIS ON X_TRAIN+Y_TRAIN ONLY,
# DON'T DO EDA ON ALL DATA (IE DONT LEARN FROM TEST DATA!)
######################################

from pandas_profiling import ProfileReport

profile = ProfileReport(loans, 
                        title='Lending Club Profiling Report',
                        html={'style':{'full_width':True}}) 
profile.to_file("lending_club_INITIAL.html") # can take a minute or two with this dataset size. Let's look at the one I uploaded...

######################################
# EDA CONCLUSIONS - any vars that might go in the model?
# generally, ML models want no missing values, mean 0, std 1
######################################

# numerical - annual_income (heavily skewed), int_rate, loan_amnt, dti, 

# cat - public_rec_bankruptcies, grade 



In [7]:
# split to test and train (link to split page/sk docs)

# first let's separate y from X
y = loans.loan_status == 'Charged Off'
y.value_counts()
loans = loans.drop('loan_status',axis=1)

# stratify will make sure that test/train both have equal fractions of outcome
X_train, X_test, y_train, y_test = train_test_split(loans, y, stratify=y, test_size=.2, random_state=0)


In [None]:
## pre-modeling (on the training data only!)
# run ProfileReport here!!!!

# do lots of EDA
# look for missing values, which variables are what type, and outliers 
# figure out how you'd clean the data (imputation, scaling, encoding categorical vars)
# these lessons will go into the preprocessign portion of your pipeline 

In [15]:
## optimize a series of models 

# set up pipeline to clean each type of variable (1 pipe per var type)

numer_pipe = make_pipeline(SimpleImputer(),StandardScaler()) 
cat_pipe   = make_pipeline(OneHotEncoder(drop='first'))

# combine those pipes into "preprocess" pipe

preproc_pipe = ColumnTransformer(  
    [ # arg 1 of ColumnTransformer is a list, so this starts the list
    # a tuple for the numerical vars: name, pipe, which vars to apply to
    ("num_impute", numer_pipe, ['annual_inc']),
    # a tuple for the categorical vars: name, pipe, which vars to apply to
    ("cat_trans", cat_pipe, ['grade'])
    ]
    , remainder = 'drop' # you either drop or passthrough any vars not modified above
)

In [17]:
###########
# hot tip: check out what this preprocessing does before you continue!
###########

from df_after_transform import df_after_transform

preproc_df = df_after_transform(preproc_pipe,X_train)
print('There are {preproc_df.shape[1]} columns in the preprocessed data.')
preproc_df.describe().T.round(2)

There are {preproc_df.shape[1]} columns in the preprocessed data.


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
annual_inc,107843.0,-0.0,1.0,-1.36,-0.55,-0.19,0.32,121.55
grade_B,107843.0,0.33,0.47,0.0,0.0,0.0,1.0,1.0
grade_C,107843.0,0.28,0.45,0.0,0.0,0.0,1.0,1.0
grade_D,107843.0,0.15,0.36,0.0,0.0,0.0,0.0,1.0
grade_E,107843.0,0.07,0.25,0.0,0.0,0.0,0.0,1.0
grade_F,107843.0,0.03,0.18,0.0,0.0,0.0,0.0,1.0
grade_G,107843.0,0.01,0.08,0.0,0.0,0.0,0.0,1.0


In [12]:
# set up cv (can set up iterable to do OOS! or TimeSeriesSplit, or...)



# set up scoring 

# let's evaulate on "precision" today...https://ledatascifi.github.io/ledatascifi-2021/content/05/03d_whatToMax.html

In [20]:
## optimize candidate model type #1: 

#     set up pipeline (combines preprocessing, estimator)

logit_pipe = make_pipeline(preproc_pipe, LogisticRegression())
logit_pipe

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num_impute',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer()),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['annual_inc']),
                                                 ('cat_trans',
                                                  Pipeline(steps=[('onehotencoder',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['grade'])])),
                ('logisticregression', LogisticRegression())])

In [27]:
scores = cross_validate(logit_pipe,X_train,y_train,scoring='precision_micro',
               cv=10)

In [29]:
scores['test_score'].mean()  # a dictionary

0.8440418014840922

In [30]:
#     set up hyper param grid - what params in a pipeline do you want to change?
# a dictionary. keys are things to change in pipeline
# key: <stepname>__<parametername>

parameters =  {'logisticregression__C': [0.1,1,5]}

#     find optimal hyper params (gridsearchcv)

grid_search = GridSearchCV(estimator = logit_pipe, 
                           param_grid = parameters,
                           scoring='precision_micro'
                           )

results = grid_search.fit(X_train,y_train)

#     save pipeline with optimal params in place
#     (Note: you should spend time interrogating model predictions, plotting and printing.
#     Does the model struggle predicting certain obs? Excel at some?)

In [33]:
df = pd.DataFrame(results.cv_results_).set_index('params')
df

Unnamed: 0_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_logisticregression__C,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
{'logisticregression__C': 0.1},0.528666,0.01363,0.044998,0.006388,0.1,0.844035,0.844035,0.844035,0.844028,0.844075,0.844042,1.7e-05,1
{'logisticregression__C': 1},0.532052,0.027821,0.046999,0.005896,1.0,0.844035,0.844035,0.844035,0.844028,0.844075,0.844042,1.7e-05,1
{'logisticregression__C': 5},0.562557,0.036129,0.047199,0.004832,5.0,0.844035,0.844035,0.844035,0.844028,0.844075,0.844042,1.7e-05,1


In [None]:
## optimize candidate model type #2

# ...

## optimize candidate model type #N

## compare the N optimized models

# build list of models (each with own optimized hyperparams)
# for model in models:
#    cross_validate(model, X, y,...)
# pick the winner!