In [16]:
# import libraries 
import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt
from matplotlib import colorbar
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from pickle import dump, load

import warnings
# Silence all library warnings so they do not clutter the cell outputs.
warnings.filterwarnings("ignore")

# Show every column when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)

# Seed NumPy's global random number generator so stochastic steps repeat.
# The previous code only assigned an attribute called `random_state` on the
# numpy module; numpy never reads that attribute, so nothing was seeded.
np.random.seed(42)
# Retained only in case other cells read `np.random_state` as a constant;
# it has no effect on numpy itself.
np.random_state = 42

## Load in Dataset

In [36]:
# Read the cleaned dataset; the first CSV column becomes the row index.
df = pd.read_csv(
    'data/cleaned_df.csv',
    index_col=0,
)
# Preview the first rows.
df.head()

Unnamed: 0,funded_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,loan_status,purpose,addr_state,dti,delinq_2yrs,years_credit,fico_range_high,inq_last_6mths,mths_since_last_delinq,open_acc,revol_bal,revol_util,total_acc,application_type,open_act_il,open_il_24m,open_rv_24m,all_util,total_rev_hi_lim,inq_last_12m,acc_open_past_24mths,bc_open_to_buy,bc_util,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,total_bal_ex_mort,total_bc_limit
0,4000,36 months,11.99,132.84,8 years,RENT,45000.0,Fully Paid,debt_consolidation,AZ,0.67,0,12,744,1.0,0.0,7,1102,6.9,9,Individual,0.0,0.0,4.0,7.0,15900,5.0,4,6398.0,14.7,0,14.0,0.0,0.0,0,9,1,7,0,1,100.0,0.0,0,1102,7500
1,7200,36 months,11.47,237.33,< 1 year,RENT,85000.0,Current,debt_consolidation,OR,10.17,0,20,669,2.0,35.0,6,13964,90.1,10,Individual,2.0,0.0,2.0,68.0,15500,2.0,2,951.0,93.2,0,4.0,0.0,0.0,0,8,4,6,0,2,90.0,100.0,0,33021,14000
2,20000,36 months,16.29,706.01,8 years,MORTGAGE,56000.0,Current,debt_consolidation,FL,33.71,0,19,664,0.0,39.0,21,11198,49.1,33,Individual,7.0,1.0,11.0,73.0,22800,0.0,12,2074.0,78.6,3,14.0,0.0,0.0,0,20,12,21,0,4,96.6,75.0,2,54298,9700
3,16000,60 months,12.99,363.97,5 years,MORTGAGE,110000.0,Current,debt_consolidation,IL,20.53,0,36,674,1.0,35.0,14,40709,78.1,25,Individual,1.0,0.0,7.0,67.0,52100,2.0,7,1221.0,96.9,3,7.0,35.0,35.0,2,21,11,14,0,2,68.0,87.5,0,45733,39400
4,28000,60 months,15.31,670.69,3 years,MORTGAGE,180000.0,Current,debt_consolidation,TN,24.56,0,26,684,0.0,37.0,14,128213,96.0,33,Individual,5.0,2.0,2.0,76.0,127800,2.0,5,1138.0,97.8,4,10.0,37.0,37.0,0,14,7,14,0,5,93.9,100.0,0,273107,51800


In [37]:
# Summarise the frame: index range plus per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413177 entries, 0 to 438990
Data columns (total 45 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   funded_amnt                     413177 non-null  int64  
 1   term                            413177 non-null  object 
 2   int_rate                        413177 non-null  float64
 3   installment                     413177 non-null  float64
 4   emp_length                      413177 non-null  object 
 5   home_ownership                  413177 non-null  object 
 6   annual_inc                      413177 non-null  float64
 7   loan_status                     413177 non-null  object 
 8   purpose                         413177 non-null  object 
 9   addr_state                      413177 non-null  object 
 10  dti                             413177 non-null  float64
 11  delinq_2yrs                     413177 non-null  int64  
 12  years_credit    

## Preprocessing Features

#### Establish order of ordinal columns

In [38]:
# List every column label for reference.
df.columns

Index(['funded_amnt', 'term', 'int_rate', 'installment', 'emp_length',
       'home_ownership', 'annual_inc', 'loan_status', 'purpose', 'addr_state',
       'dti', 'delinq_2yrs', 'years_credit', 'fico_range_high',
       'inq_last_6mths', 'mths_since_last_delinq', 'open_acc', 'revol_bal',
       'revol_util', 'total_acc', 'application_type', 'open_act_il',
       'open_il_24m', 'open_rv_24m', 'all_util', 'total_rev_hi_lim',
       'inq_last_12m', 'acc_open_past_24mths', 'bc_open_to_buy', 'bc_util',
       'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_bc_dlq',
       'mths_since_recent_revol_delinq', 'num_accts_ever_120_pd',
       'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats',
       'num_tl_90g_dpd_24m', 'num_tl_op_past_12m', 'pct_tl_nvr_dlq',
       'percent_bc_gt_75', 'pub_rec_bankruptcies', 'total_bal_ex_mort',
       'total_bc_limit'],
      dtype='object')

In [41]:
# emp_length is the only ordinal feature. Its levels are listed from the
# shortest to the longest tenure so they can be encoded in that order.
emp_length_list = (
    ['< 1 year', '1 year']
    + [f'{years} years' for years in range(2, 10)]
    + ['10+ years']
)

In [43]:
# Store emp_length as a pandas categorical so that it can be told apart from
# the object (string) columns by dtype.
df['emp_length'] = pd.Categorical(df['emp_length'])

In [45]:
# Object-dtype columns are treated as nominal features and category-dtype
# columns as ordinal features.
# NOTE(review): the object columns include 'loan_status'; if that is the
# prediction target it probably should not be encoded with the features - confirm.
nominal_cols = df.select_dtypes(include='object').columns.tolist()
ordinal_cols = df.select_dtypes(include='category').columns.tolist()

In [49]:
# Show which columns landed in each categorical group, separated by a blank line.
print("Nominal Columns:", nominal_cols)
print()
# Label spelling corrected ("Oridinal" -> "Ordinal").
print("Ordinal Columns:", ordinal_cols)

Nominal Columns: ['term', 'home_ownership', 'loan_status', 'purpose', 'addr_state', 'application_type']

Oridinal Columns: ['emp_length']


### Building pipeline transformer

In [50]:
# Numeric features: a single StandardScaler step.
numeric_pipeline = Pipeline(steps=[
    ('numnorm', StandardScaler()),
])

# Ordinal features: OrdinalEncoder using the explicit level order given in
# emp_length_list.
ordinal_pipeline = Pipeline(steps=[
    ('ordinalenc', OrdinalEncoder(categories=[emp_length_list])),
])

# Nominal features: dense one-hot encoding with the first level of each
# feature dropped.
# NOTE(review): `sparse=` and the combination of `drop` with
# handle_unknown='ignore' are believed to be version-sensitive in
# scikit-learn - confirm against the installed release.
nominal_pipeline = Pipeline(steps=[
    ('onehotenc', OneHotEncoder(
        categories="auto",
        drop="first",
        sparse=False,
        handle_unknown='ignore',
    )),
])

In [53]:
# Numeric features are every int/float column of df.
numeric_cols = list(df.select_dtypes(['int', 'float']).columns)

# Construct the column transformer: each group of columns is routed through
# its own pipeline. The column selectors must be flat lists of column names;
# the previous [nominal_cols] / [ordinal_cols] wrapped the lists in another
# list, which is not a valid column specification and fails when fitting.
ct = ColumnTransformer(transformers=[
    ("nominalpipe", nominal_pipeline, nominal_cols),
    ("ordinalpipe", ordinal_pipeline, ordinal_cols),
    ("numericpipe", numeric_pipeline, numeric_cols),
])

In [55]:
# Save the column transformer. The `with` block guarantees the file handle is
# flushed and closed; the previous bare open(...) call never closed it.
# NOTE(review): no fit call is visible before this point, so the pickle
# appears to hold the unfitted configuration - confirm that is intended.
with open('pkl/column_transformer.pkl', 'wb') as pkl_file:
    dump(ct, pkl_file)