In [None]:
import pandas as pd
import numpy as np
import time
from datetime import datetime as dt
import matplotlib.pyplot as plt
import math
import seaborn as sns
sns.set()

pd.set_option('display.max_columns', None)


In [None]:
# Load the raw loan data once into df_backup.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
df_backup = pd.read_csv("/Users/admin/Downloads/LGD/PD_LGD_EAD_model/loan_data_2007_2014.csv")

In [None]:
# Work on a copy so the frame loaded into df_backup is not modified by the preprocessing cells.
df = df_backup.copy()

In [None]:
# Preview the first rows of the working frame.
df.head()

GENERAL PREPROCESSING

Preprocessing few continuous variables

We have to strip the non-numeric characters from these columns and convert the remaining values to numbers.

In [None]:
# Convert the text columns `emp_length` and `term` to numbers, overwriting them in place.
# The work is done on local Series, so no helper columns are added to df.
# regex=False: every pattern is literal text; '+ years' is not a valid regular
# expression and fails on pandas versions where str.replace defaults to regex=True.
emp_length_text = df['emp_length']
for token in ('+ years', '< ', ' years', ' year'):
    emp_length_text = emp_length_text.str.replace(token, '', regex=False)
# NOTE(review): stripping '< ' turns "< 1 year" into 1, the same value as "1 year" —
# confirm that merging the two is intended.
df['emp_length'] = pd.to_numeric(emp_length_text)

term_text = df['term']
for token in (' months', ' '):
    term_text = term_text.str.replace(token, '', regex=False)
df['term'] = pd.to_numeric(term_text)

In [None]:
# Age of each loan and of each borrower's credit history, measured back from a
# fixed reference date, in days (timedelta) and in whole weeks (rounded float).
curr_date = pd.to_datetime('2017-12-01')
one_week = np.timedelta64(1, 'W')

df['issue_d'] = pd.to_datetime(df['issue_d'], format='%b-%y')
issue_age = curr_date - df['issue_d']
df['issue_d_days'] = issue_age
df['issue_d_weeks'] = round(issue_age / one_week)

df['earliest_cr_line'] = pd.to_datetime(df['earliest_cr_line'], format='%b-%y')
# '%y' maps two-digit years 00-68 to 2000-2068, so credit lines opened before 1969
# are parsed into the future and would produce negative ages. A credit line cannot
# start after the reference date, so move those dates back by one century.
parsed_into_future = df['earliest_cr_line'] > curr_date
df.loc[parsed_into_future, 'earliest_cr_line'] = (
    df.loc[parsed_into_future, 'earliest_cr_line'] - pd.DateOffset(years=100)
)
credit_age = curr_date - df['earliest_cr_line']
df['earliest_cr_line_days'] = credit_age
df['earliest_cr_line_weeks'] = round(credit_age / one_week)
df['earliest_cr_line_weeks']

In [None]:
# df.loc[:, ['earliest_cr_line', 'earliest_cr_line_weeks', 'earliest_cr_line_days']][df['earliest_cr_line_days'] < 0]
# Summary statistics of the credit-history age; a negative minimum indicates mis-parsed dates.
df['earliest_cr_line_weeks'].describe()

In [None]:
# Replace negative credit-history ages (impossible values) with the largest observed age.
# A single .loc assignment is used: fillna(..., inplace=True) returns None, so assigning
# its result would blank the selected rows instead of fixing them.
# Missing values are left as they are; the missing-value cell further down fills them.
negative_age = df['earliest_cr_line_weeks'] < 0
df.loc[negative_age, 'earliest_cr_line_weeks'] = df['earliest_cr_line_weeks'].max()

In [None]:
# Summary of the parsed issue dates.
df['issue_d'].describe()

Preprocessing Few Discrete Variables

In [None]:
def create_dummy_vars(df, cols):
    """One-hot encode each column in `cols` and return the dummies side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode.
    cols : iterable of column labels
        Columns of `df` to encode, in the order the dummies should appear.

    Returns
    -------
    pandas.DataFrame
        Dummy columns named ``<col>_<value>``, indexed like `df`.
        An empty DataFrame when `cols` is empty.
    """
    encoded = [pd.get_dummies(df[col], prefix=col, prefix_sep='_') for col in cols]
    if not encoded:
        # pd.concat raises on an empty list; keep returning an empty frame instead.
        return pd.DataFrame()
    # One concat at the end instead of re-copying a growing frame on every iteration.
    return pd.concat(encoded, axis=1)
# Columns to one-hot encode.
# NOTE(review): 'loan_status' is the column the target `loan_good_bad` is derived from,
# so its dummies leak the target if they are ever used as model features — confirm.
categorical_cols = ['grade', 'sub_grade', 'home_ownership', 'verification_status', 'loan_status', 'purpose', 'addr_state', 'initial_list_status']
df_dummies = create_dummy_vars(df, categorical_cols)
# Drop the index column written by the CSV export. Reassigning with errors='ignore'
# keeps this cell re-runnable: an in-place drop raises KeyError on the second run.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Append the dummy columns to the working frame.
# NOTE: not idempotent — running this cell again appends the same dummy columns a second time.
df = pd.concat([df, df_dummies], axis=1)

Check for missing values and Clean


In [None]:
# Impute missing annual income with the column mean. The result is assigned back:
# fillna(inplace=True) on the selection df['annual_inc'] operates on an intermediate
# object and, depending on the pandas version, may never reach df.
# NOTE(review): the mean is computed before the train/test split — confirm that is acceptable.
df['annual_inc'] = df['annual_inc'].fillna(df['annual_inc'].mean())
# Missing values in these columns are replaced with 0.
cols = ['earliest_cr_line_weeks', 'acc_now_delinq', 'total_acc', 'pub_rec', 'open_acc', 'inq_last_6mths', 'delinq_2yrs', 'emp_length']
df[cols] = df[cols].fillna(0)

In [None]:
# Percentage of loans in each status (counts divided by the number of non-null statuses).
(df['loan_status'].value_counts()/df['loan_status'].count())*100

In [None]:
# Binary target: 0 for the loan statuses listed below, 1 for every other status.
bad_loan_statuses = [
    'Charged Off',
    'Default',
    'Does not meet the credit policy. Status:Charged Off',
    'Late (31-120 days)',
]
is_bad_loan = df['loan_status'].isin(bad_loan_statuses)
df['loan_good_bad'] = np.where(is_bad_loan, 0, 1)

In [None]:
# Inspect the target column created above (1 = good, 0 = bad).
df['loan_good_bad']

Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the test split; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['loan_good_bad']),
    df['loan_good_bad'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the shape of each split, in the order train/test features, then train/test labels.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

DATA PREPARATION

In [89]:
# Choose which split the feature-engineering cells below operate on.
# NOTE(review): this is a manual switch. As written, the *test* split is processed under the
# names df_features_train / df_targets_train; the train split is only processed if the
# commented line is swapped in and the cells are re-run. A single top-to-bottom run therefore
# leaves X_train without the columns added below.
# df_features_train, df_targets_train = X_train, y_train
df_features_train, df_targets_train = X_test, y_test


In [90]:
def check_WoE_IV(df, target, col):
    """Weight of Evidence (WoE) and Information Value (IV) per category of `col`.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame containing `col`.
    target : pandas.Series
        Binary target aligned with `df`; its group mean is reported as `prop_good`,
        i.e. 1 is treated as a good loan.
    col : column label
        Discrete column whose categories are evaluated.

    Returns
    -------
    pandas.DataFrame
        One row per category, sorted by ascending WoE, with the columns
        `col`, n_obs, prop_good, prop_n_obs, n_good, n_bad, prop_<col>_good,
        prop_<col>_bad, WoE, IV (total IV repeated on every row) and WoE_diff
        (absolute WoE gap to the previous row of the sorted table).

    Notes
    -----
    A category with no good or no bad observations yields an infinite WoE, which
    also makes the IV infinite. Prints the IV as a side effect.
    """
    paired = pd.concat([df[col], target], axis=1)
    group_key, label = paired.columns.values[0], paired.columns.values[1]
    # Single grouped pass: number of observations and share of good loans per category.
    df1 = (paired.groupby(group_key)[label]
                 .agg(n_obs='count', prop_good='mean')
                 .reset_index())
    df1['prop_n_obs'] = df1['n_obs'] / df1['n_obs'].sum()
    df1['n_good'] = df1['n_obs'] * df1['prop_good']
    df1['n_bad'] = df1['n_obs'] * (1 - df1['prop_good'])
    df1[f'prop_{col}_good'] = df1['n_good'] / df1['n_good'].sum()
    df1[f'prop_{col}_bad'] = df1['n_bad'] / df1['n_bad'].sum()
    df1['WoE'] = np.log(df1[f'prop_{col}_good'] / df1[f'prop_{col}_bad'])
    iv = ((df1[f'prop_{col}_good'] - df1[f'prop_{col}_bad']) * df1['WoE']).sum()
    df1['IV'] = iv
    # Print the scalar rather than the first row so an empty table does not raise.
    print(f"IV for {col} is ", iv)
    # Sort first, then difference: WoE_diff must compare neighbours of the sorted table.
    df1 = df1.sort_values(by='WoE')
    df1['WoE_diff'] = df1['WoE'].diff().abs()
    return df1

# df_WoE = check_WoE_IV(df_features_train, df_targets_train, 'home_ownership')
# print(df_WoE)

def plotByWoE(df_WoE, rotate_x=0, figsize=(8,6)):
    """Draw the Weight of Evidence of each category as a dashed line plot.

    Parameters
    ----------
    df_WoE : pandas.DataFrame
        Table whose first column holds the categories and which has a 'WoE' column.
    rotate_x : int or float, default 0
        Rotation of the x tick labels, in degrees.
    figsize : tuple, default (8, 6)
        Size passed to ``plt.figure``.
    """
    category_col = df_WoE.columns.values[0]
    # Categories are turned into strings so they are plotted as discrete labels.
    labels = np.array(df_WoE.iloc[:, 0].apply(str))
    woe_values = df_WoE['WoE']

    plt.figure(figsize=figsize)
    plt.plot(labels, woe_values, marker='o', linestyle='--', color='b')
    plt.xlabel(category_col)
    plt.ylabel('Weight of Evidence')
    plt.title('weight of Evidence of ' + str(category_col))
    plt.xticks(rotation=rotate_x)

# plotByWoE(df_WoE, rotate_x=45)

The WoE table shows that the OTHER and NONE categories of home_ownership are associated with default (PD), but they contain very few observations. We do not want to lose them, so we combine them with similar categories into one new dummy variable, which is the category most strongly associated with PD.

In [91]:
# categorical_cols = ['grade', 'sub_grade', 'home_ownership', 'verification_status', 'purpose', 'addr_state', 'initial_list_status']
# other_cols = categorical_cols + \
#     ['earliest_cr_line_weeks_', 'acc_now_delinq_', 'total_acc_', 'pub_rec_', 'open_acc_', 'inq_last_6mths_', 'delinq_2yrs_', 'emp_length_'] + \
#     ['emp_length_', 'term_', 'issue_d_', 'int_rate_', 'annual_inc_', 'mths_since_last_delinq_', 'dti_', 'mths_since_last_record_']

# for col in categorical_cols:
#     df_woe = check_WoE_IV(df_features_train, df_targets_train, col)
#     plotByWoE(df_woe)

In [92]:
# df_woe = check_WoE_IV(df_features_train, df_targets_train, 'grade')
# df_woe

In [93]:
# main_category = df_features_train.filter(like='grade_')
# main_category = main_category.loc[:,~main_category.columns.str.contains('sub_grade')]
# reference_category = main_category.iloc[:,-1]
# main_category = main_category.iloc[:,:-1]
# reference_category

In [94]:
# Merge the RENT, OTHER, ANY and NONE home-ownership dummies into one indicator column.
merged_ownership = [
    'home_ownership_RENT',
    'home_ownership_OTHER',
    'home_ownership_ANY',
    'home_ownership_NONE',
]
df_features_train['home_ownership:RENT_OTHER_ANY_NONE'] = sum(
    df_features_train[dummy] for dummy in merged_ownership
)
df_features_train['home_ownership:RENT_OTHER_ANY_NONE']

362514    0
288564    0
213591    0
263083    0
165001    0
         ..
115       0
296284    1
61777     0
91763     0
167512    1
Name: home_ownership:RENT_OTHER_ANY_NONE, Length: 93257, dtype: int64

In [95]:
# main_category = pd.concat([main_category,df_features_train.filter(like='home_ownership')], axis=1)
# main_category
# main_category = main_category.loc[:,~main_category.columns.str.contains('sub_grade')]
# reference_category = main_category.iloc[:,-1]
# main_category = main_category.iloc[:,:-1]
# reference_category

In [96]:
# df_WoE = check_WoE_IV(df_features_train, df_targets_train, 'addr_state')
# len(df_WoE)
# Make sure the `addr_state_ND` dummy exists (all zeros when absent) so it can be summed later.
if 'addr_state_ND' not in df_features_train.columns:
    df_features_train['addr_state_ND'] = 0
# plotByWoE(df_WoE.iloc[6:-6,:], rotate_x=45)
# df_WoE


We can see that the first two rows and the last two rows have very few observations (n_obs), so we combine them. We also combine the next two worst categories with them, because their WoE does not differ much, and call the resulting category "worst".

In [97]:
# Merge the state dummies at both ends of the WoE ranking into a "worst" and a "best" indicator.
addr_state_extremes = {
    'addr_state_worst': ['ND', 'NV', 'IA', 'NE', 'FL', 'HI', 'AL'],
    'addr_state_best': ['DC', 'ME', 'ID', 'WY', 'NH', 'WV'],
}
for merged_name, state_codes in addr_state_extremes.items():
    df_features_train[merged_name] = sum(
        df_features_train[f'addr_state_{code}'] for code in state_codes
    )

We can now combine the other states by their individual WoE, while also taking into account the number of observations (n_obs) each has, merging those whose WoE does not deviate much. For example, NM, VA, NY, OK, TN, MO, LA, MD, NC and CA could be combined, but NY and CA have a considerably higher number of observations, so we combine the first two (NM, VA), keep NY on its own, combine the next six (OK, TN, MO, LA, MD, NC), keep CA on its own, and then repeat the same process for the remaining states.

In [98]:
# Merge the remaining state dummies into groups of similar WoE.
# Every state appears in exactly one group so the merged indicators stay mutually exclusive.
addr_state_groups = {
    'addr_state_NM_VA': ['NM', 'VA'],
    'addr_state_OK_TN_MO_LA_MD_NC': ['OK', 'TN', 'MO', 'LA', 'MD', 'NC'],
    # MD and NC already belong to the group above; listing them here as well counted
    # those states twice.
    # NOTE(review): this column used to be named 'addr_state_UT_KY_AZ_NJ_MD_NC' —
    # update any later reference to the old name.
    'addr_state_UT_KY_AZ_NJ': ['UT', 'KY', 'AZ', 'NJ'],
    'addr_state_AR_MI_PA_OH_MN': ['AR', 'MI', 'PA', 'OH', 'MN'],
    'addr_state_RI_MA_DE_SD_IN': ['RI', 'MA', 'DE', 'SD', 'IN'],
    'addr_state_GA_WA_OR': ['GA', 'WA', 'OR'],
    'addr_state_WI_MT': ['WI', 'MT'],
    'addr_state_IL_CT': ['IL', 'CT'],
    'addr_state_KS_SC_CO_VT_AK_MS': ['KS', 'SC', 'CO', 'VT', 'AK', 'MS'],
}
for group_col, state_codes in addr_state_groups.items():
    df_features_train[group_col] = sum(
        df_features_train[f'addr_state_{code}'] for code in state_codes
    )

In [99]:
# df_WoE = check_WoE_IV(df_features_train, df_targets_train, 'verification_status')
# # plotByWoE(df_WoE)
# df_features_train.filter(like='verification')
# df_WoE = check_WoE_IV(df_features_train, df_targets_train, 'purpose')
# df_WoE
# plotByWoE(df_WoE, rotate_x=45)
# df_features_train.filter(like='verification')

In [100]:
# Merge loan-purpose dummies into three combined indicator columns.
purpose_groups = {
    'purpose_worst': ['small_business', 'educational', 'moving', 'house'],
    'purpose_renewable_energy_medical_wedding_vacation': ['renewable_energy', 'medical', 'wedding', 'vacation'],
    'purpose_major_purchase_car': ['major_purchase', 'car'],
}
for group_col, purposes in purpose_groups.items():
    df_features_train[group_col] = sum(
        df_features_train[f'purpose_{purpose}'] for purpose in purposes
    )

In [101]:
# df_WoE = check_WoE_IV(df_features_train, df_targets_train, 'initial_list_status')
# df_WoE
# plotByWoE(df_WoE, rotate_x=45)

Preprocessing continuous variables

In [102]:
def check_WoE_IV_continuous(df, target, col):
    """Weight of Evidence and Information Value per value (or bin) of `col`.

    Rows are returned in group order (not sorted by WoE), so `WoE_diff` is the
    absolute WoE change between consecutive groups. The total IV is repeated on
    every row and printed as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame containing `col`.
    target : pandas.Series
        Binary target aligned with `df`; its group mean is reported as `prop_good`.
    col : column label
        Column to group by.

    Returns
    -------
    pandas.DataFrame
        Columns: `col`, n_obs, prop_good, prop_n_obs, n_good, n_bad,
        prop_<col>_good, prop_<col>_bad, WoE, IV, WoE_diff.
    """
    good_share = f'prop_{col}_good'
    bad_share = f'prop_{col}_bad'

    paired = pd.concat([df[col], target], axis=1)
    key_name, label_name = paired.columns.values[0], paired.columns.values[1]
    grouped = paired.groupby(key_name)[label_name]

    table = grouped.count().reset_index()
    table.columns = [key_name, 'n_obs']
    table['prop_good'] = grouped.mean().reset_index().iloc[:, 1]

    table['prop_n_obs'] = table['n_obs'] / table['n_obs'].sum()
    table['n_good'] = table['n_obs'] * table['prop_good']
    table['n_bad'] = table['n_obs'] * (1 - table['prop_good'])
    table[good_share] = table['n_good'] / table['n_good'].sum()
    table[bad_share] = table['n_bad'] / table['n_bad'].sum()
    table['WoE'] = np.log(table[good_share] / table[bad_share])

    contributions = (table[good_share] - table[bad_share]) * table['WoE']
    table['IV'] = contributions.sum()
    table['WoE_diff'] = table['WoE'].diff().abs()
    print(f"IV for {col} is ", table['IV'].iloc[0])
    return table

# plotByWoE(df_WoE, rotate_x=45)


In [103]:
# df_features_train['term'].unique()
# df_WoE = check_WoE_IV_continuous(df_features_train, df_targets_train, 'term')
# df_WoE
# plotByWoE(df_WoE, rotate_x=45)

In [104]:
# One indicator column per loan term (36 and 60 months).
for months in (36, 60):
    df_features_train[f'term_{months}'] = np.where(df_features_train['term'] == months, 1, 0)

In [105]:
# df_features_train['emp_length'].unique()
# df_WoE = check_WoE_IV_continuous(df_features_train, df_targets_train, 'emp_length')
# df_WoE
# plotByWoE(df_WoE, rotate_x=45)


In [106]:
# plotByWoE(df_WoE, rotate_x=45)
# 

In [107]:
# Indicator columns for groups of employment length; each dummy is 1 when
# `emp_length` equals one of the listed values.
emp_length_groups = {
    'emp_length_0': [0.0],
    'emp_length_1': [1.0],
    'emp_length_2_3_4': [2.0, 3.0, 4.0],
    'emp_length_5_6': [5.0, 6.0],
    'emp_length_7_8_9': [7.0, 8.0, 9.0],
    'emp_length_10': [10.0],
}
for dummy_name, lengths in emp_length_groups.items():
    df_features_train[dummy_name] = np.where(df_features_train['emp_length'].isin(lengths), 1, 0)


In [108]:
# len(df_features_train['issue_d_weeks'].unique())

In [109]:
# df_features_train['issue_d_weeks_categories'] = pd.cut(df_features_train['issue_d_weeks'], 50)
# df_features_train['issue_d_weeks_categories']

In [110]:
# df_woe = check_WoE_IV_continuous(df_features_train, df_targets_train, 'issue_d_weeks_categories')
# df_woe

In [111]:
# plotByWoE(df_woe, rotate_x= 90, figsize=(15,10))

In [112]:
# Indicator columns for ranges of loan age in weeks.
# Each entry is (column suffix, first week included, first week excluded).
# NOTE: the last bin stops at week 579; larger values get no indicator.
issue_week_bins = [
    ('156-165', 156, 165),
    ('165-173', 165, 173),
    ('173-180.5', 173, 181),
    ('181-211', 181, 211),
    ('211-235', 211, 235),
    ('235-391', 235, 391),
    ('>391', 391, 580),
]
for suffix, first_week, stop_week in issue_week_bins:
    in_bin = df_features_train['issue_d_weeks'].isin(range(first_week, stop_week))
    df_features_train[f'issue_d_weeks_{suffix}'] = np.where(in_bin, 1, 0)

In [113]:
# len(df_features_train['int_rate'].unique())
# df_features_train['int_rate_categories'] = pd.cut(df_features_train['int_rate'], 10)

In [114]:
# df_woe = check_WoE_IV_continuous(df_features_train, df_targets_train, 'int_rate_categories')
# df_woe

In [115]:
# plotByWoE(df_woe, rotate_x=90, figsize=(12,10))

In [116]:
# int_bins = df_features_train['int_rate_categories'].cat.categories

In [117]:
# Define the bins in this cell so it runs on a fresh kernel instead of relying on a
# variable left over from an earlier session.
# The 10 equal-width intervals are taken from the training split (X_train from
# train_test_split), so whichever split is processed gets the same bin edges.
# Values outside the training range fall in no bin and get 0 in every indicator.
int_bins = pd.cut(X_train['int_rate'], 10).cat.categories
for rate_bin in int_bins:
    col_name = f'int_rate_{rate_bin}'
    in_bin = (df_features_train['int_rate'] > rate_bin.left) & (df_features_train['int_rate'] <= rate_bin.right)
    df_features_train[col_name] = np.where(in_bin, 1, 0)

df_features_train.filter(like='int_rate')

Unnamed: 0,int_rate,"int_rate_(5.399, 7.484]","int_rate_(7.484, 9.548]","int_rate_(9.548, 11.612]","int_rate_(11.612, 13.676]","int_rate_(13.676, 15.74]","int_rate_(15.74, 17.804]","int_rate_(17.804, 19.868]","int_rate_(19.868, 21.932]","int_rate_(21.932, 23.996]","int_rate_(23.996, 26.06]"
362514,14.99,0,0,0,0,1,0,0,0,0,0
288564,20.99,0,0,0,0,0,0,0,1,0,0
213591,14.65,0,0,0,0,1,0,0,0,0,0
263083,14.49,0,0,0,0,1,0,0,0,0,0
165001,8.90,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
115,11.71,0,0,0,1,0,0,0,0,0,0
296284,10.15,0,0,1,0,0,0,0,0,0,0
61777,8.90,0,1,0,0,0,0,0,0,0,0
91763,8.90,0,1,0,0,0,0,0,0,0,0


In [118]:
# df_features_train['funded_amnt_categories'] = pd.cut(df_features_train['funded_amnt'], 20)

In [119]:
# df_woe = check_WoE_IV_continuous(df_features_train, df_targets_train, 'funded_amnt_categories')
# df_woe

In [120]:
# plotByWoE(df_woe, rotate_x=90)

We can see no clear pattern between the dependent variable (DV) and the independent variable (IDV), i.e. the funded amount, so we do not need to include this factor in the model.

In [121]:
# df_features_train['earliest_cr_line_weeks_categories'] = pd.cut(df_features_train['earliest_cr_line_weeks'], 10)

In [122]:
# df_woe = check_WoE_IV_continuous(df_features_train, df_targets_train, 'earliest_cr_line_weeks_categories')
# df_woe

In [123]:
# plotByWoE(df_woe)

In [124]:
# cr_line_bins = df_features_train['earliest_cr_line_weeks_categories'].cat.categories

In [125]:

# Define the bins in this cell so it runs on a fresh kernel instead of relying on a
# variable left over from an earlier session.
# The 10 equal-width intervals are taken from the training split (X_train from
# train_test_split), so whichever split is processed gets the same bin edges.
# Values outside the training range fall in no bin and get 0 in every indicator.
cr_line_bins = pd.cut(X_train['earliest_cr_line_weeks'], 10).cat.categories
for cr_line_bin in cr_line_bins:
    col_name = f'earliest_cr_line_weeks_{cr_line_bin}'
    in_bin = ((df_features_train['earliest_cr_line_weeks'] > cr_line_bin.left)
              & (df_features_train['earliest_cr_line_weeks'] <= cr_line_bin.right))
    df_features_train[col_name] = np.where(in_bin, 1, 0)

In [126]:
# df_features_train['installment'].unique()

In [127]:
# df_features_train['installment_categories'] = pd.cut(df_features_train['installment'], 10)

In [128]:
# df_woe = check_WoE_IV_continuous(df_features_train, df_targets_train, 'installment_categories')
# df_woe

In [129]:
# plotByWoE(df_woe)

Let's discard installment: it shows no clear pattern and its IV is close to 0.

In [130]:
# df_features_train['delinq_2yrs_cats'] = pd.cut(df_features_train['delinq_2yrs'], 100)
# df_woe = check_WoE_IV_continuous(df_features_train, df_targets_train, 'delinq_2yrs_cats')
# df_woe

In [131]:
# plotByWoE(df_woe)

In [132]:
# df_features_train['delinq_2yrs_<2'] = np.where(df_features_train['delinq_2yrs'] < 2, 1, 0)
# df_features_train['delinq_2yrs_>2'] = np.where(df_features_train['delinq_2yrs'] >= 2, 1, 0)

In [133]:
# df_features_train['inq_last_6mths_cats'] = pd.cut(df_features_train['inq_last_6mths'], 50)
# df_woe = check_WoE_IV_continuous(df_features_train, df_targets_train, 'inq_last_6mths_cats')
# df_woe

In [134]:
# plotByWoE(df_woe)

In [135]:
# Indicator columns for the number of credit inquiries in the last 6 months.
# The bins are [0, 1), [1, 2), [2, 3) and [3, inf). The last one uses >= so that exactly
# 3 inquiries land in the top bin; with a strict > they fell in no bin at all.
# The column name 'inq_last_6mths_>3' is kept unchanged for later cells.
inquiries = df_features_train['inq_last_6mths']
df_features_train['inq_last_6mths_<1'] = np.where(inquiries < 1, 1, 0)
df_features_train['inq_last_6mths_1-2'] = np.where((inquiries >= 1) & (inquiries < 2), 1, 0)
df_features_train['inq_last_6mths_2-3'] = np.where((inquiries >= 2) & (inquiries < 3), 1, 0)
df_features_train['inq_last_6mths_>3'] = np.where(inquiries >= 3, 1, 0)


In [136]:
# df_features_train['open_acc'].value_counts()

In [137]:
# df_features_train['open_acc_cats'] = pd.cut(df_features_train['open_acc'], 40)
# df_woe = check_WoE_IV_continuous(df_features_train, df_targets_train, 'open_acc_cats')
# pd.set_option('display.max_rows', None)
# df_woe.iloc[:-30,:]

In [138]:
# Indicator columns for the number of open accounts.
# The middle bins are (column suffix, first value included, first value excluded).
open_acc_counts = df_features_train['open_acc']
df_features_train['open_acc_<4'] = np.where(open_acc_counts < 4, 1, 0)
for suffix, first_count, stop_count in [('4-10', 4, 11), ('11-13', 11, 14), ('14-21', 14, 21)]:
    df_features_train[f'open_acc_{suffix}'] = np.where(
        open_acc_counts.isin(range(first_count, stop_count)), 1, 0
    )
df_features_train['open_acc_>21'] = np.where(open_acc_counts >= 21, 1, 0)


In [139]:
# df_features_train['total_acc_cats'] = pd.cut(df_features_train['total_acc'], 50)
# df_woe = check_WoE_IV_continuous(df_features_train, df_targets_train, 'total_acc_cats')
# df_woe.iloc[:-39,:]

In [140]:
# plotByWoE(df_woe)
df_features_train['total_acc_<9'] = np.where(df_features_train['total_acc'] < 30, 1, 0)
df_features_train['total_acc_9-15'] = np.where(df_features_train['total_acc'].isin(range(9,15)), 1, 0)
df_features_train['total_acc_15-21'] = np.where(df_features_train['total_acc'].isin(range(15,21)), 1, 0)
df_features_train['total_acc_21-24'] = np.where(df_features_train['total_acc'].isin(range(21,24)), 1, 0)
df_features_train['total_acc_24-33'] = np.where(df_features_train['total_acc'].isin(range(24,33)), 1, 0)
df_features_train['total_acc_>33'] = np.where(df_features_train['total_acc'] > 33, 1, 0)


In [141]:
# df_features_train['acc_now_delinq_cats'] = pd.cut(df_features_train['acc_now_delinq'], 5)
# df_woe = check_WoE_IV_continuous(df_features_train, df_targets_train, 'acc_now_delinq_cats')
# df_woe

In [142]:
# plotByWoE(df_woe)

This variable (acc_now_delinq) is also irrelevant, so no dummy variables are created for it.

In [143]:
# df_features_train['annual_inc_cats'] = pd.cut(df_features_train['annual_inc'], 200)
# df_woe = check_WoE_IV_continuous(df_features_train, df_targets_train, 'annual_inc_cats')
# df_woe

In [144]:
# Indicator columns for annual income, using interval comparisons.
# isin(range(...)) only matched whole-number incomes, so fractional values (including a
# non-integer mean used to impute missing incomes) received no indicator at all.
# The bins are contiguous and non-overlapping:
#   <= 38000 | (38000, 73300) | [73300, 109000) | [109000, 145000] | > 145000
# That puts 38000 in the lowest bin only and closes the former 144000-145000 gap.
# Column names are unchanged.
income = df_features_train['annual_inc']
df_features_train['annual_inc_large_income'] = np.where(income > 145000, 1, 0)
df_features_train['annual_inc_<38K'] = np.where(income <= 38000, 1, 0)
df_features_train['annual_inc_38K-73.2K'] = np.where((income > 38000) & (income < 73300), 1, 0)
df_features_train['annual_inc_73.2K-109K'] = np.where((income >= 73300) & (income < 109000), 1, 0)
df_features_train['annual_inc_109K-144K'] = np.where((income >= 109000) & (income <= 145000), 1, 0)

In [145]:
# delin_data = df_features_train[~(pd.isnull(df_features_train['mths_since_last_delinq']))]

In [146]:
# delin_data['mths_since_last_delinq_cats'] = pd.cut(delin_data['mths_since_last_delinq'], 40)
# df_woe = check_WoE_IV_continuous(delin_data, df_targets_train, 'mths_since_last_delinq_cats')
# df_woe


In [147]:
# plotByWoE(df_woe.iloc[:-24,:],rotate_x=90)
# df_features_train['mths_since_last_delinq'].unique()

In [148]:
# df_features_train['mths_since_last_delinq_cats_>80'] = np.where(df_features_train['mths_since_last_delinq'] > 80,1,0)
# df_features_train['mths_since_last_delinq_<5'] = np.where(df_features_train['mths_since_last_delinq'] < 5,1,0)
# df_features_train['mths_since_last_delinq_5-23'] = np.where(df_features_train['mths_since_last_delinq'].isin(range(5,24)),1,0)
# df_features_train['mths_since_last_delinq_24-56'] = np.where(df_features_train['mths_since_last_delinq'].isin(range(24,57)),1,0)
# df_features_train['mths_since_last_delinq_>57'] = np.where(df_features_train['mths_since_last_delinq'].isin(range(57,100)),1,0)


In [149]:
# df_features_train['dti_cats'] = pd.cut(df_features_train['dti'], 20)
# df_woe = check_WoE_IV_continuous(df_features_train, df_targets_train, 'dti_cats')
# df_woe


In [150]:
# plotByWoE(df_woe, rotate_x=90)

In [151]:
# dti_bins = df_features_train['dti_cats'].cat.categories

In [152]:

# Define the bins in this cell so it runs on a fresh kernel instead of relying on a
# variable left over from an earlier session.
# The 20 equal-width intervals are taken from the training split (X_train from
# train_test_split), so whichever split is processed gets the same bin edges.
# Values outside the training range fall in no bin and get 0 in every indicator.
dti_bins = pd.cut(X_train['dti'], 20).cat.categories
for dti_bin in dti_bins:
    col_name = f"dti_{dti_bin}"
    in_bin = (df_features_train['dti'] > dti_bin.left) & (df_features_train['dti'] <= dti_bin.right)
    df_features_train[col_name] = np.where(in_bin, 1, 0)


  df_features_train[col_name] = np.where((df_features_train['dti'] > dti_bins[i].left) &
  df_features_train[col_name] = np.where((df_features_train['dti'] > dti_bins[i].left) &
  df_features_train[col_name] = np.where((df_features_train['dti'] > dti_bins[i].left) &
  df_features_train[col_name] = np.where((df_features_train['dti'] > dti_bins[i].left) &
  df_features_train[col_name] = np.where((df_features_train['dti'] > dti_bins[i].left) &
  df_features_train[col_name] = np.where((df_features_train['dti'] > dti_bins[i].left) &
  df_features_train[col_name] = np.where((df_features_train['dti'] > dti_bins[i].left) &
  df_features_train[col_name] = np.where((df_features_train['dti'] > dti_bins[i].left) &
  df_features_train[col_name] = np.where((df_features_train['dti'] > dti_bins[i].left) &
  df_features_train[col_name] = np.where((df_features_train['dti'] > dti_bins[i].left) &


In [153]:
# df_features_train['mths_since_last_record_cats'] = pd.cut(df_features_train['mths_since_last_record'], 10)
# df_woe = check_WoE_IV_continuous(df_features_train, df_targets_train, 'mths_since_last_record_cats')
# df_woe

In [154]:
# plotByWoE(df_woe)

In [155]:
# bins = df_features_train['mths_since_last_record_cats'].cat.categories
# for i in range(len(bins)):
#     col_name = f"mths_since_last_record_{bins[i]}"
#     df_features_train[col_name] = np.where((df_features_train['mths_since_last_record'] > bins[i].left) & 
#                                            (df_features_train['mths_since_last_record'] <= bins[i].right), 1, 0)

Preparation/Preprocessing of the test dataset

In [156]:
# Bind the engineered feature frame back to the split it was built from.
# NOTE(review): df_features_train was bound to X_test in the "DATA PREPARATION" cell, so in a
# single top-to-bottom run only the test split carries the merged/binned columns; X_train is
# still as produced by train_test_split. Confirm whether the train variant (commented lines)
# is meant to be run as a separate pass.
# X_train = df_features_train
X_test = df_features_train
# df_features_train, df_targets_train = X_test, y_test

In [157]:
# pd.set_option('display.max_rows', 10)
# X_train

In [159]:
# Write both splits and their labels as CSV files, using paths relative to the working directory.
# NOTE(review): X_train is written as it currently stands; unless the feature-engineering cells
# were also run with the train split selected, it lacks the columns present in X_test.
X_train.to_csv("loan_data_train.csv")
X_test.to_csv("loan_data_test.csv")
y_train.to_csv("loan_data_label_train.csv")
y_test.to_csv("loan_data_label_test.csv")