### Library Set-up

In [1]:
# Scientific libraries
import numpy as np
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
import logging
import os
import sys

# Resolve the directory one level above the current working directory and put
# it on the import path (presumably so the local `credit_risk_model` package
# imported right below can be found).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Only append when it is not already present, so re-running the cell does not
# pile up duplicate entries.
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)
from credit_risk_model import config

# Helper libraries: notebook progress bars and warning control
from tqdm.notebook import tqdm,trange
import warnings 
# Silences every warning for the rest of the session.
# NOTE(review): this also hides deprecation and data-quality warnings —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

%config InlineBackend.figure_format = 'retina' # sets the figure format to 'retina' for high-resolution displays.

# Display options (IPython + pandas)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' # Echo the value of every top-level expression in a cell, not only the last one
pd.set_option('display.max_columns', None) # Never hide columns when displaying a DataFrame
pd.set_option('display.max_rows', 15) # DataFrames longer than 15 rows are displayed truncated

# Automatically re-import edited .py modules before each cell executes
# (autoreload reloads modules; it does not restart the kernel).
%load_ext autoreload
%autoreload 2

# Table styles
# Named collections of CSS rules; each rule pairs a CSS selector with a list of
# (property, value) tuples. Presumably intended for pandas
# `Styler.set_table_styles` — the usage is not shown in this part of the notebook.
table_styles = {
    "cerulean_palette": [
        # Base colours for header and body cells
        {"selector": "th", "props": [("color", "#FFFFFF"), ("background", "#004D80")]},
        {"selector": "td", "props": [("color", "#333333")]},
        # Table-level typography and border model
        {"selector": "table", "props": [("font-family", "Arial"), ("border-collapse", "collapse")]},
        # Zebra striping
        {"selector": "tr:nth-child(even)", "props": [("background", "#D3EEFF")]},
        {"selector": "tr:nth-child(odd)", "props": [("background", "#FFFFFF")]},
        # Cell borders
        {"selector": "th", "props": [("border", "1px solid #0070BA")]},
        {"selector": "td", "props": [("border", "1px solid #0070BA")]},
        # Row hover highlight with a background transition
        {"selector": "tr:hover", "props": [("background", "#80D0FF")]},
        {"selector": "tr", "props": [("transition", "background 0.5s ease")]},
        # Header hover: slightly larger font, animated
        {"selector": "th:hover", "props": [("font-size", "1.07rem")]},
        {"selector": "th", "props": [("transition", "font-size 0.5s ease-in-out")]},
        # Body-cell hover: slightly larger bold font, animated
        {"selector": "td:hover", "props": [("font-size", "1.07rem"), ("font-weight", "bold")]},
        {"selector": "td", "props": [("transition", "font-size 0.5s ease-in-out")]},
    ]
}

# Seed NumPy's global random generator from the project config so that
# anything drawing from `np.random` repeats across notebook runs.
np.random.seed(seed=config.RANDOM_SEED)

### Data Ingestion

In [2]:
# Read the dataset referenced by `config.FILE_NAME` through the project's
# `data_processor.load_data_and_sanitize` helper.
from credit_risk_model import data_processor as dp

df = dp.load_data_and_sanitize(config.FILE_NAME)

In [3]:
# Convert 'earliest_cr_line' to datetimes using an explicit
# abbreviated-month + four-digit-year format.
df['earliest_cr_line'] = pd.to_datetime(
    df['earliest_cr_line'],
    format='%b-%Y',
)

In [4]:
# Convert 'issue_d' to datetimes; no format is supplied, so pandas infers it.
# NOTE(review): if these strings share the '%b-%Y' layout used for
# 'earliest_cr_line', passing format= explicitly would be stricter — confirm
# against the raw data.
df['issue_d'] = pd.to_datetime(
    df['issue_d'],
)

In [5]:
# Inspect column dtypes and non-null counts after the datetime conversions.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 396030 entries, 0 to 396029
Data columns (total 27 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   loan_amnt             396030 non-null  float64       
 1   term                  396030 non-null  object        
 2   int_rate              396030 non-null  float64       
 3   installment           396030 non-null  float64       
 4   grade                 396030 non-null  object        
 5   sub_grade             396030 non-null  object        
 6   emp_title             373103 non-null  object        
 7   emp_length            377729 non-null  object        
 8   home_ownership        396030 non-null  object        
 9   annual_inc            396030 non-null  float64       
 10  verification_status   396030 non-null  object        
 11  issue_d               396030 non-null  datetime64[ns]
 12  loan_status           396030 non-null  object        
 13 

### EDA

In [6]:
# Rather than doing all of the EDA by hand, use the automated report from ydata-profiling.
from ydata_profiling import ProfileReport
# The report generation below is left disabled; uncomment both lines to build
# the full-dataset profile and render it inline in the notebook.
# profile = ProfileReport(df,title='Pandas profiling of the dataset',explorative=True)
# profile.to_notebook_iframe()

#### Important points on the data after EDA using ydata profiling:
* Loan amount is slightly right-skewed.
* Term has 2 categories 36 months and 60 months.
* The data has no duplicates.
* Interest rate is slightly skewed and has no missing values.
* Installment amount is also highly right-skewed, but we will not use it in the analysis: the installment is only set after the company has decided to grant the loan, whereas our objective is to decide whether to grant the loan in the first place.
* The grade of the loans is categorical with B being the most common and G being the least common.
* The `emp_title` column has a very large number of distinct categories as well as missing values, so it won't be of much use for our analysis.
* The `emp_length` column has 11 distinct categories, the most common being more than 10 years.
* Home ownership has 6 distinct categories, the last 2 of which occur very rarely.
* Annual income is highly skewed with skew value of around 41. Need to transform this column if applying logistic regression.
* `issue_d` is the issue date, which is only known after the loan has been approved. Since our objective is to decide whether the loan should be approved, we can remove this column to avoid data leakage.
* There are almost 48k distinct values in title column so that will not be useful for us.
* Open_acc and Pub_rec are also highly right skewed.
* Revol_bal is also right skewed.
* Revol util is very slightly left-skewed and close to normal, so we will not transform it.
* Total account is slightly right skewed.
* Mortgage accounts is also right-skewed and should be transformed.
* pub_rec_bankruptcies is also right skewed and should be transformed.
* The last 6 digits of address column is the zipcode which can be a useful feature.

### Test Data separation

In [7]:
from sklearn.model_selection import train_test_split

# Separate the target column (named by `config.TARGET`) from the remaining
# columns, which become the feature matrix.
y = df[config.TARGET]
x = df.drop([config.TARGET], axis="columns")

In [8]:
# Hold out 30% of the rows as a test set. `stratify=y` stratifies the split on
# the target. The seed is taken from the project config (the same constant
# used to seed NumPy) rather than a hard-coded literal, so the notebook has a
# single source of truth for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=config.RANDOM_SEED,
    stratify=y,
)

In [9]:
# Sanity check on the split: build one profiling report per partition so their
# statistics can be compared to confirm both are representative of the dataset.

# Report for the training partition
train_profile = ProfileReport(
    x_train,
    title="Train Dataset Profile",
    explorative=True,
)

# Report for the test partition
test_profile = ProfileReport(
    x_test,
    title='Test Dataset Profile',
    explorative=True,
)

In [10]:
# Optional check: compare the statistics of the train and test profiles side by side.
# Left disabled; uncomment both lines to build and render the comparison.
# comparison_report = train_profile.compare(test_profile)
# comparison_report.to_notebook_iframe()

### Outlier removal

In [11]:
# Outlier removal on the training set (the filtered copy is stored separately
# as x_train_NO): compute the quartiles and IQR used for the cut-offs.
# The quartiles are estimated on the training split only — deriving them from
# the full `df` would let test-set rows influence which training rows are
# dropped, i.e. leak test information into training.

# Columns to screen; assumes both config entries are lists of column names — TODO confirm
num = config.NUM_FEATURES + config.NUM_SKEWED_FEATURES

q1 = x_train[num].quantile(0.25)
q3 = x_train[num].quantile(0.75)
iqr = q3 - q1


In [12]:
# Drop every training row that has at least one screened column outside
# [q1 - 1.5*IQR, q3 + 1.5*IQR]. Comparisons against NaN evaluate to False, so
# rows with missing values in these columns are kept.
x_train_NO = x_train.loc[
    ~(
        x_train[num].lt(q1 - 1.5 * iqr)
        | x_train[num].gt(q3 + 1.5 * iqr)
    ).any(axis=1)
]

In [13]:
# Keep only the labels whose rows survived the outlier filter, selected by index label.
y_train_NO = y_train.loc[x_train_NO.index]

### Feature engineering

* Let's remove the columns `installment`, `issue_d` and `int_rate`: these values are only set after the loan has been approved, whereas our task is to decide whether to grant the loan, so keeping them would cause data leakage.
* Removing emp_title column as there are so many distinct values in it.
* Absolute dates don't add any positive effect to the model, so we will remove them; alternatively, we could create relative date columns, such as the difference between a start date and an end date.

According to Feature engineering principles there are different types of data:
1) Categorical ordinal data - Data which is categorical and has meaningful order (like small, medium, large)
2) Categorical nominal data - Data which is categorical but doesn't have any meaningful order (like brands Nike, adidas, reebok etc)
3) Numerical discrete data - Data which is numerical and takes only distinct, countable values (like 10, 20, 30).
4) Numerical continuous data - Data which is numerical and can take any value within a range.

In [26]:
# NOTE(review): execution counts jump from In [13] to In [26] at this cell, so
# intermediate cells were run and later removed — confirm the notebook still
# works under Restart & Run All.
from sklearn import set_config
# Make scikit-learn transformers return pandas objects from transform().
set_config(transform_output = "pandas")
# NOTE(review): the star import copies config's exported names into the
# notebook namespace even though `config` is already imported as a module in
# the setup cell; prefer `config.<NAME>` once it is confirmed that no later
# cell relies on the bare names.
from credit_risk_model.config import *

In [27]:
# Bring in the pipelines defined in FE_pipeline.py
from credit_risk_model.FE_pipeline import (
    num_pipeline,
    num_skewed_pipeline,
    nominal_cat_pipeline,
    ordinal_cat_pipeline,
    selected_FE_with_FS,
    selected_FE,
)

In [28]:
# Fit the imported `selected_FE_with_FS` pipeline (presumably feature
# engineering plus feature selection, judging by its name) on the training split.
# NOTE(review): the outlier-filtered x_train_NO / y_train_NO built earlier are
# not used here — confirm that fitting on the unfiltered split is intentional.
selected_FE_with_FS.fit(x_train,y_train)

In [31]:
# Store the column labels of the transformed test set on the config module
# (pandas output was enabled via set_config, so the result exposes `.columns`).
# NOTE(review): this sets a module attribute at runtime only; it is not
# written back to config.py — confirm downstream code runs in this same session.
config.POST_FE_FEATURES = (
    selected_FE_with_FS
    .transform(x_test)
    .columns
)