In [1]:
# Data manipulation
import pandas as pd 

# numerical manipulation
import numpy as np

# library for converting our features into a vector
from sklearn.feature_extraction import DictVectorizer

from sklearn.preprocessing import StandardScaler

# to split our dataset into train and test
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
#Using our final estimator to build our model
from sklearn.ensemble import RandomForestClassifier as RFC

# to evaluate model performance
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

#from feature engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
)

from feature_engine.transformation import (
    YeoJohnsonTransformer,
    LogCpTransformer
)

from feature_engine.selection import DropFeatures
from feature_engine.encoding import OrdinalEncoder

In [15]:

# Load the credit-scoring records from the CSV file in the working directory.
data = pd.read_csv('CreditScoring.csv')

# Show the first rows as a quick sanity check of the parsed columns.
data.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [16]:
# Normalise every column label to lower case so later cells can refer to
# columns such as `status` or `income` without worrying about capitalisation.
data.columns = [label.lower() for label in data.columns]

# (rows, columns) of the loaded frame
data.shape

(4455, 14)

In [4]:
# Decode the integer-coded categorical columns into readable labels so it is
# clear what each number represents.
# NOTE: this cell is not idempotent. Series.map turns any value that is not a
# key of the mapping into NaN, so running it a second time on the already
# decoded strings wipes the columns. Reload `data` before re-running.
status_values = {1: "good", 2: "bad", 0: "unknown"}
data['status'] = data['status'].map(status_values)

home_values = {1: "rent", 2: "owner", 3: "priv", 4: "ignore", 5: "parents", 6: "other", 0: "unknown"}
data['home'] = data['home'].map(home_values)

marital_values = {1: "single", 2: "married", 3: "widow", 4: "separated", 5: "divorced", 0: "unknown"}
data['marital'] = data['marital'].map(marital_values)

records_values = {1: "no_rec", 2: "yes_rec"}
data['records'] = data['records'].map(records_values)

# Fixed: the label for code 0 previously carried a stray double quote
# ('unknown"'), which made it differ from the "unknown" label used by the
# other columns.
job_values = {1: "fixed", 2: "partime", 3: "freelance", 4: "others", 0: "unknown"}
data['job'] = data['job'].map(job_values)

In [5]:
# 99999999 is the sentinel meaning "data not available" for a particular user.
# Replace it with NaN so it is treated as a genuine missing value.
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on data[var]: an in-place call on a selected
# column is chained assignment, which raises a warning on recent pandas and
# leaves `data` unchanged when Copy-on-Write is enabled.
num_List = ['income', 'assets', 'debt']
for var in num_List:
    data[var] = data[var].replace(to_replace=99999999, value=np.nan)

In [6]:
# Exclude the rows whose status is 'unknown', since there are very few of them.
# .copy() makes the filtered frame an independent object, so assigning to its
# columns afterwards does not raise SettingWithCopyWarning.
data = data[data.status != 'unknown'].copy()

In [7]:
# Turn the target into an integer flag: 1 where status is 'good', 0 otherwise.
data['status'] = data['status'].eq('good').astype(int)

# Confirm that only the two expected classes remain.
data['status'].unique()

array([1, 0])

# Splitting the data into train and test sets

In [8]:
# Separate the predictors from the target before splitting.
features = data.drop(columns='status')
target = data['status']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=1,
)

x_train.shape, x_test.shape

((3117, 13), (1337, 13))

In [9]:
# Preview the first rows of the training features.
x_train.head()

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
4301,15,other,60,50,married,no_rec,fixed,35,92.0,6000.0,0.0,900,982
3431,1,parents,24,21,single,no_rec,partime,45,114.0,2500.0,500.0,500,1154
396,10,owner,48,37,divorced,no_rec,fixed,35,87.0,3500.0,450.0,500,1155
1277,0,parents,60,20,single,no_rec,fixed,35,79.0,0.0,0.0,1570,1575
1903,17,owner,60,51,single,no_rec,fixed,35,155.0,3000.0,0.0,1250,1668


In [10]:
# Column groups consumed by the pipeline steps.

# Numeric columns handed to the missing-indicator and median-imputation steps.
NUMERICAL_VAR_WITH_NA = ['income', 'assets', 'debt']

# Numeric columns handed to the log transformer: price, amount and the
# imputed columns above (built as a new list, same order as before).
NUMERICAL_LOG_VARS = ['price', 'amount', *NUMERICAL_VAR_WITH_NA]

# Numeric columns handed to the Yeo-Johnson transformer.
NUMERICAL_YEO_VARS = ['seniority']

# String-valued columns handed to the ordinal encoder.
CATEGORICAL_VARS = ['home', 'marital', 'records', 'job']

# Pipeline design

In [11]:
# Steps are listed in the order the pipeline applies them; the step names are
# the keys used to address each step on the fitted pipeline.
pipeline_steps = [
    # ---- imputation ----
    (
        'missing_indicator',
        AddMissingIndicator(variables=NUMERICAL_VAR_WITH_NA),
    ),
    (
        'median_imputter',
        MeanMedianImputer(
            imputation_method='median',
            variables=NUMERICAL_VAR_WITH_NA,
        ),
    ),

    # ---- variable transformation ----
    (
        'log_transformer',
        LogCpTransformer(variables=NUMERICAL_LOG_VARS, C=1),
    ),
    (
        'yeojohnson',
        YeoJohnsonTransformer(variables=NUMERICAL_YEO_VARS),
    ),

    # ---- categorical encoding ----
    (
        'category_encoding',
        OrdinalEncoder(
            encoding_method='ordered',
            variables=CATEGORICAL_VARS,
        ),
    ),

    # ---- standardisation ----
    ('data_scaling', StandardScaler()),

    # ---- final estimator ----
    ('rfc', RFC(random_state=1, n_estimators=100, max_depth=5)),
]

credit_risk_pipeline = Pipeline(steps=pipeline_steps)

In [12]:
# Fit every preprocessing step and the final classifier on the training split.
credit_risk_pipeline.fit(X=x_train, y=y_train)

Pipeline(steps=[('missing_indicator',
                 AddMissingIndicator(variables=['income', 'assets', 'debt'])),
                ('median_imputter',
                 MeanMedianImputer(variables=['income', 'assets', 'debt'])),
                ('log_transformer',
                 LogCpTransformer(C=1,
                                  variables=['price', 'amount', 'income',
                                             'assets', 'debt'])),
                ('yeojohnson', YeoJohnsonTransformer(variables=['seniority'])),
                ('category_encoding',
                 OrdinalEncoder(variables=['home', 'marital', 'records',
                                           'job'])),
                ('data_scaling', StandardScaler()),
                ('rfc', RandomForestClassifier(max_depth=5, random_state=1))])

In [13]:

# Score the fitted pipeline on the training set.
pred = credit_risk_pipeline.predict_proba(x_train)[:, 1]  # probability of class 1
class_ = credit_risk_pipeline.predict(x_train)  # predicted class labels

# ROC-AUC uses the predicted probabilities, accuracy the predicted labels.
train_roc_auc = roc_auc_score(y_train, pred)
train_accuracy = accuracy_score(y_train, class_)
print('train roc-auc: {}'.format(train_roc_auc))
print('train accuracy: {}'.format(train_accuracy))

# Per-class precision, recall and F1.
print(classification_report(y_train, class_))



train roc-auc: 0.8840921268199196
train accuracy: 0.820660891883221
              precision    recall  f1-score   support

           0       0.83      0.44      0.58       860
           1       0.82      0.96      0.89      2257

    accuracy                           0.82      3117
   macro avg       0.82      0.70      0.73      3117
weighted avg       0.82      0.82      0.80      3117



In [14]:
# Inspect the predicted class labels for the training set.
class_

array([1, 1, 1, ..., 1, 1, 1])