In [1]:
# Data manipulation
import pandas as pd 

# numerical manipulation
import numpy as np

# library for converting our features into a vector
from sklearn.feature_extraction import DictVectorizer

# to split our dataset into train and test
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# to evaluate model performance
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# to serialize and save our model
import joblib

# import our custom transformer
import custom_processor as cp

#from feature engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
)

from feature_engine.transformation import (
    YeoJohnsonTransformer,
    LogCpTransformer
)

from feature_engine.selection import DropFeatures

In [2]:

# Read the raw credit-scoring records from the CSV file
csv_path = 'CreditScoring.csv'
data = pd.read_csv(csv_path)

# Preview the first few rows
data.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [3]:
# Normalise every column name to lower case
data.columns = data.columns.map(str.lower)

In [4]:
# Replace the integer codes of each categorical column with readable labels so
# we can tell what each number represents.
# Codes that are absent from a mapping come out of `Series.map` as NaN.
status_values = {1: "good", 2: "bad", 0: "unknown"}
data.status = data.status.map(status_values)

home_values = {1: "rent", 2: "owner", 3: "priv", 4: "ignore", 5: "parents", 6: "other", 0: "unknown"}
data.home = data.home.map(home_values)

marital_values = {1: "single", 2: "married", 3: "widow", 4: "separated", 5: "divorced", 0: "unknown"}
data.marital = data.marital.map(marital_values)

records_values = {1: "no_rec", 2: "yes_rec"}
data.records = data.records.map(records_values)

# Code 0 maps to the plain label "unknown" (no stray quote character inside the
# string), the same label the other mappings use for code 0.
job_values = {1: "fixed", 2: "partime", 3: "freelance", 4: "others", 0: "unknown"}
data.job = data.job.map(job_values)

In [5]:
# 99999999 marks a value that is not available for a particular user, so
# replace it with NumPy's NaN.
#
# The replacement is computed on the selected columns and assigned back to
# `data`. Calling `replace(..., inplace=True)` on `data[var]` would act on an
# intermediate object: pandas warns about that pattern and, with Copy-on-Write
# enabled, it leaves `data` unchanged.
num_List = ['income', 'assets', 'debt']
data[num_List] = data[num_List].replace(to_replace=99999999, value=np.nan)

In [6]:
# Exclude the rows whose status is 'unknown', since there are only a few of them.
# Take an explicit copy so that later column assignments on `data` operate on an
# independent frame and do not raise SettingWithCopyWarning.
data = data[data.status != 'unknown'].copy()

In [7]:
# Encode the target as an integer flag: 1 where status is 'good', 0 otherwise.
data['status'] = data['status'].eq('good').astype(int)
data['status'].unique()

array([1, 0])

In [8]:
# Engineered features (both rounded to two decimals):
#   fin_ratio     = amount / price
#   sav_pot_index = (income - expenses - debt / 100) / (amount / time)

data['fin_ratio'] = data['amount'].div(data['price']).round(2)

surplus = data['income'] - data['expenses'] - (data['debt'] / 100)
amount_per_time = data['amount'] / data['time']
data['sav_pot_index'] = (surplus / amount_per_time).round(2)

data.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price,fin_ratio,sav_pot_index
0,1,9,rent,60,30,married,no_rec,freelance,73,129.0,0.0,0.0,800,846,0.95,4.2
1,1,17,rent,60,58,widow,no_rec,fixed,48,131.0,0.0,0.0,1000,1658,0.6,4.98
2,0,10,owner,36,46,married,yes_rec,freelance,90,200.0,3000.0,0.0,2000,2985,0.67,1.98
3,1,0,rent,60,24,single,no_rec,fixed,63,182.0,2500.0,0.0,900,1325,0.68,7.93
4,1,0,rent,36,26,single,no_rec,fixed,46,107.0,0.0,0.0,310,910,0.34,7.08


# Splitting the data into train and test

In [9]:
# Separate the predicting features from the target variable
predictors = data.drop(columns='status')
target = data['status']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.3, random_state=1
)

x_train.shape, x_test.shape

((3117, 15), (1337, 15))

# Configuration 

In [10]:
# Numerical variables handled by the missing-indicator and median-imputation steps
NUMERICAL_VAR_WITH_NA = ["income", "assets", "debt", "sav_pot_index"]

# Numerical variables passed to the log transformer
NUMERICAL_LOG_VARS = ["price", "amount", "income", "assets", "debt"]

# Numerical variables passed to the Yeo-Johnson transformer
NUMERICAL_YEO_VARS = ["seniority", "fin_ratio"]

# Every predictor column of the prepared dataset
ALL_VARS = [
    "seniority", "home", "time", "age", "marital", "records", "job",
    "expenses", "income", "assets", "debt", "amount", "price",
    "fin_ratio", "sav_pot_index",
]

# Column names handed to the feature-extraction step
ENGINEERED_VARS = [
    "age", "amount",
    "assets", "assets_na",
    "debt", "debt_na",
    "expenses", "fin_ratio",
    "home=ignore", "home=other", "home=owner", "home=parents",
    "home=priv", "home=rent", "home=unknown",
    "income", "income_na",
    "job=fixed", "job=freelance", "job=others", "job=partime", "job=unknown",
    "marital=divorced", "marital=married", "marital=separated",
    "marital=single", "marital=unknown", "marital=widow",
    "price",
    "records=no_rec", "records=yes_rec",
    "sav_pot_index", "sav_pot_index_na",
    "seniority", "time",
]

# Engineered columns removed before model training
DROPPED_VARS = [
    "age", "assets_na",
    "debt", "debt_na",
    "expenses",
    "home=ignore", "home=other", "home=parents",
    "home=priv", "home=rent", "home=unknown",
    "income_na",
    "job=freelance", "job=others", "job=unknown",
    "marital=divorced", "marital=unknown", "marital=married",
    "marital=separated", "marital=single", "marital=widow",
    "price", "sav_pot_index_na", "time",
]

# Selected feature variables (ENGINEERED_VARS minus DROPPED_VARS)
FEATURES = [
    "amount", "assets", "fin_ratio",
    "home=owner",
    "income",
    "job=fixed", "job=partime",
    "records=no_rec", "records=yes_rec",
    "sav_pot_index", "seniority",
]

# Pipeline design

In [11]:
# ======== IMPUTATION ========== #
# Add the missing-value indicator columns before the median fill.
imputation_steps = [
    ('missing_indicator', AddMissingIndicator(variables=NUMERICAL_VAR_WITH_NA)),
    ('median_imputter', MeanMedianImputer(imputation_method='median', variables=NUMERICAL_VAR_WITH_NA)),
]

# ==== VARIABLE TRANSFORMATION ========= #
transformation_steps = [
    ('log_transformer', LogCpTransformer(variables=NUMERICAL_LOG_VARS, C=1)),
    ('yeojohnson', YeoJohnsonTransformer(variables=NUMERICAL_YEO_VARS)),
]

# ========== FEATURE EXTRACTION ========= #
# NOTE(review): DictVect lives in custom_processor; presumably it expands the
# categorical columns into the 'name=value' columns listed in ENGINEERED_VARS — confirm.
extraction_steps = [
    ('feature_extraction', cp.DictVect(variables=ENGINEERED_VARS)),
]

# ========== SELECTION OF FEATURES SUITABLE FOR MODEL TRAINING ======= #
selection_steps = [
    ('dropped_features', DropFeatures(features_to_drop=DROPPED_VARS)),
]

credit_risk_pipeline = Pipeline(
    steps=imputation_steps + transformation_steps + extraction_steps + selection_steps
)

In [12]:
# Fit every pipeline step on the training split only
credit_risk_pipeline.fit(X=x_train, y=y_train)

Pipeline(steps=[('missing_indicator',
                 AddMissingIndicator(variables=['income', 'assets', 'debt',
                                                'sav_pot_index'])),
                ('median_imputter',
                 MeanMedianImputer(variables=['income', 'assets', 'debt',
                                              'sav_pot_index'])),
                ('log_transformer',
                 LogCpTransformer(C=1,
                                  variables=['price', 'amount', 'income',
                                             'assets', 'debt'])),
                ('yeojohnson',
                 YeoJohnsonTransformer(variables=['se...
                 DropFeatures(features_to_drop=['age', 'assets_na', 'debt',
                                                'debt_na', 'expenses',
                                                'home=ignore', 'home=other',
                                                'home=parents', 'home=priv',
                                      

In [13]:
# Shape of the test split, which has not been passed through the pipeline at this point
x_test.shape

(1337, 15)

In [14]:
# Apply the fitted pipeline to the training set. This rebinds `x_train` to the
# transformed frame, so re-running this cell would feed already-transformed
# data back into the pipeline.
x_train = credit_risk_pipeline.transform(x_train)
# NOTE(review): the test-set transform is left disabled here — confirm it is
# applied before the test set is used for evaluation.
#x_test = credit_risk_pipeline.transform(x_test)
x_train

Unnamed: 0,amount,assets,fin_ratio,home=owner,income,job=fixed,job=partime,records=no_rec,records=yes_rec,sav_pot_index,seniority
0,6.803505,8.699681,2.947228,0.0,4.532599,1.0,0.0,1.0,0.0,3.80,3.228359
1,6.216606,7.824446,0.770449,0.0,4.744932,0.0,1.0,1.0,0.0,3.07,0.719533
2,6.216606,8.160804,0.770449,1.0,4.477337,1.0,0.0,1.0,0.0,4.56,2.734060
3,7.359468,0.000000,3.493683,0.0,4.382027,1.0,0.0,1.0,0.0,1.68,0.000000
4,7.131699,8.006701,1.983679,1.0,5.049856,1.0,0.0,1.0,0.0,5.76,3.387884
...,...,...,...,...,...,...,...,...,...,...,...
3112,6.398595,9.873080,1.657081,1.0,5.442418,0.0,0.0,0.0,1.0,12.18,4.196754
3113,7.313887,8.853808,1.110973,1.0,5.252273,0.0,0.0,1.0,0.0,5.20,3.532299
3114,7.090910,8.779711,1.746534,1.0,0.000000,0.0,0.0,1.0,0.0,-4.82,3.228359
3115,7.496097,0.000000,2.187852,0.0,5.278115,1.0,0.0,1.0,0.0,3.07,2.951990


In [15]:
# List the columns of the transformed train set that still contain missing values
x_train.columns[x_train.isnull().any()].tolist()

[]