In [1]:
# Data manipulation
import pandas as pd 

# numerical manipulation
import numpy as np

# library for converting our features into a vector
from sklearn.feature_extraction import DictVectorizer

# to split our dataset into train and test
from sklearn.model_selection import train_test_split

# to evaluate model performance
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

#to serialize and save our model in useful
import joblib

In [2]:

# load the dataset
data = pd.read_csv('CreditScoring.csv')

# read snapshot of the dataset
data.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [3]:
# format the feature column names
data.columns = data.columns.str.lower()

In [4]:
# update the categorical var its string values so we can know what each number represent
status_values = {1: "good", 2: "bad", 0: "unknown"}
data.status = data.status.map(status_values)

home_values = {1: "rent", 2: "owner", 3: "priv", 4: "ignore", 5: "parents",6: "other", 0: "unknown"}
data.home = data.home.map(home_values)

marital_values = {1:"single", 2:"married", 3:"widow", 4:"separated", 5:"divorced", 0:"unknown"}
data.marital = data.marital.map(marital_values)

records_values = {1:"no_rec", 2:"yes_rec"}
data.records = data.records.map(records_values)

job_values = {1:"fixed", 2:"partime", 3:"freelance", 4:"others", 0: 'unknown"'}
data.job = data.job.map(job_values)

In [5]:
# 99999999 represents data not available for a particular user. Hence, let's
#replace them with the usual NaN in numoy

num_List = ['income', 'assets', 'debt']
for var in num_List:
    data[var].replace(to_replace=99999999, value=np.nan, inplace=True)

In [6]:
# let's exclude the unknown value in status since their present is small
data  = data[data.status != 'unknown']

In [7]:
# let's change the status value from string data type to int.
data.status = (data.status == 'good').astype(int)
data.status.unique()

array([1, 0])

# Separating data into train and test datasets

In [8]:
x_train, x_test, y_train, y_test = train_test_split(
    data.drop('status', axis=1), # predicting features
    data['status'], # target variable
    test_size=0.3,
    random_state=1
)

x_train.shape, x_test.shape

((3117, 13), (1337, 13))

# Handling Missing values in our numerical value using open source packages

In [9]:
# varibable separation
cat_var  = [var for var in x_train.columns if x_train[var].dtype == 'object']
num_var = [var for var in x_train.columns if var not in cat_var and var != 'status']

# variables with missing data
num_var_na = [var for var in num_var if x_train[var].isnull().sum() > 0]
num_var_na

['income', 'assets', 'debt']

In [10]:
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer
)

#To handle missing values in numerical variables we will solve it in two ways:
# - add a binary missing indicator
# - then replace missing values in the original variable with the mean

#set up the class
missing_ind = AddMissingIndicator(variables = num_var_na)

#fit the class to the train
missing_ind.fit(x_train)

# apply the object transform method
x_train = missing_ind.transform(x_train)
x_test = missing_ind.transform(x_test)

#inspect the addition of the binary indicator var
x_train.head()




Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price,income_na,assets_na,debt_na
4301,15,other,60,50,married,no_rec,fixed,35,92.0,6000.0,0.0,900,982,0,0,0
3431,1,parents,24,21,single,no_rec,partime,45,114.0,2500.0,500.0,500,1154,0,0,0
396,10,owner,48,37,divorced,no_rec,fixed,35,87.0,3500.0,450.0,500,1155,0,0,0
1277,0,parents,60,20,single,no_rec,fixed,35,79.0,0.0,0.0,1570,1575,0,0,0
1903,17,owner,60,51,single,no_rec,fixed,35,155.0,3000.0,0.0,1250,1668,0,0,0


In [11]:
# insert median values into the missing na

#set the inputter by creating an instance of the class
median_inputer = MeanMedianImputer(
    imputation_method= 'median', variables= num_var_na
)

#learn and store paramters from the train set
median_inputer.fit(x_train)

#view the stored parameters
median_inputer.imputer_dict_

{'income': 120.0, 'assets': 3500.0, 'debt': 0.0}

In [12]:
x_train = median_inputer.transform(x_train)
x_test = median_inputer.transform(x_test)

#check if there are still occurences of missing data
print(x_train[num_var_na].isnull().sum())
print(x_test[num_var_na].isnull().sum())

income    0
assets    0
debt      0
dtype: int64
income    0
assets    0
debt      0
dtype: int64


NB: sav_pot_index indicates "saving potential index

# Logarithmic transformation of our skewed variables - 'price', 'amount', 'income', 'assets', 'debt'

In [13]:
from feature_engine.transformation import LogCpTransformer

log_vars = ['price', 'amount', 'income', 'assets', 'debt']

#setting up the class
log_trans = LogCpTransformer(variables = log_vars, C = 1)

# learn and store parameters from the train set
log_trans.fit(x_train)

# apply the transform method to the train and test sets
x_train = log_trans.transform(x_train)
x_test = log_trans.transform(x_test)

log_trans.get_params

<bound method BaseEstimator.get_params of LogCpTransformer(C=1, variables=['price', 'amount', 'income', 'assets', 'debt'])>

# Yeo Johnson transformation of our excessively skewed variables  - 'seniority', 'fin_ratio'

In [14]:
#Using yeo-johnson transforamtion to improve the distribution characteristics of our continuous numerical data
from feature_engine.transformation import YeoJohnsonTransformer

yeo_vars = ['seniority']

yeo_johnson_transformer = YeoJohnsonTransformer(
    variables= yeo_vars
)

x_train = yeo_johnson_transformer.fit_transform(x_train)
x_test = yeo_johnson_transformer.transform(x_test)

#view the learned parameters
yeo_johnson_transformer.lambda_dict_

{'seniority': 0.10713384527284725}

In [15]:
x_train.head()

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price,income_na,assets_na,debt_na
4301,3.228359,other,60,50,married,no_rec,fixed,35,4.532599,8.699681,0.0,6.803505,6.890609,0,0,0
3431,0.719533,parents,24,21,single,no_rec,partime,45,4.744932,7.824446,6.216606,6.216606,7.051856,0,0,0
396,2.73406,owner,48,37,divorced,no_rec,fixed,35,4.477337,8.160804,6.111467,6.216606,7.052721,0,0,0
1277,0.0,parents,60,20,single,no_rec,fixed,35,4.382027,0.0,0.0,7.359468,7.362645,0,0,0
1903,3.387884,owner,60,51,single,no_rec,fixed,35,5.049856,8.006701,0.0,7.131699,7.41998,0,0,0


# Feature transformation using Dict vectorizer

In [16]:
# Conveting our variable into dict and using the dict vectorizer to transform variables

x_train_dict = x_train.to_dict(orient='records')
x_test_dict = x_test.to_dict(orient='records')


dict_vect = DictVectorizer(sparse=False)

dict_vect.fit(x_train_dict)

x_train =  dict_vect.transform(x_train_dict)
x_test =  dict_vect.transform(x_test_dict)

In [17]:
x_train_dict

[{'seniority': 3.2283594715369155,
  'home': 'other',
  'time': 60,
  'age': 50,
  'marital': 'married',
  'records': 'no_rec',
  'job': 'fixed',
  'expenses': 35,
  'income': 4.532599493153256,
  'assets': 8.699681400989514,
  'debt': 0.0,
  'amount': 6.803505257608338,
  'price': 6.890609120147166,
  'income_na': 0,
  'assets_na': 0,
  'debt_na': 0},
 {'seniority': 0.7195326320618789,
  'home': 'parents',
  'time': 24,
  'age': 21,
  'marital': 'single',
  'records': 'no_rec',
  'job': 'partime',
  'expenses': 45,
  'income': 4.74493212836325,
  'assets': 7.824445930877619,
  'debt': 6.2166061010848646,
  'amount': 6.2166061010848646,
  'price': 7.051855622955894,
  'income_na': 0,
  'assets_na': 0,
  'debt_na': 0},
 {'seniority': 2.73405963937321,
  'home': 'owner',
  'time': 48,
  'age': 37,
  'marital': 'divorced',
  'records': 'no_rec',
  'job': 'fixed',
  'expenses': 35,
  'income': 4.477336814478207,
  'assets': 8.160803920954665,
  'debt': 6.111467339502679,
  'amount': 6.2166

In [18]:
x_train

array([[50.        ,  6.80350526,  8.6996814 , ...,  0.        ,
         3.22835947, 60.        ],
       [21.        ,  6.2166061 ,  7.82444593, ...,  0.        ,
         0.71953263, 24.        ],
       [37.        ,  6.2166061 ,  8.16080392, ...,  0.        ,
         2.73405964, 48.        ],
       ...,
       [55.        ,  7.09090982,  8.77971129, ...,  0.        ,
         3.22835947, 60.        ],
       [36.        ,  7.49609735,  0.        , ...,  0.        ,
         2.95198994, 48.        ],
       [40.        ,  6.12249281,  8.29429961, ...,  0.        ,
         3.89908055, 18.        ]])

In [19]:
x_train.shape

(3117, 32)

In [20]:
x_test.shape

(1337, 32)

In [21]:
dict_vect.get_feature_names_out()

array(['age', 'amount', 'assets', 'assets_na', 'debt', 'debt_na',
       'expenses', 'home=ignore', 'home=other', 'home=owner',
       'home=parents', 'home=priv', 'home=rent', 'home=unknown', 'income',
       'income_na', 'job=fixed', 'job=freelance', 'job=others',
       'job=partime', 'job=unknown"', 'marital=divorced',
       'marital=married', 'marital=separated', 'marital=single',
       'marital=unknown', 'marital=widow', 'price', 'records=no_rec',
       'records=yes_rec', 'seniority', 'time'], dtype=object)

In [22]:
x_test[0]

array([19.        ,  5.99396143,  0.        ,  0.        ,  0.        ,
        0.        , 35.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  3.36729583,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  6.39859493,  1.        ,  0.        ,
        0.71953263, 24.        ])

# Feature selection using our select from model package

In [23]:
from sklearn.feature_selection import SelectFromModel

from sklearn.ensemble import RandomForestClassifier as RFC

# Then we use the selectFromModel object from sklearn, which
# will select automatically the features which coefficients are non-zero

# remember to set the seed, the random state in this function
sel_ = SelectFromModel(RFC(random_state=1, n_estimators= 100, max_depth=5))

# train rfc model and select features
sel_.fit(x_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(max_depth=5, random_state=1))

In [24]:
sel_.get_support()

array([False,  True,  True, False, False, False, False, False, False,
        True, False, False, False, False,  True, False,  True, False,
       False,  True, False, False, False, False, False, False, False,
       False,  True,  True,  True, False])

In [26]:
selected_features = sel_.get_feature_names_out(input_features=
                           ['age', 'amount', 'assets', 'assets_na', 'debt', 'debt_na',
       'expenses', 'home=ignore', 'home=other', 'home=owner',
       'home=parents', 'home=priv', 'home=rent', 'home=unknown', 'income',
       'income_na', 'job=fixed', 'job=freelance', 'job=others',
       'job=partime', 'job=unknown"', 'marital=divorced',
       'marital=married', 'marital=separated', 'marital=single',
       'marital=unknown', 'marital=widow', 'price', 'records=no_rec',
       'records=yes_rec', 'seniority', 'time']
                           )

In [27]:
selected_features

array(['amount', 'assets', 'home=owner', 'income', 'job=fixed',
       'job=partime', 'records=no_rec', 'records=yes_rec', 'seniority'],
      dtype=object)

In [28]:
pd.Series(selected_features).to_csv('selected_feats.csv', index_label=False)

In [29]:
x_train_final = pd.DataFrame(data=x_train, columns = 
                       [
                           'age', 'amount', 'assets', 'assets_na', 'debt', 'debt_na',
       'expenses', 'home=ignore', 'home=other', 'home=owner',
       'home=parents', 'home=priv', 'home=rent', 'home=unknown', 'income',
       'income_na', 'job=fixed', 'job=freelance', 'job=others',
       'job=partime', 'job=unknown"', 'marital=divorced',
       'marital=married', 'marital=separated', 'marital=single',
       'marital=unknown', 'marital=widow', 'price', 'records=no_rec',
       'records=yes_rec', 'seniority', 'time'
                       ]
                       )
x_test_final = pd.DataFrame(data=x_test, columns=
                      [
                          'age', 'amount', 'assets', 'assets_na', 'debt', 'debt_na',
       'expenses', 'home=ignore', 'home=other', 'home=owner',
       'home=parents', 'home=priv', 'home=rent', 'home=unknown', 'income',
       'income_na', 'job=fixed', 'job=freelance', 'job=others',
       'job=partime', 'job=unknown"', 'marital=divorced',
       'marital=married', 'marital=separated', 'marital=single',
       'marital=unknown', 'marital=widow', 'price', 'records=no_rec',
       'records=yes_rec', 'seniority', 'time'
                      ]
                      )
x_train_final.head()

Unnamed: 0,age,amount,assets,assets_na,debt,debt_na,expenses,home=ignore,home=other,home=owner,...,marital=married,marital=separated,marital=single,marital=unknown,marital=widow,price,records=no_rec,records=yes_rec,seniority,time
0,50.0,6.803505,8.699681,0.0,0.0,0.0,35.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,6.890609,1.0,0.0,3.228359,60.0
1,21.0,6.216606,7.824446,0.0,6.216606,0.0,45.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,7.051856,1.0,0.0,0.719533,24.0
2,37.0,6.216606,8.160804,0.0,6.111467,0.0,35.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,7.052721,1.0,0.0,2.73406,48.0
3,20.0,7.359468,0.0,0.0,0.0,0.0,35.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,7.362645,1.0,0.0,0.0,60.0
4,51.0,7.131699,8.006701,0.0,0.0,0.0,35.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,7.41998,1.0,0.0,3.387884,60.0


In [30]:
x_test_final.head()

Unnamed: 0,age,amount,assets,assets_na,debt,debt_na,expenses,home=ignore,home=other,home=owner,...,marital=married,marital=separated,marital=single,marital=unknown,marital=widow,price,records=no_rec,records=yes_rec,seniority,time
0,19.0,5.993961,0.0,0.0,0.0,0.0,35.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,6.398595,1.0,0.0,0.719533,24.0
1,49.0,5.993961,8.006701,0.0,6.216606,0.0,90.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,7.267525,1.0,0.0,3.141799,12.0
2,22.0,6.175867,0.0,0.0,0.0,0.0,35.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,7.555382,1.0,0.0,1.975284,48.0
3,20.0,6.608001,8.160804,1.0,0.0,0.0,35.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,7.189922,1.0,0.0,1.165878,48.0
4,60.0,6.908755,8.006701,0.0,0.0,0.0,60.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,7.626083,1.0,0.0,0.0,36.0


# Dropping features not essential for our model using open source package from feature engine 

In [31]:
dropped_vars  = [ 'age', 'assets_na', 'debt', 'debt_na',
       'expenses', 'home=ignore', 'home=other',
       'home=parents', 'home=priv', 'home=rent', 'home=unknown', 
       'income_na', 'job=freelance', 'job=others',
        'job=unknown"', 'marital=divorced', 'marital=unknown',
       'marital=married', 'marital=separated', 'marital=single',
       'marital=widow', 'price', 'time']

from feature_engine.selection import DropFeatures

df = DropFeatures(features_to_drop = dropped_vars)

df.fit(x_train_final)

x_train = df.transform(x_train_final)
x_test = df.transform(x_test_final)

x_train

Unnamed: 0,amount,assets,home=owner,income,job=fixed,job=partime,records=no_rec,records=yes_rec,seniority
0,6.803505,8.699681,0.0,4.532599,1.0,0.0,1.0,0.0,3.228359
1,6.216606,7.824446,0.0,4.744932,0.0,1.0,1.0,0.0,0.719533
2,6.216606,8.160804,1.0,4.477337,1.0,0.0,1.0,0.0,2.734060
3,7.359468,0.000000,0.0,4.382027,1.0,0.0,1.0,0.0,0.000000
4,7.131699,8.006701,1.0,5.049856,1.0,0.0,1.0,0.0,3.387884
...,...,...,...,...,...,...,...,...,...
3112,6.398595,9.873080,1.0,5.442418,0.0,0.0,0.0,1.0,4.196754
3113,7.313887,8.853808,1.0,5.252273,0.0,0.0,1.0,0.0,3.532299
3114,7.090910,8.779711,1.0,0.000000,0.0,0.0,1.0,0.0,3.228359
3115,7.496097,0.000000,0.0,5.278115,1.0,0.0,1.0,0.0,2.951990


# Training our model using Bagging algorithm like RFC

In [32]:
#Using our final estimator to build our model
from sklearn.ensemble import RandomForestClassifier as RFC

rfc = RFC(random_state=1, n_estimators= 100, max_depth=5)

rfc.fit(x_train, y_train)

#Train set
class_ = rfc.predict(x_train)
pred = rfc.predict_proba(x_train)[:,1]


# determine mse and rmse
print('train roc-auc: {}'.format(roc_auc_score(y_train, pred)))
print('train accuracy: {}'.format(accuracy_score(y_train, class_)))

# Classification report
print(classification_report(y_train, class_))

print()

#Test set
class_ = rfc.predict(x_test)
pred = rfc.predict_proba(x_test)[:,1]


# determine mse and rmse
print('test roc-auc: {}'.format(roc_auc_score(y_test, pred)))
print('test accuracy: {}'.format(accuracy_score(y_test, class_)))

print()



train roc-auc: 0.8609282748245768
train accuracy: 0.8110362528071864
              precision    recall  f1-score   support

           0       0.74      0.48      0.58       860
           1       0.83      0.94      0.88      2257

    accuracy                           0.81      3117
   macro avg       0.78      0.71      0.73      3117
weighted avg       0.80      0.81      0.80      3117


test roc-auc: 0.7946732805443261
test accuracy: 0.7756170531039641

