In [71]:
import pandas as pd
import numpy as np

## Load the data

In [72]:
data_preprocessed = pd.read_csv('dataset-credit-preprocessed.csv')
data_preprocessed.head()

Unnamed: 0,checking_status_1,checking_status_2,checking_status_3,checking_status_4,duration,credit_history,purpose_1,purpose_2,purpose_3,purpose_4,...,other_payment_plans,housing_1,housing_2,housing_3,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,0,1,0,0,6,0,1,0,0,0,...,0,1,0,0,2,2,1,1,1,1
1,1,0,0,0,48,2,1,0,0,0,...,0,1,0,0,1,2,1,0,1,0
2,0,0,0,1,12,0,0,1,0,0,...,0,1,0,0,1,1,2,0,1,1
3,0,1,0,0,42,2,0,0,1,0,...,0,0,1,0,1,2,2,0,1,1
4,0,1,0,0,24,1,0,0,0,1,...,0,0,1,0,2,2,2,0,1,0


## Targets

In [73]:
data_preprocessed.shape

(1000, 45)

In [74]:
targets = data_preprocessed['class']

In [75]:
data_preprocessed.iloc[:,:-1]

Unnamed: 0,checking_status_1,checking_status_2,checking_status_3,checking_status_4,duration,credit_history,purpose_1,purpose_2,purpose_3,purpose_4,...,age,other_payment_plans,housing_1,housing_2,housing_3,existing_credits,job,num_dependents,own_telephone,foreign_worker
0,0,1,0,0,6,0,1,0,0,0,...,67,0,1,0,0,2,2,1,1,1
1,1,0,0,0,48,2,1,0,0,0,...,22,0,1,0,0,1,2,1,0,1
2,0,0,0,1,12,0,0,1,0,0,...,49,0,1,0,0,1,1,2,0,1
3,0,1,0,0,42,2,0,0,1,0,...,45,0,0,1,0,1,2,2,0,1
4,0,1,0,0,24,1,0,0,0,1,...,53,0,0,1,0,2,2,2,0,1
5,0,0,0,1,36,2,0,1,0,0,...,35,0,0,1,0,1,1,2,1,1
6,0,0,0,1,24,2,0,0,1,0,...,53,0,1,0,0,1,2,1,0,1
7,1,0,0,0,36,2,0,0,0,0,...,35,0,0,0,1,1,3,1,1,1
8,0,0,0,1,12,2,1,0,0,0,...,61,0,1,0,0,1,1,1,0,1
9,1,0,0,0,30,0,0,0,0,1,...,28,0,1,0,0,2,3,1,0,1


In [76]:
unscaled_inputs = data_preprocessed.iloc[:,:-1]

## Standardize data

In [77]:
from sklearn.preprocessing import StandardScaler

credit_scaler = StandardScaler()

In [78]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.scaler = StandardScaler(copy, with_mean, with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [79]:
unscaled_inputs.columns.values

array(['checking_status_1', 'checking_status_2', 'checking_status_3',
       'checking_status_4', 'duration', 'credit_history', 'purpose_1',
       'purpose_2', 'purpose_3', 'purpose_4', 'purpose_5', 'purpose_6',
       'purpose_7', 'purpose_8', 'purpose_9', 'purpose_10',
       'credit_amount', 'savings_status', 'employment_1', 'employment_2',
       'employment_3', 'employment_4', 'employment_5',
       'installment_commitment', 'personal_status_1', 'personal_status_2',
       'personal_status_3', 'personal_status_4', 'other_parties',
       'residence_since', 'property_magnitude_1', 'property_magnitude_2',
       'property_magnitude_3', 'property_magnitude_4', 'age',
       'other_payment_plans', 'housing_1', 'housing_2', 'housing_3',
       'existing_credits', 'job', 'num_dependents', 'own_telephone',
       'foreign_worker'], dtype=object)

In [80]:
columns_to_omit = ['checking_status_1', 'checking_status_2', 'checking_status_3',
                   'checking_status_4', 'purpose_1',
       'purpose_2', 'purpose_3', 'purpose_4', 'purpose_5', 'purpose_6',
       'purpose_7', 'purpose_8', 'purpose_9', 'purpose_10', 'savings_status', 'employment_1', 'employment_2',
       'employment_3', 'employment_4', 'employment_5', 'installment_commitment', 'personal_status_1', 'personal_status_2',
       'personal_status_3', 'personal_status_4', 'other_parties',
       'residence_since', 'property_magnitude_1', 'property_magnitude_2',
       'property_magnitude_3', 'property_magnitude_4',
       'other_payment_plans', 'housing_1', 'housing_2', 'housing_3',
       'existing_credits', 'job', 'num_dependents', 'own_telephone',
       'foreign_worker']

columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [81]:
credit_scaler = CustomScaler(columns_to_scale)

In [82]:
credit_scaler.fit(unscaled_inputs)

CustomScaler(columns=['duration', 'credit_history', 'credit_amount', 'age'],
             copy=None, with_mean=None, with_std=None)

In [83]:
scaled_inputs = credit_scaler.transform(unscaled_inputs)
scaled_inputs

Unnamed: 0,checking_status_1,checking_status_2,checking_status_3,checking_status_4,duration,credit_history,purpose_1,purpose_2,purpose_3,purpose_4,...,age,other_payment_plans,housing_1,housing_2,housing_3,existing_credits,job,num_dependents,own_telephone,foreign_worker
0,0,1,0,0,-1.236478,-1.410231,1,0,0,0,...,2.766456,0,1,0,0,2,2,1,1,1
1,1,0,0,0,2.248194,0.583028,1,0,0,0,...,-1.191404,0,1,0,0,1,2,1,0,1
2,0,0,0,1,-0.738668,-1.410231,0,1,0,0,...,1.183312,0,1,0,0,1,1,2,0,1
3,0,1,0,0,1.750384,0.583028,0,0,1,0,...,0.831502,0,0,1,0,1,2,2,0,1
4,0,1,0,0,0.256953,-0.413601,0,0,0,1,...,1.535122,0,0,1,0,2,2,2,0,1
5,0,0,0,1,1.252574,0.583028,0,1,0,0,...,-0.048022,0,0,1,0,1,1,2,1,1
6,0,0,0,1,0.256953,0.583028,0,0,1,0,...,1.535122,0,1,0,0,1,2,1,0,1
7,1,0,0,0,1.252574,0.583028,0,0,0,0,...,-0.048022,0,0,0,1,1,3,1,1,1
8,0,0,0,1,-0.738668,0.583028,1,0,0,0,...,2.238742,0,1,0,0,1,1,1,0,1
9,1,0,0,0,0.754763,-1.410231,0,0,0,1,...,-0.663689,0,1,0,0,2,3,1,0,1


In [84]:
scaled_inputs.shape

(1000, 44)

## Split dataset (Train, Validation, Test)

In [85]:
from sklearn.model_selection import train_test_split

In [86]:
train_test_split(scaled_inputs, targets)

[     checking_status_1  checking_status_2  checking_status_3  \
 291                  1                  0                  0   
 748                  0                  0                  0   
 504                  0                  1                  0   
 51                   1                  0                  0   
 721                  1                  0                  0   
 593                  1                  0                  0   
 936                  0                  0                  1   
 185                  0                  0                  0   
 610                  0                  1                  0   
 355                  1                  0                  0   
 482                  0                  1                  0   
 687                  1                  0                  0   
 223                  0                  0                  0   
 27                   0                  0                  1   
 461                  0  

In [87]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 20)

In [88]:
print(x_train.shape, y_train.shape)

(800, 44) (800,)


In [89]:
print(x_test.shape, y_test.shape)

(200, 44) (200,)


In [90]:
print(x_train.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800 entries, 504 to 355
Data columns (total 44 columns):
checking_status_1         800 non-null int64
checking_status_2         800 non-null int64
checking_status_3         800 non-null int64
checking_status_4         800 non-null int64
duration                  800 non-null float64
credit_history            800 non-null float64
purpose_1                 800 non-null int64
purpose_2                 800 non-null int64
purpose_3                 800 non-null int64
purpose_4                 800 non-null int64
purpose_5                 800 non-null int64
purpose_6                 800 non-null int64
purpose_7                 800 non-null int64
purpose_8                 800 non-null int64
purpose_9                 800 non-null int64
purpose_10                800 non-null int64
credit_amount             800 non-null float64
savings_status            800 non-null int64
employment_1              800 non-null int64
employment_2              800 no

## Logistic Regression

In [91]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [92]:
reg = LogisticRegression()
reg.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [93]:
reg.score(x_train, y_train)

0.78625

In [94]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,

In [95]:
y_train

504    0
839    1
141    1
825    1
470    0
926    1
173    1
117    1
855    1
308    0
416    0
622    0
390    1
46     1
751    0
532    1
237    0
613    1
456    1
351    0
750    1
205    1
697    1
156    1
579    1
738    1
449    0
498    1
656    0
741    1
      ..
255    1
485    0
775    0
869    1
617    1
144    1
190    0
978    0
742    1
765    1
784    1
979    0
763    0
794    1
998    0
469    1
552    0
118    0
162    1
583    0
790    0
843    1
404    1
393    1
218    1
924    0
223    1
271    1
474    0
355    0
Name: class, Length: 800, dtype: int64

In [96]:
model_outputs == y_train

504     True
839     True
141    False
825     True
470     True
926    False
173     True
117     True
855     True
308    False
416     True
622    False
390     True
46      True
751     True
532     True
237     True
613     True
456     True
351    False
750     True
205     True
697     True
156     True
579     True
738     True
449    False
498     True
656    False
741     True
       ...  
255     True
485    False
775     True
869    False
617     True
144     True
190    False
978    False
742     True
765     True
784     True
979     True
763    False
794     True
998    False
469     True
552    False
118    False
162     True
583     True
790    False
843     True
404     True
393     True
218    False
924     True
223     True
271     True
474    False
355     True
Name: class, Length: 800, dtype: bool

In [97]:
np.sum((model_outputs == y_train))

629

In [98]:
np.sum((model_outputs==y_train)) / model_outputs.shape[0]

0.78625

In [99]:
reg.intercept_

array([1.24981855])

In [100]:
reg.coef_

array([[ 0.08306298, -0.2598157 ,  0.17949388,  1.24707739, -0.34639893,
        -0.36780688,  0.26344123, -0.84835281, -0.03068951, -0.67938926,
         1.0601184 , -0.09218482,  0.61382705, -0.4426586 ,  0.92501132,
         0.48069555, -0.28384725, -0.06368423,  0.38384635,  0.33885096,
         0.69191519, -0.02044161, -0.14435234, -0.26646248,  0.71415768,
         0.37597015, -0.1664976 ,  0.32618831,  0.13195739,  0.050079  ,
         0.68222996,  0.26724613, -0.08154401,  0.38188646,  0.17825641,
        -0.52485443,  0.62029657,  0.65237668, -0.0228547 , -0.39539195,
        -0.03365204, -0.12586937,  0.39730907, -0.76069504]])

In [101]:
unscaled_inputs.columns.values

array(['checking_status_1', 'checking_status_2', 'checking_status_3',
       'checking_status_4', 'duration', 'credit_history', 'purpose_1',
       'purpose_2', 'purpose_3', 'purpose_4', 'purpose_5', 'purpose_6',
       'purpose_7', 'purpose_8', 'purpose_9', 'purpose_10',
       'credit_amount', 'savings_status', 'employment_1', 'employment_2',
       'employment_3', 'employment_4', 'employment_5',
       'installment_commitment', 'personal_status_1', 'personal_status_2',
       'personal_status_3', 'personal_status_4', 'other_parties',
       'residence_since', 'property_magnitude_1', 'property_magnitude_2',
       'property_magnitude_3', 'property_magnitude_4', 'age',
       'other_payment_plans', 'housing_1', 'housing_2', 'housing_3',
       'existing_credits', 'job', 'num_dependents', 'own_telephone',
       'foreign_worker'], dtype=object)

In [102]:
feature_name = unscaled_inputs.columns.values
summary_table = pd.DataFrame(columns=['Feature name'], data=feature_name)
summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature name,Coefficient
0,checking_status_1,0.083063
1,checking_status_2,-0.259816
2,checking_status_3,0.179494
3,checking_status_4,1.247077
4,duration,-0.346399
5,credit_history,-0.367807
6,purpose_1,0.263441
7,purpose_2,-0.848353
8,purpose_3,-0.03069
9,purpose_4,-0.679389


In [103]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,1.249819
1,checking_status_1,0.083063
2,checking_status_2,-0.259816
3,checking_status_3,0.179494
4,checking_status_4,1.247077
5,duration,-0.346399
6,credit_history,-0.367807
7,purpose_1,0.263441
8,purpose_2,-0.848353
9,purpose_3,-0.03069


## Interpreting the coefficients

In [104]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,1.249819,3.48971
4,checking_status_4,1.247077,3.480157
11,purpose_5,1.060118,2.886713
15,purpose_9,0.925011,2.521897
25,personal_status_1,0.714158,2.042466
21,employment_3,0.691915,1.997538
31,property_magnitude_1,0.68223,1.978284
38,housing_2,0.652377,1.920099
37,housing_1,0.620297,1.859479
13,purpose_7,0.613827,1.847488


## Testing the Model

In [105]:
reg.score(x_test, y_test)

0.725

In [106]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.55229752, 0.44770248],
       [0.05391955, 0.94608045],
       [0.12716447, 0.87283553],
       [0.05871199, 0.94128801],
       [0.50462419, 0.49537581],
       [0.70381494, 0.29618506],
       [0.49244125, 0.50755875],
       [0.31300997, 0.68699003],
       [0.19062401, 0.80937599],
       [0.46743881, 0.53256119],
       [0.50466879, 0.49533121],
       [0.75319751, 0.24680249],
       [0.50038711, 0.49961289],
       [0.18737356, 0.81262644],
       [0.25712315, 0.74287685],
       [0.13340078, 0.86659922],
       [0.20957345, 0.79042655],
       [0.08539509, 0.91460491],
       [0.13188441, 0.86811559],
       [0.3748397 , 0.6251603 ],
       [0.50556415, 0.49443585],
       [0.24124421, 0.75875579],
       [0.10873774, 0.89126226],
       [0.11815878, 0.88184122],
       [0.0303091 , 0.9696909 ],
       [0.66349619, 0.33650381],
       [0.10265001, 0.89734999],
       [0.72180208, 0.27819792],
       [0.88282605, 0.11717395],
       [0.06859274, 0.93140726],
       [0.

In [107]:
predicted_proba.shape

(200, 2)

In [108]:
predicted_proba[:,1]

array([0.44770248, 0.94608045, 0.87283553, 0.94128801, 0.49537581,
       0.29618506, 0.50755875, 0.68699003, 0.80937599, 0.53256119,
       0.49533121, 0.24680249, 0.49961289, 0.81262644, 0.74287685,
       0.86659922, 0.79042655, 0.91460491, 0.86811559, 0.6251603 ,
       0.49443585, 0.75875579, 0.89126226, 0.88184122, 0.9696909 ,
       0.33650381, 0.89734999, 0.27819792, 0.11717395, 0.93140726,
       0.50933612, 0.87101864, 0.9539903 , 0.68677778, 0.57629216,
       0.95179433, 0.41672763, 0.9513304 , 0.96386606, 0.57699285,
       0.69211315, 0.71554363, 0.65041728, 0.82051878, 0.77678698,
       0.87887911, 0.8319712 , 0.63963837, 0.80119352, 0.60472155,
       0.86784276, 0.6385167 , 0.34433134, 0.54245062, 0.71235032,
       0.93075476, 0.79981555, 0.73065339, 0.83163819, 0.57985889,
       0.93735936, 0.95388606, 0.81483852, 0.40884658, 0.98749584,
       0.96313231, 0.95446788, 0.73844727, 0.12014087, 0.92284439,
       0.98421968, 0.31761363, 0.58821006, 0.90076395, 0.75421

## Saving the model in pickle

In [110]:
import pickle

with open('model', 'wb') as file:
    pickle.dump(reg, file)
    
with open('scaler','wb') as file:
    pickle.dump(credit_scaler, file)