Working at a bank when we receive a loan application, we need to make sure that if we give the money, the customer will be able to pay it back. Every application carries a risk of default — the failure to return the money. We’d like to minimize this risk: before agreeing to give a loan, we want to score the
customer and assess the chances of default. If it’s too high, we reject the application.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
data = pd.read_csv('CreditScoring.csv')

In [3]:
data.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [4]:
data.columns = data.columns.str.lower()
data.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [5]:
#Some columns are categorical(status, home, marital, records, job) 
#Others are numerical(seniority, time, age, expenses, income, assets, debt, amount, price
#Map the categorical values from current values to strings

status_values = {1: 'ok', 2: 'Default', 0: 'unknown'}
data.status = data.status.map(status_values)
home_values = {
    1: "rent",
    2: "owner",
    3: "private",
    4: "ignore",
    5: "parents",
    6: "other",
    0: "unknown"
}
data.home = data.home.map(home_values)
marital_values = {
    1: "single",
    2: "married",
    3: "widow",
    4: "separated",
    5: "divorced",
    6: "unknown"
}
data.marital = data.marital.map(marital_values)
records_values = {
    1: "no",
    2: "yes",
    0: "unknown"
}
data['records'] = data['records'].map(records_values)
job_values = {
    1: "fixed",
    2: "parttime",
    3: "freelance",
    4: "others",
    0: "unknown"
}
data['job'] = data['job'].replace(job_values)
data.head()


Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,Default,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


In [6]:
data.status

0            ok
1            ok
2       Default
3            ok
4            ok
         ...   
4450    Default
4451         ok
4452    Default
4453         ok
4454         ok
Name: status, Length: 4455, dtype: object

In [7]:
data.describe(include = "all").round()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
count,4455,4455.0,4455,4455.0,4455.0,4454,4455,4455,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
unique,3,,7,,,5,2,5,,,,,,
top,ok,,owner,,,married,no,fixed,,,,,,
freq,3200,,2107,,,3241,3682,2806,,,,,,
mean,,8.0,,46.0,37.0,,,,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,,8.0,,15.0,11.0,,,,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,,0.0,,6.0,18.0,,,,35.0,0.0,0.0,0.0,100.0,105.0
25%,,2.0,,36.0,28.0,,,,35.0,80.0,0.0,0.0,700.0,1118.0
50%,,5.0,,48.0,36.0,,,,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,,12.0,,60.0,45.0,,,,72.0,166.0,6000.0,0.0,1300.0,1692.0


In [8]:
for col in ['income', 'assets', 'debt']:
    data[col] = data[col].replace(to_replace=99999999, value=np.nan)

In [9]:
data.describe(include="all")

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
count,4455,4455.0,4455,4455.0,4455.0,4454,4455,4455,4455.0,4421.0,4408.0,4437.0,4455.0,4455.0
unique,3,,7,,,5,2,5,,,,,,
top,ok,,owner,,,married,no,fixed,,,,,,
freq,3200,,2107,,,3241,3682,2806,,,,,,
mean,,7.987205,,46.441751,37.077666,,,,55.568799,130.568197,5403.433984,342.948614,1039.021773,1462.875645
std,,8.173444,,14.655225,10.984856,,,,19.515878,86.367434,11573.161523,1245.861736,474.543007,628.089913
min,,0.0,,6.0,18.0,,,,35.0,0.0,0.0,0.0,100.0,105.0
25%,,2.0,,36.0,28.0,,,,35.0,80.0,0.0,0.0,700.0,1117.5
50%,,5.0,,48.0,36.0,,,,51.0,120.0,3000.0,0.0,1000.0,1400.0
75%,,12.0,,60.0,45.0,,,,72.0,165.0,6000.0,0.0,1300.0,1692.0


In [10]:
data.status.value_counts()

status
ok         3200
Default    1254
unknown       1
Name: count, dtype: int64

In [11]:
#remove the unknown row
data = data[data.status != 'unk']

In [12]:
from sklearn.model_selection import train_test_split
data_train_val, data_test = train_test_split(data, test_size = 0.2, random_state = 11)
data_train, data_val = train_test_split(data_train_val, test_size = 0.15, random_state = 11)

In [13]:
len(data_train), len(data_val), len(data_test)

(3029, 535, 891)

In [14]:
y_train = (data_train.status == 'default').values
y_val = (data_train.status == 'default').values
y_test = (data_test.status == 'default').values

del data_train['status']
del data_val['status']
del data_test['status']
#Fill empty values with 0
data_train = data_train.fillna(0)
data_val = data_val.fillna(0)
data_test = data_test.fillna(0)

In [16]:
#Apply encoding
#Convert to dictionaries first
dict_train = data_train.to_dict(orient = "records")
dict_val = data_val.to_dict(orient = "records")
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
x_train = dv.fit_transform(dict_train)
x_val = dv.fit_transform(dict_val)