In [1]:
import numpy as np
import pandas as pd
from src.preprocess import CompositePipeline, OnehotEncoder, StdNormalization

## TODO
* Data loading
    * $\checkmark$ Downloading the data
    * $\checkmark$ Reading the data into memory
* Data preprocessing
    * One-hot encoding
    * Normalization for numeric features

### Loading the dataset

In [2]:
# Load the German credit dataset from a CSV file; the path is relative to the
# notebook's working directory.
data = pd.read_csv("dataset/germancredit.csv")

In [3]:
# Preview the first rows of the raw frame as a sanity check on the load.
data.head()

Unnamed: 0,status.of.existing.checking.account,duration.in.month,credit.history,purpose,credit.amount,savings.account.and.bonds,present.employment.since,installment.rate.in.percentage.of.disposable.income,personal.status.and.sex,other.debtors.or.guarantors,...,property,age.in.years,other.installment.plans,housing,number.of.existing.credits.at.this.bank,job,number.of.people.being.liable.to.provide.maintenance.for,telephone,foreign.worker,creditability
0,... < 0 DM,6,critical account/ other credits existing (not ...,radio/television,1169,unknown/ no savings account,... >= 7 years,4,male : divorced/separated,none,...,real estate,67,none,own,2,skilled employee / official,1,"yes, registered under the customers name",yes,good
1,0 <= ... < 200 DM,48,existing credits paid back duly till now,radio/television,5951,... < 100 DM,1 <= ... < 4 years,2,male : divorced/separated,none,...,real estate,22,none,own,1,skilled employee / official,1,none,yes,bad
2,no checking account,12,critical account/ other credits existing (not ...,education,2096,... < 100 DM,4 <= ... < 7 years,2,male : divorced/separated,none,...,real estate,49,none,own,1,unskilled - resident,2,none,yes,good
3,... < 0 DM,42,existing credits paid back duly till now,furniture/equipment,7882,... < 100 DM,4 <= ... < 7 years,2,male : divorced/separated,guarantor,...,building society savings agreement/ life insur...,45,none,for free,1,skilled employee / official,2,none,yes,good
4,... < 0 DM,24,delay in paying off in the past,car (new),4870,... < 100 DM,1 <= ... < 4 years,3,male : divorced/separated,none,...,unknown / no property,53,none,for free,2,skilled employee / official,2,none,yes,bad


In [4]:
# List every column name; the categorical/numeric split in the next cell is
# written against these names.
data.columns

Index(['status.of.existing.checking.account', 'duration.in.month',
       'credit.history', 'purpose', 'credit.amount',
       'savings.account.and.bonds', 'present.employment.since',
       'installment.rate.in.percentage.of.disposable.income',
       'personal.status.and.sex', 'other.debtors.or.guarantors',
       'present.residence.since', 'property', 'age.in.years',
       'other.installment.plans', 'housing',
       'number.of.existing.credits.at.this.bank', 'job',
       'number.of.people.being.liable.to.provide.maintenance.for', 'telephone',
       'foreign.worker', 'creditability'],
      dtype='object')

In [5]:
# Columns handed to the one-hot encoder. The order is kept exactly as originally
# written, since it may influence the order of the generated dummy columns.
# NOTE(review): `present.residence.since` is treated as categorical here; its
# values are not visible in the previews — confirm this is intended rather than
# an integer feature that belongs in `num_cols`.
cat_cols = [
    "status.of.existing.checking.account",
    "credit.history",
    "purpose",
    "savings.account.and.bonds",
    "present.employment.since",
    "personal.status.and.sex",
    "other.debtors.or.guarantors",
    "present.residence.since",
    "property",
    "other.installment.plans",
    "housing",
    "job",
    "telephone",
    "foreign.worker",
]

# Columns handed to the standard normalizer.
# `creditability` appears in neither list, so it is not passed to either step.
num_cols = [
    "duration.in.month",
    "credit.amount",
    "installment.rate.in.percentage.of.disposable.income",
    "age.in.years",
    "number.of.existing.credits.at.this.bank",
    "number.of.people.being.liable.to.provide.maintenance.for",
]

In [6]:
# Count missing values per column to decide whether imputation is needed
# before encoding and normalization.
data.isna().sum()

status.of.existing.checking.account                         0
duration.in.month                                           0
credit.history                                              0
purpose                                                     0
credit.amount                                               0
savings.account.and.bonds                                   0
present.employment.since                                    0
installment.rate.in.percentage.of.disposable.income         0
personal.status.and.sex                                     0
other.debtors.or.guarantors                                 0
present.residence.since                                     0
property                                                    0
age.in.years                                                0
other.installment.plans                                     0
housing                                                     0
number.of.existing.credits.at.this.bank                     0
job     

### One-hot encoding and normalization

In [7]:
# Build the preprocessing pipeline from two steps: an encoder over `cat_cols`
# and a normalizer over `num_cols`.
# NOTE(review): the three classes come from src.preprocess, which is not
# visible here — presumably the steps run in list order and columns in neither
# list (e.g. `creditability`) pass through unchanged; confirm against the module.
pipeline = CompositePipeline([
    OnehotEncoder(cat_cols),
    StdNormalization(num_cols)
])

In [8]:
# Show the frame once more before it is transformed, for side-by-side
# comparison with the preview taken after the pipeline runs.
data.head()

Unnamed: 0,status.of.existing.checking.account,duration.in.month,credit.history,purpose,credit.amount,savings.account.and.bonds,present.employment.since,installment.rate.in.percentage.of.disposable.income,personal.status.and.sex,other.debtors.or.guarantors,...,property,age.in.years,other.installment.plans,housing,number.of.existing.credits.at.this.bank,job,number.of.people.being.liable.to.provide.maintenance.for,telephone,foreign.worker,creditability
0,... < 0 DM,6,critical account/ other credits existing (not ...,radio/television,1169,unknown/ no savings account,... >= 7 years,4,male : divorced/separated,none,...,real estate,67,none,own,2,skilled employee / official,1,"yes, registered under the customers name",yes,good
1,0 <= ... < 200 DM,48,existing credits paid back duly till now,radio/television,5951,... < 100 DM,1 <= ... < 4 years,2,male : divorced/separated,none,...,real estate,22,none,own,1,skilled employee / official,1,none,yes,bad
2,no checking account,12,critical account/ other credits existing (not ...,education,2096,... < 100 DM,4 <= ... < 7 years,2,male : divorced/separated,none,...,real estate,49,none,own,1,unskilled - resident,2,none,yes,good
3,... < 0 DM,42,existing credits paid back duly till now,furniture/equipment,7882,... < 100 DM,4 <= ... < 7 years,2,male : divorced/separated,guarantor,...,building society savings agreement/ life insur...,45,none,for free,1,skilled employee / official,2,none,yes,good
4,... < 0 DM,24,delay in paying off in the past,car (new),4870,... < 100 DM,1 <= ... < 4 years,3,male : divorced/separated,none,...,unknown / no property,53,none,for free,2,skilled employee / official,2,none,yes,bad


In [9]:
# Apply the preprocessing pipeline and rebind `data` to the result.
# NOTE: this overwrites the raw frame, so the cell is not idempotent —
# re-running it would feed already-transformed data back into the pipeline.
# Re-run the loading cell first if this cell must be executed again.
data = pipeline(data)

In [10]:
# Preview the transformed frame to check the encoded and normalized columns.
data.head()

Unnamed: 0,foreign.worker_yes,"telephone_yes, registered under the customers name",job_skilled employee / official,job_unemployed/ unskilled - non-resident,job_unskilled - resident,housing_own,housing_rent,other.installment.plans_none,other.installment.plans_stores,"property_car or other, not in attribute Savings account/bonds",...,status.of.existing.checking.account_... >= 200 DM / salary assignments for at least 1 year,status.of.existing.checking.account_0 <= ... < 200 DM,status.of.existing.checking.account_no checking account,duration.in.month,credit.amount,installment.rate.in.percentage.of.disposable.income,age.in.years,number.of.existing.credits.at.this.bank,number.of.people.being.liable.to.provide.maintenance.for,creditability
0,1,1,1,0,0,1,0,1,0,0,...,0,0,0,-1.235859,-0.744759,0.918018,2.765073,1.026565,-0.428075,good
1,1,0,1,0,0,1,0,1,0,0,...,0,1,0,2.24707,0.949342,-0.869748,-1.190808,-0.704573,-0.428075,bad
2,1,0,0,0,1,1,0,1,0,0,...,0,0,1,-0.738298,-0.416354,-0.869748,1.182721,-0.704573,2.333701,good
3,1,0,1,0,0,0,0,1,0,0,...,0,0,0,1.749509,1.63343,-0.869748,0.831087,-0.704573,2.333701,good
4,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0.256825,0.56638,0.024135,1.534354,1.026565,2.333701,bad


In [11]:
# Total number of columns after preprocessing.
len(data.columns)

51

### Normalization of numerical features