In [1]:
import pandas as pd

# Load the raw German credit data: fields are ';'-separated and a literal "?"
# is parsed as a missing value.
df = pd.read_csv(
    "German Dataset.csv",
    sep=";",
    na_values="?",
)
# Author's note: no missing values (not verified in this cell).

Numeric variables :
- Duration in month
- Credit amount
- Installment rate
- Present residence since
- cc_age in months%
- Number of existing credits at this bank
- Number of people being liable to provide maintenance for

Ordinal variables :
- Status of bank account (TODO: decide how to rank the "no checking" level)
- Savings
- Employment since
- Job


Other variables :
- Credit History
- Purpose
- Personal status and sex
- Property
- Other debtors / guarantors
- Other installment plans
- Housing
- Telephone (binary)
- Foreign worker (binary)
- Type (binary)

In [2]:
# Show the column labels exactly as read from the file.
# NOTE: 'Other installment plans ' carries a trailing space in the header, so it
# has to be referenced with that space included.
df.columns

Index(['Status of bank account', 'Duration in month', 'Credit History',
       'Purpose', 'Credit amount', 'Savings', 'Employment since',
       'Installment rate', 'Personal status and sex',
       'Other debtors / guarantors', 'Present residence since', 'Property',
       'cc_age in months%', 'Other installment plans ', 'Housing',
       'Number of existing credits at this bank', 'Job',
       'Number of people being liable to provide maintenance for', 'Telephone',
       'foreign worker', 'Type'],
      dtype='object')

In [3]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,Status of bank account,Duration in month,Credit History,Purpose,Credit amount,Savings,Employment since,Installment rate,Personal status and sex,Other debtors / guarantors,...,Property,cc_age in months%,Other installment plans,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,foreign worker,Type
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,bad
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,good
3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,good
4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,bad


In [4]:
# Inspect the dtype pandas inferred for each column (int64 vs. object).
df.dtypes

Status of bank account                                      object
Duration in month                                            int64
Credit History                                              object
Purpose                                                     object
Credit amount                                                int64
Savings                                                     object
Employment since                                            object
Installment rate                                             int64
Personal status and sex                                     object
Other debtors / guarantors                                  object
Present residence since                                      int64
Property                                                    object
cc_age in months%                                            int64
Other installment plans                                     object
Housing                                                     ob

In [5]:
# Start the numeric feature table from the columns that were read as int64.
# The copy keeps later column additions on `numeric_df` from touching `df`.
int_columns = df.select_dtypes(include=["int64"]).columns
numeric_df = df[int_columns].copy()
numeric_df.head()

Unnamed: 0,Duration in month,Credit amount,Installment rate,Present residence since,cc_age in months%,Number of existing credits at this bank,Number of people being liable to provide maintenance for
0,6,1169,4,4,67,2,1
1,48,5951,2,2,22,1,1
2,12,2096,2,3,49,1,2
3,42,7882,2,4,45,1,2
4,24,4870,3,4,53,2,2


## Ordinal variables

In [6]:
# Encode the ordered categorical columns as integer ranks, one mapping per column.
# `Series.map` is used rather than `Series.replace`: a label that is absent from
# its mapping becomes NaN instead of silently staying a string, and the loop
# below turns that into an explicit error. It also avoids relying on `replace`
# down-casting the resulting object column to integers.

# Status of bank account
# NOTE(review): 'no checking' is ranked below '<0' -- confirm this ordering is intended.
status_mapping = {'no checking': 0, '<0': 1, '0<=X<200': 2, '>=200': 3}

# Savings
saving_mapping = {"no known savings": 0, "<100": 1, "100<=X<500": 2, "500<=X<1000": 3, ">=1000": 4}

# Employment since
employment_mapping = {"unemployed": 0, "<1": 1, "1<=X<4": 2, "4<=X<7": 3, ">=7": 4}

# Job
job_mapping = {"unemp/unskilled non res": 0, "unskilled resident": 1, "skilled": 2, "high qualif/self emp/mgmt": 3}

ordinal_mappings = {
    "Status of bank account": status_mapping,
    "Savings": saving_mapping,
    "Employment since": employment_mapping,
    "Job": job_mapping,
}

for column, mapping in ordinal_mappings.items():
    encoded = df[column].map(mapping)
    # Non-null source labels that did not match any key of the mapping.
    unmapped = df.loc[encoded.isna() & df[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped values in {column!r}: {list(unmapped)}")
    numeric_df[column] = encoded

## Binary variables

In [9]:
# Binary variables
# Each two-level column is encoded as 0/1 with `Series.map`; unlike
# `Series.replace`, a label missing from the mapping becomes NaN, which the
# loop below reports as an error instead of leaving a string in the column.

tel_mapping = {'yes': 1, 'none': 0}
fw_mapping = {'yes': 1, 'no': 0}
type_mapping = {'good': 1, 'bad': 0}

binary_mappings = {
    "Telephone": tel_mapping,
    "foreign worker": fw_mapping,
    "Type": type_mapping,
}

for column, mapping in binary_mappings.items():
    encoded = df[column].map(mapping)
    # Non-null source labels that did not match any key of the mapping.
    unmapped = df.loc[encoded.isna() & df[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped values in {column!r}: {list(unmapped)}")
    numeric_df[column] = encoded

# Create gender variable: 1 when the first word of the label is "male", else 0.
numeric_df["Male"] = df["Personal status and sex"].apply(lambda s: int(s.split()[0]=="male"))

## Dummy variables

In [10]:
# Nominal columns to one-hot encode. 'Other installment plans ' is spelled with
# its trailing space because that is how the column is named in the data.
# NOTE(review): "Job" is also encoded ordinally in `numeric_df`, so it ends up in
# the final table twice (one rank column plus its dummies) -- confirm this is wanted.
var_to_dummy = [
    "Credit History",
    "Purpose",
    "Other debtors / guarantors",
    "Property",
    "Other installment plans ",
    "Housing",
    "Job",
]

# dtype=int forces 0/1 integer indicators on every pandas version; without it,
# pandas >= 2.0 produces bool columns, which are written to CSV as True/False.
dummy_df = pd.get_dummies(df[var_to_dummy], dtype=int)

In [11]:
# Display the numeric table, including the encoded columns added to it so far.
numeric_df

Unnamed: 0,Duration in month,Credit amount,Installment rate,Present residence since,cc_age in months%,Number of existing credits at this bank,Number of people being liable to provide maintenance for,Status of bank account,Savings,Employment since,Job,Telephone,foreign worker,Type,Male
0,6,1169,4,4,67,2,1,1,0,4,2,1,1,1,1
1,48,5951,2,2,22,1,1,2,1,2,2,0,1,0,0
2,12,2096,2,3,49,1,2,0,1,3,1,0,1,1,1
3,42,7882,2,4,45,1,2,1,1,3,2,0,1,1,1
4,24,4870,3,4,53,2,2,1,1,2,2,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,12,1736,3,4,31,1,1,0,1,3,1,0,1,1,0
996,30,3857,4,4,40,1,1,1,1,2,3,1,1,1,1
997,12,804,4,4,38,1,1,0,1,4,2,0,1,1,1
998,45,1845,4,4,23,1,1,1,1,2,2,1,1,0,1


In [12]:
# Put the numeric/encoded columns and the one-hot columns side by side,
# keeping only row labels present in both frames.
frames = [numeric_df, dummy_df]
result = pd.concat(frames, join="inner", axis=1)
result.head()

Unnamed: 0,Duration in month,Credit amount,Installment rate,Present residence since,cc_age in months%,Number of existing credits at this bank,Number of people being liable to provide maintenance for,Status of bank account,Savings,Employment since,...,Other installment plans _bank,Other installment plans _none,Other installment plans _stores,Housing_for free,Housing_own,Housing_rent,Job_high qualif/self emp/mgmt,Job_skilled,Job_unemp/unskilled non res,Job_unskilled resident
0,6,1169,4,4,67,2,1,1,0,4,...,0,1,0,0,1,0,0,1,0,0
1,48,5951,2,2,22,1,1,2,1,2,...,0,1,0,0,1,0,0,1,0,0
2,12,2096,2,3,49,1,2,0,1,3,...,0,1,0,0,1,0,0,0,0,1
3,42,7882,2,4,45,1,2,1,1,3,...,0,1,0,1,0,0,0,1,0,0
4,24,4870,3,4,53,2,2,1,1,2,...,0,1,0,1,0,0,0,1,0,0


In [13]:
# Move the sensitive attribute ("Male") to the last column position.
col_list = result.columns.tolist()
col_list.remove("Male")
col = [*col_list, "Male"]
result = result.loc[:, col]

In [14]:
# Write the encoded table to disk as comma-separated values. The row index is
# written too (pandas default), as the first, unnamed column of the file.
result.to_csv("german_numeric.csv", sep=",")

In [15]:
# Read the saved file back, restoring the first column as the index
# (presumably a sanity check of the round trip).
# NOTE(review): this rebinds `df`, so the raw frame is no longer available;
# cells that read the original categorical columns (e.g. "Personal status and
# sex") would raise KeyError if re-run after this one.
df = pd.read_csv("german_numeric.csv", sep=",", index_col=0)
df.head()

Unnamed: 0,Duration in month,Credit amount,Installment rate,Present residence since,cc_age in months%,Number of existing credits at this bank,Number of people being liable to provide maintenance for,Status of bank account,Savings,Employment since,...,Other installment plans _none,Other installment plans _stores,Housing_for free,Housing_own,Housing_rent,Job_high qualif/self emp/mgmt,Job_skilled,Job_unemp/unskilled non res,Job_unskilled resident,Male
0,6,1169,4,4,67,2,1,1,0,4,...,1,0,0,1,0,0,1,0,0,1
1,48,5951,2,2,22,1,1,2,1,2,...,1,0,0,1,0,0,1,0,0,0
2,12,2096,2,3,49,1,2,0,1,3,...,1,0,0,1,0,0,0,0,1,1
3,42,7882,2,4,45,1,2,1,1,3,...,1,0,1,0,0,0,1,0,0,1
4,24,4870,3,4,53,2,2,1,1,2,...,1,0,1,0,0,0,1,0,0,1


In [16]:
import numpy as np

In [17]:
# Reload the encoded table. Without index_col the saved row index comes back as
# the first data column, so it is skipped positionally when building X.
df = pd.read_csv("german_numeric.csv")
y = df["Type"].to_numpy()
X = df.iloc[:, 1:].drop(columns="Type").to_numpy()

In [18]:
# Display the feature matrix (target column "Type" excluded).
X

array([[   6, 1169,    4, ...,    0,    0,    1],
       [  48, 5951,    2, ...,    0,    0,    0],
       [  12, 2096,    2, ...,    0,    1,    1],
       ...,
       [  12,  804,    4, ...,    0,    0,    1],
       [  45, 1845,    4, ...,    0,    0,    1],
       [  45, 4576,    3, ...,    0,    0,    1]])

In [19]:
# Display the last feature column, i.e. the "Male" indicator that was moved to
# the end of the table before it was saved.
X[:,-1]

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0,