In [1]:
import pandas as pd

# Column labels for the credit data set. They are passed to the reader as
# `names`, so pandas treats every line of the file as data (no header row).
# The last column, 'repaid', is the response that later cells remap to 0/1.
features = [
    'checking account balance', 'duration', 'credit history',
    'purpose', 'amount', 'savings', 'employment', 'installment',
    'marital status', 'other debtors', 'residence time',
    'property', 'age', 'other installments', 'housing', 'credits',
    'job', 'persons', 'phone', 'foreign', 'repaid',
]

# The file is whitespace-separated. `sep=r"\s+"` is the documented replacement
# for `delim_whitespace=True`, which is deprecated since pandas 2.2 and emits a
# FutureWarning; the regex separator behaves the same on older versions too.
data_raw = pd.read_csv(
    "../../data/credit/german.data",
    sep=r"\s+",
    names=features,
)

# Quick sanity check: column dtypes and non-null counts of the raw frame.
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   checking account balance  1000 non-null   object
 1   duration                  1000 non-null   int64 
 2   credit history            1000 non-null   object
 3   purpose                   1000 non-null   object
 4   amount                    1000 non-null   int64 
 5   savings                   1000 non-null   object
 6   employment                1000 non-null   object
 7   installment               1000 non-null   int64 
 8   marital status            1000 non-null   object
 9   other debtors             1000 non-null   object
 10  residence time            1000 non-null   int64 
 11  property                  1000 non-null   object
 12  age                       1000 non-null   int64 
 13  other installments        1000 non-null   object
 14  housing                  

## Transforming the data set to a usable state

In [2]:
# Columns that are already numeric and are carried over unchanged
# (everything else is dummy-encoded in a later cell).
numeric_variables = ['duration', 'age', 'residence time', 'installment',
                     'amount', 'persons', 'credits']

# Take an explicit copy: adding a column to a bare column-selection of
# `data_raw` triggers pandas' SettingWithCopyWarning, because the selection
# may be a view of the original frame. With `.copy()` the new column is
# guaranteed to land in an independent frame and `data_raw` stays untouched.
data = data_raw[numeric_variables].copy()

# Mapping the response to 0 and 1: raw code 1 -> 1 and raw code 2 -> 0.
# Any value other than 1 or 2 would become NaN under `.map`.
# NOTE(review): presumably raw code 1 means "repaid" - confirm against the
# data set documentation.
data["repaid"] = data_raw["repaid"].map({1: 1, 2: 0})

In [3]:
# Create dummy variables for all the categorical variables
not_dummy_names = numeric_variables + ["repaid"]

# Despite its name, `dummy_names` is a boolean mask aligned with `features`
# (and therefore with the columns of `data_raw`): True marks a column that
# must be dummy-encoded, False marks a numeric column or the response.
dummy_names = [x not in not_dummy_names for x in features]

# drop_first=True drops the first level of every categorical column, so k
# levels produce k-1 indicator columns. The dtype is pinned to uint8 so the
# result is the same numeric type on every pandas version (pandas >= 2.0
# would otherwise produce bool indicators).
dummies = pd.get_dummies(data_raw.iloc[:, dummy_names],
                         drop_first=True,
                         dtype="uint8")

# Re-select the non-dummy columns before joining so that this cell can be
# re-run safely: joining onto a `data` that already holds the dummy columns
# would raise because of the overlapping column names.
data = data[not_dummy_names].join(dummies)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 49 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   duration                      1000 non-null   int64
 1   age                           1000 non-null   int64
 2   residence time                1000 non-null   int64
 3   installment                   1000 non-null   int64
 4   amount                        1000 non-null   int64
 5   persons                       1000 non-null   int64
 6   credits                       1000 non-null   int64
 7   repaid                        1000 non-null   int64
 8   checking account balance_A12  1000 non-null   uint8
 9   checking account balance_A13  1000 non-null   uint8
 10  checking account balance_A14  1000 non-null   uint8
 11  credit history_A31            1000 non-null   uint8
 12  credit history_A32            1000 non-null   uint8
 13  credit history_A33            1000