# `2.2 Pre-processing`

1. Categorical encoding
1. Numerical scaling
1. Test spliting
    - Class balancing

## 2.2.0 Libraries and constants

In [1]:
# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Paths
SELECTION_CSV_PATH  = 'data/processed-data/2-fayaad-1-selection.csv'
COLUMNS_PATH        = 'data/metadata/columns.csv'

In [3]:
# Constants
SELECTION_DF = pd.read_csv(SELECTION_CSV_PATH)
COLUMNS      = pd.read_csv(COLUMNS_PATH)
DTYPES       = pd.read_csv(COLUMNS_PATH, index_col=0).dtype

In [4]:
# Dtypes
german_credit_data: pd.DataFrame = SELECTION_DF.copy()

for column in german_credit_data.columns:
    german_credit_data[column] = german_credit_data[column].astype(DTYPES[column])

## 2.2.1 Categorical encoding

- Categorical columns (OH = One Hot encoding, Or = Ordinal encoding)
        - [OH] A3 - Credit history
        - [OH] A4 - Purpose
    - [Or] A6 - Savings account/bonds
    - [Or] A7 - Present employment since
        - [OH] A9 - Personal status and sex
    - [Or] A11 - Present residence since
        - [OH] A12 - Property
        - [OH] A15 - Housing

In [5]:
# Categorical encoding arrays with descriptions
ordinal_columns = [
                   'A6 - Savings account/bonds',
                   'A7 - Present employment since',
                   'A11 - Present residence since',]

onehot_columns = ['A3 - Credit history',
                  'A4 - Purpose',
                  'A9 - Personal status and sex',
                  'A12 - Property',
                  'A15 - Housing',]

In [6]:
german_credit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                                                          Non-Null Count  Dtype   
---  ------                                                          --------------  -----   
 0   A2 - Duration in month                                          1000 non-null   int64   
 1   A3 - Credit history                                             1000 non-null   category
 2   A4 - Purpose                                                    988 non-null    category
 3   A5 - Credit amount                                              1000 non-null   int64   
 4   A6 - Savings account/bonds                                      817 non-null    category
 5   A7 - Present employment since                                   1000 non-null   category
 6   A9 - Personal status and sex                                    1000 non-null   category
 7   A11 - Present residence since              

In [8]:
# Ordinal encoding
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
german_credit_data[ordinal_columns] = ordinal_encoder.fit_transform(german_credit_data[ordinal_columns])

# print mapping for each column
for col, cats in zip(ordinal_columns, ordinal_encoder.categories_):
    mapping = {cat: int(i) for i, cat in enumerate(cats)}
    print(f"Ordinal mapping for {col}: {mapping}")

Ordinal mapping for A6 - Savings account/bonds: {'A61': 0, 'A62': 1, 'A63': 2, 'A64': 3, nan: 4}
Ordinal mapping for A7 - Present employment since: {'A71': 0, 'A72': 1, 'A73': 2, 'A74': 3, 'A75': 4}
Ordinal mapping for A11 - Present residence since: {np.int64(1): 0, np.int64(2): 1, np.int64(3): 2, np.int64(4): 3}


In [None]:
german_credit_data.info()

In [None]:
# One-hot encoding

## 2.2.2 Numerical scaling

In [None]:
# Numerical scaling

## 2.2.3 Test spliting

In [None]:
# Test spliting

### Class balancing

In [None]:
# Class balancing