# `2.2 Pre-processing`

1. Test splitting
1. (X) Features
    - 2.1. Categorical imputing
    - 2.2. Categorical encoding
        - a) Ordinal
        - b) One-Hot
1. (y) Target
    - 3.1 Label encoding

## 2.2.0 Libraries and constants

In [1]:
# Libraries
import pandas                as pd
import numpy                 as np
import matplotlib.pyplot     as plt

from utils.TestSize          import TestSize

# 2.2.1 Test split
from sklearn.model_selection import train_test_split

# 2.2.2 Categorical encoding
from sklearn.impute          import SimpleImputer
from sklearn.preprocessing   import OrdinalEncoder
from sklearn.preprocessing   import OneHotEncoder

# 2.2.3 Label encoding
from sklearn.preprocessing   import LabelEncoder

# Input locations: output of the selection step and the column metadata table
SELECTION_CSV_PATH = 'data/processed-data/2-fayaad-1-selection.csv'
COLUMNS_PATH = 'data/metadata/columns.csv'

# Data loaded once at notebook start
SELECTION_DF = pd.read_csv(SELECTION_CSV_PATH)
COLUMNS = pd.read_csv(COLUMNS_PATH)
# Same metadata file, this time indexed by its first column.
# NOTE(review): `.dtype` is attribute-style access; a DataFrame only defines
# `dtypes`, so this presumably selects a column named `dtype` - confirm
# against the metadata file.
DTYPES = pd.read_csv(COLUMNS_PATH, index_col=0).dtype

# Seed shared by every train/test split below
RANDOM_STATE = 42

# Name of the column to predict
TARGET_COL = 'T1 - Is good credit'

# Working copy of the selection data, with every column cast to the dtype
# that DTYPES lists for it (the loaded SELECTION_DF itself is left untouched)
german_credit_data: pd.DataFrame = SELECTION_DF.astype(
    {column: DTYPES[column] for column in SELECTION_DF.columns}
)

## 2.2.1 Test splitting

In [2]:
# Separate the target column (y) from the remaining feature columns (X)
y = german_credit_data[TARGET_COL]
X = german_credit_data.drop(columns=[TARGET_COL])

# Two stratified splits of the same X / y. They differ only in the test
# size taken from TestSize (t30 vs t10) and share RANDOM_STATE; stratifying
# on y keeps the target distribution the same in train and test.
X_train_30, X_test_30, y_train_30, y_test_30 = train_test_split(
    X, y,
    test_size=TestSize.t30.value,
    random_state=RANDOM_STATE,
    stratify=y,
)

X_train_10, X_test_10, y_train_10, y_test_10 = train_test_split(
    X, y,
    test_size=TestSize.t10.value,
    random_state=RANDOM_STATE,
    stratify=y,
)

In [3]:
# Group split sets
class DatasetSplitSets:
    """Mutable container holding the four parts of one train/test split.

    The pre-processing steps in this notebook receive an instance and
    update its attributes as they go, so the attributes always hold the
    latest version of each part.

    Attributes:
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.
    """

    def __init__(self, train, test, y_train, y_test):
        """Store the split parts.

        Args:
            train: Training features, stored as ``X_train``.
            test: Test features, stored as ``X_test``.
            y_train: Training target.
            y_test: Test target.
        """
        self.X_train = train
        self.X_test  = test
        self.y_train = y_train
        self.y_test  = y_test

    def __repr__(self):
        """Return a short summary showing the shape of each part.

        Parts without a ``shape`` attribute are reported as ``None`` so that
        displaying an instance never fails.
        """
        summary = ", ".join(
            f"{name}={getattr(getattr(self, name), 'shape', None)}"
            for name in ("X_train", "X_test", "y_train", "y_test")
        )
        return f"{type(self).__name__}({summary})"

In [4]:
# One container per split; the steps below update these objects
processed_30 = DatasetSplitSets(train=X_train_30,
                                test=X_test_30,
                                y_train=y_train_30,
                                y_test=y_test_30)
processed_10 = DatasetSplitSets(train=X_train_10,
                                test=X_test_10,
                                y_train=y_train_10,
                                y_test=y_test_10)

# Every pre-processing step iterates over this list
split_sets = [processed_30, processed_10]

## 2.2.2 Categorical encoding

- Categorical columns (OH = One Hot encoding, Or = Ordinal encoding)

|              Ordinal            |           One-Hot            |
| ------------------------------- | ---------------------------- |
|                                 | A3 - Credit history          |
|                                 | A4 - Purpose                 |
| A6 - Savings account/bonds      |                              |
| A7 - Present employment since   |                              |
|                                 | A9 - Personal status and sex |
| A11 - Present residence since   |                              |
|                                 | A12 - Property               |
|                                 | A15 - Housing                |
| A19 - Telephone                 |                              |
| A20 - Foreign worker            |                              |

In [5]:
# Columns passed to the ordinal encoder
ORDINAL_COLUMNS = [
    'A6 - Savings account/bonds',
    'A7 - Present employment since',
    'A11 - Present residence since',
    'A19 - Telephone',
    'A20 - Foreign worker',
]

# Columns passed to the one-hot encoder
ONEHOT_COLUMNS = [
    'A3 - Credit history',
    'A4 - Purpose',
    'A9 - Personal status and sex',
    'A12 - Property',
    'A15 - Housing',
]

# Columns whose missing values are filled by the imputer
COLUMN_WITH_NANS = [
    'A6 - Savings account/bonds',
]

### a) Categorical imputing

In [6]:
# Constant-value imputer: every missing entry is replaced with the code 'A60'
# (used below on the A6 savings column, the only one listed as having NaNs)
imputer = SimpleImputer(
    strategy='constant',
    fill_value='A60',
)

In [7]:
def impute_missing_values(split_set: DatasetSplitSets,
                          columns_with_nans: list[str],
                          imputer: SimpleImputer):
    """Fill missing values in the given feature columns of one split.

    The imputer is fitted on the training features only and then applied to
    the test features, so nothing is learned from the test rows.

    Args:
        split_set: Split whose ``X_train`` and ``X_test`` are updated.
        columns_with_nans: Names of the columns to impute.
        imputer: Imputer to fit and apply; it is re-fitted on every call.

    Returns:
        None. The selected columns of ``split_set.X_train`` and
        ``split_set.X_test`` are overwritten in place.
    """
    train_features = split_set.X_train
    test_features = split_set.X_test

    # Training rows: learn the fill values and apply them
    train_features[columns_with_nans] = imputer.fit_transform(
        train_features[columns_with_nans]
    )

    # Test rows: only apply what was learned from the training rows
    test_features[columns_with_nans] = imputer.transform(
        test_features[columns_with_nans]
    )


In [8]:
# Impute every split separately; each call re-fits the shared imputer on
# that split's own training features
for split_set in split_sets:
    impute_missing_values(split_set, COLUMN_WITH_NANS, imputer)

### b) Ordinal encoding

In [9]:
# Instantiate an OrdinalEncoder; categories it did not see while fitting
# are encoded as -1 instead of raising an error
ordinal_encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

In [10]:
def encode_ordinal_features(split_set: DatasetSplitSets,
                            ordinal_columns: list[str],
                            encoder=ordinal_encoder):
    """Replace the given feature columns of one split with ordinal codes.

    The encoder is fitted on the training features only and then applied to
    the test features.

    Args:
        split_set: Split whose ``X_train`` and ``X_test`` are updated.
        ordinal_columns: Names of the columns to encode.
        encoder: Encoder to fit and apply; defaults to the notebook-level
            ``ordinal_encoder``. It is re-fitted on every call.

    Returns:
        None. The selected columns of ``split_set.X_train`` and
        ``split_set.X_test`` are overwritten in place.
    """
    train_features = split_set.X_train
    test_features = split_set.X_test

    # Training rows: learn the category -> code mapping and apply it
    train_features[ordinal_columns] = encoder.fit_transform(
        train_features[ordinal_columns]
    )

    # Test rows: reuse the mapping learned from the training rows
    test_features[ordinal_columns] = encoder.transform(
        test_features[ordinal_columns]
    )

In [11]:
# Ordinal-encode every split separately; each call re-fits the shared
# encoder on that split's own training features
for split_set in split_sets:
    encode_ordinal_features(split_set, ORDINAL_COLUMNS, ordinal_encoder)

### c) One-hot encoding

In [12]:
# Instantiate a OneHotEncoder that IGNORES categories it did not see while
# fitting and returns a dense (non-sparse) array
onehot_encoder = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
)

In [13]:
def encode_onehot_features(split_set: DatasetSplitSets,
                           onehot_columns: list[str],
                           encoder=onehot_encoder):
    """Swap the given feature columns of one split for one-hot columns.

    The encoder is fitted on the training features only. In both the
    training and the test features the original columns are dropped and the
    encoded columns, named by ``encoder.get_feature_names_out``, are
    appended on the right with the original row index preserved.

    Args:
        split_set: Split whose ``X_train`` and ``X_test`` are replaced.
        onehot_columns: Names of the columns to encode.
        encoder: Encoder to fit and apply; defaults to the notebook-level
            ``onehot_encoder``. It is re-fitted on every call. Its output is
            passed straight to ``pd.DataFrame`` (the default encoder is
            created with ``sparse_output=False``).

    Returns:
        None. ``split_set.X_train`` and ``split_set.X_test`` are rebound to
        new DataFrames.
    """
    # Learn the categories, and the resulting column names, from train only
    encoder.fit(split_set.X_train[onehot_columns])
    indicator_names = encoder.get_feature_names_out(onehot_columns)

    def with_indicator_columns(features):
        # Encode the selected columns, keeping the row index for alignment
        indicators = pd.DataFrame(encoder.transform(features[onehot_columns]),
                                  columns=indicator_names,
                                  index=features.index)
        # Remaining original columns first, encoded columns after them
        remaining = features.drop(columns=onehot_columns)
        return pd.concat([remaining, indicators], axis=1)

    # Build both results before touching split_set, so a failure on either
    # side leaves the split unchanged
    encoded_train = with_indicator_columns(split_set.X_train)
    encoded_test = with_indicator_columns(split_set.X_test)

    split_set.X_train = encoded_train
    split_set.X_test = encoded_test


In [15]:
# One-hot encode every split separately; each call re-fits the shared
# encoder on that split's own training features
for split_set in split_sets:
    encode_onehot_features(split_set, ONEHOT_COLUMNS, onehot_encoder)

## 2.2.3 Label encoding

In [17]:
# Instantiate a LabelEncoder, used below as the default encoder for the
# target (y) of every split
label_encoder = LabelEncoder()

In [18]:
def encode_bool_label_features(split_set: DatasetSplitSets,
                               encoder=label_encoder):
    """Encode the target of one split with a label encoder.

    The encoder is fitted on the training target only and then applied to
    the test target.

    Args:
        split_set: Split whose ``y_train`` and ``y_test`` are replaced.
        encoder: Encoder to fit and apply; defaults to the notebook-level
            ``label_encoder``. It is re-fitted on every call.

    Returns:
        None. ``split_set.y_train`` and ``split_set.y_test`` are rebound to
        the encoder's output.
    """
    # Training target: learn the classes and encode them
    train_codes = encoder.fit_transform(split_set.y_train)
    split_set.y_train = train_codes

    # Test target: reuse the classes learned from the training target
    test_codes = encoder.transform(split_set.y_test)
    split_set.y_test = test_codes

In [19]:
# Label-encode the target of every split separately; each call re-fits the
# shared encoder on that split's own training target
for split_set in split_sets:
    encode_bool_label_features(split_set, label_encoder)

## SAVE

In [27]:
## Test = 30%
# Destination files for this split
TRAIN_PROCESSED_PATH = 'data/processed-data/2-fayaad-2-train_30_processed.csv'
TEST_PROCESSED_PATH  = 'data/processed-data/2-fayaad-2-test_30_processed.csv'

# Wrap the encoded targets in Series that reuse the row index of the matching
# feature frame, so the concat below lines them up row for row
y_train_df = pd.Series(processed_30.y_train,
                       index=processed_30.X_train.index,
                       name=TARGET_COL)
y_test_df = pd.Series(processed_30.y_test,
                      index=processed_30.X_test.index,
                      name=TARGET_COL)

# Features followed by the target as the last column
train_df = pd.concat([processed_30.X_train, y_train_df], axis=1)
test_df = pd.concat([processed_30.X_test, y_test_df], axis=1)

# Write both sets as CSV; the row index is not written
train_df.to_csv(TRAIN_PROCESSED_PATH, index=False)
test_df.to_csv(TEST_PROCESSED_PATH, index=False)

# Report where the files went and how large each set is
print(f"Train set at: {TRAIN_PROCESSED_PATH}")
print(f"Test set at: {TEST_PROCESSED_PATH}")
print("\nTrain dimensions:", train_df.shape)
print("Test dimensions:", test_df.shape)

Train set at: data/processed-data/2-fayaad-2-train_30_processed.csv
Test set at: data/processed-data/2-fayaad-2-test_30_processed.csv

Train dimensions: (700, 37)
Test dimensions: (300, 37)


In [28]:
## Test = 10%
# Destination files for this split
TRAIN_PROCESSED_PATH = 'data/processed-data/2-fayaad-2-train_10_processed.csv'
TEST_PROCESSED_PATH  = 'data/processed-data/2-fayaad-2-test_10_processed.csv'

# Wrap the encoded targets in Series that reuse the row index of the matching
# feature frame, so the concat below lines them up row for row
y_train_df = pd.Series(processed_10.y_train,
                       index=processed_10.X_train.index,
                       name=TARGET_COL)
y_test_df = pd.Series(processed_10.y_test,
                      index=processed_10.X_test.index,
                      name=TARGET_COL)

# Features followed by the target as the last column
train_df = pd.concat([processed_10.X_train, y_train_df], axis=1)
test_df = pd.concat([processed_10.X_test, y_test_df], axis=1)

# Write both sets as CSV; the row index is not written
train_df.to_csv(TRAIN_PROCESSED_PATH, index=False)
test_df.to_csv(TEST_PROCESSED_PATH, index=False)

# Report where the files went and how large each set is
print(f"Train set at: {TRAIN_PROCESSED_PATH}")
print(f"Test set at: {TEST_PROCESSED_PATH}")
print("\nTrain dimensions:", train_df.shape)
print("Test dimensions:", test_df.shape)

Train set at: data/processed-data/2-fayaad-2-train_10_processed.csv
Test set at: data/processed-data/2-fayaad-2-test_10_processed.csv

Train dimensions: (900, 37)
Test dimensions: (100, 37)


# END