In [1]:
from warnings import filterwarnings

# Ignore future warnings returned by ydata-profiling during report generation.
filterwarnings("ignore", category=FutureWarning)

In [2]:
import pandas as pd

# Read the raw CSV files; the first column of each file becomes the row index.
training_set = pd.read_csv("dataset/raw/train_ctrUa4K.csv", index_col=0)
testing_set = pd.read_csv("dataset/raw/test_lAUu6dG.csv", index_col=0)

# The last training column is the target vector; every column before it
# belongs to the features matrix.
X_train, y_train = training_set.iloc[:, :-1], training_set.iloc[:, -1]

# For consistency with X_train, keep an independent (deep) copy of the
# testing set under the name X_test.
X_test = testing_set.copy()

In [3]:
from ydata_profiling import ProfileReport

# Build one exploratory profile per split of the feature data.
training_set_profile = ProfileReport(
    X_train,
    title="Training Set Profile",
    explorative=True,
)
testing_set_profile = ProfileReport(
    X_test,
    title="Testing Set Profile",
    explorative=True,
)

# Compare the two profiles. The comparison is the cell's last expression,
# so the notebook displays it.
training_set_profile.compare(testing_set_profile)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [4]:
# Group the dataset's attributes by statistical data type. Each list is used
# later to route its columns to the matching preprocessing pipeline.

# Numeric attributes
ATTRS_NUMERIC = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"]

# Ordinal attributes
ATTRS_ORDINAL = ["Dependents", "Property_Area"]

# Nominal (categorical) attributes
ATTRS_CATEGORICAL = ["Gender", "Married", "Education", "Self_Employed", "Credit_History"]

# Target attribute
ATTRS_TARGET = ["Loan_Status"]

In [6]:
# Build the preprocessing pipeline for the Feature Matrix

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, FunctionTransformer

import numpy as np

# Pre-processing pipeline for the numerical attributes
numerical_pipeline = Pipeline(
    [
        # Step-1: Drop one feature out of the group with high multi-collinearity;
        #         every other column is passed through untouched.
        (
            "Column_Dropper",
            ColumnTransformer(
                [("column_dropper", "drop", ["ApplicantIncome"])],
                remainder="passthrough",
            ),
        ),
        # Step-2: Apply a log(1 + x) transformation to normalize skewed data.
        ("Log_Transformer", FunctionTransformer(func=np.log1p, feature_names_out="one-to-one")),
        # Step-3: Scale the numerical features to normalize them.
        ("Standard_Scaler", StandardScaler()),
        # Step-4: Impute missing values using KNN.
        ("KNN_Imputer", KNNImputer()),
    ]
)

# Pre-processing pipeline for the ordinal attributes
ordinal_pipeline = Pipeline(
    [
        # Step-1: Fill missing values with each column's most frequent value.
        ("Simple_Imputer", SimpleImputer(strategy="most_frequent")),
        # Step-2: Encode the ordinal columns as integer codes.
        # NOTE(review): no explicit `categories` are given, so the code order is
        # whatever OrdinalEncoder infers from the training data -- confirm it
        # matches the intended ranking of each ordinal attribute.
        ("Ordinal_Encoder", OrdinalEncoder()),
    ]
)

# Pre-processing pipeline for the categorical (nominal) attributes
categorical_pipeline = Pipeline(
    [
        # Step-1: Fill missing values with each column's most frequent value.
        ("Simple_Imputer", SimpleImputer(strategy="most_frequent")),
        # Step-2: One-hot encode the categories, dropping the first level of
        #         every feature.
        ("Onehot_Encoder", OneHotEncoder(drop="first")),
    ]
)

# Combine the per-type pipelines: each attribute group is routed to the
# pipeline built for its data type.
preprocessing_pipeline_combiner = ColumnTransformer(
    [
        ("Numerical_Data_Pipeline", numerical_pipeline, ATTRS_NUMERIC),
        ("Ordinal_Data_Pipeline", ordinal_pipeline, ATTRS_ORDINAL),
        ("Categorical_Data_Pipeline", categorical_pipeline, ATTRS_CATEGORICAL),
    ]
)

# End-to-end pipeline for the feature matrix
features_pipeline = Pipeline(
    [
        # Step-1: Preprocess each attribute group with its own pipeline.
        ("Merged_Preprocessing_Pipeline", preprocessing_pipeline_combiner),
        # Step-2: Impute missing values using KNN Imputer. Every branch of the
        #         merged pipeline already has an imputation step of its own, so
        #         this runs on the merged output as a final pass.
        ("KNN_Imputer", KNNImputer()),
    ]
)

# Fit the final data pipeline on the feature matrix only. `training_set` still
# carries the target column, which does not belong in the schema the feature
# pipeline learns (the testing set has no such column).
features_pipeline.fit(X_train)

In [7]:
# Output column names of the fitted pipeline, shared by both frames below.
preprocessed_feature_names = features_pipeline.get_feature_names_out()

# Generate final preprocessed training data. The transformed array carries no
# labels, so the original row index is re-attached through the constructor.
training_set_features_preprocessed = pd.DataFrame(
    features_pipeline.transform(X_train),
    index=X_train.index,
    columns=preprocessed_feature_names,
)

# Generate final preprocessed testing data, built exactly the same way.
testing_set_features_preprocessed = pd.DataFrame(
    features_pipeline.transform(X_test),
    index=X_test.index,
    columns=preprocessed_feature_names,
)

In [8]:
# Profile the preprocessed training features to check whether preprocessing
# introduced any new data-quality issues.
final_data_report = ProfileReport(training_set_features_preprocessed)
# Show the report inside the notebook.
final_data_report.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
# Preprocess the target vector (y_train) with a LabelEncoder
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Wrap the label-encoded values in a Series that keeps y_train's index and name.
training_set_target_preprocessed = pd.Series(
    label_encoder.fit_transform(y_train),
    index=y_train.index,
    name=y_train.name,
)

In [10]:
# Side-by-side preview of the target before and after label-encoding.
pd.DataFrame(
    {
        "Before Label-Encoding": y_train,
        "After Label-Encoding": training_set_target_preprocessed,
    }
).head(n=5)

Unnamed: 0_level_0,Before Label-Encoding,After Label-Encoding
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
LP001002,Y,1
LP001003,N,0
LP001005,Y,1
LP001006,Y,1
LP001008,Y,1


In [11]:
from pathlib import Path

# Make sure the output directory exists before writing: to_csv does not create
# missing parent directories, so a fresh checkout would otherwise fail here.
FEATURE_ENGINEERING_DIR = Path("dataset/feature-engineering")
FEATURE_ENGINEERING_DIR.mkdir(parents=True, exist_ok=True)

# Export the preprocessed training data into csv files
training_set_features_preprocessed.to_csv(FEATURE_ENGINEERING_DIR / "X_train.csv")
training_set_target_preprocessed.to_csv(FEATURE_ENGINEERING_DIR / "y_train.csv")

# Export the preprocessed testing data into a csv file
testing_set_features_preprocessed.to_csv(FEATURE_ENGINEERING_DIR / "X_test.csv")