In [28]:
import sys
import os
sys.path.append(os.path.abspath(".."))

from src import utils as u
from copy import deepcopy
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd

In [29]:
# Locations of the interim pickles: one (features, target) pair per split.
X_TRAIN_PATH, y_TRAIN_PATH = (
    "../data/interim/X_train.pkl",
    "../data/interim/y_train.pkl",
)
X_VALID_PATH, y_VALID_PATH = (
    "../data/interim/X_valid.pkl",
    "../data/interim/y_valid.pkl",
)
X_TEST_PATH, y_TEST_PATH = (
    "../data/interim/X_test.pkl",
    "../data/interim/y_test.pkl",
)

# Deserialize each split with the project helper (features first, then target).
X_train, y_train = u.deserialize_data(X_TRAIN_PATH), u.deserialize_data(y_TRAIN_PATH)
X_valid, y_valid = u.deserialize_data(X_VALID_PATH), u.deserialize_data(y_VALID_PATH)
X_test, y_test = u.deserialize_data(X_TEST_PATH), u.deserialize_data(y_TEST_PATH)

In [30]:
# Show the (rows, columns) shape of every feature split.
for label, frame in (("Train:", X_train), ("Valid:", X_valid), ("Test:", X_test)):
    print(label, frame.shape)

Train: (26064, 11)
Valid: (3258, 11)
Test: (3259, 11)


In [31]:
def drop_duplicate_data(X, y):
    """
    Remove duplicated rows from X and keep y aligned with the surviving rows.

    A row is a duplicate when all of its feature values repeat an earlier row
    (default behaviour of ``DataFrame.duplicated``: the first occurrence is
    kept). The target is NOT taken into account when detecting duplicates.
    The inputs are left untouched; new objects are returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature dataset (train, valid, or test) to remove duplicates from.

    y : pandas.Series
        Target variable corresponding to dataset X.

    Returns
    -------
    X : pandas.DataFrame
        Feature dataset without duplicated rows.

    y : pandas.Series
        Target values of the rows that were kept, in the same order as X.

    Raises
    ------
    TypeError
        If X is not a pandas DataFrame or y is not a pandas Series.
    """

    # Parameter validation
    if not isinstance(X, pd.DataFrame):
        raise TypeError("Parameter X must be a pandas DataFrame.")

    if not isinstance(y, pd.Series):
        raise TypeError("Parameter y must be a pandas Series.")

    print("Fungsi drop_duplicate_data: parameter telah divalidasi.")

    # Shape before dropping
    print(f"Fungsi drop_duplicate_data: shape dataset sebelum dropping duplicate adalah {X.shape}.")

    # Positional boolean mask of the rows to keep. A plain ndarray is used so
    # that later selections never go through index-label alignment.
    keep_mask = (~X.duplicated()).to_numpy()
    n_columns = X.shape[1]
    n_duplicates = int((~keep_mask).sum())

    duplicate_shape = (n_duplicates, n_columns)
    print(f"Fungsi drop_duplicate_data: shape dari data yang duplicate adalah {duplicate_shape}.")

    # Predicted shape after dropping duplicates
    expected_shape = (X.shape[0] - n_duplicates, n_columns)
    print(f"Fungsi drop_duplicate_data: shape dataset setelah drop duplicate seharusnya adalah {expected_shape}.")

    # Decide how to align y BEFORE X is filtered.
    # When X and y share the exact same index, the positional mask is reused:
    # a label lookup (y.loc[X.index]) would return every row matching a
    # repeated label and leave y longer than X if the index is not unique.
    # Otherwise fall back to the label lookup, which supports a y that is
    # ordered differently or carries extra labels.
    shares_index = X.index.equals(y.index)

    # Drop duplicates (returns a new object, the caller's frame is not mutated)
    X = X.loc[keep_mask].copy()

    # Align y with cleaned X
    if shares_index:
        y = y.loc[keep_mask].copy()
    else:
        y = y.loc[X.index].copy()

    print(f"Fungsi drop_duplicate_data: shape dataset setelah dropping duplicate adalah {X.shape}.")

    return X, y

In [32]:
# De-duplicate the training split; y_train is re-aligned to the surviving rows
# by the same call. Only the train split is de-duplicated in the visible cells.
X_train, y_train = drop_duplicate_data(X_train, y_train)

Fungsi drop_duplicate_data: parameter telah divalidasi.
Fungsi drop_duplicate_data: shape dataset sebelum dropping duplicate adalah (26064, 11).
Fungsi drop_duplicate_data: shape dari data yang duplicate adalah (96, 11).
Fungsi drop_duplicate_data: shape dataset setelah drop duplicate seharusnya adalah (25968, 11).
Fungsi drop_duplicate_data: shape dataset setelah dropping duplicate adalah (25968, 11).


In [33]:
from copy import deepcopy

def median_imputation(data, subset_data, fit):
    """
    Fit or apply median imputation on selected columns.

    The mode is selected with ``fit``:

    * ``fit=True``  -> compute the median of every column listed in
      ``subset_data`` and return them. ``data`` is not modified.
    * ``fit=False`` -> return a new DataFrame in which missing values are
      filled with the per-column values given in ``subset_data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset (train, test, or validation) to fit on or to impute.

    subset_data : list or dict
        - If fit=True  -> list of column names to calculate the median from.
        - If fit=False -> dict of {column_name: median_value}.

    fit : bool
        - True  -> calculate median values (fitting stage).
        - False -> perform imputation using previously calculated medians.

    Returns
    -------
    dict
        ``{column_name: median}`` when ``fit`` is true.
    pandas.DataFrame
        Imputed copy of ``data`` when ``fit`` is false.

    Raises
    ------
    RuntimeError
        If ``data`` is not a DataFrame, if ``subset_data`` does not have the
        type required by the selected mode, or if ``fit`` is neither True
        nor False.
    """

    # Parameter validation
    # Validation 1: data must be DataFrame
    if not isinstance(data, pd.DataFrame):
        raise RuntimeError(
            "Fungsi median_imputation: parameter data haruslah bertipe DataFrame!"
        )

    # Equality (not identity) is kept on purpose so values the previous
    # implementation accepted (1/0, numpy booleans) keep working.
    # Validation 2: fit == True
    if fit == True:
        if not isinstance(subset_data, list):
            raise RuntimeError(
                "Fungsi median_imputation: untuk nilai parameter fit = True, subset_data harus bertipe list dan berisi daftar nama kolom yang ingin dicari nilai mediannya guna menjadi data imputasi pada kolom tersebut."
            )

    # Validation 3: fit == False
    elif fit == False:
        if not isinstance(subset_data, dict):
            raise RuntimeError(
                "Fungsi median_imputation: untuk nilai parameter fit = False, subset_data harus bertipe dict dan berisi key yang merupakan nama kolom beserta value yang merupakan nilai median dari kolom tersebut."
            )

    # Validation 4: fit must be boolean
    else:
        # Previously raised RecursionError, which has nothing to do with
        # argument validation; RuntimeError matches the other checks.
        raise RuntimeError(
            "Fungsi median_imputation: parameter fit haruslah bertipe boolean, bernilai True atau False."
        )

    print("Fungsi median_imputation: parameter telah divalidasi.")

    # Fitting stage: read-only, so neither the frame nor the list is copied.
    if fit == True:
        imputation_data = {
            column: data[column].median()
            for column in subset_data
        }

        print(
            f"Fungsi median_imputation: proses fitting telah selesai, berikut hasilnya {imputation_data}."
        )

        return imputation_data

    # Imputation stage
    print("Fungsi median_imputation: informasi count na sebelum dilakukan imputasi:")
    print(data.isna().sum())
    print("")

    # fillna returns a new frame, so the caller's DataFrame is left untouched.
    data = data.fillna(subset_data)

    print("Fungsi median_imputation: informasi count na setelah dilakukan imputasi:")
    print(data.isna().sum())
    print("")

    return data

In [34]:
# Count missing values per column to find the columns that need imputation
X_train.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              734
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 2491
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [35]:
# Columns to impute: the two columns that contain missing values in X_train.
# Passed as a list because median_imputation(fit=True) expects column names.
subset_data = ["person_emp_length", "loan_int_rate"]

In [36]:
# Fit stage: learn the per-column medians from X_train only.
# NOTE(review): this rebinds `subset_data` from a list of column names to a
# {column: median} dict. Re-running this cell on its own therefore raises
# RuntimeError (fit=True requires a list); re-run the previous cell first.
subset_data = median_imputation(X_train, subset_data, True)

Fungsi median_imputation: parameter telah divalidasi.
Fungsi median_imputation: proses fitting telah selesai, berikut hasilnya {'person_emp_length': np.float64(4.0), 'loan_int_rate': np.float64(10.99)}.


In [37]:
# Fill the missing values of X_train with the medians fitted on X_train
X_train = median_imputation(X_train, subset_data, False)

Fungsi median_imputation: parameter telah divalidasi.
Fungsi median_imputation: informasi count na sebelum dilakukan imputasi:
person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              734
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 2491
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

Fungsi median_imputation: informasi count na setelah dilakukan imputasi:
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64



In [38]:
# Fill the missing values of X_test with the medians fitted on X_train
# (the test split is never used to compute the medians)
X_test = median_imputation(X_test, subset_data, False)

Fungsi median_imputation: parameter telah divalidasi.
Fungsi median_imputation: informasi count na sebelum dilakukan imputasi:
person_age                      0
person_income                   0
person_home_ownership           0
person_emp_length              77
loan_intent                     0
loan_grade                      0
loan_amnt                       0
loan_int_rate                 303
loan_percent_income             0
cb_person_default_on_file       0
cb_person_cred_hist_length      0
dtype: int64

Fungsi median_imputation: informasi count na setelah dilakukan imputasi:
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64



In [39]:
# Fill the missing values of X_valid with the medians fitted on X_train
# (the validation split is never used to compute the medians)
X_valid = median_imputation(X_valid, subset_data, False)

Fungsi median_imputation: parameter telah divalidasi.
Fungsi median_imputation: informasi count na sebelum dilakukan imputasi:
person_age                      0
person_income                   0
person_home_ownership           0
person_emp_length              80
loan_intent                     0
loan_grade                      0
loan_amnt                       0
loan_int_rate                 312
loan_percent_income             0
cb_person_default_on_file       0
cb_person_cred_hist_length      0
dtype: int64

Fungsi median_imputation: informasi count na setelah dilakukan imputasi:
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64



In [40]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

def create_onehot_encoder(categories, path):
    """
    Fit a OneHotEncoder on a list of category values and persist it to disk.

    The values are reshaped into a single-column array, the encoder is fitted
    on that column, written to ``path`` through ``u.serialize_data`` and the
    learned categories are printed.

    Parameters
    ----------
    categories : list
        One-dimensional list of the categorical values the encoder must learn.

    path : str
        File path on disk where the fitted encoder is saved.

    Returns
    -------
    ohe : OneHotEncoder
        The fitted encoder (dense output, unknown categories ignored at
        transform time).

    Raises
    ------
    RuntimeError
        If `categories` is not of type list.
        If `path` is not of type str.
    """

    # Parameter validation, checked in order: categories first, then path
    checks = (
        (
            categories,
            list,
            "Fungsi create_onehot_encoder: parameter categories haruslah bertipe list, berisi kategori yang akan dibuat encodernya.",
        ),
        (
            path,
            str,
            "Fungsi create_onehot_encoder: parameter path haruslah bertipe string, berisi lokasi pada disk komputer dimana encoder akan disimpan.",
        ),
    )
    for value, expected_type, message in checks:
        if not isinstance(value, expected_type):
            raise RuntimeError(message)

    # The encoder expects a 2-D input: one row per category, a single column
    column_vector = np.reshape(np.array(categories), (-1, 1))

    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoder.fit(column_vector)

    # Persist the fitted encoder before reporting what it learned
    u.serialize_data(encoder, path)

    learned_categories = encoder.categories_[0].tolist()
    print(f"Kategori yang telah dipelajari adalah {learned_categories}")

    return encoder
         

In [41]:
# Collect the distinct values of every categorical column in X_train
(
    person_home_ownership,
    loan_intent,
    loan_grade,
    cb_person_default_on_file,
) = (
    X_train[column].unique().tolist()
    for column in (
        "person_home_ownership",
        "loan_intent",
        "loan_grade",
        "cb_person_default_on_file",
    )
)

In [42]:
# Fit one encoder per categorical column and save each under ../models
ohe_home_ownership = create_onehot_encoder(
    categories=person_home_ownership,
    path="../models/ohe_home_ownership.pkl",
)
ohe_loan_intent = create_onehot_encoder(
    categories=loan_intent,
    path="../models/ohe_loan_intent.pkl",
)
ohe_loan_grade = create_onehot_encoder(
    categories=loan_grade,
    path="../models/ohe_loan_grade.pkl",
)
ohe_default_on_file = create_onehot_encoder(
    categories=cb_person_default_on_file,
    path="../models/ohe_default_on_file.pkl",
)

Kategori yang telah dipelajari adalah ['MORTGAGE', 'OTHER', 'OWN', 'RENT']
Kategori yang telah dipelajari adalah ['DEBTCONSOLIDATION', 'EDUCATION', 'HOMEIMPROVEMENT', 'MEDICAL', 'PERSONAL', 'VENTURE']
Kategori yang telah dipelajari adalah ['A', 'B', 'C', 'D', 'E', 'F', 'G']
Kategori yang telah dipelajari adalah ['N', 'Y']


In [45]:
def ohe_tranform(dataset, subset, prefix, ohe):
    """
    One-hot encode a single categorical column with a previously fitted encoder.

    The encoded columns are named ``<prefix>_<category>`` (one per category
    learned by the encoder), appended to the right of the dataset, and the
    original categorical column is removed. The input DataFrame is not
    modified; a new DataFrame is returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        The dataset to be transformed.

    subset : str
        The name of the categorical column in `dataset` that will be encoded.

    prefix : str
        A string prefix that will be prepended to each generated encoded column name.

    ohe : OneHotEncoder
        A previously fitted OneHotEncoder object used to transform the data.
        Only its first (single) feature's categories are used for naming.

    Returns
    -------
    dataset : pandas.DataFrame
        A new DataFrame containing the encoded columns appended and the original categorical column removed.

    Raises
    ------
    RuntimeError
        If `dataset` is not a pandas DataFrame.
        If `ohe` is not an instance of OneHotEncoder.
        If `prefix` is not of type str.
        If `subset` is not of type str.
        If `subset` is not found among the columns of `dataset`.
    """

    # Parameter validation
    if not isinstance(dataset, pd.DataFrame):
        raise RuntimeError("Fungsi ohe_transform: parameter dataset harus bertipe DataFrame!")

    if not isinstance(ohe, OneHotEncoder):
        raise RuntimeError("Fungsi ohe_transform: parameter ohe harus bertipe OneHotEncoder!")

    if not isinstance(prefix, str):
        raise RuntimeError("Fungsi ohe_transform: parameter prefix harus bertipe str!")

    if not isinstance(subset, str):
        raise RuntimeError("Fungsi ohe_transform: parameter subset harus bertipe str!")

    # Explicit membership test instead of a bare ``except:`` around
    # ``list.index``, which would also have swallowed unrelated errors
    # (including KeyboardInterrupt).
    if subset not in dataset.columns:
        raise RuntimeError(
            "Fungsi ohe_transform: parameter subset string namun data tidak ditemukan dalam daftar kolom yang terdapat pada parameter dataset."
        )

    print("Fungsi ohe_transform: parameter telah divalidasi.")

    print(f"Fungsi ohe_transform: daftar nama kolom sebelum dilakukan pengkodean adalah {list(dataset.columns)}.")

    # Create new column names, one per category learned by the encoder
    col_names = [
        f"{prefix}_{col_name}"
        for col_name in ohe.categories_[0].tolist()
    ]

    # Transform.
    # An encoder fitted on a plain array has no ``feature_names_in_``; feeding
    # it a DataFrame makes scikit-learn emit a feature-name UserWarning on
    # every call. Pass an array in that case, and keep the DataFrame only when
    # the encoder was fitted with feature names.
    features = dataset[[subset]]
    if not hasattr(ohe, "feature_names_in_"):
        features = features.to_numpy()

    encoded = pd.DataFrame(
        ohe.transform(features),
        columns=col_names,
        index=dataset.index,
    )

    # Drop the original column and append the encoded ones. Both operations
    # return new objects, so the caller's DataFrame is left untouched.
    dataset = pd.concat([dataset.drop(columns=[subset]), encoded], axis=1)

    print(f"Fungsi ohe_transform: daftar nama kolom setelah dilakukan pengkodean adalah {list(dataset.columns)}.")

    return dataset

    

In [46]:
# X_train: encode each categorical column with its fitted encoder, in order
for column, prefix, encoder in (
    ("person_home_ownership", "home_ownership", ohe_home_ownership),
    ("loan_intent", "loan_intent", ohe_loan_intent),
    ("loan_grade", "loan_grade", ohe_loan_grade),
    ("cb_person_default_on_file", "default_onfile", ohe_default_on_file),
):
    X_train = ohe_tranform(X_train, column, prefix, encoder)

Fungsi ohe_transform: parameter telah divalidasi.
Fungsi ohe_transform: daftar nama kolom sebelum dilakukan pengkodean adalah ['person_age', 'person_income', 'person_home_ownership', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length'].




Fungsi ohe_transform: daftar nama kolom setelah dilakukan pengkodean adalah ['person_age', 'person_income', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length', 'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT'].
Fungsi ohe_transform: parameter telah divalidasi.
Fungsi ohe_transform: daftar nama kolom sebelum dilakukan pengkodean adalah ['person_age', 'person_income', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length', 'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT'].
Fungsi ohe_transform: daftar nama kolom setelah dilakukan pengkodean adalah ['person_age', 'person_income', 'person_emp_length', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_o



In [47]:
# X_test: encode each categorical column with its fitted encoder, in order
for column, prefix, encoder in (
    ("person_home_ownership", "home_ownership", ohe_home_ownership),
    ("loan_intent", "loan_intent", ohe_loan_intent),
    ("loan_grade", "loan_grade", ohe_loan_grade),
    ("cb_person_default_on_file", "default_onfile", ohe_default_on_file),
):
    X_test = ohe_tranform(X_test, column, prefix, encoder)

Fungsi ohe_transform: parameter telah divalidasi.
Fungsi ohe_transform: daftar nama kolom sebelum dilakukan pengkodean adalah ['person_age', 'person_income', 'person_home_ownership', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length'].
Fungsi ohe_transform: daftar nama kolom setelah dilakukan pengkodean adalah ['person_age', 'person_income', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length', 'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT'].
Fungsi ohe_transform: parameter telah divalidasi.
Fungsi ohe_transform: daftar nama kolom sebelum dilakukan pengkodean adalah ['person_age', 'person_income', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file



In [48]:
# X_valid: encode each categorical column with its fitted encoder, in order
for column, prefix, encoder in (
    ("person_home_ownership", "home_ownership", ohe_home_ownership),
    ("loan_intent", "loan_intent", ohe_loan_intent),
    ("loan_grade", "loan_grade", ohe_loan_grade),
    ("cb_person_default_on_file", "default_onfile", ohe_default_on_file),
):
    X_valid = ohe_tranform(X_valid, column, prefix, encoder)

Fungsi ohe_transform: parameter telah divalidasi.
Fungsi ohe_transform: daftar nama kolom sebelum dilakukan pengkodean adalah ['person_age', 'person_income', 'person_home_ownership', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length'].
Fungsi ohe_transform: daftar nama kolom setelah dilakukan pengkodean adalah ['person_age', 'person_income', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length', 'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT'].
Fungsi ohe_transform: parameter telah divalidasi.
Fungsi ohe_transform: daftar nama kolom sebelum dilakukan pengkodean adalah ['person_age', 'person_income', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file



In [49]:
# Serialize the preprocessed feature splits: X_train, X_test, X_valid
for frame, target_path in (
    (X_train, "../data/processed/X_train_prep.pkl"),
    (X_test, "../data/processed/X_test_prep.pkl"),
    (X_valid, "../data/processed/X_valid_prep.pkl"),
):
    u.serialize_data(frame, target_path)



In [None]:
# Serialize y_train, which was re-aligned when duplicates were dropped from X_train.
# NOTE(review): y_valid / y_test are not written to data/processed in the
# visible cells; confirm downstream code reads them from data/interim.
u.serialize_data(y_train, "../data/processed/y_train_prep.pkl")