# IMPORT

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

In [None]:
# Path of the input CSV; passed to load_data() in the next cell.
file_path = 'loan_applications.csv'

# SECTION 1: LOAD DATASET

In [None]:
# ============================
# SECTION 1: LOAD DATASET
# ============================
def load_data(file_path):
    """
    Load dataset from a CSV file and return a pandas DataFrame.

    As a side effect, prints the dataset shape, a preview of the first five
    rows and the ``DataFrame.info()`` summary.

    Args:
        file_path: Location of the CSV file, forwarded to ``pd.read_csv``.

    Returns:
        The loaded DataFrame.
    """
    print("\n[INFO] Loading dataset...")
    df = pd.read_csv(file_path)  # Read the CSV file into a DataFrame
    print("[SUCCESS] Dataset loaded successfully! Shape:", df.shape, "\n")  # Print dataset shape
    print(df.head())  # Display first 5 rows for preview
    print("[INFO] Dataset Info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line afterwards.
    df.info()
    return df
# Load the CSV into the module-level DataFrame that the following cells transform.
df = load_data(file_path)


[INFO] Loading dataset...
[SUCCESS] Dataset loaded successfully! Shape: (50000, 17) 

   application_id application_date  age gender    education     occupation  \
0               1       2020-01-05   40      M       Master  Self-employed   
1               2       2020-05-21   33      M     Bachelor       Salaried   
2               3       2020-09-24   42      F     Bachelor     Unemployed   
3               4       2020-05-10   53      F  High School       Salaried   
4               5       2020-05-15   32      M       Master       Salaried   

   employment_length    income  loan_amount  loan_term  interest_rate  \
0          13.573526  40298.38     95254.57         48          10.95   
1           3.061310  38167.11    140514.31         60          17.12   
2          12.040059   8666.27     13340.61         60          11.89   
3           8.490862  33093.00     73008.42         60          12.89   
4           0.782913  41115.42     51399.26         12           7.58   

   mo

# SECTION 2: HANDLE MISSING VALUES

In [None]:
# ============================
# SECTION 2: HANDLE MISSING VALUES
# ============================
def handle_missing_values(df):
    """
    Fill missing values in the dataset:
    - Numeric columns: fill with mean
    - Categorical columns: fill with most frequent value (mode)

    "Categorical" here means every column that is not of a NumPy numeric
    dtype. The frame is modified in place and also returned, so callers may
    either rebind the result or keep using the object they passed in.

    Args:
        df: DataFrame to impute.

    Returns:
        The same DataFrame with missing values filled where a fill value
        exists (a column that is entirely missing is left untouched).
    """
    print("\n[INFO] Handling missing values...")
    print("[BEFORE] Missing values per column:\n", df.isnull().sum(), "\n")  # Show missing values before processing

    numeric_cols = df.select_dtypes(include=[np.number]).columns  # Get numeric columns
    categorical_cols = df.select_dtypes(exclude=[np.number]).columns  # Get categorical columns

    # Fill missing values in numeric columns with mean
    # (skipped when the frame has no numeric columns at all).
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

    # Fill missing values in categorical columns with most frequent value.
    # mode() yields an empty frame when there are no categorical columns or
    # no non-missing values, and .iloc[0] on it would raise IndexError.
    if len(categorical_cols) > 0:
        modes = df[categorical_cols].mode()
        if not modes.empty:
            df[categorical_cols] = df[categorical_cols].fillna(modes.iloc[0])

    print("[AFTER] Missing values handled! Numeric cols:", len(numeric_cols), "Categorical cols:", len(categorical_cols), "\n")
    print(df.head())  # Display changes in data after handling missing values
    return df
# Impute missing values and rebind df to the returned frame.
df = handle_missing_values(df)


[INFO] Handling missing values...
[BEFORE] Missing values per column:
 application_id          0
application_date        0
age                     0
gender                  0
education               0
occupation              0
employment_length    7160
income               2445
loan_amount             0
loan_term               0
interest_rate           0
monthly_payment         0
credit_score         2578
existing_loans       2513
previous_defaults       0
city                    0
default                 0
dtype: int64 

[AFTER] Missing values handled! Numeric cols: 12 Categorical cols: 5 

   application_id application_date  age gender    education     occupation  \
0               1       2020-01-05   40      M       Master  Self-employed   
1               2       2020-05-21   33      M     Bachelor       Salaried   
2               3       2020-09-24   42      F     Bachelor     Unemployed   
3               4       2020-05-10   53      F  High School       Salaried   
4         

# SECTION 3: ENCODE CATEGORICAL FEATURES

In [None]:
# ============================
# SECTION 3: ENCODE CATEGORICAL FEATURES
# ============================
def encode_categorical_features(df, columns=None, drop_first=True):
    """
    Convert categorical columns into numerical format using One-Hot Encoding.

    With the defaults, ``pd.get_dummies`` encodes every object, string or
    category column. High-cardinality columns (for example dates or IDs
    stored as text) therefore produce one indicator column per distinct
    value; pass ``columns`` to restrict the encoding to the features that
    should really be expanded.

    Args:
        df: DataFrame to encode. It is not modified; a new frame is returned.
        columns: Optional list of column names to encode. ``None`` (default)
            keeps the original behaviour of encoding all categorical columns.
        drop_first: Drop the first level of each encoded column to avoid
            redundant indicators. Defaults to ``True`` as before.

    Returns:
        A new DataFrame with the selected columns replaced by indicator columns.
    """
    print("\n[INFO] Encoding categorical features...")
    print("[BEFORE] Columns:", df.columns.tolist(), "\n")  # Show column names before encoding

    df = pd.get_dummies(df, columns=columns, drop_first=drop_first)  # Apply One-Hot Encoding

    print("[AFTER] Categorical features encoded! New shape:", df.shape, "\n")
    print(df.head())  # Display first rows after encoding
    return df
# One-hot encode the categorical columns and rebind df to the encoded frame.
df = encode_categorical_features(df)


[INFO] Encoding categorical features...
[BEFORE] Columns: ['application_id', 'application_date', 'age', 'gender', 'education', 'occupation', 'employment_length', 'income', 'loan_amount', 'loan_term', 'interest_rate', 'monthly_payment', 'credit_score', 'existing_loans', 'previous_defaults', 'city', 'default'] 

[AFTER] Categorical features encoded! New shape: (50000, 850) 

   application_id  age  employment_length    income  loan_amount  loan_term  \
0               1   40          13.573526  40298.38     95254.57         48   
1               2   33           3.061310  38167.11    140514.31         60   
2               3   42          12.040059   8666.27     13340.61         60   
3               4   53           8.490862  33093.00     73008.42         60   
4               5   32           0.782913  41115.42     51399.26         12   

   interest_rate  monthly_payment  credit_score  existing_loans  ...  \
0          10.95          2459.46         781.0             0.0  ...   
1   

# SECTION 4: SCALE NUMERIC FEATURES

In [None]:
# ============================
# SECTION 4: SCALE NUMERIC FEATURES
# ============================
def scale_features(df, method="standard"):
    """
    Scale numerical features using either StandardScaler or MinMaxScaler.

    Every column with a NumPy numeric dtype is scaled; the frame is modified
    in place and also returned.

    Args:
        df: DataFrame whose numeric columns should be scaled.
        method: ``"standard"`` for StandardScaler or ``"minmax"`` for
            MinMaxScaler.

    Returns:
        The same DataFrame with its numeric columns replaced by scaled values.
        A frame without numeric columns or without rows is returned unchanged.

    Raises:
        ValueError: If ``method`` is not ``"standard"`` or ``"minmax"``.
    """
    # Reject unknown names up front: silently falling back to MinMaxScaler
    # would hide typos such as "Standard" or "min-max".
    if method not in ("standard", "minmax"):
        raise ValueError(
            f"Unknown scaling method {method!r}; expected 'standard' or 'minmax'."
        )

    print("\n[INFO] Scaling numeric features using", method, "scaler...")

    numeric_cols = df.select_dtypes(include=[np.number]).columns  # Get numeric columns

    # The scikit-learn scalers refuse to fit on zero features or zero samples,
    # so there is nothing to do in either case.
    if len(numeric_cols) == 0 or len(df) == 0:
        print("[INFO] No numeric data to scale; dataset returned unchanged.\n")
        return df

    scaler = StandardScaler() if method == "standard" else MinMaxScaler()  # Choose scaler
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])  # Apply scaling

    print("[SUCCESS] Features scaled using", method, "\n")
    print("[INFO] Dataset statistics after scaling:")
    print(df.describe())  # Show statistics after scaling
    return df
df = scale_features(df, method="minmax")  # "minmax" selects MinMaxScaler; use method="standard" for StandardScaler


[INFO] Scaling numeric features using minmax scaler...
[SUCCESS] Features scaled using minmax 

[INFO] Dataset statistics after scaling:
       application_id           age  employment_length        income  \
count    50000.000000  50000.000000       50000.000000  50000.000000   
mean         0.500000      0.273573           0.185371      0.093763   
std          0.288684      0.179577           0.135999      0.064713   
min          0.000000      0.000000           0.000000      0.000000   
25%          0.250000      0.129032           0.102984      0.050893   
50%          0.500000      0.274194           0.185371      0.082457   
75%          0.750000      0.403226           0.233847      0.118188   
max          1.000000      1.000000           1.000000      1.000000   

        loan_amount     loan_term  interest_rate  monthly_payment  \
count  50000.000000  50000.000000   50000.000000     50000.000000   
mean       0.063782      0.500665       0.424507         0.038773   
std   

# SECTION 5: FULL DATA PREPROCESSING PIPELINE

In [None]:
# ============================
# SECTION 5: FULL DATA PREPROCESSING PIPELINE
# ============================
def preprocess_data(file_path, method="standard"):
    """
    Full data preprocessing pipeline:
    1. Load data
    2. Handle missing values
    3. Encode categorical features
    4. Scale numeric features

    Each step prints its own progress report.

    Args:
        file_path: Location of the CSV file, forwarded to ``load_data``.
        method: Scaler name forwarded to ``scale_features``. Defaults to
            ``"standard"``, the value that was previously hard-coded.

    Returns:
        The fully preprocessed DataFrame.
    """
    print("\n====================")
    print("[START] Data Preprocessing Pipeline")
    print("====================\n")

    df = load_data(file_path)  # Load dataset
    df = handle_missing_values(df)  # Handle missing values
    df = encode_categorical_features(df)  # Encode categorical features
    df = scale_features(df, method=method)  # Scale numeric features

    print("====================")
    print("[COMPLETE] Data Preprocessing Finished!")
    print("====================\n")
    return df
# Run the whole pipeline end to end; the CSV is read from disk again by load_data().
processed_df = preprocess_data("loan_applications.csv")


[START] Data Preprocessing Pipeline


[INFO] Loading dataset...
[SUCCESS] Dataset loaded successfully! Shape: (50000, 17) 

   application_id application_date  age gender    education     occupation  \
0               1       2020-01-05   40      M       Master  Self-employed   
1               2       2020-05-21   33      M     Bachelor       Salaried   
2               3       2020-09-24   42      F     Bachelor     Unemployed   
3               4       2020-05-10   53      F  High School       Salaried   
4               5       2020-05-15   32      M       Master       Salaried   

   employment_length    income  loan_amount  loan_term  interest_rate  \
0          13.573526  40298.38     95254.57         48          10.95   
1           3.061310  38167.11    140514.31         60          17.12   
2          12.040059   8666.27     13340.61         60          11.89   
3           8.490862  33093.00     73008.42         60          12.89   
4           0.782913  41115.42     51399.

# SECTION 6: EXAMPLE USAGE

In [None]:
# ============================
# SECTION 6: EXAMPLE USAGE
# ============================

# Run the full preprocessing pipeline on the loan applications CSV:
processed_df = preprocess_data("loan_applications.csv")


[START] Data Preprocessing Pipeline


[INFO] Loading dataset...
[SUCCESS] Dataset loaded successfully! Shape: (50000, 17) 

   application_id application_date  age gender    education     occupation  \
0               1       2020-01-05   40      M       Master  Self-employed   
1               2       2020-05-21   33      M     Bachelor       Salaried   
2               3       2020-09-24   42      F     Bachelor     Unemployed   
3               4       2020-05-10   53      F  High School       Salaried   
4               5       2020-05-15   32      M       Master       Salaried   

   employment_length    income  loan_amount  loan_term  interest_rate  \
0          13.573526  40298.38     95254.57         48          10.95   
1           3.061310  38167.11    140514.31         60          17.12   
2          12.040059   8666.27     13340.61         60          11.89   
3           8.490862  33093.00     73008.42         60          12.89   
4           0.782913  41115.42     51399.