In [1]:
from pathlib import Path

import numpy as np
import pandas as pd


In [2]:
# Load the raw loan data through a project-relative path (same "../data" layout
# used when the processed files are written) so the notebook runs on any checkout,
# not just on one user's machine.
df = pd.read_csv("../data/raw/Loan_Default.csv")
df.head()


Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0
3,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,EXP,587,CIB,45-54,not_inst,69.3769,North,direct,0,42.0
4,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,...,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0,39.0


In [3]:
# Separate the target column from the predictors.
# NOTE(review): 'ID' and 'year' stay in X as predictors — confirm that a row
# identifier is really meant to be fed to the model.
y = df['Status']
X = df.drop(columns=['Status'])


In [4]:
# Partition the predictor columns by dtype: 64-bit numeric columns on one side,
# object (string) columns on the other.
numeric_dtypes = ['int64', 'float64']
num_cols = X.select_dtypes(include=numeric_dtypes).columns
cat_cols = X.select_dtypes(include='object').columns

(num_cols, cat_cols)


(Index(['ID', 'year', 'loan_amount', 'rate_of_interest', 'Interest_rate_spread',
        'Upfront_charges', 'term', 'property_value', 'income', 'Credit_Score',
        'LTV', 'dtir1'],
       dtype='object'),
 Index(['loan_limit', 'Gender', 'approv_in_adv', 'loan_type', 'loan_purpose',
        'Credit_Worthiness', 'open_credit', 'business_or_commercial',
        'Neg_ammortization', 'interest_only', 'lump_sum_payment',
        'construction_type', 'occupancy_type', 'Secured_by', 'total_units',
        'credit_type', 'co-applicant_credit_type', 'age',
        'submission_of_application', 'Region', 'Security_Type'],
       dtype='object'))

In [9]:
# Fill the gaps in every categorical column with that column's most frequent
# value (the first mode when several values tie).
X = X.assign(**{name: X[name].fillna(X[name].mode()[0]) for name in cat_cols})


In [10]:
# Sanity check for the step just completed: only the categorical columns have
# been imputed at this point (numeric imputation happens in the next cell), so
# count the missing values left in those columns. Expected result: 0.
X[cat_cols].isnull().sum().sum()


np.int64(0)

In [11]:
# Replace missing entries in every numeric column with that column's median.
X = X.assign(**{name: X[name].fillna(X[name].median()) for name in num_cols})


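Note: Missing values were imputed with the column mode for categorical features and the column median
for numerical features. Columns were explicitly reassigned rather than filled in place, to avoid
chained-assignment warnings and to remain compatible with future versions of pandas.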


In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


In [13]:
# One-hot encode every categorical column into a dense array; categories that
# were not seen during fitting are ignored instead of raising an error.
categorical_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Columns not listed in cat_cols bypass the encoder and are passed through as-is.
preprocessor = ColumnTransformer(
    [('cat', categorical_encoder, cat_cols)],
    remainder='passthrough',
)


In [14]:
# Fit the ColumnTransformer on X and transform X in the same call.
# NOTE(review): the encoder is fitted on the full dataset, before the train/test
# split performed later in this notebook — consider fitting on the training rows
# only so that no information from the test rows reaches the preprocessing step.
X_encoded = preprocessor.fit_transform(X)


In [15]:
# Wrap the encoded array in a DataFrame labelled with the transformer's output names.
feature_names = preprocessor.get_feature_names_out()
# Reuse X's index so the encoded rows stay label-aligned with y; without it the
# new frame always gets a fresh 0..n-1 index, whatever index X carries.
X_encoded = pd.DataFrame(X_encoded, columns=feature_names, index=X.index)


In [16]:
# Preview the first rows of the encoded frame to check the generated column names.
X_encoded.head()


Unnamed: 0,cat__loan_limit_cf,cat__loan_limit_ncf,cat__Gender_Female,cat__Gender_Joint,cat__Gender_Male,cat__Gender_Sex Not Available,cat__approv_in_adv_nopre,cat__approv_in_adv_pre,cat__loan_type_type1,cat__loan_type_type2,...,remainder__loan_amount,remainder__rate_of_interest,remainder__Interest_rate_spread,remainder__Upfront_charges,remainder__term,remainder__property_value,remainder__income,remainder__Credit_Score,remainder__LTV,remainder__dtir1
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,116500.0,3.99,0.3904,2596.45,360.0,118000.0,1740.0,758.0,98.728814,45.0
1,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,206500.0,3.99,0.3904,2596.45,360.0,418000.0,4980.0,552.0,75.13587,39.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,406500.0,4.56,0.2,595.0,360.0,508000.0,9480.0,834.0,80.019685,46.0
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,456500.0,4.25,0.681,2596.45,360.0,658000.0,11880.0,587.0,69.3769,42.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,696500.0,4.0,0.3042,0.0,360.0,758000.0,10440.0,602.0,91.886544,39.0


### Categorical Feature Encoding

Categorical variables were converted into numerical form using One-Hot Encoding.
This approach avoids introducing artificial ordinal relationships between categories
and produces the purely numeric input that most machine learning models require.
Numerical features (already median-imputed above) were passed through this step unchanged.


In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class balance the same in both partitions; the fixed seed makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, **split_options)


In [19]:
# Class proportions of the target in the training partition.
y_train.value_counts(normalize=True)

Status
0    0.753557
1    0.246443
Name: proportion, dtype: float64

In [20]:
# Class proportions of the target in the test partition, for comparison with the training partition.
y_test.value_counts(normalize=True)

Status
0    0.753548
1    0.246452
Name: proportion, dtype: float64

### Train–Test Split

The dataset was split into training and testing sets using an 80–20 ratio with a fixed random seed.
Stratified sampling on `Status` was applied to preserve the original class distribution
of the target variable (roughly 75% class 0 and 25% class 1 in both sets), ensuring reliable
evaluation on imbalanced data.


In [21]:
# Persist the encoded feature matrix and the target as CSV files.
processed_dir = Path("../data/processed")
# to_csv does not create missing folders, so make sure the output directory
# exists before writing (a no-op when it is already there).
processed_dir.mkdir(parents=True, exist_ok=True)

X_encoded.to_csv(processed_dir / "X_encoded.csv", index=False)
y.to_csv(processed_dir / "y.csv", index=False)
