In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Read the raw loan-default table from the project's data directory.
loan_csv_path = '../data/Loan_Default.csv'
df = pd.read_csv(loan_csv_path)

# Summary of every column (numeric and non-numeric) as a first look at the
# value ranges, cardinalities and non-null counts.
df.describe(include="all")

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
count,148670.0,148670.0,145326,148670,147762,148670,148536,148670,148670,148670,...,148670,148670.0,148670,148470,148470,133572.0,148670,148670,148670.0,124549.0
unique,,,2,4,2,3,4,2,2,2,...,4,,2,7,2,,4,2,,
top,,,cf,Male,nopre,type1,p3,l1,nopc,nob/c,...,CIB,,CIB,45-54,to_inst,,North,direct,,
freq,,,135348,42346,124621,113173,55934,142344,148114,127908,...,48152,,74392,34720,95814,,74722,148637,,
mean,99224.5,2019.0,,,,,,,,,...,,699.789103,,,,72.746457,,,0.246445,37.732932
std,42917.476598,0.0,,,,,,,,,...,,115.875857,,,,39.967603,,,0.430942,10.545435
min,24890.0,2019.0,,,,,,,,,...,,500.0,,,,0.967478,,,0.0,5.0
25%,62057.25,2019.0,,,,,,,,,...,,599.0,,,,60.47486,,,0.0,31.0
50%,99224.5,2019.0,,,,,,,,,...,,699.0,,,,75.13587,,,0.0,39.0
75%,136391.75,2019.0,,,,,,,,,...,,800.0,,,,86.184211,,,0.0,45.0


In [9]:
# Target: the "Status" column.
y = df["Status"]

# Features: everything except the target and the "ID" column.
X = df.drop(columns=["Status", "ID"])

In [10]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on y; consider `stratify=y` if the
# class balance must match exactly between train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Partition the training columns into the three groups that get their own
# preprocessing pipeline: numeric, ordinal-categorical and nominal-categorical.
#
# Columns are selected with `select_dtypes` instead of comparing against the
# literal dtype names 'int64' / 'float64' / 'object'. With literal comparisons,
# a column of any other dtype (int32, float32, nullable integers, a dedicated
# string dtype, category, ...) would fall into no group at all and would then
# be dropped by the ColumnTransformer without any warning.

# Categorical columns whose values have a natural order.
ordinal_attribs = ["age", "total_units"]

# Every numeric column, in the original column order.
num_attribs = X_train.select_dtypes(include="number").columns.tolist()

# Every non-numeric column is treated as categorical.
cat_attribs = X_train.select_dtypes(exclude="number").columns.tolist()

# Fail early, with a readable message, if an ordinal column is missing or was
# parsed as numeric; otherwise the error only surfaces when the encoder is fit.
missing_ordinal = [col for col in ordinal_attribs if col not in cat_attribs]
assert not missing_ordinal, f"ordinal columns not found among categorical columns: {missing_ordinal}"

# Categorical columns without an inherent order.
nominal_attribs = [col for col in cat_attribs if col not in ordinal_attribs]

print(num_attribs)
print(ordinal_attribs)
print(nominal_attribs)

['year', 'loan_amount', 'rate_of_interest', 'Interest_rate_spread', 'Upfront_charges', 'term', 'property_value', 'income', 'Credit_Score', 'LTV', 'dtir1']
['age', 'total_units']
['loan_limit', 'Gender', 'approv_in_adv', 'loan_type', 'loan_purpose', 'Credit_Worthiness', 'open_credit', 'business_or_commercial', 'Neg_ammortization', 'interest_only', 'lump_sum_payment', 'construction_type', 'occupancy_type', 'Secured_by', 'credit_type', 'co-applicant_credit_type', 'submission_of_application', 'Region', 'Security_Type']


"age" and "total_units" are ordered categories, so OrdinalEncoder is appropriate
The remaining categorical attributes have no inherent order, so OneHotEncoder is appropriate for them.

In [12]:
# Numeric features: fill missing values with the column median, then
# standardize.
numerical_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler()
)

# Nominal features: fill missing values with the most frequent category, then
# one-hot encode. Categories not seen during fit are ignored at transform time
# (handle_unknown="ignore").
nominal_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)

# Explicit low-to-high ordering of the levels of each ordinal feature, keyed
# by column name.
ordinal_category_orders = {
    "age": ['<25', '25-34', '35-44', '45-54', '55-64', '65-74', '>74'],
    "total_units": ['1U', '2U', '3U', '4U'],
}

# Ordinal features: fill missing values with the most frequent category, then
# map each level to its position in the ordering above.
# OrdinalEncoder matches the `categories` list to the input columns by
# position, so the list is derived from `ordinal_attribs` rather than written
# out separately. A column listed there without a defined ordering raises a
# KeyError here instead of being encoded against the wrong level list.
ordinal_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(categories=[ordinal_category_orders[col] for col in ordinal_attribs])
)

In [13]:
# Route each feature group through its own pipeline and concatenate the
# results. Columns that appear in none of the three lists are dropped
# (ColumnTransformer's default `remainder="drop"`).
#
# `sparse_threshold=0` forces a dense NumPy array as output. OneHotEncoder
# produces a sparse block by default, and with the default threshold (0.3) the
# combined output switches to a SciPy sparse matrix whenever its overall
# density falls below that value. The result is later wrapped in a
# pandas DataFrame, which requires a dense array.
preprocessing = ColumnTransformer(
    [
        ("numerical", numerical_pipeline, num_attribs),
        ("nominal", nominal_pipeline, nominal_attribs),
        ("ordinal", ordinal_pipeline, ordinal_attribs),
    ],
    sparse_threshold=0,
)

In [19]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformation to the test split.
train_matrix = preprocessing.fit_transform(X_train)
test_matrix = preprocessing.transform(X_test)

# Output column names, prefixed with the transformer name
# ("numerical__", "nominal__", "ordinal__").
feature_names = preprocessing.get_feature_names_out()

# Wrap both matrices in DataFrames that keep the original row index, so rows
# can still be aligned with y_train / y_test.
X_train_prepared = pd.DataFrame(train_matrix, index=X_train.index, columns=feature_names)
X_test_prepared = pd.DataFrame(test_matrix, index=X_test.index, columns=feature_names)

In [15]:
# Summary statistics of the transformed training features, used to sanity-check
# the preprocessing (e.g. scaled numeric columns, 0/1 one-hot columns).
X_train_prepared.describe(include="all")

Unnamed: 0,numerical__year,numerical__loan_amount,numerical__rate_of_interest,numerical__Interest_rate_spread,numerical__Upfront_charges,numerical__term,numerical__property_value,numerical__income,numerical__Credit_Score,numerical__LTV,...,nominal__submission_of_application_not_inst,nominal__submission_of_application_to_inst,nominal__Region_North,nominal__Region_North-East,nominal__Region_central,nominal__Region_south,nominal__Security_Type_Indriect,nominal__Security_Type_direct,ordinal__age,ordinal__total_units
count,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,...,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0
mean,0.0,5.705323e-18,-7.305801e-16,8.038233000000001e-17,2.067059e-16,-4.571428e-16,-4.9391370000000006e-17,-5.693375000000001e-17,3.718317e-16,1.65305e-16,...,0.353736,0.646264,0.50211,0.008399,0.058452,0.431039,0.00021,0.99979,3.13783,0.021398
std,0.0,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,...,0.478131,0.478131,0.499998,0.091263,0.234596,0.495224,0.014497,0.014497,1.419215,0.196632
min,0.0,-1.70794,-8.249927,-9.118744,-1.089982,-4.098103,-1.401694,-1.061997,-1.724327,-1.740993,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,-0.7317496,-0.5783249,-0.5534866,-0.6445091,0.425386,-0.587116,-0.4707358,-0.870421,-0.2360485,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0
50%,0.0,-0.1894214,-0.08734241,-0.08713045,-0.1649559,0.425386,-0.2089189,-0.1751051,-0.00788952,0.05062967,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0
75%,0.0,0.5698381,0.4445553,0.4278792,0.2927654,0.425386,0.3147385,0.2129101,0.8632672,0.2831916,...,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,4.0,0.0
max,0.0,17.59894,8.116157,6.564722,20.29271,0.425386,46.60024,88.02446,1.725799,187.4085,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,3.0


In [16]:
# Count of missing values per transformed column; the imputers should have
# left none.
X_train_prepared.isna().sum()

numerical__year                    0
numerical__loan_amount             0
numerical__rate_of_interest        0
numerical__Interest_rate_spread    0
numerical__Upfront_charges         0
                                  ..
nominal__Region_south              0
nominal__Security_Type_Indriect    0
nominal__Security_Type_direct      0
ordinal__age                       0
ordinal__total_units               0
Length: 61, dtype: int64