# ML Models

In [1]:
import pandas as pd # type:ignore
from sklearn.model_selection import train_test_split, GridSearchCV # type:ignore
from sklearn.linear_model import LogisticRegression, RidgeClassifier # type:ignore
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, RandomForestClassifier # type:ignore
from sklearn.gaussian_process import GaussianProcessClassifier # type:ignore
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier # type:ignore
from sklearn.svm import SVC # type:ignore
from sklearn.tree import DecisionTreeClassifier # type:ignore
from xgboost import XGBClassifier, XGBRFClassifier # type:ignore
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score # type:ignore
from sklearn.preprocessing import OneHotEncoder, StandardScaler # type:ignore

## Data Processing

In [2]:
# Fixed seed so the train/validation split is identical on every
# Restart Kernel -> Run All; without it each run yields a different split and
# validation metrics are not comparable between runs.
RANDOM_STATE = 42

# Labelled training data and the test set to be scored.
data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")

# Hold out 20% of the raw training rows for validation.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=RANDOM_STATE)

In [3]:
# FEATURE TRANSFORMATION
# Set the row identifiers aside so they can be re-attached to the rebuilt frames.
data_id, test_data_id = data["id"], test_data["id"]

# NUMERICAL FEATURES
numerical_columns = [
    "person_age",
    "person_income",
    "person_emp_length",
    "loan_amnt",
    "loan_int_rate",
    "cb_person_cred_hist_length",
]

# One scaler for all numeric columns: its statistics are learned from `data`
# only and then applied unchanged to `test_data`.
train_numeric_raw = data[numerical_columns]
sc = StandardScaler().fit(train_numeric_raw)
data_num_np = sc.transform(train_numeric_raw)
data_num = pd.DataFrame(
    data_num_np,
    index=train_numeric_raw.index,
    columns=train_numeric_raw.columns,
)

# Re-wrap the scaled test array so it keeps the original row index and column labels.
test_numeric_raw = test_data[numerical_columns]
test_data_num_np = sc.transform(test_numeric_raw)
test_data_num = pd.DataFrame(
    test_data_num_np,
    index=test_numeric_raw.index,
    columns=test_numeric_raw.columns,
)

# # AGE
# sc_age = StandardScaler()
# data["person_age"] = sc_age.fit_transform(data["person_age"])
# test_data["person_age"] = sc_age.transform(test_data["person_age"])

# # INCOME
# sc_income = StandardScaler()
# data["person_income"] = sc_income.fit_transform(data["person_income"])
# test_data["person_income"] = sc_income.transform(test_data["person_income"])

# # EMPLOYMENT LENGTH
# sc_emp_len = StandardScaler()
# data["person_emp_length"] = sc_emp_len.fit_transform(data["person_emp_length"])
# test_data["person_emp_length"] = sc_emp_len.transform(test_data["person_emp_length"])

# # LOAN AMOUNT
# sc_loan_amount = StandardScaler()
# data["loan_amnt"] = sc_loan_amount.fit_transform(data["loan_amnt"])
# test_data["loan_amnt"] = sc_loan_amount.transform(test_data["loan_amnt"])

# # INTEREST RATE
# sc_int_rate = StandardScaler()
# data["loan_int_rate"] = sc_int_rate.fit_transform(data["loan_int_rate"])
# test_data["loan_int_rate"] = sc_int_rate.transform(test_data["loan_int_rate"])

# # CREDIT HISTORY LENGTH
# sc_hist_len = StandardScaler()
# data["cb_person_cred_hist_length"] = sc_hist_len.fit_transform(data["cb_person_cred_hist_length"])
# test_data["cb_person_cred_hist_length"] = sc_hist_len.transform(test_data["cb_person_cred_hist_length"])

# CATEGORICAL DATA
# One-hot encode the categorical columns. The encoder learns its categories
# from `data` only and is then applied unchanged to `test_data`.
categorical_columns = ["person_home_ownership", "loan_intent", "loan_grade", "cb_person_default_on_file"]
one_hot_encoder = OneHotEncoder(sparse_output=False)

one_hot_encoded_data = one_hot_encoder.fit_transform(data[categorical_columns])
# Column labels are only available once the encoder has been fitted.
encoded_feature_names = one_hot_encoder.get_feature_names_out(categorical_columns)

# Carry over the source frame's index (as the numerical frames do) so that
# pd.concat(axis=1) lines rows up by label; a fresh 0..n-1 index would only
# match when the input happens to have a default RangeIndex.
encoded_data = pd.DataFrame(one_hot_encoded_data, index=data.index, columns=encoded_feature_names)

one_hot_encoded_test_data = one_hot_encoder.transform(test_data[categorical_columns])
encoded_test_data = pd.DataFrame(one_hot_encoded_test_data, index=test_data.index, columns=encoded_feature_names)

# NOTE: `data` and `test_data` are rebound here to frames holding only the id,
# the scaled numeric columns and the one-hot columns. The raw categorical
# columns no longer exist afterwards, so re-running this cell requires
# re-running the loading cell first.
data = pd.concat([data_id, data_num, encoded_data], axis=1)
test_data = pd.concat([test_data_id, test_data_num, encoded_test_data], axis=1)

In [4]:
# Show the column labels of the rebuilt `data` frame: expected to be the id,
# the scaled numeric columns, and one indicator column per category level.
data.columns

Index(['id', 'person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'cb_person_cred_hist_length',
       'person_home_ownership_MORTGAGE', 'person_home_ownership_OTHER',
       'person_home_ownership_OWN', 'person_home_ownership_RENT',
       'loan_intent_DEBTCONSOLIDATION', 'loan_intent_EDUCATION',
       'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL',
       'loan_intent_PERSONAL', 'loan_intent_VENTURE', 'loan_grade_A',
       'loan_grade_B', 'loan_grade_C', 'loan_grade_D', 'loan_grade_E',
       'loan_grade_F', 'loan_grade_G', 'cb_person_default_on_file_N',
       'cb_person_default_on_file_Y'],
      dtype='object')