In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## Data Description
Source: https://www.kaggle.com/datasets/yasserh/loan-default-dataset/discussion/522084
<br> **ID**	client loan application id
<br> **year**	year of loan application
<br> **loan_limit**	indicates whether the loan is conforming (cf) or non-conforming (ncf)
<br> **Gender**	gender of the applicant (male, female, joint, sex not available)
<br> **approv_in_adv**	indicates whether the loan was approved in advance (pre, nopre)
<br> **loan_type**	type of loan (type1, type2, type3)
<br> **loan_purpose**	purpose of the loan (p1, p2, p3, p4)
<br> **Credit_Worthiness**	credit worthiness (l1, l2)
<br> **open_credit**	indicates whether the applicant has any open credit accounts (opc, nopc)
<br> **business_or_commercial**	indicates whether the loan is for business/commercial purposes (ob/c - business/commercial, nob/c - personal)
<br> **loan_amount**	amount of money being borrowed
<br> **rate_of_interest**	interest rate charged on the loan
<br> **Interest_rate_spread**	difference between the interest rate on the loan and a benchmark interest rate
<br> **Upfront_charges**	initial charges associated with securing the loan
<br> **term**	duration of the loan in months
<br> **Neg_ammortization**	indicates whether the loan allows for negative amortization (neg_amm, not_neg)
<br> **interest_only**	indicates whether the loan has an interest-only payment option (int_only, not_int)
<br> **lump_sum_payment**	indicates if a lump sum payment is required at the end of the loan term (lpsm, not_lpsm)
<br> **property_value**	value of the property being financed
<br> **construction_type**	type of construction (sb - site built, mh - manufactured home)
<br> **occupancy_type**	occupancy type (pr - primary residence, sr - secondary residence, ir - investment property)
<br> **Secured_by**	specifies the type of collateral securing the loan (home, land)
<br> **total_units**	number of units in the property being financed (1U, 2U, 3U, 4U)
<br> **income**	applicant's annual income
<br> **credit_type**	applicant's type of credit (CIB - credit information bureau, CRIF - CRIF credit information bureau, EXP - Experian, EQUI - Equifax)
<br> **Credit_Score**	applicant's credit score
<br> **co-applicant_credit_type**	co-applicant's type of credit (CIB - credit information bureau, EXP - Experian)
<br> **age**	the age of the applicant.
<br> **submission_of_application**	indicates how the application was submitted (to_inst - to institution, not_inst - not to institution)
<br> **LTV**	loan-to-value ratio, calculated as the loan amount divided by the property value
<br> **Region**	geographic region where the property is located (North, south, central, North-East)
<br> **Security_Type**	type of security or collateral backing the loan (direct, indirect)
<br> **Status**	indicates whether the borrower has defaulted on the loan (1) or not (0)
<br> **dtir1**	debt-to-income ratio



## 1) Import Data

In [None]:
# Location of the Kaggle "Loan_Default.csv" export. Set the LOAN_DEFAULT_CSV
# environment variable to point at a copy elsewhere; when it is unset the
# original author's local path is used, so existing runs behave as before.
DATA_PATH = os.environ.get(
    "LOAN_DEFAULT_CSV",
    r"C:\Users\boeun\Downloads\archive\Loan_Default.csv",
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

In [None]:
# Bar chart of how many rows fall into each value of the target column,
# with the row count written on top of each bar.
ax = sns.countplot(data=df, x='Status')
status_bars = ax.containers[0]
ax.bar_label(status_bars)
ax.set_title("Default Status Observation")
plt.show()

In [None]:
# Print the column names, non-null counts and dtypes of the raw frame.
df.info()

In [None]:
# (rows, columns) of the raw frame, before any de-duplication.
df.shape

In [None]:
# Keep only the first row for every application ID.
df = df.drop_duplicates(subset=['ID'])

In [None]:
# (rows, columns) after de-duplicating on ID; compare with the earlier shape.
df.shape  # Author's observation: there were no duplicate records

## 2) Preprocess Data

In [None]:
# Summary statistics of the frame's columns (pandas defaults).
df.describe()

In [None]:
# Work on an independent copy so the raw frame `df` stays untouched by preprocessing.
df2 = df.copy(deep=True)

In [None]:
# Encode every object-dtype column of df2 as integer labels.
#
# Each column gets its own LabelEncoder, stored in `label_encoders` under the
# column name, so any mapping can later be inspected (`classes_`) or reversed
# (`inverse_transform`). Refitting a single shared encoder would keep only the
# mapping of the last column processed.
#
# NOTE(review): missing values in these columns are handed to LabelEncoder
# unchanged -- confirm how the installed scikit-learn version encodes them.
obj_columns = [col for col in df2.columns if df2[col].dtype == 'object']

label_encoders = {}
le = LabelEncoder()  # keeps `le` bound even when there is nothing to encode
for col in obj_columns:
    le = LabelEncoder()
    df2[col] = le.fit_transform(df2[col])
    label_encoders[col] = le
# After the loop `le` is the encoder fitted on the last object column.

In [None]:
# Print the dtypes again to check the result of encoding the object columns.
df2.info()

## 3) Random Forest

In [None]:
# Separate copy for the modelling step, leaving the encoded frame `df2` as-is.
df3 = df2.copy(deep=True)

In [None]:
# Feature matrix and target.
# 'Status' is the label being predicted. 'ID' is only the loan application
# identifier, so it is dropped as well: a row identifier carries no
# generalisable signal and should not be offered to the model as a predictor.
X = df3.drop(columns=['Status', 'ID'])
y = df3['Status']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split reproducible.
split_options = {'test_size': 0.3, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Random forest with depth-2 trees and a fixed seed, fitted on the training split.
rf_params = {'max_depth': 2, 'random_state': 0}
clf_rf = RandomForestClassifier(**rf_params)
clf_rf.fit(X_train, y_train)

In [None]:
# Predict the held-out rows and report the fraction classified correctly.
pred = clf_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, pred)
print(f'Accuracy Score: {test_accuracy:.2%}')

In [None]:
# Raw confusion matrix: rows are the true labels, columns the predicted labels.
confusion_matrix(y_true=y_test, y_pred=pred)

In [None]:
# Plot the confusion matrix of the fitted forest on the test split,
# using readable names for Status 0 and Status 1.
class_names = ['Did Not Default', 'Defaulted']
disp = ConfusionMatrixDisplay.from_estimator(
    estimator=clf_rf,
    X=X_test,
    y=y_test,
    display_labels=class_names,
    cmap=plt.cm.Blues,
)