# Python Modelling

## EDA and Drop Fitur Leakage

In [1]:
# Import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import warnings

warnings.filterwarnings("ignore")

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score

### Load dataset

In [27]:
df = pd.read_csv("../data/credit_scoring.csv")
df.head()

Unnamed: 0,application_id,age,monthly_income,loan_amount,previous_defaults,credit_score,default,leak_col_good,leak_col_subtle
0,501000,41,13995609,5982664,0,624,0,0,-0.04
1,501001,58,13683833,3711198,0,809,0,0,0.001
2,501002,33,9417391,7172332,0,647,0,0,0.077
3,501003,45,6861811,8661056,0,450,0,0,0.038
4,501004,22,5640742,4520669,1,816,0,0,0.02


### EDA

In [28]:
# Dataset info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   application_id     6000 non-null   int64  
 1   age                6000 non-null   int64  
 2   monthly_income     6000 non-null   int64  
 3   loan_amount        6000 non-null   int64  
 4   previous_defaults  6000 non-null   int64  
 5   credit_score       6000 non-null   int64  
 6   default            6000 non-null   int64  
 7   leak_col_good      6000 non-null   int64  
 8   leak_col_subtle    6000 non-null   float64
dtypes: float64(1), int64(8)
memory usage: 422.0 KB


In [29]:
# Statistik data
df.describe()

Unnamed: 0,application_id,age,monthly_income,loan_amount,previous_defaults,credit_score,default,leak_col_good,leak_col_subtle
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,503999.5,40.3215,11403860.0,8001122.0,0.243667,575.979833,0.012167,0.012167,0.011945
std,1732.195139,11.300697,4912598.0,4071786.0,0.484767,158.328931,0.109639,0.109639,0.113947
min,501000.0,21.0,3000310.0,1000794.0,0.0,300.0,0.0,0.0,-0.119
25%,502499.75,31.0,7163379.0,4440939.0,0.0,438.75,0.0,0.0,-0.02
50%,503999.5,41.0,11299450.0,7982304.0,0.0,577.5,0.0,0.0,0.0
75%,505499.25,50.0,15737050.0,11531440.0,0.0,713.0,0.0,0.0,0.021
max,506999.0,59.0,19986220.0,14998580.0,3.0,849.0,1.0,1.0,1.092


In [30]:
# Dataset shape
df.shape

(6000, 9)

In [31]:
# Cek missing values
df.isna().sum()

application_id       0
age                  0
monthly_income       0
loan_amount          0
previous_defaults    0
credit_score         0
default              0
leak_col_good        0
leak_col_subtle      0
dtype: int64

In [32]:
# Cek data duplikat
print("Jumlah data duplikat : ", df.duplicated().sum())

Jumlah data duplikat :  0


In [33]:
# Drop data leakage
leakage_cols = ["leak_col_good", "leak_col_subtle"]
df_new = df.drop(columns=leakage_cols)

In [34]:
df_new.head()

Unnamed: 0,application_id,age,monthly_income,loan_amount,previous_defaults,credit_score,default
0,501000,41,13995609,5982664,0,624,0
1,501001,58,13683833,3711198,0,809,0
2,501002,33,9417391,7172332,0,647,0
3,501003,45,6861811,8661056,0,450,0
4,501004,22,5640742,4520669,1,816,0


## Baseline Logistic Reg -> GradientBoosting 

In [37]:
# Split dataset
X = df_new.drop(columns=["application_id", "default"])
y = df_new["default"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,  # 20% data untuk test
    stratify=y,  
    random_state=42
)

In [38]:
print("Jumlah data pada data training:", X_train.shape[0])
print("Jumlah data pada data testing:", X_test.shape[0])

Jumlah data pada data training: 4800
Jumlah data pada data testing: 1200


In [40]:
# Data train
X_train.head()

Unnamed: 0,age,monthly_income,loan_amount,previous_defaults,credit_score
382,49,15929405,7257297,0,537
2603,50,13636274,6963137,0,610
65,48,3094579,1847424,0,356
3332,22,6156666,9324793,0,406
833,47,9886549,9609103,0,347


In [41]:
# Scaling dataset menggunakan standar scaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Logistic Regression


In [44]:
# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)
lr_pred_proba = lr.predict_proba(X_test_scaled)[:, 1]
lr_pred_label = lr.predict(X_test_scaled)

# Evaluasi
auc = roc_auc_score(y_test, lr_pred_proba)
accuracy = accuracy_score(y_test, lr_pred_label)

print("Logistic Regression AUC:", auc)
print("Logistic Regression Accuracy:", accuracy)

Logistic Regression AUC: 0.7552180028129395
Logistic Regression Accuracy: 0.9875


### Gradient Boosting

In [45]:
# Gradient Boosting
gb = GradientBoostingClassifier()
gb.fit(X_train_scaled, y_train)
gb_pred_proba = gb.predict_proba(X_test_scaled)[:, 1]
gb_pred_label = gb.predict(X_test_scaled)

# Evaluasi
auc = roc_auc_score(y_test, gb_pred_proba)
accuracy = accuracy_score(y_test, gb_pred_label)

print("Logistic Regression AUC:", auc)
print("Logistic Regression Accuracy:", accuracy)

Logistic Regression AUC: 0.6863009845288326
Logistic Regression Accuracy: 0.9858333333333333
