In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.utils import resample
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.combine import SMOTETomek
import warnings
warnings.filterwarnings('ignore')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd /content/drive/My Drive/Colab Notebooks/PBI - Home Credit Indonesia/home-credit-default-risk

/content/drive/My Drive/Colab Notebooks/PBI - Home Credit Indonesia/home-credit-default-risk


In [4]:
# Import Data
application_train = pd.read_csv('application_train.csv')
application_test = pd.read_csv('application_test.csv')
bureau = pd.read_csv('bureau.csv')
bureau_balance = pd.read_csv('bureau_balance.csv')
prev = pd.read_csv('previous_application.csv')
posh = pd.read_csv('POS_CASH_balance.csv')
ins = pd.read_csv('installments_payments.csv')
cc = pd.read_csv('credit_card_balance.csv')

In [5]:
application_train.nunique().sum()

np.int64(695662)

In [6]:
bureau.nunique().sum()

np.int64(2671291)

In [7]:
bureau_balance.head(5)

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [8]:
prev.head(5)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


In [9]:
posh.head(5)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [10]:
def status_to_score(status):
    if status == 'C':
        return 0  # sudah lunas
    elif status == 'X':
        return 0  # belum jatuh tempo / tidak ada info
    else:
        return int(status)  # keterlambatan dalam bulan

bureau_balance['STATUS_SCORE'] = bureau_balance['STATUS'].apply(status_to_score)

In [11]:
bureau_balance_agg = bureau_balance.groupby('SK_ID_BUREAU').agg({
    'STATUS_SCORE': ['mean', 'max', 'sum', 'count'],
    'MONTHS_BALANCE': ['count']
})
bureau_balance_agg.columns = ['STATUS_MEAN', 'STATUS_MAX', 'STATUS_SUM', 'STATUS_COUNT', 'MONTH_COUNT']
bureau_balance_agg = bureau_balance_agg.reset_index()

In [12]:
# Merge Data ke Application
bureau = bureau.merge(bureau_balance_agg, on='SK_ID_BUREAU', how='left')

In [13]:
# Pisahkan kredit aktif dan tidak aktif
active = bureau[bureau['CREDIT_ACTIVE'] == 'Active']
closed = bureau[bureau['CREDIT_ACTIVE'] == 'Closed']

# Agregasi data umum
bureau_agg = bureau.groupby('SK_ID_CURR').agg({
    'AMT_CREDIT_SUM': ['sum', 'mean'],
    'AMT_CREDIT_SUM_DEBT': ['sum', 'mean'],
    'AMT_CREDIT_SUM_OVERDUE': ['sum'],
    'CREDIT_DAY_OVERDUE': ['max', 'mean'],
    'DAYS_CREDIT': ['mean'],
    'CNT_CREDIT_PROLONG': ['sum']
})
bureau_agg.columns = ['_'.join(col).strip() for col in bureau_agg.columns.values]
bureau_agg.reset_index(inplace=True)

# Rasio debt to credit
bureau_agg['RATIO_DEBT_CREDIT'] = bureau_agg['AMT_CREDIT_SUM_DEBT_sum'] / bureau_agg['AMT_CREDIT_SUM_sum']
bureau_agg['RATIO_DEBT_CREDIT'] = bureau_agg['RATIO_DEBT_CREDIT'].fillna(0)

# Tambahan: jumlah pinjaman aktif & tidak aktif
bureau_agg['NUM_ACTIVE_LOANS'] = active.groupby('SK_ID_CURR').size()
bureau_agg['NUM_CLOSED_LOANS'] = closed.groupby('SK_ID_CURR').size()

# Isi NaN dengan 0 (untuk pelanggan tanpa pinjaman aktif/closed)
bureau_agg['NUM_ACTIVE_LOANS'] = bureau_agg['NUM_ACTIVE_LOANS'].fillna(0)
bureau_agg['NUM_CLOSED_LOANS'] = bureau_agg['NUM_CLOSED_LOANS'].fillna(0)

# Gabung ke application_train
application_train = application_train.merge(bureau_agg, on='SK_ID_CURR', how='left')
application_train.fillna(0, inplace=True)

# Gabung ke application_test (kalau kamu punya juga)
application_test = application_test.merge(bureau_agg, on='SK_ID_CURR', how='left')
application_test.fillna(0, inplace=True)

In [14]:
# Buat fitur agregat
prev_agg = prev.groupby('SK_ID_CURR').agg({
    'AMT_APPLICATION': ['mean', 'max'],
    'AMT_CREDIT': ['mean', 'max'],
    'AMT_DOWN_PAYMENT': 'mean',
    'RATE_DOWN_PAYMENT': 'mean',
    'DAYS_DECISION': 'mean',
    'NAME_CONTRACT_STATUS': lambda x: (x == 'Approved').sum(),  # jumlah aplikasi disetujui
    'SK_ID_PREV': 'count'  # total aplikasi sebelumnya
}).reset_index()

# Rename kolom multi-level agar lebih rapi
prev_agg.columns = ['_'.join(col).strip() if col[1] else col[0] for col in prev_agg.columns.values]

# Gabungkan ke application_train
application_train = application_train.merge(prev_agg, left_on='SK_ID_CURR', right_on='SK_ID_CURR', how='left')
application_test = application_test.merge(prev_agg, left_on='SK_ID_CURR', right_on='SK_ID_CURR', how='left')

In [15]:
# Hitung keterlambatan & rasio bayar
ins['LATE_PAYMENT'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
ins['PAYMENT_RATIO'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']

ins_agg = ins.groupby('SK_ID_CURR').agg({
    'LATE_PAYMENT': ['mean', 'max'],
    'PAYMENT_RATIO': ['mean', 'min'],
    'SK_ID_PREV': 'nunique'  # banyaknya pinjaman yang punya cicilan
}).reset_index()

ins_agg.columns = ['_'.join(col).strip() if col[1] else col[0] for col in ins_agg.columns.values]
application_train = application_train.merge(ins_agg, left_on='SK_ID_CURR', right_on='SK_ID_CURR', how='left')
application_test = application_test.merge(ins_agg, left_on='SK_ID_CURR', right_on='SK_ID_CURR', how='left')

In [16]:
cc_agg = cc.groupby('SK_ID_CURR').agg({
    'AMT_BALANCE': 'mean',
    'AMT_TOTAL_RECEIVABLE': 'mean',
    'AMT_DRAWINGS_ATM_CURRENT': 'mean',
    'SK_DPD': ['mean', 'max'],  # days past due
    'SK_DPD_DEF': ['mean', 'max'],  # worse delinquency
}).reset_index()

cc_agg.columns = ['_'.join(col).strip() if col[1] else col[0] for col in cc_agg.columns.values]
application_train = application_train.merge(cc_agg, left_on='SK_ID_CURR', right_on='SK_ID_CURR', how='left')
application_train.drop(columns=['SK_ID_CURR'], inplace=True)
application_test = application_test.merge(cc_agg, left_on='SK_ID_CURR', right_on='SK_ID_CURR', how='left')
application_test.drop(columns=['SK_ID_CURR'], inplace=True)

In [17]:
# Check Missing Values
missing = application_train.isnull().mean().sort_values(ascending=False)
print(missing[missing > 0])

AMT_DRAWINGS_ATM_CURRENT_mean    0.801178
AMT_BALANCE_mean                 0.717392
SK_DPD_mean                      0.717392
SK_DPD_DEF_max                   0.717392
SK_DPD_max                       0.717392
SK_DPD_DEF_mean                  0.717392
AMT_TOTAL_RECEIVABLE_mean        0.717392
AMT_DOWN_PAYMENT_mean            0.110259
RATE_DOWN_PAYMENT_mean           0.110259
AMT_CREDIT_max                   0.053507
AMT_APPLICATION_max              0.053507
SK_ID_PREV_count                 0.053507
DAYS_DECISION_mean               0.053507
NAME_CONTRACT_STATUS_<lambda>    0.053507
AMT_CREDIT_mean                  0.053507
AMT_APPLICATION_mean             0.053507
PAYMENT_RATIO_mean               0.051627
PAYMENT_RATIO_min                0.051627
LATE_PAYMENT_mean                0.051627
LATE_PAYMENT_max                 0.051627
SK_ID_PREV_nunique               0.051601
dtype: float64


In [18]:
# Select Numeric and Categorical Columns with NaN < 50%
threshold = 0.5
selected_cols = application_train.columns[application_train.isnull().mean() < threshold]

num_cols = application_train[selected_cols].select_dtypes(include=np.number).columns
cat_cols = application_train[selected_cols].select_dtypes(include='object').columns

In [19]:
# Identifikasi kolom numerik dan kategorikal
num_cols = application_train.select_dtypes(include=np.number).columns.tolist()
cat_cols = application_train.select_dtypes(include='object').columns.tolist()

# Ganti nilai inf/-inf dengan NaN agar bisa diimputasi
application_train[num_cols] = application_train[num_cols].replace([np.inf, -np.inf], np.nan)

In [20]:
# Cek ulang jumlah NaN
print(application_train[num_cols].isna().sum().sort_values(ascending=False))

AMT_DRAWINGS_ATM_CURRENT_mean    246371
AMT_BALANCE_mean                 220606
SK_DPD_mean                      220606
SK_DPD_DEF_max                   220606
SK_DPD_max                       220606
                                  ...  
AMT_CREDIT_SUM_OVERDUE_sum            0
CREDIT_DAY_OVERDUE_max                0
CREDIT_DAY_OVERDUE_mean               0
DAYS_CREDIT_mean                      0
CNT_CREDIT_PROLONG_sum                0
Length: 138, dtype: int64


In [21]:
# Imputasi data
from sklearn.impute import SimpleImputer

num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

application_train[num_cols] = num_imputer.fit_transform(application_train[num_cols])
application_train[cat_cols] = cat_imputer.fit_transform(application_train[cat_cols])

In [22]:
# Encoding Categorical Features
low_cardinality = [col for col in cat_cols if application_train[col].nunique() <= 2]
medium_cardinality = [col for col in cat_cols if 3 <= application_train[col].nunique() <= 10]
high_cardinality = [col for col in cat_cols if application_train[col].nunique() > 10]

label_enc = LabelEncoder()
for col in low_cardinality:
    application_train[col] = label_enc.fit_transform(application_train[col])

application_train = pd.get_dummies(application_train, columns=medium_cardinality)
application_train.drop(columns=high_cardinality, inplace=True)

In [23]:
# Pisahkan fitur dan target dari application_train
X = application_train.drop(['TARGET'], axis=1)
y = application_train['TARGET']

In [24]:
# Gunakan hanya train dan validation dari application_train
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [25]:
# Siapkan data test dari file terpisah (application_test)
X_real_test = application_test

print(f"Data Training: {X_train_split.shape}, Data Validation: {X_val.shape}, Data Test: {X_real_test.shape}")

Data Training: (246008, 203), Data Validation: (61503, 203), Data Test: (48744, 153)


In [26]:
# Model Setup
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'SVM': SVR(),
    'KNN': KNeighborsRegressor()
}

In [27]:
# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X_train_split.select_dtypes(include=np.number).columns)
    ],
    remainder='passthrough'
)

In [28]:
# GridSearchCV untuk model GradientBoosting
param_grid = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.01, 0.1],
    'model__max_depth': [3, 5]
}

In [29]:
# Pipeline
tuned_pipeline = ImbPipeline(steps=[
    ('preprocessing', preprocessor),
    ('resample', SMOTETomek(random_state=42)),
    ('model', GradientBoostingRegressor())
])

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np

# Logistic Regression pipeline
pipe_logreg = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=1000))
])

# Gradient Boosting pipeline
pipe_gb = Pipeline([
    ('scaler', StandardScaler()),
    ('model', GradientBoostingRegressor())
])

# Cross-validation
scores_logreg = cross_val_score(pipe_logreg, X_train_split, y_train_split,
                                scoring='neg_mean_squared_error', cv=3)
scores_gb = cross_val_score(pipe_gb, X_train_split, y_train_split,
                            scoring='neg_mean_squared_error', cv=3)

# Cetak RMSE
print("Logistic Regression CV RMSE:", np.sqrt(-scores_logreg.mean()))
print("Gradient Boosting CV RMSE:", np.sqrt(-scores_gb.mean()))

Logistic Regression CV RMSE: 0.28404278371945946
Gradient Boosting CV RMSE: 0.26136921714321715


In [31]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Pipeline Logistic Regression
logreg_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=1000))
])

logreg_param_grid = {
    'model__C': [0.1, 1, 10],
    'model__penalty': ['l2'],
    'model__solver': ['lbfgs']
}

logreg_grid = GridSearchCV(logreg_pipeline, logreg_param_grid, cv=5, scoring='neg_mean_squared_error')
logreg_grid.fit(X_train_split, y_train_split)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

gboost_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor())
])

gboost_param_grid = {
    'model__n_estimators': [100],
    'model__learning_rate': [0.1],
    'model__max_depth': [3]
}

gboost_random = RandomizedSearchCV(
    gboost_pipeline,
    gboost_param_grid,
    n_iter=2,  # cuma coba 2 kombinasi
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=42
)
gboost_random.fit(X_train_split, y_train_split)

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Logistic Regression
logreg_best = logreg_grid.best_estimator_
logreg_preds = logreg_best.predict(X_test_split)
print("LogReg RMSE:", np.sqrt(mean_squared_error(y_test_split, logreg_preds)))
print("Best Params (LogReg):", logreg_grid.best_params_)

In [None]:
# Gradient Boosting
gboost_best = gboost_random.best_estimator_
gboost_preds = gboost_best.predict(X_test_split)
print("GBoost RMSE:", np.sqrt(mean_squared_error(y_test_split, gboost_preds)))
print("Best Params (GBoost):", gboost_random.best_params_)


In [None]:
# Model terbaik
best_model = gboost_best

In [None]:
# Prediksi dan evaluasi di validation set
y_val_pred = best_model.predict(X_val)
mae_val = mean_absolute_error(y_val, y_val_pred)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
r2_val = r2_score(y_val, y_val_pred)

print(f"Validation MAE: {mae_val:.4f}")
print(f"Validation RMSE: {rmse_val:.4f}")
print(f"Validation R²: {r2_val:.4f}")

In [None]:
# Prediksi di test set eksternal (tanpa target)
y_real_test_pred = best_model.predict(X_real_test)

In [None]:
# Tidak bisa evaluasi di test karena tidak ada TARGET, jadi hanya simpan hasil prediksi
application_test['TARGET_PREDICTED'] = y_real_test_pred

In [None]:
# Evaluasi di training (untuk perbandingan)
y_train_pred = best_model.predict(X_train_split)
mae_train = mean_absolute_error(y_train_split, y_train_pred)
rmse_train = np.sqrt(mean_squared_error(y_train_split, y_train_pred))
r2_train = r2_score(y_train_split, y_train_pred)

In [None]:
# Visualisasi evaluasi (Train dan Validation saja, karena test tidak punya TARGET)
eval_results = {
    'Train': {'MAE': mae_train, 'RMSE': rmse_train, 'R²': r2_train},
    'Validation': {'MAE': mae_val, 'RMSE': rmse_val, 'R²': r2_val}
}

eval_df = pd.DataFrame(eval_results).T
eval_df.plot(kind='barh', figsize=(10, 6))
plt.title('Perbandingan Evaluasi Model pada Train dan Validation')
plt.xlabel('Nilai')
plt.tight_layout()
plt.show()

In [None]:
# Simpan hasil prediksi test jika dibutuhkan
application_test[['SK_ID_CURR', 'TARGET_PREDICTED']].to_csv('submission.csv', index=False)