# 1. SETTINGS

In [1]:
# libraries
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# garbage collection
import gc
gc.enable()

In [None]:
# pandas options
pd.set_option("display.max_columns", None)

In [None]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# random settings
seed = 42

# 2. PREPARATIONS

In [None]:
# dataset
data = "v1"

In [None]:
# import data
train_features_df = pd.read_csv("/content/train_features (1).csv")
test  = pd.read_csv("/content/test_features.csv")
train_labels_df = pd.read_csv("/content/train_labels (2).csv")

In [None]:
# Merge train features and labels on SK_ID_CURR to ensure alignment
train = pd.merge(train_features_df, train_labels_df, on="SK_ID_CURR", how="inner")

# Sort data
train = train.sort_values("SK_ID_CURR")

In [None]:
# extract target
y = train["TARGET"]
train = train.drop(columns=["TARGET"]) # Remove TARGET from features in train dataframe

In [None]:
# exclude features
excluded_feats = ["SK_ID_CURR"]

# Get common columns between train and test
common_cols = list(set(train.columns) & set(test.columns))

# Clean feature names to remove problematic characters for LightGBM
import re
def clean_feature_names(features_list):
    cleaned_features = []
    for feature in features_list:
        # Replace problematic characters with underscore using regex
        cleaned_feature = re.sub(r'[^A-Za-z0-9_]+', '_', feature)
        # Further clean by stripping leading/trailing underscores and collapsing multiple underscores
        cleaned_feature = cleaned_feature.strip('_')
        cleaned_feature = re.sub(r'__+', '_', cleaned_feature)
        cleaned_features.append(cleaned_feature)
    return cleaned_features

features = [f for f in common_cols if f not in excluded_feats]
features = clean_feature_names(features)

# Apply cleaning to the actual dataframe columns as well
train.columns = clean_feature_names(train.columns)
test.columns = clean_feature_names(test.columns)

# Re-filter features after cleaning, in case some features were renamed
features = [f for f in features if f in train.columns and f in test.columns]

In [None]:
# check dimensions
print(train[features].shape)
print(test[features].shape)

In [None]:
### PARAMETERS

# parallel settings
cores = 10

# learner settings
metric   = "auc"
verbose  = 500
stopping = 300

# CV settings
num_folds = 5
shuffle   = True

# lightGBM
gbm = lgb.LGBMClassifier(n_estimators     = 10000,
                         learning_rate    = 0.005,
                         num_leaves       = 31,
                         colsample_bytree = 0.8,
                         subsample        = 0.9,
                         max_depth        = 5,
                         reg_alpha        = 0.1,
                         reg_lambda       = 0.1,
                         min_split_gain   = 0.2,  # Increased
                         min_child_weight = 5,    # Increased
                         random_state     = seed,
                         num_threads      = cores)

# 3. CROSS-VALIDATION

## 3.1. ALL FEATURES

In [None]:
# Convert object columns to float in both train and test dataframes
for col in features:
    if train[col].dtype == 'object':
        # Convert boolean-like strings to numeric before casting to float
        train[col] = (
            train[col]
            .replace({'True': 1.0, 'False': 0.0, 'F': 0.0, 'T': 1.0})
            .astype(float, errors='ignore')
        )
        train[col] = pd.to_numeric(train[col], errors='coerce').fillna(0)

    if test[col].dtype == 'object':
        # Convert boolean-like strings to numeric before casting to float
        test[col] = (
            test[col]
            .replace({'True': 1.0, 'False': 0.0, 'F': 0.0, 'T': 1.0})
            .astype(float, errors='ignore')
        )
        test[col] = pd.to_numeric(test[col], errors='coerce').fillna(0)


In [None]:
# CV
folds = StratifiedKFold(
    n_splits=num_folds,
    shuffle=True,
    random_state=seed
)

valid_aucs_cv = np.zeros(num_folds)
test_preds_cv = np.zeros(test.shape[0])
feature_importance_df = pd.DataFrame()

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, y)):
    print(f"\n========== Fold {n_fold + 1} ==========")

    # =========================
    # Data partitioning
    # =========================
    trn_x = train[features].iloc[trn_idx]
    trn_y = y.iloc[trn_idx]
    val_x = train[features].iloc[val_idx]
    val_y = y.iloc[val_idx]

    # =========================
    # Model (RE-INIT per fold)
    # =========================
    gbm = lgb.LGBMClassifier(
        n_estimators=10000,
        learning_rate=0.01,          # sedikit dinaikkan
        num_leaves=63,               # diperbesar
        max_depth=-1,                # biarkan tree berkembang
        min_child_weight=1,          # dilonggarkan
        min_split_gain=0.0,          # dilonggarkan
        colsample_bytree=0.8,
        subsample=0.8,
        reg_alpha=0.1,
        reg_lambda=0.1,
        objective="binary",
        metric="auc",
        random_state=seed,
        num_threads=cores,
        force_col_wise=True          # hilangkan overhead
    )

    # =========================
    # Training
    # =========================
    gbm.fit(
        trn_x,
        trn_y,
        eval_set=[(val_x, val_y)],
        eval_metric="auc",
        callbacks=[
            lgb.early_stopping(stopping_rounds=200),
            lgb.log_evaluation(period=500)
        ]
    )

    # =========================
    # Validation AUC
    # =========================
    val_pred = gbm.predict_proba(val_x)[:, 1]
    valid_aucs_cv[n_fold] = roc_auc_score(val_y, val_pred)

    print(f"Fold {n_fold + 1} AUC: {valid_aucs_cv[n_fold]:.5f}")

    # =========================
    # Test prediction (bagging)
    # =========================
    test_preds_cv += gbm.predict_proba(test[features])[:, 1] / num_folds

    # =========================
    # Feature importance
    # =========================
    fold_importance = pd.DataFrame({
        "Feature": features,
        "Importance": gbm.feature_importances_,
        "Fold": n_fold + 1
    })

    feature_importance_df = pd.concat(
        [feature_importance_df, fold_importance],
        axis=0
    )

# =========================
# CV summary
# =========================
print("\n========== CV RESULT ==========")
print(f"Mean AUC  : {valid_aucs_cv.mean():.6f}")
print(f"Std  AUC  : {valid_aucs_cv.std():.6f}")


In [None]:
##### VARIABLE IMPORTANCE

# load importance
top_feats = 50
cols = feature_importance_df[["Feature", "Importance"]].groupby("Feature").mean().sort_values(by = "Importance", ascending = False)[0:top_feats].index
importance = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)]

# plot variable importance
plt.figure(figsize = (10, 10))
sns.barplot(x = "Importance", y = "Feature", data = importance.sort_values(by = "Importance", ascending = False))
plt.title('LightGBM Variable Importance (mean over CV folds)')
plt.tight_layout()

# save plot as pdf
plt.savefig("../var_importance.pdf")

## 3.2. TOP FEATURES

In [None]:
# keep top features
top = 500
cols = feature_importance_df[["Feature", "Importance"]].groupby("Feature").mean().sort_values(by = "Importance", ascending = False)[0:top].index
importance = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)]
features = list(importance.groupby("Feature").Importance.mean().sort_values(ascending = False).index)

In [None]:
# check dimensions
print(train[features].shape)
print(test[features].shape)

In [19]:
# hitung imbalance ratio
pos_weight = (len(y) - y.sum()) / y.sum()

gbm = lgb.LGBMClassifier(
    n_estimators=10000,
    learning_rate=0.01,
    num_leaves=63,
    max_depth=-1,
    min_child_weight=1,
    min_split_gain=0.0,
    subsample=0.9,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    scale_pos_weight=pos_weight,
    random_state=seed,
    num_threads=cores
)

# ================================
# STRATIFIED K-FOLD
# ================================

folds = StratifiedKFold(
    n_splits=num_folds,
    shuffle=True,
    random_state=seed
)

# ================================
# STORAGE
# ================================

valid_aucs_cv = np.zeros(num_folds)

# ================================
# CROSS-VALIDATION LOOP (TIDAK DIUBAH)
# ================================

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, y)):

    print(f"\n========== Fold {n_fold+1} ==========")

    # data partitioning
    trn_x, trn_y = train[features].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = train[features].iloc[val_idx], y.iloc[val_idx]

    # train lightGBM (OPERASI ASLI DIPERTAHANKAN)
    gbm = gbm.fit(
        trn_x,
        trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        eval_metric=metric,
        callbacks=[
            lgb.early_stopping(stopping, verbose=True),
            lgb.log_evaluation(period=verbose)
        ]
    )

    # validation AUC
    val_pred = gbm.predict_proba(val_x)[:, 1]
    valid_aucs_cv[n_fold] = roc_auc_score(val_y, val_pred)

    print(f"Fold {n_fold+1} AUC: {valid_aucs_cv[n_fold]:.5f}")

# ================================
# CV RESULT
# ================================

print("\n========== CV RESULT ==========")
print(f"Mean AUC : {valid_aucs_cv.mean():.6f}")
print(f"Std  AUC : {valid_aucs_cv.std():.6f}")


[LightGBM] [Info] Number of positive: 789, number of negative: 9425
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.436518 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 94433
[LightGBM] [Info] Number of data points in the train set: 10214, number of used features: 500
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.077247 -> initscore=-2.480355
[LightGBM] [Info] Start training from score -2.480355
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[14]	training's auc: 0.9696	training's binary_logloss: 0.239778	valid_1's auc: 0.719107	valid_1's binary_logloss: 0.263888
Fold 1 AUC: 0.71911

[LightGBM] [Info] Number of positive: 789, number of negative: 9425
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.388627 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 94440


# 4. SUBMISSION

In [20]:
# create submission
test["TARGET"] = test_preds_cv
subm = test[["SK_ID_CURR", "TARGET"]]

In [23]:
# hitung mean AUC sebagai pengganti 'auc'
auc = valid_aucs_cv.mean()

# simpan submission dengan nama sederhana
filename = "submission_lgb.csv"

subm.to_csv(filename, index=False, float_format="%.8f")
print(f"File saved as {filename} in current working directory")



File saved as submission_lgb.csv in current working directory


In [None]:
# no card, old features (560):            0.786941 | 0.783
# no card, new features (694):            0.788893 | 0.783
# with card, new features (1072):         0.790123 | 0.787
# with card and kernel features (1109):   0.790053 |
# card, kernel, factorize, no na (978):   0.790803 |
# card, kern, fac, nona, adummy (1193):   0.791321 |
# full data, one-hot ecoding (1844):      0.791850 |
# full data, one-hot, extra sums (2486):  0.791880 | 0.789
# full, one-hot, sums, buroscore (2501):  0.791761 |
# full, one-hot, clean, buroscore (1826): 0.791867 |
# last data + ext, age ratios (1828):     0.791808 |
# new app feats, remove weighted (1830):  0.794241 | 0.795
# previous data - top1000 LGB features:   0.794384 |
# select top1500 LGB features:            0.794384 |