In [10]:

import pandas as pd
import numpy as np
import warnings
import joblib
import shap
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, confusion_matrix, auc

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

warnings.filterwarnings("ignore")

In [11]:
def create_features(data):
    """Return a copy of ``data`` with the engineered columns added.

    The input frame is not modified. Columns added:

    * ``pernah_dihubungi_sebelumnya`` -- 0 when ``hari_sejak_kontak_sebelumnya``
      equals the sentinel 999, otherwise 1.
    * ``rasio_sukses_sebelumnya`` -- previous-campaign outcome (1 for
      ``'success'``, 0 for ``'failure'``/``'nonexistent'``) multiplied by
      ``jumlah_kontak_sebelumnya + 1``. Outcomes outside the mapping yield NaN.
    * ``risk_score`` -- weighted sum (0.4 / 0.3 / 0.3) of
      ``indeks_harga_konsumen``, ``suku_bunga_euribor_3bln`` and
      ``tingkat_variasi_pekerjaan``.
    * ``usia_group`` -- categorical age bucket: ``muda`` (<= 30),
      ``paruhbaya`` (31-45), ``senior`` (46-60), ``lansia`` (> 60).
    * ``beban_pinjaman`` -- number of loans held (housing + personal), where
      ``'unknown'`` counts as 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the raw columns referenced above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus the five engineered ones.
    """
    data = data.copy()

    # 999 is treated as the "never contacted before" sentinel.
    data['pernah_dihubungi_sebelumnya'] = np.where(
        data['hari_sejak_kontak_sebelumnya'] == 999, 0, 1)

    outcome_flag = data['hasil_kampanye_sebelumnya'].map(
        {'success': 1, 'failure': 0, 'nonexistent': 0})
    data['rasio_sukses_sebelumnya'] = outcome_flag * (data['jumlah_kontak_sebelumnya'] + 1)

    data['risk_score'] = (
        0.4 * data['indeks_harga_konsumen'] +
        0.3 * data['suku_bunga_euribor_3bln'] +
        0.3 * data['tingkat_variasi_pekerjaan']
    )

    # pd.cut intervals are right-closed, so the former outer edges 18 and 100
    # silently turned ages <= 18 and > 100 into NaN (later imputed with the
    # modal bucket). Open-ended outer bins keep every non-missing age in the
    # youngest/oldest bucket while leaving the inner cut points unchanged.
    data['usia_group'] = pd.cut(
        data['usia'],
        bins=[-np.inf, 30, 45, 60, np.inf],
        labels=['muda', 'paruhbaya', 'senior', 'lansia']
    )

    loan_flag = {'yes': 1, 'no': 0, 'unknown': 0}
    data['beban_pinjaman'] = (
        data['pinjaman_rumah'].map(loan_flag) +
        data['pinjaman_pribadi'].map(loan_flag)
    )

    return data


# Numeric columns that are standardised before modelling.
NUM_FEATURES = [
    # read directly from the input frame
    'usia',
    'jumlah_kontak_kampanye_ini',
    'indeks_harga_konsumen',
    'suku_bunga_euribor_3bln',
    # added by create_features
    'rasio_sukses_sebelumnya',
    'risk_score',
]

# Categorical columns that are one-hot encoded before modelling.
CAT_FEATURES = [
    # read directly from the input frame
    'pekerjaan',
    'status_perkawinan',
    'pendidikan',
    'jenis_kontak',
    'pulau',
    # added by create_features
    'usia_group',
    'pernah_dihubungi_sebelumnya',
]


In [12]:
def build_preprocessor():
    """Build an unfitted ColumnTransformer for the model inputs.

    NUM_FEATURES are passed through a StandardScaler and CAT_FEATURES through
    a OneHotEncoder capped at 15 categories per column, with unseen values
    routed to the infrequent bucket when one exists.
    """
    scaler_step = ('num', StandardScaler(), NUM_FEATURES)
    one_hot = OneHotEncoder(max_categories=15, handle_unknown='infrequent_if_exist')
    encoder_step = ('cat', one_hot, CAT_FEATURES)
    return ColumnTransformer([scaler_step, encoder_step])

In [13]:
def get_base_models():
    """Return the ``(name, estimator)`` pairs used as stacking base learners.

    All three gradient-boosting models share the same budget (300 trees,
    depth 7, learning rate 0.05, 80% row subsampling) and each applies its own
    class-imbalance handling (``scale_pos_weight``, ``is_unbalance``,
    ``auto_class_weights``).

    Returns
    -------
    list of (str, estimator)
        Unfitted estimators keyed ``'xgb'``, ``'lgbm'`` and ``'catboost'``;
        the names are what hyper-parameter grids address
        (e.g. ``stack__xgb__max_depth``).
    """
    # NOTE(review): these class weights are applied on top of the SMOTE step in
    # build_pipeline, so the minority class is compensated for twice -- confirm
    # that this is intended.
    return [
        ('xgb', XGBClassifier(n_estimators=300, max_depth=7, learning_rate=0.05,
                              subsample=0.8, scale_pos_weight=8, tree_method='hist', random_state=42)),
        # LightGBM only performs row subsampling when the bagging frequency is
        # non-zero; with the default subsample_freq=0 the subsample=0.8 setting
        # was silently ignored.
        ('lgbm', LGBMClassifier(n_estimators=300, max_depth=7, learning_rate=0.05,
                                subsample=0.8, subsample_freq=1,
                                is_unbalance=True, random_state=42)),
        ('catboost', CatBoostClassifier(iterations=300, depth=7, learning_rate=0.05,
                                        subsample=0.8, auto_class_weights='Balanced', silent=True))
    ]


def build_pipeline(base_models):
    """Assemble the resampling + stacking pipeline.

    Parameters
    ----------
    base_models : list of (str, estimator)
        Base learners handed to the StackingClassifier.

    Returns
    -------
    imblearn.pipeline.Pipeline
        Unfitted pipeline with steps ``'smote'`` and ``'stack'``.
    """
    oversampler = SMOTE(random_state=42, sampling_strategy=0.5)

    # passthrough=True feeds the original features to the meta-learner
    # alongside the base-model predictions.
    stacker = StackingClassifier(
        estimators=base_models,
        final_estimator=LogisticRegression(max_iter=1000),
        passthrough=True,
        n_jobs=-1,
    )

    return ImbPipeline([('smote', oversampler), ('stack', stacker)])


In [14]:
class AdvancedBankDepositPredictor:
    """Train, apply, evaluate and persist the stacked deposit-subscription model.

    Typical use: ``train(frame)`` on a frame containing the
    ``berlangganan_deposito`` target, then ``predict(frame)`` to obtain
    positive-class probabilities, ``evaluate(...)`` to plot diagnostics and
    ``save`` / ``load`` to persist the fitted state.
    """

    def __init__(self):
        self.model = None            # fitted pipeline, set by train() or load()
        self.preprocessor = None     # fitted ColumnTransformer
        self.feature_columns = []    # columns handed to the preprocessor
        self.fill_values = None      # per-column modes learned from the training frame

    @staticmethod
    def _column_modes(df):
        """Return the first mode of every column, or None for an empty frame."""
        modes = df.mode()
        return None if modes.empty else modes.iloc[0]

    def preprocess_data(self, df, fit=False):
        """Engineer features, impute missing values and apply the preprocessor.

        Parameters
        ----------
        df : pandas.DataFrame
            Raw input frame.
        fit : bool, default False
            When True, learn the imputation values and fit a fresh
            preprocessor on ``df``; otherwise reuse the fitted state.

        Returns
        -------
        Transformed feature matrix as produced by the ColumnTransformer.

        Raises
        ------
        RuntimeError
            If ``fit`` is False and no preprocessor has been fitted or loaded.
        """
        if not fit and self.preprocessor is None:
            raise RuntimeError(
                "Preprocessor is not fitted; call train() or load() first.")

        df = create_features(df)

        if fit:
            self.fill_values = self._column_modes(df)
            fill_values = self.fill_values
        else:
            # Impute with the modes learned at training time. The previous
            # forward-fill made predictions depend on row order and left
            # leading NaNs unfilled. Artifacts saved before fill_values was
            # stored fall back to the modes of the frame being scored.
            fill_values = self.fill_values
            if fill_values is None:
                fill_values = self._column_modes(df)

        if fill_values is not None:
            df = df.fillna(fill_values)

        if fit:
            self.feature_columns = [col for col in df.columns if col != 'berlangganan_deposito']
            self.preprocessor = build_preprocessor()
            return self.preprocessor.fit_transform(df[self.feature_columns])
        return self.preprocessor.transform(df[self.feature_columns])

    def train(self, data, tune=False):
        """Fit the pipeline on ``data``, optionally with a randomized search.

        Parameters
        ----------
        data : pandas.DataFrame
            Training frame including the ``berlangganan_deposito`` column.
        tune : bool, default False
            When True, run a 20-iteration RandomizedSearchCV (5-fold
            stratified, ROC AUC) and keep the best estimator.
        """
        X = self.preprocess_data(data, fit=True)
        y = data['berlangganan_deposito']
        pipeline = build_pipeline(get_base_models())

        if not tune:
            pipeline.fit(X, y)
            self.model = pipeline
            return

        param_dist = {
            'smote__sampling_strategy': [0.4, 0.5, 0.6],
            'stack__xgb__max_depth': [5, 7, 9],
            'stack__xgb__learning_rate': [0.01, 0.05, 0.1],
            'stack__final_estimator__C': [0.1, 1, 10]
        }

        # random_state pins which 20 candidates are sampled so that a
        # restart-and-run-all reproduces the same search.
        search = RandomizedSearchCV(
            pipeline, param_distributions=param_dist,
            n_iter=20, cv=StratifiedKFold(5), scoring='roc_auc',
            verbose=1, n_jobs=-1, random_state=42
        )
        search.fit(X, y)
        self.model = search.best_estimator_
        print(f"Best AUC (CV): {search.best_score_:.4f}")

    def predict(self, df):
        """Return the positive-class probability for every row of ``df``.

        Raises
        ------
        RuntimeError
            If no model has been trained or loaded.
        """
        if self.model is None:
            raise RuntimeError(
                "Model is not fitted; call train() or load() first.")
        X = self.preprocess_data(df)
        return self.model.predict_proba(X)[:, 1]

    def evaluate(self, y_true, y_scores, threshold=0.5):
        """Print the ROC AUC and plot ROC, precision-recall and confusion matrix.

        Parameters
        ----------
        y_true : array-like
            Ground-truth labels.
        y_scores : array-like
            Predicted positive-class scores.
        threshold : float, default 0.5
            Scores strictly above this value count as positive in the
            confusion matrix.
        """
        y_scores = np.asarray(y_scores)  # allow plain lists as well as arrays

        auc_score = roc_auc_score(y_true, y_scores)
        fpr, tpr, _ = roc_curve(y_true, y_scores)
        precision, recall, _ = precision_recall_curve(y_true, y_scores)
        preds = (y_scores > threshold).astype(int)
        cm = confusion_matrix(y_true, preds)

        print(f"AUC: {auc_score:.4f}")

        fig, (ax_roc, ax_pr, ax_cm) = plt.subplots(1, 3, figsize=(15, 4))

        ax_roc.plot(fpr, tpr, label=f'AUC = {auc_score:.4f}')
        ax_roc.plot([0, 1], [0, 1], 'k--')  # chance diagonal
        ax_roc.set_xlabel('False Positive Rate')
        ax_roc.set_ylabel('True Positive Rate')
        ax_roc.set_title('ROC Curve')
        ax_roc.legend()

        ax_pr.plot(recall, precision)
        ax_pr.set_xlabel('Recall')
        ax_pr.set_ylabel('Precision')
        ax_pr.set_title('Precision-Recall Curve')

        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
        ax_cm.set_xlabel('Predicted')
        ax_cm.set_ylabel('Actual')
        ax_cm.set_title('Confusion Matrix')

        fig.tight_layout()
        plt.show()

    def save(self, path):
        """Persist the fitted model, preprocessor and imputation state to ``path``."""
        joblib.dump({
            'model': self.model,
            'preprocessor': self.preprocessor,
            'feature_columns': self.feature_columns,
            'fill_values': self.fill_values
        }, path)

    def load(self, path):
        """Restore state written by ``save``.

        joblib files are pickles: loading one executes arbitrary code, so only
        load artifacts from a trusted source.
        """
        obj = joblib.load(path)
        self.model = obj['model']
        self.preprocessor = obj['preprocessor']
        self.feature_columns = obj['feature_columns']
        # Older artifacts were written without the imputation values.
        self.fill_values = obj.get('fill_values')


In [None]:
  # Load data
train_data = pd.read_csv('training_dataset.csv')
val_data = pd.read_csv('validation_set.csv')

# Fit the stacked model with hyper-parameter search enabled.
predictor = AdvancedBankDepositPredictor()
predictor.train(train_data, tune=True)

# Score the validation rows and compare against their labels.
# NOTE(review): this requires validation_set.csv to contain the
# 'berlangganan_deposito' column -- confirm for unlabeled hold-out files.
preds = predictor.predict(val_data)
y_true = val_data['berlangganan_deposito']
predictor.evaluate(y_true, preds)

# Write one predicted probability per customer.
submission = val_data[['customer_number']].assign(berlangganan_deposito=preds)
submission.to_csv('submission.csv', index=False)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
