In [None]:
import pandas as pd 
import numpy as np
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, auc, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
warnings.filterwarnings("ignore")

# Enhanced Feature Engineering
def create_enhanced_features(data, economic_scaler=None):
    """Derive the engineered banking-campaign features from the raw columns.

    The input frame is copied first, so the caller's DataFrame is left
    untouched.

    Args:
        data: DataFrame containing the raw columns read below
            (``hari_sejak_kontak_sebelumnya``, ``hasil_kampanye_sebelumnya``,
            ``jumlah_kontak_sebelumnya``, the three economic indicators,
            ``usia``, ``pinjaman_rumah``, ``pinjaman_pribadi``,
            ``bulan_kontak_terakhir`` and ``jumlah_kontak_kampanye_ini``).
        economic_scaler: Optional, already-fitted scaler exposing
            ``transform``. When supplied it standardises the economic
            indicators instead of a scaler freshly fitted on ``data``; pass
            the scaler fitted on the training frame when processing
            validation/test data so both share the same scale. When ``None``
            (the default) a new ``StandardScaler`` is fitted on ``data``.

    Returns:
        A new DataFrame holding the original columns (with the three economic
        indicators overwritten by their standardised values) plus the
        engineered columns.
    """
    data = data.copy()

    # Contact history features: the value 999 is mapped to flag 0, anything
    # else to 1.
    data['previous_contact_flag'] = np.where(data['hari_sejak_kontak_sebelumnya'] == 999, 0, 1)
    # Non-zero only when the previous campaign outcome was 'success'; outcomes
    # outside the mapping become NaN.
    data['success_ratio'] = (
        data['hasil_kampanye_sebelumnya'].map({'success': 1, 'failure': 0, 'nonexistent': 0}) *
        (data['jumlah_kontak_sebelumnya'] + 1)
    )

    # Economic composite features: standardise, then blend with fixed weights.
    economic_features = ['indeks_harga_konsumen', 'suku_bunga_euribor_3bln', 'tingkat_variasi_pekerjaan']
    if economic_scaler is None:
        data[economic_features] = StandardScaler().fit_transform(data[economic_features])
    else:
        data[economic_features] = economic_scaler.transform(data[economic_features])
    data['economic_risk_score'] = (
        0.4 * data['indeks_harga_konsumen']
        + 0.4 * data['suku_bunga_euribor_3bln']
        + 0.2 * data['tingkat_variasi_pekerjaan']
    )

    # Demographic features. The last bucket is labelled '66+', so its upper
    # edge is left open instead of silently turning ages above 100 into NaN.
    age_bins = [0, 25, 35, 45, 55, 65, np.inf]
    data['age_group'] = pd.cut(data['usia'], bins=age_bins,
                               labels=['18-25', '26-35', '36-45', '46-55', '56-65', '66+'])

    # Loan features: 'unknown' sits halfway between 'no' and 'yes'.
    loan_mapping = {'yes': 1, 'no': 0, 'unknown': 0.5}
    data['housing_loan'] = data['pinjaman_rumah'].map(loan_mapping)
    data['personal_loan'] = data['pinjaman_pribadi'].map(loan_mapping)

    # Temporal features: month abbreviation -> month number -> quarter label.
    month_map = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
                 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
    data['contact_month'] = data['bulan_kontak_terakhir'].map(month_map)
    data['quarter'] = pd.cut(data['contact_month'],
                             bins=[0, 3, 6, 9, 12],
                             labels=['Q1', 'Q2', 'Q3', 'Q4'])

    # Interaction features
    data['contact_success_ratio'] = data['jumlah_kontak_kampanye_ini'] * data['success_ratio']
    data['age_economic_interaction'] = data['usia'] * data['economic_risk_score']

    return data

# Robust Preprocessing Pipeline
def create_advanced_preprocessor():
    """Assemble the unfitted column-wise preprocessing step.

    Numeric columns go through a ``RobustScaler``; categorical columns are
    ordinal-encoded, with categories unseen at fit time encoded as ``-1``.
    Columns not listed here are dropped by the transformer.

    Returns:
        tuple: ``(preprocessor, feature_names)`` where ``preprocessor`` is the
        ``ColumnTransformer`` and ``feature_names`` lists the numeric columns
        followed by the categorical ones.
    """
    numeric_cols = [
        'usia', 'jumlah_kontak_kampanye_ini', 'hari_sejak_kontak_sebelumnya',
        'jumlah_kontak_sebelumnya', 'indeks_harga_konsumen',
        'indeks_kepercayaan_konsumen', 'suku_bunga_euribor_3bln',
        'jumlah_pekerja', 'previous_contact_flag', 'success_ratio',
        'economic_risk_score', 'housing_loan', 'personal_loan',
        'contact_month', 'contact_success_ratio', 'age_economic_interaction',
    ]
    categorical_cols = [
        'pekerjaan', 'status_perkawinan', 'pendidikan', 'gagal_bayar_sebelumnya',
        'pinjaman_rumah', 'pinjaman_pribadi', 'jenis_kontak', 'quarter',
        'hasil_kampanye_sebelumnya', 'pulau', 'age_group',
    ]

    robust_scaling = Pipeline(steps=[('scaler', RobustScaler())])
    ordinal_encoding = Pipeline(steps=[
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ])

    column_transformer = ColumnTransformer(
        transformers=[
            ('num', robust_scaling, numeric_cols),
            ('cat', ordinal_encoding, categorical_cols),
        ],
        remainder='drop',
    )

    return column_transformer, [*numeric_cols, *categorical_cols]

# Optimized Objective Function
def enhanced_objective(trial, X, y, preprocessor):
    """Score one Optuna trial by its mean 5-fold ROC AUC.

    The trial chooses a boosting library ("xgb", "lgb" or "cat"), that
    library's hyperparameters and an optional oversampler. The resulting
    preprocessor -> (sampler) -> model pipeline is fitted on every stratified
    fold and scored on the held-out part.

    Args:
        trial: Optuna trial used to draw every tunable value.
        X: Feature DataFrame; rows are selected positionally via ``.iloc``.
        y: Target Series aligned with ``X``.
        preprocessor: Transformer placed first in the pipeline. The same
            object is re-fitted on each fold.

    Returns:
        The mean ROC AUC across the five validation folds.
    """
    algorithm = trial.suggest_categorical("algorithm", ["xgb", "lgb", "cat"])

    # --- Model selection -------------------------------------------------
    # The suggest_* calls are kept in a fixed order per algorithm so that a
    # seeded sampler draws the same values.
    if algorithm in ("xgb", "lgb"):
        # Search space shared by the XGBoost and LightGBM branches.
        # NOTE(review): LightGBM documents that `subsample` only takes effect
        # when `subsample_freq` > 0, which is not set here -- confirm whether
        # row bagging was intended for the "lgb" branch.
        booster_kwargs = {
            'n_estimators': trial.suggest_int("n_estimators", 200, 1000),
            'max_depth': trial.suggest_int("max_depth", 3, 9),
            'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
            'subsample': trial.suggest_float("subsample", 0.6, 1.0),
            'colsample_bytree': trial.suggest_float("colsample_bytree", 0.6, 1.0),
            'reg_alpha': trial.suggest_float("reg_alpha", 1e-4, 1.0, log=True),
            'reg_lambda': trial.suggest_float("reg_lambda", 1e-4, 1.0, log=True),
        }
        if algorithm == "xgb":
            booster_kwargs['scale_pos_weight'] = trial.suggest_float("scale_pos_weight", 1, 20)
            # Fixed settings: seed, metric and GPU histogram training on device 0.
            booster_kwargs.update(
                random_state=42,
                eval_metric='auc',
                tree_method='gpu_hist',
                gpu_id=0,
            )
            model = XGBClassifier(**booster_kwargs)
        else:
            booster_kwargs['num_leaves'] = trial.suggest_int("num_leaves", 15, 255)
            booster_kwargs['min_child_samples'] = trial.suggest_int("min_child_samples", 20, 100)
            # Fixed settings: seed and GPU training on platform 0 / device 0.
            booster_kwargs.update(
                random_state=42,
                device='gpu',
                gpu_platform_id=0,
                gpu_device_id=0,
            )
            model = LGBMClassifier(**booster_kwargs)
    else:
        model = CatBoostClassifier(
            iterations=trial.suggest_int("iterations", 200, 1000),
            depth=trial.suggest_int("depth", 3, 8),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
            l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-4, 10.0, log=True),
            random_seed=42,
            verbose=False,
            task_type='GPU',  # train on the GPU ...
            devices='0',  # ... using the first device
            auto_class_weights='Balanced',  # reweight classes for the imbalance
        )

    # --- Optional oversampling ---------------------------------------------
    sampling_choice = trial.suggest_categorical("sampling", ["smote", "adasyn", None])
    resampler_cls = {"smote": SMOTE, "adasyn": ADASYN}.get(sampling_choice)
    resampler = resampler_cls(random_state=42) if resampler_cls is not None else None

    # --- Pipeline layout ---------------------------------------------------
    steps = [('preprocessor', preprocessor)]
    if resampler is not None:
        steps.append(('sampler', resampler))
    steps.append(('model', model))

    def _fold_auc(fit_rows, eval_rows):
        """Fit a fresh pipeline on one fold and return its validation AUC."""
        fold_pipeline = ImbPipeline(list(steps))
        fold_pipeline.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        fold_proba = fold_pipeline.predict_proba(X.iloc[eval_rows])[:, 1]
        return roc_auc_score(y.iloc[eval_rows], fold_proba)

    # --- Cross-validation --------------------------------------------------
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    return np.mean([_fold_auc(fit_rows, eval_rows) for fit_rows, eval_rows in folds.split(X, y)])

# Main Training Function
def _build_final_model(algorithm, tuned_params):
    """Instantiate the winning estimator with its tuned and fixed settings.

    ``study.best_params`` only holds the values drawn through ``trial.suggest_*``,
    so the constants used for every trial during the search (metric, GPU
    settings, CatBoost class weighting) are re-applied here; otherwise the
    retrained model would not match the one that was tuned.
    """
    if algorithm == "xgb":
        return XGBClassifier(
            **tuned_params,
            random_state=42,
            eval_metric='auc',
            tree_method='gpu_hist',
            gpu_id=0,
        )
    if algorithm == "lgb":
        return LGBMClassifier(
            **tuned_params,
            random_state=42,
            device='gpu',
            gpu_platform_id=0,
            gpu_device_id=0,
        )
    return CatBoostClassifier(
        **tuned_params,
        random_seed=42,
        verbose=False,
        task_type='GPU',
        devices='0',
        auto_class_weights='Balanced',
    )


def train_optimized_model_and_validate():
    """Tune, retrain and validate the subscription model.

    Reads ``training_dataset.csv``, runs a 100-trial Optuna search that
    maximises cross-validated ROC AUC, refits the best configuration on the
    whole training set, then reports and plots the ROC AUC obtained on
    ``validation_set.csv``.

    Returns:
        tuple: ``(final_pipeline, features, study)`` -- the fitted pipeline,
        the list of input column names, and the finished Optuna study.
    """
    print("Loading training data...")
    df_train = pd.read_csv("training_dataset.csv")
    df_train = create_enhanced_features(df_train)

    preprocessor, features = create_advanced_preprocessor()
    X = df_train[features]
    y = df_train['berlangganan_deposito']

    print(f"\nTraining Dataset Info:\nRows: {X.shape[0]}, Features: {X.shape[1]}")
    print(f"Class Distribution:\n{y.value_counts().to_dict()}\n")

    # Optimize hyperparameters
    study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(lambda trial: enhanced_objective(trial, X, y, preprocessor),
                   n_trials=100, show_progress_bar=True)

    print("\nBest Trial:")
    print(f"AUC: {study.best_value:.4f}")
    print(f"Params: {study.best_params}")

    # Split the best trial into the structural choices and the tuned values.
    best_params = study.best_params.copy()
    algorithm = best_params.pop('algorithm')
    sampling = best_params.pop('sampling')

    model = _build_final_model(algorithm, best_params)

    print("\nRetraining best model on full training set...")
    pipeline_steps = [('preprocessor', preprocessor)]
    if sampling == "smote":
        pipeline_steps.append(('sampler', SMOTE(random_state=42)))
    elif sampling == "adasyn":
        pipeline_steps.append(('sampler', ADASYN(random_state=42)))
    pipeline_steps.append(('model', model))

    final_pipeline = ImbPipeline(pipeline_steps)
    final_pipeline.fit(X, y)

    print("Loading and processing validation set...")
    df_val = pd.read_csv("validation_set.csv")
    # NOTE: by default create_enhanced_features fits its own StandardScaler on
    # the frame it receives, so these economic indicators are standardised
    # with validation-set statistics rather than the training ones.
    df_val = create_enhanced_features(df_val)

    X_val = df_val[features]
    y_val = df_val['berlangganan_deposito']

    print(f"Validation Dataset Info:\nRows: {X_val.shape[0]}, Features: {X_val.shape[1]}")

    # Predict and evaluate
    y_proba = final_pipeline.predict_proba(X_val)[:, 1]
    auc_val = roc_auc_score(y_val, y_proba)

    print(f"\n✅ Validation AUC: {auc_val:.4f}")

    # ROC curve on the validation set
    fpr, tpr, _ = roc_curve(y_val, y_proba)
    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, label=f'AUC = {auc_val:.4f}')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve on Validation Set')
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.tight_layout()
    plt.show()
    return final_pipeline, features, study

# Prediction Function
def make_predictions(model, features, validation_file="validation_set.csv", output_file="submission.csv"):
    """Score the validation file and write the predictions to a CSV.

    Args:
        model: Fitted estimator/pipeline exposing ``predict_proba``.
        features: Names of the columns the model expects.
        validation_file: CSV to score; must contain ``customer_number``.
        output_file: Destination CSV for the predictions.

    Returns:
        DataFrame with ``customer_number`` and the predicted probability in
        ``berlangganan_deposito``, sorted by ``customer_number``.
    """
    val = pd.read_csv(validation_file)
    val = create_enhanced_features(val)

    # Report any expected column that is absent, in the order the model
    # expects them (each name once), so the warning is reproducible.
    available_features = [f for f in features if f in val.columns]
    missing = [f for f in dict.fromkeys(features) if f not in val.columns]
    if missing:
        print(f"Warning: Missing features in validation data: {missing}")

    X_val = val[available_features]
    y_pred = model.predict_proba(X_val)[:, 1]

    submission = pd.DataFrame({
        'customer_number': val['customer_number'],
        'berlangganan_deposito': y_pred
    }).sort_values('customer_number')

    submission.to_csv(output_file, index=False)
    print(f"\nPredictions saved to {output_file}")
    return submission

# Evaluation Function
def evaluate_model(model, features):
    """Evaluate ``model`` on ``validation_set.csv`` with plots and a report.

    Draws the ROC and precision-recall curves side by side and prints a
    classification report based on ``model.predict``.

    Args:
        model: Fitted estimator/pipeline exposing ``predict_proba`` and
            ``predict``.
        features: Names of the columns fed to the model.

    Returns:
        The ROC AUC on the validation file, or ``None`` when the file has no
        ``berlangganan_deposito`` column.
    """
    frame = pd.read_csv("validation_set.csv")
    if 'berlangganan_deposito' not in frame.columns:
        print("No labels for evaluation")
        return

    frame = create_enhanced_features(frame)
    inputs = frame[features]
    labels = frame['berlangganan_deposito']

    scores = model.predict_proba(inputs)[:, 1]
    auc_score = roc_auc_score(labels, scores)

    false_pos_rate, true_pos_rate, _ = roc_curve(labels, scores)
    precision, recall, _ = precision_recall_curve(labels, scores)

    fig, (roc_ax, pr_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: ROC curve with the chance diagonal.
    roc_ax.plot(false_pos_rate, true_pos_rate, label=f'AUC = {auc_score:.3f}')
    roc_ax.plot([0, 1], [0, 1], 'k--')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('ROC Curve')
    roc_ax.legend()

    # Right panel: precision-recall curve.
    pr_ax.plot(recall, precision, label='PR Curve')
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.set_title('Precision-Recall Curve')
    pr_ax.legend()

    fig.tight_layout()
    plt.show()

    # Hard-label report uses the model's own predict() decision.
    hard_labels = model.predict(inputs)
    print(classification_report(labels, hard_labels))

    return auc_score
# === Submission file creation ===
def create_submission_file(model_pipeline, feature_names, validation_file="validation_set.csv", output_file="validation_predictions.csv"):
    """Write predicted probabilities for ``validation_file`` to ``output_file``.

    Args:
        model_pipeline: Fitted estimator/pipeline exposing ``predict_proba``.
        feature_names: Names of the columns fed to the pipeline.
        validation_file: CSV to score; must contain ``customer_number``.
        output_file: Destination CSV with ``customer_number`` and the
            predicted probability in ``berlangganan_deposito``.
    """
    df_val = pd.read_csv(validation_file)
    df_val = create_enhanced_features(df_val)  # add the engineered columns
    X_val = df_val[feature_names]

    # Predicted probability taken from column 1 of predict_proba
    y_proba = model_pipeline.predict_proba(X_val)[:, 1]

    # Build the result frame
    submission = pd.DataFrame({
        "customer_number": df_val["customer_number"],
        "berlangganan_deposito": y_proba
    })

    # Save to file
    submission.to_csv(output_file, index=False)
    print(f"✅ File submission berhasil disimpan sebagai '{output_file}'")

# Main Execution
# Runs the whole workflow: tune + train, write predictions, evaluate, then
# write a second predictions file.
if __name__ == "__main__":
    # Returns the fitted pipeline, the input column names and the Optuna study.
    model, features, study = train_optimized_model_and_validate()
    print("\nTraining completed. Best parameters found:")
    print(study.best_params)
    # Scores validation_set.csv and writes submission.csv (sorted by customer).
    submission = make_predictions(model, features)
    # Re-reads validation_set.csv; plots ROC/PR curves and prints a report.
    evaluate_model(model, features)
    print("\nProcess completed successfully!")
    # Writes the same probabilities, unsorted, to validation_predictions.csv.
    create_submission_file(model, features)
    print("\nSubmission file created successfully!")

[I 2025-05-26 17:14:44,114] A new study created in memory with name: no-name-71e3bd1d-abd5-47a7-bfb2-451d22ca0a5b


Loading training data...

Training Dataset Info:
Rows: 22916, Features: 27
Class Distribution:
{0: 20302, 1: 2614}



  0%|          | 0/100 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 16241, number of negative: 16241
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6627
[LightGBM] [Info] Number of data points in the train set: 32482, number of used features: 27
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1650, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 17 dense feature groups (0.62 MB) transferred to GPU in 0.003850 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 16242, number of negative: 16242
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6628
[LightGBM] [Info] Number of data points in the train set: 32484, number of used features: 27
[LightGBM] [Info] Us