In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.exceptions import NotFittedError
import joblib

class SingleImprovedBankDepositPredictor:
    """Gradient-boosting predictor for the ``berlangganan_deposito`` target.

    Bundles preprocessing (mode imputation, label encoding, one engineered
    feature), feature scaling, stratified cross-validation, prediction and
    persistence of every fitted artifact.
    """

    TARGET_COLUMN = 'berlangganan_deposito'
    ID_COLUMN = 'customer_number'

    def __init__(self):
        # Fitted classifier; None until training or load_model() has run.
        self.model = None
        self.model_name = "GradientBoosting"
        # Scaler fitted on the full training feature matrix.
        self.scaler = StandardScaler()
        # column name -> LabelEncoder fitted on the training data.
        self.label_encoders = {}
        # Ordered feature names expected by the scaler/model.
        self.feature_columns = []
        # column name -> most frequent training value, used for imputation.
        # None until preprocess_data(..., is_training=True) has run.
        self.fill_values = None

    @staticmethod
    def _column_modes(data: pd.DataFrame) -> dict:
        """Return ``{column: most frequent non-null value}`` for *data*.

        Columns without any non-null value are omitted, so an empty frame
        yields an empty dict instead of raising.
        """
        modes = {}
        for col in data.columns:
            col_modes = data[col].mode(dropna=True)
            if not col_modes.empty:
                modes[col] = col_modes.iloc[0]
        return modes

    def preprocess_data(self, data: pd.DataFrame, is_training: bool = True) -> pd.DataFrame:
        """Impute, encode and feature-engineer *data* without mutating it.

        Args:
            data: Raw input frame.
            is_training: When True, the imputation values, label encoders
                and (if the target column is present) the feature list are
                learned from *data* and stored on the instance. When False,
                the previously learned values are applied.

        Returns:
            A new, preprocessed DataFrame.

        Raises:
            NotFittedError: At inference time, when *data* has an
                object-dtype column for which no encoder was fitted.
        """
        data = data.copy()

        # Drop the identifier first: it is never a feature, so computing its
        # mode or label-encoding it is wasted work.
        if self.ID_COLUMN in data.columns:
            data = data.drop(columns=[self.ID_COLUMN])

        # Handle missing values. The fill values are learned on the training
        # data and reused at inference so a scored batch (possibly a single
        # row) is imputed exactly the way the model was trained.
        if is_training:
            self.fill_values = self._column_modes(data)
            fill_values = self.fill_values
        else:
            fill_values = getattr(self, 'fill_values', None)
            if fill_values is None:
                # No stored training modes (e.g. a model file saved without
                # them): fall back to the modes of the batch itself.
                fill_values = self._column_modes(data)
        if fill_values:
            data = data.fillna(fill_values)

        # Encoding categorical features
        if is_training:
            # Forget encoders from a previous training run.
            self.label_encoders = {}
        categorical_columns = data.select_dtypes(include=['object']).columns
        for col in categorical_columns:
            if is_training:
                le = LabelEncoder()
                data[col] = le.fit_transform(data[col])
                self.label_encoders[col] = le
                continue

            le = self.label_encoders.get(col)
            if le is None:
                raise NotFittedError(f"LabelEncoder for {col} not found.")
            # Handle unseen categories: map them to the encoder's first class.
            known = data[col].isin(le.classes_)
            data[col] = le.transform(data[col].where(known, le.classes_[0]))

        # Simple feature engineering - only use existing columns
        if 'jumlah_kontak_kampanye_ini' in data.columns and 'jumlah_kontak_sebelumnya' in data.columns:
            data['total_kontak'] = data['jumlah_kontak_kampanye_ini'] + data['jumlah_kontak_sebelumnya']

        if is_training and self.TARGET_COLUMN in data.columns:
            self.feature_columns = [col for col in data.columns if col != self.TARGET_COLUMN]

        return data

    def train_model_with_proper_cv(self, df: pd.DataFrame) -> dict:
        """Cross-validate, then fit the final model on all of *df*.

        Scaling is part of the cross-validated pipeline, so each fold's
        scaler is fitted on that fold's training split only. The stored
        scaler and model are then fitted on the full dataset.

        Args:
            df: Training frame containing the ``berlangganan_deposito``
                target column.

        Returns:
            Dict with ``model_name``, ``cv_auc_score`` (mean ROC AUC over the
            folds) and ``cv_std`` (standard deviation over the folds).

        Raises:
            KeyError: If the target column is missing from *df*.
        """
        # Local import: the file's top-level imports do not include it.
        from sklearn.pipeline import make_pipeline

        df = self.preprocess_data(df, is_training=True)
        if self.TARGET_COLUMN not in df.columns:
            raise KeyError(self.TARGET_COLUMN)
        X = df[self.feature_columns]
        y = df[self.TARGET_COLUMN]

        # Use StratifiedKFold for proper cross-validation
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        # Model with slight regularization to prevent overfitting
        model = GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,  # Reduce depth to prevent overfitting
            min_samples_split=20,  # Require more samples to split
            min_samples_leaf=10,   # Require more samples in leaf
            random_state=42
        )

        # cross_val_score clones the pipeline for every fold, so `model`
        # itself is still unfitted afterwards and no held-out rows influence
        # the scaling statistics.
        cv_pipeline = make_pipeline(StandardScaler(), model)
        cv_scores = cross_val_score(cv_pipeline, X, y, cv=skf, scoring='roc_auc')
        mean_cv_score = cv_scores.mean()
        std_cv_score = cv_scores.std()

        print("Cross-Validation Results:")
        print(f"CV AUC: {mean_cv_score:.4f} (+/- {std_cv_score:.4f})")
        print(f"Individual CV scores: {[f'{score:.4f}' for score in cv_scores]}")

        # Train final scaler and model on full dataset
        X_scaled = self.scaler.fit_transform(X)
        model.fit(X_scaled, y)

        self.model = model

        return {
            "model_name": self.model_name,
            "cv_auc_score": mean_cv_score,
            "cv_std": std_cv_score
        }

    def predict(self, new_data: pd.DataFrame):
        """Return the positive-class probability for each row of *new_data*.

        Raises:
            NotFittedError: If no model has been trained or loaded.
        """
        if self.model is None:
            raise NotFittedError("Model not trained yet.")

        new_data = self.preprocess_data(new_data, is_training=False)
        X_new = new_data[self.feature_columns]
        X_new_scaled = self.scaler.transform(X_new)
        # Column 1 of predict_proba is the second class in model.classes_.
        return self.model.predict_proba(X_new_scaled)[:, 1]

    def save_model(self, path):
        """Persist the model and all fitted preprocessing state to *path*."""
        joblib.dump({
            'model': self.model,
            'scaler': self.scaler,
            'encoders': self.label_encoders,
            'features': self.feature_columns,
            'model_name': self.model_name,
            'fill_values': getattr(self, 'fill_values', None)
        }, path)

    def load_model(self, path):
        """Restore state written by save_model() from *path*.

        NOTE: joblib.load unpickles arbitrary objects; only load files from
        a trusted source.
        """
        model_dict = joblib.load(path)
        self.model = model_dict['model']
        self.scaler = model_dict['scaler']
        self.label_encoders = model_dict['encoders']
        self.feature_columns = model_dict['features']
        self.model_name = model_dict['model_name']
        # Files written before fill values were stored lack this key.
        self.fill_values = model_dict.get('fill_values')

    def save_predictions(self, customer_number, y_test_pred, filename='submission.csv'):
        """Write a two-column submission CSV and return its filename.

        Args:
            customer_number: Identifiers, one per prediction.
            y_test_pred: Predicted values, same length as *customer_number*.
            filename: Destination CSV path.
        """
        submission = pd.DataFrame({
            'customer_number': customer_number,
            'berlangganan_deposito': y_test_pred
        })
        submission.to_csv(filename, index=False)
        return filename

# Example usage
def main():
    """Run the end-to-end demo: load data, train, predict, write the CSV.

    Reads the training and validation CSVs from their GitHub URLs, trains
    the predictor with cross-validation, scores the validation set and
    saves the predictions to ``improved_submission.csv``.
    """
    train_url = 'https://raw.githubusercontent.com/difadlyaulhaq/junk/refs/heads/main/training_dataset.csv'
    validation_url = 'https://raw.githubusercontent.com/difadlyaulhaq/junk/refs/heads/main/validation_set.csv'
    output_name = 'improved_submission.csv'

    print("🏦 Single Improved Bank Deposit Prediction System")
    print("=" * 50)

    # Fetch both datasets
    print("\n📁 Loading training data...")
    train_df = pd.read_csv(train_url)
    print(f"✅ Training data loaded: {train_df.shape}")

    print("\n📁 Loading validation data...")
    validation_df = pd.read_csv(validation_url)
    print(f"✅ Validation data loaded: {validation_df.shape}")

    # Fit the predictor and report its cross-validated performance
    predictor = SingleImprovedBankDepositPredictor()

    print("\n🔧 Training model with proper cross-validation...")
    summary = predictor.train_model_with_proper_cv(train_df)
    cv_auc = summary['cv_auc_score']
    cv_std = summary['cv_std']

    print(f"\n🎯 Model: {summary['model_name']}")
    print(f"🎯 CV AUC Score: {cv_auc:.4f} (+/- {cv_std:.4f})")
    print(f"📊 Expected performance on unseen data: ~{cv_auc:.4f}")

    # Score the validation set
    print("\n🔮 Making predictions...")
    scores = predictor.predict(validation_df)

    # Write the submission file
    print("\n💾 Saving predictions...")
    saved_path = predictor.save_predictions(
        validation_df['customer_number'], scores, output_name
    )

    print("\n🎉 Process completed successfully!")
    print(f"📁 File saved: {saved_path}")
    print(f"📊 Prediction range: [{scores.min():.4f}, {scores.max():.4f}]")

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

🏦 Single Improved Bank Deposit Prediction System

📁 Loading training data...
✅ Training data loaded: (22916, 22)

📁 Loading validation data...
✅ Validation data loaded: (5729, 21)

🔧 Training model with proper cross-validation...
Cross-Validation Results:
CV AUC: 0.7943 (+/- 0.0095)
Individual CV scores: ['0.7798', '0.7980', '0.8089', '0.7912', '0.7936']

🎯 Model: GradientBoosting
🎯 CV AUC Score: 0.7943 (+/- 0.0095)
📊 Expected performance on unseen data: ~0.7943

🔮 Making predictions...

💾 Saving predictions...

🎉 Process completed successfully!
📁 File saved: improved_submission.csv
📊 Prediction range: [0.0172, 0.8685]
