In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.exceptions import NotFittedError
import warnings
import joblib

In [49]:
class BankDepositPredictor:
    def __init__(self):
        self.model = None
        self.model_name = None
        self.scaler = StandardScaler()
        self.label_encoders = {}
        self.feature_columns = []

    def preprocess_data(self, data, is_training=True):
        data = data.copy()

        # Handling missing values
        data.fillna(data.mode().iloc[0], inplace=True)

        # Encoding categorical features
        categorical_columns = data.select_dtypes(include=['object']).columns
        for col in categorical_columns:
            if is_training:
                le = LabelEncoder()
                data[col] = le.fit_transform(data[col])
                self.label_encoders[col] = le
            else:
                le = self.label_encoders.get(col)
                if le:
                    data[col] = data[col].apply(lambda x: x if x in le.classes_ else le.classes_[0])
                    data[col] = le.transform(data[col])
                else:
                    raise NotFittedError(f"LabelEncoder for {col} not found. Fit the model first.")

        # Feature engineering (add example interaction features)
        if 'pemasukan' in data.columns and 'jumlah_tanggungan' in data.columns:
            data['rasio_pemasukan_tanggungan'] = data['pemasukan'] / (data['jumlah_tanggungan'] + 1)

        # Drop unnecessary columns
        if 'customer_number' in data.columns:
            data.drop(columns=['customer_number'], inplace=True)

        if is_training and 'berlangganan_deposito' in data.columns:
            self.feature_columns = [col for col in data.columns if col != 'berlangganan_deposito']

        return data

    def train_model(self, df):
        df = self.preprocess_data(df, is_training=True)
        X = df[self.feature_columns]
        y = df['berlangganan_deposito']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        model = GradientBoostingClassifier(n_estimators=100, random_state=42)
        model.fit(X_train_scaled, y_train)
        y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
        auc = roc_auc_score(y_test, y_pred_proba)
        print(f"GradientBoosting AUC: {auc:.4f}")

        self.model = model
        self.model_name = "GradientBoosting"

        return {
            "model_name": self.model_name,
            "auc_score": auc
        }

    def predict(self, new_data):
        new_data = self.preprocess_data(new_data, is_training=False)
        X_new = new_data[self.feature_columns]
        X_new_scaled = self.scaler.transform(X_new)
        return self.model.predict_proba(X_new_scaled)[:, 1]

    def save_model(self, path):
        joblib.dump({
            'model': self.model,
            'scaler': self.scaler,
            'encoders': self.label_encoders,
            'features': self.feature_columns,
            'model_name': self.model_name
        }, path)

    def load_model(self, path):
        model_dict = joblib.load(path)
        self.model = model_dict['model']
        self.scaler = model_dict['scaler']
        self.label_encoders = model_dict['encoders']
        self.feature_columns = model_dict['features']
        self.model_name = model_dict['model_name']

    def save_predictions(self, customer_number, y_test_pred, filename='submission.csv'):
        submission = pd.DataFrame({
            'customer_number': customer_number,
            'berlangganan_deposito': y_test_pred
        })
        submission.to_csv(filename, index=False)
        return filename



In [50]:
print("🏦 Bank Deposit Prediction System")
print("=" * 50)

# Load training dataset
print("\n📁 Loading training data...")
try:
    train_data = pd.read_csv('https://raw.githubusercontent.com/difadlyaulhaq/Data_Quest_Hackathon/c73ff0ec05c5706f574a10b00556057dce362346/training_dataset.csv')
    print(f"✅ Training data loaded: {train_data.shape}")
    
    # Explore data structure
    print(f"\n📊 Column info:")
    print(f"Total columns: {len(train_data.columns)}")
    print(f"Columns: {train_data.columns.tolist()}")
    
    print(f"\n🔍 Data types:")
    for col in train_data.columns:
        dtype = train_data[col].dtype
        nunique = train_data[col].nunique()
        null_count = train_data[col].isnull().sum()
        print(f"   {col}: {dtype} | Unique: {nunique} | Nulls: {null_count}")
    
    if 'berlangganan_deposito' in train_data.columns:
        print(f"\n📈 Target distribution:")
        print(train_data['berlangganan_deposito'].value_counts())
    else:
        print(f"\n⚠️  Target column 'berlangganan_deposito' not found!")
        print(f"Available columns: {train_data.columns.tolist()}")
        
except FileNotFoundError:
    print("❌ File 'training_dataset.csv' tidak ditemukan!")
    print("   Pastikan file ada di directory yang sama dengan notebook ini")

# Load validation dataset
print("\n📁 Loading validation data...")
try:
    validation_data = pd.read_csv('https://raw.githubusercontent.com/difadlyaulhaq/Data_Quest_Hackathon/c73ff0ec05c5706f574a10b00556057dce362346/validation_set.csv')
    print(f"✅ Validation data loaded: {validation_data.shape}")
except FileNotFoundError:
    print("❌ File 'validation_dataset.csv' tidak ditemukan!")
    print("   Pastikan file ada di directory yang sama dengan notebook ini.")


🏦 Bank Deposit Prediction System

📁 Loading training data...
✅ Training data loaded: (22916, 22)

📊 Column info:
Total columns: 22
Columns: ['customer_number', 'usia', 'pekerjaan', 'status_perkawinan', 'pendidikan', 'gagal_bayar_sebelumnya', 'pinjaman_rumah', 'pinjaman_pribadi', 'jenis_kontak', 'bulan_kontak_terakhir', 'hari_kontak_terakhir', 'jumlah_kontak_kampanye_ini', 'hari_sejak_kontak_sebelumnya', 'jumlah_kontak_sebelumnya', 'hasil_kampanye_sebelumnya', 'tingkat_variasi_pekerjaan', 'indeks_harga_konsumen', 'indeks_kepercayaan_konsumen', 'suku_bunga_euribor_3bln', 'jumlah_pekerja', 'pulau', 'berlangganan_deposito']

🔍 Data types:
   customer_number: int64 | Unique: 22916 | Nulls: 0
   usia: int64 | Unique: 78 | Nulls: 0
   pekerjaan: object | Unique: 12 | Nulls: 0
   status_perkawinan: object | Unique: 4 | Nulls: 0
   pendidikan: object | Unique: 8 | Nulls: 0
   gagal_bayar_sebelumnya: object | Unique: 3 | Nulls: 0
   pinjaman_rumah: object | Unique: 3 | Nulls: 0
   pinjaman_priba

In [51]:
# Inisialisasi predictor
predictor = BankDepositPredictor()

# Train model
print("\n🔧 Training model...")
model = predictor.train_model(train_data)


🔧 Training model...
GradientBoosting AUC: 0.7998


In [52]:
# Make predictions
print("\n🔮 Making predictions...")
predictions = predictor.predict(validation_data)

# Display sample predictions
print(f"\n📊 Sample predictions:")
print(predictions[:10])

print(f"\n📋 Prediction Summary:")
print(f"   Total predictions: {len(predictions)}")
print(f"   Model used: {predictor.model_name}")


🔮 Making predictions...

📊 Sample predictions:
[0.05395663 0.03551548 0.03482137 0.03720204 0.06693379 0.03509629
 0.04331913 0.03545669 0.30902294 0.0710289 ]

📋 Prediction Summary:
   Total predictions: 5729
   Model used: GradientBoosting


In [53]:
print("\n💾 Saving predictions...")
filename = predictor.save_predictions(validation_data['customer_number'], predictions, 'submission.csv')


print(f"\n🎉 Process completed successfully!")
print(f"📁 File saved: {filename}")
print(f"📊 Ready for submission!")

# Verify file format
print(f"\n🔍 Verifying file format...")
saved_data = pd.read_csv('submission.csv')  # Tambahkan ini!
print(saved_data['berlangganan_deposito'])  # Tambahkan ini!
# saved_data['berlangganan_deposito'] = saved_data['berlangganan_deposito'].astype(int)
# print(f"   Columns: {saved_data.columns.tolist()}")
# print(f"   Shape: {saved_data.shape}")
# print(f"   Probability range: [{saved_data['berlangganan_deposito'].min():.4f}, {saved_data['berlangganan_deposito'].max():.4f}]")



💾 Saving predictions...

🎉 Process completed successfully!
📁 File saved: submission.csv
📊 Ready for submission!

🔍 Verifying file format...
0       0.053957
1       0.035515
2       0.034821
3       0.037202
4       0.066934
          ...   
5724    0.043218
5725    0.042101
5726    0.034469
5727    0.066810
5728    0.041142
Name: berlangganan_deposito, Length: 5729, dtype: float64
