In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder, PolynomialFeatures
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e10/sample_submission.csv
/kaggle/input/playground-series-s4e10/train.csv
/kaggle/input/playground-series-s4e10/test.csv


In [2]:
# Load the data
train_df = pd.read_csv('/kaggle/input/playground-series-s4e10/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')

print(train_df.info())
   

# Preprocess the data
numeric_features = train_df.select_dtypes(include=[np.float64, np.int64]).columns.tolist()
numeric_features.remove('loan_status')
numeric_features.remove('id')
categorical_features = train_df.select_dtypes(include=[object]).columns.tolist()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58645 entries, 0 to 58644
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          58645 non-null  int64  
 1   person_age                  58645 non-null  int64  
 2   person_income               58645 non-null  int64  
 3   person_home_ownership       58645 non-null  object 
 4   person_emp_length           58645 non-null  float64
 5   loan_intent                 58645 non-null  object 
 6   loan_grade                  58645 non-null  object 
 7   loan_amnt                   58645 non-null  int64  
 8   loan_int_rate               58645 non-null  float64
 9   loan_percent_income         58645 non-null  float64
 10  cb_person_default_on_file   58645 non-null  object 
 11  cb_person_cred_hist_length  58645 non-null  int64  
 12  loan_status                 58645 non-null  int64  
dtypes: float64(3), int64(6), object

In [3]:
# Scale numeric features
scaler = RobustScaler()
train_df[numeric_features] = scaler.fit_transform(train_df[numeric_features])
test_df[numeric_features] = scaler.transform(test_df[numeric_features])

# Polynomial features
poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
train_poly = poly.fit_transform(train_df[numeric_features])
test_poly = poly.transform(test_df[numeric_features])
poly_features = poly.get_feature_names_out(numeric_features)

train_poly_df = pd.DataFrame(train_poly, columns=poly_features)
test_poly_df = pd.DataFrame(test_poly, columns=poly_features)

In [4]:
# One-Hot Encoding for Categorical Features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
train_encoded = encoder.fit_transform(train_df[categorical_features])
test_encoded = encoder.transform(test_df[categorical_features])
encoded_features = encoder.get_feature_names_out(categorical_features)

train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_features)
test_encoded_df = pd.DataFrame(test_encoded, columns=encoded_features)

# Combine encoded features with polynomial features

train_processed = pd.concat([train_poly_df, train_encoded_df], axis=1)
test_processed = pd.concat([test_poly_df, test_encoded_df], axis=1)

In [5]:
# Resample the data to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(train_processed, train_df['loan_status'])


# Convert data to PyTorch tensors and move to MPS if available
X_resampled = torch.tensor(X_resampled.values, dtype=torch.float32)
y_resampled = torch.tensor(y_resampled.values, dtype=torch.float32)

# Create a dataset and split into training and validation sets
dataset = TensorDataset(X_resampled, y_resampled)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [6]:
class Net1(nn.Module):
    def __init__(self, input_dim, neurons=189, dropout_rate=0.32780849026968734):
        super(Net1, self).__init__()
        self.fc1 = nn.Linear(input_dim, neurons)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(neurons, neurons // 2)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_rate)
        self.output = nn.Linear(neurons // 2, 1)
        
    def forward(self, X):
        X = self.relu1(self.fc1(X))
        X = self.dropout1(X)
        X = self.relu2(self.fc2(X))
        X = self.dropout2(X)
        X = self.output(X)
        return X.view(-1)


In [7]:
class Net2(nn.Module):
    def __init__(self, input_dim, neurons=189, dropout_rate=0.32780849026968734):
        super(Net2, self).__init__()
        self.fc1 = nn.Linear(input_dim, neurons)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(neurons, neurons // 2)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_rate)
        self.fc3 = nn.Linear(neurons // 2, neurons // 4)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(dropout_rate)
        self.output = nn.Linear(neurons // 4, 1)
        
    def forward(self, X):
        X = self.relu1(self.fc1(X))
        X = self.dropout1(X)
        X = self.relu2(self.fc2(X))
        X = self.dropout2(X)
        X = self.relu3(self.fc3(X))
        X = self.dropout3(X)
        X = self.output(X)
        return X.view(-1)

In [8]:
class Net3(nn.Module):
    def __init__(self, input_dim, neurons=189, dropout_rate=0.32780849026968734):
        super(Net3, self).__init__()
        self.fc1 = nn.Linear(input_dim, neurons)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(neurons, neurons)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_rate)
        self.output = nn.Linear(neurons, 1)
        
    def forward(self, X):
        X = self.relu1(self.fc1(X))
        X = self.dropout1(X)
        X = self.relu2(self.fc2(X))
        X = self.dropout2(X)
        X = self.output(X)
        return X.view(-1)

In [9]:
class Net4(nn.Module):
    def __init__(self, input_dim, neurons=189, dropout_rate=0.32780849026968734):
        super(Net4, self).__init__()
        self.fc1 = nn.Linear(input_dim, neurons)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(neurons, neurons // 2)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_rate)
        self.fc3 = nn.Linear(neurons // 2, neurons // 4)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(dropout_rate)
        self.fc4 = nn.Linear(neurons // 4, neurons // 8)
        self.relu4 = nn.ReLU()
        self.dropout4 = nn.Dropout(dropout_rate)
        self.output = nn.Linear(neurons // 8, 1)
        
    def forward(self, X):
        X = self.relu1(self.fc1(X))
        X = self.dropout1(X)
        X = self.relu2(self.fc2(X))
        X = self.dropout2(X)
        X = self.relu3(self.fc3(X))
        X = self.dropout3(X)
        X = self.relu4(self.fc4(X))
        X = self.dropout4(X)
        X = self.output(X)
        return X.view(-1)

In [10]:
import optuna

def objective(trial, model_class, train_loader, val_loader, input_dim):
    neurons = trial.suggest_int('neurons', 64, 256)
    dropout_rate = trial.suggest_float('dropout_rate', 0.3, 0.7)
    lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
    weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-3)
    
    model = model_class(input_dim=input_dim, neurons=neurons, dropout_rate=dropout_rate)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.BCEWithLogitsLoss()
    
    # Training loop
    num_epochs = 10
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
    
    # Validation
    model.eval()
    val_targets = []
    val_outputs = []
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            val_targets.extend(targets.cpu().numpy())
            val_outputs.extend(torch.sigmoid(outputs).cpu().numpy())
    
    roc_auc = roc_auc_score(val_targets, val_outputs)
    return roc_auc

In [11]:
# Define the models
model_classes = [Net1, Net2, Net3, Net4]
best_params = []
input_dim = X_resampled.size(1)

for model_class in model_classes:
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, model_class, train_loader, val_loader, input_dim), n_trials=50)
    best_params.append(study.best_params)
    print(f'Best trial for {model_class.__name__}: {study.best_trial.value}')
    print(f'Best parameters for {model_class.__name__}: {study.best_trial.params}')

[I 2024-10-31 17:52:03,579] A new study created in memory with name: no-name-15236b1d-5305-48fb-89e6-ee695dd4937d
  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-3)
[I 2024-10-31 17:52:37,548] Trial 0 finished with value: 0.9417242865756961 and parameters: {'neurons': 135, 'dropout_rate': 0.5411067887883838, 'lr': 0.00016164381566585593, 'weight_decay': 2.34376517932625e-05}. Best is trial 0 with value: 0.9417242865756961.
  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-3)
[I 2024-10-31 17:53:12,011] Trial 1 finished with value: 0.9423664728214493 and parameters: {'neurons': 181, 'dropout_rate': 0.6165208457192888, 'lr': 0.006263893549221401, 'weight_decay': 3.8898510913585865e-05}. Best is trial 1 with value: 0.9423664728214493.
  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-3)
[I

Best trial for Net1: 0.9590049813663793
Best parameters for Net1: {'neurons': 222, 'dropout_rate': 0.3690023618574458, 'lr': 0.0012089281396259566, 'weight_decay': 4.271042549679235e-05}


[I 2024-10-31 18:21:59,603] Trial 0 finished with value: 0.950465329533443 and parameters: {'neurons': 182, 'dropout_rate': 0.43770488706729516, 'lr': 0.003987801511174416, 'weight_decay': 3.428556977374144e-05}. Best is trial 0 with value: 0.950465329533443.
  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-3)
[I 2024-10-31 18:22:40,755] Trial 1 finished with value: 0.9476385744101857 and parameters: {'neurons': 196, 'dropout_rate': 0.5211903628140426, 'lr': 0.000386506894515652, 'weight_decay': 0.00022261033748529672}. Best is trial 0 with value: 0.950465329533443.
  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-3)
[I 2024-10-31 18:23:24,015] Trial 2 finished with value: 0.957674511102684 and parameters: {'neurons': 191, 'dropout_rate': 0.314327729861347, 'lr': 0.0017433542349776974, 'weight_decay': 5.2986973125007486e-05}. Best is trial 2 with value

Best trial for Net2: 0.9617839535394548
Best parameters for Net2: {'neurons': 255, 'dropout_rate': 0.31746153219513246, 'lr': 0.0007083486502211974, 'weight_decay': 7.347363763567497e-05}


[I 2024-10-31 18:56:54,079] Trial 0 finished with value: 0.9567243227501914 and parameters: {'neurons': 255, 'dropout_rate': 0.32856398473483217, 'lr': 0.0034639857070532447, 'weight_decay': 1.720508866187133e-05}. Best is trial 0 with value: 0.9567243227501914.
  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-3)
[I 2024-10-31 18:57:30,393] Trial 1 finished with value: 0.9533569664803568 and parameters: {'neurons': 152, 'dropout_rate': 0.46961197261238896, 'lr': 0.0007548147145311098, 'weight_decay': 3.892879094253391e-05}. Best is trial 0 with value: 0.9567243227501914.
  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-3)
[I 2024-10-31 18:58:01,617] Trial 2 finished with value: 0.9381186431725641 and parameters: {'neurons': 65, 'dropout_rate': 0.6751161587830555, 'lr': 0.004409513625482757, 'weight_decay': 0.00013148316431168408}. Best is trial 0 with 

Best trial for Net3: 0.9610354281146753
Best parameters for Net3: {'neurons': 237, 'dropout_rate': 0.30115832254713687, 'lr': 0.0026377962563592368, 'weight_decay': 2.38570495636126e-05}


[I 2024-10-31 19:30:29,790] Trial 0 finished with value: 0.9200332141169565 and parameters: {'neurons': 126, 'dropout_rate': 0.647713285726964, 'lr': 0.007936504893797583, 'weight_decay': 0.0009618783583804862}. Best is trial 0 with value: 0.9200332141169565.
  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-3)
[I 2024-10-31 19:31:11,991] Trial 1 finished with value: 0.9537089759478745 and parameters: {'neurons': 148, 'dropout_rate': 0.3692544805805613, 'lr': 0.001498605334545766, 'weight_decay': 9.415764678448518e-05}. Best is trial 1 with value: 0.9537089759478745.
  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-3)
[I 2024-10-31 19:31:50,221] Trial 2 finished with value: 0.9434209247671708 and parameters: {'neurons': 105, 'dropout_rate': 0.3156509616976256, 'lr': 0.00018546307082501357, 'weight_decay': 1.163734541591738e-05}. Best is trial 1 with val

Best trial for Net4: 0.9595014417216625
Best parameters for Net4: {'neurons': 209, 'dropout_rate': 0.35285851386934997, 'lr': 0.0009650056752537214, 'weight_decay': 1.69872979653766e-05}


In [12]:
# Train each model with the best hyperparameters
trained_models = []
roc_auc_scores = []

for model_class, params in zip(model_classes, best_params):
    model = model_class(input_dim=input_dim, neurons=params['neurons'], dropout_rate=params['dropout_rate'])
    optimizer = optim.Adam(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
    criterion = nn.BCEWithLogitsLoss()
    
    # Training loop
    num_epochs = 100
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
        
        epoch_loss = running_loss / len(train_loader.dataset)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')
    
    # Validation
    model.eval()
    val_targets = []
    val_outputs = []
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            val_targets.extend(targets.cpu().numpy())
            val_outputs.extend(torch.sigmoid(outputs).cpu().numpy())
    
    roc_auc = roc_auc_score(val_targets, val_outputs)
    print(f'Validation ROC AUC Score for {model_class.__name__}: {roc_auc:.4f}')
    trained_models.append(model)
    roc_auc_scores.append(roc_auc)

Epoch 1/100, Loss: 0.3483
Epoch 2/100, Loss: 0.3086
Epoch 3/100, Loss: 0.2993
Epoch 4/100, Loss: 0.2918
Epoch 5/100, Loss: 0.2864
Epoch 6/100, Loss: 0.2809
Epoch 7/100, Loss: 0.2767
Epoch 8/100, Loss: 0.2735
Epoch 9/100, Loss: 0.2688
Epoch 10/100, Loss: 0.2670
Epoch 11/100, Loss: 0.2648
Epoch 12/100, Loss: 0.2622
Epoch 13/100, Loss: 0.2603
Epoch 14/100, Loss: 0.2570
Epoch 15/100, Loss: 0.2567
Epoch 16/100, Loss: 0.2542
Epoch 17/100, Loss: 0.2510
Epoch 18/100, Loss: 0.2516
Epoch 19/100, Loss: 0.2486
Epoch 20/100, Loss: 0.2480
Epoch 21/100, Loss: 0.2449
Epoch 22/100, Loss: 0.2433
Epoch 23/100, Loss: 0.2441
Epoch 24/100, Loss: 0.2424
Epoch 25/100, Loss: 0.2421
Epoch 26/100, Loss: 0.2401
Epoch 27/100, Loss: 0.2395
Epoch 28/100, Loss: 0.2393
Epoch 29/100, Loss: 0.2373
Epoch 30/100, Loss: 0.2362
Epoch 31/100, Loss: 0.2359
Epoch 32/100, Loss: 0.2341
Epoch 33/100, Loss: 0.2335
Epoch 34/100, Loss: 0.2344
Epoch 35/100, Loss: 0.2320
Epoch 36/100, Loss: 0.2309
Epoch 37/100, Loss: 0.2304
Epoch 38/1

In [13]:
# Calculate weights based on ROC AUC scores
total_roc_auc = sum(roc_auc_scores)
weights = [score / total_roc_auc for score in roc_auc_scores]
print(f'Weights: {weights}')

# Weighted ensemble predictions
def weighted_ensemble_predictions(models, weights, test_loader):
    all_preds = []
    for model in models:
        model.eval()
        preds = []
        with torch.no_grad():
            for inputs in test_loader:
                outputs = model(inputs)
                preds.extend(torch.sigmoid(outputs).cpu().numpy())
        all_preds.append(preds)
    
    # Weighted average of the predictions
    weighted_preds = np.average(all_preds, axis=0, weights=weights)
    return weighted_preds

# Create test loader
test_tensor = torch.tensor(test_processed.values, dtype=torch.float32)
test_loader = DataLoader(test_tensor, batch_size=64, shuffle=False)

# Get weighted ensemble predictions
final_preds = weighted_ensemble_predictions(trained_models, weights, test_loader)

# Save to submission file
submission = pd.DataFrame({
    "id": test_df['id'], 
    "target": final_preds
})
submission.to_csv("submission.csv", index=False)

Weights: [0.24948505570502652, 0.25126158102053814, 0.24939325207195356, 0.24986011120248172]
