In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.optim.swa_utils import AveragedModel, SWALR
from torch.optim.lr_scheduler import CyclicLR
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import RobustScaler, PolynomialFeatures, OneHotEncoder

# Device configuration: use the GPU when PyTorch reports one, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the input directory,
# then print them one per line in os.walk order.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the data
train_df = pd.read_csv('/kaggle/input/playground-series-s4e11/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s4e11/test.csv')

# Print the schema first and finish the cell on the preview: Jupyter only renders
# the value of a cell's last expression, so a bare `head()` placed earlier in the
# cell is evaluated and silently thrown away.
train_df.info()
train_df.head()

Using device: cuda
/kaggle/input/playground-series-s4e11/sample_submission.csv
/kaggle/input/playground-series-s4e11/train.csv
/kaggle/input/playground-series-s4e11/test.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140700 entries, 0 to 140699
Data columns (total 20 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   id                                     140700 non-null  int64  
 1   Name                                   140700 non-null  object 
 2   Gender                                 140700 non-null  object 
 3   Age                                    140700 non-null  float64
 4   City                                   140700 non-null  object 
 5   Working Professional or Student        140700 non-null  object 
 6   Profession                             104070 non-null  object 
 7   Academic Pressure                      27897 non-null   float64
 8   Work Pressure       

In [2]:
# Preprocess the data
# Numeric predictors: every float64/int64 column except the target and the row id.
numeric_features = train_df.select_dtypes(include=[np.float64, np.int64]).columns.tolist()
numeric_features.remove('Depression')
numeric_features.remove('id')

# Categorical predictors: every object-dtype column except `Name`.
categorical_features = train_df.select_dtypes(include=[object]).columns.tolist()
categorical_features.remove("Name")

# Fill missing values
# The fill values are learned from the training split only and reused for the test
# split, exactly like the scaler and polynomial transformer below. Imputing the test
# set with its own means would make the two splits go through different transforms.
train_means = train_df[numeric_features].mean()
train_df[numeric_features] = train_df[numeric_features].fillna(train_means)
test_df[numeric_features] = test_df[numeric_features].fillna(train_means)

# Scale numeric features (fitted on train, applied unchanged to test)
scaler = RobustScaler()
train_df[numeric_features] = scaler.fit_transform(train_df[numeric_features])
test_df[numeric_features] = scaler.transform(test_df[numeric_features])

# Polynomial features
# Degree 2 with interaction_only=True keeps the original columns and adds pairwise
# products, without squared terms; include_bias=False drops the constant column.
poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
train_poly = poly.fit_transform(train_df[numeric_features])
test_poly = poly.transform(test_df[numeric_features])
poly_features = poly.get_feature_names_out(numeric_features)

train_poly_df = pd.DataFrame(train_poly, columns=poly_features)
test_poly_df = pd.DataFrame(test_poly, columns=poly_features)

In [3]:
# One-Hot Encoding for Categorical Features
# The category vocabulary is learned from the training frame only; with
# handle_unknown='ignore', categories not seen during fit do not raise at transform time.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(train_df[categorical_features])
encoded_features = encoder.get_feature_names_out(categorical_features)

train_encoded = encoder.transform(train_df[categorical_features])
test_encoded = encoder.transform(test_df[categorical_features])

train_encoded_df = pd.DataFrame(data=train_encoded, columns=encoded_features)
test_encoded_df = pd.DataFrame(data=test_encoded, columns=encoded_features)

# Combine encoded features with polynomial features (column-wise, aligned on the row index)
train_processed = pd.concat((train_poly_df, train_encoded_df), axis='columns')
test_processed = pd.concat((test_poly_df, test_encoded_df), axis='columns')
test_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93800 entries, 0 to 93799
Columns: 383 entries, Age to Family History of Mental Illness_Yes
dtypes: float64(383)
memory usage: 274.1 MB


In [4]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Resample the data to handle class imbalance
# NOTE(review): the oversampling runs on the full training frame before any
# validation split is made, so a later split of these rows can put synthetic
# samples into the validation set. Confirm this is intended.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X=train_processed, y=train_df['Depression'])

# Standardize the data: statistics come from the resampled training rows and are
# then applied unchanged to the test features.
scaler = StandardScaler().fit(X_resampled)
X_scaled = scaler.transform(X_resampled)
test_X_scaled = scaler.transform(test_processed)

In [5]:
# Convert data to tensors
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
# Convert the labels through NumPy first. Handing a pandas Series straight to
# torch.tensor goes through element-by-element sequence access, which is slow and
# breaks if the Series index is not exactly 0..n-1. np.asarray accepts both a
# Series and a plain array.
y_tensor = torch.tensor(np.asarray(y_resampled), dtype=torch.float32)
test_X_tensor = torch.tensor(test_X_scaled, dtype=torch.float32)

In [6]:
from torch.utils.data import random_split
# Create TensorDataset pairing each feature row with its label
dataset = TensorDataset(X_tensor, y_tensor)

# Split into training and validation sets (80% / 20%)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
# Seed the split explicitly: without a generator the rows that land in the
# validation set change on every run, so reported metrics are not comparable.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# Create DataLoaders
batch_size = 64
# A dedicated seeded generator makes the shuffling order reproducible as well.
shuffle_generator = torch.Generator().manual_seed(42)
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, generator=shuffle_generator
)
# Validation order does not affect the metrics, so no shuffling is needed.
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [7]:
# Define a simple neural network model
class SimpleNet(nn.Module):
    """Three-layer fully connected binary classifier that outputs raw logits.

    Architecture: input_dim -> 128 -> 64 -> 1, with ReLU and Dropout(0.3)
    after each of the two hidden layers. No sigmoid is applied here.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super(SimpleNet, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        """Return one logit per sample.

        Args:
            x: Float tensor whose last dimension has size ``input_dim``.

        Returns:
            Tensor of logits with the trailing size-1 dimension removed,
            e.g. shape ``(batch,)`` for a ``(batch, input_dim)`` input.
        """
        # Squeeze only the trailing feature dimension. A bare squeeze() would also
        # drop the batch dimension when a batch holds a single sample, producing a
        # 0-d tensor that no longer matches the (1,)-shaped targets.
        return self.net(x).squeeze(-1)

# Instantiate the model
input_dim = X_tensor.shape[1]
model = SimpleNet(input_dim=input_dim).to(device)

# Define loss function and optimizer
# BCEWithLogitsLoss applies the sigmoid itself, so the model must emit raw logits.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training settings
num_epochs = 50
patience = 5  # Early stopping patience (epochs without validation-loss improvement)

# Early stopping variables
best_loss = float('inf')
epochs_no_improve = 0
best_model_state = None

# Training loop
for epoch in range(num_epochs):
    # Training pass: dropout active, one optimizer step per mini-batch
    model.train()
    train_losses = []
    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    # Validation pass: dropout disabled, no gradient tracking
    model.eval()
    val_losses = []
    val_targets = []
    val_outputs = []
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_losses.append(loss.item())
            val_targets.extend(targets.cpu().numpy())
            # Convert logits to probabilities for the accuracy computation
            val_outputs.extend(torch.sigmoid(outputs).cpu().numpy())

    val_loss = np.mean(val_losses)
    val_preds = (np.array(val_outputs) >= 0.5).astype(int)
    val_accuracy = accuracy_score(val_targets, val_preds)

    print(f'Epoch {epoch+1}/{num_epochs}, '
          f'Train Loss: {np.mean(train_losses):.4f}, '
          f'Val Loss: {val_loss:.4f}, '
          f'Val Accuracy: {val_accuracy:.4f}')

    # Early stopping
    if val_loss < best_loss:
        best_loss = val_loss
        # state_dict() returns references to the live parameter tensors, which
        # later optimizer steps overwrite in place. Clone them so the snapshot
        # holds the best epoch's weights, not whatever the last epoch left behind.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print('Early stopping triggered')
            break

# Load the best model weights
if best_model_state is not None:
    model.load_state_dict(best_model_state)

Epoch 1/50, Train Loss: 0.1940, Val Loss: 0.1801, Val Accuracy: 0.9310
Epoch 2/50, Train Loss: 0.1746, Val Loss: 0.1695, Val Accuracy: 0.9350
Epoch 3/50, Train Loss: 0.1647, Val Loss: 0.1641, Val Accuracy: 0.9373
Epoch 4/50, Train Loss: 0.1571, Val Loss: 0.1547, Val Accuracy: 0.9397
Epoch 5/50, Train Loss: 0.1510, Val Loss: 0.1528, Val Accuracy: 0.9419
Epoch 6/50, Train Loss: 0.1471, Val Loss: 0.1507, Val Accuracy: 0.9412
Epoch 7/50, Train Loss: 0.1438, Val Loss: 0.1489, Val Accuracy: 0.9424
Epoch 8/50, Train Loss: 0.1405, Val Loss: 0.1463, Val Accuracy: 0.9443
Epoch 9/50, Train Loss: 0.1386, Val Loss: 0.1445, Val Accuracy: 0.9455
Epoch 10/50, Train Loss: 0.1372, Val Loss: 0.1447, Val Accuracy: 0.9466
Epoch 11/50, Train Loss: 0.1342, Val Loss: 0.1424, Val Accuracy: 0.9472
Epoch 12/50, Train Loss: 0.1314, Val Loss: 0.1422, Val Accuracy: 0.9470
Epoch 13/50, Train Loss: 0.1306, Val Loss: 0.1370, Val Accuracy: 0.9476
Epoch 14/50, Train Loss: 0.1299, Val Loss: 0.1368, Val Accuracy: 0.9482
E

In [8]:
# Prepare test data loader (features only, original row order preserved)
test_dataset = TensorDataset(test_X_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Generate predictions
model.eval()
test_outputs = []
with torch.no_grad():
    for (inputs,) in test_loader:
        inputs = inputs.to(device)
        outputs = model(inputs)
        # Flatten to 1-D before leaving torch so a final batch holding a single
        # row still yields an iterable array rather than a 0-d scalar.
        outputs = torch.sigmoid(outputs).reshape(-1).cpu().numpy()
        test_outputs.extend(outputs)

# Prepare submission
# Threshold the probabilities at 0.5 to get hard 0/1 labels.
test_preds = (np.array(test_outputs) >= 0.5).astype(int)
# The prediction column carries the same name as the training label column,
# 'Depression', rather than a generic 'target'.
submission = pd.DataFrame({
    'id': test_df['id'],
    'Depression': test_preds
})

# Save submission to CSV
submission.to_csv('submission.csv', index=False)
print('Submission file created.')

Submission file created.
