In [73]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import torch.nn.functional as F
import torch.utils.data
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import sklearn
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import os



In [74]:
# Load Data
# The folder holding train.csv / test.csv defaults to the original local path.
# Set the SPACE_TITANIC_DIR environment variable to read the files from another
# location without editing the notebook.
DATA_DIR = os.environ.get("SPACE_TITANIC_DIR", r"C:\Users\charl\Space_titanic")
train_data = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_data = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))


# Feature Engineering
def process_data(df):
    """Return a feature-engineered copy of a passenger DataFrame.

    Steps, in order:
      1. Split ``Cabin`` ("deck/num/side", e.g. ``B/0/P``) into ``CabinDeck``,
         ``CabinNum`` (numeric; unparsable values become NaN) and ``CabinSide``.
      2. Drop ``Name``, ``Cabin`` and ``PassengerId`` when they are present.
      3. Add ``TotalSpend`` as the row-wise sum of the five spending columns
         (missing amounts are skipped by the sum).
      4. Fill missing spending values with 0 and apply ``log1p`` to every
         spending column and to ``TotalSpend``.

    Args:
        df: DataFrame containing ``Cabin`` and the spending columns
            ``RoomService``, ``FoodCourt``, ``ShoppingMall``, ``Spa``, ``VRDeck``.

    Returns:
        A new DataFrame with the engineered columns. The caller's ``df`` is
        not modified.
    """
    # Work on a copy so the frame passed in by the caller is never mutated
    # (re-running a cell on the same input then gives the same result).
    df = df.copy()

    # Split Cabin info (B/0/P -> B, 0, P). ``n=2`` caps the split at three parts
    # and ``reindex`` guarantees exactly three columns even when fewer are
    # produced (e.g. every Cabin value is missing).
    cabin_parts = df['Cabin'].str.split('/', n=2, expand=True).reindex(columns=range(3))
    df['CabinDeck'] = cabin_parts[0]
    # Convert CabinNum to float; anything non-numeric becomes NaN
    df['CabinNum'] = pd.to_numeric(cabin_parts[1], errors='coerce')
    df['CabinSide'] = cabin_parts[2]

    # Drop columns that cause overfitting/are unused
    df = df.drop(columns=['Name', 'Cabin', 'PassengerId'], errors='ignore')

    # Total spend is computed on the raw amounts, before the log transform
    spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    df['TotalSpend'] = df[spend_cols].sum(axis=1)

    # Apply Log1p to handle skewness; missing spend is treated as 0
    for col in spend_cols + ['TotalSpend']:
        df[col] = np.log1p(df[col].fillna(0))

    return df

# Apply Feature Engineering
print("Processing data")

# Save the ids (needed for the submission file) and the labels (needed for the
# train/validation split) before those columns are removed below.
test_passenger_ids = test_data['PassengerId'].copy()
y_train_full = train_data['Transported'].copy()

# Then process
train_data = process_data(train_data.drop(columns=['Transported']))
test_data = process_data(test_data)

# Labels and features must stay row-aligned for the later split
assert len(y_train_full) == len(train_data), "labels and features are misaligned"

# Config
cat_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'CabinDeck', 'CabinSide']
num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'CabinNum', 'TotalSpend']

# Imputation and Scaling
print("Imputing and Scaling...")

# Impute Numbers (Median) -- statistics are learned on train only, then reused on test
imputer_num = SimpleImputer(strategy='median')
train_data[num_cols] = imputer_num.fit_transform(train_data[num_cols])
test_data[num_cols] = imputer_num.transform(test_data[num_cols])

# Scale Numbers (StandardScaler) -- fitted on train only, applied to both
scaler = StandardScaler()
train_data[num_cols] = scaler.fit_transform(train_data[num_cols])
test_data[num_cols] = scaler.transform(test_data[num_cols])

# Impute Categories (Mode) -- most frequent train value fills the gaps
imputer_cat = SimpleImputer(strategy='most_frequent')
train_data[cat_cols] = imputer_cat.fit_transform(train_data[cat_cols])
test_data[cat_cols] = imputer_cat.transform(test_data[cat_cols])

# Encoding
print("Encoding")
# One (num_categories, embedding_dim) tuple per entry of cat_cols, in the same order
embedding_sizes = []

for col in cat_cols:
    le = LabelEncoder()
    # Fit on train + test together so a category that only appears in the
    # test set still maps to a valid integer index
    all_values = pd.concat([train_data[col], test_data[col]], axis=0)
    le.fit(all_values)

    train_data[col] = le.transform(train_data[col])
    test_data[col] = le.transform(test_data[col])

    # Embedding width: half the number of categories (rounded up), capped at 50
    input_size = len(le.classes_)
    output_dim = min(50, (input_size + 1) // 2)
    embedding_sizes.append((input_size, output_dim))
print("Preprocessing done")


Processing data
Imputing and Scaling...
Encoding
Preprocessing done


In [75]:
#Make torch dataset class

class SpaceTitanicDataset(Dataset):
    """Tabular dataset that serves categorical and continuous features separately.

    Each item is ``(x_cat, x_cont, y)`` when labels were given, otherwise
    ``(x_cat, x_cont)``:
      * ``x_cat``  -- int64 array of category indices (for embedding lookup);
        an empty int64 array when there are no categorical columns.
      * ``x_cont`` -- float32 array holding every remaining column of ``X``.
      * ``y``      -- float32 array of shape ``(1,)``.
    """

    def __init__(self, X, y=None, embedded_col_names=None):
        """
        X: The DataFrame containing input features
        y: Labels (Transported or not) as a Series or any array-like, one per
           row of X. None for test set.
        embedded_col_names: List of column names that are categorical

        Raises:
            ValueError: if ``y`` is given and its length differs from ``X``.
        """
        # 1. Setup the Categorical Data
        if embedded_col_names:
            cat_names = list(embedded_col_names)
            self.X_cat = X[cat_names].values.astype('int64')  # Ints for lookup
            # Drop the categorical columns so they are not duplicated in X_cont
            self.X_cont = X.drop(columns=cat_names).values.astype('float32')
        else:
            self.X_cat = None
            self.X_cont = X.values.astype('float32')

        # 2. Setup the Label
        if y is not None:
            # Column vector (N, 1) so each item's label has shape (1,)
            self.y = np.asarray(y, dtype='float32').reshape(-1, 1)
            if len(self.y) != len(self.X_cont):
                raise ValueError(
                    f"X has {len(self.X_cont)} rows but y has {len(self.y)} labels"
                )
        else:
            self.y = None

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.X_cont)

    def __getitem__(self, idx):
        """Return the features (and label, if available) for row ``idx``."""
        x_cont = self.X_cont[idx]

        if self.X_cat is not None:
            x_cat = self.X_cat[idx]
        else:
            # An empty array (not a Python list) so the DataLoader still collates
            # it into a tensor that supports .to(device)
            x_cat = np.empty(0, dtype='int64')

        if self.y is None:
            return x_cat, x_cont
        return x_cat, x_cont, self.y[idx]

In [76]:
#Create the Model architecture
class SpaceTitanicModel(nn.Module):
    """Feed-forward binary classifier over embedded categorical + continuous features.

    Each categorical column gets its own embedding table. The embedding vectors
    are concatenated with the continuous features and passed through three
    Linear -> BatchNorm -> ReLU -> Dropout blocks (256, 128, 64 units) and a
    final linear layer that emits one raw logit per row (no sigmoid applied).
    """

    def __init__(self, embedding_sizes, n_cont_features, dropout_p=0.3):
        """
        Args:
            embedding_sizes (list of tuples): List of (num_categories, embedding_dim);
                may be empty when there are no categorical features.
            n_cont_features (int): Number of continuous features (Age, Spend, etc.)
            dropout_p (float): Default is 30%
        """
        super().__init__()

        # Embedding layers, one per categorical column
        self.embeddings = nn.ModuleList([
            nn.Embedding(categories, size) for categories, size in embedding_sizes
        ])

        # Length of embedding features
        self.n_emb = sum(e.embedding_dim for e in self.embeddings)
        self.n_cont = n_cont_features

        # Total Input Size = Embeddings + Continuous features
        self.in_features = self.n_emb + self.n_cont

        # Hidden layers (attribute names and Sequential layout are kept so that
        # previously saved state_dicts still load)
        self.layer1 = self._dense_block(self.in_features, 256, dropout_p)
        self.layer2 = self._dense_block(256, 128, dropout_p)
        self.layer3 = self._dense_block(128, 64, dropout_p)

        # Output Layer
        self.out = nn.Linear(64, 1)

    @staticmethod
    def _dense_block(in_features, out_features, dropout_p):
        """Build one Linear -> BatchNorm1d -> ReLU -> Dropout block."""
        return nn.Sequential(
            nn.Linear(in_features, out_features),
            nn.BatchNorm1d(out_features),
            nn.ReLU(),
            nn.Dropout(dropout_p)
        )

    def forward(self, x_cat, x_cont):
        """Compute logits.

        Args:
            x_cat: integer tensor of category indices; column ``i`` is looked up
                in the ``i``-th embedding table. Ignored when the model has no
                embeddings.
            x_cont: float tensor of continuous features, ``n_cont`` columns.

        Returns:
            Tensor with one logit per row (second dimension of size 1).
        """
        # Lookup the vector for the i-th categorical column
        features = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]

        # Concatenate embeddings and continuous features horizontally in one go:
        # (Batch, Total_Emb_Dim + n_cont). Building a single list also covers the
        # no-embedding case, where torch.cat on an empty list would raise.
        features.append(x_cont)
        x = torch.cat(features, 1)

        # Feed forward
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        # Final output (raw logit)
        return self.out(x)

In [77]:
#Calculate Continuous Features size
n_cont_features = len(num_cols)

# SpaceTitanicDataset feeds every column that is NOT in cat_cols to the model as
# a continuous feature, so the frame must contain exactly the num_cols columns
# besides the categorical ones. Fail here with a clear message rather than with
# a shape error inside the first Linear layer.
_continuous_in_frame = [c for c in train_data.columns if c not in cat_cols]
assert sorted(_continuous_in_frame) == sorted(num_cols), (
    f"continuous columns in train_data {_continuous_in_frame} do not match num_cols {num_cols}"
)

In [81]:
# Split into train/validation
from sklearn.model_selection import train_test_split

# Seed torch so weight initialisation, dropout and DataLoader shuffling are
# repeatable; the same seed is used for the train/validation split.
SEED = 42
torch.manual_seed(SEED)

# Select the compute device here so this cell works on a fresh kernel
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# y_train_full must hold the Transported labels, row-aligned with train_data
# (they have to be captured before the column is dropped during preprocessing).
# Stratify keeps the class balance equal in both splits.
X_train, X_val, y_train, y_val = train_test_split(
    train_data, y_train_full, test_size=0.2, random_state=SEED, stratify=y_train_full
)

# Create datasets
train_ds = SpaceTitanicDataset(X_train, y_train, embedded_col_names=cat_cols)
val_ds = SpaceTitanicDataset(X_val, y_val, embedded_col_names=cat_cols)

# Only the training batches are shuffled; validation order is irrelevant
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=64, shuffle=False)

# Initialize model
model = SpaceTitanicModel(embedding_sizes, n_cont_features, dropout_p=0.4)
model.to(device)

# The model outputs raw logits, so the sigmoid is folded into the loss
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
# Halve the learning rate when the monitored metric stops improving for 5 epochs
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)

In [82]:
# Training Loop
num_epochs = 100
# Start below any reachable accuracy so the first epoch always writes a
# checkpoint; otherwise a stale file from an earlier run could be loaded below.
best_val_acc = float('-inf')
patience_counter = 0
early_stop_patience = 15
checkpoint_path = 'best_model.pt'

for epoch in range(num_epochs):
    # ---- Training pass (dropout active, BatchNorm uses batch statistics) ----
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for x_cat, x_cont, y in train_dl:
        x_cat, x_cont, y = x_cat.to(device), x_cont.to(device), y.to(device)

        optimizer.zero_grad()
        outputs = model(x_cat, x_cont)
        loss = loss_func(outputs, y)
        loss.backward()
        optimizer.step()

        # loss is the batch mean; weight it by the batch size so the epoch
        # figure is a true per-sample mean even when the last batch is short
        running_loss += loss.item() * y.size(0)
        predicted = (torch.sigmoid(outputs) > 0.5).float()
        correct += (predicted == y).sum().item()
        total += y.size(0)

    train_loss = running_loss / total
    train_acc = correct / total

    # ---- Validation pass (no gradients, dropout off) ----
    model.eval()
    val_correct = 0
    val_total = 0
    val_loss = 0.0

    with torch.no_grad():
        for x_cat, x_cont, y in val_dl:
            x_cat, x_cont, y = x_cat.to(device), x_cont.to(device), y.to(device)
            outputs = model(x_cat, x_cont)
            val_loss += loss_func(outputs, y).item() * y.size(0)
            predicted = (torch.sigmoid(outputs) > 0.5).float()
            val_correct += (predicted == y).sum().item()
            val_total += y.size(0)

    val_loss /= val_total
    val_acc = val_correct / val_total

    # Learning rate scheduling (driven by validation loss)
    scheduler.step(val_loss)

    # Early stopping (driven by validation accuracy)
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        patience_counter = 0
        torch.save(model.state_dict(), checkpoint_path)  # Save best
    else:
        patience_counter += 1

    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1:3d} | Train Loss: {train_loss:.4f} Acc: {train_acc*100:.2f}% | Val Loss: {val_loss:.4f} Acc: {val_acc*100:.2f}%")

    if patience_counter >= early_stop_patience:
        print(f"Early stopping at epoch {epoch+1}")
        break

print(f"\nBest Validation Accuracy: {best_val_acc*100:.2f}%")
# Load best model for inference; map_location keeps the weights on the active device
model.load_state_dict(torch.load(checkpoint_path, map_location=device))

Epoch   5 | Train Loss: 0.4566 Acc: 78.43% | Val Loss: 0.4166 Acc: 79.59%
Epoch  10 | Train Loss: 0.4423 Acc: 79.22% | Val Loss: 0.4099 Acc: 79.93%
Epoch  15 | Train Loss: 0.4293 Acc: 79.38% | Val Loss: 0.4032 Acc: 80.68%
Epoch  20 | Train Loss: 0.4238 Acc: 80.16% | Val Loss: 0.4002 Acc: 80.22%
Epoch  25 | Train Loss: 0.4176 Acc: 80.43% | Val Loss: 0.3972 Acc: 80.22%
Epoch  30 | Train Loss: 0.4104 Acc: 80.77% | Val Loss: 0.3930 Acc: 80.79%
Epoch  35 | Train Loss: 0.4079 Acc: 80.53% | Val Loss: 0.3955 Acc: 79.64%
Early stopping at epoch 38

Best Validation Accuracy: 81.08%


<All keys matched successfully>

In [83]:
# Inference
print("Testing...")

# Wrap the preprocessed test frame; without labels each item is (x_cat, x_cont)
test_ds = SpaceTitanicDataset(test_data, y=None, embedded_col_names=cat_cols)
# No shuffling: predictions must stay in the same row order as the saved ids
test_dl = DataLoader(test_ds, batch_size=64, shuffle=False)

model.eval()
all_predictions = []

with torch.no_grad():
    for cat_batch, cont_batch in test_dl:
        logits = model(cat_batch.to(device), cont_batch.to(device))

        # Probability above 0.5 -> predicted as transported (boolean per row)
        is_transported = torch.sigmoid(logits) > 0.5
        all_predictions.extend(is_transported.cpu().numpy().flatten())

# Pair every saved passenger id with its prediction
submission_df = pd.DataFrame({
    'PassengerId': test_passenger_ids,
    'Transported': all_predictions
})

# Write the file into the current working directory
notebook_dir = os.getcwd()
submission_path = os.path.join(notebook_dir, 'submission.csv')
submission_df.to_csv(submission_path, index=False)

print(f"Saved to: {submission_path}")

Testing...
Saved to: C:\Users\charl\Titanic_notebook\submission.csv
