In [3]:
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score

# Import preprocessing functions from the preprocessing notebook.
# NOTE(review): the `preprocessing(...)` callable used when the data is loaded
# is not defined in this notebook, so it presumably comes from this %run — confirm.
%run preprocessing.ipynb

In [4]:
# Read the raw train/test CSVs
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

# Run each raw frame through the preprocessing() helper
train_preprocessed = preprocessing(train_data) # type: ignore
test_preprocessed = preprocessing(test_data) # type: ignore

for caption, frame in (
    ("Preprocessed training data shape:", train_preprocessed),
    ("Preprocessed test data shape:", test_preprocessed),
):
    print(caption, frame.shape)

# Encode the True/False target as 1/0 integers
target_codes = {True: 1, False: 0}
encoded_target = train_preprocessed['Transported'].map(target_codes).astype(int)
train_preprocessed['Transported'] = encoded_target

# Target vector, and the feature matrix without the id and target columns
y = train_preprocessed['Transported']
X = train_preprocessed.drop(columns=['PassengerId', 'Transported'])

# Stratified hold-out split (20% validation) with a fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# Test set: keep the ids aside for the submission, drop them from the features
test_ids = test_preprocessed['PassengerId']
X_test = test_preprocessed.drop(columns=['PassengerId'])

for caption, frame in (
    ("Training features shape:", X_train),
    ("Validation features shape:", X_val),
    ("Test features shape:", X_test),
):
    print(caption, frame.shape)

Preprocessed training data shape: (8693, 17)
Preprocessed test data shape: (4277, 16)
Training features shape: (6954, 15)
Validation features shape: (1739, 15)
Test features shape: (4277, 15)


In [5]:
import warnings

# Install a catch-all "ignore" filter: every warning category, from every
# module, is silenced for the remainder of the kernel session.
warnings.simplefilter("ignore")

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV,SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostClassifier,BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [7]:
# Seed handed to every estimator below that exposes a `random_state`, so the
# comparison table is reproducible after Restart & Run All. The value matches
# the seed already used for the train/validation split.
MODEL_SEED = 42

# Candidate classifiers, keyed by the display name used in the results table.
# Apart from the seed, every estimator keeps its library-default hyperparameters.
models = {
    'Logistic Regression': LogisticRegression(),
    'Logistic Regression CV': LogisticRegressionCV(),
    'SGD': SGDClassifier(random_state=MODEL_SEED),

    'Random Forest': RandomForestClassifier(random_state=MODEL_SEED),
    'Gradient Boosting': GradientBoostingClassifier(random_state=MODEL_SEED),
    'AdaBoost': AdaBoostClassifier(random_state=MODEL_SEED),
    'Bagging': BaggingClassifier(random_state=MODEL_SEED),
    'Decision Tree': DecisionTreeClassifier(random_state=MODEL_SEED),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

# Fit every candidate model and rank them by hold-out accuracy
def evaluate_models(X_train, X_test, y_train, y_test):
    """Fit each estimator in the module-level ``models`` dict and score it.

    Every estimator is fitted in place on ``(X_train, y_train)`` and then
    scored with ``accuracy_score`` on its predictions for ``X_test``.

    Args:
        X_train: Features used for fitting.
        X_test: Features used for scoring.
        y_train: Labels matching ``X_train``.
        y_test: Labels matching ``X_test``.

    Returns:
        A list of ``(model name, accuracy)`` tuples ordered from highest to
        lowest accuracy; ties keep the insertion order of ``models``.
    """
    def _holdout_accuracy(estimator):
        # Fit first, then score the predictions against the held-out labels.
        estimator.fit(X_train, y_train)
        return accuracy_score(y_test, estimator.predict(X_test))

    scored = [
        (label, _holdout_accuracy(estimator))
        for label, estimator in models.items()
    ]

    # Best model first
    return sorted(scored, key=lambda pair: pair[1], reverse=True)


# Score every candidate on the validation split (passed as the "test" arguments).
results = evaluate_models(
    X_train=X_train,
    X_test=X_val,
    y_train=y_train,
    y_test=y_val,
)

# One line per model, best accuracy first.
print("Model Performance:")
for name, acc in results:
    print(f"{name}: {acc:.6f}")

Model Performance:
Random Forest: 0.813686
Gradient Boosting: 0.805635
K-Nearest Neighbors: 0.798735
Bagging: 0.792984
Logistic Regression CV: 0.792409
Logistic Regression: 0.791259
AdaBoost: 0.787234
Support Vector Machine: 0.786659
Decision Tree: 0.745256
SGD: 0.695227


In [8]:
# Fit the model used for the submission on the training split.
# random_state is pinned (same seed as the train/validation split) so that
# re-running the notebook regenerates the same predictions; without it the
# estimator's internal randomness is unseeded.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)

# The target was encoded as 1/0 before training, so cast the predicted labels
# back to booleans for the submission frame.
y_pred = gb.predict(X_test).astype(bool)

In [9]:
# Pair each test passenger id with its predicted label.
submission_columns = {'PassengerId': test_ids, 'Transported': y_pred}
submission = pd.DataFrame(data=submission_columns)

In [10]:
# Write the submission to the working directory without the index column.
submission.to_csv(path_or_buf="submission.csv", index=False)

In [12]:
# Competition identifier passed to the CLI's -c flag, and the CSV to upload.
competition_name = "spaceship-titanic"
submission_path = './submission.csv'
# Upload the file with a description message, then list the submissions for
# this competition. Both Python variables are interpolated into the shell line.
# NOTE(review): presumably requires the Kaggle CLI to be installed and
# authenticated on this machine — confirm before a fresh run.
!kaggle competitions submit -c {competition_name} -f {submission_path} -m "Model Selection Approach"
!kaggle competitions submissions -c {competition_name}

100%|███████████████████████████████████████| 56.4k/56.4k [00:00<00:00, 191kB/s]
Successfully submitted to Spaceship TitanicfileName        date                        description               status                     publicScore  privateScore  
--------------  --------------------------  ------------------------  -------------------------  -----------  ------------  
submission.csv  2025-04-04 19:16:54.653000  Model Selection Approach  SubmissionStatus.PENDING                              
submission.csv  2025-04-04 19:16:44.207000  Model Selection Approach  SubmissionStatus.COMPLETE  0.79915                    
submission.csv  2025-04-04 19:14:45         Submission 1              SubmissionStatus.COMPLETE  0.79915                    
submission.csv  2025-04-04 18:53:32         Submission 1              SubmissionStatus.COMPLETE  0.79752                    
submission.csv  2025-04-03 23:43:30.873000  Submission 1              SubmissionStatus.COMPLETE  0.00000                    
s