## Model #2 for Spaceship Titanic

### Data load and preprocess

In [85]:
import os
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [69]:
# Read the train and test splits from the local `datasets` folder
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (r'datasets/train.csv', r'datasets/test.csv')
)

In [70]:
# Display the raw training frame to eyeball its columns and a sample of rows
df_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


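#### Handle missing values

Missing categorical values get an explicit "unknown" label, the spending columns are filled with 0, and `CryoSleep`, `Age` and `VIP` receive simple numeric defaults.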

In [71]:
# Missing-value imputation for both frames.
#
# Every fill is written as `frame[col] = frame[col].fillna(...)` instead of
# `frame[col].fillna(..., inplace=True)`. The in-place form works on a
# temporary column object: it is deprecated in pandas 2.x and silently does
# nothing under Copy-on-Write, and the warning is hidden because this notebook
# calls `warnings.filterwarnings('ignore')`.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Age is imputed with the rounded mean of the TRAINING set for both frames, so
# the test set goes through exactly the preprocessing the model is fitted on.
mean_age_train = round(df_train['Age'].mean())

for frame in (df_train, df_test):
    # Fill NaN strings with an explicit "unknown" category
    frame['HomePlanet'] = frame['HomePlanet'].fillna('UnknownPlanet')
    frame['Destination'] = frame['Destination'].fillna('UnknownDestin')

    # Fill NaN spending amounts with 0
    frame[spend_cols] = frame[spend_cols].fillna(0)

    # CryoSleep: unknown becomes 0.5 (halfway between False=0 and True=1), stored as float
    frame['CryoSleep'] = frame['CryoSleep'].fillna(0.5).astype(float)

    # Age: fill with the training-set mean computed above
    frame['Age'] = frame['Age'].fillna(mean_age_train)

    # VIP: fill with False (by far the most common value) and convert to int
    frame['VIP'] = frame['VIP'].fillna(False).astype(bool).astype(int)

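#### Name and Cabin columns

Extract each passenger's surname, use it to back-fill missing `Cabin` values, then split `Cabin` into its deck, number and side components.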

In [72]:
def _surname(full_name):
    """Return the last whitespace-separated token of a name; missing names pass through unchanged."""
    if pd.notna(full_name):
        return str(full_name).split()[-1]
    return full_name


# Surname column derived from the full name
df_train['Surname'] = df_train['Name'].map(_surname)
df_test['Surname'] = df_test['Name'].map(_surname)

# For every surname, take the first known cabin within the same frame and use
# it to fill that surname's missing cabins.
df_surname_cabin_train = df_train[df_train['Cabin'].notna()].groupby('Surname')['Cabin'].first()
cabin_guess_train = df_train['Surname'].map(df_surname_cabin_train)
df_train['Cabin'] = df_train['Cabin'].fillna(cabin_guess_train)

df_surname_cabin_test = df_test[df_test['Cabin'].notna()].groupby('Surname')['Cabin'].first()
cabin_guess_test = df_test['Surname'].map(df_surname_cabin_test)
df_test['Cabin'] = df_test['Cabin'].fillna(cabin_guess_test)

In [73]:
# Split `Cabin` ("deck/num/side") into three features and drop the raw columns.
#
# Both frames go through the same loop so they cannot drift apart. The original
# cell filled the test frame's CabinNum from the TRAINING frame, which
# overwrote the test cabin numbers with index-aligned training values.
for frame in (df_train, df_test):
    # Split Cabin into its three sections
    frame[['CabinDeck', 'CabinNum', 'CabinSide']] = frame['Cabin'].str.split('/', expand=True)

    # Unknown deck becomes the explicit category 'U'. Plain assignment is used
    # because `frame[col].fillna(..., inplace=True)` acts on a temporary object.
    frame['CabinDeck'] = frame['CabinDeck'].fillna('U')

    # Unknown cabin number becomes -1, stored as int
    frame['CabinNum'] = frame['CabinNum'].fillna(-1).astype(int)

    # Side: P -> 1, S -> 0, unknown -> 0.5, stored as float
    frame['CabinSide'] = frame['CabinSide'].map({'P': 1, 'S': 0}).fillna(0.5).astype(float)

    # The raw text columns are no longer needed
    frame.drop(columns=['Cabin', 'Name', 'Surname'], inplace=True)

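#### Encode categorical columns

One-hot encode `HomePlanet`, `Destination` and `CabinDeck`, fitting the encoder on the training set and reusing it for the test set.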

In [74]:
# Columns that hold string categories
cat_cols = ['HomePlanet', 'Destination', 'CabinDeck']

# drop='first' removes one level per feature to avoid multicollinearity;
# sparse_output=False returns a dense array.
oh_encoder = OneHotEncoder(drop='first', sparse_output=False)

# Learn the categories from the training frame, then encode both frames with them
oh_encoder.fit(df_train[cat_cols])
encoded_train = oh_encoder.transform(df_train[cat_cols])
encoded_test = oh_encoder.transform(df_test[cat_cols])

# Wrap the encoded arrays in DataFrames carrying the generated column names
encoded_names = oh_encoder.get_feature_names_out(cat_cols)
df_train_encoded = pd.DataFrame(encoded_train, columns=encoded_names)
df_test_encoded = pd.DataFrame(encoded_test, columns=encoded_names)

# Replace the original categorical columns with their encoded counterparts
train_without_cats = df_train.drop(columns=cat_cols)
test_without_cats = df_test.drop(columns=cat_cols)
df_train_final = pd.concat([train_without_cats, df_train_encoded], axis=1)
df_test_final = pd.concat([test_without_cats, df_test_encoded], axis=1)

In [75]:
# Report every column of the final training frame whose dtype is not numeric
non_numeric_cols = [
    name
    for name in df_train_final.columns
    if not pd.api.types.is_numeric_dtype(df_train_final[name])
]
for name in non_numeric_cols:
    print(f'{name} is not numeric')

PassengerId is not numeric


### Prepare data for training and validation

In [76]:
# Continuous columns to min-max scale; the one-hot encoded columns are left as they are
to_scale = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
scaler = MinMaxScaler()

# Fit the scaler on the training data only, then apply that SAME mapping to the
# test data. Calling fit_transform on the test frame would refit min/max on the
# test set, so identical raw values would be scaled differently in train and test.
# Test values outside the training range may therefore fall outside [0, 1].
df_train_final[to_scale] = scaler.fit_transform(df_train_final[to_scale])
df_test_final[to_scale] = scaler.transform(df_test_final[to_scale])

# Features are everything except the identifier and the target
X = df_train_final.drop(columns=['PassengerId', 'Transported'])
y = df_train_final['Transported']

# 70 % training / 30 % validation, with a fixed seed for reproducibility
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.7, random_state=10)

### Build the model

In [77]:
# Histogram-based gradient boosting: build and fit in one step (`fit` returns the estimator)
hgbc = HistGradientBoostingClassifier(max_iter=100, random_state=10).fit(X_train, y_train)

# Score on the held-out validation split
y_pred = hgbc.predict(X_valid)
hgbc_accuracy = accuracy_score(y_valid, y_pred)
print(f'Accuracy: {hgbc_accuracy*100:.2f}%')

Accuracy: 80.25%


In [78]:
# Extra-Trees model for comparison with the gradient-boosting model.
# `ExtraTreesClassifier` is imported in the notebook's top import cell, so that
# all dependencies are declared in one place.
etc = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=5, random_state=10)
etc.fit(X_train, y_train)

# Score on the held-out validation split
y_etc_pred = etc.predict(X_valid)

print(f'Accuracy: {accuracy_score(y_valid, y_etc_pred)*100:.2f}%')

Accuracy: 79.18%


In [79]:
# AdaBoost model for comparison with the gradient-boosting model.
# `AdaBoostClassifier` is imported in the notebook's top import cell, so that
# all dependencies are declared in one place.
abc = AdaBoostClassifier(n_estimators=100, algorithm='SAMME', random_state=10)
abc.fit(X_train, y_train)

# Score on the held-out validation split
y_abc_pred = abc.predict(X_valid)

print(f'Accuracy: {accuracy_score(y_valid, y_abc_pred)*100:.2f}%')

Accuracy: 77.95%


### Predictions and submission

In [81]:
# Every column except the passenger identifier is a model feature
test_feature_cols = [name for name in df_test_final.columns if name != 'PassengerId']
X_test = df_test_final[test_feature_cols]

# Predict the test set with the gradient-boosting model
y_sub = hgbc.predict(X_test)

In [86]:
# Two-column submission frame: passenger id plus the predicted label
new_sub = df_test_final[['PassengerId']].assign(Transported=y_sub)

# Write the file only once; an existing submission is never overwritten
submission_path = 'space_titanic_sub.csv'
if os.path.exists(submission_path):
    print('Submission already created')
else:
    new_sub.to_csv(submission_path, index=False)