## Predictions and submission

In [55]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

import joblib
import category_encoders as ce
from sklearn.preprocessing import MinMaxScaler

### Preprocess test data

In [56]:
# Read the test split from disk; the bare expression on the last line renders it.
test_csv_path = r'datasets/test.csv'
df_test = pd.read_csv(test_csv_path)
df_test

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [57]:
# Data-quality report: count fully duplicated rows and missing values per column
n_duplicated = df_test.duplicated().sum()
missing_per_column = df_test.isna().sum()

print(
    f'Number of duplicated rows: {n_duplicated}',
    '',
    'Missing values:',
    missing_per_column,
    sep='\n',
)

Number of duplicated rows: 0

Missing values:
PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64


In [58]:
# Numerical preprocessing, using the project helpers from num_utils
from num_utils import handle_num_cols, fillna_bills, fillna_age

# Same three steps as before, expressed as one chain:
# frame-level helper -> row-wise helper (axis=1) -> frame-level helper
df_test = (
    df_test
    .pipe(handle_num_cols)
    .apply(fillna_bills, axis=1)
    .pipe(fillna_age)
)

In [59]:
# Categorical preprocessing, using the project helpers from cat_utils
# NOTE(review): apply_encoding is imported but not called in the cells shown here.
from cat_utils import home_destin, vip, cryo_sleep, name, cabin, apply_encoding

# Each helper takes the frame and returns the updated frame; run them in order.
for cat_step in (home_destin, vip, cryo_sleep, name, cabin):
    df_test = cat_step(df_test)

In [60]:
# Drop the raw columns that are no longer needed after the categorical step
raw_columns = ['HomePlanet', 'Destination', 'Name', 'Cabin']
df_test.drop(columns=raw_columns, inplace=True)

In [61]:
# Display the frame after the numerical/categorical helpers and the column drops
df_test

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Billed,Route,Surname,Cabin_deck,Cabin_num,Cabin_side
0,0013_01,1,27.000000,0,0.0,0.0,0.0,0.0,0.0,0.0,Earth_TRAPPIST-1e,Carsoning,G,3,S
1,0018_01,0,19.000000,0,0.0,9.0,0.0,2823.0,0.0,2832.0,Earth_TRAPPIST-1e,Peckers,F,4,S
2,0019_01,1,31.000000,0,0.0,0.0,0.0,0.0,0.0,0.0,Europa_55 Cancri e,Unhearfus,C,0,S
3,0021_01,0,38.000000,0,0.0,6652.0,0.0,181.0,585.0,7418.0,Europa_TRAPPIST-1e,Caltilter,C,1,S
4,0023_01,0,20.000000,0,10.0,0.0,635.0,0.0,0.0,645.0,Earth_TRAPPIST-1e,Harperez,F,5,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,1,34.000000,0,0.0,0.0,0.0,0.0,0.0,0.0,Earth_TRAPPIST-1e,Peter,G,1496,S
4273,9269_01,0,42.000000,0,0.0,847.0,17.0,10.0,144.0,1018.0,Earth_TRAPPIST-1e,Scheron,G,141,S
4274,9271_01,1,28.658146,0,0.0,0.0,0.0,0.0,0.0,0.0,Mars_55 Cancri e,Pore,D,296,P
4275,9273_01,0,28.658146,0,0.0,2680.0,0.0,0.0,523.0,3203.0,Europa_TRAPPIST-1e,Conale,D,297,P


In [62]:
# Convert the binary flag columns to int
df_test['VIP'] = df_test['VIP'].astype(int)
df_test['CryoSleep'] = df_test['CryoSleep'].astype(int)

# Binary encode Cabin_side (P -> 1, S -> 0); any other value maps to NaN and is
# then filled with the most frequent encoded side.
# The result is assigned back to the column instead of calling
# `df_test[col].fillna(..., inplace=True)`: that chained in-place form operates on
# a temporary selection, raises a FutureWarning in pandas 2.x and does not update
# df_test once Copy-on-Write is enabled.
# NOTE(review): the fill value is the mode of the *test* data; confirm this matches
# what was done for the training data.
df_test['Cabin_side'] = df_test['Cabin_side'].map({'P': 1, 'S': 0})
common_side = df_test['Cabin_side'].mode()[0]
df_test['Cabin_side'] = df_test['Cabin_side'].fillna(common_side)

# Special treatment for Cabin_num: non-numeric entries (e.g. 'Unknown') are coerced
# to NaN, replaced by the sentinel value -1, and the column is cast to int.
df_test['Cabin_num'] = (
    pd.to_numeric(df_test['Cabin_num'], errors='coerce')
    .fillna(-1)
    .astype(int)
)

# Load encoders from previous training.
# joblib files are pickles: only load artifacts produced by this project.
route_encoder = joblib.load('leave_route.joblib')
surname_encoder = joblib.load('leave_surname.joblib')
deck_encoder = joblib.load('leave_deck.joblib')

# Encode categoricals based on Leave-One-Out (using the already-fitted encoders)
df_test['Route'] = route_encoder.transform(df_test['Route'])
df_test['Surname'] = surname_encoder.transform(df_test['Surname'])
df_test['Cabin_deck'] = deck_encoder.transform(df_test['Cabin_deck'])

In [63]:
# Display the frame after type conversion and categorical encoding
df_test

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Billed,Route,Surname,Cabin_deck,Cabin_num,Cabin_side
0,0013_01,1,27.000000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.393560,0.500000,0.516841,3,0.0
1,0018_01,0,19.000000,0,0.0,9.0,0.0,2823.0,0.0,2832.0,0.393560,0.666667,0.441995,4,0.0
2,0019_01,1,31.000000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.689204,0.600000,0.680481,0,0.0
3,0021_01,0,38.000000,0,0.0,6652.0,0.0,181.0,585.0,7418.0,0.635400,0.503624,0.680481,1,0.0
4,0023_01,0,20.000000,0,10.0,0.0,635.0,0.0,0.0,645.0,0.393560,0.500000,0.441995,5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,1,34.000000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.393560,0.500000,0.516841,1496,0.0
4273,9269_01,0,42.000000,0,0.0,847.0,17.0,10.0,144.0,1018.0,0.393560,0.000000,0.516841,141,0.0
4274,9271_01,1,28.658146,0,0.0,0.0,0.0,0.0,0.0,0.0,0.611399,0.503624,0.432432,296,1.0
4275,9273_01,0,28.658146,0,0.0,2680.0,0.0,0.0,523.0,3203.0,0.635400,1.000000,0.432432,297,1.0


### Prepare predictions

In [64]:
# Columns fed to the model
# NOTE(review): presumably the same columns/order used when fitting the scaler — confirm.
features = [
    'CryoSleep', 'Age', 'VIP',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Billed',
    'Route', 'Surname',
    'Cabin_deck', 'Cabin_num', 'Cabin_side',
]

# Load the persisted scaler and model
scaler = joblib.load('scaler.joblib')
gbc = joblib.load('gbc_model.joblib')

# Scale the selected features, predict, and cast the labels to bool
X_test = scaler.transform(df_test[features])
y_pred = gbc.predict(X_test).astype(bool)

In [65]:
# Assemble the submission frame: one row per passenger with the predicted label
submission_path = 'spaceship_titanic_submission.csv'
submission = pd.DataFrame(
    {'PassengerId': df_test['PassengerId'], 'Transported': y_pred}
)

# Write the CSV only when it does not exist yet; an existing file is left untouched
if os.path.exists(submission_path):
    print('Submission already created')
else:
    submission.to_csv(submission_path, index=False)