## Model building for Spaceship Titanic challenge

In [1]:
import os
import numpy as np
import pandas as pd
import joblib
import warnings
warnings.filterwarnings('ignore')

import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

### Step 1: Data analysis and preprocessing

In [2]:
df_train = pd.read_csv(r'datasets/train.csv')
df_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [3]:
# Check for duplicates
print(f'Number of duplicated rows: {df_train.duplicated().sum()}')

Number of duplicated rows: 0


In [4]:
# Check for Nan values
print('Missing values:')
print(df_train.isna().sum())

Missing values:
PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64


In [5]:
# Transform Transported to numerical values
df_train['Transported'] = df_train['Transported'].astype(int)

1.1. Handle numerical columns

In [6]:
# Calculate a new column for the total amount spent within all services
df_train['Billed'] = df_train['RoomService'] + df_train['FoodCourt'] + df_train['ShoppingMall'] + df_train['Spa'] + df_train['VRDeck']
print('Total rows with some num column missing:', df_train['Billed'].isna().sum())
print(f'Average billed amount (if billed): {df_train[df_train['Billed'] != 0]['Billed'].mean():.2f}€')
print(f'Average billed amount (total): {df_train['Billed'].mean():.2f}€')

Total rows with some num column missing: 908
Average billed amount (if billed): 2546.85€
Average billed amount (total): 1484.60€


In [7]:
# Fill NaN based on the bill
billed_min = 1484.60
billed_max = 2546.85
nan_bill = df_train['Billed'].isna()

# Applies max amount if condition is satisfied (> 0) and min amount if not (0)
replace_bill = np.where(df_train[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].gt(0).any(axis=1),
                        billed_min,
                        billed_max)
df_train.loc[nan_bill, 'Billed'] = replace_bill[nan_bill]


In [8]:
# Fill NaN of numerical cols
def fillna_num_cols(row):
    target = row['Billed']
    num_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    num_sum = row[num_cols].sum(skipna=True) # Sum of non NaN values
    row[num_cols] = row[num_cols].fillna((target - num_sum)/row[num_cols].isna().sum())
    return row

df_train = df_train.apply(fillna_num_cols, axis=1)

In [9]:
# Fill Age with the mean
mean_age = df_train['Age'].mean()
df_train['Age'].fillna(mean_age, inplace=True)

1.2. HomePlanet and Destination

In [10]:
# Create new column with the route (home - destination)
df_train['Route'] = df_train['HomePlanet'].fillna('Unknown') + "_" + df_train['Destination'].fillna('Unknown')

# Fill with the most common home/destination for each route
df_train['Route'].value_counts()

# Unknown_Trappist --> Earth_Trappist
# Earth_Unknown --> Earth_Trappist
# Mars_Unknown --> Mars_Trappist
# Europa_Unknown --> Europa_Trappist
# Unknown_Cancri --> Europa_Cancri
# Unknown_PSO --> Mars_PSO
# Unknown_Unknown --> Earth_Trappist (most common)

Route
Earth_TRAPPIST-1e        3101
Mars_TRAPPIST-1e         1475
Europa_TRAPPIST-1e       1189
Europa_55 Cancri e        886
Earth_PSO J318.5-22       712
Earth_55 Cancri e         690
Mars_55 Cancri e          193
Unknown_TRAPPIST-1e       150
Earth_Unknown              99
Mars_PSO J318.5-22         49
Mars_Unknown               42
Europa_Unknown             37
Unknown_55 Cancri e        31
Europa_PSO J318.5-22       19
Unknown_PSO J318.5-22      16
Unknown_Unknown             4
Name: count, dtype: int64

In [11]:
def set_routes(df):
    routes = ['Unknown_TRAPPIST-1e', 'Earth_Unknown', 'Mars_Unknown', 'Europa_Unknown', 'Unknown_55 Cancri e', 'Unknown_PSO J318.5-22', 'Unknown_Unknown']
    for route in routes:
        if route in ['Unknown_TRAPPIST-1e', 'Earth_Unknown', 'Unknown_Unknown']:
            df.loc[df['Route'] == route, 'Route'] = 'Earth_TRAPPIST-1e'
        elif route == 'Mars_Unknown':
            df.loc[df['Route'] == route, 'Route'] = 'Mars_TRAPPIST-1e'
        elif route == 'Europa_Unknown':
            df.loc[df['Route'] == route, 'Route'] = 'Europa_TRAPPIST-1e'
        elif route == 'Unknown_55 Cancri e':
            df.loc[df['Route'] == route, 'Route'] = 'Europa_55 Cancri e'
        elif route == 'Unknown_PSO J318.5-22':
            df.loc[df['Route'] == route, 'Route'] = 'Mars_PSO J318.5-22'
    return df

df_train = set_routes(df_train)

In [12]:
# Encode Route using Leave-One-Out Encoding
leave_route = ce.LeaveOneOutEncoder(cols=['Route'])
df_train['Route'] = leave_route.fit_transform(df_train['Route'], df_train['Transported'])

# Drop Home and Destination columns
df_train.drop(columns=['HomePlanet', 'Destination'], inplace=True)

1.3. VIP

In [13]:
# Fill VIP with the most common (False, by a huge difference)
mode_vip = df_train['VIP'].mode()[0]
df_train['VIP'].fillna(mode_vip, inplace=True)

# Convert to int
df_train['VIP'] = df_train['VIP'].astype(int)

1.4. CryoSleep

In [14]:
# Create dataframe with the most common Cryo for each route
df_mode_cryo = df_train.groupby('Route')['CryoSleep'].agg(lambda x: x.mode()[0])

# Fill NaN in Cryo with the most common bool based on the route
df_train['CryoSleep'] = df_train.apply(
    lambda row: df_mode_cryo[row['Route']] if pd.isna(row['CryoSleep'])
    else row['CryoSleep'],
    axis=1)

# Convert to int
df_train['CryoSleep'] = df_train['CryoSleep'].astype(int)

1.5. Name

In [15]:
# Fill NaN with Unknown
df_train['Name'].fillna('Unknown', inplace=True)

# Create separated column for surnames only
df_train['Surname'] = df_train['Name'].apply(lambda x: str(x).split()[-1])

# Drop Name column
df_train.drop(columns=['Name'], inplace=True)

# Encode Surname
leave_surname = ce.LeaveOneOutEncoder(cols=['Surname'])
df_train['Surname'] = leave_surname.fit_transform(df_train['Surname'], df_train['Transported'])

1.6. Cabin

In [16]:
# Fill NaN based on Surname
df_surname_cabin = df_train.dropna(subset=['Cabin']).groupby('Surname')['Cabin'].first()
df_train['Cabin'] = df_train['Cabin'].fillna(df_train['Surname'].map(df_surname_cabin))

In [17]:
# Split Cabin into Deck, Number and Side
df_train[['Cabin_deck', 'Cabin_num', 'Cabin_side']] = df_train['Cabin'].str.split('/', expand=True)

# Drop Cabin column now
df_train.drop(columns=['Cabin'], inplace=True)

In [18]:
# Encode Deck
leave_deck = ce.LeaveOneOutEncoder(cols=['Cabin_deck'])
df_train['Cabin_deck'] = leave_deck.fit_transform(df_train['Cabin_deck'], df_train['Transported'])

# Convert Num into int
df_train['Cabin_num'] = df_train['Cabin_num'].astype(int)

# Binary encode Side
df_train['Cabin_side'] = df_train['Cabin_side'].map({'P': 1, 'S': 0})

1.6. Check dataset and missing values

In [19]:
df_train.head()

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Billed,Route,Surname,Cabin_deck,Cabin_num,Cabin_side
0,0001_01,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.635918,0.503624,0.731893,0,1
1,0002_01,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,736.0,0.393379,1.0,0.441805,0,0
2,0003_01,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,10383.0,0.635918,0.6,0.5,0,0
3,0003_02,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,5176.0,0.635918,0.6,0.5,0,0
4,0004_01,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,1091.0,0.393379,0.4,0.441805,1,0


In [20]:
df_train.isna().sum()

PassengerId     0
CryoSleep       0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
Billed          0
Route           0
Surname         0
Cabin_deck      0
Cabin_num       0
Cabin_side      0
dtype: int64

### Step 2: Prepare training and validation data

In [21]:
features = ['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Billed', 'Route', 'Surname', 'Cabin_deck', 'Cabin_num', 'Cabin_side']
X = df_train[features]
y = df_train.Transported

# Scale the data
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Split into training and validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.75)

In [22]:
# Save encoders and scaler for testing
if not os.path.exists('joblib_files'):
    os.makedirs('joblib_files')
    joblib.dump(leave_route, 'leave_route.joblib')
    joblib.dump(leave_surname, 'leave_surname.joblib')
    joblib.dump(leave_deck, 'leave_deck.joblib')
    joblib.dump(scaler, 'scaler.joblib')
else:
    print('Joblib already saved')

Joblib already saved


### Step 3: Select Machine Learning model

3.1. Histogram-Based Gradient Boosting

In [23]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score

gbc = HistGradientBoostingClassifier(max_iter=100, random_state=10)
gbc.fit(X_train, y_train)

y_gbc_pred = gbc.predict(X_valid)

print(f'Accuracy: {accuracy_score(y_valid, y_gbc_pred)*100:.2f}%')

Accuracy: 100.00%


3.2. Extra Trees Classifier

In [24]:
from sklearn.ensemble import ExtraTreesClassifier

etc = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=5, random_state=11)
etc.fit(X_train, y_train)

y_etc_pred = etc.predict(X_valid)

print(f'Accuracy: {accuracy_score(y_valid, y_etc_pred)*100:.2f}%')

Accuracy: 97.56%


3.3. Bagging meta-estimator

In [25]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

bme = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5, random_state=12)
bme.fit(X_train, y_train)

y_bme_pred = bme.predict(X_valid)

print(f'Accuracy: {accuracy_score(y_valid, y_bme_pred)*100:.2f}%')

Accuracy: 80.54%


3.4. Voting Classifier

In [26]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

cl_rf = RandomForestClassifier(n_estimators=100, random_state=1)
cl_nb = GaussianNB()
cl_lr = LogisticRegression(random_state=1)
cl_svc = SVC(kernel='linear', probability=True)

vc_soft = VotingClassifier(
    estimators=[('rf', cl_rf), ('nb', cl_nb), ('lr', cl_lr), ('svc', cl_svc)],
    voting='soft'
)

vc_hard = VotingClassifier(
    estimators=[('rf', cl_rf), ('nb', cl_nb), ('lr', cl_lr), ('svc', cl_svc)],
    voting='hard'
)

vc_soft.fit(X_train, y_train)
vc_hard.fit(X_train, y_train)

y_soft_pred = vc_soft.predict(X_valid)
y_hard_pred = vc_hard.predict(X_valid)

print(f'Accuracy (soft voting): {accuracy_score(y_valid, y_soft_pred)*100:.2f}%')
print(f'Accuracy (hard voting): {accuracy_score(y_valid, y_hard_pred)*100:.2f}%')

Accuracy (soft voting): 89.88%
Accuracy (hard voting): 77.97%


In [27]:
for clf, label in zip([cl_rf, cl_nb, cl_lr, vc_soft], ['Random Forest', 'Naive-Bayes', 'Logistic Regression', 'SVC', 'Ensemble']):
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_valid)
    score = accuracy_score(y_valid, prediction)
    print(f'Accuracy of {label}: {score*100:.2f}%')

Accuracy of Random Forest: 98.90%
Accuracy of Naive-Bayes: 67.25%
Accuracy of Logistic Regression: 76.08%
Accuracy of SVC: 89.88%


3.5. AdaBoost

In [28]:
from sklearn.ensemble import AdaBoostClassifier

abc = AdaBoostClassifier(n_estimators=100, algorithm='SAMME', random_state=14)
abc.fit(X_train, y_train)

y_abc_pred = abc.predict(X_valid)

print(f'Accuracy: {accuracy_score(y_valid, y_abc_pred)*100:.2f}%')

Accuracy: 99.49%


In [29]:
# Save the best model: HB Gradient Boosting
if not os.path.exists('joblib_files/gbc_model.joblib'):
    joblib.dump(gbc, 'joblib_files/gbc_model.joblib')
else:
    print('Model already saved')

Model already saved
