## Spaceship Titanic Challenge

[https://www.kaggle.com/competitions/spaceship-titanic](https://www.kaggle.com/competitions/spaceship-titanic)

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load data & get familiar with it

In [2]:
# Load the competition's training split; the printed tuple is (rows, columns).
df = pd.read_csv(filepath_or_buffer="data/train.csv")
print(df.shape)
df.head()

(8693, 14)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
# Number of missing entries in each column.
df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

## Print number of unique values in categorical columns

In [4]:
# (column name, number of distinct non-null values) for every column that
# pandas stores with the "object" dtype.
unq_count = [
    (name, df[name].nunique())
    for name, dtype in df.dtypes.items()
    if dtype == "object"
]
unq_count

[('PassengerId', 8693),
 ('HomePlanet', 3),
 ('CryoSleep', 2),
 ('Cabin', 6560),
 ('Destination', 3),
 ('VIP', 2),
 ('Name', 8473)]

## Drop the `PassengerId`, `Cabin` & `Name` columns and split the dataset into train & test

In [5]:
from sklearn.model_selection import train_test_split

# Columns kept out of the feature matrix: the three dropped columns named in
# the heading above plus the target itself.
non_feature_columns = ["PassengerId", "Name", "Cabin", "Transported"]

# 80/20 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=non_feature_columns),
    df["Transported"],
    test_size=0.2,
    random_state=42,
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6954, 10), (1739, 10), (6954,), (1739,))

In [6]:
# Peek at the first three training rows.
X_train.iloc[:3]

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
2333,Earth,False,TRAPPIST-1e,28.0,False,0.0,55.0,0.0,656.0,0.0
2589,Earth,False,TRAPPIST-1e,17.0,False,0.0,1195.0,31.0,0.0,0.0
8302,Europa,True,55 Cancri e,28.0,False,0.0,0.0,0.0,0.0,0.0


---

## Get column names on which we will apply different imputation and transformations

In [7]:
# Feature names grouped by the kind of preprocessing they are meant to receive.
nominal_columns = [
    "HomePlanet",
    "Destination",
]
ordinal_columns = [
    "CryoSleep",
    "VIP",
]
numerical_columns = [
    "Age",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
]

---

## Create `Pipelines` & `ColumnTransformers` 😎

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# All three transformers address columns by position. ColumnTransformer emits
# its transformers' outputs in the order they are listed, followed by any
# passthrough remainder, so the positions change from one stage to the next.

# --- Stage 1: imputation -----------------------------------------------------
# Positions refer to X_train's column order:
#   0 HomePlanet, 1 CryoSleep, 2 Destination, 3 Age, 4 VIP,
#   5 RoomService, 6 FoodCourt, 7 ShoppingMall, 8 Spa, 9 VRDeck
numeric_positions_in_input = [3, 5, 6, 7, 8, 9]
categorical_positions_in_input = [0, 1, 2, 4]

mean_imputer = SimpleImputer(strategy="mean")
mode_imputer = SimpleImputer(strategy="most_frequent")

my_column_impute_transformer = ColumnTransformer(
    transformers=[
        ("impute_missing_numerical", mean_imputer, numeric_positions_in_input),
        ("impute_missing_categorical", mode_imputer, categorical_positions_in_input),
    ],
    remainder="passthrough",
)

# --- Stage 2: one-hot encoding -----------------------------------------------
# After stage 1 the six numeric columns come first (0-5) and the four
# categorical columns follow (6-9).
categorical_positions_after_impute = [6, 7, 8, 9]

one_hot_encoder = OneHotEncoder(
    sparse_output=False, handle_unknown="ignore", drop="first"
)

my_categorical_encoding_transformer = ColumnTransformer(
    transformers=[
        (
            "one_hot_encode_categorical",
            one_hot_encoder,
            categorical_positions_after_impute,
        ),
    ],
    remainder="passthrough",
)

# --- Stage 3: scaling --------------------------------------------------------
# Assumes stage 2 emits exactly six indicator columns (3 + 2 + 3 + 2 categories
# in the training data, one dropped per feature), which places the six
# passed-through numeric columns at 6-11. Re-check if the categories change.
numeric_positions_after_encoding = [6, 7, 8, 9, 10, 11]

my_feature_scaling_transformer = ColumnTransformer(
    transformers=[
        ("standardize_numerical", StandardScaler(), numeric_positions_after_encoding),
    ],
    remainder="passthrough",
)

In [9]:
from sklearn.pipeline import Pipeline, make_pipeline
from xgboost import XGBClassifier

# Preprocessing chain: impute -> one-hot encode -> standardize.
preprocessing_steps = [
    ("handle_missing_values", my_column_impute_transformer),
    ("handle_categorical_encoding", my_categorical_encoding_transformer),
    ("standardize_data", my_feature_scaling_transformer),
]
base_pipeline = Pipeline(steps=preprocessing_steps)

# XGBoost classifier with library defaults, placed after the preprocessing.
my_xgb_classifier = XGBClassifier()
my_xgb_clf_pipeline = make_pipeline(base_pipeline, my_xgb_classifier)

In [10]:
from sklearn.preprocessing import LabelEncoder

# Convert the training labels to integer class codes for the classifiers; the
# fitted encoder is kept so the same mapping can be reused later.
my_y_train_label_encoder = LabelEncoder()
my_y_train_label_encoder.fit(y_train)
y_train_encoded = my_y_train_label_encoder.transform(y_train)
y_train_encoded

array([0, 0, 1, ..., 0, 0, 0])

In [11]:
# Fit preprocessing and the XGBoost model on the training split.
my_xgb_clf_pipeline.fit(X=X_train, y=y_train_encoded)

In [13]:
# Predict on both splits with the fitted XGBoost pipeline.
y_train_pred, y_test_pred = (
    my_xgb_clf_pipeline.predict(split) for split in (X_train, X_test)
)

In [14]:
from sklearn.metrics import accuracy_score

# The model was trained on integer-encoded labels, so score its integer
# predictions against labels produced by the same fitted encoder instead of
# the raw boolean `Transported` values.
y_test_encoded = my_y_train_label_encoder.transform(y_test)

# training accuracy
training_accuracy = accuracy_score(y_train_encoded, y_train_pred)

# testing accuracy
testing_accuracy = accuracy_score(y_test_encoded, y_test_pred)

In [15]:
# Report both scores, one per line.
print(
    f"training accuracy: {training_accuracy}",
    f"testing accuracy: {testing_accuracy}",
    sep="\n",
)

training accuracy: 0.8846706931262582
testing accuracy: 0.780333525014376


---

## Let's now create a pipeline with a `RandomForest` model

In [16]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

# Seed the forest (same seed as the train/test split) so its scores are
# repeatable from run to run.
my_random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Use an unfitted copy of the preprocessing chain. Passing `base_pipeline`
# itself would make this pipeline share one preprocessing object with the
# XGBoost pipeline, so fitting either model would re-fit the other's
# preprocessing in place.
my_rf_model_pipeline = make_pipeline(
    clone(base_pipeline),
    my_random_forest_model,
)

In [17]:
# Fit preprocessing and the random forest on the training split.
my_rf_model_pipeline.fit(X=X_train, y=y_train_encoded)

In [18]:
# Predict on both splits with the fitted random-forest pipeline.
y_train_pred = my_rf_model_pipeline.predict(X_train)
y_test_pred = my_rf_model_pipeline.predict(X_test)

# The model was trained on integer-encoded labels, so score its integer
# predictions against labels produced by the same fitted encoder instead of
# the raw boolean `Transported` values.
y_test_encoded = my_y_train_label_encoder.transform(y_test)

# training accuracy
training_accuracy = accuracy_score(y_train_encoded, y_train_pred)

# testing accuracy
testing_accuracy = accuracy_score(y_test_encoded, y_test_pred)

In [19]:
# Report both scores, one per line.
print(
    f"training accuracy: {training_accuracy}",
    f"testing accuracy: {testing_accuracy}",
    sep="\n",
)

training accuracy: 0.9373022720736267
testing accuracy: 0.7694077055779184


---

## Create `Submission` dataset

In [20]:
# Load the competition's unlabeled test split.
test_df = pd.read_csv(filepath_or_buffer="data/test.csv")

In [23]:
# The pipelines were fit on X_train's columns only (PassengerId, Name and
# Cabin were dropped before the split), so pass the same columns in the same
# order instead of the raw test frame.
test_features = test_df[X_train.columns]

my_final_xgb_clf_pred = my_xgb_clf_pipeline.predict(test_features)
my_final_rf_pred = my_rf_model_pipeline.predict(test_features)

In [24]:
# Build one submission frame per model. Predictions are integer class codes;
# comparing with 1 turns them back into the boolean `Transported` column.
# Each frame is built from its own model's predictions (they were previously
# crossed, so each saved file held the other model's output).
my_submission_dataset_xgb = pd.DataFrame(
    {
        "PassengerId": test_df["PassengerId"],
        "Transported": my_final_xgb_clf_pred == 1,
    }
)

my_submission_dataset_rf = pd.DataFrame(
    {
        "PassengerId": test_df["PassengerId"],
        "Transported": my_final_rf_pred == 1,
    }
)

In [25]:
# Preview the first five rows of the random-forest submission.
my_submission_dataset_rf.head(n=5)

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [27]:
# Write both submission files without the pandas index column.
my_submission_dataset_rf.to_csv("data/my_submission_rf.csv", index=False)
my_submission_dataset_xgb.to_csv("data/my_submission_xgb_clf.csv", index=False)