# Spaceship Titanic

Running the model should give the AI the capability of predict if passenger will be teleported to another dimension.

Model is a “supervised” data model, because we know the expected result, that is if passenger was teleported (“transported”) or not.

Because we must determine the relation between input data and the label, this is a REGRESSION PROBLEM: y = f(a, b, c, …)

In [2]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

print ("Pandas v."+ pd.__version__)

# Shows input data files
INPUT_DIR = './input'
print("Data Files:")
import os
for dirname, _, filenames in os.walk(INPUT_DIR):
    for filename in filenames:
        print(" - " + os.path.join(dirname, filename))

Pandas v.2.2.3
Data Files:
 - ./input/test.csv
 - ./input/train.csv


In [3]:
# Read train data. Train data has a column ('Transported') with the known result
X = pd.read_csv(INPUT_DIR + '/train.csv', index_col='PassengerId')
y = X.Transported
X.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
# Read test data
X_test = pd.read_csv(INPUT_DIR + '/test.csv', index_col='PassengerId')
X_test.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [5]:
print('Train data has ' + str(len(X)) + ' rows')
print('Test data has ' + str(len(X_test)) + ' rows.')

Train data has 8693 rows
Test data has 4277 rows.


In [6]:
# Break off the validation set from training data.
# We retain 80% of train data to make actual training, and 20% to make model validation
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2)

In [22]:
# We now want to add a new column named 'Money' that sums money spent by the passenger.
# We also remove the passenger name and cabin from the data.
# Finally, we transform booleans into floats

X_train_plus = X_train_full.copy()
X_valid_plus = X_valid_full.copy()

def clean_dataset(dataset):
    # Add Money column
    dataset['Money'] = dataset['RoomService'] + \
                       dataset['FoodCourt'] + \
                       dataset['ShoppingMall'] + \
                       dataset['Spa'] + \
                       dataset['VRDeck']
    # Drop useless columns
    if 'Transported' in dataset:
        dataset.drop(['Transported'], axis=1, inplace=True)
    dataset.drop(['RoomService'], axis=1, inplace=True)
    dataset.drop(['FoodCourt'], axis=1, inplace=True)
    dataset.drop(['ShoppingMall'], axis=1, inplace=True)
    dataset.drop(['Spa'], axis=1, inplace=True)
    dataset.drop(['VRDeck'], axis=1, inplace=True)
    dataset.drop(['Name'], axis=1, inplace=True)
    dataset.drop(['Cabin'], axis=1, inplace=True)
    # Replace booleans with floats
    dataset['CryoSleep'] = dataset['CryoSleep'].replace({'True': 1.0, 'False': 0.0}).astype(float)
    dataset['VIP'] = dataset['VIP'].replace({'True': 1.0, 'False': 0.0}).astype(float)
    return dataset

X_train_plus = clean_dataset(X_train_plus)
X_valid_plus = clean_dataset(X_valid_plus)
X_train_plus.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,Money
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8672_01,Earth,0.0,TRAPPIST-1e,40.0,0.0,443.0
7361_02,Europa,1.0,55 Cancri e,41.0,0.0,0.0
3761_01,,0.0,55 Cancri e,45.0,0.0,4537.0
0438_01,Europa,1.0,TRAPPIST-1e,40.0,0.0,
2875_01,Europa,1.0,55 Cancri e,55.0,0.0,0.0


In [23]:
def clean_train_and_valid_data(train_data, valid_data):
    # Replace literals
    ordinal_encoder = OrdinalEncoder()
    train_data[['HomePlanet']] = ordinal_encoder.fit_transform(train_data[['HomePlanet']])
    valid_data[['HomePlanet']] = ordinal_encoder.transform(valid_data[['HomePlanet']])
    train_data[['Destination']] = ordinal_encoder.fit_transform(train_data[['Destination']])
    valid_data[['Destination']] = ordinal_encoder.transform(valid_data[['Destination']])
    # Missing values
    imputer = SimpleImputer()
    train_data_final = pd.DataFrame(imputer.fit_transform(train_data))
    valid_data_final = pd.DataFrame(imputer.transform(valid_data))
    # Since imputation removed column names we put them back
    train_data_final.columns = train_data.columns
    valid_data_final.columns = valid_data.columns
    return train_data, valid_data

X_train_plus, X_valid_plus = clean_train_and_valid_data(X_train_plus, X_valid_plus)
X_train_plus.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,Money
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8672_01,0.0,0.0,2.0,40.0,0.0,443.0
7361_02,1.0,1.0,0.0,41.0,0.0,0.0
3761_01,,0.0,0.0,45.0,0.0,4537.0
0438_01,1.0,1.0,2.0,40.0,0.0,
2875_01,1.0,1.0,0.0,55.0,0.0,0.0


In [24]:
# Setup AI model - I use XGBRegressor
ai_model = XGBRegressor(n_estimators=100, learning_rate=0.15, early_stopping_rounds=15)
ai_model.fit(X_train_plus, y_train, eval_set=[(X_valid_plus, y_valid)])

[0]	validation_0-rmse:0.47897
[1]	validation_0-rmse:0.46331
[2]	validation_0-rmse:0.45177
[3]	validation_0-rmse:0.44318
[4]	validation_0-rmse:0.43727
[5]	validation_0-rmse:0.43296
[6]	validation_0-rmse:0.43017
[7]	validation_0-rmse:0.42774
[8]	validation_0-rmse:0.42595
[9]	validation_0-rmse:0.42483
[10]	validation_0-rmse:0.42402
[11]	validation_0-rmse:0.42304
[12]	validation_0-rmse:0.42249
[13]	validation_0-rmse:0.42235
[14]	validation_0-rmse:0.42256
[15]	validation_0-rmse:0.42239
[16]	validation_0-rmse:0.42234
[17]	validation_0-rmse:0.42199
[18]	validation_0-rmse:0.42196
[19]	validation_0-rmse:0.42187
[20]	validation_0-rmse:0.42178
[21]	validation_0-rmse:0.42190
[22]	validation_0-rmse:0.42209
[23]	validation_0-rmse:0.42212
[24]	validation_0-rmse:0.42220
[25]	validation_0-rmse:0.42228
[26]	validation_0-rmse:0.42254
[27]	validation_0-rmse:0.42256
[28]	validation_0-rmse:0.42231
[29]	validation_0-rmse:0.42217
[30]	validation_0-rmse:0.42228
[31]	validation_0-rmse:0.42243
[32]	validation_0-

In [25]:
# Define a function to compute the model score
def percentage(okays, total):
    return '{:.3%}'.format(okays/total)

def model_score(predictions, expected_results):
    total = len(predictions)
    assert (total == expected_results.size)
    predicted_ok = 0
    for i, p in enumerate(predictions):
        if expected_results.iloc[i] == p:
            predicted_ok += 1
    print(f'Model precision = {percentage(predicted_ok, total)} - MAE = {mean_absolute_error(expected_results, predictions)}')

In [26]:
# Compute predictions and then calculate model precision confronting predictions with data for model validation (valid)
predictions_1 = ai_model.predict(X_valid_plus)
predictions_1 = np.rint(predictions_1).astype(int)
# Calculate Precision
model_score(predictions_1, y_valid)

Model precision = 72.973% - MAE = 0.2702702702702703


In [27]:
# Ok, this model seems OK. We prepare test data set for prediction
X_test_predict = X_test.copy()
X_test_predict = clean_dataset(X_test_predict)
X_test_predict.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,Money
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0013_01,Earth,1.0,TRAPPIST-1e,27.0,0.0,0.0
0018_01,Earth,0.0,TRAPPIST-1e,19.0,0.0,2832.0
0019_01,Europa,1.0,55 Cancri e,31.0,0.0,0.0
0021_01,Europa,0.0,TRAPPIST-1e,38.0,0.0,7418.0
0023_01,Earth,0.0,TRAPPIST-1e,20.0,0.0,645.0


In [28]:
def clean_test_data(test_data):
    fit_test_data = test_data.copy()
    # Replace literals
    ordinal_encoder = OrdinalEncoder()
    fit_test_data[['HomePlanet']] = ordinal_encoder.fit_transform(fit_test_data[['HomePlanet']])
    fit_test_data[['Destination']] = ordinal_encoder.fit_transform(fit_test_data[['Destination']])
    # Missing values
    imputer = SimpleImputer()
    test_data_imputed = pd.DataFrame(imputer.fit_transform(fit_test_data))
    # Since imputation removed column names, we put them back
    test_data_imputed.columns = test_data.columns
    return test_data_imputed

X_test_clean = clean_test_data(X_test_predict)
X_test_clean.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,Money
0,0.0,1.0,2.0,27.0,0.0,0.0
1,0.0,0.0,2.0,19.0,0.0,2832.0
2,1.0,1.0,0.0,31.0,0.0,0.0
3,1.0,0.0,2.0,38.0,0.0,7418.0
4,0.0,0.0,2.0,20.0,0.0,645.0


In [30]:
# Use model to make predictions
test_predictions = ai_model.predict(X_test_clean)
print(test_predictions[:10])
test_predictions = np.rint(test_predictions).astype(int)
# Convert into Booleans
bool_predictions = test_predictions.astype(np.bool_)
print(bool_predictions[:10])

[0.56642634 0.30504063 0.97801876 0.29104984 0.3460571  0.2865238
 0.8242195  0.96612895 0.9764855  0.29157028]
[ True False  True False False False  True  True  True False]


For the first ten passengers, we can predict if they will be transported by the anomaly (True of False)
