# Spaceship Titanic

https://www.kaggle.com/competitions/spaceship-titanic

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, precision_score, recall_score, f1_score, auc

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

import warnings
warnings.simplefilter("ignore")

# Mount Google Drive at /content/drive.
# NOTE(review): `google.colab` is presumably only importable inside Google Colab — confirm before running elsewhere.
from google.colab import drive  # Mount Google Drive if you are using Google Colab
drive.mount('/content/drive')

Mounted at /content/drive


# Download from Kaggle and Load Spaceship Titanic Dataset

In [None]:
# Create '.kaggle' folder in 'root'
!mkdir ~/.kaggle

In [None]:
# Copy 'kaggle.json' from the mounted Drive into the ~/.kaggle folder.
# NOTE(review): the source path reflects one particular Drive layout — adjust it to where your kaggle.json is stored.
!cp '/content/drive/MyDrive/Colab Notebooks/DPL302m/spaceship-titanic/kaggle.json' ~/.kaggle/

In [None]:
# Set permission
!chmod 600 ~/.kaggle/kaggle.json 

In [None]:
# Download the competition archive with the Kaggle CLI into the current working directory.
# The next cells expect it to be named 'spaceship-titanic.zip'.
!kaggle competitions download -c spaceship-titanic

In [None]:
# Unzip files
!unzip spaceship-titanic.zip

In [None]:
# Run this if you wish to remove the zip
!rm spaceship-titanic.zip

In [None]:
# Read the extracted train and test CSV files into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/train.csv", "/content/test.csv")
)

# Data Exploration, Preprocessing, and Feature Engineering

>The task is to predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly.

In [None]:
# Peek at the first ten training rows
train.head(n=10)

In [None]:
# Peek at the first ten test rows
test.head(n=10)

In [None]:
# Report (rows, columns) for both frames
for shape_label, frame in (("Train shape:", train), ("Test shape:", test)):
    print(shape_label, frame.shape)

In [None]:
# Numeric features (imputed with the median during preprocessing)
numerical = [
    'Age',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'Cabin_num',
]

# Categorical features (imputed with the mode, then label-encoded)
categorical = [
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
    'Deck', 'Side',
    'FirstName', 'LastName', 'Group',
]

# Name of the encoded counterpart of every categorical feature, in the same order
encoded_categorical = ['Encoded_' + feature_name for feature_name in categorical]

In [None]:
def preprocess(data, numerical_cols=None, categorical_cols=None):
  """Clean a raw Spaceship Titanic frame and return the result as a NEW DataFrame.

  Steps: split 'Cabin', 'PassengerId' and 'Name' into their parts, drop the
  original columns, impute missing values (median for numerical features,
  mode for categorical ones), drop duplicate rows and cast the numerical
  features to int. A missing-value / duplicate report is printed along the way.

  Args:
    data: DataFrame containing at least 'Cabin', 'PassengerId', 'Name' and the
      raw feature columns. It is not modified.
    numerical_cols: names of the numerical features; defaults to the
      notebook-level `numerical` list.
    categorical_cols: names of the categorical features; defaults to the
      notebook-level `categorical` list.

  Returns:
    The preprocessed copy of `data`.
  """
  # Fall back to the notebook-level feature lists when none are passed in
  numerical_cols = numerical if numerical_cols is None else list(numerical_cols)
  categorical_cols = categorical if categorical_cols is None else list(categorical_cols)

  # Work on a copy: the caller's frame stays intact, so the calling cell can be re-run
  data = data.copy()

  # Display missing data before preprocessing
  print("Missing Data Before Preprocessing: {}".format(data.isnull().sum().sum()))
  print(data.isnull().sum())
  print()

  # Split columns
  data[["Deck", "Cabin_num", "Side"]] = data["Cabin"].str.split("/", expand=True)
  # NOTE(review): the part BEFORE '_' is stored as 'ID' (and dropped below) while the part
  # AFTER '_' is kept as 'Group' — confirm against the competition's PassengerId format
  # that this is the intended part to keep.
  data[["ID", "Group"]] = data["PassengerId"].str.split("_", expand=True)
  # n=1: split on the first space only, so names with more than two words still give two columns
  data[["FirstName", "LastName"]] = data["Name"].str.split(" ", n=1, expand=True)

  # Convert to float data type
  data['Cabin_num'] = data['Cabin_num'].astype(float)

  # Drop the original columns
  data = data.drop(columns=['Cabin', 'PassengerId', 'ID', 'Name'])

  # Handling missing values. Plain assignment is used instead of
  # `data[col].fillna(..., inplace=True)`, which acts on a temporary and leaves
  # `data` untouched when pandas Copy-on-Write is active.
  for feature in numerical_cols:
    # Replace missing values with the median of the respective feature
    data[feature] = data[feature].fillna(data[feature].median())
  for feature in categorical_cols:
    # Fill missing values with the mode (most frequent value) of the respective feature
    modes = data[feature].mode()
    if not modes.empty:  # an all-missing column has no mode; leave it as is
      data[feature] = data[feature].fillna(modes.iloc[0])

  # Drop duplicate records
  # NOTE(review): this also removes rows when applied to the test frame — confirm that is intended.
  data = data.drop_duplicates()

  # Convert all numerical features to integers
  data[numerical_cols] = data[numerical_cols].astype(int)

  # Display missing data after preprocessing
  print("Missing Data After Preprocessing: {}".format(data.isnull().sum().sum()))
  print(data.isnull().sum())
  print()

  # Display duplicate records after preprocessing
  print("Duplicate Records After Preprocessing: {}".format(data.duplicated().sum()))
  print()

  # Return the preprocessed DataFrame (no outlier handling is performed here)
  return data

In [None]:
# Clean the training frame (prints the missing-value report)
df_train = preprocess(data=train)

In [None]:
# Clean the test frame with the same routine
df_test = preprocess(data=test)

# Exploratory Data Analysis (EDA)

In [None]:
# List the columns present after preprocessing
df_train.columns

In [None]:
# Check that no index label occurs twice
index_is_unique = df_train.index.is_unique
print('Unique Indexes:', index_is_unique)

In [None]:
# Column dtypes and non-null counts
df_train.info()

In [None]:
# Summary statistics with the default column selection
df_train.describe()

In [None]:
# Summary of the object-typed columns
df_train.describe(include = 'object')

In [None]:
# Summary of the boolean columns
df_train.describe(include = 'bool')

In [None]:
# Class balance of the target, drawn on an explicit Axes with a title and axis labels
fig, ax = plt.subplots()
df_train['Transported'].value_counts().plot(kind="bar", ax=ax)
ax.set(title="Transported class balance", xlabel="Transported", ylabel="Number of passengers")
plt.show()

In [None]:
# Correlation matrix of the numeric and boolean columns only.
# The frame still holds string columns here, and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0, so they are filtered out explicitly.
df_train.select_dtypes(include=['number', 'bool']).corr()

# Feature Encoding

In [None]:
# Initialize the LabelEncoder (kept under this name: the target is encoded with it further down)
label_encoder = LabelEncoder()

# Learn ONE mapping per categorical feature from the training data and apply that
# same mapping to both frames. Fitting a separate encoder on the test frame would
# give the same category a different integer in train and test, which would make
# the test features meaningless to a model trained on the train codes.
UNSEEN_CODE = -1  # code for test categories that never occur in the training data
encoded_labels = {}
for column, new_column in zip(categorical, encoded_categorical):
    feature_encoder = LabelEncoder().fit(df_train[column])
    classes = feature_encoder.classes_.tolist()
    mapping = {label: code for code, label in enumerate(classes)}
    encoded_labels[new_column] = mapping
    print(f"Encoded Labels for {column}: {mapping}")

    # Create the new encoded column in both frames
    df_train[new_column] = feature_encoder.transform(df_train[column])
    df_test[new_column] = [mapping.get(value, UNSEEN_CODE) for value in df_test[column].tolist()]

# Drop the original categorical columns
df_train = df_train.drop(columns=categorical)
df_test = df_test.drop(columns=categorical)

# Inspect the train data
df_train.info()

# Inspect the test data
df_test.info()

# Modeling

In [None]:
# Separate the target (y) from the input features (x)
target_column = "Transported"
y = df_train[target_column]
x = df_train.drop(columns=[target_column])

# Hold out 30% of the rows for evaluation (shuffled, fixed seed)
x_train_raw, x_test_raw, y_train_raw, y_test_raw = train_test_split(
    x, y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

In [None]:
# Feature scaler; the same instance is re-used later to scale the Kaggle test frame
scaler = RobustScaler()

# Pipeline with the scaler and an empty 'model' slot
pipeline_steps = [('scaler', scaler), ('model', None)]
pipeline = Pipeline(pipeline_steps)

# Fit the scaler on the training split only, then apply it to both splits
x_train = pipeline.fit_transform(x_train_raw)
x_test = pipeline.transform(x_test_raw)

In [None]:
# Turn the target labels into integers (fitted on the training split)
y_train = label_encoder.fit_transform(y_train_raw)
y_test = label_encoder.transform(y_test_raw)

# Show which integer each original class was mapped to
target_classes = label_encoder.classes_
print(dict(zip(target_classes, label_encoder.transform(target_classes))))

In [None]:
# Define the models.
# Estimators with internal randomness get a fixed seed (the same value used for the
# train/test split) so that tuning and evaluation results are reproducible.
RANDOM_STATE = 42
models = [
    ('Logistic Regression', LogisticRegression(random_state=RANDOM_STATE)),
    ('KNN', KNeighborsClassifier()),
    ('SVM', SVC()),
    ('Decision Tree', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('Random Forest', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('Naive Bayes', GaussianNB()),
    ('XGBoost', XGBClassifier(random_state=RANDOM_STATE))
]

In [None]:
# Hyperparameter search space per model, keyed by the model's display name
param_grid = {
    'Logistic Regression': dict(
        C=[0.001, 0.01, 0.1, 1, 10, 100],
        penalty=['l1', 'l2'],
        solver=['liblinear', 'saga'],
    ),
    'KNN': dict(
        n_neighbors=[3, 5, 7, 9],
        weights=['uniform', 'distance'],
    ),
    'SVM': dict(
        C=[0.001, 0.01, 0.1, 1, 10, 100],
        kernel=['linear', 'rbf'],
    ),
    'Decision Tree': dict(
        max_depth=[None, 5, 10, 15],
        min_samples_split=[2, 5, 10],
    ),
    'Random Forest': dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 5, 10],
        min_samples_split=[2, 5, 10],
    ),
    # Gaussian Naive Bayes is searched with an empty grid (defaults only)
    'Naive Bayes': dict(),
    'XGBoost': dict(
        n_estimators=[100, 200, 300],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 5, 7],
        subsample=[0.8, 1.0],
        colsample_bytree=[0.8, 1.0],
        gamma=[0, 1, 5],
        reg_alpha=[0, 0.1, 0.5],
        reg_lambda=[0, 0.1, 0.5],
    ),
}

In [None]:
# Tune every model with a 5-fold grid search and keep the best estimator of each
best_models = {}
for name, estimator in models:
    # Put the current estimator into the pipeline's 'model' slot ...
    pipeline.set_params(model=estimator)
    # ... and search over that step directly (the features are already scaled)
    search = GridSearchCV(
        estimator=pipeline.named_steps['model'],
        param_grid=param_grid[name],
        cv=5,
    )
    best_models[name] = search.fit(x_train, y_train).best_estimator_

In [None]:
# One independent, empty list per evaluation result that is collected below
(
    model_names,
    accuracy_scores,
    recall_scores,
    precision_scores,
    f1_scores,
    auc_roc_scores,
    confusion_matrices,
    roc_curves,
) = ([] for _ in range(8))

In [None]:
def _positive_class_scores(estimator, features):
    """Return a continuous score for the positive class.

    Uses the class-1 probability when the estimator offers `predict_proba`,
    otherwise the signed distance from `decision_function` (e.g. a default SVC).
    """
    if hasattr(estimator, "predict_proba"):
        return estimator.predict_proba(features)[:, 1]
    return estimator.decision_function(features)


# Start from empty result lists so that re-running this cell does not duplicate rows
for collected in (model_names, accuracy_scores, recall_scores, precision_scores,
                  f1_scores, auc_roc_scores, confusion_matrices, roc_curves):
    collected.clear()

# Evaluate each model.
# The estimators come from GridSearchCV.best_estimator_, i.e. they were already
# refitted on the full training split, so no extra fit is done here.
for model_name, model in best_models.items():
    # Predict the target variable for the test data
    y_pred = model.predict(x_test)
    # Continuous scores: ROC/AUC need a ranking of the samples, not hard 0/1 labels
    y_score = _positive_class_scores(model, x_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, y_score)

    # Calculate the confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    # Calculate the ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_score)

    # Append the results to the lists
    model_names.append(model_name)
    accuracy_scores.append(accuracy)
    recall_scores.append(recall)
    precision_scores.append(precision)
    f1_scores.append(f1)
    auc_roc_scores.append(auc_roc)
    confusion_matrices.append(cm)
    roc_curves.append((fpr, tpr))

In [None]:
# Collect the per-model metrics into one table (one row per model)
metric_columns = {}
metric_columns['Model'] = model_names
metric_columns['Accuracy'] = accuracy_scores
metric_columns['Recall'] = recall_scores
metric_columns['Precision'] = precision_scores
metric_columns['F1 Score'] = f1_scores
metric_columns['AUC-ROC'] = auc_roc_scores
results_df = pd.DataFrame(metric_columns)

In [None]:
# Display the metrics of every evaluated model
# (seven models are defined, so a fixed head(6) would hide the last one)
results_df

In [None]:
# One annotated heatmap per model
for model_name, matrix in zip(model_names, confusion_matrices):
    fig, ax = plt.subplots()
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix - {model_name}')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    plt.show()

In [None]:
# Draw every model's ROC curve on one shared Axes
fig, ax = plt.subplots()

for model_name, curve in zip(model_names, roc_curves):
    false_positive_rate, true_positive_rate = curve
    ax.plot(false_positive_rate, true_positive_rate, label=model_name)

# Diagonal reference line of a random guess
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')

# Title, axis labels and legend
ax.set_title('ROC Curves - All Models')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()

plt.show()

In [None]:
# Pick the model with the highest accuracy in the results table
best_model_idx = results_df['Accuracy'].idxmax()
best_model_name = model_names[best_model_idx]
best_model = best_models[best_model_name]
print('Best Model:')
print(best_model)

# Deep Neural Network

In [None]:
# Two early-stopping callbacks with identical settings, one per monitored validation metric
early_stopping1, early_stopping2 = (
    tf.keras.callbacks.EarlyStopping(monitor=monitored, patience=10, restore_best_weights=True)
    for monitored in ("val_loss", "val_accuracy")
)

In [None]:
# Network dimensions derived from the data
n_features = x_train.shape[1]
n_classes = int(max(y_train)) + 1  # labels are 0..k-1 after label encoding

# Fully connected classifier: three hidden layers with batch-norm/dropout, softmax output
model = tf.keras.Sequential([
        # `shape` must be a tuple; `(n)` without the trailing comma is just an int
        tf.keras.layers.Input(name = "input", shape = (n_features,)),
        tf.keras.layers.Dense(256, activation = "relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(128, activation = "relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation = "relu"),
        tf.keras.layers.Dense(n_classes, activation = "softmax")])

model.summary()

In [None]:
# Integer class labels + softmax output -> sparse categorical cross-entropy
model.compile(
    loss = "sparse_categorical_crossentropy",
    optimizer = tf.keras.optimizers.Adam(),
    metrics = ["accuracy"],
)

# Train for up to 100 epochs; the held-out split drives both early-stopping callbacks
fit_options = dict(
    epochs = 100,
    batch_size = 128,
    verbose = 1,
    validation_data = (x_test, y_test),
    callbacks = [early_stopping1, early_stopping2],
)
model_history = model.fit(x_train, y_train, **fit_options)

In [None]:
# Loss and accuracy on the training split, then on the held-out split
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(model.evaluate(features, labels))

In [None]:
# Training vs. validation loss per epoch
fig, ax = plt.subplots()
for history_key in ("loss", "val_loss"):
    ax.plot(model_history.history[history_key])
ax.legend(["loss", "validation loss"], loc ="upper right")
ax.set_title("Train and Validation Loss")
ax.set_xlabel("epoch")
ax.set_ylabel("Sparse Categorical Cross Entropy")
plt.show()

In [None]:
# Training vs. validation accuracy per epoch
fig, ax = plt.subplots()
for history_key in ("accuracy", "val_accuracy"):
    ax.plot(model_history.history[history_key])
ax.legend(["accuracy", "validation accuracy"], loc ="upper right")
ax.set_title("Train and Validation Accuracy")
ax.set_xlabel("epoch")
ax.set_ylabel("Accuracy")
plt.show()

In [None]:
# Feature frame for prediction. df_test is left untouched; errors='ignore' covers
# a kernel in which an earlier run already added a 'Transported' column to it.
df_pred = df_test.drop(columns='Transported', errors='ignore')

# Scale with the scaler fitted on the training split, then take the most probable class
y_pred = model.predict(scaler.transform(df_pred)).argmax(axis=1)
print('Prediction in Numerical ', y_pred)
print('Prediction in Text ', label_encoder.inverse_transform(y_pred))

In [None]:
# Distinct predicted classes together with how often each one was predicted
np.unique(y_pred, return_counts=True)

In [None]:
# Persist the trained network and load it back to confirm the file is usable.
# Only `tensorflow as tf` is imported in this notebook, so Keras is reached through `tf.keras`.
model.save("saved.h5")
loaded_model = tf.keras.models.load_model("saved.h5")