# Setup

## Imports

In [45]:
from google.colab import drive
import os
mount_path = "/content/drive"

if not os.path.ismount(mount_path):
    drive.mount(mount_path)
else:
    print("Google Drive is already mounted.")

path = "drive/MyDrive/kaggle/spaceship_titanic/"

Google Drive is already mounted.


In [46]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.impute import KNNImputer

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss, classification_report, confusion_matrix
)

## Load Data

In [47]:
train_original = pd.read_csv(path + "train.csv")
test_original = pd.read_csv(path + "test.csv")

In [48]:
train = train_original.copy()
test = test_original.copy()

# Analysis

In [49]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [50]:
#train.describe()

In [51]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


### Heatmap

In [52]:
def heat(df):
  plt.figure(figsize=(10, 6))
  sns.heatmap(df.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
  plt.show()

# Preprocessing

In [53]:
train = train.drop(columns=['Transported', 'PassengerId'])
test = test.drop(columns=['PassengerId'])

In [54]:
def prepare_dataset(df):
  for i in ["Name", 'VIP', 'ShoppingMall']:
    df = df.drop(i, axis=1)

  df[["Deck", "Cabin_num", "Side"]] = df["Cabin"].str.split("/", expand=True)
  try:
    df = df.drop('Cabin', axis=1)
  except KeyError:
    print("Field does not exist")

  encoder = LabelEncoder()
  #encoder = OneHotEncoder(sparse_output=False)
  for column in df.columns:
    if df[column].dtype == "object":
        df[column] = encoder.fit_transform(df[column])

  n = 15
  imputer = KNNImputer(n_neighbors=n)
  imputed_df = imputer.fit_transform(df)
  df = pd.DataFrame(imputed_df, columns=df.columns)

  scaler = MinMaxScaler()
  df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

  return df

In [55]:
train = prepare_dataset(train)
test = prepare_dataset(test)

In [56]:
#heat(train)

# Training

## Functions

In [57]:
def cross_val(model, X, y):
  cv_scores = cross_val_score(model, X, y, cv=5)

  mean_score = np.mean(cv_scores)
  std_dev = np.std(cv_scores)

  print(f"Cross-validation scores: {cv_scores}")
  print(f"Mean accuracy: {mean_score:.4f}")
  print(f"Standard deviation: {std_dev:.4f}")

In [58]:
def grid_search_graph(model):
  scores = model.cv_results_['mean_test_score'].reshape(len(Cs), len(Gs))
  extent = np.log10([Gs[0], Gs[-1], Cs[0], Cs[-1]])
  im = plt.imshow(scores, extent=extent, origin='lower')
  plt.colorbar(im)
  plt.contour(np.log10(Gs), np.log10(Cs), scores)
  plt.xlabel('log10(Gamma)')
  plt.ylabel('log10(C)')

## Execution

In [59]:
X = train
y = train_original['Transported']

In [60]:
#model = LogisticRegression(max_iter=1000)
#model = DecisionTreeClassifier()
#model = SVC()
model = SVC(probability=True, gamma=0.01, C=10.)
#model = RandomForestClassifier()
#model = KNeighborsClassifier()
#model = GaussianNB()

In [61]:
#cross_val(model, X, y)

In [62]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [63]:
#model.fit(X_train, y_train)

In [64]:
Cs = np.logspace(-1, 3, 5)
Gs = np.logspace(-7, -0, 5)
model = GridSearchCV(estimator=model, param_grid=dict(C=Cs, gamma=Gs), n_jobs=-1)

model.fit(X_train, y_train)
model.score(X_valid, y_valid)
print('Best parameters: ', model.best_params_)

Best parameters:  {'C': np.float64(10.0), 'gamma': np.float64(1.0)}


In [65]:
y_pred = model.predict(X_valid)
y_proba = model.predict_proba(X_valid)[:, 1] # Probability of the positive class

# Calculate metrics
accuracy = accuracy_score(y_valid, y_pred)
precision = precision_score(y_valid, y_pred)
recall = recall_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred)
roc_auc = roc_auc_score(y_valid, y_proba)
logloss = log_loss(y_valid, y_proba)

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"AUC-ROC Score: {roc_auc:.4f}")
print(f"Log Loss: {logloss:.4f}\n")

# Print classification report and confusion matrix
print("Classification Report:\n", classification_report(y_valid, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_valid, y_pred))

Accuracy: 0.7872
Precision: 0.7880
Recall: 0.7916
F1-Score: 0.7898
AUC-ROC Score: 0.8700
Log Loss: 0.4650

Classification Report:
               precision    recall  f1-score   support

       False       0.79      0.78      0.78       861
        True       0.79      0.79      0.79       878

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739

Confusion Matrix:
 [[674 187]
 [183 695]]


# Prediction

In [66]:
X_test = test

In [67]:
y_pred = model.predict(X_test)

In [68]:
submission = pd.DataFrame({
    'PassengerId': test_original['PassengerId'],
    'Transported': y_pred
})

In [69]:
submission.to_csv(path + "submission.csv", index=False)

# Unused Functions/Code

In [70]:
def replace_nan_w_mode(df, columns):
  for i in columns:
    most_common = df[i].mode()[0]
    df[i] = df[i].fillna(most_common)

In [71]:
def columns_w_nans(df):
  nan_columns = df.columns[df.isna().any()].tolist()
  print(nan_columns)

In [72]:
def column_values(column):
  print(train[column].value_counts())
  print(train[column].unique())

## Plots

### Histogram

In [73]:
#sns.histplot(train["Age"], bins=30, kde=True)

### Boxplot

In [74]:
#sns.boxplot(x=train["VRDeck"])

### Scatterplot

In [75]:
#sns.scatterplot(x="Age", y="HomePlanet", data=train, hue="Transported")
#plt.xticks(rotation=45)
#plt.show()

### Pairplot

In [76]:
#sns.pairplot(train)