In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic-passenger-data/README.md
/kaggle/input/spaceship-titanic-passenger-data/data/sample_submission.csv
/kaggle/input/spaceship-titanic-passenger-data/data/train.csv
/kaggle/input/spaceship-titanic-passenger-data/data/test.csv


<h3>Import Libraries</h3>

In [2]:
import warnings
import itertools
import seaborn as sns
import scipy.stats as stats
from scipy.stats import pointbiserialr, chi2_contingency
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.impute import KNNImputer

<h3>Get Data</h3>

In [3]:
train_data = pd.read_csv("/kaggle/input/spaceship-titanic-passenger-data/data/train.csv")
test_data = pd.read_csv("/kaggle/input/spaceship-titanic-passenger-data/data/test.csv")
test_data['Transported'] = False
data = pd.concat([train_data, test_data], sort=False)
data.drop(columns=['Name'], inplace=True) # usually an unique identifier, so not important

# Turn 'VIP' and 'CryoSleep' features into booleans (pandas sometimes reads them as strings)
data['VIP'] = data['VIP'].astype(bool)
data['CryoSleep'] = data['CryoSleep'].astype(bool)
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12970 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12682 non-null  object 
 2   CryoSleep     12970 non-null  bool   
 3   Cabin         12671 non-null  object 
 4   Destination   12696 non-null  object 
 5   Age           12700 non-null  float64
 6   VIP           12970 non-null  bool   
 7   RoomService   12707 non-null  float64
 8   FoodCourt     12681 non-null  float64
 9   ShoppingMall  12664 non-null  float64
 10  Spa           12686 non-null  float64
 11  VRDeck        12702 non-null  float64
 12  Transported   12970 non-null  bool   
dtypes: bool(3), float64(6), object(4)
memory usage: 1.1+ MB


In [5]:
data.isna().sum()

PassengerId       0
HomePlanet      288
CryoSleep         0
Cabin           299
Destination     274
Age             270
VIP               0
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Transported       0
dtype: int64

<h5>We can see that for some of the features, there are values that are missing (null values). Therefore, we must take action by filling in those missing values. However, firstly, we want to do something about the 'PassengerId' and 'Cabin' features, since we can split the their values into more columns.</h5>

<h3>Feature Creation</h3>

<h5>First, we want to extract the group IDs from the passenger IDs since passengers can be part of the same group, and people of the same group may be connected if they were transported or not.</h5>

In [6]:
data[['PassengerNumber', 'GroupId']] = data['PassengerId'].str.split('_', expand=True)
data.drop(columns=['PassengerId'], inplace=True)
data['PassengerNumber'] = data['PassengerNumber'].astype(int)
data['GroupId'] = data['GroupId'].astype(int)
data.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,PassengerNumber,GroupId
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,1
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,2,1
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,3,1
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,3,2
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,4,1


<h5>Now, I want to split the 'Cabin' column into three separate columns to split the deck, num, and side into their own columns to better process the data. </h5>

In [7]:
data[['Deck', 'Num', 'Side']] = data['Cabin'].str.split('/', expand=True)
data.drop(columns=['Cabin'], inplace=True)
data['Num'] = data['Num'].fillna(-1).astype(int)
data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,PassengerNumber,GroupId,Deck,Num,Side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,1,B,0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,2,1,F,0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,3,1,A,0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,3,2,A,0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,4,1,F,1,S


<h3>Handling Missing Values</h3>

In [8]:
impute_cols = ['CryoSleep', 'Age','VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'PassengerNumber', 'GroupId', 'Num', 'Transported']
non_impute_cols = list(set(data.columns) - set(impute_cols))

In [9]:
# Handle Missing Values for Numerical Features
imputer = KNNImputer(n_neighbors=5)
data_imputed = imputer.fit_transform(data[impute_cols])
data_imputed = pd.DataFrame(data_imputed, columns=impute_cols)
data_rest = data[non_impute_cols]
data = pd.concat([data_rest.reset_index(drop=True), data_imputed.reset_index(drop=True)],axis=1)

In [10]:
# Handle Missing Values for Categorical Features
for col in non_impute_cols:
    data[col] = data[col].fillna('U')

In [11]:
data.isna().sum()

HomePlanet         0
Side               0
Destination        0
Deck               0
CryoSleep          0
Age                0
VIP                0
RoomService        0
FoodCourt          0
ShoppingMall       0
Spa                0
VRDeck             0
PassengerNumber    0
GroupId            0
Num                0
Transported        0
dtype: int64

<h3>Feature Engineering</h3>

<h5>Now, we have multiple columns for the expenses made for each type of amenity. So, we can make a column for the average expenses, std expenses, and the total amount of expenses per passenger. </h5>

In [12]:
expenses = ['RoomService', 'FoodCourt', 'VRDeck', 'Spa', 'ShoppingMall']
data['TotalExpenses'] = data[expenses].sum(axis=1)
data['TotalExpenses_std'] = data[expenses].std(axis=1)
data['TotalExpenses_avg'] = data[expenses].mean(axis=1)
data.head()

Unnamed: 0,HomePlanet,Side,Destination,Deck,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,PassengerNumber,GroupId,Num,Transported,TotalExpenses,TotalExpenses_std,TotalExpenses_avg
0,Europa,P,TRAPPIST-1e,B,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,Earth,S,TRAPPIST-1e,F,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,2.0,1.0,0.0,1.0,736.0,227.807375,147.2
2,Europa,S,TRAPPIST-1e,A,0.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,3.0,1.0,0.0,0.0,10383.0,3013.383198,2076.6
3,Europa,S,TRAPPIST-1e,A,0.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,3.0,2.0,0.0,0.0,5176.0,1373.410427,1035.2
4,Earth,S,TRAPPIST-1e,F,0.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,4.0,1.0,1.0,1.0,1091.0,223.988169,218.2


<h5>Now, we want all of our categorical features to be numerical so we will do one-hot encoding.</h5>

In [13]:
data = pd.get_dummies(data, columns=non_impute_cols)
data.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,PassengerNumber,GroupId,...,Destination_U,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_U
0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,False,False,True,False,False,False,False,False,False,False
1,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,2.0,1.0,...,False,False,False,False,False,False,True,False,False,False
2,0.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,3.0,1.0,...,False,True,False,False,False,False,False,False,False,False
3,0.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,3.0,2.0,...,False,True,False,False,False,False,False,False,False,False
4,0.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,4.0,1.0,...,False,False,False,False,False,False,True,False,False,False


In [14]:
data.columns

Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'PassengerNumber', 'GroupId', 'Num', 'Transported',
       'TotalExpenses', 'TotalExpenses_std', 'TotalExpenses_avg',
       'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'HomePlanet_U', 'Side_P', 'Side_S', 'Side_U', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'Destination_U',
       'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G',
       'Deck_T', 'Deck_U'],
      dtype='object')

In [15]:
data.corr()['Transported'].sort_values(ascending=False)

Transported                  1.000000
CryoSleep                    0.315576
HomePlanet_Europa            0.131977
Deck_B                       0.107559
Destination_55 Cancri e      0.083625
Deck_C                       0.079540
Side_S                       0.073741
GroupId                      0.051881
FoodCourt                    0.035562
Deck_G                       0.016048
PassengerNumber              0.014628
Deck_A                       0.007513
HomePlanet_U                 0.006403
HomePlanet_Mars              0.005643
ShoppingMall                 0.005371
Destination_PSO J318.5-22    0.000760
Destination_U               -0.000554
Deck_U                      -0.001007
Side_U                      -0.001007
VIP                         -0.009605
Deck_T                      -0.015196
Deck_D                      -0.025659
Num                         -0.035240
Age                         -0.050147
Deck_F                      -0.070171
Deck_E                      -0.071961
Destination_

In [16]:
data['4_top_corr'] = data['CryoSleep'] + data['HomePlanet_Europa'] + data['Deck_B'] + data['Destination_55 Cancri e']
data.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,PassengerNumber,GroupId,...,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_U,4_top_corr
0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,False,True,False,False,False,False,False,False,False,2.0
1,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,2.0,1.0,...,False,False,False,False,False,True,False,False,False,0.0
2,0.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,3.0,1.0,...,True,False,False,False,False,False,False,False,False,1.0
3,0.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,3.0,2.0,...,True,False,False,False,False,False,False,False,False,1.0
4,0.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,4.0,1.0,...,False,False,False,False,False,True,False,False,False,0.0


In [17]:
data.corr()['Transported'].sort_values(ascending=False)

Transported                  1.000000
CryoSleep                    0.315576
4_top_corr                   0.268638
HomePlanet_Europa            0.131977
Deck_B                       0.107559
Destination_55 Cancri e      0.083625
Deck_C                       0.079540
Side_S                       0.073741
GroupId                      0.051881
FoodCourt                    0.035562
Deck_G                       0.016048
PassengerNumber              0.014628
Deck_A                       0.007513
HomePlanet_U                 0.006403
HomePlanet_Mars              0.005643
ShoppingMall                 0.005371
Destination_PSO J318.5-22    0.000760
Destination_U               -0.000554
Side_U                      -0.001007
Deck_U                      -0.001007
VIP                         -0.009605
Deck_T                      -0.015196
Deck_D                      -0.025659
Num                         -0.035240
Age                         -0.050147
Deck_F                      -0.070171
Deck_E      

<h3>Model Training</h3>

In [18]:
data_train, data_test = data[:train_data.shape[0]].copy(), data[train_data.shape[0]:].copy()
data_test.drop(columns=['Transported'], inplace=True)

In [19]:
models = {
    "Random Forest" : RandomForestClassifier(n_estimators=100,
                                            random_state=42,
                                            n_jobs=-1),
    "XGB" : XGBClassifier(random_state=42),
    "Gradient Boosting" : GradientBoostingClassifier(random_state=42)
}

In [20]:
X_train = data_train.drop(columns='Transported')
y_train = data_train['Transported']

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print(scores)
    print(f"{name}: {scores.mean(): .4f} (+/-{scores.std() * 2: .4f}")

[0.73260495 0.75503163 0.80506038 0.81530495 0.76582278]
Random Forest:  0.7748 (+/- 0.0620
[0.61012076 0.68717654 0.63829787 0.74856157 0.7146145 ]
XGB:  0.6798 (+/- 0.1003
[0.53939045 0.70845313 0.78435883 0.79746835 0.66398159]
Gradient Boosting:  0.6987 (+/- 0.1871


<h3>Model Testing</h3>

In [21]:
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)
pred = model.predict(data_test)

In [22]:
final = pd.DataFrame()
final['PassengerId'] = test_data['PassengerId']
final['Transported'] = pred.astype(bool)

final.to_csv('submission.csv', index=False)

<h3>Trying out New things to increase score</h3>

In [23]:
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder

# X_train = train_data.drop('Transported', axis=1)
# y_train = train_data['Transported']

# X_test = test_data.copy(deep=True)
# num_cols = []
# cat_cols = []


# for col in X_train.columns:
#     if col != 'Transported':
#         if train_data[col].dtype == 'object':
#             cat_cols.append(col)
#         else:
#             num_cols.append(col)

# numeric_pipeline = Pipeline(steps=[
#     ('imputer', KNNImputer(strategy='mean')),
#     ('scaler', StandardScaler())
# ])

# categorical_pipeline = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder())
# ])

# preprocessor = ColumnTransformer(transformers=[
#     ('num', numeric_pipeline, num_cols),
#     ('cat', categorical_pipeline, cat_cols)
# ])

# X_train_preprocessed = preprocessor.fit_transform(X_train)
# X_test_preprocessed = preprocessor.transform(X_test)

<h3>Outlier and Noise Handling</h3>

<h5>We will use the interquartile range to determine the outlier count.</h5>

In [24]:
# def detect_outliers_iqr(df):
#     outlier_flags = {}
#     for col in df.select_dtypes(include='number').columns:
#         Q1 = df[col].quantile(0.25)
#         Q3 = df[col].quantile(0.75)
#         IQR = Q3 - Q1
#         lower = Q1 - 1.5 * IQR
#         upper = Q3 + 1.5 * IQR
#         outliers = df[(df[col] < lower) | (df[col] > upper)]
#         outlier_flags[col] = len(outliers)
#     return pd.Series(outlier_flags).sort_values(ascending=False)

In [25]:
# outliers = detect_outliers_iqr(train_data_new)
# outliers

<h3>Model Training</h3>

In [26]:
# train_data_scaled['Transported'].value_counts().plot.bar()

In [27]:
# print("====== Cross Validation for Dataset with Transformed Outliers ======")
# print()

# columns_to_test = ['PassengerNumber', 'VIP', 'Num', 'Age', 'GroupId']

# # go through each classifier and do cross validation on them
# for name, clf in models.items():
#     print(f"******** {name} *********")
#     print()

#     results = []

#     # no columns removed
#     baseline_acc = cross_val_score(clf, X_scaled, y, cv=5, scoring='accuracy', n_jobs=-1)
#     results.append({
#         'dropped_cols': (),
#         'mean_acc': baseline_acc.mean(),
#         'std_acc': baseline_acc.std()
#     })

#     # Find every subset of the columns that need further testing and remove them from
#     # dataset to see if any has an affect on the results
#     for r in range(1, len(columns_to_test) + 1):
#         for subset in itertools.combinations(columns_to_test, r):
                
#             X_train = X_scaled.drop(list(subset), axis=1)
        
#             cv_acc = cross_val_score(clf, X_train, y, cv=5, scoring='accuracy', n_jobs=-1)

#             results.append({
#                 'dropped_cols': subset,
#                 'mean_acc': cv_acc.mean(),
#                 'std_acc': cv_acc.std()
#             })

#     results_df = pd.DataFrame(results).sort_values('mean_acc', ascending=False)
#     print(results_df)
#     print()