In [18]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

<br>
<br>
<br>

### Data collection

In [2]:
# load the feature-engineered train and test splits
train_data = pd.read_csv(filepath_or_buffer="../data/train_engineered.csv")
test_data = pd.read_csv(filepath_or_buffer="../data/test_engineered.csv")

In [3]:
# preview the first five rows of the training frame
train_data.head(n=5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,AgeCategory,GroupCategory,NumInGroup,Deck,Side,CabinNumberCategory,IsMale,Transported
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,adult,a,1,B,P,a,1,False
1,Earth,0,TRAPPIST-1e,24.0,0,4.70048,2.302585,3.258097,6.309918,3.806662,adult,a,1,F,S,a,0,True
2,Europa,0,TRAPPIST-1e,58.0,1,3.78419,8.18228,0.0,8.812248,3.912023,old,a,1,A,S,a,1,False
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,7.157735,5.918894,8.110728,5.267858,adult,a,2,A,S,a,1,False
4,Earth,0,TRAPPIST-1e,16.0,0,5.717028,4.26268,5.023881,6.338594,1.098612,child,a,1,F,S,a,0,True


In [4]:
# preview the first five rows of the test frame
test_data.head(n=5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,AgeCategory,GroupCategory,NumInGroup,Deck,Side,CabinNumberCategory,IsMale
0,Earth,1,TRAPPIST-1e,27.0,0,0.0,0.0,0.0,0.0,0.0,adult,a,1,G,S,a,0
1,Earth,0,TRAPPIST-1e,19.0,0,0.0,2.302585,0.0,7.94591,0.0,adult,a,1,F,S,a,0
2,Europa,1,55 Cancri e,31.0,0,0.0,0.0,0.0,0.0,0.0,adult,a,1,C,S,a,1
3,Europa,0,TRAPPIST-1e,38.0,0,0.0,8.802823,0.0,5.204007,6.37332,adult,a,1,C,S,a,1
4,Earth,0,TRAPPIST-1e,20.0,0,2.397895,0.0,6.455199,0.0,0.0,adult,a,1,F,S,a,0


In [5]:
# per-column "contains a null?" flags for train, tallied by flag value
train_data.isnull().any(axis=0).value_counts()

False    18
dtype: int64

In [6]:
# per-column "contains a null?" flags for test, tallied by flag value
test_data.isnull().any(axis=0).value_counts()

False    17
dtype: int64

<br>

### train data

In [7]:
# separating the target column (y_train) from the feature columns (x_train)
y_train = train_data["Transported"]
x_train = train_data.drop(columns="Transported")

In [9]:
# preview the feature matrix (the target column should no longer be present)
x_train.head(n=5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,AgeCategory,GroupCategory,NumInGroup,Deck,Side,CabinNumberCategory,IsMale
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,adult,a,1,B,P,a,1
1,Earth,0,TRAPPIST-1e,24.0,0,4.70048,2.302585,3.258097,6.309918,3.806662,adult,a,1,F,S,a,0
2,Europa,0,TRAPPIST-1e,58.0,1,3.78419,8.18228,0.0,8.812248,3.912023,old,a,1,A,S,a,1
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,7.157735,5.918894,8.110728,5.267858,adult,a,2,A,S,a,1
4,Earth,0,TRAPPIST-1e,16.0,0,5.717028,4.26268,5.023881,6.338594,1.098612,child,a,1,F,S,a,0


In [10]:
# preview the target series
y_train.head(n=5)

0    False
1     True
2    False
3    False
4     True
Name: Transported, dtype: bool

In [11]:
# column groups by kind: numerical, categorical and boolean (0/1) features
num_columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'NumInGroup']
cat_columns = ['HomePlanet', 'Destination', 'AgeCategory', 'GroupCategory', 'Deck', 'Side', 'CabinNumberCategory']
bool_columns = ['CryoSleep', 'VIP', 'IsMale']

# independent copies of each group, so later steps never modify x_train itself
num_data = x_train.loc[:, num_columns].copy()
cat_data = x_train.loc[:, cat_columns].copy()
bool_data = x_train.loc[:, bool_columns].copy()

In [12]:
# preview the numerical feature subset
num_data.head(n=5)

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,NumInGroup
0,39.0,0.0,0.0,0.0,0.0,0.0,1
1,24.0,4.70048,2.302585,3.258097,6.309918,3.806662,1
2,58.0,3.78419,8.18228,0.0,8.812248,3.912023,1
3,33.0,0.0,7.157735,5.918894,8.110728,5.267858,2
4,16.0,5.717028,4.26268,5.023881,6.338594,1.098612,1


In [13]:
# preview the categorical feature subset
cat_data.head(n=5)

Unnamed: 0,HomePlanet,Destination,AgeCategory,GroupCategory,Deck,Side,CabinNumberCategory
0,Europa,TRAPPIST-1e,adult,a,B,P,a
1,Earth,TRAPPIST-1e,adult,a,F,S,a
2,Europa,TRAPPIST-1e,old,a,A,S,a
3,Europa,TRAPPIST-1e,adult,a,A,S,a
4,Earth,TRAPPIST-1e,child,a,F,S,a


In [14]:
# preview the boolean feature subset
bool_data.head(n=5)

Unnamed: 0,CryoSleep,VIP,IsMale
0,0,0,1
1,0,0,0
2,0,1,1
3,0,0,1
4,0,0,0


<br>

#### numerical data

In [15]:
# scaler for the numerical features; held in its own variable because the
# test-set cell further down calls scaler.transform with the same instance
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [17]:
# standardizing numerical data: fit the scaler on the training numerics and
# wrap the returned array back into a frame with the original labels
num_data_scaled = pd.DataFrame(
    data=scaler.fit_transform(num_data),
    index=num_data.index,
    columns=num_data.columns,
)
num_data_scaled.head(n=5)

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,NumInGroup
0,0.711945,-0.655221,-0.667333,-0.640175,-0.683054,-0.657316,-0.491161
1,-0.334037,1.056551,0.112472,0.611179,1.574204,0.716881,-0.491161
2,2.036857,0.722866,2.10372,-0.640175,2.469368,0.754916,-0.491161
3,0.293552,-0.655221,1.756743,1.633126,2.218411,1.244369,0.457443
4,-0.891895,1.426747,0.776288,1.289373,1.584463,-0.260719,-0.491161


<br>

#### categorical data

In [21]:
# one-hot encode every categorical column (new columns are named "<column>_<level>")
cat_data_encoded = pd.get_dummies(data=cat_data, prefix_sep="_")
cat_data_encoded.head(n=5)

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,AgeCategory_adult,AgeCategory_child,AgeCategory_old,GroupCategory_a,...,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S,CabinNumberCategory_a,CabinNumberCategory_b,CabinNumberCategory_c,CabinNumberCategory_d
0,0,1,0,0,0,1,1,0,0,1,...,0,0,0,0,1,0,1,0,0,0
1,1,0,0,0,0,1,1,0,0,1,...,0,1,0,0,0,1,1,0,0,0
2,0,1,0,0,0,1,0,0,1,1,...,0,0,0,0,0,1,1,0,0,0
3,0,1,0,0,0,1,1,0,0,1,...,0,0,0,0,0,1,1,0,0,0
4,1,0,0,0,0,1,0,1,0,1,...,0,1,0,0,0,1,1,0,0,0


In [22]:
# full list of dummy columns produced by the one-hot encoding of cat_data
# (the head() preview above elides the middle columns)
cat_data_encoded.columns

Index(['HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'Destination_55 Cancri e', 'Destination_PSO J318.5-22',
       'Destination_TRAPPIST-1e', 'AgeCategory_adult', 'AgeCategory_child',
       'AgeCategory_old', 'GroupCategory_a', 'GroupCategory_b',
       'GroupCategory_c', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E',
       'Deck_F', 'Deck_G', 'Deck_T', 'Side_P', 'Side_S',
       'CabinNumberCategory_a', 'CabinNumberCategory_b',
       'CabinNumberCategory_c', 'CabinNumberCategory_d'],
      dtype='object')

<br>

#### boolean data

In [23]:
# boolean features again, before verifying their values
bool_data.head(n=5)

Unnamed: 0,CryoSleep,VIP,IsMale
0,0,0,1
1,0,0,0
2,0,1,1
3,0,0,1
4,0,0,0


In [24]:
# count each (CryoSleep, VIP, IsMale) combination; any value other than
# 0 or 1 would show up as an extra level in the resulting index
bool_data.value_counts(sort=True, ascending=False)

CryoSleep  VIP  IsMale
0          0    0         2808
                1         2622
1          0    1         1726
                0         1338
0          1    1          152
                0           23
1          1    1           22
                0            2
dtype: int64

<br>

#### combining datasets

In [30]:
# place the scaled numerics, one-hot categoricals and boolean flags side by
# side in a single training feature frame
x_train_processed = pd.concat(
    objs=[num_data_scaled, cat_data_encoded, bool_data],
    axis="columns",
)
x_train_processed.head(n=5)

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,NumInGroup,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,Deck_T,Side_P,Side_S,CabinNumberCategory_a,CabinNumberCategory_b,CabinNumberCategory_c,CabinNumberCategory_d,CryoSleep,VIP,IsMale
0,0.711945,-0.655221,-0.667333,-0.640175,-0.683054,-0.657316,-0.491161,0,1,0,...,0,1,0,1,0,0,0,0,0,1
1,-0.334037,1.056551,0.112472,0.611179,1.574204,0.716881,-0.491161,1,0,0,...,0,0,1,1,0,0,0,0,0,0
2,2.036857,0.722866,2.10372,-0.640175,2.469368,0.754916,-0.491161,0,1,0,...,0,0,1,1,0,0,0,0,1,1
3,0.293552,-0.655221,1.756743,1.633126,2.218411,1.244369,0.457443,0,1,0,...,0,0,1,1,0,0,0,0,0,1
4,-0.891895,1.426747,0.776288,1.289373,1.584463,-0.260719,-0.491161,1,0,0,...,0,0,1,1,0,0,0,0,0,0


In [31]:
# confirm the concatenation introduced no nulls in any column
x_train_processed.isnull().any(axis=0).value_counts()

False    36
dtype: int64

In [32]:
# preview the final processed training features
x_train_processed.head(n=5)

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,NumInGroup,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,Deck_T,Side_P,Side_S,CabinNumberCategory_a,CabinNumberCategory_b,CabinNumberCategory_c,CabinNumberCategory_d,CryoSleep,VIP,IsMale
0,0.711945,-0.655221,-0.667333,-0.640175,-0.683054,-0.657316,-0.491161,0,1,0,...,0,1,0,1,0,0,0,0,0,1
1,-0.334037,1.056551,0.112472,0.611179,1.574204,0.716881,-0.491161,1,0,0,...,0,0,1,1,0,0,0,0,0,0
2,2.036857,0.722866,2.10372,-0.640175,2.469368,0.754916,-0.491161,0,1,0,...,0,0,1,1,0,0,0,0,1,1
3,0.293552,-0.655221,1.756743,1.633126,2.218411,1.244369,0.457443,0,1,0,...,0,0,1,1,0,0,0,0,0,1
4,-0.891895,1.426747,0.776288,1.289373,1.584463,-0.260719,-0.491161,1,0,0,...,0,0,1,1,0,0,0,0,0,0


<br>

#### target

In [34]:
# inspecting target variable: displays the Transported series (pandas
# truncates the middle rows) together with its length and dtype
y_train

0       False
1        True
2       False
3       False
4        True
        ...  
8688    False
8689    False
8690     True
8691    False
8692     True
Name: Transported, Length: 8693, dtype: bool

In [36]:
# converting (True, False) -> (1, 0)
# y_train has bool dtype, so a vectorized cast does the same job as a per-row
# Python lambda; int64 is named explicitly so the dtype is the same on every
# platform
y_train_processed = y_train.astype("int64")
y_train_processed.head()

0    0
1    1
2    0
3    0
4    1
Name: Transported, dtype: int64

<br>

### test data

In [38]:
# preview the test frame again before preprocessing it
test_data.head(n=5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,AgeCategory,GroupCategory,NumInGroup,Deck,Side,CabinNumberCategory,IsMale
0,Earth,1,TRAPPIST-1e,27.0,0,0.0,0.0,0.0,0.0,0.0,adult,a,1,G,S,a,0
1,Earth,0,TRAPPIST-1e,19.0,0,0.0,2.302585,0.0,7.94591,0.0,adult,a,1,F,S,a,0
2,Europa,1,55 Cancri e,31.0,0,0.0,0.0,0.0,0.0,0.0,adult,a,1,C,S,a,1
3,Europa,0,TRAPPIST-1e,38.0,0,0.0,8.802823,0.0,5.204007,6.37332,adult,a,1,C,S,a,1
4,Earth,0,TRAPPIST-1e,20.0,0,2.397895,0.0,6.455199,0.0,0.0,adult,a,1,F,S,a,0


In [39]:
# separating numerical, categorical and boolean data, using the same column
# groups that were used for the training frame
num_data_test = test_data.loc[:, num_columns].copy()
cat_data_test = test_data.loc[:, cat_columns].copy()
bool_data_test = test_data.loc[:, bool_columns].copy()

In [40]:
# preview the numerical test features
num_data_test.head(n=5)

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,NumInGroup
0,27.0,0.0,0.0,0.0,0.0,0.0,1
1,19.0,0.0,2.302585,0.0,7.94591,0.0,1
2,31.0,0.0,0.0,0.0,0.0,0.0,1
3,38.0,0.0,8.802823,0.0,5.204007,6.37332,1
4,20.0,2.397895,0.0,6.455199,0.0,0.0,1


In [41]:
# preview the categorical test features
cat_data_test.head(n=5)

Unnamed: 0,HomePlanet,Destination,AgeCategory,GroupCategory,Deck,Side,CabinNumberCategory
0,Earth,TRAPPIST-1e,adult,a,G,S,a
1,Earth,TRAPPIST-1e,adult,a,F,S,a
2,Europa,55 Cancri e,adult,a,C,S,a
3,Europa,TRAPPIST-1e,adult,a,C,S,a
4,Earth,TRAPPIST-1e,adult,a,F,S,a


In [42]:
# preview the boolean test features
bool_data_test.head(n=5)

Unnamed: 0,CryoSleep,VIP,IsMale
0,1,0,0
1,0,0,0
2,1,0,1
3,0,0,1
4,0,0,0


<br>

#### numerical data

In [45]:
# scale the test numerics with the already-fitted scaler (transform only, no
# refit) and restore the original row/column labels
num_test_scaled = pd.DataFrame(
    data=scaler.transform(num_data_test),
    index=num_data_test.index,
    columns=num_data_test.columns,
)
num_test_scaled.head(n=5)

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,NumInGroup
0,-0.124841,-0.655221,-0.667333,-0.640175,-0.683054,-0.657316,-0.491161
1,-0.682698,-0.655221,0.112472,-0.640175,2.159451,-0.657316,-0.491161
2,0.154088,-0.655221,-0.667333,-0.640175,-0.683054,-0.657316,-0.491161
3,0.642213,-0.655221,2.313877,-0.640175,1.178585,1.643438,-0.491161
4,-0.612966,0.21802,-0.667333,1.839107,-0.683054,-0.657316,-0.491161


<br>

#### categorical data

In [46]:
# encoding categorical data
# get_dummies derives its columns from the levels present in the frame it is
# given, so encoding test independently of train yields a different column set
# whenever a level occurs in only one split. Reindexing to the training dummy
# columns adds any level missing from test as an all-zero column, drops levels
# the training data never saw, and fixes the column order.
cat_test_encoded = pd.get_dummies(cat_data_test).reindex(
    columns=cat_data_encoded.columns, fill_value=0
)
cat_test_encoded.head()

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,AgeCategory_adult,AgeCategory_child,AgeCategory_old,GroupCategory_a,...,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S,CabinNumberCategory_a,CabinNumberCategory_b,CabinNumberCategory_c,CabinNumberCategory_d
0,1,0,0,0,0,1,1,0,0,1,...,0,0,1,0,0,1,1,0,0,0
1,1,0,0,0,0,1,1,0,0,1,...,0,1,0,0,0,1,1,0,0,0
2,0,1,0,1,0,0,1,0,0,1,...,0,0,0,0,0,1,1,0,0,0
3,0,1,0,0,0,1,1,0,0,1,...,0,0,0,0,0,1,1,0,0,0
4,1,0,0,0,0,1,1,0,0,1,...,0,1,0,0,0,1,1,0,0,0


<br>

#### boolean data

In [48]:
# boolean test features are used as they are; shown only for reference
bool_data_test.head(n=5)

Unnamed: 0,CryoSleep,VIP,IsMale
0,1,0,0
1,0,0,0
2,1,0,1
3,0,0,1
4,0,0,0


<br>

#### combining datasets

In [49]:
# place the scaled numerics, one-hot categoricals and boolean flags side by
# side in a single test feature frame
x_test_processed = pd.concat(
    objs=[num_test_scaled, cat_test_encoded, bool_data_test],
    axis="columns",
)
x_test_processed.head(n=5)

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,NumInGroup,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,Deck_T,Side_P,Side_S,CabinNumberCategory_a,CabinNumberCategory_b,CabinNumberCategory_c,CabinNumberCategory_d,CryoSleep,VIP,IsMale
0,-0.124841,-0.655221,-0.667333,-0.640175,-0.683054,-0.657316,-0.491161,1,0,0,...,0,0,1,1,0,0,0,1,0,0
1,-0.682698,-0.655221,0.112472,-0.640175,2.159451,-0.657316,-0.491161,1,0,0,...,0,0,1,1,0,0,0,0,0,0
2,0.154088,-0.655221,-0.667333,-0.640175,-0.683054,-0.657316,-0.491161,0,1,0,...,0,0,1,1,0,0,0,1,0,1
3,0.642213,-0.655221,2.313877,-0.640175,1.178585,1.643438,-0.491161,0,1,0,...,0,0,1,1,0,0,0,0,0,1
4,-0.612966,0.21802,-0.667333,1.839107,-0.683054,-0.657316,-0.491161,1,0,0,...,0,0,1,1,0,0,0,0,0,0


In [50]:
# making sure columns match x_train_processed, in the same order
# Plain label selection raises KeyError as soon as one training column (e.g. a
# dummy for a category level absent from the test split) does not exist in
# x_test_processed. reindex yields the same result when every column is
# present, and otherwise creates the missing column filled with 0.
x_test_processed = x_test_processed.reindex(
    columns=x_train_processed.columns, fill_value=0
)
x_test_processed.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,NumInGroup,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,Deck_T,Side_P,Side_S,CabinNumberCategory_a,CabinNumberCategory_b,CabinNumberCategory_c,CabinNumberCategory_d,CryoSleep,VIP,IsMale
0,-0.124841,-0.655221,-0.667333,-0.640175,-0.683054,-0.657316,-0.491161,1,0,0,...,0,0,1,1,0,0,0,1,0,0
1,-0.682698,-0.655221,0.112472,-0.640175,2.159451,-0.657316,-0.491161,1,0,0,...,0,0,1,1,0,0,0,0,0,0
2,0.154088,-0.655221,-0.667333,-0.640175,-0.683054,-0.657316,-0.491161,0,1,0,...,0,0,1,1,0,0,0,1,0,1
3,0.642213,-0.655221,2.313877,-0.640175,1.178585,1.643438,-0.491161,0,1,0,...,0,0,1,1,0,0,0,0,0,1
4,-0.612966,0.21802,-0.667333,1.839107,-0.683054,-0.657316,-0.491161,1,0,0,...,0,0,1,1,0,0,0,0,0,0


<br>
<br>

### Saving datasets

In [51]:
# write the processed training features to disk, without the row index
x_train_processed.to_csv(path_or_buf="../data/x_train_preprocessed.csv", index=False)

In [52]:
# write the processed test features to disk, without the row index
x_test_processed.to_csv(path_or_buf="../data/x_test_preprocessed.csv", index=False)

In [53]:
# write the 0/1 training target to disk, without the row index
y_train_processed.to_csv(path_or_buf="../data/y_train_preprocessed.csv", index=False)