In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [3]:
train_process_df = pd.read_csv("../intermediate_storage/spaceship_titanic_preprocessed.csv")


#### Feature Selection

In [5]:
features = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 
          'FoodCourt', 'ShoppingMall', 'VRDeck', 'GroupId', 'GroupSize', 'IsAlone', 'Deck', 'Num', 'Side']

In [6]:
X = train_process_df[features]

In [7]:
y = train_process_df['Transported']


In [8]:
X

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,VRDeck,GroupId,GroupSize,IsAlone,Deck,Num,Side
0,Europa,False,TRAPPIST-1e,39.0,0.0,0.0,0.0,0.0,0.0,1,1,1,B,0,P
1,Earth,False,TRAPPIST-1e,24.0,0.0,109.0,9.0,25.0,44.0,2,1,1,F,0,S
2,Europa,False,TRAPPIST-1e,58.0,1.0,43.0,3576.0,0.0,49.0,3,2,0,A,0,S
3,Europa,False,TRAPPIST-1e,33.0,0.0,0.0,1283.0,371.0,193.0,3,2,0,A,0,S
4,Earth,False,TRAPPIST-1e,16.0,0.0,303.0,70.0,151.0,2.0,4,1,1,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8492,Europa,False,55 Cancri e,41.0,1.0,0.0,6819.0,0.0,74.0,9276,1,1,A,98,P
8493,Earth,True,PSO J318.5-22,18.0,0.0,0.0,0.0,0.0,0.0,9278,1,1,G,1499,S
8494,Earth,False,TRAPPIST-1e,26.0,0.0,0.0,0.0,1872.0,0.0,9279,1,1,G,1500,S
8495,Europa,False,55 Cancri e,32.0,0.0,0.0,1049.0,0.0,3235.0,9280,2,0,E,608,S


In [9]:
# Nomial Features

In [10]:
nominal_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Side', 'Deck']
X_nominals = train_process_df[nominal_cols].astype('category').reset_index(drop=True)

In [12]:
# Encoding
from sklearn.preprocessing import OneHotEncoder
oh_encoder = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse=False, drop='first')

In [13]:
 
#oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False, drop='first')

X_nominals_encoded = oh_encoder.fit_transform(X_nominals)
X_nominals_encoded = pd.DataFrame\
    (X_nominals_encoded, columns=oh_encoder.get_feature_names_out())

In [14]:
X_nominals_encoded

Unnamed: 0,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_1.0,Side_S,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8492,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8493,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8494,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8495,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [15]:
# Ordinal

In [16]:
X_ordinals = train_process_df["GroupSize"].astype('category').reset_index(drop=True)

In [26]:
from sklearn.preprocessing import OrdinalEncoder
ord_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)

X_ordinals_encoded = ord_encoder.fit_transform(X_ordinals.values.reshape(-1, 1))
X_ordinals_encoded = pd.DataFrame(X_ordinals_encoded, columns=ord_encoder.get_feature_names_out())

In [50]:
X_ordinals_encoded.rename(columns={"x0": "GroupSize"}, inplace=True)

In [51]:
X_ordinals_encoded

Unnamed: 0,GroupSize
0,0.0
1,0.0
2,1.0
3,1.0
4,0.0
...,...
8492,0.0
8493,0.0
8494,0.0
8495,1.0


In [33]:
# Quantitative Features

In [34]:
cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'VRDeck', "Age"]

In [35]:
X_continuous = train_process_df[cols].reset_index(drop=True)

In [36]:
## Encoding 

In [37]:
from sklearn.preprocessing import KBinsDiscretizer


In [39]:
kbins_encoder = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
X_continuous_encoded = kbins_encoder.fit_transform(X_continuous)
X_continuous_encoded = pd.DataFrame(X_continuous_encoded, columns=X_continuous.columns)



In [40]:
X_continuous_encoded

Unnamed: 0,RoomService,FoodCourt,ShoppingMall,VRDeck,Age
0,0.0,0.0,0.0,0.0,3.0
1,0.0,0.0,0.0,0.0,2.0
2,0.0,1.0,0.0,0.0,4.0
3,0.0,1.0,1.0,1.0,3.0
4,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
8492,0.0,1.0,0.0,0.0,4.0
8493,0.0,0.0,0.0,0.0,1.0
8494,0.0,0.0,1.0,0.0,2.0
8495,0.0,1.0,0.0,1.0,3.0


In [41]:
## Target Feature

In [44]:
train_process_df

Unnamed: 0.1,Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,...,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone,Deck,Num,Side
0,0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,0.0,0.0,0.0,...,0.0,0.0,Maham Ofracculy,False,1,1,1,B,0,P
1,1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,0.0,109.0,9.0,...,549.0,44.0,Juanna Vines,True,2,1,1,F,0,S
2,2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,1.0,43.0,3576.0,...,6715.0,49.0,Altark Susent,False,3,2,0,A,0,S
3,3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,0.0,0.0,1283.0,...,3329.0,193.0,Solam Susent,False,3,2,0,A,0,S
4,4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,0.0,303.0,70.0,...,565.0,2.0,Willy Santantines,True,4,1,1,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8492,8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,1.0,0.0,6819.0,...,1643.0,74.0,Gravior Noxnuther,False,9276,1,1,A,98,P
8493,8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,0.0,0.0,0.0,...,0.0,0.0,Kurta Mondalley,False,9278,1,1,G,1499,S
8494,8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,0.0,0.0,0.0,...,1.0,0.0,Fayey Connon,True,9279,1,1,G,1500,S
8495,8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,0.0,0.0,1049.0,...,353.0,3235.0,Celeon Hontichre,False,9280,2,0,E,608,S


In [46]:
from sklearn.preprocessing import LabelEncoder
lb_encoder = LabelEncoder()

y_encoded = lb_encoder.fit_transform(train_process_df['Transported'])
y_encoded = pd.Series(y_encoded, name='Transported')

In [52]:
X_encoded = pd.concat([X_nominals_encoded, X_ordinals_encoded, X_continuous, y_encoded], axis=1)

In [53]:
X_encoded

Unnamed: 0,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_1.0,Side_S,Deck_B,Deck_C,Deck_D,...,Deck_F,Deck_G,Deck_T,GroupSize,RoomService,FoodCourt,ShoppingMall,VRDeck,Age,Transported
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39.0,0
1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,109.0,9.0,25.0,44.0,24.0,1
2,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,43.0,3576.0,0.0,49.0,58.0,0
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1283.0,371.0,193.0,33.0,0
4,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,303.0,70.0,151.0,2.0,16.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8492,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,6819.0,0.0,74.0,41.0,0
8493,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0
8494,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1872.0,0.0,26.0,1
8495,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1049.0,0.0,3235.0,32.0,0


In [54]:
X_encoded.to_csv("../intermediate_storage/titanic_features.csv")