In [1]:
# general imports
import pandas as pd

<br>
<br>
<br>

Data collection

In [20]:
# Reading the prepared dataset from disk and previewing its first rows
data = pd.read_csv(filepath_or_buffer="../data/data_prepared.csv")
data.head()

Unnamed: 0,home_planet,cryo_sleep,destination,age,vip,room_service,food_court,shopping_mall,spa,vr_deck,group,number_in_group,deck,num_in_cabin,side,total_spending,age_category,gender
0,Mars,False,TRAPPIST-1e,31.0,False,1226.0,0.0,1.0,0.0,0.0,1510,1,F,291,S,1227.0,middle_adult,1.0
1,Europa,False,TRAPPIST-1e,26.0,False,0.0,896.0,0.0,690.0,1.0,7253,1,D,225,P,1587.0,young_adult,1.0
2,Earth,True,TRAPPIST-1e,24.0,False,0.0,0.0,0.0,0.0,0.0,4714,1,G,765,P,0.0,young_adult,1.0
3,Earth,False,55 Cancri e,33.0,False,0.0,0.0,0.0,436.0,224.0,7727,1,E,507,S,660.0,middle_adult,0.0
4,Mars,False,TRAPPIST-1e,21.0,False,1097.0,0.0,80.0,589.0,0.0,3237,1,D,104,P,1766.0,young_adult,1.0


<br>
<br>
<br>

Data preprocessing

- encoding categorical values
- transforming boolean values
- scaling numerical values

In [16]:
# for data scaling
from sklearn.preprocessing import RobustScaler

In [21]:
# Separating categorical and boolean data
boolean_attributes = ["cryo_sleep", "vip"]
categorical_attributes = ["home_planet", "destination", "deck", "side", "age_category"]

# .copy() yields independent frames, so later edits cannot touch `data`
bool_data = data.loc[:, boolean_attributes].copy()
cat_data = data.loc[:, categorical_attributes].copy()

In [22]:
# One-hot encoding the categorical data: every category value becomes its own
# 0/1 indicator column named "<attribute>_<value>".
# The dtype is pinned because the default indicator dtype depends on the pandas
# version (uint8 before 2.0, bool from 2.0 on); uint8 keeps the columns numeric
# and reproduces the uint8 columns reported by the dtype check later in this
# notebook.
cat_data_encoded = pd.get_dummies(cat_data, dtype="uint8")
cat_data_encoded.head()

Unnamed: 0,home_planet_Earth,home_planet_Europa,home_planet_Mars,destination_55 Cancri e,destination_PSO J318.5-22,destination_TRAPPIST-1e,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_T,side_P,side_S,age_category_child,age_category_middle_adult,age_category_old_adult,age_category_young_adult
0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0
1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1
2,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1
3,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0
4,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1


In [23]:
# Transforming boolean data: each boolean attribute is mapped element by
# element to 1 (truthy value) or 0 (falsy value)
bool_data_encoded = pd.DataFrame(
    {
        attribute: bool_data[attribute].map(lambda flag: 1 if flag else 0)
        for attribute in boolean_attributes
    }
)

bool_data_encoded.head()

Unnamed: 0,cryo_sleep,vip
0,0,0
1,0,0
2,1,0
3,0,0
4,0,0


In [24]:
# Separating numerical data
num_attributes = [
    "age",
    "room_service",
    "food_court",
    "shopping_mall",
    "spa",
    "vr_deck",
    "group",
    "number_in_group",
    "num_in_cabin",
    "total_spending",
]

# independent copy of just the numerical columns
num_data = data.loc[:, num_attributes].copy()

In [26]:
# Scaling numerical data
scaler = RobustScaler()

# fit_transform returns a plain NumPy array, so the result is wrapped back into
# a DataFrame in the same step (one name, one kind of object).
# Reusing num_data's own index keeps every scaled row aligned with the other
# frames when they are joined with pd.concat(..., axis=1), which matches rows
# by index label; without it the frame would get a fresh 0..n-1 index that only
# lines up while `data` itself has that default index.
num_data_scaled = pd.DataFrame(
    scaler.fit_transform(num_data),
    columns=num_attributes,
    index=num_data.index,
)

In [29]:
# previewing the first five scaled rows
num_data_scaled.head(n=5)

Unnamed: 0,age,room_service,food_court,shopping_mall,spa,vr_deck,group,number_in_group,num_in_cabin,total_spending
0,0.235294,31.435897,0.0,0.045455,0.0,0.0,-0.689033,0.0,-0.144231,0.35343
1,-0.058824,0.0,12.892086,0.0,11.896552,0.024691,0.574414,0.0,-0.223558,0.602911
2,-0.176471,0.0,0.0,0.0,0.0,0.0,0.01584,0.0,0.425481,-0.496881
3,0.352941,0.0,0.0,0.0,7.517241,5.530864,0.678693,0.0,0.115385,-0.039501
4,-0.352941,28.128205,0.0,3.636364,10.155172,0.0,-0.309097,0.0,-0.36899,0.726958


<br>

In [30]:
# Combining the scaled numerical, one-hot encoded and boolean features with
# the gender column into a single dataframe (placed side by side, column-wise)
data_preprocessed = pd.concat(
    objs=[num_data_scaled, cat_data_encoded, bool_data_encoded, data["gender"]],
    axis="columns",
)

data_preprocessed.head()

Unnamed: 0,age,room_service,food_court,shopping_mall,spa,vr_deck,group,number_in_group,num_in_cabin,total_spending,...,deck_T,side_P,side_S,age_category_child,age_category_middle_adult,age_category_old_adult,age_category_young_adult,cryo_sleep,vip,gender
0,0.235294,31.435897,0.0,0.045455,0.0,0.0,-0.689033,0.0,-0.144231,0.35343,...,0,0,1,0,1,0,0,0,0,1.0
1,-0.058824,0.0,12.892086,0.0,11.896552,0.024691,0.574414,0.0,-0.223558,0.602911,...,0,1,0,0,0,0,1,0,0,1.0
2,-0.176471,0.0,0.0,0.0,0.0,0.0,0.01584,0.0,0.425481,-0.496881,...,0,1,0,0,0,0,1,1,0,1.0
3,0.352941,0.0,0.0,0.0,7.517241,5.530864,0.678693,0.0,0.115385,-0.039501,...,0,0,1,0,1,0,0,0,0,0.0
4,-0.352941,28.128205,0.0,3.636364,10.155172,0.0,-0.309097,0.0,-0.36899,0.726958,...,0,1,0,0,0,0,1,0,0,1.0


In [31]:
# Shape of the combined data as (number of rows, number of columns)
data_preprocessed.shape

(7823, 33)

In [33]:
# Checking for null values: flags each column that holds at least one missing
# entry, then counts how many columns are flagged (True) or clean (False)
data_preprocessed.isnull().any(axis=0).value_counts()

False    33
dtype: int64

In [35]:
# Checking for non-numerical datatypes: counts how many columns carry each
# dtype; an `object` entry here would point to a column that is still
# non-numerical
data_preprocessed.dtypes.value_counts()

uint8      20
float64    11
int64       2
dtype: int64