In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
from sklearn.model_selection import train_test_split

In [2]:
# reading data
data = pd.read_csv("../data/train.csv")

# changing column names (assigned by position, so the list order must match the CSV)
new_columns = [
    "passenger_id",
    "home_planet",
    "cryo_sleep",
    "cabin",
    "destination",
    "age",
    "vip",
    "room_service",
    "food_court",
    "shopping_mall",
    "spa",
    "vr_deck",
    "name",
    "transported",
]
data.columns = new_columns

# separating features (X) from the target column (y)
X = data.drop(columns=["transported"])
y = data["transported"]

In [3]:
# preview the first rows of the feature frame
X.head()

Unnamed: 0,passenger_id,home_planet,cryo_sleep,cabin,destination,age,vip,room_service,food_court,shopping_mall,spa,vr_deck,name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines


In [4]:
# preview the first values of the target column
y.head()

0    False
1     True
2    False
3    False
4     True
Name: transported, dtype: bool

In [5]:
# splitting data into training and testing set:
# 10% of the rows are held out, and the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [6]:
# sanity check: shape of each split, in the order x_train, x_test, y_train, y_test
for frame in (x_train, x_test, y_train, y_test):
    print(frame.shape)

(7823, 13)
(870, 13)
(7823,)
(870,)


<br>
<br>
<br>

In [7]:
# passenger_id
passenger_id = x_train.passenger_id

# separating group and number in group: ids look like "0003_02",
# i.e. "<group>_<number in group>", and both parts are stored as integers
id_parts = passenger_id.str.split("_")
group: pd.Series = id_parts.str[0].astype("int64")
number_in_group: pd.Series = id_parts.str[1].astype("int64")

In [8]:
# home_planet
home_planet = x_train.home_planet

# filling 201 null values with mode
# fillna returns a new Series, so x_train is not modified behind the scenes
# (an in-place fill on a column taken from a frame can raise
# SettingWithCopyWarning and behaves differently under pandas Copy-on-Write)
home_planet = home_planet.fillna(value=home_planet.mode()[0])

In [9]:
# cryo_sleep
cryo_sleep = x_train.cryo_sleep

# filling 217 null values with mode
# fillna returns a new Series, so x_train is not modified behind the scenes
# (an in-place fill on a column taken from a frame can raise
# SettingWithCopyWarning and behaves differently under pandas Copy-on-Write)
cryo_sleep = cryo_sleep.fillna(value=cryo_sleep.mode()[0])

In [10]:
# cabin
cabin = x_train.cabin

# separating deck, num and side: a cabin is written "deck/num/side" (e.g. "B/0/P");
# rows with a missing cabin stay NaN in all three parts
cabin_parts = cabin.str.split("/")
deck = cabin_parts.str[0]
num_in_cabin = cabin_parts.str[1]
side = cabin_parts.str[2]

# filling 199 null values
deck = deck.fillna(value=deck.mode()[0])
# the cabin number is parsed to an integer so the column has a single dtype;
# filling the string parts with the int 0 would leave a mixed str/int column.
# NOTE(review): 0 is also a real cabin number (e.g. "B/0/P"), so a missing
# cabin cannot be told apart from cabin 0 — confirm this is intended
num_in_cabin = pd.to_numeric(num_in_cabin).fillna(value=0).astype("int64")
side = side.fillna(value=side.mode()[0])

In [11]:
# destination
destination = x_train.destination

# filling 182 null values with mode
# fillna returns a new Series, so x_train is not modified behind the scenes
# (an in-place fill on a column taken from a frame can raise
# SettingWithCopyWarning and behaves differently under pandas Copy-on-Write)
destination = destination.fillna(value=destination.mode()[0])

In [12]:
# vip
vip = x_train.vip

# filling 203 null values with mode
# fillna returns a new Series, so x_train is not modified behind the scenes
# (an in-place fill on a column taken from a frame can raise
# SettingWithCopyWarning and behaves differently under pandas Copy-on-Write)
vip = vip.fillna(value=vip.mode()[0])

In [13]:
# age
age = x_train.age

# filling 179 null values with median
# fillna returns a new Series, so x_train is not modified behind the scenes
# (an in-place fill on a column taken from a frame can raise
# SettingWithCopyWarning and behaves differently under pandas Copy-on-Write)
age = age.fillna(value=age.median())

In [14]:
# spendings
# filling null values with 0.0
# fillna returns a new Series for each column, so x_train is not modified
# behind the scenes (an in-place fill on a column taken from a frame can raise
# SettingWithCopyWarning and behaves differently under pandas Copy-on-Write)
room_service = x_train.room_service.fillna(value=0.0)
food_court = x_train.food_court.fillna(value=0.0)
shopping_mall = x_train.shopping_mall.fillna(value=0.0)
spa = x_train.spa.fillna(value=0.0)
vr_deck = x_train.vr_deck.fillna(value=0.0)

In [15]:
# total_spending: element-wise sum of the five spending columns
spending_columns = [room_service, food_court, shopping_mall, spa, vr_deck]
total_spending = sum(spending_columns[1:], spending_columns[0])

<br>
<br>
<br>

#### Feature Engineering

In [16]:
# age categories
def get_age_category(age: float) -> str:
    """Map an age to one of four coarse age-bracket labels.

    Brackets use inclusive upper bounds: up to 16 is "child", up to 30 is
    "young_adult", up to 45 is "middle_adult", anything else is "old_adult".
    A NaN age compares false against every bound and so lands in "old_adult".
    """
    # (inclusive upper bound, label), checked from the lowest bracket upwards
    brackets = (
        (16, "child"),
        (30, "young_adult"),
        (45, "middle_adult"),
    )
    for upper_bound, label in brackets:
        if age <= upper_bound:
            return label
    return "old_adult"

# label every passenger's age with its bracket
age_category = age.map(get_age_category)

In [19]:
# gender from name
def get_gender(name: str) -> float:
    """Guess a binary gender flag from the last letter of the first name.

    Parameters
    ----------
    name : str
        Full name, with the first name before the first space. May be
        missing (NaN / None).

    Returns
    -------
    float
        0 when the first name ends in "a", "e", "i" or "y", 1 otherwise, and
        NaN when the name is missing or has no first name (empty or
        whitespace-only). The check is case-sensitive.
        NOTE(review): presumably 0 = female and 1 = male — confirm.
    """
    if pd.isna(name):
        return np.nan

    # strip first so a leading space cannot produce an empty first token,
    # which would raise IndexError on the [-1] lookup below
    first_name = name.strip().split(" ")[0]
    if not first_name:
        return np.nan

    return 0 if first_name[-1] in ("a", "e", "i", "y") else 1

# gender flag for every passenger; names that are missing come back as NaN
# from get_gender and are filled with 1.0
gender = x_train.name.map(get_gender).fillna(value=1.0)

<br>
<br>

In [33]:
# assemble the cleaned and engineered features into one frame;
# every column is aligned on the index of `group`
X_prepared = pd.DataFrame(
    {
        "group": group,
        "number_in_group": number_in_group,
        "home_planet": home_planet,
        "cryo_sleep": cryo_sleep,
        "deck": deck,
        "num_in_cabin": num_in_cabin,
        "side": side,
        "destination": destination,
        "age": age,
        "vip": vip,
        "room_service": room_service,
        "food_court": food_court,
        "shopping_mall": shopping_mall,
        "spa": spa,
        "vr_deck": vr_deck,
        "total_spending": total_spending,
        "age_category": age_category,
        "gender": gender,
    },
    index=group.index,
)

In [34]:
# preview the first rows of the prepared feature frame
X_prepared.head()

Unnamed: 0,group,number_in_group,home_planet,cryo_sleep,deck,num_in_cabin,side,destination,age,vip,room_service,food_court,shopping_mall,spa,vr_deck,total_spending,age_category,gender
1432,1510,1,Mars,False,F,291,S,TRAPPIST-1e,31.0,False,1226.0,0.0,1.0,0.0,0.0,1227.0,middle_adult,1.0
6858,7253,1,Europa,False,D,225,P,TRAPPIST-1e,26.0,False,0.0,896.0,0.0,690.0,1.0,1587.0,young_adult,1.0
4436,4714,1,Earth,True,G,765,P,TRAPPIST-1e,24.0,False,0.0,0.0,0.0,0.0,0.0,0.0,young_adult,1.0
7230,7727,1,Earth,False,E,507,S,55 Cancri e,33.0,False,0.0,0.0,0.0,436.0,224.0,660.0,middle_adult,0.0
2992,3237,1,Mars,False,D,104,P,TRAPPIST-1e,21.0,False,1097.0,0.0,80.0,589.0,0.0,1766.0,young_adult,1.0
