In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
import sys
import os

# Make the parent of the notebook's working directory importable so the
# project-local `utils` package resolves.
# NOTE(review): the previous version sliced the last 10 characters off the
# cwd, which only equals the parent directory when the notebook folder name
# is exactly 9 characters long; confirm the parent is the intended root.
project_root = os.path.dirname(os.getcwd())
if project_root not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(project_root)
# utility functions
from utils.utils import format_as_per_convention # type: ignore[import]

In [3]:
# Read the raw training set; the path is relative to the notebook's working directory.
train_csv_path = "../data/train.csv"
data = pd.read_csv(train_csv_path)

In [6]:
# (rows, columns) of the frame loaded from train.csv
data.shape

(8693, 14)

In [5]:
# Rename the columns to follow Python naming conventions.
# The helper's result is patched by position for two columns
# (position 6 -> "vip", position 11 -> "vr_deck").
new_column_names = format_as_per_convention(data.columns)
for position, replacement in ((6, "vip"), (11, "vr_deck")):
    new_column_names[position] = replacement

data.columns = new_column_names
data.head()

Unnamed: 0,passenger_id,home_planet,cryo_sleep,cabin,destination,age,vip,room_service,food_court,shopping_mall,spa,vr_deck,name,transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [7]:
# Keep only the rows that have no missing value in any column.
data_without_na = data.dropna(axis="index", how="any")

In [8]:
# (rows, columns) left after dropping rows with missing values
data_without_na.shape

(6606, 14)

<br>
<br>

In [15]:
# Passenger identifier column of the NA-free frame.
passenger_id = data_without_na["passenger_id"]
passenger_id.head()

0    0001_01
1    0002_01
2    0003_01
3    0003_02
4    0004_01
Name: passenger_id, dtype: object

In [16]:
# Each passenger_id is split on "_" into two integer parts (e.g. "0003_02"
# -> group 3, number_in_group 2). Split once with the vectorised string
# accessor instead of re-splitting every id inside two separate lambdas.
passenger_id_parts = passenger_id.str.split("_")
group: pd.Series = passenger_id_parts.str[0].astype("int64")
number_in_group: pd.Series = passenger_id_parts.str[1].astype("int64")

print(group.head())
print(number_in_group.head())

0    1
1    2
2    3
3    3
4    4
Name: passenger_id, dtype: int64
0    1
1    1
2    1
3    2
4    1
Name: passenger_id, dtype: int64


In [17]:
# Home planet column of the NA-free frame.
home_planet = data_without_na["home_planet"]
home_planet.head()

0    Europa
1     Earth
2    Europa
3    Europa
4     Earth
Name: home_planet, dtype: object

In [19]:
# Number of passengers per home planet
home_planet.value_counts()

Earth     3566
Europa    1673
Mars      1367
Name: home_planet, dtype: int64

In [22]:
# Cryo-sleep flag column of the NA-free frame.
cryo_sleep = data_without_na["cryo_sleep"]
cryo_sleep.head()

0    False
1    False
2    False
3    False
4    False
Name: cryo_sleep, dtype: object

In [23]:
# Number of passengers per cryo_sleep value
cryo_sleep.value_counts()

False    4274
True     2332
Name: cryo_sleep, dtype: int64

In [24]:
# Cabin column of the NA-free frame; values look like "B/0/P".
cabin = data_without_na["cabin"]
cabin.head()

0    B/0/P
1    F/0/S
2    A/0/S
3    A/0/S
4    F/1/S
Name: cabin, dtype: object

In [31]:
# A cabin string is split on "/" into three parts, named here
# deck / num_in_cabin / side. Split once with the vectorised string accessor
# rather than re-splitting every value in three separate lambdas.
# All three parts stay strings, exactly as before (num_in_cabin is not cast).
cabin_parts = cabin.str.split("/")
deck = cabin_parts.str[0]
num_in_cabin = cabin_parts.str[1]
side = cabin_parts.str[2]
print(deck.head())
print(num_in_cabin.head())
print(side.head())

0    B
1    F
2    A
3    A
4    F
Name: cabin, dtype: object
0    0
1    0
2    0
3    0
4    1
Name: cabin, dtype: object
0    P
1    S
2    S
3    S
4    S
Name: cabin, dtype: object


In [34]:
# Number of passengers per cabin side
side.value_counts()

S    3345
P    3261
Name: cabin, dtype: int64

In [35]:
# Destination column of the NA-free frame.
destination = data_without_na["destination"]
destination.head()

0    TRAPPIST-1e
1    TRAPPIST-1e
2    TRAPPIST-1e
3    TRAPPIST-1e
4    TRAPPIST-1e
Name: destination, dtype: object

In [36]:
# Number of passengers per destination
destination.value_counts()

TRAPPIST-1e      4576
55 Cancri e      1407
PSO J318.5-22     623
Name: destination, dtype: int64

In [37]:
# Age column of the NA-free frame.
age = data_without_na["age"]
age.head()

0    39.0
1    24.0
2    58.0
3    33.0
4    16.0
Name: age, dtype: float64

In [40]:
# Summary statistics (count, mean, std, quartiles, min/max) of age
age.describe()

count    6606.000000
mean       28.894036
std        14.533429
min         0.000000
25%        19.000000
50%        27.000000
75%        38.000000
max        79.000000
Name: age, dtype: float64

In [41]:
# VIP flag column of the NA-free frame.
vip = data_without_na["vip"]
vip.head()

0    False
1    False
2     True
3    False
4    False
Name: vip, dtype: object

In [42]:
# Number of passengers per vip value
vip.value_counts()

False    6444
True      162
Name: vip, dtype: int64

In [44]:
# Pull out the five per-amenity spending columns in one pass.
room_service, food_court, shopping_mall, spa, vr_deck = (
    data_without_na[amenity]
    for amenity in ("room_service", "food_court", "shopping_mall", "spa", "vr_deck")
)

In [49]:
# Per-passenger total across the five spending columns,
# added in the same left-to-right order as the individual series.
total_spending = (
    room_service
    .add(food_court)
    .add(shopping_mall)
    .add(spa)
    .add(vr_deck)
)
total_spending.head()

0        0.0
1      736.0
2    10383.0
3     5176.0
4     1091.0
dtype: float64

In [None]:
# The name column is intentionally ignored: it is not carried into the engineered features.

In [53]:
# Transported column of the NA-free frame.
transported = data_without_na["transported"]
transported.head()

0    False
1     True
2    False
3    False
4     True
Name: transported, dtype: bool

In [55]:
# Encode the flag as integers (True -> 1, False -> 0) with a vectorised
# cast instead of a per-row lambda. Re-running the cell is a no-op because
# the series is already int64 by then.
transported = transported.astype("int64")
transported.head()

0    0
1    1
2    0
3    0
4    1
Name: transported, dtype: int64

In [57]:
# Print the shape of every engineered series, one per line,
# so their lengths can be compared before they are assembled.
engineered_series = (
    group,
    number_in_group,
    home_planet,
    cryo_sleep,
    deck,
    num_in_cabin,
    side,
    destination,
    age,
    vip,
    room_service,
    food_court,
    shopping_mall,
    spa,
    vr_deck,
    total_spending,
    transported,
)
for feature in engineered_series:
    print(feature.shape)

(6606,)
(6606,)
(6606,)
(6606,)
(6606,)
(6606,)
(6606,)
(6606,)
(6606,)
(6606,)
(6606,)
(6606,)
(6606,)
(6606,)
(6606,)
(6606,)
(6606,)


In [58]:
# Start from an empty frame; the engineered columns are assigned to it one by one.
data_without_na_engineered = pd.DataFrame()

In [61]:
# Column name -> series, in the order the columns should appear in the frame.
engineered_columns = {
    "group": group,
    "number_in_group": number_in_group,
    "home_planet": home_planet,
    "cryo_sleep": cryo_sleep,
    "deck": deck,
    "num_in_cabin": num_in_cabin,
    "side": side,
    "destination": destination,
    "age": age,
    "vip": vip,
    "room_service": room_service,
    "food_court": food_court,
    "shopping_mall": shopping_mall,
    "spa": spa,
    "vr_deck": vr_deck,
    "total_spending": total_spending,
    "transported": transported,
}
for column_name, column_values in engineered_columns.items():
    data_without_na_engineered[column_name] = column_values

In [62]:
# Preview the first rows of the assembled feature frame
data_without_na_engineered.head()

Unnamed: 0,group,number_in_group,home_planet,cryo_sleep,deck,num_in_cabin,side,destination,age,vip,room_service,food_court,shopping_mall,spa,vr_deck,total_spending,transported
0,1,1,Europa,False,B,0,P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2,1,Earth,False,F,0,S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,736.0,1
2,3,1,Europa,False,A,0,S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,10383.0,0
3,3,2,Europa,False,A,0,S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,5176.0,0
4,4,1,Earth,False,F,1,S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,1091.0,1


In [63]:
# Persist the engineered features without writing the row index.
# `index` is documented as a bool, so pass `False` explicitly instead of
# relying on `None` being treated as falsy.
data_without_na_engineered.to_csv("../data/data_without_na_engineered.csv", index=False)