In [67]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

sns.set(color_codes=True)

<br>
<br>
<br>

### Data collection

In [2]:
# importing non null data
train_data = pd.read_csv("../data/train_2.csv")
test_data = pd.read_csv("../data/test_2.csv")

In [3]:
# inspecting train data
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
# inspecting test data
test_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [5]:
# checking for null data
train_data.isna().any().value_counts()

False    14
dtype: int64

In [6]:
# checking for null data
test_data.isna().any().value_counts()

False    13
dtype: int64

In [15]:
# combining training and test data
total_data = pd.concat([train_data.drop("Transported", axis=1), test_data])
total_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines


In [16]:
# shape of combined data
total_data.shape

(12970, 13)

<br>
<br>
<br>

### Feature engineering

#### Age

In [17]:
# creating age category
# value for bins are evaluated from EDA
total_data["AgeCategory"] = pd.cut(x=total_data.Age, bins=[-1, 18, 40, 80], labels=['child', 'adult', 'old'])

In [18]:
# inspecting updated dataframe
total_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,AgeCategory
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,adult
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,adult
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,old
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,adult
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,child


<br>

#### PassengerId

In [19]:
# splitting 'passenger_id' into 'group' and 'num_in_group'
group = total_data.PassengerId.apply(lambda x: int(x.split("_")[0]))
num_in_group = total_data.PassengerId.apply(lambda x: int(x.split("_")[1]))

In [24]:
# creating group category
# value for bins are evaluated from EDA
total_data['GroupCategory'] = pd.cut(x=group, bins=[-1, 3500, 7500, 9300], labels=['a', 'b', 'c'], ordered=False)

In [26]:
# using num_in_group as it is 
total_data['NumInGroup'] = num_in_group

In [27]:
# removing PassengerId column
total_data.drop('PassengerId', axis=1, inplace=True)

In [28]:
# inspecting updated dataframe
total_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,AgeCategory,GroupCategory,NumInGroup
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,adult,a,1
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,adult,a,1
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,old,a,1
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,adult,a,2
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,child,a,1


<br>

#### CryoSleep

In [33]:
# converting (True, False) -> (1, 0)
total_data.CryoSleep = total_data.CryoSleep.apply(lambda x: 1 if x else 0)

In [34]:
# inspecting updated dataframe
total_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,AgeCategory,GroupCategory,NumInGroup
0,Europa,0,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,adult,a,1
1,Earth,0,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,adult,a,1
2,Europa,0,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,old,a,1
3,Europa,0,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,adult,a,2
4,Earth,0,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,child,a,1


In [35]:
# value counts
total_data.CryoSleep.value_counts()

0    8323
1    4647
Name: CryoSleep, dtype: int64

<br>

#### Cabin

In [36]:
# extracting deck, num_in_cabin and side
deck = total_data.Cabin.apply(lambda x: np.nan if pd.isna(x) else x.split("/")[0])
num_in_cabin = total_data.Cabin.apply(lambda x: np.nan if pd.isna(x) else int(x.split("/")[1]))
side = total_data.Cabin.apply(lambda x: np.nan if pd.isna(x) else x.split("/")[2])

In [40]:
# creating Deck feature
total_data["Deck"] = deck

In [41]:
# creating Side feature
total_data["Side"] = side

In [46]:
# creating cabin number category
# value for bins are evaluated from EDA
total_data['CabinNumberCategory'] = pd.cut(x=num_in_cabin, bins=[-1, 300, 700, 1200, 1900], labels=['a', 'b', 'c', 'd'], ordered=False)

In [47]:
# dropping Cabin column
total_data.drop('Cabin', axis=1, inplace=True)

In [48]:
# inspecting updated dataframe
total_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,AgeCategory,GroupCategory,NumInGroup,Deck,Side,CabinNumberCategory
0,Europa,0,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,adult,a,1,B,P,a
1,Earth,0,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,adult,a,1,F,S,a
2,Europa,0,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,old,a,1,A,S,a
3,Europa,0,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,adult,a,2,A,S,a
4,Earth,0,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,child,a,1,F,S,a


<br>

#### VIP

In [49]:
# converting (True, False) -> (1, 0)
total_data.VIP = total_data.VIP.apply(lambda x: 1 if x else 0)

In [50]:
# inspecting updated dataframe
total_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,AgeCategory,GroupCategory,NumInGroup,Deck,Side,CabinNumberCategory
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,adult,a,1,B,P,a
1,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,adult,a,1,F,S,a
2,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,old,a,1,A,S,a
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,adult,a,2,A,S,a
4,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,child,a,1,F,S,a


In [53]:
# value counts
total_data.VIP.value_counts()

0    12697
1      273
Name: VIP, dtype: int64

<br>

#### Name

In [54]:
# extracting first name
first_name = total_data.Name.apply(lambda x: x.split(" ")[0])

In [57]:
# getting gender from name
is_male = first_name.apply(lambda x: 0 if x[-1] in ["a", "e", "i", "y"] else 1)

In [59]:
# value counts
is_male.value_counts()

1    6788
0    6182
Name: Name, dtype: int64

In [61]:
# creating IsMale feature
total_data['IsMale'] = is_male

In [62]:
# dropping name column
total_data.drop("Name", axis=1, inplace=True)

In [63]:
# inspecting updated dataframe
total_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,AgeCategory,GroupCategory,NumInGroup,Deck,Side,CabinNumberCategory,IsMale
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,adult,a,1,B,P,a,1
1,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,adult,a,1,F,S,a,0
2,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,old,a,1,A,S,a,1
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,adult,a,2,A,S,a,1
4,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,child,a,1,F,S,a,0


<br>

#### Spendings

Taking log transformation to decrease skewness

In [71]:
# log(1 + x)
total_data.RoomService = np.log1p(total_data.RoomService)
total_data.FoodCourt = np.log1p(total_data.FoodCourt)
total_data.ShoppingMall = np.log1p(total_data.ShoppingMall)
total_data.Spa = np.log1p(total_data.Spa)
total_data.VRDeck = np.log1p(total_data.VRDeck)

In [72]:
# inspecting updated dataframe
total_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,AgeCategory,GroupCategory,NumInGroup,Deck,Side,CabinNumberCategory,IsMale
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,adult,a,1,B,P,a,1
1,Earth,0,TRAPPIST-1e,24.0,0,4.70048,2.302585,3.258097,6.309918,3.806662,adult,a,1,F,S,a,0
2,Europa,0,TRAPPIST-1e,58.0,1,3.78419,8.18228,0.0,8.812248,3.912023,old,a,1,A,S,a,1
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,7.157735,5.918894,8.110728,5.267858,adult,a,2,A,S,a,1
4,Earth,0,TRAPPIST-1e,16.0,0,5.717028,4.26268,5.023881,6.338594,1.098612,child,a,1,F,S,a,0


<br>

### Saving datasets

In [78]:
# inspecting final dataframe
total_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,AgeCategory,GroupCategory,NumInGroup,Deck,Side,CabinNumberCategory,IsMale
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,adult,a,1,B,P,a,1
1,Earth,0,TRAPPIST-1e,24.0,0,4.70048,2.302585,3.258097,6.309918,3.806662,adult,a,1,F,S,a,0
2,Europa,0,TRAPPIST-1e,58.0,1,3.78419,8.18228,0.0,8.812248,3.912023,old,a,1,A,S,a,1
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,7.157735,5.918894,8.110728,5.267858,adult,a,2,A,S,a,1
4,Earth,0,TRAPPIST-1e,16.0,0,5.717028,4.26268,5.023881,6.338594,1.098612,child,a,1,F,S,a,0


In [75]:
# shape of final data
total_data.shape

(12970, 17)

In [77]:
# checking for null values
total_data.isna().any().value_counts()

False    17
dtype: int64

<br>

In [79]:
# seperating training data
train_data_2 = total_data.iloc[:8693].copy()
train_data_2.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,AgeCategory,GroupCategory,NumInGroup,Deck,Side,CabinNumberCategory,IsMale
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,adult,a,1,B,P,a,1
1,Earth,0,TRAPPIST-1e,24.0,0,4.70048,2.302585,3.258097,6.309918,3.806662,adult,a,1,F,S,a,0
2,Europa,0,TRAPPIST-1e,58.0,1,3.78419,8.18228,0.0,8.812248,3.912023,old,a,1,A,S,a,1
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,7.157735,5.918894,8.110728,5.267858,adult,a,2,A,S,a,1
4,Earth,0,TRAPPIST-1e,16.0,0,5.717028,4.26268,5.023881,6.338594,1.098612,child,a,1,F,S,a,0


In [80]:
# adding target variable
train_data_2['Transported'] = train_data.Transported
train_data_2.shape

(8693, 18)

In [81]:
# cehcking for null values
train_data_2.isna().any().value_counts()

False    18
dtype: int64

In [82]:
# saving as csv
train_data_2.to_csv("../data/train_engineered.csv", index=None)

<br>

In [83]:
# seperating test data
test_data_2 = total_data.iloc[8693:].copy()
test_data_2.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,AgeCategory,GroupCategory,NumInGroup,Deck,Side,CabinNumberCategory,IsMale
0,Earth,1,TRAPPIST-1e,27.0,0,0.0,0.0,0.0,0.0,0.0,adult,a,1,G,S,a,0
1,Earth,0,TRAPPIST-1e,19.0,0,0.0,2.302585,0.0,7.94591,0.0,adult,a,1,F,S,a,0
2,Europa,1,55 Cancri e,31.0,0,0.0,0.0,0.0,0.0,0.0,adult,a,1,C,S,a,1
3,Europa,0,TRAPPIST-1e,38.0,0,0.0,8.802823,0.0,5.204007,6.37332,adult,a,1,C,S,a,1
4,Earth,0,TRAPPIST-1e,20.0,0,2.397895,0.0,6.455199,0.0,0.0,adult,a,1,F,S,a,0


In [84]:
# verifying shape of test data
test_data_2.shape

(4277, 17)

In [85]:
# checking for null values
test_data_2.isna().any().value_counts()

False    17
dtype: int64

In [86]:
# saving as csv
test_data_2.to_csv("../data/test_engineered.csv", index=None)

<br>
<br>
<br>

### Next Steps:

After performing feature engineering, data needs to be preprocessed before feeding it to the model.<br>
This includes handling categorical and text data, scaling numerical data, specifying datatype for each feature, etc.<br>
<br>

Next notebook: **Data Preprocessing**