In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
train_raw_df = pd.read_csv("../dataset/spaceship-titanic/train.csv")
test_raw_df = pd.read_csv("../dataset/spaceship-titanic/test.csv")

In [3]:
train_process_df = train_raw_df.copy()

### Data Cleaning + Feature Engineering

#### PassengerId


In [4]:
train_process_df["GroupId"] = train_process_df["PassengerId"].str.split("_", expand=True)[0]

In [5]:
group_size = train_process_df["GroupId"].value_counts().to_dict()

In [6]:
train_process_df["GroupSize"] = train_process_df["GroupId"].apply(lambda x: group_size[x])

In [7]:
train_process_df["IsAlone"] = train_process_df["GroupSize"].apply(lambda x: 1 if x == 1 else 0)

In [8]:
train_process_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0001,1,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,0002,1,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0003,2,0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0003,2,0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,0004,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,9276,1,1
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,9278,1,1
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,9279,1,1
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,9280,2,0


#### HomePlanet
- Handling Missing Values

In [9]:
train_process_df["HomePlanet"].isnull().sum()

201

In [10]:
train_process_df[train_process_df["HomePlanet"].isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
59,0064_02,,True,E/3/S,TRAPPIST-1e,33.0,False,0.0,0.0,,0.0,0.0,Colatz Keen,True,0064,2,0
113,0119_01,,False,A/0/P,TRAPPIST-1e,39.0,False,0.0,2344.0,0.0,65.0,6898.0,Batan Coning,False,0119,2,0
186,0210_01,,True,D/6/P,55 Cancri e,24.0,False,0.0,0.0,,0.0,0.0,Arraid Inicont,True,0210,1,1
225,0242_01,,False,F/46/S,TRAPPIST-1e,18.0,False,313.0,1.0,691.0,283.0,0.0,Almone Sté,False,0242,1,1
234,0251_01,,True,C/11/S,55 Cancri e,54.0,False,0.0,0.0,0.0,0.0,0.0,Diphah Amsive,True,0251,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8515,9084_01,,False,E/582/P,TRAPPIST-1e,25.0,False,1258.0,0.0,22.0,19.0,0.0,Jurs Mone,False,9084,1,1
8613,9194_01,,False,E/603/S,55 Cancri e,53.0,False,0.0,4017.0,0.0,13.0,3147.0,,False,9194,2,0
8666,9248_01,,False,F/1792/S,55 Cancri e,38.0,,28.0,1208.0,973.0,207.0,0.0,Gian Perle,True,9248,1,1
8674,9257_01,,False,F/1892/P,TRAPPIST-1e,13.0,False,39.0,0.0,1085.0,24.0,0.0,Ties Apple,False,9257,1,1


In [11]:
train_process_df[train_process_df.HomePlanet.isnull() & ~train_process_df.IsAlone]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
59,0064_02,,True,E/3/S,TRAPPIST-1e,33.0,False,0.0,0.0,,0.0,0.0,Colatz Keen,True,0064,2,0
113,0119_01,,False,A/0/P,TRAPPIST-1e,39.0,False,0.0,2344.0,0.0,65.0,6898.0,Batan Coning,False,0119,2,0
405,0444_02,,False,F/99/P,TRAPPIST-1e,26.0,True,1869.0,0.0,136.0,0.0,0.0,Colal Curte,False,0444,2,0
407,0445_02,,False,E/25/S,TRAPPIST-1e,26.0,False,0.0,0.0,0.0,77.0,715.0,Rice Marshopper,False,0445,2,0
438,0470_02,,True,F/86/S,TRAPPIST-1e,36.0,False,0.0,0.0,0.0,0.0,0.0,Jurs Binie,True,0470,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8353,8938_01,,True,F/1729/S,TRAPPIST-1e,22.0,False,0.0,0.0,0.0,0.0,0.0,Quites Must,True,8938,2,0
8383,8961_04,,False,F/1839/P,TRAPPIST-1e,21.0,False,0.0,6.0,0.0,593.0,0.0,Dandy Blancoy,False,8961,5,0
8454,9030_02,,False,G/1465/P,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,298.0,511.0,Stace Ericksonga,False,9030,2,0
8489,9072_01,,True,F/1758/S,TRAPPIST-1e,23.0,False,0.0,0.0,0.0,0.0,0.0,Gork Creke,True,9072,3,0


In [12]:
# Does Group has same HomePlanet ? If yes, can be used for fillna
temp_df = train_process_df.groupby(["GroupId", "HomePlanet"]).size().reset_index(name='count')

In [13]:
temp_df

Unnamed: 0,GroupId,HomePlanet,count
0,0001,Europa,1
1,0002,Earth,1
2,0003,Europa,2
3,0004,Earth,1
4,0005,Earth,1
...,...,...,...
6102,9275,Europa,3
6103,9276,Europa,1
6104,9278,Earth,1
6105,9279,Earth,1


In [14]:
temp_df.GroupId.is_unique

True

In [15]:
train_process_df.iloc[800:850,:]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
800,0846_01,Earth,True,G/125/S,TRAPPIST-1e,13.0,False,0.0,0.0,0.0,0.0,0.0,Tera Simstravery,False,846,1,1
801,0848_01,Mars,True,F/175/P,TRAPPIST-1e,30.0,False,0.0,0.0,0.0,0.0,0.0,Sts Raca,True,848,2,0
802,0848_02,Mars,True,F/175/P,TRAPPIST-1e,14.0,False,0.0,0.0,0.0,0.0,0.0,Monam Raca,True,848,2,0
803,0849_01,Europa,False,B/32/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Cellum Oidederval,True,849,2,0
804,0849_02,Europa,True,B/32/S,55 Cancri e,32.0,False,0.0,0.0,0.0,0.0,0.0,Tareson Oidederval,True,849,2,0
805,0850_01,Mars,True,E/62/S,TRAPPIST-1e,54.0,False,0.0,0.0,0.0,0.0,0.0,Koops Nan,True,850,1,1
806,0851_01,Earth,False,F/176/P,55 Cancri e,23.0,False,217.0,378.0,19.0,1.0,12.0,Iandy Sarios,False,851,1,1
807,0853_01,,True,A/9/S,55 Cancri e,38.0,False,0.0,0.0,0.0,0.0,0.0,Hamelik Ageurante,True,853,1,1
808,0859_01,Earth,True,G/127/S,TRAPPIST-1e,32.0,False,0.0,0.0,0.0,0.0,0.0,Eleaha Lambles,True,859,1,1
809,0860_01,Earth,False,E/48/P,TRAPPIST-1e,24.0,False,34.0,7.0,320.0,0.0,427.0,,False,860,1,1


In [16]:
# data point 847 848 and negative case 807
train_process_df.sort_values(by=["GroupId", "HomePlanet"]).iloc[800:850,:]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
800,0846_01,Earth,True,G/125/S,TRAPPIST-1e,13.0,False,0.0,0.0,0.0,0.0,0.0,Tera Simstravery,False,846,1,1
801,0848_01,Mars,True,F/175/P,TRAPPIST-1e,30.0,False,0.0,0.0,0.0,0.0,0.0,Sts Raca,True,848,2,0
802,0848_02,Mars,True,F/175/P,TRAPPIST-1e,14.0,False,0.0,0.0,0.0,0.0,0.0,Monam Raca,True,848,2,0
803,0849_01,Europa,False,B/32/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Cellum Oidederval,True,849,2,0
804,0849_02,Europa,True,B/32/S,55 Cancri e,32.0,False,0.0,0.0,0.0,0.0,0.0,Tareson Oidederval,True,849,2,0
805,0850_01,Mars,True,E/62/S,TRAPPIST-1e,54.0,False,0.0,0.0,0.0,0.0,0.0,Koops Nan,True,850,1,1
806,0851_01,Earth,False,F/176/P,55 Cancri e,23.0,False,217.0,378.0,19.0,1.0,12.0,Iandy Sarios,False,851,1,1
807,0853_01,,True,A/9/S,55 Cancri e,38.0,False,0.0,0.0,0.0,0.0,0.0,Hamelik Ageurante,True,853,1,1
808,0859_01,Earth,True,G/127/S,TRAPPIST-1e,32.0,False,0.0,0.0,0.0,0.0,0.0,Eleaha Lambles,True,859,1,1
809,0860_01,Earth,False,E/48/P,TRAPPIST-1e,24.0,False,34.0,7.0,320.0,0.0,427.0,,False,860,1,1


In [17]:
train_process_df[train_process_df.IsAlone==0].loc[:, "HomePlanet"]

2       Europa
3       Europa
6        Earth
7        Earth
9       Europa
         ...  
8685    Europa
8686    Europa
8687    Europa
8691    Europa
8692    Europa
Name: HomePlanet, Length: 3888, dtype: object

In [18]:
train_process_df[train_process_df.IsAlone==0].sort_values(by=["GroupId", "HomePlanet"])["HomePlanet"]

2       Europa
3       Europa
6        Earth
7        Earth
9       Europa
         ...  
8685    Europa
8686    Europa
8687    Europa
8691    Europa
8692    Europa
Name: HomePlanet, Length: 3888, dtype: object

In [19]:
# forward fill 
train_process_df[train_process_df.IsAlone==0].loc[:, "HomePlanet"] = train_process_df[train_process_df.IsAlone==0].sort_values(by=["GroupId", "HomePlanet"]).loc[:, "HomePlanet"].fillna(method="ffill")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_process_df[train_process_df.IsAlone==0].loc[:, "HomePlanet"] = train_process_df[train_process_df.IsAlone==0].sort_values(by=["GroupId", "HomePlanet"]).loc[:, "HomePlanet"].fillna(method="ffill")


In [20]:
# reference: to above probelm
#https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas

In [21]:
train_process_df.loc[train_process_df.IsAlone==0, "HomePlanet"] = train_process_df[train_process_df.IsAlone==0].sort_values(by=["GroupId", "HomePlanet"]).loc[:, "HomePlanet"].fillna(method="ffill")

In [22]:
train_process_df.sort_values(by=["GroupId", "HomePlanet"]).iloc[800:850,:]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
800,0846_01,Earth,True,G/125/S,TRAPPIST-1e,13.0,False,0.0,0.0,0.0,0.0,0.0,Tera Simstravery,False,846,1,1
801,0848_01,Mars,True,F/175/P,TRAPPIST-1e,30.0,False,0.0,0.0,0.0,0.0,0.0,Sts Raca,True,848,2,0
802,0848_02,Mars,True,F/175/P,TRAPPIST-1e,14.0,False,0.0,0.0,0.0,0.0,0.0,Monam Raca,True,848,2,0
803,0849_01,Europa,False,B/32/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Cellum Oidederval,True,849,2,0
804,0849_02,Europa,True,B/32/S,55 Cancri e,32.0,False,0.0,0.0,0.0,0.0,0.0,Tareson Oidederval,True,849,2,0
805,0850_01,Mars,True,E/62/S,TRAPPIST-1e,54.0,False,0.0,0.0,0.0,0.0,0.0,Koops Nan,True,850,1,1
806,0851_01,Earth,False,F/176/P,55 Cancri e,23.0,False,217.0,378.0,19.0,1.0,12.0,Iandy Sarios,False,851,1,1
807,0853_01,,True,A/9/S,55 Cancri e,38.0,False,0.0,0.0,0.0,0.0,0.0,Hamelik Ageurante,True,853,1,1
808,0859_01,Earth,True,G/127/S,TRAPPIST-1e,32.0,False,0.0,0.0,0.0,0.0,0.0,Eleaha Lambles,True,859,1,1
809,0860_01,Earth,False,E/48/P,TRAPPIST-1e,24.0,False,34.0,7.0,320.0,0.0,427.0,,False,860,1,1


In [23]:
train_process_df.HomePlanet.isnull().sum()

109

In [24]:
## filled 50% of data HomePlanet column

In [25]:
train_process_df["HomePlanet"].mode()

0    Earth
Name: HomePlanet, dtype: object

In [26]:
train_process_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0001,1,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,0002,1,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0003,2,0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0003,2,0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,0004,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,9276,1,1
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,9278,1,1
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,9279,1,1
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,9280,2,0


In [27]:
train_process_df.loc[train_process_df.HomePlanet.isnull() & train_process_df.IsAlone, "HomePlanet"]

186     NaN
225     NaN
234     NaN
274     NaN
286     NaN
       ... 
8468    NaN
8515    NaN
8666    NaN
8674    NaN
8684    NaN
Name: HomePlanet, Length: 109, dtype: object

In [28]:
# filling rest with mode approach
train_process_df.loc[train_process_df.HomePlanet.isnull() & train_process_df.IsAlone, "HomePlanet"] = train_process_df["HomePlanet"].mode()[0]

In [29]:
train_process_df.HomePlanet.isnull().sum()

0

#### CryoSleep 

In [30]:
train_process_df.CryoSleep.isnull().sum()

217

In [31]:
train_process_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0001,1,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,0002,1,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0003,2,0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0003,2,0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,0004,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,9276,1,1
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,9278,1,1
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,9279,1,1
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,9280,2,0


In [32]:
train_process_df.loc[
    train_process_df.CryoSleep.isnull() & (
        (train_process_df.RoomService>0) | 
        (train_process_df.FoodCourt>0) | 
        (train_process_df.ShoppingMall>0) |
        (train_process_df.Spa>0) |
        (train_process_df.VRDeck>0)
    )]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
98,0105_01,Earth,,F/21/P,TRAPPIST-1e,27.0,False,0.0,0.0,570.0,2.0,131.0,Carry Cleachrand,False,0105,1,1
104,0110_02,Europa,,B/5/P,TRAPPIST-1e,40.0,False,0.0,331.0,0.0,0.0,1687.0,Aldeba Bootious,False,0110,4,0
152,0173_01,Earth,,E/11/S,TRAPPIST-1e,58.0,False,0.0,985.0,0.0,5.0,0.0,Hilip Grifford,True,0173,1,1
224,0241_01,Europa,,E/11/P,55 Cancri e,33.0,False,0.0,1249.0,0.0,4812.0,1116.0,Alas Dischod,False,0241,1,1
314,0348_02,Mars,,,TRAPPIST-1e,36.0,False,520.0,0.0,1865.0,0.0,0.0,Weet Mane,True,0348,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8420,8993_01,Earth,,F/1844/P,PSO J318.5-22,27.0,False,32.0,8.0,5.0,588.0,18.0,Juliey Hahnstonsen,False,8993,1,1
8455,9031_01,Earth,,G/1455/S,TRAPPIST-1e,51.0,False,0.0,0.0,689.0,0.0,32.0,Noria Bairdyork,False,9031,1,1
8591,9170_01,Earth,,E/587/P,TRAPPIST-1e,33.0,False,1060.0,403.0,1.0,0.0,145.0,Jeandy Ewins,False,9170,1,1
8675,9259_01,Earth,,F/1893/P,TRAPPIST-1e,44.0,False,1030.0,1015.0,0.0,11.0,,Annah Gilleyons,True,9259,1,1


In [33]:
train_process_df.loc[
    train_process_df.CryoSleep.isnull() & (
        (train_process_df.RoomService>0) | 
        (train_process_df.FoodCourt>0) | 
        (train_process_df.ShoppingMall>0) |
        (train_process_df.Spa>0) |
        (train_process_df.VRDeck>0)
    ), "CryoSleep"] = False

In [34]:
train_process_df.CryoSleep.isnull().sum()

98

#### Cabin

In [35]:
train_process_df.Cabin

0          B/0/P
1          F/0/S
2          A/0/S
3          A/0/S
4          F/1/S
          ...   
8688      A/98/P
8689    G/1499/S
8690    G/1500/S
8691     E/608/S
8692     E/608/S
Name: Cabin, Length: 8693, dtype: object

In [36]:
train_process_df.Cabin.isnull().sum()

199

In [37]:
train_process_df[train_process_df.Cabin.isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
15,0012_01,Earth,False,,TRAPPIST-1e,31.0,False,32.0,0.0,876.0,0.0,0.0,Justie Pooles,False,0012,1,1
93,0101_01,Mars,True,,TRAPPIST-1e,31.0,False,0.0,0.0,0.0,0.0,0.0,Book Trad,True,0101,1,1
103,0110_01,Europa,False,,TRAPPIST-1e,32.0,False,0.0,410.0,6.0,3929.0,764.0,Graviph Aloubtled,False,0110,4,0
222,0239_01,Mars,False,,TRAPPIST-1e,37.0,False,637.0,0.0,0.0,92.0,319.0,Diedow Resty,False,0239,1,1
227,0244_01,Mars,True,,TRAPPIST-1e,43.0,False,0.0,0.0,0.0,0.0,0.0,Froos Sad,True,0244,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8209,8772_02,Europa,False,,55 Cancri e,53.0,False,0.0,1127.0,0.0,3939.0,,Naosura Motled,False,8772,2,0
8475,9057_01,Europa,False,,55 Cancri e,36.0,True,132.0,3479.0,0.0,3786.0,0.0,Coxan Statch,False,9057,2,0
8485,9069_03,Europa,True,,55 Cancri e,25.0,False,0.0,0.0,0.0,0.0,0.0,Bath Brakeng,True,9069,5,0
8509,9081_03,Earth,True,,TRAPPIST-1e,1.0,False,0.0,0.0,0.0,0.0,0.0,Beula Clemondsey,False,9081,8,0


In [38]:
train_process_df.groupby(["Cabin", "GroupId"]).size().reset_index(name='count')

Unnamed: 0,Cabin,GroupId,count
0,A/0/P,0119,2
1,A/0/S,0003,2
2,A/1/S,0056,3
3,A/10/P,1258,1
4,A/10/S,0871,1
...,...,...,...
6555,T/0/P,1071,1
6556,T/1/P,2414,1
6557,T/2/P,2935,1
6558,T/2/S,4863,1


In [39]:
temp_df =train_process_df.groupby(["Cabin", "GroupId"]).size().reset_index(name='count')

In [40]:
temp_df.GroupId.is_unique

False

In [41]:
temp_df.GroupId.value_counts().unique()

array([3, 2, 1])

In [42]:
temp_df.GroupId.value_counts()

6611    3
6850    3
0938    3
2092    3
2234    3
       ..
1576    1
1551    1
1316    1
1509    1
2971    1
Name: GroupId, Length: 6118, dtype: int64

In [43]:
temp_df[temp_df["count"]>1]

Unnamed: 0,Cabin,GroupId,count
0,A/0/P,0119,2
1,A/0/S,0003,2
2,A/1/S,0056,3
5,A/100/S,8307,2
6,A/101/S,8363,2
...,...,...,...
6529,G/982/S,6032,2
6544,G/991/S,6092,2
6548,G/994/S,6137,5
6553,G/999/P,6174,2


In [44]:
train_process_df[train_process_df.GroupId == "0020"]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
21,0020_01,Earth,True,E/0/S,TRAPPIST-1e,1.0,False,0.0,0.0,0.0,0.0,0.0,Almary Brantuarez,False,20,6,0
22,0020_02,Earth,True,E/0/S,55 Cancri e,49.0,False,0.0,0.0,0.0,0.0,0.0,Glendy Brantuarez,False,20,6,0
23,0020_03,Earth,True,E/0/S,55 Cancri e,29.0,False,0.0,0.0,,0.0,0.0,Mollen Mcfaddennon,False,20,6,0
24,0020_04,Earth,False,E/0/S,TRAPPIST-1e,10.0,False,0.0,0.0,0.0,0.0,0.0,Breney Jacostanley,True,20,6,0
25,0020_05,Earth,True,E/0/S,PSO J318.5-22,1.0,False,,0.0,0.0,0.0,0.0,Mael Brantuarez,False,20,6,0
26,0020_06,Earth,False,E/0/S,TRAPPIST-1e,7.0,False,0.0,0.0,0.0,0.0,0.0,Terta Mcfaddennon,False,20,6,0


In [45]:
train_process_df[train_process_df.GroupId == "0044"]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
43,0044_01,Earth,True,G/3/P,TRAPPIST-1e,55.0,False,0.0,0.0,0.0,0.0,0.0,Jodye Coopelandez,False,44,3,0
44,0044_02,Earth,True,G/3/P,55 Cancri e,4.0,False,0.0,0.0,0.0,0.0,0.0,Kayne Coopelandez,True,44,3,0
45,0044_03,Earth,True,G/3/P,PSO J318.5-22,21.0,False,0.0,0.0,0.0,0.0,0.0,Cassa Coopelandez,True,44,3,0


In [46]:
train_process_df[train_process_df.Cabin.isnull() & ~ train_process_df.IsAlone]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
103,0110_01,Europa,False,,TRAPPIST-1e,32.0,False,0.0,410.0,6.0,3929.0,764.0,Graviph Aloubtled,False,0110,4,0
227,0244_01,Mars,True,,TRAPPIST-1e,43.0,False,0.0,0.0,0.0,0.0,0.0,Froos Sad,True,0244,2,0
251,0278_01,Earth,False,,TRAPPIST-1e,35.0,False,0.0,0.0,0.0,888.0,,Judya Beachez,False,0278,2,0
260,0287_01,Europa,True,,55 Cancri e,39.0,False,0.0,0.0,0.0,0.0,0.0,Tetra Bootty,True,0287,2,0
314,0348_02,Mars,False,,TRAPPIST-1e,36.0,False,520.0,0.0,1865.0,0.0,0.0,Weet Mane,True,0348,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8202,8770_03,Earth,False,,PSO J318.5-22,14.0,False,476.0,14.0,0.0,328.0,8.0,Sony Morgerson,False,8770,7,0
8209,8772_02,Europa,False,,55 Cancri e,53.0,False,0.0,1127.0,0.0,3939.0,,Naosura Motled,False,8772,2,0
8475,9057_01,Europa,False,,55 Cancri e,36.0,True,132.0,3479.0,0.0,3786.0,0.0,Coxan Statch,False,9057,2,0
8485,9069_03,Europa,True,,55 Cancri e,25.0,False,0.0,0.0,0.0,0.0,0.0,Bath Brakeng,True,9069,5,0


In [47]:
train_process_df[train_process_df.GroupId == "0110"]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
103,0110_01,Europa,False,,TRAPPIST-1e,32.0,False,0.0,410.0,6.0,3929.0,764.0,Graviph Aloubtled,False,110,4,0
104,0110_02,Europa,False,B/5/P,TRAPPIST-1e,40.0,False,0.0,331.0,0.0,0.0,1687.0,Aldeba Bootious,False,110,4,0
105,0110_03,Europa,False,B/5/P,TRAPPIST-1e,42.0,False,2209.0,11418.0,0.0,1868.0,445.0,Alaratz Aloubtled,True,110,4,0
106,0110_04,Europa,True,B/5/P,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,0.0,Izark Aloubtled,True,110,4,0


In [48]:
train_process_df.loc[train_process_df.IsAlone==0, "Cabin"] = train_process_df[train_process_df.IsAlone==0].sort_values(by=["GroupId", "Cabin"]).loc[:, "Cabin"].fillna(method="ffill")

In [49]:
train_process_df[train_process_df.GroupId == "0110"]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
103,0110_01,Europa,False,B/5/P,TRAPPIST-1e,32.0,False,0.0,410.0,6.0,3929.0,764.0,Graviph Aloubtled,False,110,4,0
104,0110_02,Europa,False,B/5/P,TRAPPIST-1e,40.0,False,0.0,331.0,0.0,0.0,1687.0,Aldeba Bootious,False,110,4,0
105,0110_03,Europa,False,B/5/P,TRAPPIST-1e,42.0,False,2209.0,11418.0,0.0,1868.0,445.0,Alaratz Aloubtled,True,110,4,0
106,0110_04,Europa,True,B/5/P,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,0.0,Izark Aloubtled,True,110,4,0


In [50]:
train_process_df.Cabin.isnull().sum()

99

In [51]:
train_process_df[train_process_df.Cabin.isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
15,0012_01,Earth,False,,TRAPPIST-1e,31.0,False,32.0,0.0,876.0,0.0,0.0,Justie Pooles,False,0012,1,1
93,0101_01,Mars,True,,TRAPPIST-1e,31.0,False,0.0,0.0,0.0,0.0,0.0,Book Trad,True,0101,1,1
222,0239_01,Mars,False,,TRAPPIST-1e,37.0,False,637.0,0.0,0.0,92.0,319.0,Diedow Resty,False,0239,1,1
272,0301_01,Earth,True,,TRAPPIST-1e,16.0,False,0.0,0.0,0.0,0.0,0.0,Margia Wriggins,False,0301,1,1
280,0310_01,Europa,False,,TRAPPIST-1e,67.0,False,,230.0,0.0,4476.0,241.0,Naviton Coudered,False,0310,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7847,8375_01,Earth,False,,PSO J318.5-22,15.0,False,64.0,0.0,0.0,0.0,640.0,Carlen Valezaley,True,8375,1,1
7899,8437_01,Earth,False,,PSO J318.5-22,36.0,False,52.0,0.0,132.0,334.0,176.0,Anny Tersony,False,8437,1,1
8039,8603_01,Earth,True,,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Hilip Klinson,False,8603,1,1
8066,8623_01,Earth,False,,TRAPPIST-1e,15.0,False,9.0,731.0,85.0,0.0,537.0,Vandy Hoffergess,False,8623,1,1


In [52]:
train_process_df[train_process_df.IsAlone==1]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0001,1,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,0002,1,1
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,0004,1,1
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True,0005,1,1
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True,0007,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8681,9270_01,Earth,True,G/1497/S,55 Cancri e,33.0,False,0.0,0.0,0.0,0.0,0.0,Lan Mckinsond,True,9270,1,1
8684,9274_01,Earth,True,G/1508/P,TRAPPIST-1e,23.0,False,0.0,0.0,0.0,0.0,0.0,Chelsa Bullisey,True,9274,1,1
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,9276,1,1
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,9278,1,1


In [53]:
train_process_df[train_process_df.IsAlone==1].sample(20)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
4571,4873_01,Europa,False,C/191/S,55 Cancri e,35.0,False,0.0,1135.0,0.0,0.0,3880.0,Misa Excialing,True,4873,1,1
2858,3087_01,Earth,,G/505/P,TRAPPIST-1e,35.0,False,0.0,0.0,0.0,0.0,0.0,Chadie Flyncharlan,True,3087,1,1
8119,8670_01,Earth,True,G/1398/S,PSO J318.5-22,16.0,False,0.0,0.0,0.0,0.0,0.0,Evenna Williotters,True,8670,1,1
7501,8024_01,Earth,False,F/1533/S,TRAPPIST-1e,73.0,False,16.0,0.0,722.0,2.0,89.0,Tancis Bushallert,False,8024,1,1
5838,6176_01,Mars,True,F/1275/P,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jetax Kinie,True,6176,1,1
5999,6349_01,Mars,False,F/1314/P,TRAPPIST-1e,18.0,False,593.0,0.0,508.0,0.0,12.0,Lers Frasp,True,6349,1,1
49,0051_01,Earth,False,E/2/S,TRAPPIST-1e,56.0,False,0.0,112.0,0.0,1379.0,127.0,Vivia Johnshines,False,51,1,1
7809,8336_01,Mars,True,E/547/S,TRAPPIST-1e,30.0,False,0.0,0.0,0.0,0.0,0.0,Minnys Sutte,True,8336,1,1
2746,2948_01,Europa,False,C/112/S,TRAPPIST-1e,55.0,False,0.0,1971.0,77.0,577.0,1050.0,Tabiton Prearright,True,2948,1,1
7190,7682_01,Earth,True,G/1248/S,TRAPPIST-1e,53.0,False,0.0,0.0,0.0,0.0,0.0,Denny Solon,False,7682,1,1


In [81]:
train_process_df.Cabin.isnull().sum()

99

#### Cabin Features 

In [135]:
train_process_df["Deck"] = train_process_df.Cabin.str.split("/", expand=True)[0]

In [136]:
train_process_df["Num"] = train_process_df.Cabin.str.split("/", expand=True)[1]

In [137]:
train_process_df["Side"] = train_process_df.Cabin.str.split("/", expand=True)[2]

#### Destination

In [82]:
train_process_df.Destination.isnull().sum()

182

In [91]:
temp_df =train_process_df.groupby(["Destination", "GroupId"]).size().reset_index(name='count')

In [92]:
temp_df

Unnamed: 0,Destination,GroupId,count
0,55 Cancri e,0008,2
1,55 Cancri e,0014,1
2,55 Cancri e,0015,1
3,55 Cancri e,0017,1
4,55 Cancri e,0020,2
...,...,...,...
6875,TRAPPIST-1e,9272,2
6876,TRAPPIST-1e,9274,1
6877,TRAPPIST-1e,9275,3
6878,TRAPPIST-1e,9279,1


In [93]:
temp_df.GroupId.is_unique

False

In [94]:
train_process_df['Destination'].fillna(train_process_df['Destination'].mode()[0], inplace=True)

In [95]:
train_process_df.Destination.isnull().sum()

0

#### Age

In [96]:
train_process_df.Age.isnull().sum()

179

In [97]:
train_process_df['Age'].fillna(train_process_df['Age'].median(), inplace=True)

In [98]:
train_process_df.Age.isnull().sum()

0

#### VIP

In [54]:
train_process_df.VIP.isnull().sum()

203

In [55]:
def df_vip(s):
    if not s is np.nan:
        return 1 if s else 0
    return s

In [56]:
train_process_df["VIP"] = train_process_df["VIP"].apply(lambda x: df_vip(x))

In [57]:
train_process_df.VIP.isnull().sum()

203

In [58]:
train_process_df[~train_process_df.VIP.isnull() & train_process_df.IsAlone==1]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0001,1,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,0.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,0002,1,1
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,0.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,0004,1,1
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,0.0,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True,0005,1,1
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,0.0,0.0,785.0,17.0,216.0,0.0,Andona Beston,True,0007,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8681,9270_01,Earth,True,G/1497/S,55 Cancri e,33.0,0.0,0.0,0.0,0.0,0.0,0.0,Lan Mckinsond,True,9270,1,1
8684,9274_01,Earth,True,G/1508/P,TRAPPIST-1e,23.0,0.0,0.0,0.0,0.0,0.0,0.0,Chelsa Bullisey,True,9274,1,1
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,1.0,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,9276,1,1
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,0.0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,9278,1,1


In [59]:
train_process_df.loc[~train_process_df.VIP.isnull() & train_process_df.IsAlone==1, "VIP"].value_counts()

0.0    4603
1.0      88
Name: VIP, dtype: int64

In [60]:
# conclusion:Most of the solo person are not VIP

In [61]:
#train_process_df.loc[train_process_df.VIP.isnull() & train_process_df.IsAlone==1, "VIP"] = 0.0

In [62]:
train_process_df.VIP.isnull().sum()

203

In [63]:
train_process_df.loc[train_process_df.VIP.isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
38,0036_01,Earth,False,F/8/S,55 Cancri e,15.0,,0.0,492.0,48.0,20.0,401.0,Marina Leodger,False,0036,1,1
102,0108_03,Earth,False,G/19/S,TRAPPIST-1e,0.0,,0.0,0.0,0.0,0.0,0.0,Oline Handertiz,True,0108,3,0
145,0165_01,Mars,True,F/37/P,TRAPPIST-1e,35.0,,0.0,0.0,0.0,0.0,0.0,Graven Anche,True,0165,1,1
228,0244_02,Mars,True,F/47/S,55 Cancri e,14.0,,0.0,0.0,0.0,0.0,0.0,Tous Sad,True,0244,2,0
566,0593_01,Mars,False,D/24/P,TRAPPIST-1e,,,43.0,152.0,182.0,1.0,2005.0,Hon Kra,False,0593,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8494,9074_01,Earth,True,G/1460/S,TRAPPIST-1e,0.0,,0.0,0.0,,0.0,0.0,Adamie Trerady,True,9074,6,0
8512,9081_06,Earth,False,F/1858/P,PSO J318.5-22,16.0,,0.0,0.0,761.0,0.0,0.0,Daryla Clemondsey,False,9081,8,0
8542,9122_01,Earth,True,G/1469/S,55 Cancri e,55.0,,0.0,0.0,0.0,0.0,0.0,Paulas Schmondez,False,9122,1,1
8630,9205_03,Europa,True,B/300/P,TRAPPIST-1e,52.0,,0.0,0.0,0.0,0.0,0.0,Propent Brakeng,True,9205,3,0


In [64]:
# Is there are relation in extravagant passenger and VIP

In [65]:
train_process_df.loc[(
        (train_process_df.RoomService>0) | 
        (train_process_df.FoodCourt>0) | 
        (train_process_df.ShoppingMall>0) |
        (train_process_df.Spa>0) |
        (train_process_df.VRDeck>0)
    )]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,0.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,0002,1,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0003,2,0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0003,2,0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,0.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,0004,1,1
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,0.0,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True,0005,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8687,9275_03,Europa,False,A/97/P,TRAPPIST-1e,30.0,0.0,0.0,3208.0,0.0,2.0,330.0,Atlasym Conable,True,9275,3,0
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,1.0,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,9276,1,1
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,0.0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,9279,1,1
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,0.0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,9280,2,0


In [66]:
train_process_df.loc[(
        (train_process_df.RoomService>0) | 
        (train_process_df.FoodCourt>0) | 
        (train_process_df.ShoppingMall>0) |
        (train_process_df.Spa>0) |
        (train_process_df.VRDeck>0)
    ), "VIP"].value_counts()

0.0    4759
1.0     171
Name: VIP, dtype: int64

In [67]:
## Is their relation with Age

In [72]:
train_process_df.loc[(train_process_df.VIP== 1.0) & (train_process_df.Age < 20.0)]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
3283,3527_01,Mars,False,F/729/P,TRAPPIST-1e,18.0,1.0,733.0,0.0,688.0,1.0,0.0,Cindee Terte,False,3527,1,1
5845,6186_01,Mars,False,E/396/P,TRAPPIST-1e,19.0,1.0,61.0,783.0,110.0,0.0,0.0,Tot Ancy,True,6186,1,1


In [74]:
train_process_df.loc[(train_process_df.VIP.isnull()) & (train_process_df.Age < 15.0)]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
102,0108_03,Earth,False,G/19/S,TRAPPIST-1e,0.0,,0.0,0.0,0.0,0.0,0.0,Oline Handertiz,True,108,3,0
228,0244_02,Mars,True,F/47/S,55 Cancri e,14.0,,0.0,0.0,0.0,0.0,0.0,Tous Sad,True,244,2,0
649,0680_03,Earth,True,G/105/P,TRAPPIST-1e,5.0,,0.0,0.0,0.0,0.0,0.0,Joandy Camerrison,False,680,3,0
1119,1186_02,Earth,False,G/183/S,TRAPPIST-1e,8.0,,0.0,0.0,0.0,0.0,0.0,Elany Browlerson,False,1186,5,0
1752,1865_01,Earth,True,G/292/S,TRAPPIST-1e,4.0,,0.0,0.0,0.0,0.0,0.0,Elanda Roses,False,1865,6,0
1958,2092_03,Mars,False,F/410/S,TRAPPIST-1e,13.0,,6726.0,0.0,1605.0,1266.0,0.0,Cray Stpie,False,2092,5,0
2003,2138_01,Earth,True,G/344/S,55 Cancri e,13.0,,0.0,0.0,0.0,0.0,0.0,Colly Powery,True,2138,1,1
2525,2709_04,Mars,False,F/523/S,TRAPPIST-1e,12.0,,0.0,0.0,0.0,0.0,0.0,Worms Ble,True,2709,4,0
2639,2822_02,Earth,,G/450/S,TRAPPIST-1e,5.0,,0.0,0.0,0.0,0.0,,Salley Harverez,False,2822,5,0
3733,4001_07,Earth,False,G/659/S,TRAPPIST-1e,3.0,,0.0,0.0,0.0,0.0,0.0,Everly Lowelliott,False,4001,7,0


In [75]:
train_process_df.loc[(train_process_df.VIP.isnull()) & (train_process_df.Age < 18.0), "VIP"] = 0.0

In [78]:
train_process_df.loc[(train_process_df.Age > 55.0)]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0003,2,0
28,0024_01,Europa,True,C/2/S,TRAPPIST-1e,62.0,0.0,0.0,0.0,,0.0,0.0,Penton Fullided,True,0024,1,1
49,0051_01,Earth,False,E/2/S,TRAPPIST-1e,56.0,0.0,0.0,112.0,0.0,1379.0,127.0,Vivia Johnshines,False,0051,1,1
57,0062_01,Earth,False,F/13/S,TRAPPIST-1e,62.0,0.0,0.0,592.0,0.0,17.0,25.0,Sterry Greeves,False,0062,1,1
60,0066_01,Earth,False,G/6/P,TRAPPIST-1e,62.0,0.0,1.0,153.0,197.0,0.0,460.0,Diandy Pecketton,False,0066,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8539,9118_02,Earth,False,G/1482/P,TRAPPIST-1e,64.0,0.0,37.0,1585.0,234.0,0.0,18.0,Gwendy Malliamsey,True,9118,2,0
8561,9141_01,Europa,True,B/349/S,TRAPPIST-1e,56.0,0.0,0.0,0.0,0.0,0.0,0.0,Alraid Valing,True,9141,1,1
8633,9211_01,Earth,False,G/1486/S,TRAPPIST-1e,60.0,0.0,0.0,8.0,,360.0,74.0,Stendy Steelerettt,False,9211,1,1
8662,9242_01,Earth,False,F/1891/P,TRAPPIST-1e,62.0,0.0,102.0,696.0,0.0,0.0,0.0,Dary Bakerrison,True,9242,1,1


In [79]:
train_process_df['VIP'].fillna(train_process_df['VIP'].mode()[0], inplace=True)

In [80]:
train_process_df.VIP.isnull().sum()

0

#### RoomService

In [99]:
train_process_df.RoomService.isnull().sum()

181

In [100]:
train_process_df[train_process_df.RoomService.isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
25,0020_05,Earth,True,E/0/S,PSO J318.5-22,1.0,0.0,,0.0,0.0,0.0,0.0,Mael Brantuarez,False,0020,6,0
35,0031_03,Mars,False,F/9/P,TRAPPIST-1e,20.0,0.0,,0.0,1750.0,990.0,0.0,Dontch Datie,True,0031,3,0
83,0091_01,Earth,True,G/16/S,TRAPPIST-1e,26.0,0.0,,0.0,0.0,0.0,0.0,Deanne Yorkland,True,0091,2,0
132,0141_01,Mars,False,F/30/P,TRAPPIST-1e,31.0,0.0,,0.0,97.0,0.0,0.0,Pyrohs Harte,False,0141,1,1
170,0193_02,Mars,False,F/41/P,TRAPPIST-1e,23.0,0.0,,0.0,8.0,1072.0,0.0,Frook Raf,False,0193,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8425,8998_02,Earth,False,E/591/S,TRAPPIST-1e,47.0,0.0,,1.0,0.0,967.0,5.0,Jonaye Englence,False,8998,2,0
8450,9026_01,Earth,True,G/1463/P,TRAPPIST-1e,58.0,0.0,,0.0,0.0,0.0,0.0,Mathy Boyers,True,9026,1,1
8525,9101_01,Earth,False,F/1865/P,TRAPPIST-1e,21.0,0.0,,0.0,496.0,430.0,0.0,Gera Frazie,False,9101,1,1
8534,9112_01,Mars,False,D/290/P,TRAPPIST-1e,28.0,0.0,,0.0,0.0,0.0,0.0,Wealke Brin,False,9112,1,1


In [101]:
train_process_df["RoomService"].fillna(train_process_df["RoomService"].median(), inplace=True)

In [102]:
train_process_df.RoomService.isnull().sum()

0

#### FoodCourt 

In [103]:
train_process_df.FoodCourt.isnull().sum()

183

In [104]:
train_process_df[train_process_df.FoodCourt.isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupSize,IsAlone
95,0103_01,Earth,False,F/24/S,TRAPPIST-1e,24.0,0.0,0.0,,0.0,0.0,17.0,Arlen Mclainez,True,0103,3,0
218,0232_01,Earth,True,G/36/S,PSO J318.5-22,27.0,0.0,0.0,,0.0,0.0,0.0,Nica Bakerrison,False,0232,1,1
367,0405_01,Earth,False,F/89/P,55 Cancri e,17.0,0.0,177.0,,0.0,310.0,7.0,Denis Flowensley,False,0405,1,1
385,0427_01,Europa,False,E/26/P,TRAPPIST-1e,31.0,0.0,417.0,,7185.0,562.0,792.0,Algor Efulows,True,0427,1,1
456,0498_01,Europa,True,B/18/S,55 Cancri e,27.0,0.0,0.0,,0.0,0.0,0.0,Batomam Preent,True,0498,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8396,8975_02,Earth,False,F/1840/P,TRAPPIST-1e,27.0,0.0,1.0,,0.0,20.0,948.0,Inerry Goffey,False,8975,2,0
8422,8997_01,Earth,True,F/1742/S,55 Cancri e,15.0,0.0,0.0,,0.0,0.0,0.0,Dont Guerson,True,8997,2,0
8460,9034_04,Europa,False,D/288/P,TRAPPIST-1e,60.0,0.0,0.0,,947.0,668.0,1.0,,True,9034,5,0
8537,9116_01,Earth,False,F/1868/P,TRAPPIST-1e,26.0,0.0,0.0,,0.0,1.0,48.0,Candra Franton,True,9116,1,1


In [106]:
train_process_df["FoodCourt"].fillna(train_process_df["FoodCourt"].median(), inplace=True)

In [105]:
train_process_df["ShoppingMall"].fillna(train_process_df["ShoppingMall"].median(), inplace=True)

In [107]:
train_process_df["Spa"].fillna(train_process_df["Spa"].median(), inplace=True)

In [108]:
train_process_df["VRDeck"].fillna(train_process_df["VRDeck"].median(), inplace=True)

#### Name

In [110]:
train_process_df.Name.isnull().sum()

200

In [114]:
import random
import string

In [115]:
letters = string.ascii_lowercase

In [125]:
def df_random_name(s):
    if s is np.nan:
        return"".join(random.choices(letters, k=10))
    else:
        return s

In [128]:
train_process_df["Name"] = train_process_df.Name.apply(lambda x: df_random_name(x))

In [129]:
train_process_df.Name.isnull().sum()

0

#### Dataset

In [138]:
train_process_df.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
GroupId         0
GroupSize       0
IsAlone         0
Deck            0
Num             0
Side            0
dtype: int64

In [139]:
train_process_df.dropna(inplace=True)

### Data Transformation

#### Transported


In [8]:
train_process_df["Transported"]

0       False
1        True
2       False
3       False
4        True
        ...  
8688    False
8689    False
8690     True
8691    False
8692     True
Name: Transported, Length: 8693, dtype: bool

In [9]:
train_process_df["Transported"] = train_process_df["Transported"].apply(lambda x: 1 if x else 0)

In [10]:
train_process_df["Transported"]

0       0
1       1
2       0
3       0
4       1
       ..
8688    0
8689    0
8690    1
8691    0
8692    1
Name: Transported, Length: 8693, dtype: int64

In [188]:
train_process_df["VIP"] = train_process_df["VIP"].apply(lambda x: 1 if x else 0)

### Data Reduction

In [133]:
# Not needed

In [140]:
train_process_df.to_csv("../intermediate_storage/spaceship_titanic_preprocessed.csv")