In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
import sys
import os
# add a directory to the module search path: the current working directory
# with its last 10 characters removed (presumably the notebook folder's name,
# so that the project-local `utils` package can be imported -- TODO confirm)
sys.path.append(os.getcwd()[:-10])
# utility functions
from utils.utils import format_as_per_convention # type: ignore[import]

<br>
<br>
<br>

### Data collection

In [3]:
# loading data
# (the path is relative to the notebook's working directory)
data = pd.read_csv("../data/train.csv")
# preview the first rows
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
# dataset general info
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [13]:
# shape of dataset
data.shape

(8693, 14)

In [5]:
# creating new column names
# as per python naming conventions
new_column_names = format_as_per_convention(data.columns)
# manually override two names by position; per the data.info() listing,
# position 6 is "VIP" and position 11 is "VRDeck"
# (presumably the helper does not yield the desired names for these -- TODO confirm)
new_column_names[6] = "vip"
new_column_names[11] = "vr_deck"
new_column_names

['passenger_id',
 'home_planet',
 'cryo_sleep',
 'cabin',
 'destination',
 'age',
 'vip',
 'room_service',
 'food_court',
 'shopping_mall',
 'spa',
 'vr_deck',
 'name',
 'transported']

In [6]:
# updating dataset column names
# (assigns the new names onto `data` itself, so the frame is changed in place)
data.columns = new_column_names
data.head()

Unnamed: 0,passenger_id,home_planet,cryo_sleep,cabin,destination,age,vip,room_service,food_court,shopping_mall,spa,vr_deck,name,transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


<br>
<br>
<br>

### Feature inspection

passenger_id

In [7]:
# pull the passenger_id column out as its own series and preview it
passenger_id = data.passenger_id
passenger_id.head()

0    0001_01
1    0002_01
2    0003_01
3    0003_02
4    0004_01
Name: passenger_id, dtype: object

In [8]:
# checking if null values are present
passenger_id.isna().any()

False

The format of passenger_id is **gggg_pp**, where
- gggg -> group number
- pp -> passenger number in that group.

In [9]:
# extracting group and number_in_group into separate features
# ids look like "gggg_pp": the part before "_" is the group,
# the part after it is the passenger's number within that group
# (split once with the vectorized .str accessor instead of a per-row lambda)
id_parts = passenger_id.str.split("_")
group: pd.Series = id_parts.str[0].astype("int64")
number_in_group: pd.Series = id_parts.str[1].astype("int64")

print(group.head())
print(number_in_group.head())

0    1
1    2
2    3
3    3
4    4
Name: passenger_id, dtype: int64
0    1
1    1
2    1
3    2
4    1
Name: passenger_id, dtype: int64


<br>

home_planet

In [10]:
# pull the home_planet column out as its own series and preview it
home_planet = data.home_planet
home_planet.head()

0    Europa
1     Earth
2    Europa
3    Europa
4     Earth
Name: home_planet, dtype: object

In [11]:
# count missing (True) vs present (False) home_planet entries
home_planet.isna().value_counts()

False    8492
True      201
Name: home_planet, dtype: int64

In [12]:
# frequency of each home planet (missing values are not counted)
home_planet.value_counts()

Earth     4602
Europa    2131
Mars      1759
Name: home_planet, dtype: int64

There are 201 missing values in home_planet<br>
One way to impute these values is to use the **group** feature<br>
<br>
Let's assume that people in the same group belong to the same home planet<br>
So, we can impute missing values by looking at the home_planet of the other group members

Now let's check whether the above assumption holds

In [14]:
# combining group and home planet into one dataframe
group_home_planet = pd.DataFrame({"group": group, "home_planet": home_planet})
group_home_planet.head()

Unnamed: 0,group,home_planet
0,1,Europa
1,2,Earth
2,3,Europa
3,3,Europa
4,4,Earth


In [28]:
# for each group, count the distinct non-missing home_planet values,
# then tally how many groups share each distinct-count
unique_planets_per_group = group_home_planet.groupby("group")["home_planet"].nunique()
unique_planets_per_group.value_counts()

1    6107
0     110
Name: home_planet, dtype: int64

As we can see, every group has at most 1 unique home planet<br>
That means all group members with a known home planet are from the same home planet<br>
<br>
(0 unique home planet indicates that the value is missing)

In [55]:
# holds home_planet for every group
# (the first non-missing home_planet among the group's rows, in row order;
# a group whose rows all lack a home_planet gets no entry)
group_home_planet_dict: dict[int, str] = (
    group_home_planet
    # drop missing values via pandas' NA detection rather than `is np.nan`:
    # an identity check only matches the np.nan singleton and silently
    # misses other NaN / None objects
    .dropna(subset=["home_planet"])
    # keep only the first remaining row of every group
    .drop_duplicates(subset="group", keep="first")
    .set_index("group")["home_planet"]
    .to_dict()
)

In [69]:
# holds updated home_planet
# filling missing values: a missing home_planet is replaced by the value
# recorded for the row's group in group_home_planet_dict; if the group has no
# entry there, the value stays missing (NaN)
home_planet_filled: list[str] = (
    group_home_planet["home_planet"]
    # .map() yields NaN for groups absent from the dict, so those rows are
    # left untouched by fillna
    .fillna(group_home_planet["group"].map(group_home_planet_dict))
    .tolist()
)

In [70]:
# converting to series
# (rebinds the same name: home_planet_filled was a list, from here on it is a Series)
home_planet_filled = pd.Series(home_planet_filled)

In [71]:
# count entries that are still missing (True) after the group-based fill
home_planet_filled.isna().value_counts()

False    8582
True      111
dtype: int64

There are still some missing values,<br>
these belong to passengers whose group has no member with a known home planet (for example, passengers travelling alone)<br>
<br>
Filling them with mode will be a good choice

In [72]:
# filling remaining values with mode
# (the mode is computed from home_planet, not from the partially filled series)
mode = home_planet.mode()[0]
# assign the result instead of using inplace=True, so the fill is explicit
# and the statement can be chained or re-run safely
home_planet_filled = home_planet_filled.fillna(value=mode)

In [73]:
# updating home planet
# (rebinds the name to the imputed series; data.home_planet itself is not modified here)
home_planet = home_planet_filled

In [74]:
# confirm that no missing values remain in home_planet
home_planet.isna().any()

False

<br>

cryo_sleep

In [75]:
# pull the cryo_sleep column out as its own series and preview it
cryo_sleep = data.cryo_sleep
cryo_sleep.head()

0    False
1    False
2    False
3    False
4    False
Name: cryo_sleep, dtype: object

In [76]:
# frequency of each cryo_sleep value (missing values are not counted)
cryo_sleep.value_counts()

False    5439
True     3037
Name: cryo_sleep, dtype: int64

In [77]:
# count missing (True) vs present (False) cryo_sleep entries
cryo_sleep.isna().value_counts()

False    8476
True      217
Name: cryo_sleep, dtype: int64

In [83]:
def encode_cryo_sleep(s):
    """Encode a single cryo_sleep value as a number.

    Parameters
    ----------
    s : object
        One value taken from the cryo_sleep column.

    Returns
    -------
    int or float
        1 for a boolean True, 0 for a boolean False, and np.nan for
        anything else (e.g. a missing value).
    """
    # accept numpy booleans as well as the built-in bool; identity checks
    # such as `s is True` only match the built-in singletons
    if isinstance(s, (bool, np.bool_)):
        return int(s)
    # explicit return: a bare `np.nan` expression statement would make the
    # function fall through and return None instead of NaN
    return np.nan

In [85]:
# encode every entry; the encoder is handed to apply directly
cryo_sleep_encoded = cryo_sleep.apply(encode_cryo_sleep)
cryo_sleep_encoded.head(15)

0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     1.0
8     0.0
9     1.0
10    1.0
11    0.0
12    0.0
13    0.0
14    0.0
Name: cryo_sleep, dtype: float64