In [8]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [9]:
import sys
import os
# Make the project root importable so that the `utils` package can be found.
# The root is taken to be the parent of the current working directory (the same
# layout the "../data/train.csv" path below relies on) instead of trimming a
# fixed number of characters off the cwd string, which only works when the
# notebook folder name has one particular length.
# NOTE(review): assumes `utils/` lives directly in the parent folder - confirm.
project_root = os.path.dirname(os.getcwd())
if project_root not in sys.path:  # re-running this cell must not add duplicates
    sys.path.append(project_root)
# utility functions
from utils.utils import format_as_per_convention # type: ignore[import]

<br>
<br>
<br>

### Data collection

In [17]:
# read the training split into a DataFrame and preview its first rows
train_csv = "../data/train.csv"
data = pd.read_csv(train_csv)
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [12]:
# dataset general info: per-column non-null counts and dtypes, plus memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [13]:
# shape of dataset as (number of rows, number of columns)
data.shape

(8693, 14)

In [18]:
# creating new column names
# as per python naming conventions
# Columns whose final name is set by hand instead of being taken from the
# helper's output. Keys are the column name with underscores removed and
# lower-cased, so the lookup does not depend on column position and still
# matches if this cell is re-run after the columns have already been renamed.
special_column_names = {"vip": "vip", "vrdeck": "vr_deck"}

new_column_names = [
    special_column_names.get(str(original).replace("_", "").lower(), formatted)
    for original, formatted in zip(data.columns, format_as_per_convention(data.columns))
]
new_column_names

['passenger_id',
 'home_planet',
 'cryo_sleep',
 'cabin',
 'destination',
 'age',
 'vip',
 'room_service',
 'food_court',
 'shopping_mall',
 'spa',
 'vr_deck',
 'name',
 'transported']

In [19]:
# updating dataset column names
# NOTE: this assignment mutates `data` in place, so `data.columns` holds the
# new names from here on (also when earlier cells are re-run afterwards)
data.columns = new_column_names
data.head()

Unnamed: 0,passenger_id,home_planet,cryo_sleep,cabin,destination,age,vip,room_service,food_court,shopping_mall,spa,vr_deck,name,transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


<br>
<br>
<br>

### Feature inspection

passenger_id

In [20]:
# keep a handle on the passenger_id column and preview its first values
passenger_id = data["passenger_id"]
passenger_id.head()

0    0001_01
1    0002_01
2    0003_01
3    0003_02
4    0004_01
Name: passenger_id, dtype: object

In [23]:
# checking if null values are present
# (evaluates to True if at least one passenger_id is missing)
passenger_id.isna().any()

False

The format of passenger_id is **gggg_pp**, where
- gggg -> group number
- pp -> passenger number in that group.

In [26]:
# extracting group and number_in_group into separate features
# Split every "gggg_pp" id once with the vectorised string accessor, then turn
# each part into an integer (leading zeros are dropped, e.g. "0003" -> 3).
# Both results keep the source Series' name and index.
id_parts = passenger_id.str.split("_")
group: pd.Series = id_parts.str[0].astype("int64")
number_in_group: pd.Series = id_parts.str[1].astype("int64")

print(group.head())
print(number_in_group.head())

0    1
1    2
2    3
3    3
4    4
Name: passenger_id, dtype: int64
0    1
1    1
2    1
3    2
4    1
Name: passenger_id, dtype: int64


<br>

home_planet