In [70]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [125]:
TRAIN_PATH = '/content/drive/MyDrive/Spaceship-titanic/train.csv'
TEST_PATH = '/content/drive/MyDrive/Spaceship-titanic/test.csv'
TARGET_CLASS = 'Transported'

In [161]:
#functions

def missing_value(df):
      missing_values = df.isna().sum().to_frame()
      missing_values = missing_values.rename(columns= {0: 'missing_values'})
      missing_values['% of total'] = (missing_values['missing_values'] / df.shape[0]).round(2)*100
      return missing_values

In [162]:
df = pd.read_csv(TRAIN_PATH)

In [163]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [164]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [165]:
#missing column
missing = missing_value(df)
missing

Unnamed: 0,missing_values,% of total
PassengerId,0,0.0
HomePlanet,201,2.0
CryoSleep,217,2.0
Cabin,199,2.0
Destination,182,2.0
Age,179,2.0
VIP,203,2.0
RoomService,181,2.0
FoodCourt,183,2.0
ShoppingMall,208,2.0


In [166]:
px.bar(x=missing.index, y='missing_values', data_frame=missing)

In [167]:
#replace null values in the dataframe

for col in df.columns:
  if df[str(col)].dtypes == 'object':
    df[str(col)].fillna(df[str(col)].mode()[0], inplace=True)
  elif df[str(col)].dtypes == 'float64':
    df[str(col)].fillna(df[str(col)].mean().round(0), inplace=True)



In [168]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8693 non-null   bool   
 3   Cabin         8693 non-null   object 
 4   Destination   8693 non-null   object 
 5   Age           8693 non-null   float64
 6   VIP           8693 non-null   bool   
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8693 non-null   float64
 11  VRDeck        8693 non-null   float64
 12  Name          8693 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(3), float64(6), object(5)
memory usage: 772.6+ KB


In [169]:
#create categorical column for age

map_age = {'Children': range(0,13),
           'Teenagers': range(13,18),
           'young Adult': range(18,30),
           'Middle Age': range(30,50),
           'Elderly' : range(50,100)
           }

df['Age_category'] = df['Age'].apply(lambda x:next((k for k, v in map_age.items() if x in v), 'unknown'))

Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

In [170]:
df['Cabin'].value_counts

<bound method IndexOpsMixin.value_counts of 0          B/0/P
1          F/0/S
2          A/0/S
3          A/0/S
4          F/1/S
          ...   
8688      A/98/P
8689    G/1499/S
8690    G/1500/S
8691     E/608/S
8692     E/608/S
Name: Cabin, Length: 8693, dtype: object>

In [171]:
df[['Deck', 'Cabin_num', 'side']] = df['Cabin'].str.split('/', expand=True)

In [172]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Age_category,Deck,Cabin_num,side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,Middle Age,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,young Adult,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,Elderly,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,Middle Age,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,Teenagers,F,1,S


In [173]:
df['Age'].value_counts()

29.0    409
24.0    324
18.0    320
21.0    311
19.0    293
       ... 
72.0      4
78.0      3
79.0      3
76.0      2
77.0      2
Name: Age, Length: 80, dtype: int64

In [174]:
drop_data = []
for col in df.columns:
  if len(df[str(col)].unique())/len(df[str(col)]) > 0.3:
    drop_data.append(col)

In [175]:
drop_data.append('Age')

In [176]:
df_clean = df.drop(columns=drop_data, axis=1)
df_clean.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Age_category,Deck,Cabin_num,side
0,Europa,False,TRAPPIST-1e,False,0.0,0.0,0.0,0.0,0.0,False,Middle Age,B,0,P
1,Earth,False,TRAPPIST-1e,False,109.0,9.0,25.0,549.0,44.0,True,young Adult,F,0,S
2,Europa,False,TRAPPIST-1e,True,43.0,3576.0,0.0,6715.0,49.0,False,Elderly,A,0,S
3,Europa,False,TRAPPIST-1e,False,0.0,1283.0,371.0,3329.0,193.0,False,Middle Age,A,0,S
4,Earth,False,TRAPPIST-1e,False,303.0,70.0,151.0,565.0,2.0,True,Teenagers,F,1,S


In [18]:
for col in df.columns:
  fig = px.histogram(df, x=str(col))
  fig.show()