In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file found.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic-passenger-data/README.md
/kaggle/input/spaceship-titanic-passenger-data/data/sample_submission.csv
/kaggle/input/spaceship-titanic-passenger-data/data/train.csv
/kaggle/input/spaceship-titanic-passenger-data/data/test.csv


<h3>Import Libraries</h3>

<h3>Feature Selection</h3>

In [2]:
# Load the training CSV into a DataFrame.
train_csv_path = "/kaggle/input/spaceship-titanic-passenger-data/data/train.csv"
train_data = pd.read_csv(train_csv_path)

In [3]:
# Preview the first rows to check that the columns loaded as expected.
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
# Summarise each column's dtype and non-null count to spot missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


<h5>Several features contain missing (null) values, so we need to handle them. At most about 2.5% of the 8693 entries are missing for any single feature (CryoSleep has 217 nulls), so it is better to impute (fill in) the missing values than to drop the affected rows.</h5>

<h3>Missing Value Handling</h3>

In [5]:
# Report the distinct values (NaN included) of every column that has at
# least one missing entry, followed by how many distinct values there are.
columns_with_nulls = [
    name for name in train_data.columns if train_data[name].isnull().any()
]
for column in columns_with_nulls:
    unique_vals = train_data[column].unique()
    summary_lines = (
        f"{column}: {unique_vals}",
        f"Length of Unique Values: {len(unique_vals)}",
    )
    # One line per entry, then a blank line separating columns.
    print(*summary_lines, sep="\n", end="\n\n")

HomePlanet: ['Europa' 'Earth' 'Mars' nan]
Length of Unique Values: 4

CryoSleep: [False True nan]
Length of Unique Values: 3

Cabin: ['B/0/P' 'F/0/S' 'A/0/S' ... 'G/1499/S' 'G/1500/S' 'E/608/S']
Length of Unique Values: 6561

Destination: ['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e' nan]
Length of Unique Values: 4

Age: [39. 24. 58. 33. 16. 44. 26. 28. 35. 14. 34. 45. 32. 48. 31. 27.  0.  1.
 49. 29. 10.  7. 21. 62. 15. 43. 47.  2. 20. 23. 30. 17. 55.  4. 19. 56.
 nan 25. 38. 36. 22. 18. 42. 37. 13.  8. 40.  3. 54.  9.  6. 64. 67. 61.
 50. 41. 57. 11. 52. 51. 46. 60. 63. 59.  5. 79. 68. 74. 12. 53. 65. 71.
 75. 70. 76. 78. 73. 66. 69. 72. 77.]
Length of Unique Values: 81

VIP: [False True nan]
Length of Unique Values: 3

RoomService: [   0.  109.   43. ... 1569. 8586.  745.]
Length of Unique Values: 1274

FoodCourt: [   0.    9. 3576. ... 3208. 6819. 4688.]
Length of Unique Values: 1508

ShoppingMall: [   0.   25.  371. ... 1085.  510. 1872.]
Length of Unique Values: 1116

Spa: [   0. 

<h5>First, I want to split the 'Cabin' column (formatted as deck/num/side) into three separate columns — 'Deck', 'Num', and 'Side' — so that each part can be processed on its own.</h5>

In [6]:
def split_cabin(frame, cabin_col='Cabin', missing_label='Unknown'):
    """Return a copy of ``frame`` with the cabin column split into Deck/Num/Side.

    Cabin values have the form ``deck/num/side`` (e.g. ``'B/0/P'``). The
    cabin column is dropped and three string columns -- ``'Deck'``, ``'Num'``
    and ``'Side'`` -- are appended at the end, in that order.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing ``cabin_col``. It is not modified.
    cabin_col : str, default 'Cabin'
        Name of the column holding the ``deck/num/side`` strings.
    missing_label : str, default 'Unknown'
        Placeholder written to all three new columns when the cabin is
        missing (and to any part that is absent from a malformed value).

    Returns
    -------
    pandas.DataFrame
        New frame without ``cabin_col`` and with ``Deck``, ``Num``, ``Side``.

    Notes
    -----
    ``Num`` is kept as a string column because it shares the placeholder
    with the other two parts. A value with fewer than three parts gets the
    placeholder for the absent ones, and anything after the second ``/``
    ends up in ``Side``; no ``ValueError`` is raised for such values.
    """
    # Vectorized split: n=2 caps the result at three pieces, and reindex
    # guarantees columns 0..2 exist even when every cabin is missing.
    parts = (
        frame[cabin_col]
        .str.split('/', n=2, expand=True)
        .reindex(columns=range(3))
        .fillna(missing_label)
    )
    # Assign by position (NumPy arrays) so a non-unique index cannot cause
    # misalignment or row duplication.
    return frame.drop(cabin_col, axis=1).assign(
        Deck=parts[0].to_numpy(),
        Num=parts[1].to_numpy(),
        Side=parts[2].to_numpy(),
    )


# Separate the passenger's location details (deck, num, side) into their
# own columns; missing cabins become 'Unknown' in all three.
train_data_new = split_cabin(train_data)

In [7]:
# Preview the frame to confirm Cabin was replaced by Deck, Num and Side.
train_data_new.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S


In [8]:
# Re-check dtypes and non-null counts after the Cabin split.
train_data_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Name          8493 non-null   object 
 12  Transported   8693 non-null   bool   
 13  Deck          8693 non-null   object 
 14  Num           8693 non-null   object 
 15  Side          8693 non-null   object 
dtypes: bool(1), float64(6), object(9)
memory usage: 1.0+ MB


<h5>For object-typed (string) features, we will replace the null values with the string 'Unknown'. For the numeric features (Age, RoomService, FoodCourt, ShoppingMall, Spa, and VRDeck), we will replace the null values with the median of the respective column.</h5>

In [9]:
# Impute every column that still has nulls: object columns get the
# placeholder string, float64 columns get their own median. Columns of any
# other dtype are left untouched.
# NOTE(review): CryoSleep and VIP are object columns holding True/False, so
# after this step they mix booleans with the string 'Unknown'.
columns_with_gaps = [
    name for name in train_data_new.columns
    if train_data_new[name].isnull().any()
]
for column in columns_with_gaps:
    series = train_data_new[column]
    if series.dtype == 'object':
        filler = 'Unknown'
    elif series.dtype == 'float64':
        filler = series.median()
    else:
        continue
    train_data_new[column] = series.fillna(filler)

In [10]:
# Check the non-null counts again after imputation.
train_data_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8693 non-null   object 
 3   Destination   8693 non-null   object 
 4   Age           8693 non-null   float64
 5   VIP           8693 non-null   object 
 6   RoomService   8693 non-null   float64
 7   FoodCourt     8693 non-null   float64
 8   ShoppingMall  8693 non-null   float64
 9   Spa           8693 non-null   float64
 10  VRDeck        8693 non-null   float64
 11  Name          8693 non-null   object 
 12  Transported   8693 non-null   bool   
 13  Deck          8693 non-null   object 
 14  Num           8693 non-null   object 
 15  Side          8693 non-null   object 
dtypes: bool(1), float64(6), object(9)
memory usage: 1.0+ MB
