In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the training and test tables from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [3]:
# Preview five randomly chosen rows of the training frame.
# No random_state is passed, so a re-run shows different rows.
train_df.sample(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
6746,7117_01,Europa,True,B/236/P,55 Cancri e,28.0,False,0.0,0.0,0.0,0.0,0.0,Alasmon Secont,True
4349,4632_02,Europa,True,C/176/S,TRAPPIST-1e,68.0,False,0.0,0.0,0.0,0.0,0.0,Azhasim Dightent,True
6598,6967_01,Europa,False,C/256/S,55 Cancri e,37.0,False,0.0,9965.0,0.0,5697.0,676.0,Winon Oilpfulle,False
4897,5220_01,Mars,False,D/165/P,TRAPPIST-1e,21.0,False,2206.0,0.0,353.0,0.0,19.0,Jackok Cooki,False
2399,2581_01,Earth,False,E/156/P,TRAPPIST-1e,20.0,False,496.0,20.0,16.0,0.0,254.0,Eulah Conney,False


In [4]:
# (row count, column count) of the training frame.
train_df.shape

(8693, 14)

## Data cleaning

In [5]:
# Count the missing values in each column of the training frame.
train_df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [6]:
# Spending columns: a missing amount is treated as "nothing spent" (0.0).
services_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Apply the identical zero-fill to both frames; column assignment updates each
# frame object in place, so train_df and test_df keep their bindings.
for _frame in (train_df, test_df):
    _frame[services_cols] = _frame[services_cols].fillna(0.0)

In [7]:
# Impute missing ages with the mean age computed on the training frame.
# The same training statistic is used for the test frame so that both frames
# go through an identical transformation; imputing the test frame with its own
# mean would make the preprocessing depend on the data being scored.
train_age_mean = train_df['Age'].mean()

# Still computed so that any later cell referring to this name keeps working
# and so the two means can be compared; it is intentionally NOT used for filling.
test_age_mean = test_df['Age'].mean()

train_df['Age'] = train_df['Age'].fillna(train_age_mean)
test_df['Age'] = test_df['Age'].fillna(train_age_mean)

In [8]:
# Fill every remaining gap (the columns not handled by the zero-fill and Age
# imputation above) by copying the next valid value in row order upward.
# `fillna(method='bfill')` is deprecated in pandas 2.1+; `.bfill()` is the
# direct replacement. A backward fill cannot reach NaNs in the last rows of a
# column (there is no later value to copy), so `.ffill()` follows as a fallback.
train_df = train_df.bfill().ffill()
test_df = test_df.bfill().ffill()

In [9]:
# Re-count missing values per column in the training frame after the fills.
train_df.isna().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
dtype: int64

In [10]:
# Re-count missing values per column in the test frame after the fills.
test_df.isna().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
dtype: int64

In [11]:
# Remove the Name column from both frames; each drop returns a new frame
# that is bound back to the same variable.
train_df = train_df.drop(columns=['Name'])
test_df = test_df.drop(columns=['Name'])

## Data preprocessing

In [16]:
# Preview five random rows of the training frame after cleaning.
# No random_state is passed, so a re-run shows different rows.
train_df.sample(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
4751,5068_01,Earth,False,E/314/P,TRAPPIST-1e,19.0,False,0.0,0.0,0.0,0.0,0.0,True
6924,7342_01,Earth,False,G/1178/P,TRAPPIST-1e,50.0,False,0.0,185.0,0.0,1.0,558.0,True
3821,4083_01,Europa,False,C/146/S,TRAPPIST-1e,21.0,False,0.0,6190.0,1828.0,7.0,35.0,True
8541,9120_01,Earth,False,F/1763/S,TRAPPIST-1e,13.0,False,101.0,12.0,184.0,780.0,0.0,False
4837,5161_01,Mars,True,D/156/S,55 Cancri e,41.0,False,0.0,0.0,0.0,0.0,0.0,True


In [21]:
# Print each column's dtype and non-null count for the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8693 non-null   bool   
 3   Cabin         8693 non-null   object 
 4   Destination   8693 non-null   object 
 5   Age           8693 non-null   float64
 6   VIP           8693 non-null   bool   
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8693 non-null   float64
 11  VRDeck        8693 non-null   float64
 12  Transported   8693 non-null   bool   
dtypes: bool(3), float64(6), object(4)
memory usage: 704.7+ KB


In [12]:
# Frequency of each distinct Destination value in the training frame.
train_df["Destination"].value_counts()

TRAPPIST-1e      6041
55 Cancri e      1836
PSO J318.5-22     816
Name: Destination, dtype: int64