### Spaceship Titanic

Solution to the problem Spaceship Titanic using Gradient Boosting

https://www.kaggle.com/competitions/spaceship-titanic/overview

Reading train and test data

In [1]:
import pandas as pd
import numpy as np
data = pd.read_csv('train.csv')

In [2]:
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
len_data = len(data)

In [4]:
test_data = pd.read_csv('test.csv')

In [5]:
data = pd.concat([data, test_data], axis = 0)

Data preprocessing

In [6]:
num = ['RoomService', 'Age', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
cat = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name']
target = 'Transported'

In [7]:
data[num] = data[num].fillna(-1)
data[cat] = data[cat].fillna('empty')

In [8]:
data.isna().sum()

PassengerId        0
HomePlanet         0
CryoSleep          0
Cabin              0
Destination        0
Age                0
VIP                0
RoomService        0
FoodCourt          0
ShoppingMall       0
Spa                0
VRDeck             0
Name               0
Transported     4277
dtype: int64

All Nan values are replaced (except target values for test data)

In [9]:
data.isnull().any().to_frame()

Unnamed: 0,0
PassengerId,False
HomePlanet,False
CryoSleep,False
Cabin,False
Destination,False
Age,False
VIP,False
RoomService,False
FoodCourt,False
ShoppingMall,False


In [10]:
dummies = pd.get_dummies(data[cat])
new_data = pd.concat([data[num], dummies], axis = 1)

GB model

In [11]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(n_estimators=50, max_depth=5)

In [12]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(new_data[:][:len_data], data[target][:len_data].astype(np.int64))

In [13]:
model.fit(x_train, y_train)

GradientBoostingClassifier(max_depth=5, n_estimators=50)

In [17]:
model.score(x_train, y_train)

0.8211382113821138

In [18]:
model.score(x_test, y_test)

0.7953081876724931

CSV file for Kaggle submission

In [19]:
result = pd.DataFrame(columns = ['Transported'], index = (data['PassengerId'][len_data:]))
result['Transported'] = model.predict(new_data[:][len_data:]).astype(bool)
result.to_csv('submission.csv')