In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the labelled training set and the unlabelled test set from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
# Peek at five random rows. No random_state is passed, so the rows shown differ on every run.
train_df.sample(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
5250,5602_01,Europa,True,B/190/P,TRAPPIST-1e,35.0,False,0.0,0.0,0.0,0.0,0.0,Castab Sedece,True
6399,6760_01,Mars,False,F/1404/P,PSO J318.5-22,30.0,False,51.0,0.0,1373.0,0.0,5.0,Shex Pri,True
5816,6153_01,Europa,True,B/236/S,55 Cancri e,41.0,False,0.0,0.0,0.0,0.0,0.0,Weionan Genuild,True
2681,2871_02,Europa,True,B/88/P,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Sarthim Unregul,True
586,0609_01,Mars,False,F/128/P,TRAPPIST-1e,33.0,False,149.0,0.0,,0.0,109.0,Guteda Sacle,False


In [4]:
# (rows, columns) of the training frame as loaded.
train_df.shape

(8693, 14)

## Data cleaning

In [5]:
# Missing values per column before any imputation.
train_df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [6]:
# Spending columns: a missing amount is treated as "nothing spent" (0.0)
# in both the training and the test frame.
services_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for frame in (train_df, test_df):
    frame[services_cols] = frame[services_cols].fillna(0.0)

In [7]:
# Impute missing ages with the mean age of the TRAINING set.
# The same statistic is applied to the test set so that both frames go through
# an identical transformation and nothing is estimated from the test data.
train_age_mean = train_df['Age'].mean()

train_df['Age'] = train_df['Age'].fillna(train_age_mean)
test_df['Age'] = test_df['Age'].fillna(train_age_mean)

In [8]:
# Fill every remaining gap from the next valid row (back-fill).
# `fillna(method='bfill')` is deprecated since pandas 2.1, so call `.bfill()` directly.
# A back-fill cannot fill NaNs in the trailing rows (there is no later value),
# so a forward-fill pass follows to guarantee no NaN is left behind.
train_df = train_df.bfill().ffill()
test_df = test_df.bfill().ffill()

In [9]:
# Re-check the training frame: every count should now be zero.
train_df.isna().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
dtype: int64

In [10]:
# Same check for the test frame.
test_df.isna().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
dtype: int64

In [11]:
# Columns removed before modelling; the original frames are left intact
# (test_df['PassengerId'] is still needed when building the submission).
col_to_drop = ['PassengerId', 'Cabin', 'Name']

n_train_df, n_test_df = (
    frame.drop(columns=col_to_drop) for frame in (train_df, test_df)
)

## Data preprocessing

In [12]:
# Five random rows of the reduced frame (unseeded, so the rows change between runs).
n_train_df.sample(5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
3239,Earth,True,TRAPPIST-1e,25.0,False,0.0,0.0,0.0,0.0,0.0,False
4379,Europa,False,TRAPPIST-1e,38.0,False,0.0,93.0,0.0,0.0,7303.0,False
6259,Earth,True,PSO J318.5-22,34.0,False,0.0,0.0,0.0,0.0,0.0,True
3183,Earth,False,PSO J318.5-22,23.0,False,381.0,0.0,137.0,104.0,8.0,False
7483,Earth,False,TRAPPIST-1e,22.0,False,0.0,1145.0,0.0,0.0,371.0,False


In [13]:
# Column dtypes and non-null counts, to see which columns still need encoding.
n_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8693 non-null   object 
 1   CryoSleep     8693 non-null   bool   
 2   Destination   8693 non-null   object 
 3   Age           8693 non-null   float64
 4   VIP           8693 non-null   bool   
 5   RoomService   8693 non-null   float64
 6   FoodCourt     8693 non-null   float64
 7   ShoppingMall  8693 non-null   float64
 8   Spa           8693 non-null   float64
 9   VRDeck        8693 non-null   float64
 10  Transported   8693 non-null   bool   
dtypes: bool(3), float64(6), object(2)
memory usage: 568.9+ KB


### Dummy variables

In [14]:
# Encode the True/False columns as 1/0.
# `astype` is vectorised and returns a frame of the same shape, whereas
# `Series.apply([func])` (note the list) returns a one-column DataFrame that
# only happens to be assignable back to a single column.
# "int64" is spelled out so the dtype is the same on every platform.
bool_feature_cols = ["CryoSleep", "VIP"]
bool_train_cols = bool_feature_cols + ["Transported"]  # the target exists only in train

n_train_df[bool_train_cols] = n_train_df[bool_train_cols].astype("int64")
n_test_df[bool_feature_cols] = n_test_df[bool_feature_cols].astype("int64")

In [15]:
# Category frequencies of HomePlanet, inspected before one-hot encoding.
n_train_df["HomePlanet"].value_counts()

Earth     4697
Europa    2184
Mars      1812
Name: HomePlanet, dtype: int64

In [16]:
# Category frequencies of Destination, inspected before one-hot encoding.
n_train_df["Destination"].value_counts()

TRAPPIST-1e      6041
55 Cancri e      1836
PSO J318.5-22     816
Name: Destination, dtype: int64

In [17]:
# One-hot encode the two categorical columns, dropping the first level of each
# to avoid a redundant indicator.
categorical_cols = ['HomePlanet', 'Destination']
n_train_df = pd.get_dummies(n_train_df, columns=categorical_cols, drop_first=True)
n_test_df = pd.get_dummies(n_test_df, columns=categorical_cols, drop_first=True)

# The two frames are encoded independently, so a category missing from (or only
# present in) the test data would give it a different column set than the model
# was trained on. Align test to the training feature columns: absent indicators
# become 0, unseen ones are discarded, and the column order matches train.
feature_cols = n_train_df.columns.drop("Transported")
n_test_df = n_test_df.reindex(columns=feature_cols, fill_value=0)

In [18]:
# Separate the label from the feature matrix.
target_col = "Transported"
y = n_train_df[target_col]
X = n_train_df.drop(target_col, axis="columns")

## Train/test split

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Build models

### 1. Random Forest Classifier

In [55]:
from sklearn.ensemble import RandomForestClassifier

# 300 trees with depth capped at 12; the seed is fixed so refits are repeatable.
rf_params = {"n_estimators": 300, "random_state": 0, "max_depth": 12}
model = RandomForestClassifier(**rf_params)

In [56]:
# Fit the random forest on the training split.
model.fit(X_train, y_train)

In [57]:
# Spot-check one training row. `.iloc[[100]]` (double brackets) keeps the row as a
# one-row DataFrame, so the column names the model was fitted with are preserved;
# wrapping a Series in a list would strip them.
model.predict(X_train.iloc[[100]])



array([0], dtype=int64)

In [58]:
# True label of the same row, for comparison with the prediction above.
y_train.iloc[100]

0

In [59]:
# Score the random forest on the 20% hold-out split.
model.score(X_test, y_test)

0.7814836112708453

### 2. CatBoost

In [60]:
# One-time environment setup (kept commented out; run manually if needed):
# !pip install catboost
# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension

In [68]:
from catboost import CatBoostClassifier

# Small, deep ensemble: 50 boosting rounds, depth 12, learning rate 0.15, fixed seed.
catboost_params = {
    "iterations": 50,
    "max_depth": 12,
    "learning_rate": 0.15,
    "random_state": 0,
}
model_catB = CatBoostClassifier(**catboost_params)

In [69]:
# Fit CatBoost, reporting the loss on the hold-out split after every iteration.
# NOTE(review): the hold-out split is passed as eval_set and is also the set scored
# afterwards; if CatBoost uses eval_set to pick the best iteration, that score is
# optimistic. Consider a separate validation split (confirm against the CatBoost docs).
model_catB.fit(X_train, y_train, eval_set=(X_test, y_test))

0:	learn: 0.6463133	test: 0.6496987	best: 0.6496987 (0)	total: 114ms	remaining: 5.58s
1:	learn: 0.6067186	test: 0.6125711	best: 0.6125711 (1)	total: 217ms	remaining: 5.21s
2:	learn: 0.5765410	test: 0.5854433	best: 0.5854433 (2)	total: 322ms	remaining: 5.05s
3:	learn: 0.5518140	test: 0.5627068	best: 0.5627068 (3)	total: 431ms	remaining: 4.96s
4:	learn: 0.5335571	test: 0.5462314	best: 0.5462314 (4)	total: 535ms	remaining: 4.81s
5:	learn: 0.5168487	test: 0.5318060	best: 0.5318060 (5)	total: 641ms	remaining: 4.7s
6:	learn: 0.5047420	test: 0.5210655	best: 0.5210655 (6)	total: 745ms	remaining: 4.58s
7:	learn: 0.4918381	test: 0.5097866	best: 0.5097866 (7)	total: 850ms	remaining: 4.46s
8:	learn: 0.4835888	test: 0.5030682	best: 0.5030682 (8)	total: 964ms	remaining: 4.39s
9:	learn: 0.4752062	test: 0.4954692	best: 0.4954692 (9)	total: 1.07s	remaining: 4.28s
10:	learn: 0.4678804	test: 0.4893346	best: 0.4893346 (10)	total: 1.18s	remaining: 4.18s
11:	learn: 0.4613525	test: 0.4842806	best: 0.4842806 

<catboost.core.CatBoostClassifier at 0x216537cb0a0>

In [70]:
# Score CatBoost on the same hold-out split used for the random forest.
model_catB.score(X_test, y_test)

0.7814836112708453

## Predict on the test data

In [None]:
# Predict the competition test set with `model` (the random forest), not `model_catB`.
test_df_pred = model.predict(n_test_df)

In [None]:
# Build the two submission columns in one vectorised pass each.
# The previous per-row `.iloc[i][...]` loop materialised a full row Series for every
# passenger, and its two independent `if`s would silently skip any prediction that was
# neither 0 nor 1, leaving `trans_arr` shorter than `pId`.
# Here every prediction yields exactly one label: 'True' for 1, 'False' otherwise.
pId = test_df['PassengerId'].iloc[:len(test_df_pred)].tolist()
trans_arr = np.where(np.asarray(test_df_pred) == 1, 'True', 'False').tolist()

In [None]:
# Sanity check: both lists should have one entry per test passenger.
len(pId), len(trans_arr)

In [None]:
# Row count of the test frame, to compare with the list lengths above.
test_df.shape

In [None]:
# Assemble the two-column submission table (id + predicted label).
submission_columns = {'PassengerId': pId, 'Transported': trans_arr}
sample_sub_df = pd.DataFrame(submission_columns)

In [None]:
# Display the submission table for a final visual check.
sample_sub_df

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
# NOTE(review): this overwrites any existing "sample_submission.csv" in the working directory.
sample_sub_df.to_csv("sample_submission.csv", index=False)