In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [3]:
# Read the engineered, NA-free dataset from disk and preview the first rows.
data = pd.read_csv(filepath_or_buffer="../data/data_without_na_engineered.csv")
data.head(n=5)

Unnamed: 0,group,number_in_group,home_planet,cryo_sleep,deck,num_in_cabin,side,destination,age,vip,room_service,food_court,shopping_mall,spa,vr_deck,total_spending,transported
0,1,1,Europa,False,B,0,P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2,1,Earth,False,F,0,S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,736.0,1
2,3,1,Europa,False,A,0,S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,10383.0,0
3,3,2,Europa,False,A,0,S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,5176.0,0
4,4,1,Earth,False,F,1,S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,1091.0,1


In [4]:
# Separate the predictors from the target column "transported".
X = data.drop(columns=["transported"])
y = data["transported"]

In [5]:
# Encode the two flag columns as 0/1 integers.
# astype(bool) reproduces the truthiness test of the former per-row lambda
# (`1 if x else 0`), and the int64 cast turns True/False into 1/0, all without
# a Python-level loop. Bracket assignment is used so the write always targets
# the column rather than a DataFrame attribute.
X["cryo_sleep"] = X["cryo_sleep"].astype(bool).astype("int64")
X["vip"] = X["vip"].astype(bool).astype("int64")

In [6]:
# Preview the predictors after the flag columns were converted to 0/1.
X.head(n=5)

Unnamed: 0,group,number_in_group,home_planet,cryo_sleep,deck,num_in_cabin,side,destination,age,vip,room_service,food_court,shopping_mall,spa,vr_deck,total_spending
0,1,1,Europa,0,B,0,P,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,Earth,0,F,0,S,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,736.0
2,3,1,Europa,0,A,0,S,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,10383.0
3,3,2,Europa,0,A,0,S,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,5176.0
4,4,1,Earth,0,F,1,S,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,1091.0


In [7]:
# Columns treated as numeric inputs (includes the 0/1 flags cryo_sleep and vip).
num_attributes = [
    "group",
    "number_in_group",
    "cryo_sleep",
    "num_in_cabin",
    "age",
    "vip",
    "room_service",
    "food_court",
    "shopping_mall",
    "spa",
    "vr_deck",
    "total_spending",
]

# Columns treated as categorical inputs; these are one-hot encoded later.
cat_attributes = [
    "home_planet",
    "deck",
    "side",
    "destination",
]

In [8]:
# Numeric part of the feature matrix: every row, only the numeric columns.
num_data = X.loc[:, num_attributes]
num_data.shape

(6606, 12)

In [9]:
# Categorical part of the feature matrix: every row, only the categorical columns.
cat_data = X.loc[:, cat_attributes]
cat_data.shape

(6606, 4)

In [10]:
# One-hot encode the categorical columns; each category value becomes its own
# indicator column named "<column>_<value>".
cat_encoded = pd.get_dummies(data=cat_data)
cat_encoded.shape

(6606, 16)

In [11]:
# Preview the indicator columns produced by the one-hot encoding.
cat_encoded.head(n=5)

Unnamed: 0,home_planet_Earth,home_planet_Europa,home_planet_Mars,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_T,side_P,side_S,destination_55 Cancri e,destination_PSO J318.5-22,destination_TRAPPIST-1e
0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1
1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1
2,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1
3,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1
4,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1


In [12]:
# Build the full feature matrix by placing the numeric columns and the one-hot
# indicator columns side by side (rows are aligned on the shared index).
total_data = pd.concat(objs=[num_data, cat_encoded], axis="columns")
total_data.shape

(6606, 28)

In [22]:
# Preview the combined numeric + one-hot feature matrix.
total_data.head(n=5)

Unnamed: 0,group,number_in_group,cryo_sleep,num_in_cabin,age,vip,room_service,food_court,shopping_mall,spa,...,deck_D,deck_E,deck_F,deck_G,deck_T,side_P,side_S,destination_55 Cancri e,destination_PSO J318.5-22,destination_TRAPPIST-1e
0,1,1,0,0,39.0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,1
1,2,1,0,0,24.0,0,109.0,9.0,25.0,549.0,...,0,0,1,0,0,0,1,0,0,1
2,3,1,0,0,58.0,1,43.0,3576.0,0.0,6715.0,...,0,0,0,0,0,0,1,0,0,1
3,3,2,0,0,33.0,0,0.0,1283.0,371.0,3329.0,...,0,0,0,0,0,0,1,0,0,1
4,4,1,0,1,16.0,0,303.0,70.0,151.0,565.0,...,0,0,1,0,0,0,1,0,0,1


In [60]:
# Keep only the listed feature columns, in this order.
# NOTE(review): the outputs saved further down (a 28-column shape and a
# `deck_B` entry in the importances) do not match this 23-column selection,
# so they were presumably produced before this cell ran -- confirm with
# Restart Kernel -> Run All.
total_data = total_data.loc[
    :,
    [
        'total_spending',
        'num_in_cabin',
        'group',
        'age',
        'spa',
        'vr_deck',
        'room_service',
        'food_court',
        'shopping_mall',
        'cryo_sleep',
        'number_in_group',
        'home_planet_Earth',
        'home_planet_Europa',
        'deck_E',
        'deck_G',
        'deck_F',
        'side_P',
        'side_S',
        'destination_TRAPPIST-1e',
        'destination_55 Cancri e',
        'home_planet_Mars',
        'destination_PSO J318.5-22',
        'deck_C',
    ],
]

In [90]:
# Show the (rows, columns) dimensions of the current feature matrix.
total_data.shape

(6606, 28)

<br>
<br>
<br>

### Feature selection

In [82]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [73]:
# Shallow random forest (trees limited to depth 3) used to rank the features.
# random_state pins the bootstrap sampling and per-split feature sub-sampling,
# so the accuracy, cross-validation scores and feature importances computed
# from this estimator are identical on every re-run of the notebook.
forest = RandomForestClassifier(max_depth=3, random_state=42)

In [74]:
# Train the forest on the complete feature matrix and target.
forest.fit(X=total_data, y=y)

In [75]:
# Predict on the same rows the forest was trained on: the value below is
# in-sample (training) accuracy and therefore an optimistic estimate.
predictions = forest.predict(total_data)
# accuracy_score expects (y_true, y_pred); the ground truth goes first.
accuracy_score(y, predictions)

0.7356948228882834

In [76]:
# 5-fold cross-validated accuracy; report the worst fold as a conservative figure.
scores = cross_val_score(
    estimator=forest,
    X=total_data,
    y=y,
    cv=5,
    scoring="accuracy",
)
scores.min()

0.7168811506434519

In [77]:
# Pair each importance value from the fitted forest with its feature name.
importances = pd.Series(
    data=forest.feature_importances_,
    index=forest.feature_names_in_,
)

In [80]:
# Twenty most important features, highest importance first.
importances.sort_values(ascending=False).head(20)

total_spending             0.209610
cryo_sleep                 0.197504
spa                        0.128370
vr_deck                    0.115593
room_service               0.098590
shopping_mall              0.060195
food_court                 0.051035
home_planet_Europa         0.038609
home_planet_Earth          0.033251
age                        0.012461
deck_G                     0.010528
destination_55 Cancri e    0.007339
deck_E                     0.006021
deck_B                     0.005250
deck_F                     0.005091
num_in_cabin               0.003768
home_planet_Mars           0.003528
side_S                     0.003363
deck_C                     0.002992
side_P                     0.002325
dtype: float64