In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set(color_codes=True)

<br>
<br>
<br>

#### Data collection

In [2]:
# reading data
data = pd.read_csv("../data/train.csv")

# changing column names
new_columns = ["passenger_id", "home_planet", "cryo_sleep", "cabin", "destination", "age", "vip", "room_service", "food_court", "shopping_mall", "spa", "vr_deck", "name", "transported"]
data.columns = new_columns

# seperating X and y
X = data.drop(["transported"], axis=1)
y = data.transported

In [3]:
X.head()

Unnamed: 0,passenger_id,home_planet,cryo_sleep,cabin,destination,age,vip,room_service,food_court,shopping_mall,spa,vr_deck,name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines


In [4]:
y.head()

0    False
1     True
2    False
3    False
4     True
Name: transported, dtype: bool

<br>
<br>
<br>

#### Data inspection

In [5]:
X.shape

(8693, 13)

In [6]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   passenger_id   8693 non-null   object 
 1   home_planet    8492 non-null   object 
 2   cryo_sleep     8476 non-null   object 
 3   cabin          8494 non-null   object 
 4   destination    8511 non-null   object 
 5   age            8514 non-null   float64
 6   vip            8490 non-null   object 
 7   room_service   8512 non-null   float64
 8   food_court     8510 non-null   float64
 9   shopping_mall  8485 non-null   float64
 10  spa            8510 non-null   float64
 11  vr_deck        8505 non-null   float64
 12  name           8493 non-null   object 
dtypes: float64(6), object(7)
memory usage: 883.0+ KB


In [7]:
y.value_counts()

True     4378
False    4315
Name: transported, dtype: int64

<br>
<br>
<br>

#### Data preparation

In [8]:
# passenger_id
passenger_id = X.passenger_id

# seperating group and number in group
group: pd.Series = passenger_id.apply(lambda x: int(x.split("_")[0]))
number_in_group: pd.Series = passenger_id.apply(lambda x: int(x.split("_")[1]))

In [9]:
# home_planet
home_planet = X.home_planet

# filling 201 null values with mode
home_planet.fillna(value = home_planet.mode()[0], inplace=True)

In [10]:
# cryo_sleep
cryo_sleep = X.cryo_sleep

# filling 217 null values with mode
cryo_sleep.fillna(value = cryo_sleep.mode()[0], inplace=True)

In [11]:
# cabin
cabin = X.cabin

# seperating deck, num and side
deck = cabin.apply(lambda x: x.split("/")[0] if pd.notna(x) else x)
num_in_cabin = cabin.apply(lambda x: x.split("/")[1] if pd.notna(x) else x)
side = cabin.apply(lambda x: x.split("/")[2] if pd.notna(x) else x)

# filling 199 null values
deck.fillna(value = deck.mode()[0], inplace=True)
num_in_cabin.fillna(value = 0, inplace=True)
side.fillna(value = side.mode()[0], inplace=True)

In [12]:
# destination
destination = X.destination

# filling 182 null values with mode
destination.fillna(value = destination.mode()[0], inplace=True)

In [13]:
# vip
vip = X.vip

# filling 203 null values with mode
vip.fillna(value = vip.mode()[0], inplace=True)

In [14]:
# age
age = X.age

# filling 179 null values with median
age.fillna(value = age.median(), inplace=True)

In [15]:
# spendings
room_service = X.room_service
food_court = X.food_court
shopping_mall = X.shopping_mall
spa = X.spa
vr_deck = X.vr_deck

# filling null values with 0.0
room_service.fillna(value=0.0, inplace=True)
food_court.fillna(value=0.0, inplace=True)
shopping_mall.fillna(value=0.0, inplace=True)
spa.fillna(value=0.0, inplace=True)
vr_deck.fillna(value=0.0, inplace=True)

In [16]:
# total_spending
total_spending = room_service + food_court + shopping_mall + spa + vr_deck

In [None]:
# ignoring name column for now

In [17]:
X_prepared = pd.DataFrame()

X_prepared["group"] = group
X_prepared["number_in_group"] = number_in_group
X_prepared["home_planet"] = home_planet
X_prepared["cryo_sleep"] = cryo_sleep
X_prepared["deck"] = deck
X_prepared["num_in_cabin"] = num_in_cabin
X_prepared["side"] = side
X_prepared["destination"] = destination
X_prepared["age"] = age
X_prepared["vip"] = vip
X_prepared["room_service"] = room_service
X_prepared["food_court"] = food_court
X_prepared["shopping_mall"] = shopping_mall
X_prepared["spa"] = spa
X_prepared["vr_deck"] = vr_deck
X_prepared["total_spending"] = total_spending

In [18]:
X_prepared.head()

Unnamed: 0,group,number_in_group,home_planet,cryo_sleep,deck,num_in_cabin,side,destination,age,vip,room_service,food_court,shopping_mall,spa,vr_deck,total_spending
0,1,1,Europa,False,B,0,P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,Earth,False,F,0,S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,736.0
2,3,1,Europa,False,A,0,S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,10383.0
3,3,2,Europa,False,A,0,S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,5176.0
4,4,1,Earth,False,F,1,S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,1091.0


In [19]:
# shape of new data
X_prepared.shape

(8693, 16)

In [20]:
# checking if any null value is present
X_prepared.isna().any().value_counts()

False    16
dtype: int64

In [21]:
X_prepared.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   group            8693 non-null   int64  
 1   number_in_group  8693 non-null   int64  
 2   home_planet      8693 non-null   object 
 3   cryo_sleep       8693 non-null   bool   
 4   deck             8693 non-null   object 
 5   num_in_cabin     8693 non-null   object 
 6   side             8693 non-null   object 
 7   destination      8693 non-null   object 
 8   age              8693 non-null   float64
 9   vip              8693 non-null   bool   
 10  room_service     8693 non-null   float64
 11  food_court       8693 non-null   float64
 12  shopping_mall    8693 non-null   float64
 13  spa              8693 non-null   float64
 14  vr_deck          8693 non-null   float64
 15  total_spending   8693 non-null   float64
dtypes: bool(2), float64(7), int64(2), object(5)
memory usage: 96

<br>
<br>
<br>

#### Data preprocessing

In [110]:
X_preprocessed = X_prepared.copy()

In [111]:
# nominal encoding of categorical attributes
cat_attributes = ["home_planet", "deck", "side", "destination"]

cat_data = X_preprocessed[cat_attributes].copy()
X_preprocessed.drop(cat_attributes, axis=1, inplace=True)

dummies = pd.get_dummies(cat_data)
X_preprocessed = pd.concat([X_preprocessed, dummies], axis=1)

In [112]:
# converting bool to int
X_preprocessed.cryo_sleep = X_preprocessed.cryo_sleep.apply(lambda x: 1 if x else 0)
X_preprocessed.vip = X_preprocessed.vip.apply(lambda x: 1 if x else 0)

# changing dtype to int
X_preprocessed.num_in_cabin = X_preprocessed.num_in_cabin.astype("int")

In [113]:
# shape of preprocessed data
X_preprocessed.shape

(8693, 28)

In [114]:
# converting bool to int 
y_preprocessed = y.apply(lambda x: 1 if x else 0)

In [115]:
# shape of preprocessed target
y_preprocessed.shape

(8693,)

scaling

In [70]:
from sklearn.preprocessing import StandardScaler

In [71]:
features_to_scale = ["room_service", "food_court", "shopping_mall", "spa", "vr_deck", "total_spending"]

# data_to_scale = X_preprocessed[features_to_scale]

In [116]:
scaler = StandardScaler()
data_scaled = scaler.fit_transform(X_preprocessed)
data_scaled = pd.DataFrame(data_scaled, columns=X_preprocessed.columns)

In [76]:
# for feature in features_to_scale:
#     X_preprocessed[feature] = data_scaled[feature]

<br>
<br>

In [117]:
X_modelling = data_scaled.copy()

In [104]:
# X_modelling.drop(["total_spending"], axis=1, inplace=True)

In [118]:
X_modelling.shape

(8693, 28)

In [119]:
X_modelling.head()

Unnamed: 0,group,number_in_group,cryo_sleep,num_in_cabin,age,vip,room_service,food_court,shopping_mall,spa,...,deck_D,deck_E,deck_F,deck_G,deck_T,side_P,side_S,destination_55 Cancri e,destination_PSO J318.5-22,destination_TRAPPIST-1e
0,-1.734409,-0.491161,-0.73277,-1.141624,0.711945,-0.153063,-0.333105,-0.281027,-0.283579,-0.270626,...,-0.241218,-0.334759,-0.724629,-0.645897,-0.02399,1.032865,-1.032865,-0.511013,-0.317487,0.652521
1,-1.734034,-0.491161,-0.73277,-1.141624,-0.334037,-0.153063,-0.168073,-0.275387,-0.241771,0.217158,...,-0.241218,-0.334759,1.380016,-0.645897,-0.02399,-0.968181,0.968181,-0.511013,-0.317487,0.652521
2,-1.73366,-0.491161,-0.73277,-1.141624,2.036857,6.533255,-0.268001,1.959998,-0.283579,5.695623,...,-0.241218,-0.334759,-0.724629,-0.645897,-0.02399,-0.968181,0.968181,-0.511013,-0.317487,0.652521
3,-1.73366,0.457443,-0.73277,-1.141624,0.293552,-0.153063,-0.333105,0.52301,0.336851,2.687176,...,-0.241218,-0.334759,-0.724629,-0.645897,-0.02399,-0.968181,0.968181,-0.511013,-0.317487,0.652521
4,-1.733286,-0.491161,-0.73277,-1.139678,-0.891895,-0.153063,0.125652,-0.237159,-0.031059,0.231374,...,-0.241218,-0.334759,1.380016,-0.645897,-0.02399,-0.968181,0.968181,-0.511013,-0.317487,0.652521


<br>
<br>
<br>

### Modelling

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [124]:
forest = RandomForestClassifier(n_estimators=200)

In [125]:
forest.fit(X_modelling, y_preprocessed)

predictions = forest.predict(X_modelling)
accuracy_score(y_preprocessed, predictions)

1.0

In [126]:
scores = cross_val_score(forest, X_modelling, y_preprocessed, scoring="accuracy", cv=5)
scores.mean()

0.7714292237050115

In [127]:
importance = pd.Series(forest.feature_importances_, forest.feature_names_in_)
importance.sort_values(ascending=False)

total_spending               0.118865
num_in_cabin                 0.106247
group                        0.106242
age                          0.090418
spa                          0.084475
vr_deck                      0.076532
room_service                 0.071379
food_court                   0.067111
cryo_sleep                   0.062113
shopping_mall                0.055699
number_in_group              0.025427
home_planet_Earth            0.018014
home_planet_Europa           0.013792
deck_G                       0.012670
deck_E                       0.012275
deck_F                       0.011950
side_P                       0.011212
side_S                       0.010340
destination_TRAPPIST-1e      0.008972
destination_55 Cancri e      0.007749
home_planet_Mars             0.007739
destination_PSO J318.5-22    0.005650
deck_C                       0.005316
deck_B                       0.004022
deck_D                       0.002537
vip                          0.001662
deck_A      