## 1. Libraries that have been used

In [110]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [81]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
test_id = test["PassengerId"]

## 2. Exploring the data

In [82]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [83]:
test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [84]:
train.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [85]:
test.isna().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

In [86]:
len(train), len(test)

(8693, 4277)

In [87]:
train["HomePlanet"].value_counts()

Earth     4602
Europa    2131
Mars      1759
Name: HomePlanet, dtype: int64

In [88]:
test["HomePlanet"].value_counts()

Earth     2263
Europa    1002
Mars       925
Name: HomePlanet, dtype: int64

In [89]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [90]:
pd.crosstab(train["Age"], train["Transported"]).tail(20)

Transported,False,True
Age,Unnamed: 1_level_1,Unnamed: 2_level_1
60.0,19,15
61.0,14,18
62.0,13,13
63.0,16,10
64.0,13,16
65.0,16,7
66.0,3,4
67.0,6,7
68.0,4,8
69.0,5,1


In [91]:
pd.crosstab(train["VIP"], train["Transported"])

Transported,False,True
VIP,Unnamed: 1_level_1,Unnamed: 2_level_1
False,4093,4198
True,123,76


In [92]:
pd.crosstab(train["Cabin"], train["Transported"])

Transported,False,True
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A/0/P,1,1
A/0/S,2,0
A/1/S,0,3
A/10/P,1,0
A/10/S,0,1
...,...,...
T/0/P,1,0
T/1/P,1,0
T/2/P,1,0
T/2/S,1,0


In [93]:
pd.crosstab(train["CryoSleep"], train["Transported"])

Transported,False,True
CryoSleep,Unnamed: 1_level_1,Unnamed: 2_level_1
False,3650,1789
True,554,2483


In [94]:
pd.crosstab(train["Destination"], train["Transported"])

Transported,False,True
Destination,Unnamed: 1_level_1,Unnamed: 2_level_1
55 Cancri e,702,1098
PSO J318.5-22,395,401
TRAPPIST-1e,3128,2787


In [95]:
pd.crosstab(train["HomePlanet"], train["Transported"])

Transported,False,True
HomePlanet,Unnamed: 1_level_1,Unnamed: 2_level_1
Earth,2651,1951
Europa,727,1404
Mars,839,920


## 3. Cleaning the data and preprocessing

In [96]:
def clean(data):
    data.drop(["PassengerId", "Name", "Cabin"], axis=1, inplace=True)
    
clean(test)
clean(train)

In [97]:
train.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [98]:
test.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0
1,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0
2,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0
3,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0
4,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0


In [99]:
X = train.drop("Transported", axis=1)
y = train["Transported"]

In [100]:
X.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0


In [101]:
X["VIP"] = X["VIP"].fillna("missing").astype(str)
X["VIP"] = X["CryoSleep"].fillna("missing").astype(str)

In [102]:
num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")

num_features = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
cat_features = ["HomePlanet", "CryoSleep","Destination", "VIP"]

imputer = ColumnTransformer([
    ("num_imputer", num_imputer, num_features),
    ("cat_imputer", cat_imputer, cat_features)
])

filled_X = imputer.fit_transform(X)
filled_test = imputer.fit_transform(test)

In [103]:
filled_X = pd.DataFrame(filled_X, columns=["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck",
                                          "HomePlanet", "CryoSleep", "Destination", "VIP"])
filled_X

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet,CryoSleep,Destination,VIP
0,39.0,0.0,0.0,0.0,0.0,0.0,Europa,False,TRAPPIST-1e,False
1,24.0,109.0,9.0,25.0,549.0,44.0,Earth,False,TRAPPIST-1e,False
2,58.0,43.0,3576.0,0.0,6715.0,49.0,Europa,False,TRAPPIST-1e,False
3,33.0,0.0,1283.0,371.0,3329.0,193.0,Europa,False,TRAPPIST-1e,False
4,16.0,303.0,70.0,151.0,565.0,2.0,Earth,False,TRAPPIST-1e,False
...,...,...,...,...,...,...,...,...,...,...
8688,41.0,0.0,6819.0,0.0,1643.0,74.0,Europa,False,55 Cancri e,False
8689,18.0,0.0,0.0,0.0,0.0,0.0,Earth,True,PSO J318.5-22,True
8690,26.0,0.0,0.0,1872.0,1.0,0.0,Earth,False,TRAPPIST-1e,False
8691,32.0,0.0,1049.0,0.0,353.0,3235.0,Europa,False,55 Cancri e,False


In [104]:
filled_test = pd.DataFrame(filled_test, columns=["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck",
                                          "HomePlanet", "CryoSleep", "Destination", "VIP"])
filled_test

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet,CryoSleep,Destination,VIP
0,27.0,0.0,0.0,0.0,0.0,0.0,Earth,True,TRAPPIST-1e,False
1,19.0,0.0,9.0,0.0,2823.0,0.0,Earth,False,TRAPPIST-1e,False
2,31.0,0.0,0.0,0.0,0.0,0.0,Europa,True,55 Cancri e,False
3,38.0,0.0,6652.0,0.0,181.0,585.0,Europa,False,TRAPPIST-1e,False
4,20.0,10.0,0.0,635.0,0.0,0.0,Earth,False,TRAPPIST-1e,False
...,...,...,...,...,...,...,...,...,...,...
4272,34.0,0.0,0.0,0.0,0.0,0.0,Earth,True,TRAPPIST-1e,False
4273,42.0,0.0,847.0,17.0,10.0,144.0,Earth,False,TRAPPIST-1e,False
4274,28.658146,0.0,0.0,0.0,0.0,0.0,Mars,True,55 Cancri e,False
4275,28.658146,0.0,2680.0,0.0,0.0,523.0,Europa,False,missing,False


In [105]:
filled_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Age           8693 non-null   object
 1   RoomService   8693 non-null   object
 2   FoodCourt     8693 non-null   object
 3   ShoppingMall  8693 non-null   object
 4   Spa           8693 non-null   object
 5   VRDeck        8693 non-null   object
 6   HomePlanet    8693 non-null   object
 7   CryoSleep     8693 non-null   object
 8   Destination   8693 non-null   object
 9   VIP           8693 non-null   object
dtypes: object(10)
memory usage: 679.3+ KB


In [117]:
filled_X["VIP"] = filled_X["VIP"].astype(str)
filled_X["CryoSleep"] = filled_X["CryoSleep"].astype(str)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Age           4277 non-null   object
 1   RoomService   4277 non-null   object
 2   FoodCourt     4277 non-null   object
 3   ShoppingMall  4277 non-null   object
 4   Spa           4277 non-null   object
 5   VRDeck        4277 non-null   object
 6   HomePlanet    4277 non-null   object
 7   CryoSleep     4277 non-null   object
 8   Destination   4277 non-null   object
 9   VIP           4277 non-null   object
dtypes: object(10)
memory usage: 334.3+ KB


In [118]:
filled_test["VIP"] = filled_test["VIP"].astype(str)
filled_test["CryoSleep"] = filled_test["CryoSleep"].astype(str)

In [109]:
categorical_features = ["HomePlanet", "CryoSleep", "Destination", "VIP"]

one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")
transformed_X = transformer.fit_transform(filled_X)
transformed_X

array([[0.0, 1.0, 0.0, ..., 0.0, 0.0, 0.0],
       [1.0, 0.0, 0.0, ..., 25.0, 549.0, 44.0],
       [0.0, 1.0, 0.0, ..., 0.0, 6715.0, 49.0],
       ...,
       [1.0, 0.0, 0.0, ..., 1872.0, 1.0, 0.0],
       [0.0, 1.0, 0.0, ..., 0.0, 353.0, 3235.0],
       [0.0, 1.0, 0.0, ..., 0.0, 0.0, 12.0]], dtype=object)

In [119]:
transformed_test = transformer.fit_transform(filled_test)
transformed_test

array([[1.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0],
       [1.0, 0.0, 0.0, ..., 0.0, 2823.0, 0.0],
       [0.0, 1.0, 0.0, ..., 0.0, 0.0, 0.0],
       ...,
       [0.0, 0.0, 1.0, ..., 0.0, 0.0, 0.0],
       [0.0, 1.0, 0.0, ..., 0.0, 0.0, 523.0],
       [1.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0]], dtype=object)

In [111]:
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

In [113]:
clf = SVC()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.7648073605520413

In [120]:
predict = clf.predict(transformed_test)
predict

array([ True, False,  True, ...,  True,  True,  True])

In [122]:
submission = pd.DataFrame({"PassengerId" : test_id.values, "Transported" : predict})
submission.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [123]:
submission.to_csv("submission.csv", index=False)