In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Register pandas' date/time converters with matplotlib so such values can be plotted.
pd.plotting.register_matplotlib_converters()
# Render figures directly below the cell that produces them.
%matplotlib inline

In [None]:
# Raw CSV locations of the training and test data.
train_raw = "https://raw.githubusercontent.com/devthumos/spaceship_titanic/master/train.csv"
test_raw = "https://raw.githubusercontent.com/devthumos/spaceship_titanic/master/test.csv"

# Read both files in one pass, training data first.
train_set, test_set = (pd.read_csv(csv_url) for csv_url in (train_raw, test_raw))

In [None]:
# Preview the first five rows to get a feel for the columns.
train_set.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [None]:
# List the column names available in the training data.
train_set.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [None]:
# Summarise dtypes and non-null counts to spot columns with missing values.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


#### We can see that Cabin can be split into three parts: deck, num and side

I don't think the cabin number is useful to the model, so we won't use it.

In [None]:
import re
def split_cabin(frame):
    """Return a copy of ``frame`` with ``Cabin`` replaced by ``Deck`` and ``Side``.

    ``Cabin`` is expected to look like ``deck/num/side`` (e.g. ``B/0/P``).
    The cabin number is deliberately not kept.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing a string ``Cabin`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with ``Deck`` and ``Side`` appended and ``Cabin`` removed.
        A missing cabin yields NaN in both new columns; a cabin with fewer
        than three parts yields NaN for the parts it lacks instead of leaking
        the raw cabin string into the new columns.
    """
    # Split once per row with the vectorised string accessor; NaN stays NaN.
    cabin_parts = frame["Cabin"].str.split("/")
    return (
        frame
        # `.str[i]` returns NaN when a row has no element at position i.
        .assign(Deck=cabin_parts.str[0], Side=cabin_parts.str[2])
        .drop(columns="Cabin")
    )


# Apply the exact same transformation to both data sets.
train_set = split_cabin(train_set)
test_set = split_cabin(test_set)

#### We can add another feature, "Spent": the total amount spent by each passenger

In [None]:
# The five amenity bills that make up a passenger's total spending.
spending_columns = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

# Chaining `+` would turn the total into NaN as soon as a single bill is missing.
# A row-wise sum skips missing bills instead; `min_count=1` keeps the total
# missing only when every one of the five bills is missing.
train_set["Spent"] = train_set[spending_columns].sum(axis=1, min_count=1)
test_set["Spent"] = test_set[spending_columns].sum(axis=1, min_count=1)

## Categorical Columns, Numerical Columns and Boolean Columns

In [None]:
# Bucket every training column by dtype: object, boolean, or anything else (numeric).
column_dtypes = train_set.dtypes
categorical_columns = [name for name, kind in column_dtypes.items() if kind == "object"]
numerical_columns = [name for name, kind in column_dtypes.items() if kind not in ["object", "bool"]]
# Whatever is neither categorical nor numerical is boolean.
bool_columns = list(set(train_set.columns).difference(categorical_columns, numerical_columns))

In [None]:
# Show the three column groups, one list per line.
for column_group in (categorical_columns, numerical_columns, bool_columns):
    print(column_group)

['PassengerId', 'HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Name', 'Deck', 'Side']
['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Spent']
['Transported']


## Missing Values

### Categorical Columns with Missing Values

In [None]:
# Missing values per categorical column in the training data.
train_set[categorical_columns].isna().sum()

PassengerId      0
HomePlanet     201
CryoSleep      217
Destination    182
VIP            203
Name           200
Deck           199
Side           199
dtype: int64

In [None]:
# Missing values per categorical column in the test data.
test_set[categorical_columns].isna().sum()

PassengerId      0
HomePlanet      87
CryoSleep       93
Destination     92
VIP             93
Name            94
Deck           100
Side           100
dtype: int64

### Numerical Columns with Missing Values

In [None]:
# Missing values per numerical column in the training data.
train_set[numerical_columns].isna().sum()

Age             179
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Spent           908
dtype: int64

In [None]:
# Missing values per numerical column in the test data.
test_set[numerical_columns].isna().sum()

Age              91
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Spent           455
dtype: int64

### Boolean Columns with Missing Values

In [None]:
# Missing values per boolean column in the training data.
train_set[bool_columns].isna().sum()

Transported    0
dtype: int64

## Dropping Columns with High Cardinality

In [None]:
# Treat categorical columns with more than 15 distinct values as high-cardinality.
high_columns = [name for name in categorical_columns if train_set[name].nunique() > 15]
# Remove them from the categorical list in place so later cells no longer see them.
categorical_columns[:] = [name for name in categorical_columns if name not in high_columns]
high_columns

['PassengerId', 'Name']

In [None]:
# Keep the passenger ids aside before dropping them; the test ids index the submission later.
train_set_index = train_set["PassengerId"]
test_set_index = test_set["PassengerId"]

# Remove the high-cardinality columns from both data sets.
train_set = train_set.drop(columns=high_columns)
test_set = test_set.drop(columns=high_columns)

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Features are every column except the target.
X = train_set.drop(columns="Transported")
Y = train_set["Transported"]

# Hold out 15% of the rows for validation, stratified on the target, with a fixed seed.
X_train, X_val, Y_train, Y_val = train_test_split(
    X,
    Y,
    test_size=0.15,
    stratify=Y,
    shuffle=True,
    random_state=0,
)

## Constructing Pipeline

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numeric features: fill gaps with the column mean.
numerical_transformer = SimpleImputer(strategy="mean")

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to `sparse_output`
# and 1.4 removed the old name, so `sparse=False` raises TypeError on current
# releases. Try the current keyword first and fall back for older installs.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Categorical features: fill gaps with the most frequent value, then one-hot encode.
# Categories unseen during fitting are ignored at prediction time.
categorical_transformer = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehotencoding", one_hot_encoder)
])

# Route each column group to its transformer.
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, numerical_columns),
    ("cat", categorical_transformer, categorical_columns)
])

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the validation score and the submission are reproducible
# on every run, like the seeded train/validation split.
model = RandomForestClassifier(n_estimators=100, random_state=0)

In [None]:
# Chain preprocessing and the classifier into a single estimator.
my_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])

# Fit on the training split, then predict the held-out validation rows.
predictions = my_pipeline.fit(X_train, Y_train).predict(X_val)

In [None]:
from sklearn.metrics import precision_score


# Precision on the validation split, with the boolean labels cast to integers.
y_true = Y_val.astype("int64")
y_pred = predictions.astype("int64")
score = precision_score(y_true, y_pred)
print(score)

0.8035426731078905


In [None]:
# Predict the test set and show the result indexed by passenger id.
test_predictions = my_pipeline.predict(test_set)
pd.DataFrame({"Transported": test_predictions}, index=test_set_index)

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,False
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,True


In [None]:
# Build the submission table (passenger id -> predicted label) and write it to disk.
submission = pd.DataFrame({"Transported": my_pipeline.predict(test_set)}, index=test_set_index)
submission.to_csv("submission.csv", header=True)