[kaggle](https://www.kaggle.com/code/fiftythirtyfour/spaceship-titanic)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')

In [3]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
df.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [5]:
df.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [6]:
target = 'Transported'
num = [x for x in df.columns if df.dtypes[x] in ['float64']]
cat = [x for x in df.columns if df.dtypes[x] == 'object']
cat.remove('PassengerId')

In [7]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=.2, random_state=0)

# Explore

In [8]:
train[target].mean()

0.5035950532067874

In [9]:
num

['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [10]:
cat

['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name']

In [11]:
train.groupby(target)[num].mean()

Unnamed: 0_level_0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
Transported,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
False,30.041161,404.154508,373.514471,168.143915,558.61933,537.887445
True,27.723113,64.533664,534.237135,191.589721,63.032371,69.289512


In [12]:
train.groupby('HomePlanet')[target].mean()

HomePlanet
Earth     0.427280
Europa    0.660422
Mars      0.512856
Name: Transported, dtype: float64

In [13]:
train.groupby('CryoSleep')[target].agg(['mean', 'sum', 'count'])

Unnamed: 0_level_0,mean,sum,count
CryoSleep,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,0.328516,1432,4359
True,0.819381,1987,2425


In [14]:
train.isnull().mean()

PassengerId     0.000000
HomePlanet      0.022433
CryoSleep       0.024446
Cabin           0.021714
Destination     0.021427
Age             0.020995
VIP             0.025309
RoomService     0.021714
FoodCourt       0.021283
ShoppingMall    0.024734
Spa             0.021858
VRDeck          0.020995
Name            0.022433
Transported     0.000000
dtype: float64

In [15]:
train.groupby('Destination')[target].agg(['mean', 'sum', 'count'])

Unnamed: 0_level_0,mean,sum,count
Destination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
55 Cancri e,0.614525,880,1432
PSO J318.5-22,0.514469,320,622
TRAPPIST-1e,0.468954,2228,4751


# Model

In [16]:
from sklearn.base import BaseEstimator, TransformerMixin

In [17]:
class DataSelect(BaseEstimator, TransformerMixin):
    def __init__(self, attributes):
        self.attributes = attributes
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X[self.attributes]

In [18]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [19]:
cat_pp = Pipeline([
    ('dataselect', DataSelect(['CryoSleep']))
#     , ('impute', SimpleImputer(strategy='constant', fill_value='empty'))
    , ('encoder', OneHotEncoder(sparse_output=False))
])

num_pp = Pipeline([
    ('dataselect', DataSelect(num))
    , ('impute', SimpleImputer(strategy='mean'))
    , ('scaler', StandardScaler())
])

pipe = FeatureUnion([
    ('cat', cat_pp)
    , ('num', num_pp)
])

pipe.fit(train)
train_pp = pipe.transform(train)
test_pp = pipe.transform(test)

# Models

In [20]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score

In [21]:
gnb = GaussianNB()
gnb.fit(train_pp, train[target])
cross_val_score(gnb, train_pp, train[target]).mean()

0.7329603980367108

In [22]:
svc = SVC()
svc.fit(train_pp, train[target])
cross_val_score(svc, train_pp, train[target]).mean()

0.7909110468634439

In [23]:
dcs = DecisionTreeClassifier()
dcs.fit(train_pp, train[target])
cross_val_score(dcs, train_pp, train[target]).mean()

0.737127681032744

In [24]:
rfc = RandomForestClassifier()
rfc.fit(train_pp, train[target])
cross_val_score(rfc, train_pp, train[target]).mean()

0.7865970860981955

In [25]:
model_base = svc

# Fine Tune

In [26]:
from sklearn.model_selection import GridSearchCV

In [27]:
# params = {
#     "kernel": ["poly"]
#     , "degree": [1, 10]
#     , "coef0": [1, 50]
#     , "C": [5]
# }

# grd = GridSearchCV(model_base, param_grid=params)
# grd.fit(train_pp, train[target])
# grd.best_params_

In [28]:
model_tune = SVC(kernel='poly', degree=3, coef0=1, C=5)
model_tune.fit(train_pp, train[target]);

# Validation

In [29]:
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

In [30]:
accuracy_score(train[target], cross_val_predict(model_base, train_pp, train[target]))

0.7909117054932413

In [31]:
precision_score(train[target], cross_val_predict(model_base, train_pp, train[target]))

0.7907438955139126

In [32]:
recall_score(train[target], cross_val_predict(model_base, train_pp, train[target]))

0.7952598515134209

In [33]:
accuracy_score(test[target], cross_val_predict(model_base, test_pp, test[target]))

0.7745830937320299

In [34]:
precision_score(test[target], cross_val_predict(model_base, test_pp, test[target]))

0.7775229357798165

In [35]:
recall_score(test[target], cross_val_predict(model_base, test_pp, test[target]))

0.773972602739726

In [36]:
tune_prediction_tr = cross_val_predict(model_tune, train_pp, train[target])
tune_prediction_te = cross_val_predict(model_tune, test_pp, test[target])

In [37]:
accuracy_score(train[target], tune_prediction_tr)

0.7970951970089157

In [38]:
precision_score(train[target], tune_prediction_tr)

0.7854217854217854

In [39]:
recall_score(train[target], tune_prediction_tr)

0.8215305539691605

In [40]:
accuracy_score(test[target], tune_prediction_te)

0.7740080506037953

In [41]:
precision_score(test[target], tune_prediction_te)

0.764512595837897

In [42]:
recall_score(test[target], tune_prediction_te)

0.7968036529680366

# Submission

In [43]:
samp = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
pd.DataFrame({
    "PassengerId": samp["PassengerId"]
    , "Transported": model_tune.predict(pipe.transform(samp))
}).to_csv('submission.csv', index=False)