In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.svm import LinearSVC

from sklearn.preprocessing import FunctionTransformer

# Spaceship Titanic
### pipeline

In [3]:
# Load the labelled training data; this raw frame is the source for the feature/label split below.
spaceship_data_raw = pd.read_csv("data/train.csv")

In [4]:
spaceship_data_raw

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [5]:
spaceship_data_raw.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [6]:
spaceship_data_raw.columns.tolist()

['PassengerId',
 'HomePlanet',
 'CryoSleep',
 'Cabin',
 'Destination',
 'Age',
 'VIP',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'Name',
 'Transported']

In [7]:
# Feature frame: every column of the raw data except the target `Transported`.
passenger_features = spaceship_data_raw.drop(columns="Transported")

In [8]:
# Target vector: the `Transported` column of the raw data.
passenger_labels = spaceship_data_raw.Transported

In [9]:
# Hold out a fixed 1000 rows for validation; the seed keeps the split reproducible.
(
    passenger_features_train,
    passenger_features_val,
    passenger_labels_train,
    passenger_labels_val,
) = train_test_split(
    passenger_features,
    passenger_labels,
    test_size=1000,
    random_state=42,
)

In [10]:
passenger_features_train.shape, passenger_features_val.shape, passenger_labels_train.shape, passenger_labels_val.shape

((7693, 13), (1000, 13), (7693,), (1000,))

In [45]:
def split_column(df):
    """Split the ``Cabin`` column on '/' into ``Cabin``, ``num`` and ``site``.

    The first part of each cabin string replaces ``Cabin``; the second and
    third parts are stored in the new columns ``num`` and ``site`` (appended
    after the existing columns). The parts are kept as strings; missing
    cabins stay missing in all three columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string-valued ``Cabin`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three cabin columns; ``df`` itself is not modified.
    """
    # Pad to exactly three columns so inputs whose cabins have fewer than
    # three parts yield missing values instead of a length-mismatch error.
    parts = df["Cabin"].str.split("/", expand=True).reindex(columns=range(3))
    # `assign` builds a new frame, so the caller's data is left untouched.
    return df.assign(Cabin=parts[0], num=parts[1], site=parts[2])

# Wrap the cabin splitter as a scikit-learn transformer so it can be used inside a pipeline.
split_cabin_func = FunctionTransformer(func=split_column, validate=False)

In [46]:
# Raw input columns handed to the cabin-splitting step (all columns except the target).
all_features = [
    "PassengerId",
    "HomePlanet",
    "CryoSleep",
    "Cabin",
    "Destination",
    "Age",
    "VIP",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
    "Name",
]

In [68]:
# Run the cabin splitter over every raw column and emit a pandas DataFrame,
# so the later preprocessing step can select columns by name.
split_cabin = make_column_transformer((split_cabin_func, all_features))
split_cabin.fit(passenger_features_train)
split_cabin.set_output(transform="pandas")



In [87]:
# Column names as selected from the output of the cabin-splitting step,
# which carries the "functiontransformer__" prefix on every column.
numeric_features = [
    f"functiontransformer__{column}"
    for column in (
        "Age",
        "RoomService",
        "FoodCourt",
        "ShoppingMall",
        "Spa",
        "VRDeck",
        "num",
    )
]

categorical_features = [
    f"functiontransformer__{column}"
    for column in (
        "HomePlanet",
        "CryoSleep",
        "Destination",
        "VIP",
        "Cabin",
        "site",
    )
]


In [88]:
# Numeric columns: fill missing values with the column mean, then rescale with MinMaxScaler.
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", MinMaxScaler())]
)
# Categorical columns: fill missing values with the most frequent category, then one-hot encode.
# handle_unknown="ignore" keeps transform from raising when validation, CV-fold or test data
# contains a category that was absent during fit (e.g. a rare cabin deck); such rows are
# encoded as all zeros for that feature instead.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("one_hot_encoder", OneHotEncoder(drop="first", handle_unknown="ignore")),
    ]
)

In [89]:
# Route numeric and categorical columns through their own sub-pipelines;
# output feature names are left without the per-transformer prefix.
preprocessor = ColumnTransformer(
    [
        ("numeric", numeric_transformer, numeric_features),
        ("categorical", categorical_transformer, categorical_features),
    ],
    verbose_feature_names_out=False,
)

In [90]:
#preprocessor.fit(split_cabin.transform(passenger_features_train))

In [91]:
#preprocessor.get_feature_names_out()

In [92]:
#len(preprocessor.get_feature_names_out())

In [113]:
# End-to-end model: split the cabin string, preprocess the columns, then fit a linear SVM.
pipe_lin_svm = Pipeline(
    steps=[
        ("split_cabin", split_cabin),
        ("preprocess", preprocessor),
        ("lin_svm", LinearSVC(max_iter=20_000)),
    ]
)

In [114]:
# Fit all three pipeline steps (cabin split, preprocessing, LinearSVC) on the training split.
pipe_lin_svm.fit(passenger_features_train, passenger_labels_train)



In [115]:
pipe_lin_svm.score(passenger_features_train, passenger_labels_train)

0.7888989990900819

In [116]:
pipe_lin_svm.score(passenger_features_val, passenger_labels_val)

0.76

In [97]:
# Search the LinearSVC parameter C across several orders of magnitude.
grid_search = GridSearchCV(
    estimator=pipe_lin_svm,
    param_grid={"lin_svm__C": [1e-3, 1, 1e3, 1e6]},
)

In [98]:
# Run the search over `lin_svm__C` using only the training split; the validation split stays held out.
grid_search.fit(passenger_features_train, passenger_labels_train)



In [99]:
# Pipeline selected by the grid search; used below for scoring and for the test predictions.
best_model = grid_search.best_estimator_

In [100]:
grid_search.best_params_

{'lin_svm__C': 1}

In [101]:
best_model.score(passenger_features_train, passenger_labels_train)

0.7888989990900819

In [102]:
best_model.score(passenger_features_val, passenger_labels_val)

0.76

In [103]:
# Load the test data to predict on for the submission file.
spaceship_test_raw = pd.read_csv("data/test.csv")

In [104]:
spaceship_test_raw.shape

(4277, 13)

In [105]:
# Predict on the raw test frame; the pipeline applies the cabin split and preprocessing itself.
predictions = best_model.predict(spaceship_test_raw)

In [106]:
# Pair each test passenger's id with its predicted label.
submission = pd.DataFrame(
    {
        "PassengerId": spaceship_test_raw.PassengerId,
        "Transported": predictions,
    }
)

In [107]:
# Index by PassengerId so it becomes the first column of the written CSV.
# The guard keeps this cell re-runnable: once PassengerId is already the index,
# calling set_index again would raise KeyError.
if "PassengerId" in submission.columns:
    submission = submission.set_index("PassengerId")
submission

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,True


In [108]:
# Write the submission file; the index is written as the first CSV column.
submission.to_csv("pipe_1.csv")