In [None]:
# Standard library imports
import joblib

# Third-party imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Scikit-learn imports
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error
from sklearn.model_selection import (train_test_split, StratifiedShuffleSplit,
                                     GridSearchCV, cross_val_score, RandomizedSearchCV)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (FunctionTransformer, LabelEncoder, MinMaxScaler,
                                   Normalizer, OneHotEncoder, OrdinalEncoder, PolynomialFeatures,
                                   PowerTransformer, QuantileTransformer, StandardScaler)
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Reading the data

In [None]:
from pathlib import Path

In [None]:
# Data directory, given relative to the notebook's working directory.
path = Path('..', 'Kaggle-Titanic-Spacechip-Competion')

In [None]:
# Read the train and test CSV files from the data directory `path`.
train_data_full = pd.read_csv(path / 'train.csv')
test_data = pd.read_csv(path / 'test.csv')

In [None]:
# Display the raw training frame as loaded from train.csv.
train_data_full

## Basic

In [None]:
# The passenger names are not used anywhere in this notebook, so drop them.
# `errors='ignore'` keeps the cell re-runnable once 'Name' is already gone.
test_data = test_data.drop(columns='Name', errors='ignore')
train_data_full = train_data_full.drop(columns='Name', errors='ignore')

# The first four characters of PassengerId are used as the passenger-group key;
# later preprocessing steps group rows by it.
test_data['passenger_group'] = test_data['PassengerId'].str[:4]
train_data_full['passenger_group'] = train_data_full['PassengerId'].str[:4]

# Keep the passenger IDs so they can be paired with the model predictions at the end.
test_passenger_id = test_data['PassengerId']
passenger_id = train_data_full['PassengerId']

# Separate the features from the target.
train_data = train_data_full.drop(columns=['Transported'])
y_train = train_data_full['Transported']

In [None]:
# Flag columns whose missing values FillBinaryNumericTransformer fills with False.
binary_columns = [
    'CryoSleep',
    'VIP',
]
# Spending columns whose missing values FillBinaryNumericTransformer fills with 0.
numeric_columns = [
    'FoodCourt',
    'RoomService',
    'Spa',
    'VRDeck',
    'ShoppingMall',
]

# Positional index of the cabin column; read by the CabinSeparator class.
cabin_ix = 3

class CabinSeparator(BaseEstimator, TransformerMixin):
    """Split the combined cabin value into separate deck and side columns.

    The column at position ``cabin_ix`` is split on '/'. ``transform`` appends
    ``cabin_deck`` and ``cabin_side``, removes the original cabin column and
    returns a NumPy array. The cabin number is deliberately not emitted: it
    can run past 1000 and would more likely overfit than help.

    Parameters
    ----------
    add_separate_cabin : bool, default=True
        When False, ``transform`` returns its input unchanged.
    """

    def __init__(self, add_separate_cabin=True):
        self.add_separate_cabin = add_separate_cabin

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` as an array with the cabin column replaced by deck and side."""
        if not self.add_separate_cabin:
            return X
        frame = pd.DataFrame(X)
        pieces = frame.iloc[:, cabin_ix].apply(lambda s: str(s).split('/'))
        cabin_deck = pieces.apply(lambda p: p[0])
        # A value without any '/' (e.g. a placeholder string) is reused as the side.
        cabin_side = pieces.apply(lambda p: p[0] if len(p) == 1 else p[2])
        # `assign` builds a new frame, so the caller's object is never modified.
        frame = frame.assign(cabin_deck=cabin_deck, cabin_side=cabin_side)
        frame = frame.drop(frame.columns[[cabin_ix]], axis=1)
        return frame.values

    def get_feature_names_out(self, input_features=None):
        """Return the output column names for the given input column names.

        Raises
        ------
        ValueError
            If ``input_features`` is None; this transformer does not record
            the input names during ``fit``.
        """
        if input_features is None:
            raise ValueError("input_features must be provided")
        names = list(input_features)
        if self.add_separate_cabin:
            # Mirror transform(): append the two new columns, then drop the cabin one.
            names.extend(['cabin_deck', 'cabin_side'])
            del names[cabin_ix]
        return np.asarray(names, dtype=object)

class FillBinaryNumericTransformer(BaseEstimator, TransformerMixin):
    """Fill missing flags with False and missing spending amounts with 0.

    Operates on the columns listed in ``binary_columns`` and ``numeric_columns``
    and returns a modified copy of the input DataFrame.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the flag and spending gaps filled."""
        # Work on a copy so the caller's DataFrame is left untouched.
        X = X.copy()
        # Assumption: no CryoSleep / VIP record means the passenger was in neither.
        X.loc[:, binary_columns] = X[binary_columns].fillna(False)
        # Same reasoning: no spending record is treated as no spending.
        X.loc[:, numeric_columns] = X[numeric_columns].fillna(0)
        return X

class FillCabinDestHomeAgeTransformer(BaseEstimator, TransformerMixin):
    """Fill gaps from other passengers that share the same ``passenger_group``.

    ``Cabin``, ``Destination`` and ``HomePlanet`` take the first non-missing
    value found in the group; ``Age`` takes the group mean. Groups with no
    usable value keep their gaps. Returns a modified copy of the input.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with group-level information filled in."""
        # Work on a copy so the caller's DataFrame is left untouched.
        X = X.copy()
        for col in ('Cabin', 'Destination', 'HomePlanet'):
            # GroupBy 'first' skips missing entries, so a group whose first row is
            # empty can still borrow the value from a later member.
            group_value = X.groupby('passenger_group')[col].transform('first')
            X[col] = X[col].fillna(group_value)
        group_age = X.groupby('passenger_group')['Age'].transform('mean')
        X['Age'] = X['Age'].fillna(group_age)
        return X

class FillRestTransformer(BaseEstimator, TransformerMixin):
    """Fill whatever is still missing after the group-based filling.

    ``Age`` is filled with the passenger-group mean and then with the overall
    mean; ``Cabin``, ``HomePlanet`` and ``Destination`` get the string 'None'.
    Returns a modified copy of the input.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the remaining gaps filled."""
        # Work on a copy so the caller's DataFrame is left untouched.
        X = X.copy()
        group_age = X.groupby('passenger_group')['Age'].transform('mean')
        X['Age'] = X['Age'].fillna(group_age)
        # NOTE(review): the overall mean comes from the frame being transformed,
        # not from the data seen in fit().
        X['Age'] = X['Age'].fillna(X['Age'].mean())
        leftover = ['Cabin', 'HomePlanet', 'Destination']
        X[leftover] = X[leftover].fillna('None')
        return X

class AddTotalSpent(BaseEstimator, TransformerMixin):
    """Append a column with the row-wise sum of a contiguous range of columns.

    Parameters
    ----------
    spend_start, spend_stop : int, default=6, 11
        Half-open positional range ``[spend_start, spend_stop)`` of the columns
        to add up. The defaults assume the array layout produced by the steps
        that precede this one in the `preprocessing` pipeline (..., Age, VIP,
        RoomService, FoodCourt, ShoppingMall, Spa, VRDeck, ...), i.e. the five
        spending columns -- TODO confirm against the CSV column order. The
        earlier hard-coded range 5:10 included VIP and left out VRDeck.
    """

    def __init__(self, spend_start=6, spend_stop=11):
        self.spend_start = spend_start
        self.spend_stop = spend_stop

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return the 2-D array ``X`` with the total appended as a last column."""
        total_spent = np.sum(X[:, self.spend_start:self.spend_stop], axis=1)
        return np.column_stack((X, total_spent))

class AddPolyFeatures(BaseEstimator, TransformerMixin):
    """Optionally expand the input with polynomial features.

    Parameters
    ----------
    degree : int, default=3
        Degree passed to ``PolynomialFeatures``.
    addpoly : bool, default=True
        When False the transformer passes its input through unchanged.
    """

    def __init__(self, degree=3, addpoly=True):
        self.degree = degree
        self.addpoly = addpoly
        self.poly = PolynomialFeatures(degree=self.degree, include_bias=False)

    def fit(self, X, y=None):
        """Fit the polynomial expander when ``addpoly`` is set."""
        # Rebuild from the current hyper-parameters: a degree changed through
        # set_params() after construction would otherwise be silently ignored.
        self.poly = PolynomialFeatures(degree=self.degree, include_bias=False)
        if self.addpoly:
            self.poly.fit(X)
        return self

    def transform(self, X):
        """Return the expanded features, or ``X`` itself when ``addpoly`` is False."""
        if self.addpoly:
            return self.poly.transform(X)
        return X

    def get_feature_names_out(self, input_features=None):
        """Return output feature names; input names pass through when ``addpoly`` is False."""
        if self.addpoly:
            return self.poly.get_feature_names_out(input_features)
        return np.array(input_features) if input_features is not None else np.array([])

In [None]:
# The pipeline below returns a NumPy array, so the column labels are rebuilt by hand:
# the input columns without 'Cabin', followed by the three appended features.
train_column_names = [col for col in train_data.columns if col != 'Cabin'] + ['cabin_deck','cabin_side','total_spent']
test_column_names = [col for col in test_data.columns if col != 'Cabin'] + ['cabin_deck','cabin_side','total_spent']

# Step order matters: the fill steps need a DataFrame, CabinSeparator converts it
# to an array, and AddTotalSpent slices that array by position.
preprocessing = Pipeline([
    ('binary_numeric', FillBinaryNumericTransformer()),
    ('cabin_dest_home_age', FillCabinDestHomeAgeTransformer()),
    ('rest', FillRestTransformer()),
    ('cabin_separator', CabinSeparator()),
    ('add_total_spent', AddTotalSpent()),
])

# NOTE: the pipeline is fitted on the full training frame, before the train/validation/test
# split further down; the Kaggle test frame is only transformed. Fitting after the split
# would be stricter, but this is considered good enough for now.
treated_train_data = pd.DataFrame(preprocessing.fit_transform(train_data), columns=train_column_names)
treated_test_data = pd.DataFrame(preprocessing.transform(test_data), columns=test_column_names)

columns_to_convert = ['total_spent', 'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Convert these columns to a numeric dtype; values that cannot be parsed become NaN.
treated_train_data[columns_to_convert] = treated_train_data[columns_to_convert].apply(pd.to_numeric, errors='coerce')
treated_test_data[columns_to_convert] = treated_test_data[columns_to_convert].apply(pd.to_numeric, errors='coerce')

# Store the two flag columns as integers.
treated_train_data[['VIP','CryoSleep']] = treated_train_data[['VIP','CryoSleep']].astype(int)
treated_test_data[['VIP','CryoSleep']] = treated_test_data[['VIP','CryoSleep']].astype(int)

In [None]:
# Check dtypes and non-null counts of the preprocessed Kaggle test frame.
treated_test_data.info()

In [None]:
# Hold out 20% of the labelled data as a local test set. The labels are read from
# `train_data_full` rather than `y_train`, because `y_train` is rebound below and
# re-running this cell would otherwise fail with a length mismatch.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    treated_train_data, train_data_full['Transported'], test_size=0.2, random_state=42
)

# Split the remainder into train and validation sets: 0.25 * 0.8 = 0.2 of the full data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.25, random_state=42
)

## EDA

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Compare, for one numeric feature, the mean per passenger group with the mean of
# the cabin deck that group is on. The frame used is `treated_train_data`, which
# must contain 'passenger_group', 'cabin_deck' and the chosen column.

# Numeric feature to analyse; replace with any other numeric column of interest.
column = 'ShoppingMall'

# 1. Compute the mean of `column` per passenger group.
passenger_stats = (
    treated_train_data.groupby('passenger_group')[column]
      .mean()
      .reset_index()
      .rename(columns={column: f'{column}_passenger_mean'})
)

# 2. Get the cabin deck(s) for each passenger group.
#    NOTE(review): a group that appears with more than one deck yields one row per
#    (group, deck) pair here, which duplicates that group after the merge below.
group_deck = treated_train_data[['passenger_group', 'cabin_deck']].drop_duplicates()

# Merge the deck info into the passenger_stats DataFrame.
passenger_stats = passenger_stats.merge(group_deck, on='passenger_group', how='left')

# 3. Compute the mean of `column` per cabin deck.
deck_stats = (
    treated_train_data.groupby('cabin_deck')[column]
      .mean()
      .reset_index()
      .rename(columns={column: f'{column}_deck_mean'})
)

# Merge the cabin deck stats into the passenger_stats DataFrame.
passenger_stats = passenger_stats.merge(deck_stats, on='cabin_deck', how='left')

# 4. Create a scatter plot comparing the two means.
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=passenger_stats,
    x=f'{column}_deck_mean',
    y=f'{column}_passenger_mean',
    hue='cabin_deck',
    s=100,
    palette="deep"
)

# Plot the y=x line to serve as a reference.
min_val = min(passenger_stats[f'{column}_deck_mean'].min(), passenger_stats[f'{column}_passenger_mean'].min())
max_val = max(passenger_stats[f'{column}_deck_mean'].max(), passenger_stats[f'{column}_passenger_mean'].max())
plt.plot([min_val, max_val], [min_val, max_val], 'k--', label='y = x')

plt.xlabel(f'Cabin Deck Mean {column}')
plt.ylabel(f'Passenger Group Mean {column}')
plt.title(f'Comparing Cabin Deck vs. Passenger Group Mean {column}')
plt.legend(title='Cabin Deck')
plt.show()


In [None]:
# Display the per-group table that backs the scatter plot.
passenger_stats

## Numerical

In [None]:
# Spending-related columns, including the derived total.
spending_cols = [
    'RoomService',
    'VRDeck',
    'ShoppingMall',
    'FoodCourt',
    'Spa',
    'total_spent',
]

In [None]:
class GroupStatsTransformer(BaseEstimator, TransformerMixin):
    """
    Learn per-group statistics in ``fit`` and left-merge them onto data in ``transform``.

    Statistics are computed separately for 'passenger_group', 'cabin_deck' and
    'cabin_side':
      - 'Age', 'VIP', 'CryoSleep': mean
      - 'RoomService', 'VRDeck', 'ShoppingMall', 'FoodCourt', 'Spa', 'total_spent': sum

    Each statistic column is renamed with a suffix naming the grouping it came
    from ('_gm_pass_group', '_gm_cab_deck', '_gm_cab_side').
    """
    def __init__(self):
        pass

    def _grouped_stats(self, frame, key, tag):
        # Aggregate `frame` by `key` and tag every statistic column with `tag`.
        renames = {col: f"{col}_{tag}" for col in self.base_agg_dict_.keys()}
        summary = frame.groupby(key).agg(self.base_agg_dict_).reset_index()
        return summary.rename(columns=renames)

    def fit(self, X, y=None):
        frame = X.copy()

        mean_cols = ['Age', 'VIP', 'CryoSleep']
        sum_cols = ['RoomService', 'VRDeck', 'ShoppingMall', 'FoodCourt', 'Spa', 'total_spent']
        self.base_agg_dict_ = {
            **{col: 'mean' for col in mean_cols},
            **{col: 'sum' for col in sum_cols},
        }

        self.passenger_stats_ = self._grouped_stats(frame, 'passenger_group', 'gm_pass_group')
        self.deck_stats_ = self._grouped_stats(frame, 'cabin_deck', 'gm_cab_deck')
        self.side_stats_ = self._grouped_stats(frame, 'cabin_side', 'gm_cab_side')

        return self

    def transform(self, X):
        merged = X.copy()

        # (grouping column, name of the fitted attribute holding its statistics)
        lookups = (
            ('passenger_group', 'passenger_stats_'),
            ('cabin_deck', 'deck_stats_'),
            ('cabin_side', 'side_stats_'),
        )
        for key, attr in lookups:
            if key in merged.columns:
                merged = merged.merge(getattr(self, attr), on=key, how='left')

        # merge() hands back a fresh default index; restore the caller's row labels.
        merged.index = X.index

        return merged

In [None]:
class SpendingClusterTransformer(BaseEstimator, TransformerMixin):
    """Add a 'spending_cluster' column holding a KMeans label.

    The columns named in ``spending_cols`` are standardised with a
    StandardScaler and clustered with KMeans; both are fitted in ``fit``.
    """

    def __init__(self, spending_cols, n_clusters=100, random_state=42):
        self.spending_cols = spending_cols
        self.n_clusters = n_clusters
        self.random_state = random_state

    def fit(self, X, y=None):
        spending = X.copy()[self.spending_cols]
        self.scaler_ = StandardScaler()
        self.scaler_.fit(spending)
        scaled = self.scaler_.transform(spending)

        # Cluster in the standardised space.
        self.kmeans_ = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)
        self.kmeans_.fit(scaled)
        return self

    def transform(self, X):
        scaled = self.scaler_.transform(X[self.spending_cols])
        labels = self.kmeans_.predict(scaled)
        # assign() returns a new frame, leaving the caller's DataFrame untouched.
        return X.assign(spending_cluster=labels)

In [None]:
# Same list as defined a few cells above; repeated so this cell can be read on its own.
spending_cols = ['RoomService', 'VRDeck', 'ShoppingMall', 'FoodCourt', 'Spa', 'total_spent']

# Group statistics first, then the spending cluster label.
# NOTE: the name `group_stats_pipeline` is rebound to a different pipeline further down.
group_stats_pipeline = Pipeline([
    ('group_stats', GroupStatsTransformer()),
    ('spending_cluster', SpendingClusterTransformer(spending_cols=spending_cols, n_clusters=100, random_state=42))
])

In [None]:
# Fit on the training split only; the test and validation splits are just transformed.
X_train_stats = group_stats_pipeline.fit_transform(X_train)
X_test_stats = group_stats_pipeline.transform(X_test)
X_val_stats = group_stats_pipeline.transform(X_val)

In [None]:
# Display the test split with the merged group statistics.
X_test_stats

In [None]:
## I could do some kind of optimization to reach better values for these, for example by considering the total_spent amount and how much it affects them,
## but for now I'll leave it like this and test out the model

class GroupStatsFiller(BaseEstimator, TransformerMixin):
    """
    Fill missing passenger-group statistics with per-spending-cluster means.

    ``fit`` averages every '*_gm_pass_group' column within each
    'spending_cluster'; ``transform`` uses those averages wherever a
    '*_gm_pass_group' value is missing.
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        frame = X.copy()

        stat_sources = ['Age', 'VIP', 'CryoSleep', 'RoomService', 'VRDeck',
                        'ShoppingMall', 'FoodCourt', 'Spa', 'total_spent']
        self.base_agg_dict_ = {f'{name}_gm_pass_group': 'mean' for name in stat_sources}

        # Despite the attribute name, these are the per-spending-cluster averages.
        self.passenger_stats_ = (
            frame.groupby('spending_cluster')
                 .agg(self.base_agg_dict_)
                 .reset_index()
        )

        return self

    def transform(self, X):
        filled = X.copy()

        if 'spending_cluster' in filled.columns:
            # Overlapping column names from the right-hand side get a '_fill' suffix.
            filled = filled.merge(self.passenger_stats_, on='spending_cluster', how='left', suffixes=('', '_fill'))
            fill_cols = []
            for stat_col in self.base_agg_dict_.keys():
                fill_col = f'{stat_col}_fill'
                filled[stat_col] = filled[stat_col].fillna(filled[fill_col])
                fill_cols.append(fill_col)
            filled = filled.drop(columns=fill_cols)

        # Restore the caller's row labels (merge() resets them).
        filled.index = X.index

        return filled

In [None]:
# NOTE: this rebinds `group_stats_pipeline`; from here on the name refers to the
# filler pipeline, not the statistics/cluster pipeline built earlier.
group_stats_pipeline = Pipeline([
    ('group_stats_filler', GroupStatsFiller()),
])

In [None]:
# Fit the filler on the training split and apply it to all three splits.
X_train_filled = group_stats_pipeline.fit_transform(X_train_stats)
X_test_filled = group_stats_pipeline.transform(X_test_stats)
X_val_filled = group_stats_pipeline.transform(X_val_stats)

In [None]:
# Test split before filling, for comparison with the next cell.
X_test_stats

In [None]:
# Test split after the filler has been applied.
X_test_filled

In [None]:
import pandas as pd

def _make_column_names_unique(df, base_agg_dict):
    """Suffix each column name with a key of base_agg_dict, cycling through the keys in order.

    Renames the columns of ``df`` in place and returns the same object.
    """
    suffixes = list(base_agg_dict.keys())
    cycle_length = len(suffixes)
    renamed = []
    for position, name in enumerate(df.columns):
        renamed.append(f"{name}_{suffixes[position % cycle_length]}")

    df.columns = renamed
    return df


def add_group_stats(df):
    """Build group-level statistic columns for passenger group, cabin deck and cabin side.

    For each of the three grouping columns, the per-group mean ('Age', 'VIP',
    'CryoSleep') or sum ('RoomService', 'VRDeck', 'ShoppingMall', 'FoodCourt')
    is broadcast back to every row. Returns only the new columns, indexed like
    ``df`` and named '<agg><suffix>_<source column>'.
    """
    base_agg_dict = {
        'Age': 'mean',
        'VIP': 'mean',
        'CryoSleep': 'mean',
        'RoomService': 'sum',
        'VRDeck': 'sum',
        'ShoppingMall': 'sum',
        'FoodCourt': 'sum'
    }

    def _suffixed_names(aggregations, suffix):
        # Map each source column to its aggregation name plus the group suffix.
        renamed = {}
        for source, how in aggregations.items():
            renamed[source] = how.replace("mean", f"mean{suffix}").replace("sum", f"sum{suffix}")
        return renamed

    def _broadcast_stats(group_col, suffix):
        # Aggregate per group, then left-merge back so every row gets its group's values.
        per_group = df.groupby(group_col).agg(base_agg_dict).reset_index()
        per_row = df[[group_col]].merge(per_group, on=group_col, how='left')
        per_row = per_row.rename(columns=_suffixed_names(base_agg_dict, suffix))
        return per_row.drop(columns=group_col)

    groupings = (
        ('passenger_group', "_gm_pass_group"),
        ('cabin_deck', "_gm_cab_deck"),
        ('cabin_side', "_gm_cab_side"),
    )
    blocks = [_broadcast_stats(group_col, suffix) for group_col, suffix in groupings]

    new_features = pd.concat(blocks, axis=1)
    new_features.index = df.index

    # The renaming above leaves duplicate names; appending the source column fixes that.
    return _make_column_names_unique(new_features, base_agg_dict)


In [None]:
# NOTE: this import rebinds `PowerTransformer`, shadowing the scikit-learn class of the
# same name imported in the first cell; from here on the name refers to feature_engine's.
from feature_engine.transformation import PowerTransformer

# Small offsets are added before the log and the reciprocal so that zeros do not
# produce -inf / division by zero.
log_transformer = FunctionTransformer(lambda x: np.log(x + 1e-9))
reciprocal_transformer = FunctionTransformer(lambda x: np.reciprocal(x + 1e-1))
power_transformer = PowerTransformer(exp=0.23)

# Apply all three transforms to every float64 column; other columns are dropped and
# the output is a DataFrame whose column names carry the transformer prefix.
transformer = ColumnTransformer(
    transformers=[
        ('lt', log_transformer, make_column_selector(dtype_include='float64')),
        ('rt', reciprocal_transformer, make_column_selector(dtype_include='float64')),
        ('pt', power_transformer, make_column_selector(dtype_include='float64')),
    ],
    remainder='drop',
    verbose_feature_names_out=True,
    n_jobs=-1
).set_output(transform='pandas')

In [None]:
# Check dtypes and non-null counts of the training split with group statistics.
X_train_stats.info()

In [None]:
# Fit the log / reciprocal / power transforms on the training split, then apply to all splits.
X_train_transformed = transformer.fit_transform(X_train_filled)
X_test_transformed = transformer.transform(X_test_filled)
X_val_transformed = transformer.transform(X_val_filled)

# Re-attach the training row labels to the transformed training frame.
# NOTE(review): only the training output is re-indexed; the concats below align on the
# index, so confirm the test/val outputs still carry the labels of X_test / X_val.
X_train_transformed.index = X_train.index

# Place the original float64 columns of each split next to their transformed versions.
X_train_concat = pd.concat([X_train.select_dtypes('float64'), X_train_transformed], axis=1)
X_test_concat = pd.concat([X_test.select_dtypes('float64'), X_test_transformed], axis=1)
X_val_concat = pd.concat([X_val.select_dtypes('float64'), X_val_transformed], axis=1)

In [None]:
# Display the combined original + transformed numeric features of the training split.
X_train_concat

In [None]:
# Single-step pipeline that maps every feature to a normal output distribution.
transformer_pipeline = Pipeline([
    ('quantile', QuantileTransformer(output_distribution='normal'))
])

In [None]:
# Fit the quantile transform on the training split and apply it to all three splits.
X_train_transformed_array = transformer_pipeline.fit_transform(X_train_concat)
X_test_transformed_array = transformer_pipeline.transform(X_test_concat)
X_val_transformed_array = transformer_pipeline.transform(X_val_concat)

# Wrap the arrays back into DataFrames. No index is passed, so each frame gets a
# fresh default index rather than the row labels of its split.
X_train_transformed_df = pd.DataFrame(X_train_transformed_array, columns=X_train_concat.columns)
X_test_transformed_df = pd.DataFrame(X_test_transformed_array, columns=X_test_concat.columns)
X_val_transformed_df = pd.DataFrame(X_val_transformed_array, columns=X_val_concat.columns)

X_train_transformed_df

## Categorical

In [None]:
# Remove the two identifier columns before looking at the categorical features.
X_train_dropped = X_train.drop(['PassengerId', 'passenger_group'], axis=1)

In [None]:
# Display the training features side by side with the target.
pd.concat([X_train_dropped, y_train], axis=1)

In [None]:
# Give the target the same row labels as the features so the concat lines up.
y_train.index = X_train_dropped.index

fig, axes = plt.subplots(1, 4, figsize=(18, 3))

# Mean of 'Transported' per category, one panel per categorical feature.
train_with_target = pd.concat([X_train_dropped, y_train], axis=1)
for ax, feature in zip(axes, ['cabin_deck', 'cabin_side', 'Destination', 'HomePlanet']):
    train_with_target.groupby([feature])['Transported'].mean().plot(ax=ax, title=f'{feature} vs Transported')

plt.tight_layout()
plt.show()

In [None]:
fig, axes = plt.subplots(1, 4, figsize=(18, 3))

# Standard deviation of 'Transported' per category, one panel per categorical feature.
train_with_target = pd.concat([X_train_dropped, y_train], axis=1)
for ax, feature in zip(axes, ['cabin_deck', 'cabin_side', 'Destination', 'HomePlanet']):
    train_with_target.groupby([feature])['Transported'].std().plot(ax=ax, title=f'{feature} vs Transported')

plt.tight_layout()
plt.show()

In [None]:
from category_encoders import TargetEncoder
# Object-dtype columns are treated as the categorical features.
categorical_cols = X_train_dropped.select_dtypes('object').columns

# Target-encode the categorical columns; other columns are left as they are.
pipeline = Pipeline([
        ('te', TargetEncoder(cols=categorical_cols, smoothing=0.3, min_samples_leaf=10)),
])

# Fitted with the training target; the result is displayed below.
X_train_encoded = pipeline.fit_transform(X_train_dropped, y_train)
X_train_encoded

In [None]:
# Give the target the same row labels as the encoded features so the concat lines up.
y_train.index = X_train_encoded.index

fig, axes = plt.subplots(1, 4, figsize=(18, 3))

# Mean of 'Transported' per encoded value, one panel per categorical feature.
encoded_with_target = pd.concat([X_train_encoded, y_train], axis=1)
for ax, feature in zip(axes, ['cabin_deck', 'cabin_side', 'Destination', 'HomePlanet']):
    encoded_with_target.groupby([feature])['Transported'].mean().plot(ax=ax, title=f'{feature} vs Transported')

plt.tight_layout()
plt.show()

In [None]:
from feature_engine.encoding import OrdinalEncoder

# Object-dtype columns are treated as the categorical features.
categorical_cols = X_train_dropped.select_dtypes('object').columns

# Categorical columns get three encodings (one-hot, ordinal, target); numeric
# columns get two scalings (standard, quantile-to-normal). Everything else is dropped.
cat_transformer = ColumnTransformer(
    transformers=[
        ('oh', OneHotEncoder(), make_column_selector(dtype_include='object')),
        ('oe', OrdinalEncoder(), make_column_selector(dtype_include='object')),
        ('te', TargetEncoder(cols=categorical_cols, smoothing=0.3, min_samples_leaf=10), make_column_selector(dtype_include='object')),
        ('ss', StandardScaler(), make_column_selector(dtype_include=['float64', 'int'])),
        ('quantile', QuantileTransformer(output_distribution='normal'), make_column_selector(dtype_include=['float64', 'int']))
    ],
    remainder='drop'
).set_output(transform='default')

cat_pipeline = Pipeline([
    ('cat_transformer', cat_transformer)
])

# Fit on the training split, then give the held-out splits exactly the columns the
# pipeline was fitted on (X_test / X_val still carry the identifier columns).
fitted_columns = X_train_dropped.columns
X_train_encoded_arr = cat_pipeline.fit_transform(X_train_dropped, y_train)
X_test_encoded_arr = cat_pipeline.transform(X_test[fitted_columns])
X_val_encoded_arr = cat_pipeline.transform(X_val[fitted_columns])

# The output is an array; label its columns with the transformer's feature names.
encoded_feature_names = cat_transformer.get_feature_names_out()
X_train_encoded = pd.DataFrame(X_train_encoded_arr, columns=encoded_feature_names)
X_test_encoded = pd.DataFrame(X_test_encoded_arr, columns=encoded_feature_names)
X_val_encoded = pd.DataFrame(X_val_encoded_arr, columns=encoded_feature_names)
X_train_encoded

In [None]:
# Give the target the same row labels as the encoded features so the concat lines up.
y_train.index = X_train_encoded.index

fig, axes = plt.subplots(1, 4, figsize=(18, 3))

# Mean of 'Transported' per ordinal code, one panel per categorical feature.
encoded_with_target = pd.concat([X_train_encoded, y_train], axis=1)
for ax, feature in zip(axes, ['cabin_deck', 'cabin_side', 'Destination', 'HomePlanet']):
    encoded_with_target.groupby([f'oe__{feature}'])['Transported'].mean().plot(ax=ax, title=f'{feature} vs Transported')

plt.tight_layout()
plt.show()

In [None]:
# Final feature matrices: numeric block and encoded block side by side, per split.
# pd.concat(axis=1) aligns the two blocks on their index.
X_train_final = pd.concat([X_train_transformed_df,X_train_encoded], axis=1)
X_test_final = pd.concat([X_test_transformed_df,X_test_encoded], axis=1)
X_val_final = pd.concat([X_val_transformed_df,X_val_encoded], axis=1)

In [None]:
# Restore the training split's row labels on the final training matrix.
X_train_final.index = X_train.index

In [None]:
# Display the final training feature matrix.
X_train_final

#### Load models

In [None]:
# Load previously saved estimators from the working directory.
# NOTE: joblib.load unpickles arbitrary Python objects -- only load files you trust.
# NOTE(review): presumably xgb_model is an XGBoost estimator, which needs the xgboost
# package installed even though this notebook never imports it -- confirm.
gbc_model = joblib.load('best_gbc_model.pkl')
lr_model = joblib.load('best_lr_model.pkl')
svc_model = joblib.load('best_svc_model.pkl')
xgb_model = joblib.load('best_xgb_model.pkl')
stacking_clf = joblib.load('stacking_clf.pkl')

In [None]:
# Refit the loaded gbc_model on the final training features.
gbc_model.fit(X_train_final, y_train)

In [None]:
# Refit the loaded svc_model on the final training features.
svc_model.fit(X_train_final, y_train)

In [None]:
# Refit the loaded lr_model on the final training features.
lr_model.fit(X_train_final, y_train)

In [None]:
# Refit the loaded xgb_model on the final training features.
xgb_model.fit(X_train_final, y_train)

In [None]:
# Refit the loaded stacking_clf on the final training features.
stacking_clf.fit(X_train_final, y_train)

In [None]:
def accuracy_check(model, X_val, y_val):
    """Return the accuracy of ``model``'s predictions for ``X_val`` against ``y_val``."""
    return accuracy_score(y_val, model.predict(X_val))

In [None]:
# Display the final validation feature matrix.
X_val_final

In [None]:
# Validation accuracy of stacking_clf.
accuracy_check(stacking_clf, X_val_final, y_val)

In [None]:
# Validation accuracy of gbc_model.
accuracy_check(gbc_model, X_val_final, y_val)

In [None]:
# Validation accuracy of xgb_model.
accuracy_check(xgb_model, X_val_final, y_val)

In [None]:
# Validation accuracy of lr_model.
accuracy_check(lr_model, X_val_final, y_val)

In [None]:
# Validation accuracy of svc_model.
accuracy_check(svc_model, X_val_final, y_val)

In [None]:
# Save the final feature matrices of the three splits as Parquet files in the data directory.
X_train_final.to_parquet(path / 'X_train.pq')
X_test_final.to_parquet(path / 'X_test.pq')
X_val_final.to_parquet(path / 'X_val.pq')