In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import (
    train_test_split, cross_val_score, cross_val_predict, StratifiedKFold, cross_validate, RandomizedSearchCV, GridSearchCV, learning_curve
)
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, classification_report, roc_auc_score, precision_recall_curve, roc_curve, confusion_matrix
)

from scipy.stats import uniform, randint


In [11]:
# Input data: exactly one of the two course CSVs is loaded into `df`.
# To process the training split instead, swap the URL for:
#   https://raw.githubusercontent.com/stepthom/869_course/refs/heads/main/data/spaceship_titanic_train.csv
data_url = "https://raw.githubusercontent.com/stepthom/869_course/refs/heads/main/data/spaceship_titanic_test.csv"
df = pd.read_csv(data_url)

class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends derived columns to a passenger frame.

    ``transform`` returns a copy of the input with these columns added:

    * ``Cabin_Prefix``       - text of ``Cabin`` before the first ``'/'``;
                               stays NaN when ``Cabin`` is missing.
    * ``PassengerId_Prefix`` - text of ``PassengerId`` before the first ``'_'``.
    * ``GroupSize``          - number of rows sharing a ``PassengerId_Prefix``
                               within the frame being transformed.
    * ``TotalSpend``         - row-wise sum of ``columns_to_sum``.
    * ``LuxurySpend``        - row-wise sum of ``Spa`` and ``VRDeck``.
    * ``BasicSpend``         - row-wise sum of ``RoomService`` and ``FoodCourt``.
    """

    def __init__(self):
        # Amenity columns whose row-wise sum becomes ``TotalSpend``.
        self.columns_to_sum = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the estimator API.

        Returns:
            self
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the derived feature columns appended.

        Args:
            X: DataFrame containing ``Cabin``, ``PassengerId`` and the five
                spending columns listed in ``columns_to_sum``.

        Returns:
            A new DataFrame; the caller's frame is not modified.
        """
        X = X.copy()

        # Cabin_Prefix: casting to str is what makes the .str accessor safe on
        # any dtype, but it also turns NaN into the literal 'nan'.  Mask those
        # rows back to NaN so a downstream imputer can see (and fill) them
        # instead of one-hot encoding a bogus 'nan' category.
        cabin = X['Cabin']
        X['Cabin_Prefix'] = cabin.astype(str).str.split('/').str[0].where(cabin.notna())

        # Passenger_ID_Prefix and the number of rows that share it.
        X['PassengerId_Prefix'] = X['PassengerId'].str.split('_').str[0]
        X['GroupSize'] = X.groupby('PassengerId_Prefix')['PassengerId'].transform('count')

        # Spend features: DataFrame.sum skips NaN, so a row with every amount
        # missing sums to 0.
        X['TotalSpend'] = X[self.columns_to_sum].sum(axis=1)
        X['LuxurySpend'] = X[['Spa', 'VRDeck']].sum(axis=1)
        X['BasicSpend'] = X[['RoomService', 'FoodCourt']].sum(axis=1)
        return X

# Define feature groups
# Columns that are imputed, cast to str and one-hot encoded.  Cabin_Prefix and
# GroupSize do not exist in the raw CSV; FeatureEngineer.transform adds them.
categorical_features = [
    'HomePlanet',
    'Destination',
    'VIP',
    'CryoSleep',
    'Cabin_Prefix',
    'GroupSize',
]

age_feature = ['Age']

# Raw amenity amounts plus the three aggregates FeatureEngineer derives.
spending_features = [
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'TotalSpend', 'LuxurySpend', 'BasicSpend'
]


def _to_str(values):
    """Cast every entry to ``str`` so the encoder sees a single uniform dtype.

    A named function (rather than a lambda) keeps the fitted pipeline picklable.
    """
    return values.astype(str)


# Pipelines
cat_pipeline = Pipeline([
    # Alternative imputer that keeps missingness as its own category:
    #   SimpleImputer(strategy='constant', fill_value='Missing')
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('to_str', FunctionTransformer(_to_str)),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

age_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# A missing spending amount is filled with 0.
spending_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
])

preprocessor = ColumnTransformer(
    [
        ('cat', cat_pipeline, categorical_features),
        ('age', age_pipeline, age_feature),
        ('spend', spending_pipeline, spending_features),
    ],
    # Always return a dense array.  With the default threshold the output
    # silently becomes a SciPy sparse matrix whenever the stacked result is
    # sparse enough, which breaks code that wraps it in a DataFrame.
    sparse_threshold=0,
)

# Final pipeline: derive features first, then impute / encode them.
full_pipeline = Pipeline([
    ('features', FeatureEngineer()),
    ('preprocess', preprocessor),
])

In [12]:
# Fit the pipeline and transform the loaded frame in one pass.  A separate
# full_pipeline.fit(X) beforehand is unnecessary: fit_transform refits every
# step anyway, so it only doubled the work.
# NOTE(review): imputation statistics and one-hot categories are learned from
# whatever `df` currently holds.  If train and test are each run through this
# cell separately, their encoded columns are not guaranteed to line up --
# confirm that is intended.
X = df

X_transformed = full_pipeline.fit_transform(X)

# ColumnTransformer can hand back a SciPy sparse matrix when the stacked output
# is sparse enough; densify so the DataFrame below gets one column per feature.
if hasattr(X_transformed, "toarray"):
    X_transformed = X_transformed.toarray()

# Rebuild column names in the same order the ColumnTransformer stacks its
# outputs: one-hot columns, then age, then spending.
preprocessor = full_pipeline.named_steps["preprocess"]
cat_encoder = preprocessor.named_transformers_["cat"].named_steps["onehot"]
cat_feature_names = cat_encoder.get_feature_names_out(categorical_features)
all_feature_names = list(cat_feature_names) + age_feature + spending_features

X_transformed_df = pd.DataFrame(X_transformed, columns=all_feature_names)

# Presumably for the training split only (the label column is read from `df`):
# uncomment to carry the target through to the exported file.
#transported = df['Transported']
#X_transformed_df['Transported'] = transported.values

X_transformed_df.to_csv("Y_cat_Missing_encoded.csv", index=False)
