In [101]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn import set_config
set_config(transform_output="pandas")

In [129]:
data=pd.read_csv('train.csv')

In [130]:
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [131]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


# Inital preprocessing

In [132]:
def preprocess_inputs(df):
    df=df.copy()
    # drop unnessecary columns
    df=df.drop(['PassengerId','Name','Cabin'],axis=1)
    
    X=df.drop(['Transported'],axis=1)
    y=df['Transported']
    
    #train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    return X_train, X_test, y_train, y_test

In [133]:
X_train, X_test, y_train, y_test=preprocess_inputs(data)

In [134]:
X_train.columns

Index(['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],
      dtype='object')

In [135]:
type(X_train)

pandas.core.frame.DataFrame

# Building Pipeline

In [136]:
{column: len(X_train[column].unique()) for column in X_train.select_dtypes('object').columns}

{'HomePlanet': 4, 'CryoSleep': 3, 'Destination': 4, 'VIP': 3}

In [150]:
#classify features by type
numerical_features=['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
categorical_features=['HomePlanet','CryoSleep','Destination','VIP']

In [151]:
numerical_transformer=Pipeline(steps=[
    ('numerical',SimpleImputer(strategy='median'))
])

categorical_transformer=Pipeline(steps=[
    ('categorical',SimpleImputer(strategy='most_frequent'))
    
])

In [152]:
# Build a preprocessing transformer with ColumnTransformer
imputing=ColumnTransformer(transformers=[
    ('numerical',numerical_transformer, numerical_features),
    ('categorical',categorical_transformer, categorical_features),   
])

In [153]:
# Transformer for date columns that extract year, month and day features
class CabinSplitTransformer:
    def fit(self,X,y):
        return self
    def transform(self, X):
        # Ensure X is a DataFrame
        X = pd.DataFrame(X)
        new_columns = []  # To collect new DataFrame slices

        for column in self.columns:
            if column in X.columns:
                # Split the column into three new columns
                splits = X[column].str.split('/', expand=True)
                if splits.shape[1] == 3:  # Ensure there are exactly three parts
                    splits.columns = ['Deck', 'Num', 'Side']
                    new_columns.append(splits)
                else:
                    raise ValueError(f"The column {column} does not contain exactly three parts separable by '/'.")
            else:
                raise KeyError(f"The column {column} was not found in the DataFrame.")

            # Drop the original column from X
            X = X.drop(column, axis=1)

        # Concatenate all the new columns with the remaining parts of the original DataFrame
        X = pd.concat([X] + new_columns, axis=1)
        return X

In [154]:
# Classify features by type
binary_features=['CryoSleep','VIP']
nominal_features=['HomePlanet','Destination']
#cabin_features=['Cabin']

#Construct transformers to handle each type of features
binary_transformer=Pipeline(steps=[
    ('ordinal',OrdinalEncoder(categories='auto'))
])

nominal_transformer=Pipeline(steps=[
    ('nominal',OneHotEncoder())
])
    

In [155]:
# Build a preprocessing transformer with ColumnTransformer
preprocessor=ColumnTransformer(transformers=[
    ('binary',binary_transformer, binary_features),
    ('nominal',nominal_transformer, nominal_features),
])

In [156]:
# Build the final pipeline
model=Pipeline(steps=[
    ('imputer', imputing),
    ('preprocessor',preprocessor),
    ('scaler',StandardScaler()),
    ('classifier',LogisticRegression())
])

In [157]:
model

In [158]:
model.fit(X_train,y_train)

ValueError: A given column is not a column of the dataframe