In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Building a Pipeline - Titanic
When working on machine learning projects, things can quickly get messy—datasets change, features evolve, and models need to be retrained regularly. This is where ML pipelines come in! They help streamline the process by organizing everything into a structured workflow, making it easier to scale, automate, and reproduce results.

In [2]:
import pandas as pd
import numpy as np

from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import set_config
set_config(display='diagram')  # Enables visual display of the pipeline

%matplotlib inline
pd.set_option('display.max_columns', 500)

First, let's load the dataset so we can start working with it.

In [3]:
# Locations of the Titanic competition files inside the Kaggle input directory.
df_train_filepath = '/kaggle/input/titanic/train.csv'
df_test_filepath = '/kaggle/input/titanic/test.csv'

# Read both splits into DataFrames.
df_train = pd.read_csv(df_train_filepath)
df_test = pd.read_csv(df_test_filepath)

# Column dtypes / non-null counts first, then summary statistics (one row per numeric column).
df_train.info()
df_train.describe().T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


Now, let's define custom classes for preprocessing, which will handle feature selection, missing value imputation, and categorical encoding—all while preserving the DataFrame structure.

In [4]:
class general_preprocessing(BaseEstimator, TransformerMixin):
    '''
    Dataset-wide preparation step.

    Adds the binary `_hasCabin` flag (1 when `Cabin` is not null, 0 otherwise)
    and removes the `Cabin`, `PassengerId`, `Name` and `Ticket` columns.
    '''
    def __init__(self):
        self.columns = []

    def fit(self, X, y=None):
        # Nothing is learned from the data; the step is purely column-based.
        return self

    def transform(self, X):
        dropped_columns = ['Cabin', 'PassengerId', 'Name', 'Ticket']
        cabin_flag = X['Cabin'].notnull().astype(int)

        # assign() works on a copy, so the caller's frame is left untouched.
        Xtr = X.assign(_hasCabin=cabin_flag).drop(columns=dropped_columns)

        # Remember the output layout for get_features_name().
        self.columns = Xtr.columns
        return Xtr

    def get_features_name(self):
        # Column labels produced by the most recent transform() call.
        return self.columns

class feature_selector(BaseEstimator, TransformerMixin):
    '''
    Keep only one family of columns.

    dtype='numerical' keeps the numeric columns; any other value keeps the
    object-typed (categorical) columns.
    '''
    def __init__(self, dtype='numerical'):
        self.dtype = dtype

    def fit(self, X, y=None):
        # Stateless: the selection depends only on the dtypes seen in transform().
        return self

    def transform(self, X):
        wanted_dtypes = [np.number] if self.dtype == 'numerical' else [object]
        return X.select_dtypes(include=wanted_dtypes)

class df_imputer(BaseEstimator, TransformerMixin):
    '''
    Wrapper around SimpleImputer that keeps the DataFrame structure.

    Parameters:
    - strategy: 'mean', 'median', 'most_frequent', 'constant'

    Attributes set by fit():
    - imp: the fitted SimpleImputer
    - statistics_: pd.Series of the imputation value per input column
    - columns: labels of the columns produced by transform()
    '''
    def __init__(self, strategy='mean'):
        self.strategy = strategy
        self.imp = None
        self.statistics_ = None
        self.columns = []

    def fit(self, X, y=None):
        # X is supposed to be a DataFrame
        self.imp = SimpleImputer(strategy=self.strategy)
        self.imp.fit(X)
        self.statistics_ = pd.Series(self.imp.statistics_, index=X.columns)
        # Record the output layout at fit time so get_features_name() is valid
        # before any transform() call. Asking the imputer for its output names
        # also accounts for columns it discards (columns that were entirely
        # missing during fit), which would otherwise make the DataFrame
        # construction in transform() fail with a shape mismatch.
        self.columns = pd.Index(self.imp.get_feature_names_out(list(X.columns)))
        return self

    def transform(self, X):
        # X is supposed to be a DataFrame with the same columns seen in fit()
        Ximp = self.imp.transform(X)
        return pd.DataFrame(Ximp, index=X.index, columns=self.columns)

    def get_features_name(self):
        # Used by FeatureUnion_df to label the concatenated output.
        return self.columns

class df_encoder(BaseEstimator, TransformerMixin):
    '''
    OneHotEncoder that keeps the dataframe structure.

    Object-typed columns are one-hot encoded (unknown categories seen at
    transform time are encoded as all zeros); every other column is passed
    through unchanged and placed before the encoded block.

    Attributes set by fit():
    - encoder: the fitted OneHotEncoder
    - categorical_columns_: names of the columns that get encoded
    - encoded_columns_: names of the one-hot columns
    - columns: full output layout (pass-through columns + one-hot columns)
    '''
    def __init__(self):
        self.encoder = None
        self.columns = []

    def fit(self, X, y=None):
        # X is supposed to be a DataFrame
        Xcat = X.select_dtypes(include=[object])  # only the categorical columns
        self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.encoder.fit(Xcat)

        self.categorical_columns_ = list(Xcat.columns)
        self.encoded_columns_ = list(self.encoder.get_feature_names_out(Xcat.columns))

        # get_features_name() must describe what transform() really returns,
        # i.e. the untouched columns followed by the encoded ones.
        passthrough = [col for col in X.columns if col not in self.categorical_columns_]
        self.columns = passthrough + self.encoded_columns_
        return self

    def transform(self, X):
        # X is supposed to be a DataFrame with the columns seen in fit()
        Xcat = X[self.categorical_columns_]
        Xenc = pd.DataFrame(self.encoder.transform(Xcat),
                            index=X.index, columns=self.encoded_columns_)
        Xrest = X.drop(columns=self.categorical_columns_)
        return pd.concat([Xrest, Xenc], axis=1)

    def get_features_name(self):
        # Used by FeatureUnion_df to label the concatenated output.
        return self.columns

class gen_features(BaseEstimator, TransformerMixin):
    '''
    Generate new features.

    Parameters:
    - new_features: list of feature names to build. Recognised names:
        * 'Fam_Age': categorical label combining family size
          (SibSp + Parch + 1, "big" when > 3) and age ("adult" when > 10).
          Rows whose Age or family size is missing match none of the four
          masks and are left as NaN. Replaces SibSp, Parch and Age.
        * '_isBaby': 1 when Age < 5, 0 otherwise (missing Age gives 0).
          Replaces Age.
      Any other name is ignored.

    The source columns are always read from the untouched input, so the
    result does not depend on the order (or repetition) of the names in
    new_features.
    '''
    # NOTE: the list default is kept for interface compatibility; it is never mutated.
    def __init__(self, new_features = []):
        self.new_features = new_features
        self.columns = []

    def fit(self, X, y=None):
        # Stateless: the features are computed row by row in transform().
        return self

    def transform(self, X):
        Xmod = X.copy()
        requested = self.new_features if self.new_features is not None else []
        consumed = []  # source columns replaced by the engineered features

        for feature in requested:
            if feature == 'Fam_Age':
                family_size = X['SibSp'] + X['Parch'] + 1
                is_big = (family_size > 3).to_numpy()
                is_small = (family_size <= 3).to_numpy()
                is_adult = (X['Age'] > 10).to_numpy()
                is_child = (X['Age'] <= 10).to_numpy()

                # Object array so the column is treated as categorical
                # downstream even when some rows stay unlabeled.
                labels = np.full(len(X), np.nan, dtype=object)
                labels[is_big & is_adult] = 'bigfamily_adult'
                labels[is_big & is_child] = 'bigfamily_child'
                labels[is_small & is_adult] = 'smallfamily_adult'
                labels[is_small & is_child] = 'smallfamily_child'

                Xmod['Fam_Age'] = labels
                consumed += ['SibSp', 'Parch', 'Age']

            elif feature == '_isBaby':
                Xmod['_isBaby'] = (X['Age'] < 5).astype(int).to_numpy()
                consumed.append('Age')

        # Drop the consumed columns once, at the end, so that every requested
        # feature could still read them regardless of processing order.
        to_drop = [col for col in dict.fromkeys(consumed) if col in Xmod.columns]
        Xmod = Xmod.drop(columns=to_drop)

        self.columns = Xmod.columns
        return Xmod

    def get_features_name(self):
        # Column labels produced by the most recent transform() call.
        return self.columns

Let's build two separate pipelines: one for numerical features and another for categorical features to ensure everything works correctly

* **general_preprocessing**: Removes unnecessary features and transforms the Cabin feature into a more useful format.

* **feature_selector**: Differentiates between numerical and categorical features, allowing us to create separate preprocessing pipelines for each.

* **df_imputer**: Handles missing values using a predefined imputation strategy.

* **df_encoder**: Encodes categorical features using one-hot encoding.

* **gen_features**: Generates new engineered features (`Fam_Age`, `_isBaby`) that are intended to correlate more strongly with the target variable.

In [5]:
# Numerical branch: keep the numeric columns, then fill the gaps with the column mean.
numerical_steps = [
    ('selector', feature_selector(dtype='numerical')),
    ('imputer', df_imputer(strategy='mean')),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

# Categorical branch: keep the object columns, fill the gaps with the most
# frequent value, then one-hot encode.
categorical_steps = [
    ('selector', feature_selector(dtype='categorical')),
    ('imputer', df_imputer(strategy='most_frequent')),
    ('encoder', df_encoder()),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

numerical_pipeline

In [6]:
# Display the categorical pipeline (shown as a diagram because set_config(display='diagram') was called earlier).
categorical_pipeline

Now, let's combine all of these steps into a single pipeline while preserving the DataFrame structure, using a custom feature union class

In [7]:
class FeatureUnion_df(TransformerMixin, BaseEstimator):
    '''
    Wrapper of FeatureUnion but returning a Dataframe,
    the column order follows the concatenation done by FeatureUnion

    transformer_list: list of (name, Pipeline) tuples. The last step of every
        pipeline must expose get_features_name(), which is used to label the
        columns of the concatenated output.
    n_jobs, transformer_weights, verbose, **kwargs: forwarded to FeatureUnion.

    Parameter access (get_params / set_params) is delegated to the wrapped
    FeatureUnion so that nested names such as
    `<pipeline name>__<step name>__<param>` work inside GridSearch.
    '''
    def __init__(self, transformer_list, n_jobs=None, transformer_weights=None, verbose=False, **kwargs):
        self.transformer_list = transformer_list
        self.n_jobs = n_jobs
        self.transformer_weights = transformer_weights
        self.verbose = verbose  # these are necessary to work inside of GridSearch or similar
        self.kwargs = kwargs
        self.feat_un = FeatureUnion(self.transformer_list,
                                    n_jobs=self.n_jobs,
                                    transformer_weights=self.transformer_weights,
                                    verbose=self.verbose,
                                    **self.kwargs)

    def fit(self, X, y=None):
        # Forward the target too: supervised transformers inside the union need it.
        self.feat_un.fit(X, y)
        return self

    def transform(self, X, y=None):
        X_tr = self.feat_un.transform(X)

        # Collect the feature names from the last step of each pipeline, in
        # the same order FeatureUnion used for the concatenation. The union's
        # own list is read so the labels always match what was executed.
        columns = []
        for _, pipeline in self.feat_un.transformer_list:
            columns += list(pipeline.steps[-1][1].get_features_name())

        return pd.DataFrame(X_tr, index=X.index, columns=columns)

    def get_params(self, deep=True):  # necessary to well behave in GridSearch
        return self.feat_un.get_params(deep=deep)

    def set_params(self, **params):
        # Counterpart of get_params(): without it the inherited set_params
        # would store top-level values (n_jobs, transformer_list, ...) on the
        # wrapper only, and the FeatureUnion that does the work would keep
        # its old settings.
        self.feat_un.set_params(**params)
        self.transformer_list = self.feat_un.transformer_list
        self.n_jobs = self.feat_un.n_jobs
        self.transformer_weights = self.feat_un.transformer_weights
        self.verbose = self.feat_un.verbose
        return self


# Run the numerical and categorical branches side by side and glue the results
# back together as a single DataFrame.
processing_pipe = FeatureUnion_df(
    transformer_list=[
        ('numerical_pipeline', numerical_pipeline),
        ('categorical_pipeline', categorical_pipeline),
    ]
)

# End-to-end preprocessing: column clean-up -> feature engineering -> per-dtype processing.
full_pipeline = Pipeline(steps=[
    ('general_preprocessing', general_preprocessing()),
    ('gen_features', gen_features(new_features=['Fam_Age', '_isBaby'])),
    ('processing_pipe', processing_pipe),
])

# Separate the target from the features before fitting the preprocessing.
df_train_target = df_train['Survived']
df_train_feat = df_train.drop(columns='Survived')
df_train_prepared = full_pipeline.fit_transform(df_train_feat)

df_train_prepared.head()

  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)


Unnamed: 0,Pclass,Fare,_hasCabin,_isBaby,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Fam_Age_bigfamily_adult,Fam_Age_bigfamily_child,Fam_Age_smallfamily_adult,Fam_Age_smallfamily_child
0,3.0,7.25,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1.0,71.2833,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3.0,7.925,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,1.0,53.1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,3.0,8.05,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


We can list all the parameters of the pipeline with `get_params()`, which helps us understand the structure of the preprocessing steps and see which nested parameters can be tuned.

In [8]:
# Return the pipeline's parameters as a dictionary (displayed as the cell output).
full_pipeline.get_params()

{'memory': None,
 'steps': [('general_preprocessing', general_preprocessing()),
  ('gen_features', gen_features(new_features=['Fam_Age', '_isBaby'])),
  ('processing_pipe',
   FeatureUnion_df(transformer_list=[('numerical_pipeline',
                                      Pipeline(steps=[('selector',
                                                       feature_selector()),
                                                      ('imputer', df_imputer())])),
                                     ('categorical_pipeline',
                                      Pipeline(steps=[('selector',
                                                       feature_selector(dtype='categorical')),
                                                      ('imputer',
                                                       df_imputer(strategy='most_frequent')),
                                                      ('encoder',
                                                       df_encoder())]))]))],
 'verbose': F

Next, let's create the model and integrate it into the pipeline, so we can perform hyperparameter tuning with an exhaustive grid search (`GridSearchCV`).

In [9]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

def grid_search(data, target, estimator, param_grid, scoring, cv):
    '''
    Run an exhaustive grid search and return a tidy summary of the results.

    Parameters:
    - data: DataFrame of features (a copy is fitted, the original is not touched)
    - target: target values aligned with data
    - estimator: estimator or pipeline to tune
    - param_grid: parameter grid passed to GridSearchCV
    - scoring: scoring passed to GridSearchCV
    - cv: cross-validation strategy passed to GridSearchCV

    Returns:
    - result: DataFrame sorted by decreasing mean_test_score, holding the
      `param_*` columns, mean/std test score and the `*_time` columns
    - best_params: the best parameter combination found (grid.best_params_)
    '''
    grid = GridSearchCV(estimator=estimator, param_grid=param_grid,
                        cv=cv, scoring=scoring, n_jobs=-1, return_train_score=False)

    # Silence the chained-assignment warnings only for the duration of the
    # fit. The context manager puts the previous setting back even when the
    # fit raises, instead of leaving the global option changed.
    with pd.option_context('mode.chained_assignment', None):
        grid.fit(data.copy(), target)

    result = (pd.DataFrame(grid.cv_results_)
              .sort_values(by='mean_test_score', ascending=False)
              .reset_index(drop=True))

    times = [col for col in result.columns if col.endswith('_time')]
    params = [col for col in result.columns if col.startswith('param_')]

    # Keep only the searched parameters, the scores and the timings.
    result = result[params + ['mean_test_score', 'std_test_score'] + times]

    return result, grid.best_params_


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# XGBoost classifier using the histogram tree method on a CUDA device
model = xgb.XGBClassifier(tree_method='hist', device='cuda')

# Preprocessing and model chained together so both are tuned in one search.
xgb_pipeline = Pipeline(steps=[
    ('full_pipeline', full_pipeline),
    ('xgb', model),
])

# Search space: model hyperparameters plus the preprocessing choices
# (imputation strategies and which engineered features to build).
param_grid = {
    'xgb__n_estimators': [100, 200],
    'xgb__max_depth': [3, 6],
    'xgb__learning_rate': [0.01, 0.1],
    'full_pipeline__processing_pipe__numerical_pipeline__imputer__strategy': ['mean', 'median'],
    'full_pipeline__processing_pipe__categorical_pipeline__imputer__strategy': ['most_frequent', 'constant'],
    'full_pipeline__gen_features__new_features': [['Fam_Age'], ['Fam_Age', '_isBaby'], ['_isBaby'], []],
}

# 5-fold shuffled cross-validation with a fixed seed.
res, bp = grid_search(
    df_train_feat,
    df_train_target,
    xgb_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
)

res



Unnamed: 0,param_full_pipeline__gen_features__new_features,param_full_pipeline__processing_pipe__categorical_pipeline__imputer__strategy,param_full_pipeline__processing_pipe__numerical_pipeline__imputer__strategy,param_xgb__learning_rate,param_xgb__max_depth,param_xgb__n_estimators,mean_test_score,std_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time
0,[],constant,median,0.01,6,200,0.847360,0.023147,0.234617,0.009923,0.020918,0.000427
1,[],most_frequent,median,0.01,6,200,0.847360,0.023147,0.220391,0.004823,0.019875,0.000760
2,[],constant,mean,0.01,6,200,0.845120,0.027187,0.249544,0.022458,0.019740,0.000613
3,[],most_frequent,mean,0.01,6,200,0.845120,0.027187,0.226040,0.005997,0.019111,0.000183
4,"[Fam_Age, _isBaby]",constant,median,0.01,6,200,0.835045,0.029884,0.229064,0.004398,0.032085,0.001158
...,...,...,...,...,...,...,...,...,...,...,...,...
123,[_isBaby],most_frequent,median,0.01,3,100,0.814833,0.013013,0.099256,0.004342,0.021430,0.000868
124,[_isBaby],constant,median,0.01,3,100,0.814833,0.013013,0.099502,0.013009,0.022257,0.000912
125,[_isBaby],most_frequent,mean,0.01,3,100,0.814833,0.013013,0.094836,0.006111,0.021935,0.000673
126,[],constant,median,0.01,3,100,0.812586,0.018833,0.093463,0.005104,0.018835,0.001057


Finally, we’ve found the best parameters for our pipeline, including the optimal hyperparameters to assign to the XGB algorithm.

Now, let’s build a new pipeline using the best parameters we’ve found.

In [10]:
# Best XGBoost model with best parameters
# Preprocessing settings come from the top of the grid-search results:
# median imputation for the numerical columns, no engineered features.
# For the categorical imputer 'most_frequent' and 'constant' scored the same,
# so 'most_frequent' is kept.
best_numerical_pipeline = Pipeline([
    ('selector', feature_selector(dtype='numerical')),
    ('imputer', df_imputer(strategy='median')),
    ])

best_categorical_pipeline = Pipeline([
    ('selector', feature_selector(dtype='categorical')),
    ('imputer', df_imputer(strategy='most_frequent')),
    ('encoder', df_encoder())
    ])

# Each stage must wrap the `best_*` objects defined just above. Reusing the
# earlier `numerical_pipeline` / `categorical_pipeline` / `processing_pipe` /
# `full_pipeline` would silently train the final model with the original
# (mean imputation, ['Fam_Age', '_isBaby']) preprocessing instead.
best_processing_pipe = FeatureUnion_df(transformer_list=[
    ('numerical_pipeline', best_numerical_pipeline),
    ('categorical_pipeline', best_categorical_pipeline)
])

best_full_pipeline = Pipeline([
    ('general_preprocessing', general_preprocessing()),
    ('gen_features', gen_features(new_features=[])),
    ('processing_pipe', best_processing_pipe)
    ])

best_xgb = Pipeline([
    ('full_pipeline', best_full_pipeline),
    ('xgb', xgb.XGBClassifier(tree_method='hist', device='cuda', n_estimators=200, max_depth=6, learning_rate=0.01))
    ])

best_xgb.fit(df_train_feat, df_train_target)

  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)


Now, it's time to make predictions on the test dataset and submit them!

In [11]:
# Predict survival for the test passengers with the fitted pipeline.
predictions = best_xgb.predict(df_test)

# Two-column submission frame: passenger id and the predicted class as an integer.
output = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': predictions,
}).astype({'Survived': int})

output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
