## Applying ColumnTransformer and Pipeline from sklearn

ColumnTransformer and Pipeline are two important sklearn tools for controlling the preprocessing steps of a machine learning workflow. In this tutorial you will find straight-to-the-point code that implements both ColumnTransformer and Pipeline.

You will also find an example of FunctionTransformer, which lets you include your own functions in the ColumnTransformer.

## If this notebook was useful for you, please vote up!

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [2]:
# Load the training file and keep only the columns used in this tutorial,
# in the order listed here.
selected_columns = ['PassengerId', 'Name', "HomePlanet", "Age", "VIP", "FoodCourt", "Transported"]
data = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")[selected_columns]
data.head()

Unnamed: 0,PassengerId,Name,HomePlanet,Age,VIP,FoodCourt,Transported
0,0001_01,Maham Ofracculy,Europa,39.0,False,0.0,False
1,0002_01,Juanna Vines,Earth,24.0,False,9.0,True
2,0003_01,Altark Susent,Europa,58.0,True,3576.0,False
3,0003_02,Solam Susent,Europa,33.0,False,1283.0,False
4,0004_01,Willy Santantines,Earth,16.0,False,70.0,True


In [3]:
# Summarise each column's dtype and non-null count to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  8693 non-null   object 
 1   Name         8493 non-null   object 
 2   HomePlanet   8492 non-null   object 
 3   Age          8514 non-null   float64
 4   VIP          8490 non-null   object 
 5   FoodCourt    8510 non-null   float64
 6   Transported  8693 non-null   bool   
dtypes: bool(1), float64(2), object(4)
memory usage: 416.1+ KB


In [4]:
def get_group(data, var):
    """Add a ``Group`` column derived from an underscore-separated ID column.

    The values of ``data[var]`` are split on ``'_'`` and the second piece
    (index 1) is stored in a new ``Group`` column, e.g. ``'0003_02'`` -> ``'02'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column ``var``. It is NOT modified.
    var : str
        Name of the string column to split.

    Returns
    -------
    pandas.DataFrame
        A new frame with every original column plus ``Group`` appended last.
    """
    # NOTE(review): index 1 selects the part after the underscore; if the
    # leading part of the ID was intended, index 0 would be needed - confirm.
    # `assign` builds a new frame, so the caller's frame is left untouched
    # (the previous in-place assignment mutated the transformer's input).
    return data.assign(Group=data[var].str.split('_').str[1])

# We could drop this column inside get_group, but I chose to do it in the separate function
# below to show that several functions can be chained with FunctionTransformer
def drop_column(data, var):
    """Return a copy of ``data`` without the column named ``var``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to remove the column from; it is left unchanged.
    var : str
        Name of the column to drop.

    Returns
    -------
    pandas.DataFrame
        New frame holding every column of ``data`` except ``var``.
    """
    return data.drop(columns=[var])

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from catboost import CatBoostClassifier

import warnings
# Suppress all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Target and feature matrix used for modelling.
y = data['Transported']
X = data[['PassengerId', "HomePlanet", "Age", "VIP", "FoodCourt"]]

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Column lists consumed by the ColumnTransformer below.
numeric_features = ["Age", "FoodCourt"]
categorical_features = ["HomePlanet", "VIP"]

# PassengerId -> add 'Group' column -> drop PassengerId -> one-hot encode 'Group'.
columnGroup_creator = Pipeline([
    ("createColumnGroup",
     FunctionTransformer(get_group, kw_args=dict(var='PassengerId'))),
    ("dropColumnPassengerId",
     FunctionTransformer(drop_column, kw_args=dict(var='PassengerId'))),
    ("encoderNewColumn",
     OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)),
])

# Numeric columns: fill missing values with the median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill missing values with the mode, then one-hot encode.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder",
     OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)),
])

# Route each column set to its own sub-pipeline; columns not listed are
# passed through unchanged.
ct = ColumnTransformer(
    [
        ("col", columnGroup_creator, ['PassengerId']),
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="passthrough",
)
# Ask the transformer to return pandas DataFrames.
ct.set_output(transform="pandas")

# Preview the transformed version of the full frame.
new_data = ct.fit_transform(data)

new_data

Unnamed: 0,col__Group_02,col__Group_03,col__Group_04,col__Group_05,col__Group_06,col__Group_07,col__Group_08,num__Age,num__FoodCourt,cat__HomePlanet_Europa,cat__HomePlanet_Mars,cat__VIP_True,remainder__Name,remainder__Transported
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.711945,-0.281027,1.0,0.0,0.0,Maham Ofracculy,False
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.334037,-0.275387,0.0,0.0,0.0,Juanna Vines,True
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.036857,1.959998,1.0,0.0,1.0,Altark Susent,False
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.293552,0.523010,1.0,0.0,0.0,Solam Susent,False
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.891895,-0.237159,0.0,0.0,0.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.851410,3.992336,1.0,0.0,1.0,Gravior Noxnuther,False
8689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.752431,-0.281027,0.0,0.0,0.0,Kurta Mondalley,False
8690,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.194573,-0.281027,0.0,0.0,0.0,Fayey Connon,True
8691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.223820,0.376365,1.0,0.0,0.0,Celeon Hontichre,False


In [6]:
# Chain the preprocessing ColumnTransformer with the CatBoost classifier so
# both are fitted together on the training split.
pipe = Pipeline([
    ("preprocessor", ct),
    ("model", CatBoostClassifier(iterations=2000,
                                 learning_rate=0.01,
                                 verbose=False)),
])
pipe.fit(X_train, y_train)

# Column 1 of predict_proba is taken as the score for each validation row.
y_pred = pipe.predict_proba(X_val)[:, 1]

# Turn scores into 0/1 labels: strictly above 0.5 -> 1, otherwise 0.
y_pred_trans = (y_pred > 0.5).astype(int)

score = accuracy_score(y_val, y_pred_trans)
print(f'Accuracy Score on Test set: {score}')

Accuracy Score on Test set: 0.6561242093156987
