In [1]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Titanic passenger data: OpenML dataset 40945 (https://www.openml.org/d/40945).
# as_frame=True yields pandas objects; return_X_y=True splits features from target.
X, y = fetch_openml(
    name="titanic",
    version=1,
    return_X_y=True,
    as_frame=True,
)

In [3]:
# Preview the first five rows of the feature frame (head() defaults to n=5).
X.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [6]:
# Per-column dtypes and non-null counts, to spot which columns have missing values.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   float64 
 1   name       1309 non-null   object  
 2   sex        1309 non-null   category
 3   age        1046 non-null   float64 
 4   sibsp      1309 non-null   float64 
 5   parch      1309 non-null   float64 
 6   ticket     1309 non-null   object  
 7   fare       1308 non-null   float64 
 8   cabin      295 non-null    object  
 9   embarked   1307 non-null   category
 10  boat       486 non-null    object  
 11  body       121 non-null    float64 
 12  home.dest  745 non-null    object  
dtypes: category(2), float64(6), object(5)
memory usage: 115.4+ KB


In [8]:
# Summary of the target column: count, number of distinct values, most frequent value.
y.describe()

count     1309
unique       2
top          0
freq       809
Name: survived, dtype: object

# Column Transformation

`ColumnTransformer` is an estimator that allows different columns or column subsets of the input to be transformed separately; the features generated by each transformer are then concatenated to form a single feature space.

In [4]:
from sklearn.compose import ColumnTransformer

Select the categorical and continuous features

In [5]:
# Columns handled by the continuous (numeric) pipeline.
numeric_features = ["age", "fare"]

# Columns handled by the categorical (one-hot) pipeline.
categorical_features = [
    "embarked",
    "sex",
    "pclass",
]

Create one pipeline for the continuous variables and one for the categorical variables

In [9]:
# Continuous branch: fill missing values with the column mean, then apply StandardScaler.
cnts_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="mean")),
        ("scale", StandardScaler()),
    ]
)


In [10]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps transform() from raising
# on categories that were not present when the encoder was fitted.
categ_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Apply Column Transformers
`ColumnTransformer` takes a list of `(name, transformer, column(s))` tuples specifying the transformer objects to be applied to subsets of the data.<br>


`ColumnTransformer([('name', pipeline_name, features)])`

In [11]:
# Route each feature group through its own pipeline and concatenate the outputs.
# `remainder` controls what happens to columns that are NOT listed in any
# transformer. It is spelled out here at its default value ("drop") so it is
# obvious that every column outside numeric_features / categorical_features is
# discarded; pass remainder="passthrough" instead to keep them untransformed.
preprocess_pipeline = ColumnTransformer(
    transformers=[
        ("continuous", cnts_pipeline, numeric_features),
        ("cat", categ_pipeline, categorical_features),
    ],
    remainder="drop",
)

# Apply `fit_transform(data)`

In [12]:
# Fit the preprocessing on X and transform X in a single call.
# NOTE(review): despite the "_train_" name, X is the full frame loaded above;
# no train/test split is visible in this notebook — confirm this is intended.
X_train_processed = preprocess_pipeline.fit_transform(X)

# FunctionTransformer
Applying a function that isn't technically a transformer to a column is common, so scikit-learn introduced `FunctionTransformer`. A `FunctionTransformer` takes a function (such as `np.log`) and makes a transformer that does nothing when `fit` is called, but calls the function when `transform` is called.

In [13]:
from sklearn.preprocessing import FunctionTransformer

In [14]:
# Rebinds cnts_pipeline (previously impute + scale): mean-impute missing values,
# then apply np.log1p element-wise through a FunctionTransformer.
cnts_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="mean")),
        ("log", FunctionTransformer(np.log1p)),
    ]
)

In [15]:
# Rebinds preprocess_pipeline with a single numeric branch: only the columns in
# numeric_features are sent through cnts_pipeline.
preprocess_pipeline = ColumnTransformer(
    transformers=[("continuous", cnts_pipeline, numeric_features)],
)

In [16]:
# Fit and apply the rebuilt numeric-only preprocessing; this overwrites the
# X_train_processed produced by the earlier ColumnTransformer.
X_train_processed = preprocess_pipeline.fit_transform(X)

In [17]:
# Display the transformed array (NumPy abbreviates the middle rows with "...").
X_train_processed

array([[3.40119738, 5.35817699],
       [0.65060496, 5.02749241],
       [1.09861229, 5.02749241],
       ...,
       [3.314186  , 2.1071783 ],
       [3.33220451, 2.1071783 ],
       [3.40119738, 2.18323834]])