In [1]:
#                                                               pipeline
# a tool to chain multiple transformers (preprocessing, feature engineering etc) and/or with an estimator into a single cohesive object

In [77]:
import numpy as np
import pandas as pd

In [78]:
from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier 

from sklearn.pipeline import Pipeline, make_pipeline

In [79]:
df = pd.read_csv('csv files/titanic.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [80]:
df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True ) 

In [81]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [82]:
# Age and Embarkded have missing values --> SimpleImputer
# sex and Embarked nominal categorical columns --> OneHotEncoding
# Scaling
# feature selection (not suitable for this scenario but for learning purpose)
# Model Training ---> Decision Tree

In [83]:
# pipeline flow : impute missing values, 

In [84]:
X_train, X_test, y_train, y_test = train_test_split ( df.drop(columns=['Survived']), df['Survived'], test_size=.2, random_state=42 )

In [85]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5000,S
733,2,male,23.0,0,0,13.0000,S
382,3,male,32.0,0,0,7.9250,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.2750,S
...,...,...,...,...,...,...,...
106,3,female,21.0,0,0,7.6500,S
270,1,male,,0,0,31.0000,S
860,3,male,41.0,2,0,14.1083,S
435,1,female,14.0,1,2,120.0000,S


In [86]:
X_train.isnull().sum()

Pclass        0
Sex           0
Age         140
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [87]:
# imputation transformer 
imputer_trf = ColumnTransformer ( transformers=[
    ( 'impute_age', SimpleImputer(), [2] ), # [2] is age column index
    ( 'impute_Embarked', SimpleImputer(strategy='most_frequent'), [6] )
], remainder = 'passthrough' )

In [88]:
# OHE transformer 
ohe_trf=ColumnTransformer(transformers=[
    ( 'ohe_sex_enbarked', OneHotEncoder( sparse_output=False, handle_unknown='ignore' ),[1, 6] )], remainder='passthrough')

In [89]:
# Initially 7 columns -->OHE--> sex (male, female i.e 2 columns) and Embarked (s, q, c i.e 3 columns) . --> 7-2+5 -->10 columns
# no drop='first' as algo is Decision Tree

In [90]:
# scaling transformer 
scaler_trf = ColumnTransformer( transformers=[
    ('scale', MinMaxScaler(), slice (0,10))
] )
# but this transformation is not necessar for DT algorithm 

In [91]:
# Feature Selection 
feature_selection_trf = SelectKBest( score_func = chi2, k=7 )

In [92]:
# model training 
model_training_trf = DecisionTreeClassifier()

In [93]:
# create pipeline
pipe = Pipeline ([
    ( 'Imputation Of Missing Values',imputer_trf ),
    ( 'Encoding Nominal Categorical Columns', ohe_trf ),
    ( 'Scaling All Columns', scaler_trf ),
    ( 'Feature Selection 7/10', feature_selection_trf ),
    ( 'Model Training wiht DT', model_training_trf )
])

In [95]:
pipe.fit(X_train, y_train) 
# pipeline with model training and pipeline without model training
# fit () ---> model training @ last chain 
# pipe.fit_transform (X_train, y_train) ----> if there was no model @ last chain of pipeline

In [109]:
y_pred = pipe.predict(X_test)

In [110]:
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0], dtype=int64)

In [111]:
# 1 --> survived
# 0 --> !survived

In [97]:
#                                                           dissection of pipeline object 
pipe.named_steps

{'Imputation Of Missing Values': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_Embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'Encoding Nominal Categorical Columns': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_enbarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'Scaling All Columns': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'Feature Selection 7/10': SelectKBest(k=7, score_func=<function chi2 at 0x000002515D78C5E0>),
 'Model Training wiht DT': DecisionTreeClassifier()}

In [100]:
pipe.named_steps['Imputation Of Missing Values']

In [103]:
pipe.named_steps['Imputation Of Missing Values'].transformers_

[('impute_age', SimpleImputer(), [2]),
 ('impute_Embarked', SimpleImputer(strategy='most_frequent'), [6]),
 ('remainder',
  FunctionTransformer(accept_sparse=True, check_inverse=False,
                      feature_names_out='one-to-one'),
  [0, 1, 3, 4, 5])]

In [104]:
pipe.named_steps['Imputation Of Missing Values'].transformers_[0]

('impute_age', SimpleImputer(), [2])

In [105]:
pipe.named_steps['Imputation Of Missing Values'].transformers_[0][1]

In [108]:
pipe.named_steps['Imputation Of Missing Values'].transformers_[0][1].statistics_
# mean age 

array([29.49884615])