In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer #filling missing values
from sklearn.preprocessing import OneHotEncoder #encoding nominal variables
from sklearn.preprocessing import MinMaxScaler #feature scaling
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2 #feature selection
from sklearn.tree import DecisionTreeClassifier #model dtree

'''
again, except last line, importing everything else is mandatory.
'''

'\nagain, except last line, importing everything else is mandatory. \n'

In [2]:
df = pd.read_csv('titanic.csv')

In [None]:
'''
every logic of the code before using pipeline is same as after using pipeline.
'''

In [3]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)
# Step 1 -> train/test/split
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),
                                                 df['Survived'],
                                                 test_size=0.2,
                                                random_state=42)

In [7]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [4]:
#1. imputation transformer.
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')
'''
remainder = 'passthrough' means that leave all the rest columns as they were, and they won't get afftected by the column transformer.

NOTE:- this time, we are specifying the column indices from the X_train/test. the indices are anyway going to be remaining the same
for train and test data.
'''

In [8]:
#2. ohe transformer.
'''
trf_name = ColumnTransformer([
    ('new_col_name', obj(), column_index)
], remainder='passthrough')
'''
trf2 = ColumnTransformer([
    ('ohe_sex', OneHotEncoder(sparse_output = False, handle_unknown='ignore'), [1]),
    ('ohe_embarked', OneHotEncoder(sparse_output = False, handle_unknown='ignore'), [6])
], remainder = 'passthrough')

In [9]:
#3. Feature scaling.
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
])
'''
as we know that we did ohe above, on the sex and embarked columns, total columns now are 10. now, when we do feature scaling, it is
MANDATORY for us to apply it on all the columns. hence, slice of 0,10 suggests all the columns (0 to 9th index, exclusive of 10)
'''

In [10]:
#4. Feature selection.
trf4 = SelectKBest(score_func=chi2,k=8)

'''
selects 8 best features automatically based on chi square test, that scored highest according to the chi-squared test.
'''


'\nselects 8 best features automatically based on chi square test, that scored highest according to the chi-squared test.\n'

In [11]:
trf5 = DecisionTreeClassifier()

### NOTE:- the trf4 and trf5 are NOT the column transformers, but they are named so just in order to maintain consistency while using the pipeline.

### Create PIPELINE

In [12]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

In [13]:
'''
pipeline acts like a model.
we simply give the train data from which the pipeline will learn. hence, pipe.fit() in which X/y_train feeded.
'''
pipe.fit(X_train,y_train)

'''
the diagram below suggests which of the 5 variables are actual col_transformers.
'''

In [14]:
y_pred = pipe.predict(X_test)
'''
after learning happens, simply predict the values using test data.
'''

In [15]:
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0])

In [16]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6256983240223464

### Cross-validation using pipeline

In [17]:

from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

np.float64(0.6391214419383433)

In [18]:
# export
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))

In [19]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5000,S
733,2,male,23.0,0,0,13.0000,S
382,3,male,32.0,0,0,7.9250,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.2750,S
...,...,...,...,...,...,...,...
106,3,female,21.0,0,0,7.6500,S
270,1,male,,0,0,31.0000,S
860,3,male,41.0,2,0,14.1083,S
435,1,female,14.0,1,2,120.0000,S
