In [43]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder,MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest,chi2
from sklearn import set_config
import pickle

In [2]:
df= pd.read_csv('titanic.csv')

In [3]:
df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
296,297,0,3,"Hanna, Mr. Mansour",male,23.5,0,0,2693,7.2292,,C
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,
848,849,0,2,"Harper, Rev. John",male,28.0,0,1,248727,33.0,,S
530,531,1,2,"Quick, Miss. Phyllis May",female,2.0,1,1,26360,26.0,,S


In [4]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'], inplace=True)

In [5]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [6]:
# Prepare data 

X_train,X_test, y_train, y_test= train_test_split(df.drop(columns=['Survived']), df[['Survived']], test_size=0.2, shuffle=True)

In [7]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
157,3,male,30.0,0,0,8.05,S
179,3,male,36.0,0,0,0.0,S
541,3,female,9.0,4,2,31.275,S
490,3,male,,1,0,19.9667,S
858,3,female,24.0,0,3,19.2583,C


In [8]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [9]:
# Hnadling missing values


# trf1= ColumnTransformer([
#                             ('impute_age',SimpleImputer(), [2]), # for age: index 2
#                              ('impute_embarked',SimpleImputer(strategy='most_frequent'), [6]) # for embarked
#                         ],

#                         remainder='passthrough'
#                        )

trf_age=Pipeline([('impute_num',SimpleImputer()),
               ('scale',MinMaxScaler())

])

# why are we using index instead of column name like [2] instead of ['Age']?
# beacuse output of one step is passed as an input for the next step of the pipeline and 
# result of ColumnTransfomer will give us an array and not dataframe, so there will be no column names but only values
# in such case if next step asks for name instead of index code will break

In [10]:
trf_embark=Pipeline([('impute_cat',SimpleImputer(strategy='most_frequent')),
                      ('ohe_embark', OneHotEncoder(sparse_output=False,handle_unknown='ignore'))
])

In [11]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
157,3,male,30.0,0,0,8.05,S
179,3,male,36.0,0,0,0.0,S
541,3,female,9.0,4,2,31.275,S
490,3,male,,1,0,19.9667,S
858,3,female,24.0,0,3,19.2583,C


In [12]:
# One Hot Encoding for categorical columns 

# trf2=ColumnTransformer([
#                         ('ohe_sex_embarked',OneHotEncoder(sparse_output=False, handle_unknown='ignore'),[1,6])
#                         ], 
#                        remainder='passthrough'
#                       )

trf_sex=Pipeline([('ohe_sex', OneHotEncoder(sparse_output=False,handle_unknown='ignore'))
])

# alternative of ColumnTransformer is make_column_transformer

In [13]:
# X_train['Embarked'].unique()
# After one hot encoding Embarked column will be replaced by 3 new columns and 
# like Embarked_S, Embarked_C, Embarked_Q

In [14]:
X_train['Sex'].unique()
# After One Hot Encoding Sex column will become like 
# Sex_male, Sex_female

array(['male', 'female'], dtype=object)

In [15]:
# now we have 10 columns like
# Pclass, Sex_male, Sex_female,Age, SibSp,Parch,Fare, Embarked_S, Embarked_C, Embarked_Q
# applying Scaling on all columns
trf_fare = Pipeline([('scale_fare', MinMaxScaler()) 
        ])

In [16]:
# Feature Selection
# selecting only 8 most important features for our prediction model i.e., k=8
selector= SelectKBest(score_func=chi2, k=6) 

In [17]:
# choosing model
train_model=DecisionTreeClassifier()

In [18]:
preprocessor=ColumnTransformer(transformers=[
                                            ("trf_age",trf_age,[2]),
                                            ("trf_embark",trf_embark,[6]), 
                                            ("trf_sex",trf_sex,[1]),
                                            ("trf_fare",trf_fare,[5])
                            ])

In [19]:
# buIld pipeline to chain together the required steps
# pipe=Pipeline([('trf1', trf1),
#                ('trf2', trf2),
#                ('trf3', trf3),
#                 ('trf4',selector),
#                ('trf5', train_model)
#               ])

# # alternate syntax is:
# pipe=make_pipeline(trf1,trf2,trf3,selector, train_model)

pipe=Pipeline([
            ("preprocessor",preprocessor),
            ("selector",selector),
            ("classifier",train_model)
])

In [23]:
set_config(display='diagram')

In [24]:
# train the model
pipe.fit(X_train, y_train)

In [25]:
pipe.score(X_test,y_test)

0.776536312849162

In [27]:
y_pred=pipe.predict(X_test)

In [29]:
accuracy_score(y_test,y_pred) # similar as pipe.score

0.776536312849162

# Explore the pipeline

In [34]:
pipe.named_steps

{'preprocessor': ColumnTransformer(transformers=[('trf_age',
                                  Pipeline(steps=[('impute_num',
                                                   SimpleImputer()),
                                                  ('scale', MinMaxScaler())]),
                                  [2]),
                                 ('trf_embark',
                                  Pipeline(steps=[('impute_cat',
                                                   SimpleImputer(strategy='most_frequent')),
                                                  ('ohe_embark',
                                                   OneHotEncoder(handle_unknown='ignore',
                                                                 sparse_output=False))]),
                                  [6]),
                                 ('trf_sex',
                                  Pipeline(steps=[('ohe_sex',
                                                   OneHotEncoder(handle_unknown='ignore'

In [33]:
pipe.named_steps['preprocessor'].transformers_

[('trf_age',
  Pipeline(steps=[('impute_num', SimpleImputer()), ('scale', MinMaxScaler())]),
  [2]),
 ('trf_embark',
  Pipeline(steps=[('impute_cat', SimpleImputer(strategy='most_frequent')),
                  ('ohe_embark',
                   OneHotEncoder(handle_unknown='ignore', sparse_output=False))]),
  [6]),
 ('trf_sex',
  Pipeline(steps=[('ohe_sex',
                   OneHotEncoder(handle_unknown='ignore', sparse_output=False))]),
  [1]),
 ('trf_fare', Pipeline(steps=[('scale_fare', MinMaxScaler())]), [5]),
 ('remainder', 'drop', [0, 3, 4])]

In [40]:
pipe.named_steps['preprocessor'].transformers_[0][1].named_steps['impute_num'].statistics_

array([29.66901582])

In [42]:
cross_val_score(pipe, X_train,y_train, cv=5,scoring='accuracy').mean()

0.7977642076233625

# GridSearch with pipeline

In [50]:
params={'classifier__max_depth': [1,2,3,4,5,None]} # classifier is name for train_model in final Pipeline
# Grid Search is used for hypr tuning the parameters to get the best possible prediction 
grid=GridSearchCV(pipe,params, cv=5,scoring='accuracy')

In [51]:
grid.fit(X_train, y_train)

In [52]:
grid.best_params_

{'classifier__max_depth': None}

# Export the Pipeline

In [54]:
# export 
pickle.dump(pipe,open('pipe.pkl','wb'))