# MACHINE LEARNING PIPELINES (SKLEARN)

## **WITHOUT USING PIPELINE**

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier


In [None]:
df = pd.read_csv('train.csv')
df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [None]:
# Drop identifier / free-text columns that are not used as model features.
# Rebinding `df` with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it once the columns are gone is a no-op, not a KeyError.
df = df.drop(columns=['PassengerId','Name','Ticket','Cabin'],errors='ignore')


In [None]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


Target column: `Survived`

In [None]:
# step 1 : train test split
# Separate the predictors from the label, then hold out 20% of the rows for testing.
features = df.drop(columns=['Survived'])
target = df['Survived']
X_train,X_test,y_train,y_test = train_test_split(features,target,test_size=0.2,random_state=13)

In [None]:
X_train.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
711,1,male,,0,0,26.55,S
525,3,male,40.5,0,0,7.75,Q


In [None]:
y_train.head(2)

Unnamed: 0,Survived
711,0
525,0


In [None]:
# check the no of missing values
df.isnull().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Fare,0
Embarked,2


`Age` and `Embarked` have missing values


In [None]:
X_train['Embarked'].value_counts()

Unnamed: 0_level_0,count
Embarked,Unnamed: 1_level_1
S,517
C,135
Q,58


`S` is the most frequent value, so a `most_frequent` imputer will fill missing `Embarked` entries with `S`

In [None]:
# Imputers for the two columns that contain missing values (Age and Embarked).
# Each column gets its own imputer so the fitted statistic can be reused on the test split.
si_age = SimpleImputer(strategy='mean')  # numeric column: fill with the column mean
si_embarked = SimpleImputer(strategy='most_frequent')  # categorical column: fill with the mode

In [None]:
# Fit the Age imputer on the training split only, then apply the same fitted
# imputer to the test split so the test rows do not influence the fill value.
X_train_age = si_age.fit_transform(X_train[['Age']])
X_test_age = si_age.transform(X_test[['Age']])

In [None]:
# Same pattern for Embarked: fit on train, reuse the fitted imputer on test.
# Double brackets keep the selection 2-D (a one-column DataFrame).
X_train_embarked = si_embarked.fit_transform(X_train[['Embarked']])
X_test_embarked = si_embarked.transform(X_test[['Embarked']])


In [None]:
# Step 2 : apply one hot encoding on sex and embarked
# sparse_output=False returns dense NumPy arrays; handle_unknown='ignore' encodes a
# category that was not seen during fit as all zeros instead of raising an error.
ohe_sex = OneHotEncoder(sparse_output=False,handle_unknown='ignore')
ohe_embarked = OneHotEncoder(sparse_output=False,handle_unknown='ignore')


In [None]:
# Sex has no missing values, so it is encoded straight from the raw column.
X_train_sex = ohe_sex.fit_transform(X_train[['Sex']])
X_test_sex = ohe_sex.transform(X_test[['Sex']])

In [None]:
# Encode the already-imputed Embarked arrays.
# NOTE: this overwrites X_train_embarked / X_test_embarked with their one-hot version,
# so the cell is not idempotent. Re-run the Embarked imputation cell before re-running
# this one, otherwise the encoder is fitted on the one-hot columns.
X_train_embarked = ohe_embarked.fit_transform(X_train_embarked)
X_test_embarked = ohe_embarked.transform(X_test_embarked)

In [None]:
X_train.head(3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
711,1,male,,0,0,26.55,S
525,3,male,40.5,0,0,7.75,Q
76,3,male,,0,0,7.8958,S


In [None]:
X_train_embarked

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [None]:
print(X_train_embarked.shape)
print(X_train_sex.shape)
print(X_train_age.shape)

(712, 3)
(712, 2)
(712, 1)


now we have 3 transformed columns
1. ohe applied to sex
2. ohe and simple imputing applied to embarked
3. simple imputing applied to age

In [None]:
X_train.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
711,1,male,,0,0,26.55,S
525,3,male,40.5,0,0,7.75,Q


In [None]:
## remaining columns are
# everything except the three columns that were transformed separately above
handled_cols = ['Sex','Age','Embarked']
X_train_rem = X_train.drop(columns=handled_cols)
X_test_rem = X_test.drop(columns=handled_cols)

In [None]:
print(X_train_rem.shape)

(712, 4)


In [None]:
# concatenating remaining columns with transformed columns
# Resulting column order: remaining columns (Pclass, SibSp, Parch, Fare), Age,
# one-hot Sex, one-hot Embarked. Train and test must use the same order.
train_parts = (X_train_rem,X_train_age,X_train_sex,X_train_embarked)
test_parts = (X_test_rem,X_test_age,X_test_sex,X_test_embarked)
X_train_transformed = np.concatenate(train_parts,axis=1)
X_test_transformed = np.concatenate(test_parts,axis=1)




(column order: 1 Pclass + 1 SibSp + 1 Parch + 1 Fare + 1 Age + 2 Sex + 3 Embarked = 10 columns)

In [None]:
X_train_transformed.shape

(712, 10)

In [None]:
X_test_transformed.shape

(179, 10)

In [None]:
#training the model
# NOTE(review): no random_state is passed, so the fitted tree (and the accuracy
# reported below) may vary between runs. Pass random_state if exact
# reproducibility is needed.
clf = DecisionTreeClassifier()
clf.fit(X_train_transformed,y_train)


In [None]:
y_test.head(5)

Unnamed: 0,Survived
736,0
421,0
442,0
196,0
200,0


In [None]:
y_pred = clf.predict(X_test_transformed)
y_pred

array([1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 0])

In [None]:
# calculate accuracy score
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.7597765363128491

In [None]:
import pickle


> pickle.dump(obj, file, protocol=None, *, fix_imports=True, buffer_callback=None)

>Write the pickled representation of the object obj to the open file object file. (`pickle.dumps`, by contrast, returns the pickled representation as a bytes object instead of writing it to a file.)

In [None]:
# Serialize the fitted encoders and the trained classifier to disk.
# NOTE(review): si_age / si_embarked are not saved, so inference code that loads
# these files cannot impute missing values.
import os

# open() does not create missing directories, so make sure the target folder exists.
os.makedirs('/content/sample_data/models',exist_ok=True)

# `with` closes each file handle (and flushes the data) even if pickling fails.
with open('/content/sample_data/models/ohe_sex.pkl','wb') as fh:
    pickle.dump(ohe_sex,fh)
with open('/content/sample_data/models/ohe_embarked.pkl','wb') as fh:
    pickle.dump(ohe_embarked,fh)
with open('/content/sample_data/models/clf.pkl','wb') as fh:
    pickle.dump(clf,fh)

> pickle.load(file, *, fix_imports=True, encoding='ASCII', errors='strict', buffers=None)
Read the pickled representation of an object from the open file object file and return the reconstituted object hierarchy specified therein.

In [None]:
# Read the pickled encoders and classifier back into Python objects.
# SECURITY: pickle.load can execute arbitrary code; only unpickle files you created yourself.
# `with` guarantees each file handle is closed after loading.
with open('/content/sample_data/models/ohe_sex.pkl','rb') as fh:
    ohe_sex = pickle.load(fh)
with open('/content/sample_data/models/ohe_embarked.pkl','rb') as fh:
    ohe_embarked = pickle.load(fh)
with open('/content/sample_data/models/clf.pkl','rb') as fh:
    clf = pickle.load(fh)


In [None]:
X_test_transformed[0]

array([ 3.   ,  1.   ,  3.   , 34.375, 48.   ,  1.   ,  0.   ,  0.   ,
        0.   ,  1.   ])

In [None]:
# One raw passenger in X_train column order:
# Pclass, Sex, Age, SibSp, Parch, Fare, Embarked
passenger = [2,'male',31.0,0,0,10.5,'S']
# dtype=object keeps ints, floats and strings as they are instead of coercing them to one type
test_input = np.array([passenger],dtype=object)
test_input

array([[2, 'male', 31.0, 0, 0, 10.5, 'S']], dtype=object)

In [None]:
test_input[:,1].reshape(1,1)

array([['male']], dtype=object)

In [None]:
# Column 1 of the raw input is Sex; reshape to (1, 1) to hand the encoder a 2-D array.
test_input_sex = ohe_sex.transform(test_input[:,1].reshape(1,1))



In [None]:
test_input_sex

array([[0., 1.]])

In [None]:
# The last column of the raw input is Embarked; encode it with the fitted encoder.
test_input_embarked = ohe_embarked.transform(test_input[:,-1].reshape(1,1))

In [None]:
test_input_embarked

array([[0., 0., 1.]])

In [None]:
# Age (column 2) is used as-is: si_age is not applied here and was not among the
# pickled objects, so a missing Age in the input would not be imputed.
test_input_age = test_input[:,2].reshape(1,1)
test_input_age

array([[31.0]], dtype=object)

In [None]:
test_input

array([[2, 'male', 31.0, 0, 0, 10.5, 'S']], dtype=object)

In [None]:
# Columns that need no preprocessing: Pclass (0), SibSp (3), Parch (4), Fare (5).
test_rem = test_input[:,[0,3,4,5]]
test_rem

array([[2, 0, 0, 10.5]], dtype=object)

In [None]:
# Assemble the feature vector in the same order used for X_train_transformed:
# remaining columns (Pclass, SibSp, Parch, Fare), Age, one-hot Sex, one-hot Embarked.
test_input_transformed = np.concatenate((test_input[:,[0,3,4,5]],test_input_age,test_input_sex,test_input_embarked),axis=1)
test_input_transformed.shape

(1, 10)

In [None]:
clf.predict(test_input_transformed)

array([0])

## **WITH USING SKLEARN PIPELINE**

In [None]:
import numpy as np
import pandas as pd


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

In [None]:
df = pd.read_csv('train.csv')

In [None]:
df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


**Drop the columns that do not help the analysis**

In [None]:

# Drop identifier / free-text columns that are not used as model features.
# Rebinding `df` with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it once the columns are gone is a no-op, not a KeyError.
df = df.drop(columns=['PassengerId','Name','Ticket','Cabin'],errors='ignore')

**train test split**

In [None]:
# Separate the predictors from the label, then hold out 20% of the rows for testing.
features = df.drop(columns=['Survived'])
target = df['Survived']
X_train,X_test,y_train,y_test = train_test_split(features,target,test_size=0.2,random_state=13)

In [None]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
711,1,male,,0,0,26.55,S
525,3,male,40.5,0,0,7.75,Q
76,3,male,,0,0,7.8958,S
626,2,male,57.0,0,0,12.35,Q
159,3,male,,8,2,69.55,S


In [None]:
y_train.head()

Unnamed: 0,Survived
711,0
525,0
76,0
626,0
159,0


**check if there are missing values**

In [None]:
df.isnull().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Fare,0
Embarked,2


**missing values in Age,Embarked**

**Pipeline
Step1 : Imputation Transformer**

In [None]:
X_train.head(1)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
711,1,male,,0,0,26.55,S


**The output of a transformer is a NumPy array, not a DataFrame, so column names are lost after the first step. We therefore refer to columns by index rather than by name, so that each later ColumnTransformer in the pipeline can find its input columns by position.**

In [None]:
# Step 1: fill missing values. Columns are addressed by position in X_train
# (0 Pclass, 1 Sex, 2 Age, 3 SibSp, 4 Parch, 5 Fare, 6 Embarked).
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),  # Age: default SimpleImputer strategy
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])  # Embarked: most frequent value
],remainder='passthrough')  # all other columns are passed through unchanged
trf1

In [None]:
trf1.transformers

[('impute_age', SimpleImputer(), [2]),
 ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6])]

In [None]:
trf1.transformers[0]

('impute_age', SimpleImputer(), [2])

In [None]:
trf1.transformers[0][1]

**Pipeline Step-2 : One Hot Encoding**

on sex and embarked columns

In [None]:
X_train['Embarked'].value_counts()

Unnamed: 0_level_0,count
Embarked,Unnamed: 1_level_1
S,517
C,135
Q,58


In [None]:
X_train.head(1)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
711,1,male,,0,0,26.55,S


In [None]:
# Step 2: one-hot encode the categorical columns. The indices refer to the OUTPUT of
# trf1, whose column order is Age, Embarked, Pclass, Sex, SibSp, Parch, Fare,
# so 3 is Sex and 1 is Embarked.
trf2 = ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[3,1])
],remainder='passthrough')
trf2

**Pipeline Step-3 :Scaling**

In [None]:
# always pass list of tuples
# Step 3: after trf2 there are 10 columns (2 one-hot Sex + 3 one-hot Embarked +
# 5 passthrough), so slice(0,10) applies MinMaxScaler to every column.
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
],remainder='passthrough')
trf3

**Pipeline Step-4: Feature selection**

In [None]:
# select the top8 best features
# Step 4: keep the 8 of the 10 scaled columns with the highest chi-squared score
# against the target. chi2 requires non-negative inputs, which the preceding
# MinMaxScaler step provides.
trf4 = SelectKBest(score_func=chi2,k=8)
trf4

**Pipeline Step-5 : Train the model**

In [None]:
# Step 5: the final estimator of the pipeline, created with default hyperparameters
# (max_depth is tuned later through the grid search).
trf5 = DecisionTreeClassifier()
trf5

### **Create Pipeline**

In [None]:
# Chain the five stages in order; the output of each stage is the input of the next.
# Step names are used later for lookup (pipe['...']) and for grid-search parameter keys.
pipeline_steps = [
    ('Impute-transformer',trf1),
    ('OneHotEncoding-transformer',trf2),
    ('Scaling-transformer',trf3),
    ('Feature-selection-stage',trf4),
    ('Train-the-model-stage',trf5),
]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
pipe

### Pipeline vs make_pipeline

> Pipeline requires naming of steps, make_pipeline does not.

> Same applies to ColumnTransformer vs make_column_transformer



```
# alternate syntax
# pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)
```



**train**

In [None]:
type(pipe)



```
pipe = Pipeline([
    ('Impute-transformer',trf1),
    ('OneHotEncoding-transformer',trf2),
    ('Scaling-transformer',trf3),
    ('Feature-selection-stage',trf4),
    ('Train-the-model-stage',trf5)
])
```



pipe.fit(X_train,y_train) does
1. Impute-transformer.fit_transform(X_train) (each imputer works on its own column independently)
2. OneHotEncoding-transformer.fit_transform(X_train_imputed)
3. Scaling-transformer.fit_transform(X_train_one_hot_encoded)
4. Feature-selection-stage.fit_transform(X_train_scaled,y_train)
5. Train-the-model-stage.fit(X_train_feature_selected,y_train)

In [None]:
pipe.fit(X_train,y_train)

In [None]:
pipe.named_steps

{'Impute-transformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'OneHotEncoding-transformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [3, 1])]),
 'Scaling-transformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'Feature-selection-stage': SelectKBest(k=8, score_func=<function chi2 at 0x7bdbb0fd0900>),
 'Train-the-model-stage': DecisionTreeClassifier()}

In [None]:
type(pipe['Impute-transformer'])

**A FunctionTransformer is used for the remainder (i.e. for the rest of the columns)**

**ColumnTransformer reorders the columns: the outputs of the listed transformers come first (in the order the transformers are listed), followed by the remaining passthrough columns in their original order**

In [None]:
pipe['Impute-transformer'].transformers_

[('impute_age', SimpleImputer(), [2]),
 ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6]),
 ('remainder',
  FunctionTransformer(accept_sparse=True, check_inverse=False,
                      feature_names_out='one-to-one'),
  [0, 1, 3, 4, 5])]

In [None]:
pipe['Impute-transformer'].transformers_[1]

('impute_embarked', SimpleImputer(strategy='most_frequent'), [6])

**find out the most frequent port of embarkation in Embarked**

In [None]:
pipe['Impute-transformer'].transformers_[1][1].statistics_

array(['S'], dtype=object)

In [None]:
pipe['Impute-transformer'].transformers_[1][1].n_features_in_

1

In [None]:
pipe['Impute-transformer'].transformers_[1][1].feature_names_in_

array(['Embarked'], dtype=object)

In [None]:
pipe['OneHotEncoding-transformer'].transformers_

[('ohe_sex_embarked',
  OneHotEncoder(handle_unknown='ignore', sparse_output=False),
  [3, 1]),
 ('remainder',
  FunctionTransformer(accept_sparse=True, check_inverse=False,
                      feature_names_out='one-to-one'),
  [0, 2, 4, 5, 6])]

In [None]:
pipe['OneHotEncoding-transformer'].transformers_[0][1].categories_

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]

In [None]:
pipe['OneHotEncoding-transformer'].transformers_[0][1].n_features_in_

2

**display pipeline**

In [None]:
from sklearn import set_config
set_config(display='diagram')

In [None]:
pipe

**predict**

In [None]:
y_pred = pipe.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0])

**accuracy**

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8491620111731844

**cross validation using pipeline**

In [None]:
type(pipe)

> sklearn.model_selection.cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, params=None, pre_dispatch='2*n_jobs', error_score=nan)

In [None]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe,X_train,y_train,cv=5,scoring='accuracy')

array([0.77622378, 0.7972028 , 0.77464789, 0.73943662, 0.76760563])

In [None]:
# Mean accuracy over the 5 folds. This re-runs the whole cross-validation.
# NOTE(review): the tree has no random_state, so the mean may not match the
# per-fold scores printed by the previous cell exactly.
cross_val_score(pipe,X_train,y_train,cv=5,scoring='accuracy').mean()

np.float64(0.7766472963656061)

**GridSearch using Pipeline**

In [None]:
#gridsearchcv
# hyperparameter_tuning
# Keys use the '<pipeline step name>__<parameter name>' form, so this grid tunes
# max_depth of the estimator registered as 'Train-the-model-stage'.
# The winning value is reported by grid.best_params_ after the search is fitted.
params = {
    'Train-the-model-stage__max_depth' : [1,2,3,4,5,None]
}

> class sklearn.model_selection.GridSearchCV(estimator, param_grid, *, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False)

In [None]:
from sklearn.model_selection import GridSearchCV
# 5-fold cross-validated search over `params`, run on the whole pipeline so the
# preprocessing steps are re-fitted inside every fold.
# GridSearchCV works on clones of `pipe`; the results are exposed on `grid`
# (best_score_, best_params_).
grid = GridSearchCV(pipe,params,cv=5,scoring='accuracy')
grid.fit(X_train,y_train)

In [None]:
grid.best_score_

np.float64(0.7949867034374078)

In [None]:
grid.best_params_

{'Train-the-model-stage__max_depth': 3}

**Exporting the Pipeline**

In [None]:
import pickle
# Persist the whole fitted pipeline (preprocessing + model) as a single artifact.
# NOTE(review): `pipe` is the pipeline fitted earlier with the default tree; the tuned
# model from the grid search is grid.best_estimator_. Export that instead if the
# tuned model is the one to deploy.
# `with` closes the file handle (and flushes the data) even if pickling fails.
with open('pipe.pkl','wb') as fh:
    pickle.dump(pipe,fh)

**Predict using Pipeline**

In [None]:
# Load the exported pipeline back from disk.
# SECURITY: pickle.load can execute arbitrary code; only unpickle files you created yourself.
# `with` guarantees the file handle is closed after loading.
with open('pipe.pkl','rb') as fh:
    pipe = pickle.load(fh)

**assume user input**

In [None]:
# One raw passenger as a (1, 7) object array in X_train column order:
# Pclass, Sex, Age, SibSp, Parch, Fare, Embarked
test_input2 = np.array([[2,'male',31.0,0,0,10.5,'S']],dtype=object)

In [None]:
pipe.predict(test_input2)



array([0])