<a href="https://colab.research.google.com/github/dhanushpachabhatla/my-machine-learning-notes/blob/main/Pipeline_example_titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

In [3]:
df = pd.read_csv('train.csv')

In [4]:
df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
189,190,0,3,"Turcin, Mr. Stjepan",male,36.0,0,0,349247,7.8958,,S
52,53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C
789,790,0,1,"Guggenheim, Mr. Benjamin",male,46.0,0,0,PC 17593,79.2,B82 B84,C
389,390,1,2,"Lehmann, Miss. Bertha",female,17.0,0,0,SC 1748,12.0,,C
708,709,1,1,"Cleaver, Miss. Alice",female,22.0,0,0,113781,151.55,,S


In [5]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [6]:
X_train , X_test, Y_train, Y_test = train_test_split(df.drop(columns=['Survived']),df['Survived'],test_size=0.2,random_state=42)

In [7]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [8]:
df.isnull().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Fare,0
Embarked,2


imputation transformer
* age & embarked columns

In [18]:
trf1 = ColumnTransformer([
    ('imputer_age', SimpleImputer(),[2]),
    ('imputer_embarked', SimpleImputer(strategy='most_frequent'),[6]),
], remainder='passthrough')

one hot encoding
* sex & embarked

In [19]:
trf2 = ColumnTransformer([
    ('ohe_sex_embarker', OneHotEncoder(sparse_output=False, handle_unknown='ignore'),[1,6])
], remainder='passthrough')

scaling  - min-max

In [20]:
trf3 = ColumnTransformer([
    ('scale', MinMaxScaler(),slice(0,10))
])

Feature Selection

In [21]:
trf4 = SelectKBest(score_func=chi2, k=8)

training model

In [22]:
trf5 = DecisionTreeClassifier()

#Create and Train Pipeline

In [23]:
pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)

if including model training in pipeline -
pipe.fit(X_train,Y_train)
else pipe.fit_transform(X_train,Y_train)

In [24]:
pipe.fit(X_train,Y_train)

# Predict

In [27]:
Y_pred = pipe.predict(X_test)

In [29]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test,Y_pred)

0.6256983240223464

# Cross Validation using Pipeline

In [30]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, Y_train, cv=5, scoring='accuracy').mean()

np.float64(0.6391214419383433)

# Exporting the pipeline

In [31]:
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))