<a href="https://colab.research.google.com/github/dewansh2004/Data-Scientis-tools/blob/main/prediction_using_pipeline_on_titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import pandas as pd

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

In [7]:
df = pd.read_csv('/content/train.csv')

In [8]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)


In [9]:
x_train,x_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),df['Survived'],test_size=0.2,random_state=42)

In [10]:
#imputation Transformer
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')

In [11]:
#OneHotEncoding
trf2 = ColumnTransformer([
    ('ohe.sex_embarked',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

In [17]:
#Scaling
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
])

In [18]:
#Feature Scaling
trf4 = SelectKBest(score_func=chi2,k=8)

In [19]:
#train the model
trf5 = DecisionTreeClassifier()

## **** ***Create Pipeline***

In [21]:
#pipe = Pipeline([
   # ('trf1',trf1),
    #('trf2',trf2),
    #('trf3',trf3),
    #('trf4',trf4),
    #('trf5',trf5)
#])

In [23]:
#make_pipeline
pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [24]:
pipe.fit(x_train,y_train)

# *Explore Pipe*

In [25]:
pipe.named_steps

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe.sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'columntransformer-3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'selectkbest': SelectKBest(k=8, score_func=<function chi2 at 0x7fce416804a0>),
 'decisiontreeclassifier': DecisionTreeClassifier()}

In [28]:
y_pred = pipe.predict(x_test)

In [29]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6256983240223464

# ***cross validation using pipeline***

In [30]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, x_train, y_train, cv=5, scoring='accuracy').mean()

np.float64(0.6391214419383433)

# ***Grid Search using pipeline***

In [36]:
params = {
    'decisiontreeclassifier__max_depth':[1,2,3,4,5,None]
}

In [37]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5,scoring='accuracy')
grid.fit(x_train, y_train)

In [38]:
grid.best_score_

np.float64(0.6391214419383433)

In [39]:
#Exporting Pickle
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))