In [142]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline,make_pipeline

In [143]:
df = pd.read_csv("train.csv")
df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
755,756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5,,S
571,572,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,2,0,11769,51.4792,C101,S
691,692,1,3,"Karun, Miss. Manca",female,4.0,0,1,349256,13.4167,,C
349,350,0,3,"Dimic, Mr. Jovan",male,42.0,0,0,315088,8.6625,,S
264,265,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q


In [144]:
df.drop(columns=["PassengerId","Name","Ticket","Cabin"],inplace = True)

In [145]:
X_train,X_test,Y_train,Y_test = train_test_split(df.drop("Survived", axis =1),
                                                 df["Survived"],
                                                 test_size = 0.2,
                                                 random_state =2)
X_train.sample()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
856,1,female,45.0,1,1,164.8667,S


In [169]:
#imputation transformer on age and embarked

trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')

In [176]:
#one hotencoding for sex and embarked

trf2 = ColumnTransformer([
       ('ohe_sex_embarked',OneHotEncoder(sparse = False,handle_unknown='ignore'),[3,1])
      ],remainder='passthrough')

In [177]:
#Scaling
trf3 = ColumnTransformer([
       ('scaling',MinMaxScaler(),slice(0,10))
      ],remainder='passthrough')

In [178]:
trf4 = SelectKBest(score_func=chi2,k=8)

In [179]:
trf5 = DecisionTreeClassifier()

In [180]:
pipe = Pipeline([
        ('trf1',trf1),
        ('trf2',trf2),
        ('trf3',trf3),
        ('trf4',trf4),
        ('trf5',trf5)
        ])

# Pipeline Vs make_pipeline
Pipeline requires naming of steps,make_pipeline does not (same applies for ColumnTransformer and make_column_transformer)

In [11]:
# Alternate Syntax

pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [181]:
#train 
pipe.fit(X_train,Y_train)


# EXPLORE PIPELINE

In [161]:
from sklearn import set_config
set_config(display ='diagram')

In [182]:
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [3, 1])]),
 'trf3': ColumnTransformer(remainder='passthrough',
                   transformers=[('scaling', MinMaxScaler(),
                                  slice(0, 10, None))]),
 'trf4': SelectKBest(k=8, score_func=<function chi2 at 0x000000EB535978B0>),
 'trf5': DecisionTreeClassifier()}

In [187]:
pipe.named_steps['trf1'].transformers_[0][1].statistics_

array([29.78590426])

In [190]:
pipe.named_steps['trf1'].transformers_[1][1].statistics_

array(['S'], dtype=object)

In [191]:
y_pred = pipe.predict(X_test)
y_pred

array([1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0], dtype=int64)

In [194]:
from sklearn.metrics import accuracy_score
score = accuracy_score(Y_test,y_pred)
score

0.7597765363128491

# EXPORTING PIPELINE

In [195]:
import pickle

In [197]:
pickle.dump(pipe,open('models/pipe.pkl','wb'))