In [1]:
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv('titanic.csv')

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace = True)

In [5]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('Survived', axis=1), df['Survived'], test_size = 0.3, random_state = 2)

In [8]:
X_train.shape, X_test.shape

((623, 7), (268, 7))

In [9]:
y_train.sample(6)

646    0
243    0
435    1
526    1
587    1
440    1
Name: Survived, dtype: int64

In [10]:
from sklearn.impute import SimpleImputer 
from sklearn.compose import ColumnTransformer

trf1 = ColumnTransformer([
    ('impute_age', SimpleImputer(), [2]),
     ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6])
     ], remainder = 'passthrough')

In [11]:
df['Sex'].isnull().sum()

0

In [12]:
from sklearn.preprocessing import OneHotEncoder

trf2 = ColumnTransformer([
    ('ohe_sex_embarked', OneHotEncoder(sparse_output = False, handle_unknown = 'ignore'), [1,6])
     ],remainder = 'passthrough')

In [13]:
from sklearn.preprocessing import MinMaxScaler

trf3 = ColumnTransformer([
    ('scale', MinMaxScaler(), slice(0, 10))
     ])

In [14]:
from sklearn.feature_selection import SelectKBest,chi2

trf4 = SelectKBest(score_func = chi2, k=8)

In [15]:
from sklearn.tree import DecisionTreeClassifier

trf5 = DecisionTreeClassifier()

In [16]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
451,3,male,,1,0,19.9667,S
345,2,female,24.0,0,0,13.0,S
687,3,male,19.0,0,0,10.1708,S
279,3,female,35.0,1,1,20.25,S
742,1,female,21.0,2,2,262.375,C


# Create the Pipeline

In [17]:
from sklearn.pipeline import Pipeline,make_pipeline

pipe = Pipeline([
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3),
    ('trf4', trf4),
    ('trf5', trf5)
])

# pipe = make_pipeline(trf1, trf2, trf3, trf4, trf5)

In [18]:
pipe.fit(X_train, y_train)

In [19]:
print(pipe.fit(X_train, y_train).score(X_test, y_test)*100, "%")

64.17910447761194 %


# Let's Explore the Pipeline

In [20]:
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'trf4': SelectKBest(k=8, score_func=<function chi2 at 0x7faded9aa5f0>),
 'trf5': DecisionTreeClassifier()}

# Display Pipeline

In [21]:
from sklearn import set_config

set_config(display='diagram')
pipe

# Cross Validation in Pipeline

In [22]:
from sklearn.model_selection import cross_val_score

print(cross_val_score(pipe, X_train, y_train, cv = 10, scoring='accuracy').mean()*100, "%")

62.13261648745521 %


# Grid Search using Pipelines

In [23]:
params = {'trf5__max_depth' : [1, 2, 3, 4, 5, None]}

In [24]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipe, params, cv=10, scoring='accuracy')

grid.fit(X_train, y_train)

In [25]:
print(grid.best_score_*100, "%")

62.13261648745521 %


In [26]:
grid.best_params_

{'trf5__max_depth': 1}

# Export the Pipeline

In [27]:
import pickle

pickle.dump(pipe,open('pipeline.pkl', 'wb'))