# **ST1508 - Practical AI**
## **Practical 4b: Creating a Machine Learning Pipeline**
Developed by Singapore Polytechnic

In [1]:
# Standard Imports
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pickle

# Transformers
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Modeling Evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

# Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the Titanic passenger records into a DataFrame and preview the first rows
df = pd.read_csv(filepath_or_buffer="data/titanic.csv")
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Count the missing entries in each column (column-wise sum of the null mask)
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Drop the columns that are not used as features.
# Reassigning (instead of `inplace=True`) together with `errors="ignore"` keeps
# this cell idempotent: re-running it no longer raises a KeyError because the
# columns were already removed by a previous run.
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], errors="ignore")

# Define the target variable and the feature matrix
X = df.drop(columns="Survived")
y = df["Survived"]

# Split into train and test sets. The seed is fixed so that the split, and
# every score computed from it, is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Create Data Processing Pipelines

In [5]:
# define numerical and categorical features
numerical_features = ['Age', 'Fare']
categorical_features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

# numerical transformer: impute missing values, then scale.
# The scaler step is only a placeholder (None = no scaling); the grid search
# below substitutes each candidate scaler into this slot.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', None)
])

# categorical transformer: impute missing values, then one-hot encode.
# 'most_frequent' is the default strategy because 'mean' cannot be computed for
# the string-valued columns ('Sex', 'Embarked'); the grid search below still
# tries both 'constant' and 'most_frequent'.
# The encoder keeps its default output format; densification is handled by the
# ColumnTransformer (see `sparse_threshold` below), which avoids the
# version-dependent `sparse` / `sparse_output` keyword of OneHotEncoder.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# combine numerical and categorical transformer
# sparse_threshold=0 forces a dense array, which the PCA step that follows
# expects as input.
data_transformer = ColumnTransformer(
    transformers=[
        ('numerical', numerical_transformer, numerical_features),
        ('categorical', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0
)

# Create preprocessing pipeline and apply PCA
preprocessor = Pipeline(steps=[
    ('data_transformer', data_transformer),
    ('PCA', PCA())
])

# add classifier to the pipeline
# random_state makes the stochastic solvers in the grid ('sag', 'saga')
# reproducible between runs.
classifier = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=10000, random_state=42))
])

# grid search for best transformers and hyperparameters
# Keys follow the nested step names joined with a double underscore.
param_grid = {
    'preprocessor__data_transformer__numerical__imputer__strategy': ['mean', 'median'],
    'preprocessor__data_transformer__categorical__imputer__strategy': ['constant', 'most_frequent'],
    'preprocessor__data_transformer__numerical__scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'classifier__C': [0.1, 1, 10, 100],
    'preprocessor__PCA__n_components': [3, 4, 5],
    'classifier__solver': ['lbfgs', 'newton-cg', 'sag', 'saga']
}

grid_search = GridSearchCV(classifier, param_grid=param_grid)
# y_train is already a 1-D Series, so it is passed directly
# (Series.ravel() is deprecated in recent pandas releases).
grid_search.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        Pipeline(steps=[('data_transformer',
                                                         ColumnTransformer(transformers=[('numerical',
                                                                                          Pipeline(steps=[('imputer',
                                                                                                           SimpleImputer()),
                                                                                                          ('scaler',
                                                                                                           None)]),
                                                                                          ['Age',
                                                                                           'Fare']),
                                                                                         

In [6]:
# Display the fitted pipeline as a diagram
from sklearn.utils import estimator_html_repr       # HTML representation of an estimator (not called in this cell)
from sklearn import set_config                      # global scikit-learn display settings

# Ask scikit-learn to render pipelines/composite estimators as diagrams
set_config(display="diagram")

# The best estimator found by the grid search is the cell's output
grid_search.best_estimator_

In [7]:
# Show the hyperparameter combination selected by the grid search.
# Keys are the nested pipeline step names joined with "__", matching param_grid.
grid_search.best_params_

{'classifier__C': 1,
 'classifier__solver': 'lbfgs',
 'preprocessor__PCA__n_components': 5,
 'preprocessor__data_transformer__categorical__imputer__strategy': 'constant',
 'preprocessor__data_transformer__numerical__imputer__strategy': 'mean',
 'preprocessor__data_transformer__numerical__scaler': StandardScaler()}

In [8]:
# Predict on the held-out test split with the fitted grid search
y_pred = grid_search.predict(X_test)

# Print per-class precision / recall / f1-score and the overall averages
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.79      0.85      0.82       130
           1       0.77      0.68      0.72        93

    accuracy                           0.78       223
   macro avg       0.78      0.77      0.77       223
weighted avg       0.78      0.78      0.78       223

