### Pipeline for logistic regression
Pipeline for logistic regression with sklearn on the Titanic dataset

***
#### Environment
`conda activate sklearn-env`

***
#### Goals
- Build a pipeline
- Use the pipeline to transform data
- Use the pipeline to predict
- Save in various formats, load and use it to score


***
#### References

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

https://scikit-learn.org/stable/modules/model_persistence.html


In [None]:
#### Basic python imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


# Show numpy arrays with three decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=3)

#### Load the dataset from https://www.openml.org using the sklearn API

https://www.openml.org/d/40945

If the URL does not work, the dataset can be loaded from the data folder `./data/titanic/`:
- `train.csv`
- `test.csv`
- `gender_submission.csv`

In [None]:
from sklearn.datasets import fetch_openml

# Fetch the Titanic data (https://www.openml.org/d/40945) as a pandas frame.
# `raw_dataset` keeps the frame as fetched; `dataset` is an independent copy
# that the following cells work on.
raw_dataset = fetch_openml(name="titanic", version=1, as_frame=True).frame
dataset = raw_dataset.copy(deep=True)
dataset.head(n=10)

In [None]:
# Drop the columns listed below from the working frame.
# The frame is rebuilt from `raw_dataset` instead of being mutated in place so
# that this cell can be re-run safely: an in-place drop raises KeyError on the
# second execution because the columns are already gone.
dataset = raw_dataset.drop(columns=['boat', 'body', 'home.dest', 'fare', 'cabin'])

### Dataset split
- row-wise into train and test datasets
- column-wise into features and labels

In [None]:
# Row-wise split: a reproducible 80% sample for training, the remaining rows
# for testing.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(index=train_dataset.index)

# Column-wise split: 'survived' is the label, every other column is a feature.
train_features = train_dataset.drop(columns='survived')
train_labels = train_dataset['survived'].copy()

test_features = test_dataset.drop(columns='survived')
# The test labels get a fresh 0..n-1 index so they can later be concatenated
# side by side with a frame that uses the default index.
test_labels = test_dataset['survived'].reset_index(drop=True)

#### Build the pipeline

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

from sklearn.pipeline import Pipeline

# Numeric columns: impute missing values with the median, then apply
# MinMaxScaler.
numeric_features = ['age', 'sibsp', 'parch']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# Categorical columns: impute with the most frequent value, then one-hot
# encode (handle_unknown='ignore' is passed to the encoder).
categorical_features = ['embarked', 'sex']
ohe_transformer = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', ohe_transformer),
])

# Custom column: element-wise square of 'pclass' via np.square.
custom_features = ['pclass']
custom_transformer = FunctionTransformer(func=np.square, validate=True)

# Route each column group to its transformer. The order of the entries below
# determines the column order of the transformed output.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('ohe', categorical_transformer, categorical_features),
    ('cust', custom_transformer, custom_features),
])

# Preprocessing-only pipeline (no estimator attached yet).
pipeline = Pipeline([('preprocessor', preprocessor)])

#### Use the pipeline to transform data

In [None]:
# Fit the preprocessing pipeline on the training data and keep the transformed
# result; despite its name, `pipeline_model` holds the transformed features here.
pipeline_model = pipeline.fit_transform(train_features, y=train_labels)

# Wrap the result in a DataFrame to preview the first rows.
transformed_df = pd.DataFrame(pipeline_model)
transformed_df.head(n=10)

#### Build the full pipeline

In [None]:
from sklearn.linear_model import LogisticRegression
# Full pipeline: preprocessing followed by logistic regression.
# NOTE(review): the final step is named 'regressor' although LogisticRegression
# is a classifier; the name is kept so existing step/parameter lookups still work.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LogisticRegression()),
])

# fit() result is stored as `pipeline_model` and printed.
pipeline_model = pipeline.fit(train_features, y=train_labels)
print(pipeline_model)

In [None]:
from sklearn import set_config

# Select the 'diagram' display mode for estimators, then evaluate `pipeline`
# as the cell's last expression so the notebook displays it.
set_config(display="diagram")
pipeline

#### Use pipeline to predict values

In [None]:
# Predict the label for every row of the held-out test features.
scored_data = pipeline.predict(X=test_features)

#### Show predicted values

In [None]:
# Put predictions (column 0) next to the true labels and show ten random rows.
scored_df = pd.concat([pd.DataFrame(scored_data), test_labels], axis='columns')
scored_df.sample(n=10)

### Save then load model in pickle format

In [None]:
import pickle
# Round-trip the fitted pipeline through pickle (in memory: bytes -> object).
s = pickle.dumps(pipeline)
pipeline_new = pickle.loads(s)

# Score the test set with the restored pipeline and compare against the labels.
scored_test = pipeline_new.predict(test_features)
scored_df = pd.concat([pd.DataFrame(scored_test), test_labels], axis='columns')
scored_df.head(n=10)

### Alternative to Python's built-in pickle format: joblib

In [None]:
from joblib import dump, load
    
# Write the fitted pipeline to 'mypipeline.joblib', then load it back.
dump(pipeline, filename='mypipeline.joblib')
pipeline2 = load(filename='mypipeline.joblib')

# Score the test set with the reloaded pipeline and compare against the labels.
scored_test = pipeline2.predict(test_features)
scored_df = pd.concat([pd.DataFrame(scored_test), test_labels], axis='columns')
scored_df.head(n=10)