### Cross validation for logistic regression
Cross validation with sklearn for logistic regression on the Titanic dataset

***
#### Environment
`conda activate sklearn-env`


Note: make sure that you have `mlflow` installed in your conda env. Check this by running:

`pip list |grep mlflow`

If the command prints nothing, `mlflow` is not installed; install it either by recreating the `sklearn-env` conda environment or by running

`pip install mlflow`

#### Goals
- Build a pipeline
- Use the pipeline to transform data
- Use the pipeline to predict
- Save model in local mlflow repository 
- Programmatically load the model and score it locally
- Serve model from local repository and test scoring results via HTTP endpoint 
***
#### References

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

https://www.mlflow.org



In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import random
from pprint import pprint
import mlflow

# Compact numpy printing: three decimals, no scientific notation.
np.set_printoptions(suppress=True, precision=3)
# Turn on MLflow autologging for scikit-learn estimators fitted later in the notebook.
mlflow.sklearn.autolog()

#### Load the Titanic dataset from OpenML

https://www.openml.org/d/40945  
The data is downloaded with scikit-learn's `fetch_openml`, so the first run needs network access.

In [2]:
from sklearn.datasets import fetch_openml

# Titanic passenger data, published at https://www.openml.org/d/40945
titanic_bunch = fetch_openml("titanic", version=1, as_frame=True)
raw_dataset = titanic_bunch.frame
# Keep the downloaded frame untouched; later cells work on this copy.
dataset = raw_dataset.copy()
dataset.head(10)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1.0,1,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1.0,0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
5,1.0,1,"Anderson, Mr. Harry",male,48.0,0.0,0.0,19952,26.55,E12,S,3,,"New York, NY"
6,1.0,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,1.0,0.0,13502,77.9583,D7,S,10,,"Hudson, NY"
7,1.0,0,"Andrews, Mr. Thomas Jr",male,39.0,0.0,0.0,112050,0.0,A36,S,,,"Belfast, NI"
8,1.0,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,2.0,0.0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY"
9,1.0,0,"Artagaveytia, Mr. Ramon",male,71.0,0.0,0.0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"


In [3]:
# Columns that are not used as model inputs in this notebook.
# Derive `dataset` from `raw_dataset` with a non-mutating drop so this cell can be
# re-run safely: an in-place drop raises KeyError on the second run because the
# columns are already gone.
dataset = raw_dataset.drop(columns=['boat', 'body', 'home.dest', 'fare', 'cabin'])

### Dataset split
- row-wise into train and test datasets
- column-wise into features and labels

In [4]:
# Fixed seed so the train/test split, and every metric computed from it, is the
# same on each Restart & Run All. A randomly drawn seed made results unrepeatable.
SPLIT_SEED = 42

# Row-wise split: 80% of the rows go to training, the remaining rows to testing.
train_dataset = dataset.sample(frac=0.8, random_state=SPLIT_SEED)
test_dataset = dataset.drop(train_dataset.index)
assert len(train_dataset) + len(test_dataset) == len(dataset), "train/test split does not partition the dataset"

# Column-wise split: pop the 'survived' label off copies of each subset, leaving
# the remaining columns as features.
train_features = train_dataset.copy()
test_features = test_dataset.copy()

train_labels = train_features.pop('survived')
# The test labels get a fresh 0..n-1 index so they align row-by-row with a frame
# built from the prediction array when the two are concatenated.
test_labels = test_features.pop('survived').reset_index(drop=True)

#### Build the pipeline

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder

# Numeric columns: fill missing values with the column median, then apply MinMaxScaler.
numeric_features = ['age', 'sibsp', 'parch']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# Passenger class: squared element-wise through a FunctionTransformer.
custom_features = ['pclass']
custom_transformer = FunctionTransformer(np.square, validate=True)

# Categorical columns: fill missing values with the most frequent one, then one-hot encode.
categorical_features = ['embarked', 'sex']
ohe_transformer = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', ohe_transformer),
])

# Route each group of columns to its own transformer.
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('ohe', categorical_transformer, categorical_features),
    ('cust', custom_transformer, custom_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Full pipeline: preprocessing followed by a logistic regression estimator.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LogisticRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

# Fit inside an MLflow run; `run` is kept so later cells can look the run up by id.
with mlflow.start_run(run_name='run_123') as run:
    pipeline_model = pipeline.fit(train_features, train_labels)
    print(f'Pipeline model :{pipeline_model}')


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                           ...`
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', M...`
                ('scaler', MinMaxScaler())]), ['age', 'sibsp', 'parch']), ('ohe', Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('ohe',...`
STOP: T

Pipeline model :Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   MinMaxScaler())]),
                                                  ['age', 'sibsp', 'parch']),
                                                 ('ohe',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                             

#### Extract pipeline metadata

In [6]:
def fetch_logged_data(run_id):
    """Collect what MLflow recorded for a run.

    Pretty-prints the run id, then queries an ``MlflowClient`` for the run.

    Args:
        run_id: identifier of the MLflow run to look up.

    Returns:
        A 4-tuple ``(params, metrics, tags, artifacts)``: the run's params and
        metrics as returned by the client, the tags whose key does not start
        with ``"mlflow."``, and the paths of the artifacts listed under
        ``"model"``.
    """
    pprint("RunId:" + str(run_id))
    tracking_client = mlflow.tracking.MlflowClient()
    run_data = tracking_client.get_run(run_id).data

    # Keep only tags that are not in the "mlflow." namespace.
    user_tags = {}
    for key, value in run_data.tags.items():
        if key.startswith("mlflow."):
            continue
        user_tags[key] = value

    # Paths of everything stored under the run's "model" artifact directory.
    model_artifacts = []
    for entry in tracking_client.list_artifacts(run_id, "model"):
        model_artifacts.append(entry.path)

    return run_data.params, run_data.metrics, user_tags, model_artifacts

# Look up the params, metrics, non-"mlflow." tags and "model" artifact paths
# recorded for the run started in the fitting cell.
params, metrics, tags, artifacts = fetch_logged_data(run.info.run_id)

'RunId:02b1eeebd2974a37b3e065635d649ead'


#### Show captured params

In [7]:
from IPython.display import display

# Show each group of logged run data under its own caption.
logged_sections = (
    ('Params', params),
    ('Metrics', metrics),
    ('Tags', tags),
    ('Artifacts', artifacts),
)
for caption, content in logged_sections:
    display(caption, content)

'Params'

{'memory': 'None',
 'preprocessor': "ColumnTransformer(transformers=[('num',\n                                 Pipeline(steps=[('imputer',\n                                                  SimpleImputer(strategy='median')),\n                                                 ('scaler', M...",
 'preprocessor__cust': "FunctionTransformer(func=<ufunc 'square'>, validate=True)",
 'preprocessor__cust__accept_sparse': 'False',
 'preprocessor__cust__check_inverse': 'True',
 'preprocessor__cust__func': "<ufunc 'square'>",
 'preprocessor__cust__inverse_func': 'None',
 'preprocessor__cust__inv_kw_args': 'None',
 'preprocessor__cust__kw_args': 'None',
 'preprocessor__cust__validate': 'True',
 'preprocessor__num': "Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),\n                ('scaler', MinMaxScaler())])",
 'preprocessor__num__imputer': "SimpleImputer(strategy='median')",
 'preprocessor__num__imputer__add_indicator': 'False',
 'preprocessor__num__imputer__copy': 'True',
 'preproce

'Metrics'

{'training_accuracy_score': 0.7956064947468959,
 'training_f1_score': 0.7946886525133585,
 'training_log_loss': 0.4477285284168118,
 'training_precision_score': 0.7942587412038208,
 'training_recall_score': 0.7956064947468959,
 'training_roc_auc_score': 0.851981737831637,
 'training_score': 0.7956064947468959}

'Tags'

{'estimator_class': 'sklearn.pipeline.Pipeline', 'estimator_name': 'Pipeline'}

'Artifacts'

['model/MLmodel',
 'model/conda.yaml',
 'model/model.pkl',
 'model/requirements.txt']

#### Load the model from the mlflow repository and score it programmatically

In [8]:
# Reload the fitted pipeline that was logged under the run's "model" artifact path.
model = mlflow.sklearn.load_model('runs:/{}/model'.format(run.info.run_id))

# Score with the reloaded model, not the in-memory `pipeline`, so this cell
# actually exercises the save/load round trip it is meant to demonstrate.
scored_data = model.predict(test_features)

# Put predictions (column 0) next to the true labels; both use a 0..n-1 index,
# so the concat lines them up row by row.
scored_df = pd.DataFrame(data = scored_data)
scored_df = pd.concat([scored_df, test_labels], axis=1)
print('Scored DF: '+ str(scored_df.head(10)))


Scored DF:    0 survived
0  1        1
1  0        1
2  0        1
3  1        0
4  1        1
5  1        1
6  0        0
7  0        1
8  1        1
9  1        1


#### Start the mlflow UI and notice the saved model along with its metadata (metrics, logs, artifacts, etc.)
Note: mlflow UI url is http://localhost:5000 

In [9]:
#!mlflow ui --port 5000

#### Start an `mlflow serve` instance to expose HTTP rest call for scoring
Note: The following lines are disabled because an IPython notebook cannot run cells in parallel and the previous cell "hangs" on the `mlflow ui` command

In [None]:
# Serve the model logged by the run above over HTTP on port 1234.
# IPython substitutes {run.info.run_id} with the Python value before invoking the shell.
# NOTE(review): this starts a server, so the cell presumably stays busy until the
# kernel is interrupted - confirm before using Run All.
!mlflow models serve -m runs:/{run.info.run_id}/model --port 1234

#### Score the loaded model via an HTTP command line tool (`curl`)

In [None]:
#!curl -d '{"columns":["pclass","name","sex","age","sibsp","parch","ticket","embarked"], "data":[[1,"Allen, Miss. Elisabeth Walton","female",29,0,0,"24160","S"]]}' -H 'Content-Type: application/json; format=pandas-split' -X POST localhost:1234/invocations