### Cross validation for logistic regression
Cross validation with sklearn for logistic regression on the Titanic dataset

***
#### Environment
`conda activate sklearn-env`


Note: make sure that you have `mlflow` installed in your conda env. Check this by running:

`pip list |grep mlflow`

If the command prints nothing, `mlflow` is not installed; you can install it either by recreating the `sklearn-env` conda environment or by running

`pip install mlflow`

#### Goals
- Build a pipeline
- Use the pipeline to transform data
- Use the pipeline to predict
- Save model in local mlflow repository 
- Programmatically load the model and score it locally
- Serve model from local repository and test scoring results via HTTP endpoint 
***
#### References

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

https://www.mlflow.org



In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import random
from pprint import pprint
import mlflow

# Compact numpy printouts: three decimals, fixed-point instead of scientific notation.
np.set_printoptions(suppress=True, precision=3)
# Turn on mlflow autologging for scikit-learn estimators.
mlflow.sklearn.autolog()

#### Dataset load from OpenML.

https://www.openml.org/d/40945  
The Titanic dataset is fetched with scikit-learn's `fetch_openml` and loaded as a pandas DataFrame.

In [None]:
from sklearn.datasets import fetch_openml

# Titanic data, version 1, from https://www.openml.org/d/40945
titanic_bunch = fetch_openml(name="titanic", version=1, as_frame=True)
raw_dataset = titanic_bunch.frame

# Work on a copy so the frame as downloaded stays available in `raw_dataset`.
dataset = raw_dataset.copy()
dataset.head(10)

In [None]:
# Remove the columns that are not used as model inputs.
# The result is derived from `raw_dataset` instead of dropping in place, so this
# cell is idempotent: re-running it no longer raises KeyError for columns that
# a previous execution already removed.
unused_columns = ['boat', 'body', 'home.dest', 'fare', 'cabin']
dataset = raw_dataset.drop(columns=unused_columns)

### Dataset split
- row-wise into train and test datasets
- column-wise into features and labels

In [None]:
# A fixed seed keeps the 80/20 split identical across kernel restarts, so the
# fitted model and its scores can be reproduced. Change the value to draw a
# different split.
SPLIT_SEED = 42

# Row-wise split: 80% of the rows for training, the remaining rows for testing.
train_dataset = dataset.sample(frac=0.8, random_state=SPLIT_SEED)
test_dataset = dataset.drop(train_dataset.index)

train_features = train_dataset.copy()
test_features = test_dataset.copy()

# Column-wise split: `pop` removes the label column from the feature frames.
train_labels = train_features.pop('survived')
test_labels = test_features.pop('survived')

# Renumber the test labels from 0 so they line up with a predictions frame
# that carries a default 0..n-1 index when the two are concatenated.
test_labels = test_labels.reset_index(drop=True)

#### Build the pipeline

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Numeric columns: fill missing values with the median, then scale with MinMaxScaler.
numeric_features = ['age', 'sibsp', 'parch']
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_features = ['embarked', 'sex']
ohe_transformer = OneHotEncoder(handle_unknown='ignore')
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', ohe_transformer),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Custom column: the passenger class is squared element-wise.
custom_features = ['pclass']
custom_transformer = FunctionTransformer(np.square, validate=True)

# Each transformer is applied to its own column group; the order of this list
# is kept as num, ohe, cust.
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('ohe', categorical_transformer, categorical_features),
    ('cust', custom_transformer, custom_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

from sklearn.linear_model import LogisticRegression

# Preprocessing followed by the logistic-regression estimator
# (the final step is registered under the name 'regressor').
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LogisticRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

# Fit inside an explicit mlflow run so the run handle (`run`) is available afterwards.
with mlflow.start_run(run_name='run_123') as run:
    pipeline_model = pipeline.fit(train_features, train_labels)
    print(f'Pipeline model :{pipeline_model}')


#### Extract pipeline metadata

In [None]:
def fetch_logged_data(run_id):
    """Print the run id and collect what mlflow recorded for that run.

    Parameters
    ----------
    run_id
        Identifier of the mlflow run to look up.

    Returns
    -------
    tuple
        ``(params, metrics, tags, artifacts)`` where ``params`` and ``metrics``
        come straight from the run data, ``tags`` holds only the tags whose key
        does not start with ``"mlflow."``, and ``artifacts`` is the list of
        artifact paths listed under ``"model"`` for the run.
    """
    pprint("RunId:" + str(run_id))
    tracking_client = mlflow.tracking.MlflowClient()
    run_data = tracking_client.get_run(run_id).data

    # Keep only tags outside the "mlflow." key prefix.
    user_tags = {}
    for tag_key, tag_value in run_data.tags.items():
        if tag_key.startswith("mlflow."):
            continue
        user_tags[tag_key] = tag_value

    model_artifacts = []
    for entry in tracking_client.list_artifacts(run_id, "model"):
        model_artifacts.append(entry.path)

    return run_data.params, run_data.metrics, user_tags, model_artifacts

# Look up what was recorded for the training run started earlier.
logged_data = fetch_logged_data(run.info.run_id)
params, metrics, tags, artifacts = logged_data

#### Show captured params

In [None]:
from IPython.display import display

# Show each group of logged values directly under its label.
logged_sections = (
    ('Params', params),
    ('Metrics', metrics),
    ('Tags', tags),
    ('Artifacts', artifacts),
)
for section_title, section_values in logged_sections:
    display(section_title, section_values)

#### Load the model from the mlflow repository and score it programmatically

In [None]:
# Load the model stored for the training run from the mlflow repository.
model_uri = 'runs:/{}/model'.format(run.info.run_id)
model = mlflow.sklearn.load_model(model_uri)

# Score with the *loaded* model. Previously these predictions were immediately
# overwritten by `pipeline.predict(...)`, so the in-memory pipeline was shown
# instead and the model loaded from mlflow was never actually exercised.
scored_data = model.predict(test_features)

# Put predictions next to the true labels. `test_labels` carries a 0..n-1
# index, which matches the default index of the predictions frame.
scored_df = pd.DataFrame(data=scored_data)
scored_df = pd.concat([scored_df, test_labels], axis=1)
print('Scored DF: '+ str(scored_df.head(10)))


#### Start the mlflow UI and notice the saved model along with its metadata (metrics, logs, artifacts, etc.)
Note: mlflow UI url is http://localhost:5000 

In [None]:
#!mlflow ui --port 5000

#### Start an `mlflow serve` instance to expose HTTP rest call for scoring
Note: The following lines are disabled because an IPython notebook cannot run cells in parallel and the previous cell "hangs" on the `mlflow ui` command

In [None]:
# Serve the model logged for the training run on port 1234.
# NOTE(review): `!` shell commands run synchronously, so this cell presumably
# keeps the kernel busy for as long as the server is up; confirm, and
# interrupt the cell (or run the command in a separate terminal) before
# executing later cells.
!mlflow models serve -m runs:/{run.info.run_id}/model --port 1234

#### Score the loaded model via an HTTP command line tool (`curl`)

In [None]:
#!curl -d '{"columns":["pclass","name","sex","age","sibsp","parch","ticket","embarked"], "data":[[1,"Allen, Miss. Elisabeth Walton","female",29,0,0,"24160","S"]]}' -H 'Content-Type: application/json; format=pandas-split' -X POST localhost:1234/invocations