# ML Graphs (with mlflow)

Flypipe allows creation of graphs to train ML models and run predictions.

As an example, we build the following graph to train a model, evaluate it, and run predictions.

![ML Graph](ml.png)


The graph above contains the following nodes:

**Training Graph**

- **data**: loads sklearn iris dataset into a dataframe
- **split**: splits the data into train/test data
- **fit_scale**: fits a sklearn `StandardScaler` on the train data and scales the data with it
- **train_svm_model**: trains a sklearn SVM model on train data and returns the prediction
- **evaluate**: calculates evaluation metrics


**Prediction Graph**

- **scale**: scales the data using the scaler fit on node `fit_scale`
- **predict**: loads the SVM model trained in the node `train_svm_model` and does the predictions


**Governance**

- **graph**: dummy node used to plot all graphs related to the model

> In this section, we are using [mlflow](https://mlflow.org/docs/latest/index.html) to save and load ML artifacts

## Training Graph

### Data

In [None]:
from flypipe import node
from flypipe.schema import Schema, Column
from flypipe.schema.types import Float, Integer

import pandas as pd
from sklearn import datasets


@node(
    type="pandas",
    description="Load Iris dataset",
    tags=["data"],
    output=Schema(
        Column('sepal_length', Float(), 'sepal length'),
        Column('sepal_width', Float(), 'sepal width'),
        Column('petal_length', Float(), 'petal length'),
        Column('petal_width', Float(), 'petal width'),
        Column('target', Integer(), '0: Setosa, 1: Versicolour, and 2: Virginica'),
    ),
)
def data():
    """Load the sklearn Iris dataset into a pandas dataframe.

    Returns:
        A dataframe with one column per feature (``sepal_length``,
        ``sepal_width``, ``petal_length``, ``petal_width``) followed by the
        ``target`` column.
    """
    iris = datasets.load_iris()

    # The i-th column of `iris.data` holds the i-th feature listed here.
    feature_names = ('sepal_length', 'sepal_width', 'petal_length', 'petal_width')
    columns = {
        name: iris.data[:, position]
        for position, name in enumerate(feature_names)
    }
    columns['target'] = iris.target

    return pd.DataFrame(data=columns)


# Execute the node to check that it runs.
data.run()

### Split data as train (70%) and test (30%)

In [None]:
from flypipe import node
from flypipe.schema import Schema, Column
from flypipe.schema.types import Float, String
from sklearn.model_selection import train_test_split

@node(
    type="pandas",
    description="Split train (70%) and test (30%) data",
    tags=["data", "split"],
    dependencies=[
        data.select(
            'sepal_length',
            'sepal_width',
            'petal_length',
            'petal_width',
            'target',
        )
    ],
    output=Schema(
        Column('data_type', String(), 'train (70%), test (30%)'),
        data.output.get("sepal_length"),
        data.output.get("sepal_width"),
        data.output.get("petal_length"),
        data.output.get("petal_width"),
        data.output.get("target"),
    ),
)
def split(data):
    """Label every row of ``data`` as train (70%) or test (30%).

    Args:
        data: dataframe with the four feature columns and ``target``.

    Returns:
        A new dataframe holding the train rows followed by the test rows, with
        the feature columns, a ``data_type`` column ('train' or 'test') and
        ``target``. The input dataframe is left untouched.
    """
    X_cols = [
        'sepal_length',
        'sepal_width',
        'petal_length',
        'petal_width',
    ]
    y_col = 'target'

    # Fixed random_state keeps the split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        data[X_cols],
        data[y_col],
        test_size=0.3,
        random_state=1,
    )

    # assign() builds new frames instead of writing into the frames returned by
    # train_test_split, which avoids pandas' SettingWithCopyWarning. The target
    # series is aligned on the index, exactly like a column assignment would.
    train = X_train.assign(data_type='train', target=y_train)
    test = X_test.assign(data_type='test', target=y_test)

    return pd.concat([train, test])


df = split.run()
display(df)


### Fit and Scale

In [None]:
import os
# Root folder for ML artifacts. It is passed to mlflow as the experiment's
# artifact location and is also where `fit_scale` pickles the scaler. Other
# cells concatenate the run id directly onto it, so keep the trailing slash.
ARTIFACT_LOCATION = "/data/tmp/artifacts/"
# Create the folder up front; exist_ok makes re-running this cell harmless.
os.makedirs(ARTIFACT_LOCATION, exist_ok=True)

In [None]:
import os
import pickle 
import mlflow

from flypipe import node
from flypipe.schema import Schema, Column
from flypipe.schema.types import Float, String
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

@node(
    type="pandas",
    description="Fits a standard scaler",
    tags=["data", "train", "scaler"],
    dependencies=[
        split.select(
            'data_type',
            'sepal_length',
            'sepal_width',
            'petal_length',
            'petal_width',
            'target',
        )
    ],
    output=Schema(
        Column('data_type', String(), 'train (70%), test (30%)'),
        split.output.get("sepal_length"),
        split.output.get("sepal_width"),
        split.output.get("petal_length"),
        split.output.get("petal_width"),
        split.output.get("target"),
    ),
)
def fit_scale(split):
    """Fit a ``StandardScaler`` on the train rows and scale every row with it.

    When an mlflow run is active, the fitted scaler is also pickled to
    ``{ARTIFACT_LOCATION}{run_id}/model/scaler.pkl``.

    Args:
        split: dataframe with ``data_type``, the four feature columns and
            ``target``.

    Returns:
        ``split`` with its feature columns replaced by the scaled values.
    """
    X_cols = [
        'sepal_length',
        'sepal_width',
        'petal_length',
        'petal_width',
    ]

    # Fit on train rows only so that test data does not leak into the scaler.
    train_rows = split[split['data_type'] == 'train']
    scaler = StandardScaler().fit(train_rows[X_cols])

    active_run = mlflow.active_run()
    if active_run:
        artifact_path = f"{ARTIFACT_LOCATION}{active_run.info.run_id}/model"
        os.makedirs(artifact_path, exist_ok=True)

        # The context manager closes (and flushes) the file even if dump fails.
        with open(os.path.join(artifact_path, 'scaler.pkl'), 'wb') as fp:
            pickle.dump(scaler, fp)

    split[X_cols] = scaler.transform(split[X_cols])

    return split


df = fit_scale.run()
display(df.head(10))

### Train SVM Model

In [None]:
from flypipe import node
from flypipe.schema import Schema, Column
from flypipe.schema.types import Float, String, Integer
from sklearn.model_selection import train_test_split
from sklearn import svm
from mlflow.models.signature import infer_signature

@node(
    type="pandas",
    description="Model training using SVM",
    tags=["model", "svm"],
    dependencies=[
        fit_scale.select(
            'data_type',
            'sepal_length',
            'sepal_width',
            'petal_length',
            'petal_width',
            'target',
        ).alias("df")
    ],
    output=Schema(
        Column('data_type', String(), 'train (70%), test (30%)'),
        fit_scale.output.get("sepal_length"),
        fit_scale.output.get("sepal_width"),
        fit_scale.output.get("petal_length"),
        fit_scale.output.get("petal_width"),
        fit_scale.output.get("target"),
        Column('prediction', Integer(), 'prediction'),
    ),
)
def train_svm_model(df):
    """Train an sklearn SVC on the train rows and predict every row.

    When an mlflow run is active, the fitted model is logged to it under
    "model", together with an inferred signature and a five-row input example.

    Args:
        df: scaled dataframe with ``data_type``, the four feature columns and
            ``target``.

    Returns:
        ``df`` with an added ``prediction`` column.
    """
    feature_columns = [
        'sepal_length',
        'sepal_width',
        'petal_length',
        'petal_width',
    ]

    is_train = df['data_type'] == 'train'
    train_rows = df[is_train]
    train_target = train_rows['target']
    train_features = train_rows[feature_columns]

    classifier = svm.SVC()
    classifier.fit(train_features, train_target)

    if mlflow.active_run():
        mlflow.sklearn.log_model(
            classifier,
            "model",
            signature=infer_signature(train_features, train_target),
            input_example=train_features.head(5),
        )

    # Predictions cover train and test rows alike, not only the train subset.
    df['prediction'] = classifier.predict(df[feature_columns])
    return df


df = train_svm_model.run()
display(df)

### Evaluate

In [None]:
from flypipe import node
from flypipe.schema import Schema, Column
from flypipe.schema.types import Float, String, Integer
from sklearn.model_selection import train_test_split
from sklearn import svm
from mlflow.models.signature import infer_signature
from sklearn.metrics import f1_score

@node(
    type="pandas",
    description="Evaluates the SVM model (macro F1 score on all, train and test data)",
    tags=["model", "svm"],
    dependencies=[
        train_svm_model.select(
            'data_type',
            'target',
            'prediction'
        ).alias("df")
    ],
    output=Schema(
        Column('data_type', String(), 'all, train or test'),
        Column('metric', String(), 'score metric'),
        Column('value', Float(), 'value of the metric'),
    ),
)
def evaluate(df):
    """Compute the macro-averaged F1 score for all, train and test rows.

    Args:
        df: dataframe with ``data_type``, ``target`` and ``prediction``.

    Returns:
        A dataframe with one row per subset ('all', 'train', 'test', in that
        order) and the columns ``data_type``, ``metric`` and ``value``.
    """
    # Dict order fixes the order of the result rows.
    subsets = {
        'all': df,
        'train': df[df['data_type'] == 'train'],
        'test': df[df['data_type'] == 'test'],
    }

    rows = [
        [
            name,
            'f1_score macro',
            f1_score(subset['target'], subset['prediction'], average='macro'),
        ]
        for name, subset in subsets.items()
    ]

    return pd.DataFrame(rows, columns=['data_type', 'metric', 'value'])


df = evaluate.run()
display(df)

displayHTML(evaluate.html())

### Executing Training & Evaluation

In [None]:
import os
import mlflow
from mlflow import log_metric, log_param, log_artifacts
from mlflow.exceptions import MlflowException


# End any mlflow run left active by a previous execution of this notebook.
# This stays best-effort (a failure here must not block training), but the
# error is reported instead of being silently discarded.
try:
    mlflow.end_run()
except Exception as error:
    print(f"Could not end the previous mlflow run: {error}")

# Get the 'flypipe_demo' experiment, creating it with ARTIFACT_LOCATION as its
# artifact location when it does not exist yet. Looking it up first avoids
# relying on create_experiment raising for an existing name, and avoids a
# `finally` clause that could mask an unrelated error.
experiment = mlflow.get_experiment_by_name('flypipe_demo')
if experiment is None:
    experiment_id = mlflow.create_experiment(
        'flypipe_demo',
        artifact_location=ARTIFACT_LOCATION,
    )
else:
    experiment_id = experiment.experiment_id

# Run training and evaluation inside the mlflow run. The context manager ends
# the run even when a node raises, so no run is left dangling.
with mlflow.start_run(experiment_id=experiment_id) as run:
    # RUN_ID is read by the prediction nodes to locate the scaler and the model.
    RUN_ID = run.info.run_id
    print(f"Training run_id: {RUN_ID}")

    df = evaluate.run()

display(df)

## Prediction Graph

### Scale data for prediction

In [None]:
from flypipe import node
from flypipe.schema import Schema, Column
from flypipe.schema.types import Float, String
from sklearn.model_selection import train_test_split

# NOTE(review): the tags below look copied from the `split` node; confirm the
# intended tags before changing them, since tags may drive graph grouping.
@node(
    type="pandas",
    description="Scales the data using the scaler fit on node fit_scale",
    tags=["data", "split"],
    dependencies=[
        data.select(
            'sepal_length',
            'sepal_width',
            'petal_length',
            'petal_width',
        )
    ],
    output=Schema(
        data.output.get("sepal_length"),
        data.output.get("sepal_width"),
        data.output.get("petal_length"),
        data.output.get("petal_width"),
    ),
)
def scale(data):
    """Scale the feature columns with the scaler saved by the training run.

    The scaler is read from ``{ARTIFACT_LOCATION}{RUN_ID}/model/scaler.pkl``,
    so the training cell that sets ``RUN_ID`` must have been executed first.

    Args:
        data: dataframe with the four feature columns.

    Returns:
        ``data`` with its feature columns replaced by the scaled values.
    """
    X_cols = [
        'sepal_length',
        'sepal_width',
        'petal_length',
        'petal_width',
    ]

    # pickle can execute arbitrary code on load: only load artifacts written
    # by our own training run.
    with open(f'{ARTIFACT_LOCATION}{RUN_ID}/model/scaler.pkl', 'rb') as fp:
        scaler = pickle.load(fp)

    # Transform after the file has been closed; it is no longer needed.
    data[X_cols] = scaler.transform(data[X_cols])

    return data


df = scale.run()
display(df)

### Predict

In [None]:
from flypipe import node
from flypipe.schema import Schema, Column
from flypipe.schema.types import Float, String
from sklearn.model_selection import train_test_split

# NOTE(review): the tags below look copied from the `split` node; confirm the
# intended tags before changing them, since tags may drive graph grouping.
@node(
    type="pandas",
    description="Predicts using the SVM model trained on node train_svm_model",
    tags=["data", "split"],
    dependencies=[
        scale.select(
            'sepal_length',
            'sepal_width',
            'petal_length',
            'petal_width'
        ).alias("df")
    ],
    output=Schema(
        Column('prediction', Integer(), 'prediction'),
    ),
)
def predict(df):
    """Predict with the model logged by the training run.

    The model is loaded from ``runs:/{RUN_ID}/model``, so the training cell
    that sets ``RUN_ID`` must have been executed first.

    Args:
        df: scaled dataframe with the four feature columns.

    Returns:
        ``df`` with an added ``prediction`` column.
    """
    model_path = f'runs:/{RUN_ID}/model'
    loaded_model = mlflow.pyfunc.load_model(model_path)

    df['prediction'] = loaded_model.predict(df)
    return df


df = predict.run()
display(df)


## Document Training and Prediction Graphs

In [None]:
@node(
    type="pandas",
    description="Graph to train and predict Iris Data set",
    dependencies=[
        evaluate,
        predict
    ])
def graph(evaluate, predict):
    """Dummy node that ties the training and prediction graphs together.

    It exists only so both graphs can be rendered in a single view.

    Raises:
        NotImplementedError: always; this node is not meant to be executed.
    """
    # NotImplemented is a comparison sentinel, not an exception: calling it
    # fails with a TypeError. NotImplementedError is the exception to raise.
    raise NotImplementedError('Not supposed to run, only used to display the graph')


displayHTML(graph.html())

## Run predictions with provided data

We can make use of `inputs` when running the node `predict`.
It allows us to provide custom data to be scaled and to retrieve the predictions for it.

This feature is useful for ad hoc analysis or when reusing the same pipeline in other environments, such as APIs (discussed later on).

Here we provide one example of sepal and petal lengths and widths. Note that the execution graph will **skip** the `data` node and use the data we provide as input.

In [None]:
# A single hard-coded observation to predict.
df = pd.DataFrame({
    'sepal_length': [6.6],
    'sepal_width': [3.1],
    'petal_length': [5.1],
    'petal_width': [2.4],
})

# Run the prediction graph, feeding `df` in place of the `data` node output.
predictions = predict.run(inputs={data: df})

# Show the predictions.
display(predictions)

# Show the execution graph for the same inputs.
displayHTML(predict.html(inputs={data: df}))