# TODO

**MUST**
- [X] Lecture article SAAI sur mllflow
- [ ] Log model - mlflow.sklearn.log_model(lr, "model")
- [ ] save the model - mlflow.sklearn.save_model(lr, modelpath)
- [ ] log artefact - mlflow.log_artifact("ElasticNet-paths.png")
- [ ] load MLflow model from run and use it in inference
    - https://www.mlflow.org/docs/latest/tutorials-and-examples/tutorial.html

**NICE**
- [ ] Use [mlflow autologging](https://www.mlflow.org/docs/latest/python_api/mlflow.html#mlflow.autolog) and compare it with the from scratch way
- [ ] Customize the experiment saving
    - run name, ..
- [ ] Advaanced https://towardsdatascience.com/5-tips-for-mlflow-experiment-tracking-c70ae117b03f
- [ ] Use encapsulation of runs
- [ ] Check the databricks tutorial and see what can be added

mlflow ui --backend-store-uri path/to/mlruns/folder

In [2]:
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn

In [3]:
import warnings
warnings.filterwarnings('ignore')

  and should_run_async(code)


# Read and setup dataset

- The diabetes dataset: https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset
- https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html

In [7]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

def get_dataset() -> pd.DataFrame:
    diabetes = datasets.load_diabetes()
    X, y = diabetes.data, diabetes.target
    Y = np.array([y]).transpose()
    d = np.concatenate((X, Y), axis=1)
    cols = ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6', 'progression']
    data = pd.DataFrame(d, columns=cols)
    return data

def split_data_in_train_and_test_sets(data: pd.DataFrame):
    target_column = 'progression'
    train, test = train_test_split(data)
    X_train, y_train = train.drop([target_column], axis=1), train[[target_column]]
    X_test, y_test = test.drop([target_column], axis=1), test[[target_column]]
    return X_train, X_test, y_train, y_test
    
data = get_dataset()
X_train, X_test, y_train, y_test = split_data_in_train_and_test_sets(data)

In [8]:
data.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,progression
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


# Modeling

In [11]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'RMSE = {rmse:.2f}, MAE = {mae:.2f}, R2 = {r2:.2f}')
    return rmse, mae, r2

def train_model(X_train, X_test, y_train, y_test: pd.DataFrame, alpha: float, l1_ratio: float) -> int:
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
    model.fit(X_train, y_train)        
    print(f'Elasticnet model (alpha={alpha:.2f}, l1_ratio={l1_ratio:.2f}): ', end='')
    evaluate_model(model, X_test, y_test)

In [12]:
train_model(X_train, X_test, y_train, y_test, 0.01, 0.75)

Elasticnet model (alpha=0.01, l1_ratio=0.75): RMSE = 59.32, MAE = 50.51, R2 = 0.40


# Other

- Setup mlflow tracking directory

In [50]:
from pathlib import Path

#Path('/Users/alaa.bakhti/Documents/EPITA/data-science-in-production/tp/data_science_in_production/'
PROJECT_DIR = Path('.').resolve().parents[0].absolute()
DB_PATH = PROJECT_DIR / 'output' / 'mlruns.db'
MLFLOW_TRACKING_URI = f'sqlite:////{DB_PATH}'

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.get_tracking_uri()

'sqlite://///Users/alaa.bakhti/Documents/EPITA/data-science-in-production/tp/data_science_in_production/output/mlruns.db'

In [51]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('r2', r2)
    mlflow.log_metric('mae', mae)
    
    print(f'RMSE = {rmse:.2f}, MAE = {mae:.2f}, R2 = {r2:.2f}')
    return rmse, mae, r2

In [84]:
from sklearn.linear_model import ElasticNet

def train_diabetes(alpha: float, l1_ratio: float, run_id: int) -> int:
    # TODO : add return type : check sklearn base model https://scikit-learn.org/stable/modules/classes.html
    # Start an MLflow run; the "with" keyword ensures we'll close the run even if this cell crashes
    with mlflow.start_run():
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        model.fit(X_train, y_train)
        mlflow.sklearn.log_model(model, 'model')
        model_uri = f'../output/models/model-{alpha:.2f}-{l1_ratio:.2f}'
        mlflow.sklearn.save_model(model, model_uri)


        mlflow.log_param('alpha', alpha)
        mlflow.log_param('l1_ratio', l1_ratio)
        
        print(f'Elasticnet model (alpha={alpha:.2f}, l1_ratio={l1_ratio:.2f}): ', end='')
        evaluate_model(model, X_test, y_test)

In [87]:
rm -r ../output mlruns &&  mkdir ../output

In [88]:
for param, run_id in zip(np.linspace(0, 1, 4), range(5)):
    train_diabetes(alpha=param, l1_ratio=param, run_id=run_id)

2021/04/24 08:13:03 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2021/04/24 08:13:03 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

Elasticnet model (alpha=0.00, l1_ratio=0.00): RMSE = 55.30, MAE = 44.92, R2 = 0.41


INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume

Elasticnet model (alpha=0.33, l1_ratio=0.33): RMSE = 72.36, MAE = 62.82, R2 = -0.00


INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume

Elasticnet model (alpha=0.67, l1_ratio=0.67): RMSE = 72.61, MAE = 63.03, R2 = -0.01


INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume

Elasticnet model (alpha=1.00, l1_ratio=1.00): RMSE = 58.97, MAE = 49.75, R2 = 0.33


In [90]:
model_path = '../output/models/model-0.33-0.33'
model = mlflow.sklearn.load_model(model_uri=model_path)
model.predict(X_test)

array([155.05762926, 157.34383472, 152.81372888, 153.1123571 ,
       153.18120197, 154.13763815, 155.94451321, 157.4327528 ,
       155.62259753, 156.41163823, 154.75512524, 153.18202123,
       154.87590175, 154.11210213, 155.13613465, 154.94131108,
       152.13951671, 154.42871339, 155.99762989, 152.50811365,
       153.20623017, 157.37329331, 155.1202496 , 155.11646794,
       154.52373079, 154.1274801 , 156.074097  , 156.1550807 ,
       157.14322665, 155.55653707, 156.16165413, 152.89074172,
       153.74055819, 155.55204306, 157.95311036, 152.63416916,
       154.199903  , 153.22013503, 154.47125656, 153.90867564,
       153.07155911, 156.76003434, 154.59279844, 156.42952971,
       155.38355027, 156.97568187, 153.89963709, 157.88747119,
       152.20331616, 157.1351395 , 152.61610314, 156.2026441 ,
       152.2445898 , 154.14161542, 154.237096  , 156.07533126,
       156.26718387, 152.4305429 , 153.09203456, 153.57544511,
       156.58078829, 155.90507913, 156.39321464, 156.78

## Notes
### mlflow.sklearn.log_model
**Links**
- [MLflow models](https://www.mlflow.org/docs/latest/models.html)
- [Packaging & serving the mode tutorial](https://www.mlflow.org/docs/latest/tutorials-and-examples/tutorial.html)
- [Log model documentation](https://www.mlflow.org/docs/latest/python_api/mlflow.sklearn.html#mlflow.sklearn.log_model)

**Notes**
- MLmodel: is a metadata file that tells MLflow how to load the model.
- model.pkl: is a serialized version of the trained model.

In [16]:



from sklearn.linear_model import lasso_path, enet_path

# train_diabetes
#   Uses the sklearn Diabetes dataset to predict diabetes progression using ElasticNet
#       The predicted "progression" column is a quantitative measure of disease progression one year after baseline
#       http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html
def f(data, in_alpha, in_l1_ratio):
    with mlflow.start_run():

        # Log mlflow attributes for mlflow UI
        mlflow.sklearn.log_model(lr, "model")
        modelpath = "../output/mlflow/model-%f-%f" % (alpha, l1_ratio)
        mlflow.sklearn.save_model(lr, modelpath)
                
        # Log artifacts (output files)
        mlflow.log_artifact("ElasticNet-paths.png")

In [67]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import ElasticNet
from sklearn.linear_model import lasso_path, enet_path

# train_diabetes
#   Uses the sklearn Diabetes dataset to predict diabetes progression using ElasticNet
#       The predicted "progression" column is a quantitative measure of disease progression one year after baseline
#       http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html
def train_diabetes(data, in_alpha, in_l1_ratio):
    def eval_metrics(actual, pred):
        rmse = np.sqrt(mean_squared_error(actual, pred))
        mae = mean_absolute_error(actual, pred)
        r2 = r2_score(actual, pred)
        return rmse, mae, r2
 

   
    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)
   
    # The predicted column is "progression" which is a quantitative measure of disease progression
    # one year after baseline
    train_x = train.drop(["progression"], axis=1)
    test_x = test.drop(["progression"], axis=1)
    train_y = train[["progression"]]
    test_y = test[["progression"]]
   
    if float(in_alpha) is None:
        alpha = 0.05
    else:
        alpha = float(in_alpha)
      
    if float(in_l1_ratio) is None:
        l1_ratio = 0.05
    else:
        l1_ratio = float(in_l1_ratio)
      
    # Start an MLflow run; the "with" keyword ensures we'll close the run even if this cell crashes
    with mlflow.start_run():
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)
     
        predicted_qualities = lr.predict(test_x)
     
        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)
     
        # Print out ElasticNet model metrics
        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)
     
        # Log mlflow attributes for mlflow UI
        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        mlflow.sklearn.log_model(lr, "model")
        modelpath = "../output/mlflow/model-%f-%f" % (alpha, l1_ratio)
        mlflow.sklearn.save_model(lr, modelpath)
        
        # Call plot_enet_descent_path
        image = plot_enet_descent_path(X, y, l1_ratio)
        
        # Log artifacts (output files)
        mlflow.log_artifact("ElasticNet-paths.png")
        
train_diabetes(data, 0.01, 0.01)

  and should_run_async(code)


Elasticnet model (alpha=0.010000, l1_ratio=0.010000):
  RMSE: 62.489686689390275
  MAE: 53.578585510041506
  R2: 0.2501994650614182
Computing regularization path using ElasticNet.


In [70]:
train_diabetes(data, 0.01, 0.75)

Elasticnet model (alpha=0.010000, l1_ratio=0.750000):
  RMSE: 56.210678873069156
  MAE: 46.70928025348322
  R2: 0.421451889289591
Computing regularization path using ElasticNet.


In [71]:
train_diabetes(data, 0.01, .5)


Elasticnet model (alpha=0.010000, l1_ratio=0.500000):
  RMSE: 59.13740610249833
  MAE: 50.548714527938465
  R2: 0.4119956543115493
Computing regularization path using ElasticNet.


  and should_run_async(code)
