In [9]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd, numpy as np

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [11]:
# load 'train' dataset as a Pandas dataframe
df = dataiku.Dataset("flight_data").get_dataframe()

In [12]:
#-----------------------------------------------------------------
# Dataset Settings
#-----------------------------------------------------------------

# Select a subset of features to use for training
SCHEMA = {
    'target': 'Late',
    'features_num': ['dep_month', 'dep_woy', 'dep_hour','Distance','Late_avg'],
    'features_cat': ['UniqueCarrier', 'Origin','Dest']
}

In [13]:
#-----------------------------------------------------------------
# Preprocessing on Training Set
#-----------------------------------------------------------------

# Numerical variables
df_num = df[SCHEMA['features_num']]

trf_num = Pipeline([
    ('imp', SimpleImputer(strategy='mean')),
    ('sts', StandardScaler()),
])

# Categorical variables
df_cat = df[SCHEMA['features_cat']]

trf_cat = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", trf_num, SCHEMA['features_num']),
        ("cat", trf_cat, SCHEMA['features_cat'])
    ]
)

In [24]:
#-------------------------------------------------------------------------
# TRAINING
#-------------------------------------------------------------------------
##### TO-DO: add experiment tracking code here
##### but watch out for lineage (don't use the deploy button from the xperiment tracking UI)
import dataiku
import mlflow
from sklearn.linear_model import ElasticNet

client = dataiku.api_client()
project = client.get_project(dataiku.default_project_key())

# Setup mlflow integration, storing artefacts in managed folder. Managed folder must already exist.
mlflow_handle = project.setup_mlflow('3hOB5aod')

# MLflow run and experiment will be sent to DSS backend.
mlflow.set_experiment("sample autolog")

# activate Mflow autologging
mlflow.sklearn.autolog()

with mlflow.start_run(run_name="my_run"):
    clf = Pipeline(
        steps=[("preprocessor", preprocessor), ("clf", RandomForestClassifier())]
    )

    param_grid = {
        "clf__max_depth"        : [3],
        "clf__max_features"     : [1],
        "clf__min_samples_split": [2],
        "clf__min_samples_leaf" : [1],
        "clf__bootstrap"        : [False],
            "clf__criterion"        : ["gini"],
    "clf__n_estimators"     : [10]
    }

    gs = GridSearchCV(clf, param_grid=param_grid, n_jobs=-1, scoring='roc_auc', cv=3)
    X = df[SCHEMA['features_num'] + SCHEMA['features_cat']]
    Y = df[SCHEMA['target']].values
    gs.fit(X, Y)
    clf = gs.best_estimator_

2022/11/10 21:10:56 INFO mlflow.tracking.fluent: Experiment with name 'sample autolog' does not exist. Creating a new experiment.
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imp',
                                                                   SimpleImputer()),
                                                                  ('sts',
                                                                   StandardScaler())]),
                                                  ['dep_month', 'dep_woy',
           ...`
  _warn_prf(average, modifier, msg_start, len(result))
2022/11/10 21:11:12 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.
                                 Pipeline(steps=[('imp', SimpleImputer()),
                                                 ('sts', StandardScaler())]),
                                 ['dep_month', 'dep_woy', 'dep_hour',
                          

{'projectKey': 'MLFLOWO16N',
 'directoryBasedPartitioning': False,
 'name': 'my_pkl_model',
 'id': '3hOB5aod',
 'accessInfo': {'bucket': 'gis-data-us-east-1',
  'root': '/space-5b568791-dku/node-0874aa03/managed-dss-data/MLFLOWO16N/3hOB5aod'},
 'type': 'S3'}

In [17]:
import mlflow

In [18]:
mlflow.sklearn.save_model(clf, mlflow_models)

Traceback (most recent call last):
  File "/opt/dataiku/code-env/lib/python3.8/site-packages/botocore/credentials.py", line 570, in _protected_refresh
    metadata = self._refresh_using()
  File "/opt/dataiku/code-env/lib/python3.8/site-packages/botocore/credentials.py", line 717, in fetch_credentials
    return self._get_cached_credentials()
  File "/opt/dataiku/code-env/lib/python3.8/site-packages/botocore/credentials.py", line 727, in _get_cached_credentials
    response = self._get_credentials()
  File "/opt/dataiku/code-env/lib/python3.8/site-packages/botocore/credentials.py", line 956, in _get_credentials
    kwargs = self._assume_role_kwargs()
  File "/opt/dataiku/code-env/lib/python3.8/site-packages/botocore/credentials.py", line 966, in _assume_role_kwargs
    identity_token = self._web_identity_token_loader()
  File "/opt/dataiku/code-env/lib/python3.8/site-packages/botocore/utils.py", line 3069, in __call__
    with self._open(self._web_identity_token_path) as token_file:
Fi

In [12]:
#Save models as mlflow models to a folder
from datetime import datetime
import os
ts = datetime.now().strftime("%Y%m%d-%H%M%S")
model_dir = mlflow_models + "/custom-random-forest-{}".format(ts)
mlflow.sklearn.save_model(clf, model_dir)
print("Model saved at {} !".format(os.path.abspath(model_dir)))

Traceback (most recent call last):
  File "/opt/dataiku/code-env/lib/python3.8/site-packages/botocore/credentials.py", line 570, in _protected_refresh
    metadata = self._refresh_using()
  File "/opt/dataiku/code-env/lib/python3.8/site-packages/botocore/credentials.py", line 717, in fetch_credentials
    return self._get_cached_credentials()
  File "/opt/dataiku/code-env/lib/python3.8/site-packages/botocore/credentials.py", line 727, in _get_cached_credentials
    response = self._get_credentials()
  File "/opt/dataiku/code-env/lib/python3.8/site-packages/botocore/credentials.py", line 956, in _get_credentials
    kwargs = self._assume_role_kwargs()
  File "/opt/dataiku/code-env/lib/python3.8/site-packages/botocore/credentials.py", line 966, in _assume_role_kwargs
    identity_token = self._web_identity_token_loader()
  File "/opt/dataiku/code-env/lib/python3.8/site-packages/botocore/utils.py", line 3069, in __call__
    with self._open(self._web_identity_token_path) as token_file:
Fi

Model saved at /home/dataiku/s3:/gis-data-us-east-1/space-5b568791-dku/node-0874aa03/managed-dss-data/MLFLOWO16N/3hOB5aod/custom-random-forest-20221110-202652 !


In [13]:
import dataikuapi
import dataiku

client = dataiku.api_client()
project = client.get_default_project()


In [16]:

# Get or create saved models
if dataiku.get_custom_variables()["saved_model_id"] == "":
    saved_model = project.create_mlflow_pyfunc_model("mlflow_model", "BINARY_CLASSIFICATION")
    project.update_variables({"saved_model_id": saved_model.id})
else:
    saved_model = project.get_saved_model(dataiku.get_custom_variables()["saved_model_id"])
saved_model

<dataikuapi.dss.savedmodel.DSSSavedModel at 0x7f84cb5bd250>

In [17]:

mlflow_version = saved_model.import_mlflow_version_from_path(dataiku.get_custom_variables()["custom_model_version"], model_dir, code_env_name="mlflow")
project.update_variables({"custom_model_version": int(dataiku.get_custom_variables()["custom_model_version"]) + 1})

DataikuException: com.dataiku.dip.server.controllers.NotFoundException: saved model does not exist: MLFLOWO16N.AwjDsbSf

In [0]:
mlflow_version.set_core_metadata(SCHEMA['target'], class_labels=["false", "true"], get_features_from_dataset="flight_ground_truth")
mlflow_version.evaluate("flight_ground_truth")