In [None]:
import os
import numpy as np
import itertools
import shutil
from faculty import client
import mlflow
import tempfile

import faculty_distributed
from faculty_distributed.utils import job_name_to_job_id

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

Create a temporary directory to hold the data used in this example.

In [None]:
# Scratch directory under /project for the arrays that train_and_predict
# reads back from disk. The parent directory and the name prefix are passed
# separately: folding "/project/" into `prefix` only worked because joining an
# absolute path onto the default temp dir discards the default temp dir.
path = tempfile.mkdtemp(prefix="temp-data-", dir="/project")

Get project id and job id

In [None]:
# The project ID comes from the environment (None when the variable is unset).
# NOTE(review): job_name_to_job_id presumably resolves the job named
# "distributed" to its ID — confirm against faculty_distributed.
project_id = os.environ.get("FACULTY_PROJECT_ID")
job_id = job_name_to_job_id("distributed")

Load data for training model

In [None]:
# Split the iris dataset and persist each array under `path`, where
# train_and_predict loads them from.
data = load_iris()
# Fixed seed so the train/test split is identical on every Restart & Run All;
# without it the y_test used for scoring changes from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    data.data, data.target, random_state=0
)
np.save(os.path.join(path, "x_train.npy"), x_train)
np.save(os.path.join(path, "y_train.npy"), y_train)
np.save(os.path.join(path, "x_test.npy"), x_test)
np.save(os.path.join(path, "y_test.npy"), y_test)

Define a function to run in parallel on jobs. Each run logs its results to MLflow.

In [None]:
def train_and_predict(estimators, features):
    """Train a random forest on a subset of feature columns and predict the test set.

    The train/test arrays are read from the ``.npy`` files under the
    module-level ``path``. The call is recorded as one MLflow run: both
    arguments are logged as params and the test-set accuracy as a metric.

    Parameters
    ----------
    estimators : int
        Number of trees, passed to ``RandomForestClassifier`` as
        ``n_estimators``.
    features : list of int
        Column indices of the feature matrix to train and predict on.

    Returns
    -------
    The predicted labels for the test set, as returned by ``clf.predict``.
    """
    with mlflow.start_run():
        # Log every argument, not just the feature subset, so that runs with
        # different forest sizes can be told apart in MLflow.
        mlflow.log_param("n_estimators", estimators)
        mlflow.log_param("features", str(features))
        x_train = np.load(os.path.join(path, "x_train.npy"))
        y_train = np.load(os.path.join(path, "y_train.npy"))
        x_test = np.load(os.path.join(path, "x_test.npy"))
        y_test = np.load(os.path.join(path, "y_test.npy"))
        clf = RandomForestClassifier(
            n_estimators=estimators, n_jobs=2, verbose=1
        )
        # Train and predict on the selected feature columns only.
        clf.fit(x_train[:, features], y_train)
        predictions = clf.predict(x_test[:, features])
        mlflow.log_metric("accuracy", accuracy_score(y_test, predictions))
        return predictions

Build the list of arguments for the function to run; each entry holds the arguments for one call. Here we train on different pairs of features and compare the results.

In [None]:
# One [n_estimators, feature_indices] entry per unordered pair of the
# feature column indices 0-3.
args_list = [
    [5, [first, second]]
    for first, second in itertools.combinations(range(4), 2)
]
args_list

Instantiate `faculty_distributed.FacultyJobExecutor` with the project ID and job ID.

In [None]:
# Executor tied to this project and job; its `map` method is used further down
# to run train_and_predict once per entry of args_list.
fje = faculty_distributed.FacultyJobExecutor(project_id, job_id)

Run jobs to compute predictions. `map` takes a function and a list of lists containing the function's arguments; see `args_list` above for an example.

In [None]:
# Each inner list of args_list supplies the arguments for one call of
# train_and_predict.
# NOTE(review): presumably the results come back in the same order as
# args_list — confirm against faculty_distributed.
predictions = fje.map(train_and_predict, args_list)

In [None]:
# Display the results returned by fje.map.
predictions

In [None]:
# Accuracy of each job's predictions, scored against the y_test held in this
# notebook's namespace.
[accuracy_score(y_test, predicted_labels) for predicted_labels in predictions]

Remove temporary data path created for this example

In [None]:
# Delete the directory created by tempfile.mkdtemp, along with the .npy files
# saved into it.
shutil.rmtree(path)