In [None]:
import time
import cloudpickle
import distributed
import os
import subprocess
import numpy as np
import itertools
from faculty import client
import mlflow

import faculty_parallel
from faculty_parallel.serialize import ParallelJobs
from faculty_parallel.utils import job_name_to_job_id

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

Currently this needs a path in which to save outputs; this needs to be changed. The path must contain the folders `saved_funcs`, `output` and `data`.

In [None]:
# Root directory under which data and outputs are saved.
path = "/project/laurence/parallel_tests/"

# Create the sub-folders the notebook relies on, so a Restart & Run All works
# on a fresh setup instead of failing at the first np.save. exist_ok makes
# this a no-op when the folders are already there.
for subfolder in ("saved_funcs", "output", "data"):
    os.makedirs(os.path.join(path, subfolder), exist_ok=True)

# Read from the environment; this is None if FACULTY_PROJECT_ID is unset.
project_id = os.getenv('FACULTY_PROJECT_ID')
# Look up the ID of the job named "parallel_laurence".
job_id = job_name_to_job_id("parallel_laurence")

Load the data for training the model and save the train/test splits to disk.

In [None]:
# Fixed seed so the train/test split (and the arrays saved below) is
# identical on every run of the notebook.
SPLIT_SEED = 0

data = load_iris()
x_train, x_test, y_train, y_test = train_test_split(
    data.data, data.target, random_state=SPLIT_SEED
)

# Persist each split so train_and_predict can load it from disk.
np.save(os.path.join(path, "data/x_train.npy"), x_train)
np.save(os.path.join(path, "data/y_train.npy"), y_train)
np.save(os.path.join(path, "data/x_test.npy"), x_test)
np.save(os.path.join(path, "data/y_test.npy"), y_test)

Define a function to run in parallel on jobs. Results are logged to MLflow.

In [None]:
def train_and_predict(features):
    """Train a random forest on the selected feature columns and predict the test set.

    The whole computation happens inside a single MLflow run, which records
    the feature selection as the ``features`` param and the test-set accuracy
    as the ``accuracy`` metric.

    Parameters
    ----------
    features
        Column indices (e.g. ``[0, 2]``) used to slice both the saved training
        and test feature arrays.

    Returns
    -------
    The classifier's predictions for the saved test features.
    """
    # Loaded in this order: x_train, y_train, x_test, y_test.
    split_files = (
        "data/x_train.npy",
        "data/y_train.npy",
        "data/x_test.npy",
        "data/y_test.npy",
    )
    with mlflow.start_run():
        mlflow.log_param('features', str(features))
        x_train, y_train, x_test, y_test = (
            np.load(os.path.join(path, split_file)) for split_file in split_files
        )
        forest = RandomForestClassifier(n_estimators=5, n_jobs=4, verbose=1)
        forest.fit(x_train[:, features], y_train)
        predicted = forest.predict(x_test[:, features])
        accuracy = accuracy_score(y_test, predicted)
        mlflow.log_metric('accuracy', accuracy)
        return predicted
    

Give the list of arguments for the function to run. Here we are training on different pairs of features and comparing the results.

In [None]:
# One single-element argument list per unordered pair of the four feature indices.
args_list = [
    [list(feature_pair)]
    for feature_pair in itertools.combinations(range(4), 2)
]
args_list

Instantiate the `ParallelJobs` class

In [None]:
# Build the ParallelJobs helper from the output path, project ID and job ID set above.
pj = ParallelJobs(path, project_id, job_id)

Run jobs to compute predictions

In [None]:
# Run train_and_predict over args_list via the ParallelJobs helper.
# NOTE(review): presumably each inner list is unpacked as the positional
# arguments of one call and results come back in input order — confirm.
predictions = pj.parmap(train_and_predict, args_list)

In [None]:
# Accuracy of each returned prediction array against the in-memory test labels.
[accuracy_score(y_test, job_predictions) for job_predictions in predictions]