In [0]:
from sklearn.metrics            import accuracy_score, balanced_accuracy_score
from sklearn.metrics            import recall_score, precision_score
from sklearn.linear_model       import LogisticRegression
from sklearn.ensemble           import RandomForestClassifier
from sklearn.neighbors          import KNeighborsClassifier

from hyperopt.pyll              import scope
from hyperopt                   import hp
import hyperopt.pyll.stochastic as hs

from pyspark.sql import functions as F
from pyspark.sql import Window as W
from pyspark.sql import types as T

from mlflow.tracking.client import MlflowClient
import pandas               as pd
import numpy                as np
import mlflow

In [0]:
# Run configuration: experiment name, number of trials, number of Spark
# partitions used to spread the trials, and the metric logged as "metric".
experimentName = "TESTEXP"; nTrial = 6; nNodes = 2; keyMetric = "balancedAccuracy"
recipePath     = "/mnt/data-dev/datasets/classification/adult/v3/recipe-A/"
experimentPath = f"/Shared/dev/recipes-v3/recipe-A/{experimentName}"

# mlflow.create_experiment raises when an experiment with this name already
# exists, which made this cell fail on every re-run of the notebook. Look the
# experiment up first and only create it when it is missing.
existingExperiment = MlflowClient().get_experiment_by_name(experimentPath)
if existingExperiment is None:
    experimentId = mlflow.create_experiment(experimentPath)
else:
    experimentId = existingExperiment.experiment_id
mlflow.set_experiment(experimentPath)

In [0]:
def standardClassMetrics(t, h):
    """Score predictions `h` against true labels `t` with four sklearn metrics.

    Returns a dict keyed by "balancedAccuracy", "precision", "accuracy" and
    "recall" (in that order), each holding the corresponding sklearn score
    computed with default arguments.
    """
    # (result key, scorer) pairs; the order fixes the key order of the result.
    scorers = (
        ("balancedAccuracy", balanced_accuracy_score),
        ("precision",        precision_score),
        ("accuracy",         accuracy_score),
        ("recall",           recall_score),
    )
    return {label: scorer(t, h) for label, scorer in scorers}


def makeExecuteTrialRandomForest(logRunId, seed):
    """Build a function that trains and scores one randomly configured random forest.

    Parameters
    ----------
    logRunId
        Not used by this factory.
    seed
        Passed to RandomForestClassifier as ``random_state``.

    Returns
    -------
    callable
        ``executeTrial(trialId)``, which samples one hyperparameter set, fits a
        RandomForestClassifier on the broadcast training data, scores it on the
        broadcast validation data, logs the outcome to MLflow and returns
        ``"success"``.

    Notes
    -----
    The closure reads the notebook globals ``experimentId``, ``keyMetric``,
    ``trainX``, ``trainY``, ``validX`` and ``validY``. ``hs.sample`` is called
    without an ``rng``, so ``seed`` only controls the estimator, not which
    hyperparameters get sampled.
    """

    paramSpace = {
        "class_weight"         : hp.choice("class_weight", ["balanced", "balanced_subsample"]),
        "n_estimators"         : scope.int(hp.quniform("n_estimators", 20, 500, 1)),
        "max_depth"            : scope.int(hp.quniform("max_depth", 2, 15, 1)),
        "criterion"            : hp.choice("criterion", ["gini", "entropy"]),
        "min_samples_split"    : hp.uniform("min_samples_split", 0.00, 0.10),
        "max_features"         : hp.uniform("max_features", 0.10, 1.00),
        "max_samples"          : hp.uniform("max_samples", 0.25, 1.00),
        # Previously hard-coded to 0, which silently ignored the seed argument.
        "random_state"         : seed,
        "n_jobs"               : -1,
    }

    def executeTrial(trialId):
        """Run a single trial and log it as a nested MLflow run; returns "success"."""

        with mlflow.start_run(experiment_id=experimentId, nested=True):

            mlflow.set_tag("experimentId", experimentId)
            mlflow.set_tag("trialId", str(trialId))

            # flattenDict is not defined in this part of the notebook;
            # presumably it merges nested choice dicts into one flat kwargs
            # dict -- TODO confirm where it is defined.
            param = flattenDict(hs.sample(paramSpace))
            tx    = trainX.value; ty = trainY.value
            vx    = validX.value; vy = validY.value
            model = RandomForestClassifier(**param)
            preds = model.fit(tx, ty).predict(vx)
            evals = standardClassMetrics(vy, preds)

            mlflow.log_param ("model", "RandomForestClassifier")
            mlflow.log_metric("metric", evals.get(keyMetric))
            mlflow.log_param ("paramSet", str(param))
            mlflow.log_param ("metrics" , str(evals))

        return "success"

    return executeTrial


def makeExecuteTrialLogisticRegression(logRunId, seed):
    """Create the per-trial function for a randomly configured LogisticRegression.

    `logRunId` is accepted but not used. `seed` becomes the estimator's
    `random_state`. The returned `executeTrial(trialId)` samples one
    hyperparameter set, fits on the broadcast training arrays, evaluates on the
    broadcast validation arrays, logs everything to a nested MLflow run and
    returns "success".
    """

    # Each option bundles a penalty with the solver it is paired with; the
    # elastic-net option additionally samples its own l1_ratio.
    penaltyOptions = [
        {"penalty": "l1",   "solver": "liblinear"},
        {"penalty": "l2",   "solver": "liblinear"},
        {"penalty": "none", "solver": "newton-cg"},
        {
            "penalty": "elasticnet", "solver": "saga",
            "l1_ratio": hp.uniform("l1_ratio", 0.10, 0.90),
        },
    ]

    paramSpace = {
        "penalty":      hp.choice("penalty", penaltyOptions),
        "class_weight": hp.choice("class_weight", ["balanced", None]),
        "C":            hp.uniform("C", 0.01, 1.00),
        "random_state": seed,
        "n_jobs":       -1,
    }

    def executeTrial(trialId):
        with mlflow.start_run(experiment_id=experimentId, nested=True):
            mlflow.set_tag("experimentId", experimentId)
            mlflow.set_tag("trialId", str(trialId))

            # flattenDict comes from outside this section of the notebook;
            # presumably it lifts the nested "penalty" dict into top-level
            # keyword arguments -- verify against its definition.
            sampledParams = flattenDict(hs.sample(paramSpace))

            features, labels = trainX.value, trainY.value
            holdoutFeatures, holdoutLabels = validX.value, validY.value

            classifier  = LogisticRegression(**sampledParams)
            predictions = classifier.fit(features, labels).predict(holdoutFeatures)
            scores      = standardClassMetrics(holdoutLabels, predictions)

            mlflow.log_metric("metric", scores.get(keyMetric))
            mlflow.log_param("model", "LogisticRegression")
            mlflow.log_param("paramSet", str(sampledParams))
            mlflow.log_param("metrics", str(scores))

        return "success"

    return executeTrial


def makeExecuteTrialKNeighborsClassifier(logRunId, seed):
    """Create the per-trial function for a randomly configured KNeighborsClassifier.

    Neither `logRunId` nor `seed` is used here. The returned
    `executeTrial(trialId)` samples one hyperparameter set, fits on the
    broadcast training arrays, evaluates on the broadcast validation arrays,
    logs the results to a nested MLflow run and returns "success".
    """

    paramSpace = dict(
        algorithm   = hp.choice("algorithm", ["ball_tree", "kd_tree", "brute"]),
        n_neighbors = scope.int(hp.quniform("n_neighbors", 3, 50, 1)),
        weights     = hp.choice("weights", ["uniform", "distance"]),
        p           = hp.choice("p", [1, 2]),
        n_jobs      = -1,
    )

    def executeTrial(trialId):
        with mlflow.start_run(experiment_id=experimentId, nested=True):
            mlflow.set_tag("experimentId", experimentId)
            mlflow.set_tag("trialId", str(trialId))

            # flattenDict is defined outside this section of the notebook.
            chosen = flattenDict(hs.sample(paramSpace))

            fitX, fitY   = trainX.value, trainY.value
            evalX, evalY = validX.value, validY.value

            knn       = KNeighborsClassifier(**chosen)
            predicted = knn.fit(fitX, fitY).predict(evalX)
            results   = standardClassMetrics(evalY, predicted)

            mlflow.log_param("model", "KNeighborsClassifier")
            mlflow.log_metric("metric", results.get(keyMetric))
            mlflow.log_param("paramSet", str(chosen))
            mlflow.log_param("metrics", str(results))

        return "success"

    return executeTrial

In [0]:
# Read the split dataset and pull each partition of the split to the driver
# as a pandas frame: rows with valid == 0 are training, valid == 1 validation.
df    = spark.read.parquet(recipePath + "0/splitDataset")
train = df.where(F.col("valid") == 0).toPandas()
valid = df.where(F.col("valid") == 1).toPandas()

# Broadcast the arrays so the trial closures running on the executors can read
# them through `.value`. The "x" column holds one vector per row, stacked here
# into a single 2-D array; "y" is broadcast as-is.
trainX, validX = (sc.broadcast(np.vstack(part["x"].values)) for part in (train, valid))
trainY, validY = (sc.broadcast(part["y"].values) for part in (train, valid))

In [0]:
# Build the trial function once, then distribute the trial ids 0..nTrial-1
# over nNodes partitions and run one trial per id. `dataIter` ends up holding
# the collected per-trial return values ("success" for each completed trial).
trialFun = makeExecuteTrialRandomForest(0, 0)
dataIter = (
    sc.parallelize(range(nTrial), nNodes)
      .map(trialFun)
      .collect()
)