In [1]:
import yaml
from hyperopt import hp, fmin, Trials, tpe, STATUS_OK
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

  import pkg_resources


In [2]:
with open('config.yaml') as file:
    conf = yaml.safe_load(file)

In [3]:
spark = (
    SparkSession.builder.appName('iot')
    .master('local[*]')
    .config('spark.driver.host', 'localhost')
    .config('spark.driver.bindAddress', '127.0.0.1')
    .config('spark.driver.memory', '4g')
    .getOrCreate()
)
spark.sparkContext.setLogLevel('ERROR')

In [4]:
# Loading previuos df and model
df_train = spark.read.parquet(conf['filepath_train_df'])
df_test = spark.read.parquet(conf['filepath_test_df'])
loaded_pipeline = Pipeline.load('./pipeline')
string_indexer = loaded_pipeline.getStages()[0]
vector_assembler = loaded_pipeline.getStages()[1]

## Hyper Opt
 Optimization library that allows for a range of values instead of discrete values like GridSearch

In [5]:
evaluator = BinaryClassificationEvaluator(labelCol='is_bad', metricName="areaUnderROC")

def objective(params):
    rf = RandomForestClassifier(
        featuresCol = 'features',
        labelCol = 'is_bad',
        numTrees = params['numTrees'],
        maxDepth = params['maxDepth'],
    )
    
    inter_pipeline = Pipeline(
        stages=[
            string_indexer,
            vector_assembler,
            rf
        ]
    )
    
    pipeline = inter_pipeline.fit(df_train)
    val_df = pipeline.transform(df_test)
        
    score = evaluator.evaluate(val_df)
    return {"loss":-score, "status": STATUS_OK}

In [11]:
search_space = {
    'numTrees' : hp.uniformint('numTrees', 10, 500),
    'maxDepth' : hp.uniformint('maxDepth', 2, 10)
}

rf_trials = Trials()

argmin = fmin(
    fn = objective,
    space = search_space,
    algo = tpe.suggest,
    max_evals = 3,
    trials = rf_trials
)
print('Final numTrees: ',argmin['numTrees'])
print('Final maxDepth: ',argmin['maxDepth'])


100%|██████████| 3/3 [04:15<00:00, 85.13s/trial, best loss: -0.9999983774583976] 
Final numTrees:  108.0
Final maxDepth:  10.0


In [12]:
# Create the new optimized Pipeline
best_rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="is_bad",
    numTrees=argmin['numTrees'],
    maxDepth=argmin['maxDepth']
)

best_pipeline = Pipeline(
    stages=[
        string_indexer,
        vector_assembler,
        best_rf
    ]
)

In [13]:
# Fit the data to the new Pipeline
best_pl_fit = best_pipeline.fit(df_train)
test_preds = best_pl_fit.transform(df_test)

In [14]:
# Evaluate new Pipeline
score = evaluator.evaluate(test_preds)
print("ROC AUC", score)

ROC AUC 0.9999983770988485


In [15]:
best_pipeline.write().overwrite().save("pipeline")