# Random Forest Regressor

In [16]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA
from pyspark.sql import SparkSession
import pyspark.pandas as ps
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix

conexión al servidor MLFLOW

In [17]:
import mlflow

# Conectandose a MLFlow colocar el nombre asignado por mlflow
mlflow.set_tracking_uri("http://ibnodo3:25319")

# Generando el experimento o cargandolo si existe
experiment_name = "RFregressor-pyspark"
mlflow.set_experiment(experiment_name)

# Cargando la información
client = mlflow.tracking.MlflowClient()
experiment_id = client.get_experiment_by_name(experiment_name).experiment_id

# Vamos a ver si es cierto
print(f"MLflow Version: {mlflow.__version__}")
print(f"Tracking URI: {mlflow.tracking.get_tracking_uri()}")
print(f"Nombre del experimento: {experiment_name}")
print(f"ID del experimento: {experiment_id}")

2022/05/17 22:41:49 INFO mlflow.tracking.fluent: Experiment with name 'RFregressor-pyspark' does not exist. Creating a new experiment.


MLflow Version: 1.25.1
Tracking URI: http://ibnodo3:25319
Nombre del experimento: RFregressor-pyspark
ID del experimento: 3


Preparación de los datos

In [18]:
spark = SparkSession \
    .builder \
    .config('spark.sql.debug.maxToStringFields', 2000) \
    .getOrCreate()

dataset = spark.read.parquet('/LUSTRE/home/mcd-01/dataset.parquet')
dataset = dataset.drop('TOPOSLPX', 'TOPOSLPY', 'SEAICE', 'UOCE', 'VOCE', 'FRC_URB2D', 'SR', 'PCB', 'PC', 'CANWAT')

label_column = "TEMP"
stages = []

# Preparing the independent variables (Features)
# Defining the variables to be used
x_df = dataset.drop('TEMP')
variables = x_df.columns
vectorAssembler = VectorAssembler(inputCols = variables, outputCol = 'features')
va_df = vectorAssembler.transform(dataset)
feature_vector = va_df.select('features', 'TEMP')

# Initialize the `standardScaler`
scaler = StandardScaler(inputCol = 'features',
                        outputCol = 'scaledFeatures',
                        withMean = True, withStd = True
                        ).fit(feature_vector)

preppedDataDF = scaler.transform(feature_vector)

# Split the featurized training data for training and validating the model
(train_data, test_data) = preppedDataDF.randomSplit([0.7, 0.3], seed=123)

print('Data preparation work completed.')

[Stage 4087:====>                                                 (1 + 12) / 13]                                                                                

Data preparation work completed.


Gráfica de desempeño

In [19]:
def plot_regression_quality(predictions):
  p_df = predictions.select(["TEMP",  "prediction"]).toPandas()
  true_value = p_df.TEMP
  predicted_value = p_df.prediction

  fig = plt.figure(figsize=(15,15))
  plt.scatter(true_value, predicted_value, c='crimson')
 
  p1 = max(max(predicted_value), max(true_value))
  p2 = min(min(predicted_value), min(true_value))
  plt.plot([p1, p2], [p1, p2], 'b-')
  plt.xlabel('True Values', fontsize=15)
  plt.ylabel('Predictions', fontsize=15)
  plt.axis('equal')
  
  global image

  image = fig
  fig.savefig("RFregressorPrediction.png")
  plt.close(fig)
  return image

print('Created regression quality plot function')

Created regression quality plot function


In [26]:

def RF_regressor(train_data, test_data, label_column, features_column, maxDepth,
                            numTrees, model_name = None):
  # Evaluate metrics
  def eval_metrics(predictions):
      evaluator = RegressionEvaluator(
          labelCol=label_column, predictionCol="prediction", metricName="rmse")
      rmse = evaluator.evaluate(predictions)
      evaluator = RegressionEvaluator(
          labelCol=label_column, predictionCol="prediction", metricName="mae")
      mae = evaluator.evaluate(predictions)
      evaluator = RegressionEvaluator(
          labelCol=label_column, predictionCol="prediction", metricName="r2")
      r2 = evaluator.evaluate(predictions)
      return rmse, mae, r2

  # Start an MLflow run; the "with" keyword ensures we'll close the run even if this cell crashes
  with mlflow.start_run():   
    rfr = RandomForestRegressor(featuresCol = "scaledFeatures", labelCol = "TEMP")
    rfr_model = rfr.fit(train_data)
    predictions = rfr_model.transform(test_data)
    
    (rmse, mae, r2) = eval_metrics(predictions)

    # Print out model metrics
    print("RF regressor model (maxDepth=%f, numTrees=%f):" % (maxDepth, numTrees))
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    # Log hyperparameters for mlflow UI
    mlflow.log_param("maxDepth", maxDepth)
    mlflow.log_param("numTrees", numTrees)
    # Log evaluation metrics
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    # Log the model itself
    if model_name is None:
      mlflow.spark.log_model(rfr_model, "model")
    else:
      mlflow.spark.log_model(gbm_model, artifact_path="model", registered_model_name=model_name)
    modelpath = "/LUSTRE/home/mcd-01/artifucks_regresion/model-%f-%f" % (maxDepth, numTrees)
    mlflow.spark.save_model(rfr_model, modelpath)
    
    # Generate a plot
    image = plot_regression_quality(predictions)
    
    # Log artifacts (in this case, the regression quality image)
    mlflow.log_artifact("RFregressorPrediction.png")

print('Created training and evaluation method')

Created training and evaluation method


se ejecuta el modelo

In [31]:
RF_regressor(train_data, test_data, label_column='TEMP', features_column= 'scaledFeatures', 
                        maxDepth=15, numTrees=8, model_name = None)

RF regressor model (maxDepth=15.000000, numTrees=8.000000):
  RMSE: 0.6803422511687448
  MAE: 0.44071196248894196
  R2: 0.9939678617175822
