In [0]:
#Leendo el Dataset
dataset = spark.read.format("parquet").load("dbfs:/FileStore/_AbandonoClientesAnalisis/output/dataset_EstadoClientes")
dataset.show()

+----------+------+----+------+---------------+---------------------+-------------------+---------------+-------------------+----------------+--------------------+----------------+------------------+-----------------+----------------------------------+---------------+------------+-------------------+----------------+----------------+--------------+---------------------+---------------+------------------+------------------+-------------------+-------------------------------------+-----------------------------------+------------------+------------------+------------------------+--------------------+---------------+---------------+----------------------------+-------------------------+--------------------------+----------------------+-----------------------+----------------------+--------------------------------+------------------------+---------------------+----------------------------+-------------------------+-------------------------+-----------------+------------------------------+--

In [0]:
from pyspark.ml.classification import RandomForestClassifier

In [0]:
#Configuramos el algoritmo
algoritmo = RandomForestClassifier(
  labelCol = "Estado_del_Cliente_indexado", 
  featuresCol = "features"
)

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
evaluador = MulticlassClassificationEvaluator(
  labelCol = "Estado_del_Cliente_indexado",
  predictionCol = "prediction",
  metricName="accuracy"
)

In [0]:
from pyspark.ml.tuning import ParamGridBuilder

In [0]:
mallaDeParametros = ParamGridBuilder().\
addGrid(algoritmo.numTrees, [5, 8, 12]).\
addGrid(algoritmo.maxDepth, [5, 8, 10]).\
addGrid(algoritmo.impurity, ["entropy", "gini"]).\
build()

In [0]:
from pyspark.ml.tuning import CrossValidator

In [0]:
#Configuración de la validación cruzada
validacionCruzada = CrossValidator(
  estimator = algoritmo,
  estimatorParamMaps = mallaDeParametros,
  evaluator = evaluador,
  numFolds = 5 
)  

In [0]:
modelosGenerados = validacionCruzada.fit(dataset)

In [0]:
#Extracción del mejor modelo
modelo = modelosGenerados.bestModel

In [0]:
dfPrediccion = modelo.transform(dataset)
dfPrediccion.show()

+----------+------+----+------+---------------+---------------------+-------------------+---------------+-------------------+----------------+--------------------+----------------+------------------+-----------------+----------------------------------+---------------+------------+-------------------+----------------+----------------+--------------+---------------------+---------------+------------------+------------------+-------------------+-------------------------------------+-----------------------------------+------------------+------------------+------------------------+--------------------+---------------+---------------+----------------------------+-------------------------+--------------------------+----------------------+-----------------------+----------------------+--------------------------------+------------------------+---------------------+----------------------------+-------------------------+-------------------------+-----------------+------------------------------+--

In [0]:
dfPrediccion.select(
  dfPrediccion["Estado_del_Cliente_indexado"],
  dfPrediccion["prediction"],
  dfPrediccion["probability"], 
  dfPrediccion["rawPrediction"]
).show(20, False)

+---------------------------+----------+-----------------------------------------+----------------------------------------+
|Estado_del_Cliente_indexado|prediction|probability                              |rawPrediction                           |
+---------------------------+----------+-----------------------------------------+----------------------------------------+
|0.0                        |0.0       |[0.5566388807964286,0.44336111920357135] |[6.679666569557144,5.320333430442856]   |
|0.0                        |0.0       |[0.8414413046320336,0.1585586953679664]  |[10.097295655584402,1.9027043444155964] |
|0.0                        |0.0       |[0.8535789118321687,0.1464210881678313]  |[10.242946941986023,1.7570530580139756] |
|1.0                        |1.0       |[0.486140171912572,0.513859828087428]    |[5.833682062950864,6.166317937049136]   |
|0.0                        |0.0       |[0.6540791612244966,0.3459208387755033]  |[7.84894993469396,4.151050065306039]    |
|0.0    

In [0]:
evaluador = MulticlassClassificationEvaluator(
  labelCol="Estado_del_Cliente_indexado", 
  predictionCol="prediction", 
  metricName="accuracy"
)
evaluador.evaluate(dfPrediccion)

Out[14]: 0.8624937572831697

In [0]:
#Almacenamos el modelo
modelo.write().overwrite().save("dbfs:/FileStore/_AbandonoClientesAnalisis/output/modelo_RandomForest/")