In [0]:
# Read the Parquet dataset stored at the DBFS path below and print a preview
# (first 20 rows, long cell values truncated).
dataset = spark.read.parquet(
    "dbfs:/FileStore/_AbandonoClientesAnalisis/output/dataset_EstadoClientes"
)
dataset.show(n=20, truncate=True)

+----------+------+----+------+---------------+---------------------+-------------------+---------------+-------------------+----------------+--------------------+----------------+------------------+-----------------+----------------------------------+---------------+------------+-------------------+----------------+----------------+--------------+---------------------+---------------+------------------+------------------+-------------------+-------------------------------------+-----------------------------------+------------------+------------------+------------------------+--------------------+---------------+---------------+----------------------------+-------------------------+--------------------------+----------------------+-----------------------+----------------------+--------------------------------+------------------------+---------------------+----------------------------+-------------------------+-------------------------+-----------------+------------------------------+--

In [0]:
# Creating the decision tree model
from pyspark.ml.classification  import DecisionTreeClassifier

In [0]:
# Decision tree classifier: takes its inputs from the "features" column and
# its target from the "Estado_del_Cliente_indexado" column.
algoritmo = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="Estado_del_Cliente_indexado",
)

In [0]:
#Configurando el evaluador
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# Accuracy evaluator: compares the "prediction" column against the
# "Estado_del_Cliente_indexado" label column.
evaluador = (
    MulticlassClassificationEvaluator()
    .setLabelCol("Estado_del_Cliente_indexado")
    .setPredictionCol("prediction")
    .setMetricName("accuracy")
)

In [0]:
# Utility for defining the values to try for each parameter of the algorithm
from pyspark.ml.tuning import ParamGridBuilder

**Malla de Parámetros a calibrar**

In [0]:
# Parameter grid to search: 3 tree depths x 2 impurity measures = 6 combinations.
mallaParametros = (
    ParamGridBuilder()
    .addGrid(algoritmo.maxDepth, [5, 8, 10])
    .addGrid(algoritmo.impurity, ["entropy", "gini"])
    .build()
)

**Validación Cruzada**

In [0]:
from pyspark.ml.tuning import CrossValidator

In [0]:
# 5-fold cross-validation over every combination in mallaParametros, scored
# with evaluador (accuracy).
# The explicit seed pins the random assignment of rows to folds, so re-running
# the notebook on the same input splits the data the same way instead of
# depending on the library's default seed.
validacionCruzada = CrossValidator(
    estimator=algoritmo,
    estimatorParamMaps=mallaParametros,
    evaluator=evaluador,
    numFolds=5,
    seed=42
)

In [0]:
# Run the cross-validation to obtain the best model.
# NOTE(review): the fit uses the full `dataset`; no hold-out split is made in
# the cells shown here, so later scoring of `dataset` is in-sample.
modeloGenerados = validacionCruzada.fit(dataset)

In [0]:
# Keep only the best model found by the cross-validation run.
modelo = modeloGenerados.bestModel

**Validación del Modelo**

In [0]:
# Apply the selected model to the dataset and preview the result
# (first 20 rows, long cell values truncated).
dfPrediccion = modelo.transform(dataset)
dfPrediccion.show(n=20, truncate=True)

+----------+------+----+------+---------------+---------------------+-------------------+---------------+-------------------+----------------+--------------------+----------------+------------------+-----------------+----------------------------------+---------------+------------+-------------------+----------------+----------------+--------------+---------------------+---------------+------------------+------------------+-------------------+-------------------------------------+-----------------------------------+------------------+------------------+------------------------+--------------------+---------------+---------------+----------------------------+-------------------------+--------------------------+----------------------+-----------------------+----------------------+--------------------------------+------------------------+---------------------+----------------------------+-------------------------+-------------------------+-----------------+------------------------------+--

In [0]:
# Show the label, the predicted class and the class probabilities side by side
# (first 20 rows, values not truncated).
dfPrediccion.select(
    "Estado_del_Cliente_indexado",
    "prediction",
    "probability",
).show(n=20, truncate=False)

+---------------------------+----------+-----------------------------------------+
|Estado_del_Cliente_indexado|prediction|probability                              |
+---------------------------+----------+-----------------------------------------+
|0.0                        |0.0       |[0.5514780835881753,0.44852191641182465] |
|0.0                        |0.0       |[0.848,0.152]                            |
|0.0                        |0.0       |[0.7907608695652174,0.20923913043478262] |
|1.0                        |0.0       |[0.6347826086956522,0.3652173913043478]  |
|0.0                        |0.0       |[0.848,0.152]                            |
|0.0                        |1.0       |[0.2079207920792079,0.7920792079207921]  |
|0.0                        |1.0       |[0.2079207920792079,0.7920792079207921]  |
|0.0                        |0.0       |[0.9862490450725745,0.013750954927425516]|
|1.0                        |0.0       |[0.5514780835881753,0.44852191641182465] |
|0.0

In [0]:
# Configure the evaluator: accuracy of "prediction" against the indexed label.
evaluador = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="Estado_del_Cliente_indexado",
)

# Evaluate the predictions.
# NOTE(review): dfPrediccion was produced from the same `dataset` the model was
# fitted on, so this figure is an in-sample accuracy, not a hold-out estimate.
evaluador.evaluate(dfPrediccion)

Out[14]: 0.8163808889628766

In [0]:
# Persist the model to DBFS, replacing any model already saved at this path.
(
    modelo.write()
    .overwrite()
    .save("dbfs:/FileStore/_AbandonoClientesAnalisis/output/modelo_arbol_de_decision/")
)