# Random forest clasificación
---

## SparkContext y SparkSession

In [1]:
from pyspark import SparkContext
sc = SparkContext(master = 'local')

from pyspark.sql import SparkSession
spark = SparkSession.builder \
          .appName("Python Spark SQL basic example") \
          .config("spark.some.config.option", "some-value") \
          .getOrCreate()

## Arbol Random forest con pyspark

In [2]:
cuse = spark.read.csv('data/cuse_binary.csv', header=True, inferSchema=True)
cuse.show(5)

+---+---------+---------+---+
|age|education|wantsMore|  y|
+---+---------+---------+---+
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
+---+---------+---------+---+
only showing top 5 rows



## Procesar columnas categóricas

### Columnas categóricas

In [7]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

categorical_columns = cuse.columns[:-1]
categorical_columns

['age', 'education', 'wantsMore']

### Construir etapas StringIndexe

In [9]:
stringindexer_stages = [StringIndexer(inputCol=c, outputCol='stringindexed_' + c) for c in categorical_columns]
# encode label column and add it to stringindexer stages
stringindexer_stages += [StringIndexer(inputCol='y', outputCol='label')]

### Construir etapas OneHotEncoder

In [10]:
onehotencoder_stages = [OneHotEncoder(inputCol='stringindexed_' + c, outputCol='onehot_'+c) for c in categorical_columns]

### Construir etapas VectorAssembler

In [13]:
feature_columns = ['onehot_' + c for c in categorical_columns]
vectorassembler_stage = VectorAssembler(inputCols=feature_columns, outputCol='features')

### Construir modelo pipeline

In [15]:
all_stages = stringindexer_stages + onehotencoder_stages + [vectorassembler_stage]
pipeline = Pipeline(stages=all_stages)

### Ajustar modelo pipeline

In [17]:
pipeline_model = pipeline.fit(cuse)

### Transformar datos

In [19]:
final_columns = feature_columns + ['features', 'label']
cuse_df = pipeline_model.transform(cuse).select(final_columns)
cuse_df.show(5)

+-------------+----------------+----------------+-------------------+-----+
|   onehot_age|onehot_education|onehot_wantsMore|           features|label|
+-------------+----------------+----------------+-------------------+-----+
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
+-------------+----------------+----------------+-------------------+-----+
only showing top 5 rows



## Divida los datos en conjuntos de datos de entrenamiento y de pruebas

In [20]:
train, test = cuse_df.randomSplit([0.8, 0.2], seed=1234)

## Construir un modelo de validación cruzada

### Estimador

In [22]:
from pyspark.ml.classification import RandomForestClassifier

random_forest = RandomForestClassifier(featuresCol='features', labelCol='label')

### Cuadrícula de parámetros

In [23]:
from pyspark.ml.tuning import ParamGridBuilder

param_grid = ParamGridBuilder().\
    addGrid(random_forest.maxDepth, [2, 3, 4]).\
    addGrid(random_forest.minInfoGain, [0.0, 0.1, 0.2, 0.3]).\
    build()

### Evaluator

In [24]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator()

### Construir un modelo de validación cruzada

In [27]:
from pyspark.ml.tuning import CrossValidator

crossvalidation = CrossValidator(estimator=random_forest, estimatorParamMaps=param_grid, evaluator=evaluator)

### Ajustar un modelo de validación cruzada

In [29]:
crossvalidation_mod = crossvalidation.fit(cuse_df)

### Predicción

#### Predicción en los datos de entrenamiento

In [30]:
pred_train = crossvalidation_mod.transform(train)
pred_train.show(5)

+----------+----------------+----------------+---------+-----+--------------------+--------------------+----------+
|onehot_age|onehot_education|onehot_wantsMore| features|label|       rawPrediction|         probability|prediction|
+----------+----------------+----------------+---------+-----+--------------------+--------------------+----------+
| (3,[],[])|       (1,[],[])|       (1,[],[])|(5,[],[])|  0.0|[9.61727693784312...|[0.48086384689215...|       1.0|
| (3,[],[])|       (1,[],[])|       (1,[],[])|(5,[],[])|  0.0|[9.61727693784312...|[0.48086384689215...|       1.0|
| (3,[],[])|       (1,[],[])|       (1,[],[])|(5,[],[])|  0.0|[9.61727693784312...|[0.48086384689215...|       1.0|
| (3,[],[])|       (1,[],[])|       (1,[],[])|(5,[],[])|  0.0|[9.61727693784312...|[0.48086384689215...|       1.0|
| (3,[],[])|       (1,[],[])|       (1,[],[])|(5,[],[])|  0.0|[9.61727693784312...|[0.48086384689215...|       1.0|
+----------+----------------+----------------+---------+-----+----------

#### Predicción en los datos de test

In [31]:
pred_test = crossvalidation_mod.transform(test)
pred_test.show(5)

+----------+----------------+----------------+---------+-----+--------------------+--------------------+----------+
|onehot_age|onehot_education|onehot_wantsMore| features|label|       rawPrediction|         probability|prediction|
+----------+----------------+----------------+---------+-----+--------------------+--------------------+----------+
| (3,[],[])|       (1,[],[])|       (1,[],[])|(5,[],[])|  0.0|[9.61727693784312...|[0.48086384689215...|       1.0|
| (3,[],[])|       (1,[],[])|       (1,[],[])|(5,[],[])|  0.0|[9.61727693784312...|[0.48086384689215...|       1.0|
| (3,[],[])|       (1,[],[])|       (1,[],[])|(5,[],[])|  0.0|[9.61727693784312...|[0.48086384689215...|       1.0|
| (3,[],[])|       (1,[],[])|       (1,[],[])|(5,[],[])|  0.0|[9.61727693784312...|[0.48086384689215...|       1.0|
| (3,[],[])|       (1,[],[])|       (1,[],[])|(5,[],[])|  0.0|[9.61727693784312...|[0.48086384689215...|       1.0|
+----------+----------------+----------------+---------+-----+----------

### Rendimiento de la predicción

Calculamos el **Área bajo la curva característica de funcionamiento del receptor**.

In [39]:
print('Accuracy on training data (areaUnderROC): ', evaluator.setMetricName('areaUnderROC').evaluate(pred_train), "\n"
     'Accuracy on training data (areaUnderROC): ', evaluator.setMetricName('areaUnderROC').evaluate(pred_test))


Accuracy on training data (areaUnderROC):  0.681918715706039 
Accuracy on training data (areaUnderROC):  0.6755505721350122


### Confusion matrix

#### Matriz de confusión a partir de los datos de entrenamiento

In [43]:
label_pred_train = pred_train.select('label', 'prediction')
label_pred_train.rdd.zipWithIndex().countByKey()

defaultdict(int,
            {Row(label=0.0, prediction=0.0): 746,
             Row(label=0.0, prediction=1.0): 167,
             Row(label=1.0, prediction=0.0): 220,
             Row(label=1.0, prediction=1.0): 194})

#### Matriz de confusión a partir de los datos de test

In [44]:
label_pred_test = pred_test.select('label', 'prediction')
label_pred_test.rdd.zipWithIndex().countByKey()

defaultdict(int,
            {Row(label=0.0, prediction=0.0): 151,
             Row(label=0.0, prediction=1.0): 36,
             Row(label=1.0, prediction=0.0): 50,
             Row(label=1.0, prediction=1.0): 43})

## Mejor modelo y parámetros

In [47]:
print('max depth: ', crossvalidation_mod.bestModel._java_obj.getMaxDepth(), "\n",
     'min information gain: ', crossvalidation_mod.bestModel._java_obj.getMinInfoGain())


max depth:  4 
 min information gain:  0.0
