In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [3]:
sc = SparkContext(conf=SparkConf())
spark = SparkSession(sparkContext=sc)

## Regresión logística con pyspark

## Importar datos

In [4]:
cuse = spark.read.csv('data/cuse_binary.csv', header=True, inferSchema=True)
cuse.show(5)

+---+---------+---------+---+
|age|education|wantsMore|  y|
+---+---------+---------+---+
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
+---+---------+---------+---+
only showing top 5 rows



## Procesar columnas categóricas

El siguiente código hace tres cosas con pipeline:

* **`StringIndexer`** todas las columnas categóricas
* **`OneHotEncoder`** todas las columnas de índice categóricas
* **`VectorAssembler`** todas las columnas de características en una columna vectorial

### Columnas categóricas

In [5]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

# Columnas categóricas
categorical_columns = cuse.columns[0:3]

### Construir etapas StringIndexer

In [6]:
stringindexer_stages = [StringIndexer(inputCol=c, outputCol='strindexed_' + c) for c in categorical_columns]
# encode label column and add it to stringindexer_stages
stringindexer_stages += [StringIndexer(inputCol='y', outputCol='label')]

### Construir etapas OneHotEncoder

In [7]:
onehotencoder_stages = [OneHotEncoder(inputCol='strindexed_' + c, outputCol='onehot_' + c) for c in categorical_columns]

### Construir etapas VectorAssembler

In [8]:
feature_columns = ['onehot_' + c for c in categorical_columns]
vectorassembler_stage = VectorAssembler(inputCols=feature_columns, outputCol='features') 

### Construir modelo pipeline

In [9]:
# all stages
all_stages = stringindexer_stages + onehotencoder_stages + [vectorassembler_stage]
pipeline = Pipeline(stages=all_stages)

### Ajustar modelo pipeline

In [10]:
pipeline_model = pipeline.fit(cuse)

### Transformar datos

In [11]:
final_columns = feature_columns + ['features', 'label']
cuse_df = pipeline_model.transform(cuse).\
            select(final_columns)
            
cuse_df.show(5)

+-------------+----------------+----------------+-------------------+-----+
|   onehot_age|onehot_education|onehot_wantsMore|           features|label|
+-------------+----------------+----------------+-------------------+-----+
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
+-------------+----------------+----------------+-------------------+-----+
only showing top 5 rows



## Divida los datos en conjuntos de datos de entrenamiento y de pruebas

In [12]:
training, test = cuse_df.randomSplit([0.8, 0.2], seed=1234)

## Construir un modelo de validación cruzada

### Estimador

In [13]:
from pyspark.ml.classification import LogisticRegression
logr = LogisticRegression(featuresCol='features', labelCol='label')

### Cuadrícula de parámetros

In [14]:
from pyspark.ml.tuning import ParamGridBuilder
param_grid = ParamGridBuilder().\
    addGrid(logr.regParam, [0, 0.5, 1, 2]).\
    addGrid(logr.elasticNetParam, [0, 0.5, 1]).\
    build()

### Evaluador

In [15]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

### Modelo de validación cruzada

In [16]:
from pyspark.ml.tuning import CrossValidator
cv = CrossValidator(estimator=logr, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=4)

### Ajustar un modelo de validación cruzada

In [17]:
cv_model = cv.fit(cuse_df)

### Predicción

In [18]:
show_columns = ['features', 'label', 'prediction', 'rawPrediction', 'probability']

#### Predicción en los datos de entrenamiento

In [19]:
pred_training_cv = cv_model.transform(training)
pred_training_cv.select(show_columns).show(5, truncate=False)

+---------+-----+----------+------------------------------------------+---------------------------------------+
|features |label|prediction|rawPrediction                             |probability                            |
+---------+-----+----------+------------------------------------------+---------------------------------------+
|(5,[],[])|0.0  |1.0       |[-0.05602431718564116,0.05602431718564116]|[0.4859975829890087,0.5140024170109914]|
|(5,[],[])|0.0  |1.0       |[-0.05602431718564116,0.05602431718564116]|[0.4859975829890087,0.5140024170109914]|
|(5,[],[])|0.0  |1.0       |[-0.05602431718564116,0.05602431718564116]|[0.4859975829890087,0.5140024170109914]|
|(5,[],[])|0.0  |1.0       |[-0.05602431718564116,0.05602431718564116]|[0.4859975829890087,0.5140024170109914]|
|(5,[],[])|0.0  |1.0       |[-0.05602431718564116,0.05602431718564116]|[0.4859975829890087,0.5140024170109914]|
+---------+-----+----------+------------------------------------------+---------------------------------

#### Predicción en los datos de test

In [20]:
pred_test_cv = cv_model.transform(test)
pred_test_cv.select(show_columns).show(5, truncate=False)

+---------+-----+----------+------------------------------------------+---------------------------------------+
|features |label|prediction|rawPrediction                             |probability                            |
+---------+-----+----------+------------------------------------------+---------------------------------------+
|(5,[],[])|0.0  |1.0       |[-0.05602431718564116,0.05602431718564116]|[0.4859975829890087,0.5140024170109914]|
|(5,[],[])|0.0  |1.0       |[-0.05602431718564116,0.05602431718564116]|[0.4859975829890087,0.5140024170109914]|
|(5,[],[])|0.0  |1.0       |[-0.05602431718564116,0.05602431718564116]|[0.4859975829890087,0.5140024170109914]|
|(5,[],[])|0.0  |1.0       |[-0.05602431718564116,0.05602431718564116]|[0.4859975829890087,0.5140024170109914]|
|(5,[],[])|0.0  |1.0       |[-0.05602431718564116,0.05602431718564116]|[0.4859975829890087,0.5140024170109914]|
+---------+-----+----------+------------------------------------------+---------------------------------

## Intercepción y coeficientes del modelo de regresión

In [21]:
print('Intercept: ' + str(cv_model.bestModel.intercept) + "\n"
     'coefficients: ' + str(cv_model.bestModel.coefficients))

Intercept: 0.05602431718564116
coefficients: [-0.280625539774,-0.799857435517,-1.18923909827,0.324994746147,-0.832954766261]


## Mejor modelo y parámetros

In [22]:
print('The best RegParam is: ', cv_model.bestModel._java_obj.getRegParam(), "\n",
     'The best ElasticNetParam is: cv_model.bestModel._java_obj.getElasticNetParam()')

The best RegParam is:  0.0 
 The best ElasticNetParam is: cv_model.bestModel._java_obj.getElasticNetParam()


# Regresión lineal con R

## Datos de Importación (abajo es el código R!)

```
#====== This is R code! =========
cuse = read.table('http://data.princeton.edu/wws509/datasets/cuse.dat', header = T)

# convert count data to binary data
not_using = rep(1:nrow(cuse), times=cuse$notUsing)
using = rep(1:nrow(cuse), times=cuse$using)
cuse_binary = cuse[c(not_using, using), 1:3]
cuse_binary$y = c(rep(0, length(not_using)), rep(1, length(using)))

# write data into a file
write.csv(cuse_binary, file='data/cuse_binary.csv', row.names = FALSE)
```

## Variables categóricas de proceso 
Procese las variables categóricas para que tengan el mismo patrón que en pyspar. Los niveles de los elementos están en el orden descendente de las frecuencias de los elementos.

```
#====== This is R code! =========
cuse_binary$age = factor(cuse_binary$age, 
                         levels = names(sort(table(cuse_binary$age), decreasing = TRUE)))
cuse_binary$education = factor(cuse_binary$education,
                               levels = names(sort(table(cuse_binary$education), decreasing = TRUE)))
cuse_binary$wantsMore = factor(cuse_binary$wantsMore,
                               levels = names(sort(table(cuse_binary$wantsMore), decreasing = TRUE)))

# encode label column
cuse_binary$y = factor(cuse_binary$y,
                               levels = names(sort(table(cuse_binary$y))))
glm_cuse = glm(y~age + education + wantsMore, data = cuse_binary, family = binomial(link = "logit"))
```

## Resultados

```
#====== This is R code! =========
glm_cuse$coefficients

 (Intercept)     age25-29       age<25     age40-49 educationlow  wantsMoreno 
   0.7325613    0.5192319    0.9086135   -0.2806254    0.3249947   -0.8329548 
```