# Intro ML SUP en BD

Creación de la sesión Spark:

In [1]:
#import SparkSession

from pyspark.sql import SparkSession

In [2]:
# Build the SparkSession used by the supervised-learning sections;
# getOrCreate() reuses a session that is already running, if any.
spark = (
    SparkSession.builder
    .appName('supervised_ml')
    .getOrCreate()
)

## Regression 

Carga de datos, archivo *Linear_regression_dataset.csv*:

In [3]:
# Load the regression dataset: the first row supplies the column names and
# the column types are inferred from the data.
df = (
    spark.read
    .option('inferSchema', True)
    .option('header', True)
    .csv('Linear_regression_dataset.csv')
)

Se importan las librerías correspondientes a **LinearRegression**, así como OneHotEncoder, StringIndexer y VectorAssembler:

In [4]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

Se visualizan algunos datos: X

In [5]:
# Report the dataset dimensions as a (row count, column count) tuple.
shape = (df.count(), len(df.columns))
print(shape)

(1232, 6)


Se muestran los primeros 10 datos:

In [6]:
# Preview the first 10 rows of the dataset.
df.show(10)

+-----+-----+-----+-----+-----+-----+
|var_1|var_2|var_3|var_4|var_5|label|
+-----+-----+-----+-----+-----+-----+
|  734|  688|   81|0.328|0.259|0.418|
|  700|  600|   94| 0.32|0.247|0.389|
|  712|  705|   93|0.311|0.247|0.417|
|  734|  806|   69|0.315| 0.26|0.415|
|  613|  759|   61|0.302| 0.24|0.378|
|  748|  676|   85|0.318|0.255|0.422|
|  669|  588|   97|0.315|0.251|0.411|
|  667|  845|   68|0.324|0.251|0.381|
|  758|  890|   64| 0.33|0.274|0.436|
|  726|  670|   88|0.335|0.268|0.422|
+-----+-----+-----+-----+-----+-----+
only showing top 10 rows



## Feature Engineering

Creamos un solo vector con todos los features i.e 'var_1', 'var_2', 'var_3', 'var_4', 'var_5', a este le llamaremos "features" y como salida colocamos a 'label':

In [7]:
# Pack the five predictor columns into a single vector column named
# "features", the input column the estimators in this notebook are fitted on.
df_assembler = VectorAssembler(
    inputCols=['var_1', 'var_2', 'var_3', 'var_4', 'var_5'],
    outputCol="features",
)
# Drop any "features" column left over from a previous run of this cell
# before assembling: drop() is a no-op when the column is absent, and without
# it re-running the cell fails because the output column already exists.
df = df_assembler.transform(df.drop("features"))

In [8]:
# Show the assembled feature vector next to the target column
# (show() with no argument prints the first 20 rows).
df.select(['features','label']).show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[734.0,688.0,81.0...|0.418|
|[700.0,600.0,94.0...|0.389|
|[712.0,705.0,93.0...|0.417|
|[734.0,806.0,69.0...|0.415|
|[613.0,759.0,61.0...|0.378|
|[748.0,676.0,85.0...|0.422|
|[669.0,588.0,97.0...|0.411|
|[667.0,845.0,68.0...|0.381|
|[758.0,890.0,64.0...|0.436|
|[726.0,670.0,88.0...|0.422|
|[583.0,794.0,55.0...|0.371|
|[676.0,746.0,72.0...|  0.4|
|[767.0,699.0,89.0...|0.433|
|[637.0,597.0,86.0...|0.374|
|[609.0,724.0,69.0...|0.382|
|[776.0,733.0,83.0...|0.437|
|[701.0,832.0,66.0...| 0.39|
|[650.0,709.0,74.0...|0.386|
|[804.0,668.0,95.0...|0.453|
|[713.0,614.0,94.0...|0.404|
+--------------------+-----+
only showing top 20 rows



Partimos a continuación el set de datos en 75% training y 25% testing:

In [9]:
# 75% / 25% train/test split. The explicit seed makes the split, and
# therefore every metric computed from it further down, repeatable across
# runs; without it Spark draws a new random seed on each execution.
train, test = df.randomSplit([0.75, 0.25], seed=1234)
print(f"Size of train Dataset : {train.count()}" )
print(f"Size of test Dataset : {test.count()}" )

Size of train Dataset : 927
Size of test Dataset : 305


Llamamos a la Regresión Lineal: X

In [11]:
# Linear regression estimator with every parameter left at its default.
lr = LinearRegression()

Entrenamos el modelo de regresión lineal:

In [12]:
# Fit the linear regression on the training split only; the test split is
# kept aside for evaluation.
lr_model = lr.fit(train)

Creamos el dataframe de predicciones (*predictions_df*) a partir del modelo entrenado y el conjunto de datos test: X

In [13]:
# Score the held-out test rows; the result keeps the input columns and adds
# a "prediction" column.
predictions_df = lr_model.transform(test)

Visualizamos el contenido de *predictions_df*:

In [14]:
# Inspect the predictions side by side with the true labels.
predictions_df.show()

+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|var_1|var_2|var_3|var_4|var_5|label|            features|         prediction|
+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|  513|  698|   61|0.298|0.236|0.339|[513.0,698.0,61.0...|0.33080780383118386|
|  527|  569|   75|0.297|0.239|0.341|[527.0,569.0,75.0...|0.33394050835997224|
|  550|  631|   76|0.306|0.235|0.318|[550.0,631.0,76.0...| 0.3368023685331818|
|  552|  683|   71| 0.31|0.244|0.335|[552.0,683.0,71.0...|0.34092306559469227|
|  554|  536|   77|0.306| 0.24|0.339|[554.0,536.0,77.0...|0.33666621064803975|
|  556|  675|   67|0.292|0.233|0.348|[556.0,675.0,67.0...|0.34810563160275215|
|  558|  688|   67|0.298|0.233| 0.35|[558.0,688.0,67.0...| 0.3453557908329167|
|  558|  740|   60|0.301| 0.24| 0.36|[558.0,740.0,60.0...| 0.3482180529336537|
|  562|  546|   79|0.299|0.237| 0.35|[562.0,546.0,79.0...|0.34334128841584893|
|  567|  587|   84|0.301|0.238|0.349|[567.0,587.0,84

Ahora, evaluamos el modelo de Regresión Lineal, con los datos de TEST:

In [15]:
# evaluate() computes regression metrics for the fitted model on the test
# split and returns them as a summary object (its r2 and meanSquaredError
# attributes are printed in the next cells). Despite the variable name, this
# is a metrics summary, not a DataFrame of predictions.
model_predictions = lr_model.evaluate(test)


Imprimimos el valor de R2:

In [16]:
# R2 (coefficient of determination) of the linear model on the test split.
print(model_predictions.r2)

0.8667564389069377


Imprimimos el valor del meanSquaredError:

In [17]:
# Mean squared error of the linear model on the test split.
print(model_predictions.meanSquaredError)


0.00014383760230078483


## Regresión con Árboles de Decisión

Importamos la librería *DecisionTreeRegressor*: 

In [20]:
from pyspark.ml.regression import DecisionTreeRegressor


Creamos el Regresor DT:

In [21]:
# Decision-tree regressor with every parameter left at its default.
dec_tree = DecisionTreeRegressor()

Entrenamos el modelo:

In [24]:
# Fit the decision tree on the training split.
dec_tree_model = dec_tree.fit(train)


¿Cuál es la profundidad máxima por defecto?

Desplegamos las *featureImportances*: X

In [25]:
# Importance assigned by the fitted tree to each input feature; vector index i
# corresponds to the i-th column passed to the VectorAssembler (0 -> var_1).
dec_tree_model.featureImportances

SparseVector(5, {0: 0.9684, 1: 0.0088, 2: 0.0059, 3: 0.0075, 4: 0.0094})

Generamos las predicciones del modelo sobre los datos de prueba (test):

In [26]:
# Score the held-out test split with the fitted tree. Note that this rebinds
# `model_predictions`, which until now held the linear-regression summary.
model_predictions = dec_tree_model.transform(test)

In [27]:
# Inspect the decision-tree predictions next to the true labels.
model_predictions.show()

+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|var_1|var_2|var_3|var_4|var_5|label|            features|         prediction|
+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|  513|  698|   61|0.298|0.236|0.339|[513.0,698.0,61.0...| 0.3443333333333333|
|  527|  569|   75|0.297|0.239|0.341|[527.0,569.0,75.0...|            0.33075|
|  550|  631|   76|0.306|0.235|0.318|[550.0,631.0,76.0...| 0.3519880952380952|
|  552|  683|   71| 0.31|0.244|0.335|[552.0,683.0,71.0...| 0.3519880952380952|
|  554|  536|   77|0.306| 0.24|0.339|[554.0,536.0,77.0...| 0.3519880952380952|
|  556|  675|   67|0.292|0.233|0.348|[556.0,675.0,67.0...| 0.3519880952380952|
|  558|  688|   67|0.298|0.233| 0.35|[558.0,688.0,67.0...| 0.3519880952380952|
|  558|  740|   60|0.301| 0.24| 0.36|[558.0,740.0,60.0...| 0.3519880952380952|
|  562|  546|   79|0.299|0.237| 0.35|[562.0,546.0,79.0...| 0.3519880952380952|
|  567|  587|   84|0.301|0.238|0.349|[567.0,587.0,84

Importamos el **RegressionEvaluator**

In [29]:
from pyspark.ml.evaluation import RegressionEvaluator


Usando *RegressionEvaluator* calculamos e imprimimos el valor de las metricas R2 y RMSE:

In [30]:
# Score the decision-tree predictions on the test split with two metrics.

# R2: a fresh evaluator configured through its setter.
dt_evaluator = RegressionEvaluator().setMetricName('r2')
dt_r2 = dt_evaluator.evaluate(model_predictions)
print(f'The r-square value of DecisionTreeRegressor is {dt_r2}')

# RMSE: a second evaluator, built the same way.
dt_evaluator = RegressionEvaluator().setMetricName('rmse')
dt_rmse = dt_evaluator.evaluate(model_predictions)
print(f'The rmse value of DecisionTreeRegressor is {dt_rmse}')



The r-square value of DecisionTreeRegressor is 0.8205888870252986
The rmse value of DecisionTreeRegressor is 0.013916748282450318


## RandomForestRegressor

Importamos a *RandomForestRegressor*

In [33]:
from pyspark.ml.regression import RandomForestRegressor


Creamos el Regresor RF:

In [34]:
# Random-forest regressor with every parameter left at its default.
rf = RandomForestRegressor()

Entrenamos el modelo:

In [36]:
# Fit the random forest on the training split.
rf_model = rf.fit(train)


Desplegamos las *featureImportances*:

In [37]:
# Importance assigned by the fitted forest to each input feature; vector index
# i corresponds to the i-th column passed to the VectorAssembler (0 -> var_1).
rf_model.featureImportances

SparseVector(5, {0: 0.4865, 1: 0.0333, 2: 0.0197, 3: 0.2518, 4: 0.2088})

Desplegamos el numero de arboles (Num of Trees)

In [38]:
# Number of trees in the fitted forest; it is read as an attribute here
# (no call parentheses) and displays the count directly.
rf_model.getNumTrees

20

Generamos las predicciones del modelo sobre los datos de prueba (test):


In [39]:
# Score the held-out test split with the fitted forest (rebinds
# `model_predictions`, replacing the decision-tree predictions).
model_predictions = rf_model.transform(test)

Desplegamos los valores del *model_predictions*

In [40]:
# Inspect the random-forest predictions next to the true labels.
model_predictions.show()

+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|var_1|var_2|var_3|var_4|var_5|label|            features|         prediction|
+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|  513|  698|   61|0.298|0.236|0.339|[513.0,698.0,61.0...| 0.3397480065197819|
|  527|  569|   75|0.297|0.239|0.341|[527.0,569.0,75.0...|0.33863136508008124|
|  550|  631|   76|0.306|0.235|0.318|[550.0,631.0,76.0...|   0.35189234524107|
|  552|  683|   71| 0.31|0.244|0.335|[552.0,683.0,71.0...| 0.3530088803560934|
|  554|  536|   77|0.306| 0.24|0.339|[554.0,536.0,77.0...|   0.35189234524107|
|  556|  675|   67|0.292|0.233|0.348|[556.0,675.0,67.0...| 0.3405007821844655|
|  558|  688|   67|0.298|0.233| 0.35|[558.0,688.0,67.0...|0.34691693394647594|
|  558|  740|   60|0.301| 0.24| 0.36|[558.0,740.0,60.0...| 0.3526783380862997|
|  562|  546|   79|0.299|0.237| 0.35|[562.0,546.0,79.0...| 0.3524942133729381|
|  567|  587|   84|0.301|0.238|0.349|[567.0,587.0,84

Usando *RegressionEvaluator* calculamos e imprimimos el valor de las metricas R2 y RMSE:

In [41]:
# Compare the random-forest predictions with the true labels on the test
# split. One evaluator is built (configured for RMSE) and the metric is
# overridden through a param map for the R2 call only.
rf_evaluator = RegressionEvaluator(metricName='rmse')

# R2 value of the model on test data
rf_r2 = rf_evaluator.evaluate(model_predictions, {rf_evaluator.metricName: 'r2'})
print(f'The r-square value of RandomForestRegressor is {rf_r2}')

# RMSE value of the model on test data
rf_rmse = rf_evaluator.evaluate(model_predictions)
print(f'The rmse value of RandomForestRegressor is {rf_rmse}')


The r-square value of RandomForestRegressor is 0.830007622223091
The rmse value of RandomForestRegressor is 0.013546522610373079


## Gradient-Boosted Tree Regressor

Importamos a GBTRegressor


In [42]:
from pyspark.ml.regression import GBTRegressor


Creamos el Regresor GBTR:


In [43]:
# Gradient-boosted tree regressor with every parameter left at its default.
gbt = GBTRegressor()

Entrenamos el modelo:

In [45]:
# Fit the gradient-boosted trees on the training split.
gbt_model = gbt.fit(train)


Desplegamos las featureImportances:

In [46]:
# Importance assigned by the fitted ensemble to each input feature; vector
# index i corresponds to the i-th column passed to the VectorAssembler.
gbt_model.featureImportances

SparseVector(5, {0: 0.2353, 1: 0.2351, 2: 0.1922, 3: 0.1754, 4: 0.1621})

Generamos las predicciones del modelo sobre los datos de prueba (test):

In [48]:
# Score the held-out test split with the fitted GBT model (rebinds
# `model_predictions`, replacing the random-forest predictions).
model_predictions = gbt_model.transform(test)

Desplegamos los valores del *model_predictions*

In [49]:
# Inspect the gradient-boosted predictions next to the true labels.
model_predictions.show()

+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|var_1|var_2|var_3|var_4|var_5|label|            features|         prediction|
+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|  513|  698|   61|0.298|0.236|0.339|[513.0,698.0,61.0...| 0.3408141425096789|
|  527|  569|   75|0.297|0.239|0.341|[527.0,569.0,75.0...| 0.3349061901475704|
|  550|  631|   76|0.306|0.235|0.318|[550.0,631.0,76.0...|0.34619106623178414|
|  552|  683|   71| 0.31|0.244|0.335|[552.0,683.0,71.0...| 0.3468438238014841|
|  554|  536|   77|0.306| 0.24|0.339|[554.0,536.0,77.0...|   0.34967715751417|
|  556|  675|   67|0.292|0.233|0.348|[556.0,675.0,67.0...|0.34832111114385683|
|  558|  688|   67|0.298|0.233| 0.35|[558.0,688.0,67.0...| 0.3484689044144408|
|  558|  740|   60|0.301| 0.24| 0.36|[558.0,740.0,60.0...| 0.3503177083105877|
|  562|  546|   79|0.299|0.237| 0.35|[562.0,546.0,79.0...|   0.34967715751417|
|  567|  587|   84|0.301|0.238|0.349|[567.0,587.0,84

Usando RegressionEvaluator calculamos e imprimimos el valor de las metricas R2 y RMSE:

In [51]:
 #Select (prediction, true label) and compute test error
# A single evaluator is created and re-pointed at each metric in turn.
gbt_evaluator = RegressionEvaluator(metricName='r2')

# R2 of the model on the test split
gbt_r2 = gbt_evaluator.evaluate(model_predictions)
print(f'The r-square value of GradientBoostedRegressor is {gbt_r2}')

# RMSE of the model on the test split
gbt_evaluator.setMetricName('rmse')
gbt_rmse = gbt_evaluator.evaluate(model_predictions)
print(f'The rmse value of GradientBoostedRegressor is {gbt_rmse}')


The r-square value of GradientBoostedRegressor is 0.8205459288992225
The rmse value of GradientBoostedRegressor is 0.013918414293086918


## Exploración de datos...

Usaremos el dataset https://archive.ics.uci.edu/ml/datasets/Bank+Marketing 

Indique a grandes rasgos de qué se trata este dataset:


Carga de datos, archivo bank_data.csv:


In [56]:
# Load the Bank Marketing CSV, taking the column names from the header row
# and inferring the column types. This rebinds `df`, which previously held
# the regression dataset.
df = spark.read.csv(
    'bank_data.csv',
    header=True,
    inferSchema=True,
)

Determine la cantidad de datos en el dataset:

In [57]:
# Number of records (rows) in the dataset.
df.count()

41188

¿A qué dato corresponde cada columna?

In [59]:
# List the column names in dataset order.
df.columns

['age',
 'job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'day_of_week',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'emp.var.rate',
 'cons.price.idx',
 'cons.conf.idx',
 'euribor3m',
 'nr.employed',
 'target_class']

Imprima el Schema:

In [60]:
# Data type (as inferred while reading the CSV) and nullability of every column.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- emp.var.rate: double (nullable = true)
 |-- cons.price.idx: double (nullable = true)
 |-- cons.conf.idx: double (nullable = true)
 |-- euribor3m: double (nullable = true)
 |-- nr.employed: double (nullable = true)
 |-- target_class: string (nullable = true)



En cuanto a la salida, ¿cómo es la distribución de clases?

In [61]:
# Count the rows per target class to see how balanced the classes are.
df.groupBy('target_class').count().show()

+------------+-----+
|target_class|count|
+------------+-----+
|          no|36548|
|         yes| 4640|
+------------+-----+



Una tarea típica consiste en convertir los valores binarios en 0 y 1. Cree una columna "label" que convierta los valores no/yes de `target_class` en 0/1:

In [66]:
from pyspark.sql import functions as F
from pyspark.sql import *

In [67]:
# Encode the binary target as an integer label: 'no' -> 0, 'yes' -> 1.
# Both classes are matched explicitly so that a null or unexpected
# target_class value produces a null label (visible in a groupBy) instead of
# being silently counted as a positive, which a bare otherwise(1) would do.
df = df.withColumn(
    "label",
    F.when(F.col("target_class") == 'no', F.lit(0))
     .when(F.col("target_class") == 'yes', F.lit(1)),
)

In [68]:
# Sanity check: the per-label counts should match the yes/no counts above.
df.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1| 4640|
|    0|36548|
+-----+-----+



# Deep Learning 

Importamos las librerias necesarias:

In [78]:
import os
import numpy as np
import pandas as pd
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.sql import functions as f
from pyspark.sql.functions import udf, StringType
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer

Inicializamos la sesion SPARK:

In [79]:
# getOrCreate() returns the SparkSession already running in this notebook
# (one was created in the first section) when it is still active, so this
# line mainly rebinds `spark`.
# NOTE(review): the 'deep_learning' app name is presumably not applied to an
# already-existing session - confirm.
spark = SparkSession.builder.appName('deep_learning').getOrCreate()

Leemos el dataset:

In [80]:
# Load the deep-learning dataset, taking column names from the header row and
# inferring the column types.
data = spark.read.csv('dl_data.csv', header=True, inferSchema=True)

In [81]:
# Inspect the inferred column types: strings are treated as categorical and
# doubles as numeric when the pipeline is built further down.
data.printSchema()

root
 |-- Visit_Number_Bucket: string (nullable = true)
 |-- Page_Views_Normalized: double (nullable = true)
 |-- Orders_Normalized: integer (nullable = true)
 |-- Internal_Search_Successful_Normalized: double (nullable = true)
 |-- Internal_Search_Null_Normalized: double (nullable = true)
 |-- Email_Signup_Normalized: double (nullable = true)
 |-- Total_Seconds_Spent_Normalized: double (nullable = true)
 |-- Store_Locator_Search_Normalized: double (nullable = true)
 |-- Mapped_Last_Touch_Channel: string (nullable = true)
 |-- Mapped_Mobile_Device_Type: string (nullable = true)
 |-- Mapped_Browser_Type: string (nullable = true)
 |-- Mapped_Entry_Pages: string (nullable = true)
 |-- Mapped_Site_Section: string (nullable = true)
 |-- Mapped_Promo_Code: string (nullable = true)
 |-- Maped_Product_Name: string (nullable = true)
 |-- Mapped_Search_Term: string (nullable = true)
 |-- Mapped_Product_Collection: string (nullable = true)



Renombramos la columna TARGET:

In [82]:
# Rename the target column to 'label', the name the classifier below is
# configured to read (labelCol='label').
data = data.withColumnRenamed('Orders_Normalized', 'label')

In [83]:
# Confirm the rename: 'Orders_Normalized' should now appear as 'label'.
data.printSchema()

root
 |-- Visit_Number_Bucket: string (nullable = true)
 |-- Page_Views_Normalized: double (nullable = true)
 |-- label: integer (nullable = true)
 |-- Internal_Search_Successful_Normalized: double (nullable = true)
 |-- Internal_Search_Null_Normalized: double (nullable = true)
 |-- Email_Signup_Normalized: double (nullable = true)
 |-- Total_Seconds_Spent_Normalized: double (nullable = true)
 |-- Store_Locator_Search_Normalized: double (nullable = true)
 |-- Mapped_Last_Touch_Channel: string (nullable = true)
 |-- Mapped_Mobile_Device_Type: string (nullable = true)
 |-- Mapped_Browser_Type: string (nullable = true)
 |-- Mapped_Entry_Pages: string (nullable = true)
 |-- Mapped_Site_Section: string (nullable = true)
 |-- Mapped_Promo_Code: string (nullable = true)
 |-- Maped_Product_Name: string (nullable = true)
 |-- Mapped_Search_Term: string (nullable = true)
 |-- Mapped_Product_Collection: string (nullable = true)



Partimos los datos en Train, Validation y Test:

In [85]:
# 70% / 20% / 10% split into train, validation and test, seeded with 1234.
# This rebinds `train` and `test`, which held the regression splits until now.
train, validation, test  = data.randomSplit([0.7, 0.2, 0.1], 1234)

Construimos el Pipeline

In [86]:
# Column groups are derived from the schema: string columns are categorical,
# double columns are numeric predictors. The target is excluded by name so it
# can never end up among the features, whatever type it is inferred as.
categorical_columns = [
    name for name, dtype in data.dtypes
    if dtype.startswith('string') and name != 'label'
]
numeric_columns = [
    name for name, dtype in data.dtypes
    if dtype.startswith('double') and name != 'label'
]

# One StringIndexer per categorical column, mapping each category to a numeric
# index. handleInvalid='keep' assigns categories that were absent from the
# training split to an extra index; the default ('error') would make the
# fitted pipeline raise when it scores validation/test rows containing them.
# NOTE(review): the indices are fed to the network as plain numbers;
# OneHotEncoder is imported in this notebook but not used - confirm intended.
indexers = [
    StringIndexer(
        inputCol=column,
        outputCol='{0}_index'.format(column),
        handleInvalid='keep',
    )
    for column in categorical_columns
]

# Assemble the indexed categoricals and the numeric columns into "features".
featuresCreator = VectorAssembler(
    inputCols=[indexer.getOutputCol() for indexer in indexers] + numeric_columns,
    outputCol="features",
)

# Network shape: one input unit per assembled column (each contributes a
# single number), two hidden layers of 4 and 2 units, and 2 output units
# (the output layer size is the number of classes).
layers = [len(featuresCreator.getInputCols()), 4, 2, 2]

classifier = MultilayerPerceptronClassifier(
    labelCol='label',
    featuresCol='features',
    maxIter=100,
    layers=layers,
    blockSize=128,
    seed=1234,
)

# Full pipeline: index categoricals -> assemble features -> train the MLP.
pipeline = Pipeline(stages=indexers + [featuresCreator, classifier])

Entrenamos...

In [88]:
# Fit every pipeline stage (indexers, assembler, classifier) on the training
# split only.
model = pipeline.fit(train)

Validamos y Evaluamos

In [89]:
# Score all three splits with the fitted pipeline so the metrics can be
# compared across train, validation and test.
train_output_df = model.transform(train)
validation_output_df = model.transform(validation)
test_output_df = model.transform(test)

Seleccionamos las predicciones y etiquetas, y calculamos las métricas de evaluación en cada partición:

In [90]:
# Reduce each scored split to the two columns the evaluator reads.
eval_columns = ("prediction", "label")
train_predictionAndLabels = train_output_df.select(*eval_columns)
validation_predictionAndLabels = validation_output_df.select(*eval_columns)
test_predictionAndLabels = test_output_df.select(*eval_columns)

metrics = ['weightedPrecision', 'weightedRecall', 'accuracy']
splits = [
    ('Train', train_predictionAndLabels),
    ('Validation', validation_predictionAndLabels),
    ('Test', test_predictionAndLabels),
]

# For every metric, report its value on Train, Validation and Test, in that order.
for metric in metrics:
    evaluator = MulticlassClassificationEvaluator(metricName=metric)
    for split_name, scored in splits:
        print(f'{split_name} {metric} = {evaluator.evaluate(scored)}')

Train weightedPrecision = 0.9678672585062094
Validation weightedPrecision = 0.9691383210897103
Test weightedPrecision = 0.9694943540959478
Train weightedRecall = 0.967375637879319
Validation weightedRecall = 0.9685636856368564
Test weightedRecall = 0.9691266079891672
Train accuracy = 0.9673756378793191
Validation accuracy = 0.9685636856368564
Test accuracy = 0.9691266079891673


¿Puede mejorar el test accuracy del modelo variando alguno de los hiperparámetros?