# Ejercicio Práctico_Predicción en Streaming con Spark ML y Spark Streaming

En este notebook vamos a cargar un pipeline que contiene un conjunto de fases de preprocesamiento y un modelo de clasificación para predecir la probabilidad de que un paciente sufra un ataque al corazón. La predicción se realizará sobre datos en streaming obtenidos a partir del fichero heart.csv.

In [32]:
# Initialise findspark before any pyspark import so that the local Spark
# installation's Python bindings can be found on the import path.
import findspark
findspark.init()

In [44]:
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.classification import LogisticRegression

In [33]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('UCI Heart disease')
    .getOrCreate()
)

In [1]:
# Load the heart dataset as a static DataFrame: the first row holds the column
# names and the column types are inferred from the data.
heart = spark.read.csv('heart.csv', header=True, inferSchema=True)

# Preview the first three rows.
heart.show(3)

In [39]:
# Explicit schema for the heart CSV files consumed by the streaming reader.
# The type classes are imported here so the cell also works when the notebook
# is run top to bottom (the notebook-wide import appears in a later cell).
from pyspark.sql.types import DoubleType, LongType, StructField, StructType

# Same column names and order as heart.csv; every column is a nullable long
# except "oldpeak", which is a double. No line continuations are needed
# inside the brackets.
schema = StructType([
    StructField("age", LongType(), True),
    StructField("sex", LongType(), True),
    StructField("cp", LongType(), True),
    StructField('trtbps', LongType(), True),
    StructField("chol", LongType(), True),
    StructField("fbs", LongType(), True),
    StructField("restecg", LongType(), True),
    StructField("thalachh", LongType(), True),
    StructField("exng", LongType(), True),
    StructField("oldpeak", DoubleType(), True),
    StructField("slp", LongType(), True),
    StructField("caa", LongType(), True),
    StructField("thall", LongType(), True),
    StructField("output", LongType(), True),
])

In [35]:
from pyspark.ml import Pipeline
from pyspark.sql.types import StructType,StructField,LongType, StringType,DoubleType,TimestampType


# Rename the target column "output" to "label", the column name selected
# alongside the predictions in the later cells.
df = heart.withColumnRenamed("output", "label")

# Print the resulting column names and types.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trtbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalachh: integer (nullable = true)
 |-- exng: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- slp: integer (nullable = true)
 |-- caa: integer (nullable = true)
 |-- thall: integer (nullable = true)
 |-- label: integer (nullable = true)



In [36]:
# Random split of the data: roughly 30% goes to testDF and 70% to trainDF.
# No seed is given, so the split differs between runs.
testDF, trainDF = df.randomSplit(weights=[0.3, 0.7])

### Carga del Pipeline

In [79]:
# Load the previously fitted pipeline (pre-processing stages + classifier)
# from disk. `load` returns the loaded model, so the result has to be bound to
# `pModel`, the name the following cells use to transform data.
from pyspark.ml import PipelineModel

# Raw string: keeps the backslash literal instead of relying on "\p" being an
# unrecognised escape sequence (which triggers a warning on recent Pythons).
pModel = PipelineModel.load(r"Ejercicios\pipelines")

# Echo the model so the notebook still displays its identifier.
pModel

PipelineModel_f8d4b32a5360

In [2]:
# Run the loaded pipeline model over the training split.
trainingPred = pModel.transform(trainDF)

# Show the true label next to the model's probability vector and prediction.
trainingPred.select('label', 'probability', 'prediction').show()


In [81]:
# Spread the test split over 10 partitions before writing it out, so the
# output is written as several CSV part files rather than a single one.
testData = testDF.repartition(10)

# Write the partitions as header-less CSV files.
# NOTE(review): this saves to the absolute path "/data/heart_streaming/",
# while the streaming reader in this notebook loads the relative path
# "heart_streaming/" -- confirm both resolve to the same directory.
(
    testData.write
    .format("CSV")
    .option("header", False)
    .save("/data/heart_streaming/")
)

## Creando predicciones en Streaming

In [82]:
# Streaming DataFrame over the CSV files in heart_streaming/.
# maxFilesPerTrigger=1 makes each trigger pick up a single file, and the
# explicit schema supplies the column names and types.
sourceStream = (
    spark.readStream
    .option("maxFilesPerTrigger", 1)
    .schema(schema)
    .csv("heart_streaming/")
    # Rename the target column to "label", as done for the static DataFrame.
    .withColumnRenamed("output", "label")
)

In [83]:
# Apply the loaded pipeline model to the stream and keep only the true label,
# the probability vector and the predicted class.
prediction1 = (
    pModel.transform(sourceStream)
    .select('label', 'probability', 'prediction')
)

In [84]:
# Display the streaming DataFrame. As the captured output shows, this only
# prints its schema (label, probability, prediction); rows are produced once
# a streaming query is started on it.
display(prediction1)

DataFrame[label: bigint, probability: vector, prediction: double]

#### Mostrando las predicciones en consola

In [85]:
# Start a one-shot streaming query that writes the predictions to the console
# sink. The StreamingQuery handle returned by .start() is kept in `query1`;
# chaining .awaitTermination() onto it would bind its return value (None)
# to `query1` instead of the query itself.
query1 = (
    prediction1.writeStream
    .queryName("prediction1")
    .format("console")
    .trigger(once=True)
    .start()
)

# Block until the one-shot query has finished.
query1.awaitTermination()

#### Guardando las predicciones en memoria

In [3]:
# Start a streaming query that appends the predictions to an in-memory table.
# The query name "prediction4" is the table name used by the SQL in the
# following cell.
query2 = (
    prediction1.writeStream
    .format("memory")
    .outputMode("append")
    .queryName("prediction4")
    .start()
)

In [88]:
# Query the in-memory "prediction4" table twice in a row and print up to
# 10 rows each time. Note that this rebinds the notebook-level name `df`.
for attempt in range(2):
    df = spark.sql("SELECT * FROM prediction4")
    df.show(10)

# Leave the last DataFrame as the cell's final expression so it is echoed.
df

+-----+--------------------+----------+
|label|         probability|prediction|
+-----+--------------------+----------+
|    1|[0.04086978924170...|       1.0|
|    0|[0.98184892212735...|       0.0|
|    1|[0.00474279761632...|       1.0|
|    1|[0.35775366097494...|       1.0|
|    1|[0.05755909903937...|       1.0|
|    0|[0.95305536703752...|       0.0|
|    0|[0.94079962605713...|       0.0|
|    0|[0.13017480179914...|       1.0|
|    0|[0.99807916786174...|       0.0|
|    1|[0.15541832735450...|       1.0|
+-----+--------------------+----------+
only showing top 10 rows

+-----+--------------------+----------+
|label|         probability|prediction|
+-----+--------------------+----------+
|    1|[0.04086978924170...|       1.0|
|    0|[0.98184892212735...|       0.0|
|    1|[0.00474279761632...|       1.0|
|    1|[0.35775366097494...|       1.0|
|    1|[0.05755909903937...|       1.0|
|    0|[0.95305536703752...|       0.0|
|    0|[0.94079962605713...|       0.0|
|    0|[0.1301

DataFrame[label: bigint, probability: vector, prediction: double]

In [None]:
# Check whether the first streaming query in this session's list of active
# queries is running.
# NOTE(review): indexing [0] raises IndexError when no query is active, and
# the first entry is not necessarily query2 -- `query2.isActive` may be what
# was intended; confirm.
spark.streams.active[0].isActive

In [None]:
# Show the current status of the memory-sink streaming query.
query2.status