# Predicción en Streaming con Spark ML y Spark Streaming

En este notebook vamos a entrenar un modelo de clasificación para predecir la probabilidad de un paciente de sufrir un ataque al corazón

In [1]:
import findspark
findspark.init()

In [7]:
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.types import *

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('UCI Heart disease').getOrCreate()

In [5]:
heart = spark.read.csv('data/heart.csv', 
                       inferSchema = True, 
                       header = True)
heart.show(3)

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
| 63|  1|  3|     145| 233|  1|      0|    150|    0|    2.3|    0|  0|   1|     1|
| 37|  1|  2|     130| 250|  0|      1|    187|    0|    3.5|    0|  0|   2|     1|
| 41|  0|  1|     130| 204|  0|      0|    172|    0|    1.4|    2|  0|   2|     1|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
only showing top 3 rows



In [15]:
schema = StructType( \
                     [StructField("age", LongType(),True), \
                      StructField("sex", LongType(), True), \
                      StructField("cp", LongType(), True), \
                      StructField('trestbps', LongType(), True), \
                      StructField("chol", LongType(), True), \
                      StructField("fbs", LongType(), True), \
                      StructField("restecg", LongType(), True), \
                      StructField("thalach", LongType(), True),\
                      StructField("exang", LongType(), True), \
                      StructField("oldpeak", DoubleType(), True), \
                      StructField("slope", LongType(),True), \
                      StructField("ca", LongType(), True), \
                      StructField("thal", LongType(), True), \
                      StructField("target", LongType(), True), \
                        ])

In [16]:
from pyspark.ml import Pipeline
from pyspark.sql.types import StructType,StructField,LongType, StringType,DoubleType,TimestampType

df = heart.withColumnRenamed("target","label")
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- slope: integer (nullable = true)
 |-- ca: integer (nullable = true)
 |-- thal: integer (nullable = true)
 |-- target: integer (nullable = true)



In [17]:
testDF, trainDF = df.randomSplit([0.3, 0.7])

### Carga del Pipeline

In [21]:
from pyspark.ml import PipelineModel


pModel = PipelineModel.load("\pipelines")

In [24]:
# We transform the data.
trainingPred = pModel.transform(trainDF)

# # We select the actual label, probability and predictions
trainingPred.select('target','probability','prediction').show()

+------+--------------------+----------+
|target|         probability|prediction|
+------+--------------------+----------+
|     1|[0.02547572141032...|       1.0|
|     1|[0.03186502726837...|       1.0|
|     0|[0.91655787910351...|       0.0|
|     0|[0.76284874224638...|       0.0|
|     1|[0.02615684728437...|       1.0|
|     1|[0.00285662444432...|       1.0|
|     1|[0.03955333515046...|       1.0|
|     1|[0.03960488368718...|       1.0|
|     1|[0.03960488368718...|       1.0|
|     1|[0.00119850446011...|       1.0|
|     0|[0.70706060938886...|       0.0|
|     0|[0.94157151280181...|       0.0|
|     0|[0.30476847727903...|       1.0|
|     1|[0.19848850346177...|       1.0|
|     1|[0.04391393058917...|       1.0|
|     0|[0.23590393417504...|       1.0|
|     1|[0.04085986608171...|       1.0|
|     1|[0.01597241898798...|       1.0|
|     1|[0.11070342370298...|       1.0|
|     0|[0.94326722147048...|       0.0|
+------+--------------------+----------+
only showing top

In [28]:
testData = testDF.repartition(10)
testData

#Create a directory
testData.write.format("CSV").option("header",False).save("/heart_streaming/")

## Creando predicciones en Streaming

In [31]:
sourceStream = (
    spark.readStream.schema(schema)
    .option("maxFilesPerTrigger", 1)
    .csv("/heart_streaming")
    .withColumnRenamed("output","label")
)

In [32]:
prediction1 = pModel.transform(sourceStream).select('label',
                                                   'probability',
                                                   'prediction')

In [33]:
display(prediction1)

DataFrame[label: bigint, probability: vector, prediction: double]

#### Mostrando las predicciones en consola

In [34]:
query1 = prediction1.writeStream.queryName("prediction1") \
            .format("console")\
            .trigger(once=True)\
            .start()\
            .awaitTermination()

#### Guardando las perdicciones en Memoria

In [35]:
query2 = (
            prediction1.writeStream.queryName("prediction4") 
            .format("memory")
            .outputMode("append")
            .start())

In [38]:
for x in range(2):
    df = spark.sql(
        "SELECT * FROM prediction4")
    df.show(10)
df

+-----+--------------------+----------+
|label|         probability|prediction|
+-----+--------------------+----------+
|    1|[0.23530048884955...|       1.0|
|    1|[0.78298087714178...|       0.0|
|    0|[0.25205521541893...|       1.0|
|    1|[0.60683421041110...|       0.0|
|    1|[0.36136135664308...|       1.0|
|    1|[0.05312797284297...|       1.0|
|    1|[0.00415141451628...|       1.0|
|    1|[0.11690508521511...|       1.0|
|    1|[0.00398241426990...|       1.0|
|    0|[0.98820996845845...|       0.0|
+-----+--------------------+----------+
only showing top 10 rows

+-----+--------------------+----------+
|label|         probability|prediction|
+-----+--------------------+----------+
|    1|[0.23530048884955...|       1.0|
|    1|[0.78298087714178...|       0.0|
|    0|[0.25205521541893...|       1.0|
|    1|[0.60683421041110...|       0.0|
|    1|[0.36136135664308...|       1.0|
|    1|[0.05312797284297...|       1.0|
|    1|[0.00415141451628...|       1.0|
|    1|[0.1169

DataFrame[label: bigint, probability: vector, prediction: double]

In [None]:
spark.streams.active[0].isActive

In [None]:
spark.streams

In [37]:
query2.status

{'message': 'Processing new data',
 'isDataAvailable': True,
 'isTriggerActive': True}