In [21]:
from pyspark.sql import SparkSession
import pandas as pd
import uuid
import random
import json
from pyspark.sql.types import *
from pyspark.sql.functions import *
import requests


In [22]:

# Spark session & context
spark = (SparkSession
         .builder
         .master('local')
         .appName('json-ml-predict-diabetes')
         # Add kafka package
         .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1")
         .getOrCreate())
sc = spark.sparkContext

In [23]:

# Create stream dataframe setting kafka server, topic and offset option
df = (spark
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "kafka:29092") \
  .option("subscribe", "patient-data") \
  .option("startingOffsets", "earliest") \
  .option("group_id", "my-group")   \
  .load())


In [24]:

# read a small batch of data from kafka and display to the console

mySchema = StructType([
 StructField("id", IntegerType()),
 StructField("nome", StringType()),
 StructField("idade", IntegerType()),
 StructField("sexo", IntegerType()),
 StructField("peso", DoubleType()),
 StructField("altura", IntegerType()),
 StructField("bpm", DoubleType()),
 StructField("pressao", DoubleType()),
 StructField("respiracao", DoubleType()),
 StructField("temperatura", DoubleType()),
 StructField("glicemia", DoubleType()),
 StructField("saturacao_oxigenio", DoubleType()),
 StructField("estado_atividade", IntegerType()),
 StructField("dia_de_semana", IntegerType()),
 StructField("periodo_do_dia", IntegerType()),
 StructField("semana_do_mes", IntegerType()),
 StructField("estacao_do_ano", IntegerType()),
 StructField("passos", IntegerType()),
 StructField("calorias", DoubleType()),
 StructField("distancia", DoubleType()),
 StructField("tempo", DoubleType()),
 StructField("total_sleep_last_24", DoubleType()),
 StructField("deep_sleep_last_24", DoubleType()),
 StructField("light_sleep_last_24", DoubleType()),
 StructField("awake_last_24", DoubleType()),
 StructField("fumante", IntegerType()),
 StructField("genetica", IntegerType()),
 StructField("gestante", IntegerType()),
 StructField("frutas", IntegerType()),
 StructField("vegetais", IntegerType()),
 StructField("alcool", IntegerType()),
 StructField("doenca_coracao", IntegerType()),     
 StructField("avc", IntegerType()),
 StructField("colesterol_alto", IntegerType()),   
 StructField("exercicio", IntegerType()),   
 StructField("timestampstr", TimestampType()),
 StructField("timestamp_epoch", StringType())
 
])


In [25]:

df_json = df.selectExpr('CAST(value AS STRING) as json')


In [26]:

df_json.select(from_json(df_json.json, mySchema).alias('raw_data')) \
  .select('raw_data.*') \
  .filter("nome is not NULL") \
  .writeStream \
  .trigger(once=True) \
  .format("console") \
  .start() 
  #.awaitTermination()


<pyspark.sql.streaming.StreamingQuery at 0x7fb33880ca30>

In [27]:

def predict_diabetes(patient):

    print(patient[0])

    patient_dict = {}
    patient_dict['id'] = patient[0]
    patient_dict['nome'] = patient[1]
    patient_dict['idade'] = patient[2]
    patient_dict['sexo'] = patient[3]
    patient_dict['peso'] = patient[4]
    patient_dict['altura'] = patient[5]
    patient_dict['bpm'] = patient[6]
    patient_dict['pressao'] = patient[7]
    patient_dict['respiracao'] = patient[8]
    patient_dict['temperatura'] = patient[9]
    patient_dict['glicemia'] = patient[10]
    patient_dict['saturacao_oxigenio'] = patient[11]
    patient_dict['estado_atividade'] = patient[12]
    patient_dict['dia_de_semana'] = patient[13]
    patient_dict['periodo_do_dia'] = patient[14]
    patient_dict['semana_do_mes'] = patient[15]
    patient_dict['estacao_do_ano'] = patient[16]
    patient_dict['passos'] = patient[17]
    patient_dict['calorias'] = patient[18]
    patient_dict['distancia'] = patient[19]
    patient_dict['tempo'] = patient[20]
    patient_dict['total_sleep_last_24'] = patient[21]
    patient_dict['deep_sleep_last_24'] = patient[22]
    patient_dict['light_sleep_last_24'] = patient[23]
    patient_dict['awake_last_24'] = patient[24]
    patient_dict['fumante'] = patient[25]
    patient_dict['genetica'] = patient[26]
    patient_dict['gestante'] = patient[27]
    patient_dict['frutas'] = patient[28]
    patient_dict['vegetais'] = patient[29]
    patient_dict['alcool'] = patient[30]
    patient_dict['doenca_coracao'] = patient[31]
    patient_dict['avc'] = patient[32]
    patient_dict['colesterol_alto'] = patient[33]
    patient_dict['exercicio'] = patient[34]
    patient_dict['timestampstr'] = patient[35]
    patient_dict['timestamp_epoch'] = patient[36]

    data_jsons = json.dumps(patient_dict)

    print()
    print(data_jsons)
    print()

    result = requests.post('http://127.0.0.1:5000/predict-diabetes', json=data_jsons)
        
    result_json = json.dumps(result.json().replace("[","").replace("]",""))

    result_json = result_json.replace('\\', '')[1:-1]
    
    #result_json = '{"label":1, "score":2.1}'
    print()
    print(result_json)
    print()
    
    return result_json

vader_udf = udf(lambda patient: predict_diabetes(patient), StringType())

In [28]:

schema_output = StructType([StructField('label', IntegerType()),\
                            StructField('score', DoubleType())])

df_json.select(from_json(df_json.json, mySchema).alias('raw_data')) \
  .select('raw_data.*') \
  .filter("nome is not NULL") \
  .filter("idade is not NULL") \
  .filter("pressao is not NULL") \
  .filter("peso is not NULL") \
  .filter("altura is not NULL") \
  .select('nome', \
          from_json(vader_udf(array('*')), schema_output).alias('response'))\
  .select('nome', 'response.*') \
  .writeStream \
  .trigger(once=True) \
  .format("console") \
  .start() \
  .awaitTermination()  

In [30]:
df_json.select(from_json(df_json.json, mySchema).alias('raw_data')) \
  .select('raw_data.*') \
  .filter("nome is not NULL") \
  .filter("idade is not NULL") \
  .filter("pressao is not NULL") \
  .filter("peso is not NULL") \
  .filter("altura is not NULL") \
  .select('nome', \
          from_json(vader_udf(array('*')), schema_output).alias('response'))\
  .select('nome', 'response.*') \
  .select(
      expr("CAST(nome AS STRING)").alias("key"),
      expr("'{\"label\":' || CAST(label AS STRING) || ',' || '\"score\":' || CAST(score AS STRING) || '}'").alias("value")            
   ) \
  .writeStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "kafka:29092") \
  .option("checkpointLocation", "/home/jovyan/work/json/predict_diabetes") \
  .option("topic", "predict-diabetes-data")        \
  .start()  \
  .awaitTermination()

StreamingQueryException: Writing job aborted.
=== Streaming Query ===
Identifier: [id = f847db55-d962-49a3-a024-a0cda5b14b0c, runId = bf478abb-eacf-469f-8ce4-eff5a9422b89]
Current Committed Offsets: {KafkaV2[Subscribe[patient-data]]: {"patient-data":{"0":191}}}
Current Available Offsets: {KafkaV2[Subscribe[patient-data]]: {"patient-data":{"0":201}}}

Current State: ACTIVE
Thread State: RUNNABLE

Logical Plan:
WriteToMicroBatchDataSource org.apache.spark.sql.kafka010.KafkaStreamingWrite@52a65cee
+- Project [cast(nome#1735 as string) AS key#1817, concat(concat(concat(concat(concat({"label":, cast(label#1812 as string)), ,), "score":), cast(score#1813 as string)), }) AS value#1818]
   +- Project [nome#1735, response#1809.label AS label#1812, response#1809.score AS score#1813]
      +- Project [nome#1735, from_json(StructField(label,IntegerType,true), StructField(score,DoubleType,true), <lambda>(array(cast(id#1734 as string), nome#1735, cast(idade#1736 as string), cast(sexo#1737 as string), cast(peso#1738 as string), cast(altura#1739 as string), cast(bpm#1740 as string), cast(pressao#1741 as string), cast(respiracao#1742 as string), cast(temperatura#1743 as string), cast(glicemia#1744 as string), cast(saturacao_oxigenio#1745 as string), cast(estado_atividade#1746 as string), cast(dia_de_semana#1747 as string), cast(periodo_do_dia#1748 as string), cast(semana_do_mes#1749 as string), cast(estacao_do_ano#1750 as string), cast(passos#1751 as string), cast(calorias#1752 as string), cast(distancia#1753 as string), cast(tempo#1754 as string), cast(total_sleep_last_24#1755 as string), cast(deep_sleep_last_24#1756 as string), cast(light_sleep_last_24#1757 as string), ... 13 more fields)), Some(Etc/UTC)) AS response#1809]
         +- Filter isnotnull(altura#1739)
            +- Filter isnotnull(peso#1738)
               +- Filter isnotnull(pressao#1741)
                  +- Filter isnotnull(idade#1736)
                     +- Filter isnotnull(nome#1735)
                        +- Project [raw_data#1732.id AS id#1734, raw_data#1732.nome AS nome#1735, raw_data#1732.idade AS idade#1736, raw_data#1732.sexo AS sexo#1737, raw_data#1732.peso AS peso#1738, raw_data#1732.altura AS altura#1739, raw_data#1732.bpm AS bpm#1740, raw_data#1732.pressao AS pressao#1741, raw_data#1732.respiracao AS respiracao#1742, raw_data#1732.temperatura AS temperatura#1743, raw_data#1732.glicemia AS glicemia#1744, raw_data#1732.saturacao_oxigenio AS saturacao_oxigenio#1745, raw_data#1732.estado_atividade AS estado_atividade#1746, raw_data#1732.dia_de_semana AS dia_de_semana#1747, raw_data#1732.periodo_do_dia AS periodo_do_dia#1748, raw_data#1732.semana_do_mes AS semana_do_mes#1749, raw_data#1732.estacao_do_ano AS estacao_do_ano#1750, raw_data#1732.passos AS passos#1751, raw_data#1732.calorias AS calorias#1752, raw_data#1732.distancia AS distancia#1753, raw_data#1732.tempo AS tempo#1754, raw_data#1732.total_sleep_last_24 AS total_sleep_last_24#1755, raw_data#1732.deep_sleep_last_24 AS deep_sleep_last_24#1756, raw_data#1732.light_sleep_last_24 AS light_sleep_last_24#1757, ... 13 more fields]
                           +- Project [from_json(StructField(id,IntegerType,true), StructField(nome,StringType,true), StructField(idade,IntegerType,true), StructField(sexo,IntegerType,true), StructField(peso,DoubleType,true), StructField(altura,IntegerType,true), StructField(bpm,DoubleType,true), StructField(pressao,DoubleType,true), StructField(respiracao,DoubleType,true), StructField(temperatura,DoubleType,true), StructField(glicemia,DoubleType,true), StructField(saturacao_oxigenio,DoubleType,true), StructField(estado_atividade,IntegerType,true), StructField(dia_de_semana,IntegerType,true), StructField(periodo_do_dia,IntegerType,true), StructField(semana_do_mes,IntegerType,true), StructField(estacao_do_ano,IntegerType,true), StructField(passos,IntegerType,true), StructField(calorias,DoubleType,true), StructField(distancia,DoubleType,true), StructField(tempo,DoubleType,true), StructField(total_sleep_last_24,DoubleType,true), StructField(deep_sleep_last_24,DoubleType,true), StructField(light_sleep_last_24,DoubleType,true), ... 15 more fields) AS raw_data#1732]
                              +- Project [cast(value#1002 as string) AS json#1015]
                                 +- StreamingDataSourceV2Relation [key#1001, value#1002, topic#1003, partition#1004, offset#1005L, timestamp#1006, timestampType#1007], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan@50c8d772, KafkaV2[Subscribe[patient-data]]
