In [8]:
# import libraries
import json
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DoubleType



# Spark session & context
spark = (SparkSession
         .builder
         .master('local')
         .appName('json-producer')
         # Add kafka package
         .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1")
         .getOrCreate())
sc = spark.sparkContext

In [9]:
mySchema = StructType([
 StructField("id", IntegerType()),
 StructField("nome", StringType()),
 StructField("idade", IntegerType()),
 StructField("sexo", IntegerType()),
 StructField("peso", DoubleType()),
 StructField("altura", IntegerType()),
 StructField("bpm", DoubleType()),
 StructField("pressao", DoubleType()),
 StructField("respiracao", DoubleType()),
 StructField("temperatura", DoubleType()),
 StructField("glicemia", DoubleType()),
 StructField("saturacao_oxigenio", DoubleType()),
 StructField("estado_atividade", IntegerType()),
 StructField("dia_de_semana", IntegerType()),
 StructField("periodo_do_dia", IntegerType()),
 StructField("semana_do_mes", IntegerType()),
 StructField("estacao_do_ano", IntegerType()),
 StructField("passos", IntegerType()),
 StructField("calorias", DoubleType()),
 StructField("distancia", DoubleType()),
 StructField("tempo", DoubleType()),
 StructField("total_sleep_last_24", DoubleType()),
 StructField("deep_sleep_last_24", DoubleType()),
 StructField("light_sleep_last_24", DoubleType()),
 StructField("awake_last_24", DoubleType()),
 StructField("fumante", IntegerType()),
 StructField("genetica", IntegerType()),
 StructField("gestante", IntegerType()),
 StructField("frutas", IntegerType()),
 StructField("vegetais", IntegerType()),
 StructField("alcool", IntegerType()),
 StructField("doenca_coracao", IntegerType()),     
 StructField("avc", IntegerType()),
 StructField("colesterol_alto", IntegerType()), 
 StructField("exercicio", IntegerType()), 
 StructField("timestampstr", StringType()),
 StructField("timestamp_epoch", StringType())    
])

In [10]:
#import time

#import os

#timestr = time.strftime("%Y%m%d-%H%M%S")

#json_name = "patient-data-" + timestr + '.json' 

#os.system('curl "https://api.mockaroo.com/api/e172bfb0?count=10&key=42e8f800" > ' + json_name)

#os.system('mv ' + json_name + ' ../json')


In [11]:

json_path = "/home/jovyan/work/json"
json_topic = "patient-data"
kafka_server = "kafka:29092"

streamingDataFrame = spark.readStream.schema(mySchema).json(json_path).filter("id is not null") # filter null records



In [12]:
display(streamingDataFrame)

DataFrame[id: int, nome: string, idade: int, sexo: int, peso: double, altura: int, bpm: double, pressao: double, respiracao: double, temperatura: double, glicemia: double, saturacao_oxigenio: double, estado_atividade: int, dia_de_semana: int, periodo_do_dia: int, semana_do_mes: int, estacao_do_ano: int, passos: int, calorias: double, distancia: double, tempo: double, total_sleep_last_24: double, deep_sleep_last_24: double, light_sleep_last_24: double, awake_last_24: double, fumante: int, genetica: int, gestante: int, frutas: int, vegetais: int, alcool: int, doenca_coracao: int, avc: int, colesterol_alto: int, exercicio: int, timestampstr: string, timestamp_epoch: string]

In [13]:

streamingDataFrame.selectExpr("CAST(id AS STRING) AS key", "to_json(struct(*)) AS value") \
  .writeStream \
  .format("kafka") \
  .option("topic", json_topic) \
  .option("kafka.bootstrap.servers", kafka_server) \
  .option("checkpointLocation", json_path) \
  .start()


<pyspark.sql.streaming.StreamingQuery at 0x7f7843539940>

In [6]:

streamingDataFrame.selectExpr("CAST(id AS STRING) AS key", "to_json(struct(*)) AS value") \
    .writeStream \
    .format("console") \
    .start()


<pyspark.sql.streaming.StreamingQuery at 0x7fabfb8e4610>

In [1]:
# Check active streams
for s in spark.streams.active:
    print("ID:{} | NAME:{}".format(s.id, s.name))

NameError: name 'spark' is not defined

In [15]:
import requests
import os
import time

In [37]:
# Get just one file from API

timestr = time.strftime("%Y%m%d-%H%M%S")

json_name = "patient-data-" + timestr + '.json' 

with open('/home/jovyan/work/json/sample.json', 'rb') as file:
    files = {'f': ('sample.json', file)}
    response = requests.post("https://api.mockaroo.com/api/e172bfb0?count=10&key=42e8f800",files=files)  #https://638d3c7e4190defdb74041ac.mockapi.io/patients

response.raise_for_status() # ensure we notice bad responses

content_str = response.content.decode("utf-8") # convert bytes to str

content_str = content_str[:1] + '\n' + content_str[1:] # insert line break after the first char

content_str = content_str[:len(content_str)-1] + '\n' + content_str[len(content_str)-1:] # insert line break before the last char

content_bytes = bytes(content_str, 'utf-8') # convert str back to bytes

print(content_bytes)

with open(json_name, "wb") as file:
    file.write(content_bytes)
    
os.system('mv ' + json_name + ' /home/jovyan/work/json')


b'[\n{"id":3,"nome":"antonio","idade":66,"sexo":0,"peso":71,"altura":190,"bpm":117,"pressao":13,"respiracao":20,"temperatura":38,"glicemia":119,"saturacao_oxigenio":96,"estado_atividade":2,"dia_de_semana":0,"periodo_do_dia":2,"semana_do_mes":1,"estacao_do_ano":2,"passos":396,"calorias":31.68,"distancia":495,"tempo":6.336,"total_sleep_last_24":10,"deep_sleep_last_24":5,"light_sleep_last_24":4,"awake_last_24":13,"fumante":0,"genetica":1,"gestante":0,"frutas":1,"vegetais":0,"alcool":0,"doenca_coracao":0,"avc":1,"colesterol_alto":1,"exercicio":0,"timestampstr":"2022-03-05 00:05:17","timestamp_epoch":"1646438717"},\n{"id":2,"nome":"maria","idade":45,"sexo":1,"peso":50,"altura":171,"bpm":72,"pressao":13,"respiracao":19,"temperatura":34,"glicemia":102,"saturacao_oxigenio":95,"estado_atividade":2,"dia_de_semana":0,"periodo_do_dia":3,"semana_do_mes":0,"estacao_do_ano":0,"passos":310,"calorias":24.8,"distancia":387.5,"tempo":4.96,"total_sleep_last_24":7,"deep_sleep_last_24":3,"light_sleep_last_2

0

In [39]:
# loop getting n files from API

while True:
    timestr = time.strftime("%Y%m%d-%H%M%S")

    json_name = "patient-data-" + timestr + '.json' 

    with open('/home/jovyan/work/json/sample.json', 'rb') as file:
        files = {'f': ('sample.json', file)}
        response = requests.post("https://api.mockaroo.com/api/e172bfb0?count=10&key=42e8f800",files=files)  #https://638d3c7e4190defdb74041ac.mockapi.io/patients

    response.raise_for_status() # ensure we notice bad responses

    content_str = response.content.decode("utf-8") # convert bytes to str

    content_str = content_str[:1] + '\n' + content_str[1:] # insert line break after the first char

    content_str = content_str[:len(content_str)-1] + '\n' + content_str[len(content_str)-1:] # insert line break before the last char

    content_bytes = bytes(content_str, 'utf-8') # convert str back to bytes

    with open(json_name, "wb") as file:
        file.write(content_bytes)

    os.system('mv ' + json_name + ' /home/jovyan/work/json')

    time.sleep(15)

KeyboardInterrupt: 

In [40]:
spark.stop()