In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session that runs locally on all available cores.
session_builder = SparkSession.builder.appName("Reading from sockets").master("local[*]")

# Intended to let in-flight data finish processing and connections close before
# the stream shuts down.
# NOTE(review): this key is documented for the legacy DStream API - confirm it
# has an effect on Structured Streaming queries.
session_builder = session_builder.config("spark.streaming.stopGracefullyOnShutdown", True)

spark = session_builder.getOrCreate()

# Last expression of the cell: renders the session summary in the notebook.
spark

In [2]:
# Directory the file stream source reads JSON files from.
input_path = "/home/jovyan/data/input"

# Allow the streaming reader to infer the schema from the input files instead
# of requiring an explicit schema.
spark.conf.set("spark.sql.streaming.schemaInference", True)

file_source_options = {
    # What to do with a file once it has been processed:
    # "archive" moves it to another folder, "delete" removes it;
    # leave the option unset to keep processed files in place.
    "cleanSource": "archive",
    # Destination for processed files when cleanSource is "archive".
    "sourceArchiveDir": "/home/jovyan/data/input_processed",
    # Number of files consumed per micro-batch; without it each batch
    # picks up as many files as possible.
    "maxFilesPerTrigger": 1,
}

streaming_df = (
    spark.readStream
    .format("json")
    .options(**file_source_options)
    .load(input_path)
)

streaming_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)



In [3]:
from pyspark.sql.functions import explode, col

# Produce one row per element of the `data.devices` array.
# The exploded frame gets its own name instead of rebinding `streaming_df`:
# rebinding made this cell non-idempotent, because re-running it exploded the
# already-exploded frame again and duplicated every device row.
streaming_df_exploded = streaming_df.withColumn("data_devices", explode("data.devices"))

# Promote each field of the exploded device struct to a top-level column, then
# drop the nested columns so the result is flat (required by the CSV sink).
streaming_df_flatten = (
    streaming_df_exploded
    .drop("data")
    .withColumn("deviceId", col("data_devices.deviceId"))
    .withColumn("measure", col("data_devices.measure"))
    .withColumn("status", col("data_devices.status"))
    .withColumn("temperature", col("data_devices.temperature"))
    .drop("data_devices")
)


In [None]:
# Write the flattened device records as CSV files in append mode.
# The checkpoint location stores the query's progress so that a restarted
# query can resume from it.
query = (
    streaming_df_flatten
    .writeStream
    .format("csv")
    .outputMode("append")
    .option("path", "/home/jovyan/data/output/output.device.csv")
    .option("checkpointLocation", "/home/jovyan/data/checkpoint")
    .start()
)

try:
    # Blocks this cell until the query terminates or the cell is interrupted.
    query.awaitTermination()
finally:
    # Keep a handle and always stop the query: interrupting the cell only
    # unblocks Python, so without this the query would keep running in the
    # background and hold on to the checkpoint directory.
    query.stop()

In [None]:
'''
    Checkpoint:
        
        metadata file: stores the id of the streaming query that is currently running.
                       If the checkpoint directory is removed, a new id will be created.
                       If a new job resumes from the existing checkpoint, the id remains the same and only a new run id is created.
                       
        sources folder: keeps track of the data that has been consumed from the stream in each batch (by batch id).
                        When a new file is processed, another file is created here with the details of the processed file.
                        
        offset: keeps track of what has been processed by the stream.
                When consuming from Kafka, the offset of the last message processed is stored here, in the logOffset property.
        
        commits: confirmation from Spark that the data read for an offset has been processed.
        
        
        Spark uses sources to keep track of what it has seen from the source.
        Once it has read the data, it adds a file to offset.
        Once it has processed the data, it adds a file to commits.
        
        If a file has been read (it is listed in sources) but not processed (it is not listed in commits), putting it in the source location again will not cause it to be read again.
        If we want to process it again, we should delete its record from the sources files
        OR
        we can rename the file and add it again to the input folder.
                 
'''