In [1]:
# from pyspark.context import SparkContext
# from pyspark.sql.session import SparkSession

# spark = SparkSession\
#         .builder \
#         .appName("File Streaming Demo") \
#         .master("local[3]") \
#         .config("spark.streaming.stopGracefullyOnShutdown", "true") \
#         .getOrCreate()

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

In [3]:
# Create the Spark session for this demo, running locally with 3 threads.
# Schema inference is not enabled by default for streaming sources, so it is
# switched on explicitly here; the streaming JSON reader is given no explicit
# schema and relies on it.
spark = (
    SparkSession.builder
    .appName("File Streaming Demo")
    .master("local[3]")
    .config("spark.streaming.stopGracefullyOnShutdown", "true")
    .config("spark.sql.streaming.schemaInference", "true")
    .getOrCreate()
)

In [4]:
# Open a streaming reader over the JSON files in the input directory.
# maxFilesPerTrigger=1 limits each trigger to processing a single file.
raw_df = (
    spark.readStream
    .format("json")
    .option("path", "./data/streaming/input/")
    .option("maxFilesPerTrigger", 1)
    .load()
)

In [5]:
# Inspect the schema of the streaming DataFrame (no explicit schema was
# supplied to the reader, so this shows what was inferred from the JSON input).
raw_df.printSchema()

root
 |-- CESS: double (nullable = true)
 |-- CGST: double (nullable = true)
 |-- CashierID: string (nullable = true)
 |-- CreatedTime: long (nullable = true)
 |-- CustomerCardNo: string (nullable = true)
 |-- CustomerType: string (nullable = true)
 |-- DeliveryAddress: struct (nullable = true)
 |    |-- AddressLine: string (nullable = true)
 |    |-- City: string (nullable = true)
 |    |-- ContactNumber: string (nullable = true)
 |    |-- PinCode: string (nullable = true)
 |    |-- State: string (nullable = true)
 |-- DeliveryType: string (nullable = true)
 |-- InvoiceLineItems: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- ItemCode: string (nullable = true)
 |    |    |-- ItemDescription: string (nullable = true)
 |    |    |-- ItemPrice: double (nullable = true)
 |    |    |-- ItemQty: long (nullable = true)
 |    |    |-- TotalValue: double (nullable = true)
 |-- InvoiceNumber: string (nullable = true)
 |-- NumberOfItems: long (nullable = t

In [6]:
# Build a flat projection of the JSON records: keep the invoice header fields,
# lift City / State / PinCode out of the DeliveryAddress struct, and explode
# InvoiceLineItems into a LineItem column (one output row per array element).
explode_df = raw_df.selectExpr(
    "InvoiceNumber",
    "CreatedTime",
    "StoreID",
    "PosID",
    "CustomerType",
    "PaymentMethod",
    "DeliveryType",
    "DeliveryAddress.City",
    "DeliveryAddress.State",
    "DeliveryAddress.PinCode",
    "explode(InvoiceLineItems) as LineItem",
)

In [7]:
# Promote each field of the LineItem struct to a top-level column with the
# same name, then drop the struct column now that its contents are flattened.
flattened_df = (
    explode_df
    .withColumn("ItemCode", expr("LineItem.ItemCode"))
    .withColumn("ItemDescription", expr("LineItem.ItemDescription"))
    .withColumn("ItemPrice", expr("LineItem.ItemPrice"))
    .withColumn("ItemQty", expr("LineItem.ItemQty"))
    .withColumn("TotalValue", expr("LineItem.TotalValue"))
    .drop("LineItem")
)

In [None]:
# Write the flattened rows out as JSON files in append mode. The
# processing-time trigger sets how often the query runs (every minute), and
# the checkpoint location is where the query keeps its progress.
invoiceWriterQuery = (
    flattened_df.writeStream
    .format("json")
    .queryName("Flattened Invoice Writer")
    .outputMode("append")
    .option("path", "./data/streaming/output/")
    .option("checkpointLocation", "./checkpoints/checkpoint-stream-file")
    .trigger(processingTime="1 minute")
    .start()
)
try:
    # Blocks this cell until the query stops or fails.
    invoiceWriterQuery.awaitTermination()
finally:
    # If the cell is interrupted (kernel interrupt) or the query fails, stop
    # the query so it does not keep running in the background; otherwise
    # re-running this cell would try to start a second query with the same
    # name and checkpoint while the first is still active.
    invoiceWriterQuery.stop()