# Spark Streaming


## Aggregating streamed sales with a Flypipe node

In [1]:
import json
from time import time
import os
import uuid
import random

# Landing directory for the generated sales; each sale is stored as its own
# JSON file and the streaming reader further down reads from this location.
JSON_LOCATION = "./tmp/stream/json"

os.makedirs(JSON_LOCATION, exist_ok=True)


def add_sale(quantity):
    """Write ``quantity`` randomly generated sale records into ``JSON_LOCATION``.

    Every sale is saved to its own file named ``<sale_id>.json`` containing:

    * ``sale_id``: random UUID4 string (also used as the file name)
    * ``product_id``: random integer from 1 to 4
    * ``price``: random integer from 100 to 999
    * ``quantity``: random integer from 1 to 9
    * ``sale_datetime``: current Unix time in whole seconds

    Args:
        quantity: number of sale files to create; 0 creates nothing.

    Returns:
        None
    """
    for _ in range(quantity):
        sale_id = str(uuid.uuid4())

        data = {
            'sale_id': sale_id,
            'product_id': random.randrange(1, 5, 1),
            'price': random.randrange(100, 1000, 1),
            'quantity': random.randrange(1, 10, 1),
            'sale_datetime': int(time())
        }

        # The context manager guarantees the file is flushed and closed before
        # the next sale is generated, so a reader never sees a partial file
        # and no file handle is leaked.
        with open(f"{JSON_LOCATION}/{sale_id}.json", "w") as out_file:
            json.dump(data, out_file)


add_sale(1)


In [2]:
import shutil

# Start from a clean slate: drop the target table, then delete its warehouse
# files and the streaming checkpoint so a re-run processes every JSON file again.
spark.sql("drop table if exists total_sales")

for stale_dir in ("/spark-warehouse/total_sales", "./tmp/stream/_checkpoints"):
    # ignore_errors=True: a directory that does not exist yet is not a problem.
    shutil.rmtree(stale_dir, ignore_errors=True)

In [3]:
from flypipe import node
from flypipe.schema import Schema, Column
from flypipe.schema.types import Decimal, String
from flypipe.datasource.spark import Spark
import pyspark.sql.functions as F
@node(
    type="pyspark",
    dependencies=[Spark("sales")],
    output=Schema(
        Column("product_id", String(), "product identifier"),
        Column("total_sales", Decimal(18, 2), "total sales amount"),
    ),
)
def total_sales_node(sales):
    """Sum ``price * quantity`` of the ``sales`` rows for each ``product_id``.

    Args:
        sales: dataframe supplied for the ``Spark("sales")`` dependency; the
            code reads its ``product_id``, ``price`` and ``quantity`` columns.

    Returns:
        Dataframe with one row per ``product_id`` and its ``total_sales``.
    """
    line_amount = F.col("price") * F.col("quantity")
    summed_amount = F.sum(line_amount).alias("total_sales")
    return sales.groupBy("product_id").agg(summed_amount)

In [4]:
from pyspark.sql.types import StructType, ArrayType, StructField, StringType, DecimalType, IntegerType, TimestampType


def total_sales(batch_df, batch_id):
    """Handle one streaming micro-batch (used as the ``foreachBatch`` callback).

    Displays the incoming batch, runs ``total_sales_node`` with the batch
    supplied as its ``Spark("sales")`` input, displays the aggregated result
    and appends it to the Delta table ``total_sales``.

    Args:
        batch_df: dataframe holding the rows of the current micro-batch.
        batch_id: identifier of the micro-batch; not used by this function.

    Returns:
        The aggregated dataframe that was appended to ``total_sales``.
    """
    print("Batch dataframe received:")
    display(batch_df)

    # Feed the batch to the node in place of its declared Spark("sales") source.
    node_inputs = {Spark("sales"): batch_df}
    aggregated_df = total_sales_node.run(inputs=node_inputs)

    print("Total sales dataframe:")
    display(aggregated_df)

    # Append mode: every micro-batch adds its own aggregated rows to the table.
    table_writer = aggregated_df.write.format('delta').mode('append')
    table_writer.saveAsTable("total_sales")

    return aggregated_df




# Create Stream
# Explicit schema for the sale JSON files; every field is declared nullable.
json_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("sale_id", StringType()),
        ("product_id", StringType()),
        ("price", DecimalType(18, 2)),
        ("quantity", IntegerType()),
        ("sale_datetime", TimestampType()),
    )
])


# Start the stream and keep the query handle so we can wait on it.
# With trigger(availableNow=True) the query processes everything currently in
# JSON_LOCATION and then stops by itself.
streaming_query = (
    spark
    .readStream
    .json(JSON_LOCATION, schema=json_schema)
    .writeStream
    .trigger(availableNow=True)
    .option("checkpointLocation", "./tmp/stream/_checkpoints/")
    .foreachBatch(total_sales)
    .start()
)

# Wait for the stream to finish
# Retained in case later cells rely on `sleep` being imported here.
from time import sleep

# Block until every available batch has been handled. Unlike polling for the
# `total_sales` table and swallowing every exception, this returns only once
# the query has stopped, surfaces a streaming failure as an exception instead
# of looping forever, and also returns when there was nothing to process.
streaming_query.awaitTermination()

Batch dataframe received:


sale_id,product_id,price,quantity,sale_datetime
802c8edf-5204-41a...,4,683.0,3,2022-11-29 05:34:34
eb4eeb9d-0db3-4c8...,4,923.0,5,2022-11-29 05:34:59
2902cdc7-8601-483...,1,498.0,1,2022-11-29 05:54:39
7462eca8-1c82-480...,4,586.0,3,2022-11-29 05:56:15
8c16c051-a0c7-482...,2,654.0,4,2022-11-29 05:58:22
c24d8880-dbba-480...,4,442.0,3,2022-11-29 06:00:21
554de741-18bc-489...,3,638.0,3,2022-11-29 06:19:20
ed7f28ca-6221-4dc...,2,873.0,5,2022-11-29 06:20:00
4ecb2cc1-fcf9-45f...,1,155.0,3,2022-11-29 06:35:47
02e40608-afcd-4d4...,2,873.0,3,2022-11-29 06:39:05


Total sales dataframe:


                                                                                

product_id,total_sales
3,5860.0
1,2457.0
4,13692.0
2,14484.0


                                                                                

In [5]:
# Show the rows that the stream appended to the `total_sales` table.
persisted_total_sales = spark.sql("select * from total_sales")
display(persisted_total_sales)

product_id,total_sales
2,14484.0
3,5860.0
4,13692.0
1,2457.0
