# Spark Streaming


## Aggregating streamed sales with a Flypipe node

In [1]:
import shutil

# Start from a clean slate: drop the output table, then delete directories
# left over from a previous run. Missing directories are ignored.
spark.sql("drop table if exists total_sales")

for stale_path in (
    "/spark-warehouse/total_sales",   # presumably where the table's files are stored; verify
    "./tmp/stream/_checkpoints",      # checkpoint location used by the streaming query
    "./tmp/stream/json",              # generated JSON input files
):
    shutil.rmtree(stale_path, ignore_errors=True)

In [2]:
import json
from time import time
import os
import uuid
import random

# Directory where add_sale writes one JSON file per sale; the stream reads from it.
JSON_LOCATION = "./tmp/stream/json"

# Create the directory (and any missing parents); a pre-existing directory is fine.
os.makedirs(JSON_LOCATION, exist_ok=True)

def add_sale(quantity):
    """Write ``quantity`` randomly generated sale records as JSON files.

    Each record goes into its own file, ``{JSON_LOCATION}/{sale_id}.json``,
    where ``sale_id`` is a fresh UUID4 string. A record contains:

    * ``sale_id``: the UUID4 string used in the file name
    * ``product_id``: random integer from 1 to 4
    * ``price``: random integer from 100 to 999
    * ``quantity``: random integer from 1 to 9
    * ``sale_datetime``: current time as whole epoch seconds

    Args:
        quantity: number of sale files to create.
    """
    for _ in range(quantity):
        sale_id = str(uuid.uuid4())
        out_path = f"{JSON_LOCATION}/{sale_id}.json"

        data = {
            'sale_id': sale_id,
            'product_id': random.randrange(1, 5, 1),
            'price': random.randrange(100, 1000, 1),
            'quantity': random.randrange(1, 10, 1),
            'sale_datetime': int(time())
        }

        # The context manager flushes and closes the file as soon as the record
        # is written, so no handle is leaked and the file is complete on disk
        # before the next step runs.
        with open(out_path, "w") as out_file:
            json.dump(data, out_file)

        print(f"Added {out_path}")
        
# Seed the input directory with ten sale files for the stream to pick up.
add_sale(quantity=10)


Added ./tmp/stream/json/54b2a9b8-dafd-4ca0-8629-30760ac91c3b.json
Added ./tmp/stream/json/9d29b975-f35e-4239-9a7b-d2a6d8b5e820.json
Added ./tmp/stream/json/e1b29230-e96c-4068-a707-4d0c4286b7c6.json
Added ./tmp/stream/json/0c346cc4-2bec-4454-b1be-bf9b17f86310.json
Added ./tmp/stream/json/12643850-ff04-43d4-a5fc-4b35d58b9269.json
Added ./tmp/stream/json/697e9e13-094f-4111-af71-5b059e52eac6.json
Added ./tmp/stream/json/11b3f107-de61-4427-bad6-e888b22155e2.json
Added ./tmp/stream/json/073b6134-af33-48cf-90ee-fea72e3c48ed.json
Added ./tmp/stream/json/db8cb76b-3a2c-41e7-ae4d-527a84748555.json
Added ./tmp/stream/json/fb33ae14-8c25-4ce3-8fac-ccdf7a6070ca.json


In [3]:
from flypipe import node
from flypipe.schema import Schema, Column
from flypipe.schema.types import Decimal, String
from flypipe.datasource.spark import Spark
import pyspark.sql.functions as F
@node(
    type="pyspark",
    dependencies=[Spark("sales")],
    output=Schema(
        Column("product_id", String(), "product identifier"),
        Column("total_sales", Decimal(18, 2), "total sales amount"),
    ),
)
def total_sales_node(sales):
    """Aggregate sales into one row per product.

    Groups ``sales`` by ``product_id`` and sums ``price * quantity`` over each
    group; the sum is exposed as the ``total_sales`` column.
    """
    sale_amount = F.col("price") * F.col("quantity")
    return sales.groupBy("product_id").agg(F.sum(sale_amount).alias("total_sales"))

In [4]:
from pyspark.sql.types import StructType, ArrayType, StructField, StringType, DecimalType, IntegerType, TimestampType


def total_sales(batch_df, batch_id):
    """Process one micro-batch: compute total sales and store them in a table.

    The batch is fed to ``total_sales_node`` as its ``Spark("sales")`` input,
    and the node's result is written in Delta format to the ``total_sales``
    table. The write uses ``overwrite`` mode, so each call replaces the
    table's contents with the totals of the batch it received.

    Args:
        batch_df: dataframe holding the rows of the current micro-batch.
        batch_id: second argument of the batch callback; not used here.

    Returns:
        The dataframe produced by ``total_sales_node``.
    """
    print("Batch dataframe received:")
    display(batch_df)

    # Substitute the node's "sales" source with the streamed batch.
    node_inputs = {Spark("sales"): batch_df}
    total_sales_df = total_sales_node.run(inputs=node_inputs)

    print("Total sales dataframe:")
    display(total_sales_df)

    delta_writer = total_sales_df.write.format('delta').mode('overwrite')
    delta_writer.saveAsTable("total_sales")

    return total_sales_df




# Create Stream
# Explicit schema applied when reading the sale JSON files; every field is nullable.
json_schema = (
    StructType()
    .add("sale_id", StringType(), True)
    .add("product_id", StringType(), True)
    .add("price", DecimalType(18, 2), True)
    .add("quantity", IntegerType(), True)
    .add("sale_datetime", TimestampType(), True)
)


# Start the stream: read the JSON files with the explicit schema and hand every
# micro-batch to total_sales. The query handle is kept so the cell can block on
# it instead of discarding it.
query = (
  spark
  .readStream
  .json(JSON_LOCATION, schema=json_schema)
  .writeStream
  .trigger(availableNow=True)
  .option("checkpointLocation", "./tmp/stream/_checkpoints/")
  .foreachBatch(total_sales)
  .start()
)

# Waiting process
from time import sleep  # no longer needed by this cell; kept in case other cells rely on it

# Block until the query has terminated. Unlike polling for the table inside a
# catch-all retry loop, this cannot spin forever when the stream fails: a
# failure in the query is raised here instead of being silently swallowed.
query.awaitTermination()

Batch dataframe received:


sale_id,product_id,price,quantity,sale_datetime
54b2a9b8-dafd-4ca...,2,806.0,8,2022-11-29 06:45:42
9d29b975-f35e-423...,1,458.0,1,2022-11-29 06:45:42
e1b29230-e96c-406...,4,845.0,6,2022-11-29 06:45:42
0c346cc4-2bec-445...,4,744.0,8,2022-11-29 06:45:42
12643850-ff04-43d...,3,649.0,3,2022-11-29 06:45:42
697e9e13-094f-411...,2,341.0,6,2022-11-29 06:45:42
11b3f107-de61-442...,1,794.0,4,2022-11-29 06:45:42
073b6134-af33-48c...,4,185.0,2,2022-11-29 06:45:42
db8cb76b-3a2c-41e...,3,427.0,3,2022-11-29 06:45:42
fb33ae14-8c25-4ce...,4,972.0,8,2022-11-29 06:45:42


Total sales dataframe:


                                                                                

product_id,total_sales
3,3228.0
1,3634.0
4,19168.0
2,8494.0


                                                                                

In [5]:
# Read the table written by the stream back and render it.
total_sales_table = spark.sql("select * from total_sales")
display(total_sales_table)

product_id,total_sales
3,3228.0
1,3634.0
4,19168.0
2,8494.0
