# Spark Streaming


## Aggregating streamed JSON sales into a `total_sales` table

In [1]:
import shutil

# Start from a clean slate: drop the output table, then delete its warehouse
# files, the stream checkpoints and any previously generated JSON input.
# NOTE(review): `spark` is not defined in this notebook — presumably the
# session is supplied by the kernel; confirm.
spark.sql("drop table if exists total_sales")
for stale_path in (
    "/spark-warehouse/total_sales",
    "./tmp/stream/_checkpoints",
    "./tmp/stream/json",
):
    # ignore_errors=True: a missing directory is not an error.
    shutil.rmtree(stale_path, ignore_errors=True)

In [2]:
import json
from time import time
import os
import uuid
import random
from pprint import pprint

# Directory the generated sale JSON files are written to; the streaming
# reader further down uses the same location as its source.
JSON_LOCATION = "./tmp/stream/json"

# Create the directory (and parents) up front; no error if it already exists.
os.makedirs(JSON_LOCATION, exist_ok=True)

def add_sale(quantity, location=None):
    """Write `quantity` randomly generated sale records, one JSON file each.

    Every record gets a fresh UUID as ``sale_id`` and is stored as
    ``<location>/<sale_id>.json``. The file path and the record are printed
    after each write.

    Args:
        quantity: Number of sale files to create.
        location: Directory to write into. Defaults to the module-level
            ``JSON_LOCATION``. The directory must already exist.

    Returns:
        None.
    """
    target_dir = JSON_LOCATION if location is None else location

    for _ in range(quantity):
        sale_id = str(uuid.uuid4())

        data = {
            'sale_id': sale_id,
            'product_id': random.randrange(1, 5, 1),
            'price': random.randrange(100, 1000, 1),
            'quantity': random.randrange(1, 10, 1),
            'sale_datetime': int(time())  # epoch seconds
        }

        # The context manager flushes and closes the file before we move on,
        # so a reader never sees a partially written or still-open file.
        with open(f"{target_dir}/{sale_id}.json", "w") as out_file:
            json.dump(data, out_file)

        print(f"\nAdded {target_dir}/{sale_id}.json")
        pprint(data)
        
        
        
# Generate five sample sale files for the stream to pick up.
add_sale(5)



Added ./tmp/stream/json/8e86d225-fd46-46c0-972b-98f671e21439.json
{'price': 220,
 'product_id': 1,
 'quantity': 2,
 'sale_datetime': 1669704769,
 'sale_id': '8e86d225-fd46-46c0-972b-98f671e21439'}

Added ./tmp/stream/json/7bda9211-d603-4efa-aea2-c7cf34ec3baa.json
{'price': 802,
 'product_id': 3,
 'quantity': 1,
 'sale_datetime': 1669704769,
 'sale_id': '7bda9211-d603-4efa-aea2-c7cf34ec3baa'}

Added ./tmp/stream/json/0306e840-3a1f-4255-b470-477bda4ba014.json
{'price': 600,
 'product_id': 2,
 'quantity': 2,
 'sale_datetime': 1669704769,
 'sale_id': '0306e840-3a1f-4255-b470-477bda4ba014'}

Added ./tmp/stream/json/f52fc13e-4368-4947-a165-e12768273934.json
{'price': 583,
 'product_id': 4,
 'quantity': 2,
 'sale_datetime': 1669704769,
 'sale_id': 'f52fc13e-4368-4947-a165-e12768273934'}

Added ./tmp/stream/json/2e855c7c-1098-443f-8fa8-ca73ae97eae9.json
{'price': 128,
 'product_id': 2,
 'quantity': 1,
 'sale_datetime': 1669704769,
 'sale_id': '2e855c7c-1098-443f-8fa8-ca73ae97eae9'}


In [3]:
from flypipe import node
from flypipe.schema import Schema, Column
from flypipe.schema.types import Decimal, String
from flypipe.datasource.spark import Spark
import pyspark.sql.functions as F
@node(
    type="pyspark",
    dependencies=[Spark("sales")],
    output=Schema(
        Column("product_id", String(), "product identifier"),
        Column("total_sales", Decimal(18, 2), "total sales amount"),
    ),
)
def total_sales_node(sales):
    """Sum ``price * quantity`` per ``product_id``.

    Args:
        sales: Dataframe with ``product_id``, ``price`` and ``quantity`` columns.

    Returns:
        Dataframe with one row per ``product_id`` and its ``total_sales``.
    """
    line_amount = F.col("price") * F.col("quantity")
    per_product = sales.groupBy("product_id")
    return per_product.agg(F.sum(line_amount).alias("total_sales"))

In [4]:
from pyspark.sql.types import StructType, ArrayType, StructField, StringType, DecimalType, IntegerType, TimestampType


def total_sales(batch_df, batch_id):
    """Aggregate one micro-batch with `total_sales_node` and persist the result.

    The incoming batch is displayed, run through `total_sales_node` as its
    ``sales`` input, and the outcome overwrites the Delta table ``total_sales``.

    Args:
        batch_df: Dataframe holding the rows of the current micro-batch.
        batch_id: Batch identifier; accepted but not used here.

    Returns:
        The aggregated dataframe that was written to the table.
    """
    print("Batch dataframe received:")
    display(batch_df)

    # Feed the batch to the node in place of its declared `sales` dependency.
    node_inputs = {Spark("sales"): batch_df}
    aggregated_df = total_sales_node.run(inputs=node_inputs)

    print("===> Saving dataframe calculated with node `total_sales_node` into table `total_sales`")

    # Overwrite mode: the table content is replaced on every call.
    table_writer = aggregated_df.write.format('delta').mode('overwrite')
    table_writer.saveAsTable("total_sales")

    return aggregated_df




# Create Stream
# Explicit schema for the JSON sale files (streaming file sources need one).
json_schema = StructType([
    StructField("sale_id", StringType(), True),
    StructField("product_id", StringType(), True),
    StructField("price", DecimalType(18,2), True),
    StructField("quantity", IntegerType(), True),
    StructField("sale_datetime", TimestampType(), True),
])


# Keep the StreamingQuery handle so completion and failures can be observed,
# instead of discarding it and guessing from side effects.
sales_query = (
  spark
  .readStream
  .json(JSON_LOCATION, schema=json_schema)
  .writeStream
  .trigger(availableNow=True)
  .option("checkpointLocation", "./tmp/stream/_checkpoints/")
  .foreachBatch(total_sales)
  .start()
)

# Waiting process
# `sleep` is no longer needed here; the import is kept in case other cells rely on it.
from time import sleep

# With the availableNow trigger the query processes the data currently
# available and then stops on its own. awaitTermination() blocks until that
# happens and re-raises the error if the query failed, so the notebook no
# longer spins forever in a loop that swallowed every exception.
sales_query.awaitTermination()

Batch dataframe received:


sale_id,product_id,price,quantity,sale_datetime
8e86d225-fd46-46c...,1,220.0,2,2022-11-29 06:52:49
7bda9211-d603-4ef...,3,802.0,1,2022-11-29 06:52:49
0306e840-3a1f-425...,2,600.0,2,2022-11-29 06:52:49
f52fc13e-4368-494...,4,583.0,2,2022-11-29 06:52:49
2e855c7c-1098-443...,2,128.0,1,2022-11-29 06:52:49


===> Saving dataframe calculated with node `total_sales_node` into table `total_sales`


                                                                                

In [5]:
# Show the contents of the `total_sales` table written by the stream's batch function.
display(spark.sql("select * from total_sales"))

product_id,total_sales
2,1328.0
3,802.0
1,440.0
4,1166.0
