# Spark Streaming


## Aggregating streamed sales with a Flypipe node

In [1]:
import shutil

# Start from a clean slate: drop the output table, then remove the table's
# warehouse directory, the stream checkpoints and the generated JSON input.
spark.sql("drop table if exists total_sales")

for stale_path in (
    "/spark-warehouse/total_sales",
    "./tmp/stream/_checkpoints",
    "./tmp/stream/json",
):
    # ignore_errors=True: a path that does not exist is not a failure here.
    shutil.rmtree(stale_path, ignore_errors=True)

In [2]:
import json
from time import time
import os
import uuid
import random

# Directory where the generated sale JSON files are written.
JSON_LOCATION = "./tmp/stream/json"

# Create the directory up front; exist_ok=True avoids an error if it already exists.
os.makedirs(JSON_LOCATION, exist_ok=True)

def add_sale(quantity, location=None):
    """Write `quantity` randomly generated sale records, one JSON file each.

    Every sale is stored in its own file named ``<sale_id>.json`` and the
    path of each file written is printed.

    Args:
        quantity: Number of sale files to create.
        location: Directory to write the files into. Defaults to the
            module-level ``JSON_LOCATION`` when omitted. The directory must
            already exist.
    """
    target_dir = JSON_LOCATION if location is None else location

    for _ in range(quantity):
        sale_id = str(uuid.uuid4())
        sale_path = f"{target_dir}/{sale_id}.json"

        data = {
            'sale_id': sale_id,
            'product_id': random.randrange(1, 5, 1),
            'price': random.randrange(100, 1000, 1),
            'quantity': random.randrange(1, 10, 1),
            'sale_datetime': int(time())
        }

        # The context manager flushes and closes the file as soon as the
        # record is written, instead of leaving the handle open until it is
        # garbage-collected.
        with open(sale_path, "w") as out_file:
            json.dump(data, out_file)

        print(f"Added {sale_path}")
        
# Generate 10 sample sale files.
add_sale(10)


Added ./tmp/stream/json/1c967e6a-37ec-4778-a7c0-f9d853e00c4a.json
Added ./tmp/stream/json/fbce3dc5-afcd-43cc-a68e-b1aaf2347e16.json
Added ./tmp/stream/json/4cb9762b-3b31-4da5-ba9e-164f3adbfa35.json
Added ./tmp/stream/json/0714f0c1-48d4-4e46-9818-6f5b0f5ff1a2.json
Added ./tmp/stream/json/37a55a44-69a3-4738-aa30-b368240522b6.json
Added ./tmp/stream/json/4a0cd365-ebfe-411c-8da2-e7137c2d8f2a.json
Added ./tmp/stream/json/c63d4fa9-e130-4f92-adc8-d86e0ee5dc1f.json
Added ./tmp/stream/json/58bcd6e3-9a74-4472-a044-a68d2e7a67c1.json
Added ./tmp/stream/json/e7e12635-2dbc-447b-8358-835e5ddf382f.json
Added ./tmp/stream/json/7240f18e-35a7-4372-b66c-7d8426d157bf.json


In [3]:
from flypipe import node
from flypipe.schema import Schema, Column
from flypipe.schema.types import Decimal, String
from flypipe.datasource.spark import Spark
import pyspark.sql.functions as F
@node(
    type="pyspark",
    dependencies=[Spark("sales")],
    output=Schema(
        Column("product_id", String(), "product identifier"),
        Column("total_sales", Decimal(18, 2), "total sales amount"),
    ),
)
def total_sales_node(sales):
    """Aggregate sales into one total amount per product.

    Groups the `sales` dataframe by `product_id` and sums
    `price * quantity` into a `total_sales` column.
    """
    # Amount of a single sale row.
    line_amount = F.col("price") * F.col("quantity")
    return sales.groupBy("product_id").agg(F.sum(line_amount).alias("total_sales"))

In [4]:
from pyspark.sql.types import StructType, ArrayType, StructField, StringType, DecimalType, IntegerType, TimestampType


def total_sales(batch_df, batch_id):
    """Handle one micro-batch: aggregate it and save it to `total_sales`.

    Displays the incoming batch, runs `total_sales_node` with the batch
    supplied as its `sales` input, writes the result to the Delta table
    `total_sales`, and returns the aggregated dataframe.

    Args:
        batch_df: Dataframe holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch (not used here).
    """
    print("Batch dataframe received:")
    display(batch_df)

    # Feed the batch to the node in place of its declared `sales` dependency.
    node_inputs = {Spark("sales"): batch_df}
    total_sales_df = total_sales_node.run(inputs=node_inputs)

    print("===> Saving dataframe calculated with node `total_sales_node` into table `total_sales`")

    # NOTE(review): 'overwrite' replaces the table contents on every batch, so
    # presumably totals from earlier batches are not carried over - confirm
    # this is intended.
    table_writer = total_sales_df.write.format('delta').mode('overwrite')
    table_writer.saveAsTable("total_sales")

    return total_sales_df




# Create Stream
# Explicit schema for the sale JSON files, given as (column name, Spark type)
# pairs; every column is nullable.
json_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("sale_id", StringType()),
        ("product_id", StringType()),
        ("price", DecimalType(18,2)),
        ("quantity", IntegerType()),
        ("sale_datetime", TimestampType()),
    )
])


# Start the stream and keep the query handle so its completion can be awaited.
# Each micro-batch read from JSON_LOCATION is handed to `total_sales`.
sales_stream_query = (
  spark
  .readStream
  .json(JSON_LOCATION, schema=json_schema)
  .writeStream
  .trigger(availableNow=True)
  .option("checkpointLocation", "./tmp/stream/_checkpoints/")
  .foreachBatch(total_sales)
  .start()
)

# Waiting process
from time import sleep  # kept in scope in case later cells rely on it

# With the availableNow trigger the query processes the data that is currently
# available and then stops. awaitTermination() blocks until that happens and
# raises if the query failed, rather than polling the table in a loop that
# swallows every exception and never ends when the stream breaks.
sales_stream_query.awaitTermination()

Batch dataframe received:


sale_id,product_id,price,quantity,sale_datetime
1c967e6a-37ec-477...,1,293.0,3,2022-11-29 06:49:29
fbce3dc5-afcd-43c...,2,126.0,4,2022-11-29 06:49:29
4cb9762b-3b31-4da...,1,704.0,9,2022-11-29 06:49:29
0714f0c1-48d4-4e4...,1,301.0,1,2022-11-29 06:49:29
37a55a44-69a3-473...,2,526.0,7,2022-11-29 06:49:29
4a0cd365-ebfe-411...,3,290.0,5,2022-11-29 06:49:29
c63d4fa9-e130-4f9...,4,142.0,8,2022-11-29 06:49:29
58bcd6e3-9a74-447...,2,182.0,8,2022-11-29 06:49:29
e7e12635-2dbc-447...,1,760.0,3,2022-11-29 06:49:29
7240f18e-35a7-437...,2,300.0,7,2022-11-29 06:49:29


===> Saving dataframe calculated with node `total_sales_node` into table `total_sales`


                                                                                

In [5]:
# Show the aggregated totals saved in the `total_sales` table.
display(spark.sql("select * from total_sales"))

product_id,total_sales
3,1450.0
1,9796.0
4,1136.0
2,7742.0
