In [1]:
#Carga de librerias
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, to_timestamp, window, expr, sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [2]:
# Create (or reuse) the Spark session that drives this streaming demo.
spark = (
    SparkSession.builder
    .appName("Tumbling Window Demo")
    .master("local[3]")  # local mode with 3 worker threads
    .config("spark.streaming.stopGracefullyOnShutdown", "true")
    .config("spark.sql.shuffle.partitions", 2)  # keep the shuffle small for the windowed aggregation
    .getOrCreate()
)

In [3]:
# Schema used to parse the JSON payload received from Kafka.
# 'CreatedTime' is read as a plain string here; it is converted to a timestamp in a later cell.
stock_schema = (
    StructType()
    .add("CreatedTime", StringType())
    .add("Type", StringType())
    .add("Amount", IntegerType())
    .add("BrokerCode", StringType())
)

In [4]:
# Streaming reader for the Kafka topic 'trades' on the local broker.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "trades")
    .option("startingOffsets", "earliest")  # start from the oldest available offsets
    .load()
)

In [5]:
# Cast the Kafka 'value' column to a string and parse it as JSON with stock_schema;
# the parsed struct keeps the column name 'value'.
value_df = kafka_df.select(
    from_json(col("value").cast("string"), stock_schema).alias("value")
)

In [6]:
# Flatten the parsed struct, convert 'CreatedTime' to a timestamp, and split 'Amount'
# into separate 'Buy' / 'Sell' columns (0 when the row is of the other type).
trade_df = (
    value_df.select("value.*")
    .withColumn("CreatedTime", to_timestamp(col("CreatedTime"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("Buy", expr("case when Type == 'BUY' then Amount else 0 end"))
    .withColumn("Sell", expr("case when Type == 'SELL' then Amount else 0 end"))
)

In [7]:
# Aggregate Buy/Sell totals over 15-minute windows keyed on 'CreatedTime' (no watermark).
# Note: the next cell rebinds window_agg_df to a watermarked version of this aggregation.
# `sum` is pyspark.sql.functions.sum from the imports cell, not the Python builtin.
window_agg_df = (
    trade_df
    .groupBy(window(col("CreatedTime"), "15 minute"))
    .agg(
        sum("Buy").alias("TotalBuy"),
        sum("Sell").alias("TotalSell"),
    )
)

In [8]:
# Aggregate Buy/Sell totals over 15-minute windows keyed on 'CreatedTime'.
# A 30-minute watermark on 'CreatedTime' sets the expiry limit for the data,
# measured back from the maximum 'CreatedTime' seen so far.
window_agg_df = (
    trade_df
    .withWatermark("CreatedTime", "30 minute")
    .groupBy(window(col("CreatedTime"), "15 minute"))
    .agg(
        sum("Buy").alias("TotalBuy"),
        sum("Sell").alias("TotalSell"),
    )
)

In [9]:
# Pick the columns to display: window bounds plus the aggregated totals.
output_df = window_agg_df.select(
    "window.start",
    "window.end",
    "TotalBuy",
    "TotalSell",
)

In [10]:
# Start the streaming query: print results to the console in "update" mode,
# triggering a micro-batch every 30 seconds and checkpointing to a local directory.
window_query = (
    output_df.writeStream
    .format("console")
    .outputMode("update")
    .option("checkpointLocation", "./checkpoint/tumbling-window")
    .trigger(processingTime="30 second")
    .start()
)

In [None]:
# Block this cell until the streaming query started above terminates.
window_query.awaitTermination()