<a href="https://colab.research.google.com/github/drmartins2/EDIT_DE/blob/main/5.%20Data%20Streaming/examples/example_3_api_json.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up PySpark

In [1]:
# Install PySpark into the environment of the running kernel (%pip targets the kernel's interpreter).
# NOTE(review): the version is not pinned, so a re-run may pick up a newer PySpark release.
%pip install pyspark



In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) a session that runs against the 'local' master, named 'Test streaming'.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Test streaming')
    .getOrCreate()
)

# Shortcut to the SparkContext behind that session.
sc = spark.sparkContext

In [3]:
# Start from a clean slate: delete the landing and bronze folders,
# then recreate the empty landing folder that ingest_from_api writes its JSON files into.
!rm -rf /content/landing
!rm -rf /content/bronze
!mkdir -p /content/landing

## Simulate producer:
- extract data from API
- store data as json in the lake
- run task async

In [4]:
import requests
from pyspark.sql.types import *
import json
import datetime
import asyncio


async def ingest_from_api(url: str, table: str, schema: StructType = None):
  """Fetch one JSON snapshot from `url` and store it in the landing folder.

  The payload is saved as /content/landing/<table>_<YYYYmmddHHMMSS>.json.

  Args:
    url: HTTP endpoint queried with GET.
    table: Prefix of the output file name.
    schema: Not used by this function; kept so existing calls stay valid.

  Returns:
    None. When the request fails, the status is not 200 or the body is not
    valid JSON, a message is printed and no file is written, so one bad poll
    does not abort the caller's loop.
  """
  import os  # local import: only needed for the atomic rename below

  try:
    # requests is blocking: run it on a worker thread so the event loop is not
    # stalled while the HTTP call is in flight. The timeout bounds a hung call.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(None, lambda: requests.get(url, timeout=30))
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    if response.status_code != 200:
      print(f"ingest_from_api: {url} returned HTTP {response.status_code}; nothing written")
      return
    data = response.json()
  except (requests.RequestException, ValueError) as exc:
    print(f"ingest_from_api: request to {url} failed: {exc}")
    return

  final_path = f"/content/landing/{table}_{timestamp}.json"
  # Write under a temporary dot-prefixed name first and rename once complete:
  # a streaming reader watching the folder must never see a half-written file.
  tmp_path = f"/content/landing/.{table}_{timestamp}.json.tmp"
  with open(tmp_path, "w") as f:
    json.dump(data, f)
  os.replace(tmp_path, final_path)

# Runs the API ingestion `loop` times, waiting `interval_time` seconds after each round
async def producer(loop: int, interval_time: int):
  """Poll the vehicles and lines endpoints repeatedly.

  Args:
    loop: Number of polling rounds.
    interval_time: Seconds to sleep after every round (including the last one).
  """
  # (url, table) pairs ingested in this order on every round.
  endpoints = (
    ("https://api.carrismetropolitana.pt/vehicles", "vehicles"),
    ("https://api.carrismetropolitana.pt/lines", "lines"),
  )
  for _ in range(loop):
    for endpoint_url, table_name in endpoints:
      await ingest_from_api(endpoint_url, table_name)
    await asyncio.sleep(interval_time)

# Executes the producer method for 10 times with an interval of 30 sec (Check asyncio?)
async def main():
  asyncio.create_task(producer(10, 30))

await main()

- Read from /content/landing as streaming
- store data in memory (for testing)
- store data in the bronze layer

In [5]:
from pyspark.sql.types import *

# Explicit schema for the vehicles JSON files; every column is nullable.
vehicle_schema = (
    StructType()
    .add('bearing', IntegerType(), True)
    .add('block_id', StringType(), True)
    .add('current_status', StringType(), True)
    .add('id', StringType(), True)
    .add('lat', FloatType(), True)
    .add('line_id', StringType(), True)
    .add('lon', FloatType(), True)
    .add('pattern_id', StringType(), True)
    .add('route_id', StringType(), True)
    .add('schedule_relationship', StringType(), True)
    .add('shift_id', StringType(), True)
    .add('speed', FloatType(), True)
    .add('stop_id', StringType(), True)
    .add('timestamp', TimestampType(), True)
    .add('trip_id', StringType(), True)
)

# Streaming reader over the landing folder: JSON files whose name starts with "vehicles",
# parsed with the explicit schema defined above.
stream = (
    spark.readStream
    .format("json")
    .schema(vehicle_schema)
    .load("/content/landing/vehicles*")
)

# Remove rows that are identical in every column.
# NOTE(review): no watermark is defined here, so the dedup state presumably keeps
# growing for as long as a query on `dedup` runs - fine for a short demo, confirm before reuse.
dedup = stream.dropDuplicates()

In [9]:
# True when the DataFrame is backed by a streaming source; it does not tell whether a query has been started
dedup.isStreaming

True

In [12]:
# The memory sink below uses a fixed query name, and Spark Structured Streaming does not allow
# two active queries with the same name in one session. Stop the previous run first so that
# this cell can be executed as many times as we want.
try:
  if query.isActive:
    query.stop()
except NameError:
  # First execution of the cell: `query` does not exist yet, so there is nothing to stop.
  # Any other error (e.g. a failure while stopping) is deliberately left visible.
  pass

# format("memory") writes to an in-memory table instead of a physical target - only used for testing purposes.
# The table is queryable under the query name: "vehicles".
query = (dedup.writeStream.format("memory").option("queryName", "vehicles").start())

In [25]:
# AUX functions for streaming troubleshooting
# A notebook cell only renders its last expression, so the earlier values are shown
# explicitly with display() (provided by IPython in notebook sessions).

# Current status of the query defined above, as of the moment this cell runs
display(query.status)

# Useful for troubleshooting: metadata of the most recent micro-batch
display(query.lastProgress)

# Metadata of the recent micro-batch executions (rendered as the cell's result)
query.recentProgress

[{'id': '3cc67683-8494-4644-9fa9-fee399ff797e',
  'runId': '17f9249b-09b2-4214-af99-b677a223495c',
  'name': 'vehicles',
  'timestamp': '2024-11-30T09:58:28.200Z',
  'batchId': 0,
  'numInputRows': 3999,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 338.3821289558301,
  'durationMs': {'addBatch': 11594,
   'commitOffsets': 74,
   'getBatch': 23,
   'latestOffset': 61,
   'queryPlanning': 22,
   'triggerExecution': 11818,
   'walCommit': 41},
  'stateOperators': [{'operatorName': 'dedupe',
    'numRowsTotal': 3594,
    'numRowsUpdated': 3594,
    'allUpdatesTimeMs': 3137,
    'numRowsRemoved': 0,
    'allRemovalsTimeMs': 0,
    'commitTimeMs': 4975,
    'memoryUsedBytes': 1573832,
    'numRowsDroppedByWatermark': 0,
    'numShufflePartitions': 200,
    'numStateStoreInstances': 200,
    'customMetrics': {'loadedMapCacheHitCount': 0,
     'loadedMapCacheMissCount': 0,
     'numDroppedDuplicateRows': 405,
     'stateOnCurrentVersionSizeBytes': 1545032}}],
  'sources': [{'descri

In [28]:
# Read the in-memory table that the memory sink exposes under the query name "vehicles"
spark.sql("select * from vehicles").show()

+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|bearing|            block_id|current_status|      id|      lat|line_id|      lon|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|    183|                1155| IN_TRANSIT_TO| 42|1155|38.917503|   2626|-9.322189|  2626_0_1|  2626_0|            SCHEDULED|       44537|11.666667| 080448|2024-11-30 09:53:02|2626_0_1|160|1|09...|
|      0|      ESC_SAB_ES1049|    STOPPED_AT| 43|2378| 38.63942|   3103|-9.149403|  3103_0_2|  3103_0|            SCHEDULED|      ES1049|      0.0| 140137|2024-11-30 09:52:39|3103_0_2_0900_092...|
|    296|      

In [27]:
# Only stops the query execution; the data collected so far stays available in memory for as long as the Spark session is running
query.stop()

In [29]:
# The "vehicles" in-memory table is still readable after query.stop()
spark.sql("select * from vehicles").show()

+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|bearing|            block_id|current_status|      id|      lat|line_id|      lon|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|    183|                1155| IN_TRANSIT_TO| 42|1155|38.917503|   2626|-9.322189|  2626_0_1|  2626_0|            SCHEDULED|       44537|11.666667| 080448|2024-11-30 09:53:02|2626_0_1|160|1|09...|
|      0|      ESC_SAB_ES1049|    STOPPED_AT| 43|2378| 38.63942|   3103|-9.149403|  3103_0_2|  3103_0|            SCHEDULED|      ES1049|      0.0| 140137|2024-11-30 09:52:39|3103_0_2_0900_092...|
|    296|      

# Simulating from landing zone (JSON files) to Bronze layer (store aggregated data as parquet files)

In [30]:
# Delete the bronze folder, which holds both the parquet output and the streaming checkpoint written by the next cell
!rm -rf /content/bronze

In [31]:
from pyspark.sql.functions import *

# A streaming aggregation written in append mode needs a watermark on the event-time column;
# here events may arrive up to 60 seconds late.
transformed = stream.withWatermark("timestamp", "60 seconds")

# For every 5-minute event-time window and vehicle status:
# the earliest timestamp seen and the number of rows.
agg = (
    transformed
    .groupBy(
        window(transformed["timestamp"], "5 minutes"),
        col("current_status"),
    )
    .agg(
        min(transformed["timestamp"]).alias("init_timestamp"),
        count("*").alias("count"),
    )
)

def insert_vehicles(df, batch_id):
  """Append one micro-batch to the bronze parquet dataset.

  Args:
    df: Batch whose rows are written to /content/bronze/vehicles.
    batch_id: Identifier of the micro-batch; not used here.
  """
  parquet_writer = df.write.format("parquet")
  appending_writer = parquet_writer.mode("append")
  appending_writer.save("/content/bronze/vehicles")

# Persist the aggregated stream to the bronze layer as parquet
query2 = (
    agg.writeStream
    .outputMode("append")
    # Start a micro-batch every 20 seconds
    .trigger(processingTime='20 seconds')
    # Only one entity is ingested here, so a single checkpoint folder is enough
    .option("checkpointLocation", "/content/bronze/checkpoint")
    # Every micro-batch is handed to insert_vehicles, which appends it as parquet.
    # NOTE(review): the built-in file sink can presumably write parquet directly as well,
    # so foreachBatch is a choice here rather than a requirement - confirm.
    .foreachBatch(insert_vehicles)
    .start()
)

In [56]:
# Inspect what has been written to the bronze layer so far: up to 100 rows, columns not truncated
(
    spark.read
    .format("parquet")
    .load("/content/bronze/vehicles/*")
    .show(n=100, truncate=False)
)

+------------------------------------------+--------------+-------------------+-----+
|window                                    |current_status|init_timestamp     |count|
+------------------------------------------+--------------+-------------------+-----+
|{2024-11-30 09:50:00, 2024-11-30 09:55:00}|IN_TRANSIT_TO |2024-11-30 09:51:37|1187 |
|{2024-11-30 09:50:00, 2024-11-30 09:55:00}|INCOMING_AT   |2024-11-30 09:51:34|411  |
|{2024-11-30 09:50:00, 2024-11-30 09:55:00}|STOPPED_AT    |2024-11-30 09:51:26|583  |
+------------------------------------------+--------------+-------------------+-----+



## Report
- show vehicles by status in 5-min window time
- one line per window time

In [57]:
# Import DataFrame explicitly for the annotation below: the notebook never imports it by name,
# so it would otherwise only resolve if one of the star imports happens to expose it.
from pyspark.sql import DataFrame


def pivot_data(df: DataFrame):
  """Print one row per time window with one count column per vehicle status.

  Args:
    df: Aggregated bronze data with `window`, `current_status` and `count` columns.

  Returns:
    None; the pivoted table is printed (up to 100 rows, columns not truncated).
  """
  result = (df
            .groupBy("window")
            .pivot("current_status")
            .sum("count")
            # Sort after the aggregation: ordering the input rows does not
            # determine the order of the grouped output.
            .orderBy("window"))
  result.show(100, False)

df = spark.read.format("parquet").load("/content/bronze/vehicles/*")
pivot_data(df)

+------------------------------------------+-----------+-------------+----------+
|window                                    |INCOMING_AT|IN_TRANSIT_TO|STOPPED_AT|
+------------------------------------------+-----------+-------------+----------+
|{2024-11-30 09:50:00, 2024-11-30 09:55:00}|411        |1187         |583       |
+------------------------------------------+-----------+-------------+----------+



In [58]:
# Stop the streaming query that writes the aggregated data to the bronze layer
query2.stop()