<a href="https://colab.research.google.com/github/drmartins2/EDIT_DE/blob/main/spark_streaming/examples/example_2_rate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Usecase 2
- Reading data from "rate"
- Aggregating data by window time
- Checking results from query in memory

# Setting up PySpark

In [None]:
%pip install pyspark



In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').appName('Test streaming').getOrCreate()

# Write output in memory

In [8]:
import pyspark.sql.functions as F

# read stream
stream1 = spark.readStream.format("rate").option("rowsPerSecond", 10).load()

# transform
transformed = stream1.withColumn("minute", F.minute("timestamp"))
agg = transformed.groupBy(F.window(transformed.timestamp, "5 seconds")).count()

# write stream in memory
query = (agg.writeStream
.format('memory')
.queryName('my_query')
.outputMode('complete')
.start()
)

In [12]:
spark.sql("select * from my_query order by window desc").show(10,False)

+------------------------------------------+-----+
|window                                    |count|
+------------------------------------------+-----+
|{2024-11-23 15:50:35, 2024-11-23 15:50:40}|13   |
|{2024-11-23 15:50:30, 2024-11-23 15:50:35}|50   |
|{2024-11-23 15:50:25, 2024-11-23 15:50:30}|50   |
|{2024-11-23 15:50:20, 2024-11-23 15:50:25}|50   |
|{2024-11-23 15:50:15, 2024-11-23 15:50:20}|50   |
|{2024-11-23 15:50:10, 2024-11-23 15:50:15}|50   |
|{2024-11-23 15:50:05, 2024-11-23 15:50:10}|50   |
|{2024-11-23 15:50:00, 2024-11-23 15:50:05}|50   |
|{2024-11-23 15:49:55, 2024-11-23 15:50:00}|50   |
|{2024-11-23 15:49:50, 2024-11-23 15:49:55}|50   |
+------------------------------------------+-----+
only showing top 10 rows



In [10]:
query.lastProgress

{'id': '1ca021b8-eca5-4373-ac90-f4eeb0e0d6e4',
 'runId': 'c6b57d58-84df-4824-9d2e-0028325530b5',
 'name': 'my_query',
 'timestamp': '2024-11-23T15:50:12.379Z',
 'batchId': 3,
 'numInputRows': 160,
 'inputRowsPerSecond': 10.372099053545961,
 'processedRowsPerSecond': 13.00284437220642,
 'durationMs': {'addBatch': 12077,
  'commitOffsets': 142,
  'getBatch': 0,
  'latestOffset': 0,
  'queryPlanning': 36,
  'triggerExecution': 12305,
  'walCommit': 48},
 'stateOperators': [{'operatorName': 'stateStoreSave',
   'numRowsTotal': 10,
   'numRowsUpdated': 4,
   'allUpdatesTimeMs': 184,
   'numRowsRemoved': 0,
   'allRemovalsTimeMs': 0,
   'commitTimeMs': 7686,
   'memoryUsedBytes': 88696,
   'numRowsDroppedByWatermark': 0,
   'numShufflePartitions': 200,
   'numStateStoreInstances': 200,
   'customMetrics': {'loadedMapCacheHitCount': 1200,
    'loadedMapCacheMissCount': 0,
    'stateOnCurrentVersionSizeBytes': 22800}}],
 'sources': [{'description': 'RateStreamV2[rowsPerSecond=10, rampUpTimeSec

In [13]:
query.stop()

# Write output as json

In [20]:
!rm -rf content/output

In [21]:
import pyspark.sql.functions as F
from pyspark.sql import DataFrame

def save_parquet(df, batch_id):
  (df
   .withColumn("batch_id",F.lit(batch_id))
   .withColumn("load_time",F.current_timestamp())
   .write.mode("append")
   .parquet("content/output/rate_parquet")
  )

# read stream
stream1 = spark.readStream.format("rate").option("rowsPerSecond", 10).load()

# transform
transformed = stream1.withWatermark("timestamp", "5 seconds").withColumn("minute", F.minute("timestamp"))
agg = transformed.groupBy(F.window(transformed.timestamp, "5 seconds")).count()

# write stream as parquet with foreachBatch
query = (agg.writeStream
.option('checkpointLocation', 'content/output/checkpoint')
.trigger(processingTime='20 seconds')
.outputMode('append')
.foreachBatch(save_parquet)
.start()
)


In [23]:
result = spark.read.format("parquet").load("content/output/rate_parquet/")
result.sort(F.asc("window")).show(100, False)

+------------------------------------------+-----+--------+--------------------------+
|window                                    |count|batch_id|load_time                 |
+------------------------------------------+-----+--------+--------------------------+
|{2024-11-23 15:58:50, 2024-11-23 15:58:55}|49   |2       |2024-11-23 15:59:20.164956|
|{2024-11-23 15:58:55, 2024-11-23 15:59:00}|50   |3       |2024-11-23 15:59:40.166114|
|{2024-11-23 15:59:00, 2024-11-23 15:59:05}|50   |3       |2024-11-23 15:59:40.166114|
|{2024-11-23 15:59:05, 2024-11-23 15:59:10}|50   |3       |2024-11-23 15:59:40.166114|
|{2024-11-23 15:59:10, 2024-11-23 15:59:15}|50   |4       |2024-11-23 16:00:00.201819|
|{2024-11-23 15:59:15, 2024-11-23 15:59:20}|50   |4       |2024-11-23 16:00:00.201819|
|{2024-11-23 15:59:20, 2024-11-23 15:59:25}|50   |4       |2024-11-23 16:00:00.201819|
|{2024-11-23 15:59:25, 2024-11-23 15:59:30}|50   |4       |2024-11-23 16:00:00.201819|
|{2024-11-23 15:59:30, 2024-11-23 15:59:35}

In [24]:
query.stop()

# Enrich data with faker

In [25]:
!pip install faker

Collecting faker
  Downloading Faker-33.0.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-33.0.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-33.0.0


In [26]:
!rm -rf content/output/events

In [27]:
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from faker import Faker

def insert_into_table(df, batch_id):
  fake = Faker()
  new_columns = {
      'name': F.lit(fake.name()),
      'address': F.lit(fake.address()),
      'email': F.lit(fake.email()),
      'dob': F.lit(fake.date_of_birth()),
      'phone': F.lit(fake.phone_number())
  }
  df = df.withColumns(new_columns)
  df.write.mode("append").format("parquet").save("content/output/events")

# read stream
df_stream = spark.readStream.format("rate").option("rowsPerSecond", 1).load()

# write stream
query = (df_stream.writeStream
.outputMode('append')
.trigger(processingTime='1 seconds')
.foreachBatch(insert_into_table)
.start()
)

In [28]:
query.stop()


In [29]:
spark.read.parquet("content/output/events").show(100, False)

+-----------------------+-----+-------------------+--------------------------------------------------------+-----------------------------+----------+----------------------+
|timestamp              |value|name               |address                                                 |email                        |dob       |phone                 |
+-----------------------+-----+-------------------+--------------------------------------------------------+-----------------------------+----------+----------------------+
|2024-11-23 16:08:15.003|24   |Jeffrey Hoffman    |01657 Isaac Tunnel Apt. 796\nNorth Mariaton, PA 04996   |christopher38@example.com    |2010-01-22|+1-579-282-8466x6297  |
|2024-11-23 16:08:01.003|10   |Mrs. Wanda Wagner  |19997 Jeffrey Terrace\nNew William, OH 23757            |cunninghamshannon@example.net|1995-10-17|+1-851-754-5225x1322  |
|2024-11-23 16:08:05.003|14   |Keith Griffin      |3851 Jackson Stream Suite 301\nAriasmouth, KS 05433     |kevinsimmons@example.net   

In [30]:
query.stop()