In [1]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0 pyspark-shell'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

spark = SparkSession \
    .builder \
    .appName('Stream Demo') \
    .getOrCreate()

# Set the legacy time parser policy to handle the date format correctly
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

spark 

In [3]:
from pyspark.sql.functions import *

weather_stream = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("subscribe", "weather-data") \
    .load()

#.option("startingOffsets", "latest") \

#.option("startingOffsets", "latest") \


traffic_stream = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("subscribe", "traffic-data") \
    .option("startingOffsets", "latest") \
    .load()

#.option("startingOffsets", "latest") \


weather_json_df = weather_stream.selectExpr("CAST(value AS STRING) as value")

traffic_json_df = traffic_stream.withColumn('value', expr('cast(value as string)')).withColumn('key', expr('cast(key as string)'))

weather_schema = StructType([
    StructField('name', StringType()),
    StructField('latitude', DoubleType()),
    StructField('longitude', DoubleType()),
    StructField('date', IntegerType()),
    StructField('weather', StringType()),
    StructField('weather_description', StringType()),
    StructField('temp', DoubleType()),
    StructField('visibility', IntegerType()),
    StructField('clouds', IntegerType()),
    StructField('rain', DoubleType()),
    StructField('snow', DoubleType()),
])

traffic_schema = StructType([
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True),
    StructField("current_speed", IntegerType(), True),
    StructField("free_flow_speed", IntegerType(), True),
    StructField("current_travel_time", IntegerType(), True),
    StructField("free_flow_travel_time", IntegerType(), True),
    StructField("confidence", IntegerType(), True),
    StructField("road_closure", BooleanType(), True),
    StructField("date", StringType(), True)
])

# Parse the 'value' column as JSON
weather_parsed_df = weather_json_df.select(from_json("value", weather_schema).alias("data"))
traffic_parsed_df = traffic_json_df.withColumn("values_json", from_json(col("value"), traffic_schema))


# Flatten the JSON into separate columns
weather_flatten_df = weather_parsed_df.select(
                col('data.name').alias('name'),
                col('data.latitude').alias('latitude'),
                col('data.longitude').alias('longitude'), 
                from_unixtime(col('data.date')).alias('date_unix'),
                from_utc_timestamp(col("date_unix"), "America/Los_Angeles").alias("date"),
                col('data.weather').alias('weather'), 
                col('data.weather_description').alias('weather_description'), 
                col('data.temp').alias('temp'), 
                col('data.visibility').alias('visibility'),
                col('data.clouds').alias('clouds'),
                col('data.rain').alias('rain'),
                col('data.snow').alias('snow'))
    
weather_flatten_df = weather_flatten_df.drop("date_unix")

traffic_flatten_df = traffic_parsed_df.select(
    col("values_json.latitude").alias("latitude"), 
    col("values_json.longitude").alias("longitude"),
    col("values_json.current_speed").alias("current_speed"),
    col("values_json.free_flow_speed").alias("free_flow_speed"),
    col("values_json.current_travel_time").alias("current_travel_time"),
    col("values_json.free_flow_travel_time").alias("free_flow_travel_time"),
    col("values_json.confidence").alias("confidence"),
    col("values_json.road_closure").alias("road_closure"),
    to_timestamp(col("values_json.date"), "EEE, dd MMM yyyy HH:mm:ss z").alias("date_utc"),
    from_utc_timestamp(col("date_utc"), "America/Los_Angeles").alias("date")
)

traffic_flatten_df = traffic_flatten_df.drop("date_utc")

#weather_parsed_df.printSchema()
weather_flatten_df.show(truncate=False)

# Convert date string to timestamp in UTC
#to_timestamp(col("values_json.date"), "EEE, dd MMM yyyy HH:mm:ss z").alias("date_utc"),
# Convert to PST
#from_utc_timestamp(col("values_json.date"), "America/Los_Angeles").alias("date")


# Convert UNIX timestamp to 
#weather_flatten_df = weather_flatten_df.withColumn((from_unixtime(col("date")).alias("date"))
#traffic_flatten_df.show(truncate=False)
#traffic_flatten_df.printSchema()

+--------------+--------+---------+-------------------+-------+-------------------+-----+----------+------+----+----+
|name          |latitude|longitude|date               |weather|weather_description|temp |visibility|clouds|rain|snow|
+--------------+--------+---------+-------------------+-------+-------------------+-----+----------+------+----+----+
|NULL          |NULL    |NULL     |NULL               |NULL   |NULL               |NULL |NULL      |NULL  |NULL|NULL|
|NULL          |NULL    |NULL     |NULL               |NULL   |NULL               |NULL |NULL      |NULL  |NULL|NULL|
|NULL          |NULL    |NULL     |NULL               |NULL   |NULL               |NULL |NULL      |NULL  |NULL|NULL|
|Port Coquitlam|49.2839 |-122.7933|2025-03-25 23:21:47|Clouds |overcast clouds    |11.47|10000     |100   |0.0 |0.0 |
|Hope          |49.3797 |-121.4414|2025-03-25 23:21:47|Clear  |clear sky          |10.96|10000     |0     |0.0 |0.0 |
|Port Coquitlam|49.2617 |-122.7803|2025-03-25 23:21:47|C

In [None]:
weather_query = weather_flatten_df \
    .writeStream \
    .outputMode('append') \
    .format('console') \
    .start()
traffic_query = traffic_flatten_df \
    .writeStream \
    .outputMode('append') \
    .format('console') \
    .start()

print("Streaming started... waiting for data...")
weather_query.awaitTermination(60)
traffic_query.awaitTermination(60)

#weather_query.stop()
#traffic_query.stop()

Streaming started... waiting for data...


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
weather_query.stop()
traffic_query.stop()