In [1]:
import os
# Ask Spark to load the Kafka source connector via `--packages`.
# This has to run before the first SparkSession is created in this kernel,
# because the submit arguments are only read when Spark is launched.
# NOTE(review): the coordinate targets Scala 2.12 / Spark 3.5.0 — presumably it
# must match the installed PySpark build; confirm.
KAFKA_CONNECTOR = 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0'
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join(['--packages', KAFKA_CONNECTOR, 'pyspark-shell'])

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

# Create the notebook's SparkSession, or reuse the one already running in this
# kernel (getOrCreate).
spark = (
    SparkSession.builder
    .appName('Stream Demo')
    .getOrCreate()
)

# Last expression of the cell, so the notebook displays the session object.
spark

In [7]:
from pyspark.sql.functions import *

# Broker address shared by every Kafka reader in this notebook.
KAFKA_BOOTSTRAP_SERVERS = "kafka:9092"


def read_kafka_topic(topic, bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS, starting_offsets="latest"):
    """Open a streaming DataFrame over a single Kafka topic.

    Replaces the two copy-pasted reader chains so the broker address and the
    starting position are configured in one place.

    Args:
        topic: Name of the Kafka topic to subscribe to.
        bootstrap_servers: Value for the `kafka.bootstrap.servers` option.
        starting_offsets: Value for the `startingOffsets` option; the default
            "latest" matches what both readers used before.

    Returns:
        The streaming DataFrame produced by `spark.readStream ... .load()`.
    """
    return (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", bootstrap_servers)
        .option("subscribe", topic)
        .option("startingOffsets", starting_offsets)
        .load()
    )


weather_stream = read_kafka_topic("weather-data")
traffic_stream = read_kafka_topic("traffic-data")

def _kafka_payload_as_text(stream_df):
    """Return `stream_df` with its `value` and `key` columns cast to string.

    All other columns are left untouched.
    """
    return (
        stream_df
        .withColumn('value', F.col('value').cast('string'))
        .withColumn('key', F.col('key').cast('string'))
    )


weather_json_df = _kafka_payload_as_text(weather_stream)
traffic_json_df = _kafka_payload_as_text(traffic_stream)

# Layout used to parse the JSON text of a weather message.
# Field names are matched as written here (note the capitalised
# 'Longitude' / 'Latitude'); every field is nullable (StructField default).
# NOTE(review): visibility, clouds, rain, snow and date are declared as
# integers. from_json does not raise on a type mismatch — unparseable data
# comes back as null — so confirm the producer never emits fractional values
# (e.g. rain in mm) for these fields.
weather_schema = StructType([
    StructField(field_name, field_type())
    for field_name, field_type in [
        ('Longitude', DoubleType),
        ('Latitude', DoubleType),
        ('weather', StringType),
        ('weather_description', StringType),
        ('temp', DoubleType),
        ('visibility', IntegerType),
        ('clouds', IntegerType),
        ('rain', IntegerType),
        ('snow', IntegerType),
        ('date', IntegerType),
        ('name', StringType),
    ]
])

# Layout used to parse the JSON text of a traffic message.
# Every field is explicitly nullable.
traffic_schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in [
        ("latitude", DoubleType),
        ("longitude", DoubleType),
        ("current_speed", IntegerType),
        ("free_flow_speed", IntegerType),
        ("current_travel_time", IntegerType),
        ("free_flow_travel_time", IntegerType),
        ("confidence", IntegerType),
        ("road_closure", BooleanType),
    ]
])

# Parse the JSON text in `value` into a struct column named `values_json`,
# using the schema that belongs to each topic. The original columns are kept.
weather_parsed_df = weather_json_df.withColumn(
    "values_json",
    F.from_json("value", weather_schema),
)
traffic_parsed_df = traffic_json_df.withColumn(
    "values_json",
    F.from_json("value", traffic_schema),
)

# Maps each field inside `values_json` to the top-level column name it gets.
# Only Longitude / Latitude are renamed (lower-cased); dict order fixes the
# output column order.
weather_field_aliases = {
    'Longitude': 'longitude',
    'Latitude': 'latitude',
    'weather': 'weather',
    'weather_description': 'weather_description',
    'temp': 'temp',
    'visibility': 'visibility',
    'clouds': 'clouds',
    'rain': 'rain',
    'snow': 'snow',
    'date': 'date',
    'name': 'name',
}

# Flatten the parsed struct into ordinary columns, keeping the Kafka `key`
# as the first column.
weather_flatten_df = weather_parsed_df.select(
    'key',
    *[
        F.col(f'values_json.{source_name}').alias(output_name)
        for source_name, output_name in weather_field_aliases.items()
    ]
)

# Fields lifted out of `values_json`, in output column order. Names are kept
# unchanged.
traffic_field_names = [
    "latitude",
    "longitude",
    "current_speed",
    "free_flow_speed",
    "current_travel_time",
    "free_flow_travel_time",
    "confidence",
    "road_closure",
]

# Flatten the parsed struct into ordinary columns.
# NOTE(review): unlike the weather frame, the Kafka `key` is not carried over
# here — confirm that is intentional.
traffic_flatten_df = traffic_parsed_df.select(
    *[F.col(f"values_json.{field_name}").alias(field_name) for field_name in traffic_field_names]
)


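# Debugging hint: `.show()` is not supported on a streaming DataFrame (it needs
# a sink started via `writeStream`), but the schema can still be inspected:
# traffic_flatten_df.printSchema()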

In [None]:
def start_console_query(stream_df):
    """Start a streaming query that writes `stream_df` to the console sink.

    Uses output mode 'update', as both queries did before. Returns the
    StreamingQuery handle from `.start()`.
    """
    return (
        stream_df.writeStream
        .outputMode('update')
        .format('console')
        .start()
    )


# awaitAnyTermination() also reports queries that ended earlier in this
# session; reset that record so re-running the cell really blocks.
spark.streams.resetTerminated()

started_queries = []
try:
    weather_query = start_console_query(weather_flatten_df)
    started_queries.append(weather_query)
    traffic_query = start_console_query(traffic_flatten_df)
    started_queries.append(traffic_query)

    print("Streaming started... waiting for data...")
    # Block until EITHER query stops or fails. Waiting on weather_query alone
    # (as before) would hide a traffic_query failure for as long as the
    # weather query kept running.
    spark.streams.awaitAnyTermination()
finally:
    # Runs on normal termination, on a query failure and on a kernel
    # interrupt: stop whatever is still running so that re-executing the cell
    # does not stack duplicate queries in the background.
    for query in started_queries:
        if query.isActive:
            query.stop()