In [1]:
import os
# Arguments handed to spark-submit when PySpark launches the JVM: request the
# Kafka SQL connector package, then the mandatory trailing `pyspark-shell`.
# NOTE(review): the connector coordinates (Scala 2.12 / 3.5.0) presumably need
# to match the installed Spark build - confirm against the environment.
KAFKA_SUBMIT_ARGS = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = KAFKA_SUBMIT_ARGS

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

# Obtain the notebook's Spark session (an already-running one is reused).
spark = (
    SparkSession.builder
    .appName('Stream Demo')
    .getOrCreate()
)

# Switch datetime parsing to the legacy parser so the date pattern used when
# parsing string dates later in this notebook is handled correctly.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

# Bare last expression: render the session summary as the cell output.
spark

In [3]:
from pyspark.sql.functions import *

# Broker address shared by every Kafka source in this notebook.
KAFKA_BOOTSTRAP_SERVERS = "kafka:9092"


def read_kafka_topic(topic, bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS, starting_offsets="latest"):
    """Open a streaming DataFrame over a single Kafka topic.

    Args:
        topic: Name of the Kafka topic to subscribe to.
        bootstrap_servers: Value for the `kafka.bootstrap.servers` option.
        starting_offsets: Value for the `startingOffsets` option.

    Returns:
        The streaming DataFrame produced by the Kafka source for `topic`.
    """
    return (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", bootstrap_servers)
        .option("subscribe", topic)
        .option("startingOffsets", starting_offsets)
        .load()
    )


weather_stream = read_kafka_topic("weather-data")

# Sanity check: prints True when the source is a streaming DataFrame.
print(weather_stream.isStreaming)

traffic_stream = read_kafka_topic("traffic-data")
    

# Both sources get the same treatment: replace `key` and `value` with their
# string casts (value first, then key), leaving every other column untouched.
value_as_text = expr('cast(value as string)')
key_as_text = expr('cast(key as string)')

weather_json_df = (
    weather_stream
    .withColumn('value', value_as_text)
    .withColumn('key', key_as_text)
)

traffic_json_df = (
    traffic_stream
    .withColumn('value', value_as_text)
    .withColumn('key', key_as_text)
)

# Schema applied to the weather JSON payload. Field names are matched against
# the JSON keys as written here (note the capitalised Longitude / Latitude).
# NOTE(review): `date` is read as an integer and later passed to
# from_unixtime, so it is presumably epoch seconds - confirm with the producer.
weather_schema = (
    StructType()
    .add('Longitude', DoubleType())
    .add('Latitude', DoubleType())
    .add('weather', StringType())
    .add('weather_description', StringType())
    .add('temp', DoubleType())
    .add('visibility', IntegerType())
    .add('clouds', IntegerType())
    .add('rain', DoubleType())
    .add('snow', DoubleType())
    .add('date', IntegerType())
    .add('name', StringType())
)

# Schema applied to the traffic JSON payload; every field is nullable.
# `date` stays a string here and is parsed into a timestamp when the payload
# is flattened.
traffic_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("latitude", DoubleType()),
        ("longitude", DoubleType()),
        ("current_speed", IntegerType()),
        ("free_flow_speed", IntegerType()),
        ("current_travel_time", IntegerType()),
        ("free_flow_travel_time", IntegerType()),
        ("confidence", IntegerType()),
        ("road_closure", BooleanType()),
        ("date", StringType()),
    ]
])

# Parse the JSON text held in `value` into a struct column named
# `values_json`, using the schema that belongs to each stream.
weather_payload = from_json(col("value"), weather_schema)
traffic_payload = from_json(col("value"), traffic_schema)

weather_parsed_df = weather_json_df.withColumn("values_json", weather_payload)
traffic_parsed_df = traffic_json_df.withColumn("values_json", traffic_payload)

# Flatten the parsed weather struct into top-level columns (lower-casing the
# Longitude / Latitude names) and keep the Kafka message key.
#
# The `date` column is built in a single expression: from_unixtime turns the
# integer into a timestamp string and from_utc_timestamp then shifts it to
# America/Los_Angeles. Composing the calls means the select does not have to
# reference one of its own aliases (which only resolves through lateral
# column alias support) and leaves no temporary column to drop afterwards.
weather_flatten_df = weather_parsed_df.select(
    "key",
    col("values_json.Longitude").alias("longitude"),
    col("values_json.Latitude").alias("latitude"),
    col("values_json.weather").alias("weather"),
    col("values_json.weather_description").alias("weather_description"),
    col("values_json.temp").alias("temp"),
    col("values_json.visibility").alias("visibility"),
    col("values_json.clouds").alias("clouds"),
    col("values_json.rain").alias("rain"),
    col("values_json.snow").alias("snow"),
    from_utc_timestamp(
        from_unixtime(col("values_json.date")),
        "America/Los_Angeles",
    ).alias("date"),
    col("values_json.name").alias("name"),
)

# Flatten the parsed traffic struct into top-level columns.
#
# The `date` column is built in a single expression: to_timestamp parses the
# string date with the given pattern and from_utc_timestamp then shifts the
# result to America/Los_Angeles. Composing the calls means the select does not
# have to reference one of its own aliases (which only resolves through
# lateral column alias support) and leaves no temporary column to drop.
# NOTE(review): the pattern presumably depends on the LEGACY time parser
# policy configured on the session - confirm before changing that setting.
traffic_flatten_df = traffic_parsed_df.select(
    col("values_json.latitude").alias("latitude"),
    col("values_json.longitude").alias("longitude"),
    col("values_json.current_speed").alias("current_speed"),
    col("values_json.free_flow_speed").alias("free_flow_speed"),
    col("values_json.current_travel_time").alias("current_travel_time"),
    col("values_json.free_flow_travel_time").alias("free_flow_travel_time"),
    col("values_json.confidence").alias("confidence"),
    col("values_json.road_closure").alias("road_closure"),
    from_utc_timestamp(
        to_timestamp(col("values_json.date"), "EEE, dd MMM yyyy HH:mm:ss z"),
        "America/Los_Angeles",
    ).alias("date"),
)

# Convert date string to timestamp in UTC
#to_timestamp(col("values_json.date"), "EEE, dd MMM yyyy HH:mm:ss z").alias("date_utc"),
# Convert to PST
#from_utc_timestamp(col("values_json.date"), "America/Los_Angeles").alias("date")


# Convert UNIX timestamp to a timestamp string (earlier attempt, kept commented out for reference)
#weather_flatten_df = weather_flatten_df.withColumn((from_unixtime(col("date")).alias("date"))
#traffic_flatten_df.show(truncate=False)
#traffic_flatten_df.printSchema()

True


In [7]:
# Preview the weather stream on the console for up to 60 seconds to check the
# data. The query handle is kept (instead of being discarded by chaining
# straight into awaitTermination) so the query can be stopped once the preview
# window ends or the wait fails; otherwise it would keep running in the
# background with nothing left to stop it through.
weather_preview_query = (
    weather_flatten_df.writeStream
    .outputMode('append')
    .format('console')
    .start()
)
try:
    weather_preview_query.awaitTermination(60)
finally:
    weather_preview_query.stop()

AnalysisException: [WRITE_STREAM_NOT_ALLOWED] `writeStream` can be called only on streaming Dataset/DataFrame.

In [4]:
# Start one console sink per stream and wait up to 60 seconds on each.
# Queries are tracked as they start so that, if a later start or a wait
# raises (for example a StreamingQueryException from a failed query), every
# query already running is stopped before the error propagates instead of
# being left alive in the background.
started_queries = []
try:
    weather_query = (
        weather_flatten_df.writeStream
        .outputMode('update')
        .format('console')
        .start()
    )
    started_queries.append(weather_query)

    traffic_query = (
        traffic_flatten_df.writeStream
        .outputMode('update')
        .format('console')
        .start()
    )
    started_queries.append(traffic_query)

    print("Streaming started... waiting for data...")
    weather_query.awaitTermination(60)
    traffic_query.awaitTermination(60)
except Exception:
    for started_query in started_queries:
        started_query.stop()
    raise

# On the success path both queries are deliberately left running; call
# weather_query.stop() and traffic_query.stop() when finished with them.

Streaming started... waiting for data...


StreamingQueryException: [STREAM_FAILED] Query [id = 6ed81136-7e4e-4a8a-bd8f-00f5567d6a32, runId = c792758f-ab3f-4504-ba8c-41bd92fc3203] terminated with exception: org.apache.kafka.common.errors.UnknownTopicOrPartitionException: This server does not host this topic-partition.

In [12]:
# Shut down both streaming queries, weather first and then traffic.
for running_query in (weather_query, traffic_query):
    running_query.stop()