In [1]:
import os
# Extra spark-submit arguments: pull in the Kafka source connector package.
# PySpark reads this variable when it launches the JVM, so it has to be set
# before the SparkSession is created in the next cell.
# NOTE(review): the coordinate targets Scala 2.12 / Spark 3.5.0 -- confirm it
# matches the installed PySpark version.
submit_args = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

# Create the notebook's SparkSession, or reuse one that is already active
# in this kernel (getOrCreate).
spark = (
    SparkSession.builder
    .appName('Stream Demo')
    .getOrCreate()
)

# Bare last expression so the notebook renders the session summary.
spark

In [4]:
from pyspark.sql.functions import *

# Kafka source settings: broker address, topic to subscribe to, and where to
# start reading when the query has no checkpointed offsets.
kafka_options = {
    "kafka.bootstrap.servers": "kafka:9092",
    "subscribe": "test",
    "startingOffsets": "earliest",
}

# Unbounded streaming DataFrame over the Kafka topic.
df = spark.readStream.format("kafka").options(**kafka_options).load()

# Cast key and value to strings so the value can be parsed as JSON text.
# (Per the Spark Kafka integration guide the source exposes both as binary.)
json_df = (
    df
    .withColumn('value', expr('cast(value as string)'))
    .withColumn('key', expr('cast(key as string)'))
)

# Expected layout of the JSON payload carried in the Kafka message value,
# as (field name, Spark type) pairs. All fields are nullable (the default).
# NOTE(review): temp / rain / snow are declared as integers; if the producer
# emits fractional numbers they will not parse as integers (expect nulls) --
# confirm against the producer's payload.
payload_layout = [
    ('Longitude', DoubleType),
    ('Latitude', DoubleType),
    ('weather', StringType),
    ('weather_description', StringType),
    ('temp', IntegerType),
    ('visibility', IntegerType),
    ('clouds', IntegerType),
    ('rain', IntegerType),
    ('snow', IntegerType),
    ('date', IntegerType),
    ('name', StringType),
]

df_schema = StructType([
    StructField(field_name, field_type())
    for field_name, field_type in payload_layout
])

# Keep only key/value and parse the JSON text in `value` into a struct
# column (`values_json`) shaped by df_schema.
df = (
    json_df
    .select('key', 'value')
    .withColumn('values_json', from_json(col('value'), df_schema))
)

# Struct fields to promote to top-level columns, in output order. Each one
# is exposed under its lower-cased name (only Longitude/Latitude change).
payload_fields = [
    'Longitude',
    'Latitude',
    'weather',
    'weather_description',
    'temp',
    'visibility',
    'clouds',
    'rain',
    'snow',
    'date',
    'name',
]

# One flat row per Kafka message: the key plus every parsed payload field.
flatten_df = df.select(
    "key",
    *[df[f'values_json.{field}'].alias(field.lower()) for field in payload_fields],
)

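# Debugging aids, intentionally left disabled. printSchema() works on a
# streaming DataFrame, but show() raises AnalysisException because queries
# with streaming sources must be executed through writeStream.start().
#flatten_df.show()
#flatten_df.printSchema()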

In [None]:
# Start a streaming query that prints each micro-batch of flattened records
# to the console sink. 'append' mode emits only rows added since the last
# trigger.
query = (
    flatten_df.writeStream
    .outputMode('append')
    .format('console')
    .start()
)

try:
    # Blocks this cell until the query terminates; it raises if the query
    # fails or the kernel is interrupted.
    query.awaitTermination()
finally:
    # Always release the streaming query. Without the finally, an interrupt
    # or a query failure would skip stop() and leave the query running in
    # the background of the Spark session.
    query.stop()