In [None]:
# Build (or reuse) the SparkSession through which the rest of the notebook talks to Spark.
from pyspark.sql import SparkSession

ss = (
    SparkSession.builder
    # Kafka client and Spark-Kafka connector artifacts; the _2.11 suffix is the Scala
    # build, chosen because Spark 2.4.4 uses Scala 2.11.
    .config(
        'spark.jars.packages',
        'org.apache.kafka:kafka_2.11:1.1.1,org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4',
    )
    .getOrCreate()
)

# Last expression of the cell: displays the Spark version of the session.
ss.version

In [None]:
# Address handed to the Kafka source as its "kafka.bootstrap.servers" option.
brokers = "kafka:9092"

In [None]:
# Open a streaming read of the Kafka topic "my_topic" as a DataFrame.
#
# No serializer/deserializer option is set on purpose: the previous
# "value.serializer" option named a *producer* setting and lacked the "kafka."
# prefix, so the Kafka source ignored it. Spark's Kafka source always delivers
# key/value as raw bytes; decoding is done downstream with CAST(value AS STRING).
df = (
    ss.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", brokers)
    .option("subscribe", "my_topic")
    .option("startingOffsets", "earliest")  # read data from the beginning of the stream
    .load()
)

In [None]:
# Print the schema of the streaming DataFrame loaded from the Kafka source.
df.printSchema()

In [None]:
from pyspark.sql.types import StringType, StructField, StructType, IntegerType, TimestampType, BooleanType

# Schema applied by from_json to the JSON payload carried in each Kafka message value.
# Every field is nullable (third argument True).
mySchema = StructType([
    StructField("name", StringType(), True),
    StructField("id", StringType(), True),
    # A first name is text, like the other name fields. It was declared as
    # IntegerType, which a non-numeric name cannot be parsed into.
    StructField("firstname", StringType(), True),
    StructField("lastname", StringType(), True),
    StructField("address", StringType(), True),
    StructField("timestamp", TimestampType(), True),
    StructField("gender", StringType(), True),
    StructField("arrested", BooleanType(), True),
    StructField("age", IntegerType(), True),
    StructField("race", StringType(), True),
])




In [None]:
from pyspark.sql.functions import from_json

# Struct column holding the JSON payload parsed according to mySchema.
parsed_record = from_json("value", mySchema).alias("data")

# Fields kept from the parsed payload, followed by the "timestamp" column that is
# carried alongside "data" (i.e. not the payload's own data.timestamp field).
output_columns = [
    "data.race",
    "data.gender",
    "data.lastname",
    "data.firstname",
    "data.arrested",
    "data.age",
    "timestamp",
]

df1 = (
    df
    # Decode the message value to a string and keep the record timestamp.
    .selectExpr("CAST(value AS STRING)", "CAST(timestamp AS TIMESTAMP)")
    .select(parsed_record, "timestamp")
    .select(*output_columns)
)

In [None]:
# Print the schema of the flattened DataFrame (selected payload fields plus timestamp).
df1.printSchema()

In [None]:
## Exercise: create your own streaming query.
# Register the parsed stream as the temporary view "mytable" so it can be queried with SQL.
df1.createOrReplaceTempView("mytable")
# TODO: replace the XXXXXXXXX placeholder with a SQL query over "mytable"; the
# placeholder itself is not a query.
# NOTE(review): the writer cell later in this notebook uses outputMode("complete"),
# which presumably requires an aggregating query (e.g. GROUP BY) — confirm against the
# Structured Streaming output-mode documentation.
df2 =ss.sql(''' XXXXXXXXX  ''')

In [None]:

# Start the streaming query over df2, writing its result to the memory sink under the
# query name "test" (the name the polling cell reads back with SQL).
streamingQuery1 = (
    df2.writeStream
    .queryName("test")
    .format("memory")
    .outputMode("complete")
    .option("truncate", "false")
    .trigger(processingTime="15 seconds")  # run a micro-batch every 15 seconds
    .start()
)

In [None]:
import time

# Poll the "test" table five times, showing its current contents on each pass.
for i in range(5):
    # Wait 15 seconds between looks — the same interval as the query's trigger.
    time.sleep(15)
    print(f'Batch number {i}')
    snapshot = ss.sql('select * from test')
    snapshot.show()

In [None]:
# Stop the streaming query started above once polling is finished.
streamingQuery1.stop()