## Consume data from Kafka stream

* In the previous notebook you published some data to a Kafka stream.
* In this notebook, read and process the data from that stream.

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col, udf, pandas_udf, PandasUDFType
from scipy.stats import poisson
import pandas as pd
import os
from pyspark.sql.types import *

# Create (or reuse) the Spark session for this notebook. The Kafka source
# connector is pulled in through spark.jars.packages.
spark = SparkSession.builder.appName('kafka_test').config(
    "spark.jars.packages",
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0-preview",
).getOrCreate()

In [12]:
# Display the version of the running Spark session.
spark.version

'3.0.0-preview'

In [13]:
# Resolve the project root and the checkpoint directory relative to the
# directory this notebook is run from.
base_path = os.getcwd()

# The project root is three directory levels above the working directory.
# os.path.dirname is used instead of splitting on '/' so the result stays an
# absolute path even when the working directory is only a few levels deep
# (the split/join form collapsed to an empty string there, which made
# checkpoint_location silently relative), and so the platform's own path
# separator is honoured.
project_path = os.path.dirname(os.path.dirname(os.path.dirname(base_path)))

checkpoint_location = os.path.join(project_path, 'output/streaming-output/checkpoint/4')

In [14]:
# Schema describing one question record: id, creation timestamp, title and
# an integer column 'r'. NOTE(review): the cells shown here never reference
# this schema -- confirm whether it is still needed.
stream_schema = (
    StructType()
    .add('question_id', LongType())
    .add('creation_date', TimestampType())
    .add('title', StringType())
    .add('r', IntegerType())
)

In [15]:
# Streaming DataFrame over the Kafka topic 'myTop' on the local broker,
# reading from the earliest available offsets.
streaming_query = (
    spark.readStream
    .format('kafka')
    .options(**{
        "kafka.bootstrap.servers": "localhost:9092",
        "subscribe": "myTop",
        'startingOffsets': 'earliest',
    })
    .load()
)

In [16]:
# Keep only the record key and value, each cast to a string column.
parsed_data = streaming_query.selectExpr(
    "CAST(key AS STRING)",
    "CAST(value AS STRING)",
)

In [19]:
# Start the streaming query: rows are appended to an in-memory table that
# can be queried through Spark SQL under the name 'my_stream'.
q = (
    parsed_data.writeStream
    .queryName('my_stream')
    .outputMode('append')
    .format('memory')
    .start()
)

In [20]:
# Print the rows collected so far in the in-memory table 'my_stream'.
spark.sql("select * from my_stream").show()

+----+--------------------+
| key|               value|
+----+--------------------+
|null|This is message n...|
|null|  This is my message|
|null|[9076, 2011-04-25...|
|null|[30283, 2012-06-1...|
|null|[21633, 2012-02-2...|
|null|[8286, 2011-04-08...|
|null|[26591, 2012-03-2...|
|null|[10631, 2011-05-3...|
|null|[6690, 2011-03-11...|
|null|[17913, 2011-12-0...|
|null|[15992, 2011-10-2...|
|null|[43207, 2012-11-0...|
|null|[20525, 2012-02-0...|
|null|[32947, 2012-07-2...|
|null|[30949, 2012-06-2...|
|null|[32876, 2012-07-2...|
|null|[43756, 2012-11-0...|
|null|[40890, 2012-10-1...|
|null|[31142, 2012-07-0...|
|null|[32360, 2012-07-1...|
+----+--------------------+
only showing top 20 rows



In [18]:
# Stop the streaming query held in `q`.
q.stop()

In [None]:
# Shut down the Spark session once the notebook is finished.
spark.stop()