# Structured Streaming

## Task - read data from Kafka stream

* Read data from the stream you created in the previous notebook (create-kafka-stream)
* Add a new column 'physicist' by applying the function you implemented in the Text search notebook
* Write the result to memory

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import col, from_json, lit, when
from pyspark.sql.types import StructType, StructField, LongType, StringType

In [None]:
# Create (or reuse) the SparkSession for this notebook. The Kafka source is not
# bundled with Spark, so the connector is requested via `spark.jars.packages`.
# NOTE(review): the coordinate pins Scala 2.11 / Spark 2.4.0 - confirm it
# matches the Spark installation this notebook runs on.
spark = SparkSession.builder.appName('Streaming II').config(
    "spark.jars.packages",
    "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0",
).getOrCreate()

In [None]:
def get_person_dynamic(message):
    """Return a CASE WHEN column that labels ``message`` with a matching name.

    One ``WHEN message LIKE '%<name>%' THEN <name>`` branch is added per entry
    of the module-level ``names`` iterable, in iteration order, so the first
    matching name wins. Rows matching no name get the literal ``'other'``.

    ``names`` is read from the enclosing (global) scope and is not defined in
    the code shown here - presumably it comes from the Text search notebook;
    it must exist before this function is called.

    :param message: Column holding the text to search.
    :return: Column expression with the matched name or ``'other'``.
    """
    # The seed branch can never be true; it only exists to obtain a Column on
    # which further .when() branches can be chained inside the loop.
    case_expr = when(lit(False), '')
    for candidate in names:
        like_pattern = '%{}%'.format(candidate)
        case_expr = case_expr.when(message.like(like_pattern), candidate)
    labelled = case_expr.otherwise('other')
    return labelled

In [None]:
# Schema of the JSON payload carried in each Kafka message value:
# a numeric question id and the question body text.
stream_schema = StructType([
    StructField('question_id', LongType()),
    StructField('body', StringType()),
])

In [None]:
# Streaming DataFrame over the 'questions' topic of the broker at
# localhost:9092. Nothing is read until a streaming query is started on it.
kafka_stream = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "localhost:9092",
        "subscribe": "questions",
    })
    .load()
)

In [None]:
# Parse the Kafka payload, tag each question with a physicist name and expose
# the result as the in-memory table 'my_stream'.
#
# `stream` is the handle returned by .start(); it is used later to stop the
# query.
stream = (
    kafka_stream
    # Kafka delivers the value as binary; decode it to a JSON string first.
    .selectExpr("CAST(value AS STRING) AS v")
    # from_json yields ONE struct column. Alias it and expand it so that
    # 'question_id' and 'body' become top-level columns; without this step
    # col('body') below cannot be resolved.
    .select(from_json(col('v'), stream_schema).alias('parsed'))
    .select('parsed.*')
    .withColumn('physicist', get_person_dynamic(col('body')))
    .writeStream
    .format('memory')
    .outputMode('append')
    .queryName('my_stream')
    .start()
)

In [None]:
# Stop the streaming query started above (the handle returned by .start()).
stream.stop()

In [None]:
# Shut down the SparkSession created at the top of the notebook.
spark.stop()