# Kafka Stream

## Task - write data to Kafka

* read the data from parquet and write it to a Kafka stream
* create a queue (Kafka topic) that will serve as the input in the next notebook
* the stream should have two columns: question_id, body

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

from pyspark.sql import Window
from pyspark.sql.types import StructType, StructField, TimestampType, LongType, IntegerType, StringType, ArrayType
import os

In [None]:
# Build (or reuse) the Spark session for this notebook.
# The extra package is the Kafka connector for Spark SQL; its coordinate names
# Scala 2.11 and version 2.4.0 - presumably matching the installed Spark,
# verify this if the 'kafka' format cannot be loaded.
session_builder = SparkSession.builder.appName('kafka-stream')
session_builder = session_builder.config(
    "spark.jars.packages",
    "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0",
)
spark = session_builder.getOrCreate()

In [None]:
# Resolve the project directories relative to the current working directory.
base_path = os.getcwd()

# The project root is the working directory with its last three
# '/'-separated components removed.
project_path = '/'.join(base_path.split('/')[:-3])

# Input directory of the stream, and the checkpoint directory for the query.
stream_input_path, checkpoint_location = (
    os.path.join(project_path, relative_path)
    for relative_path in (
        'output/questions-transformed',
        'output/streaming-output/checkpoint/1',
    )
)

#### Define the schema:

In [None]:
# Explicit schema of the input files: the question id and the question body.
schema = (
    StructType()
    .add('question_id', LongType())
    .add('body', StringType())
)

#### Read the data from parquet as stream

In [None]:
# Open the parquet directory as a streaming source, reading it with the
# explicitly declared schema.
questionsDF = (
    spark.readStream
    .format('parquet')
    .schema(schema)
    .load(stream_input_path)
)

#### Write the data to Kafka

In [None]:
# Publish every question to the Kafka topic 'questions'.
# The Kafka sink takes the record payload from a column named `value` and,
# optionally, the record key from a column named `key`, so the two columns of
# the input schema are packed into a single JSON document per record.
questions_stream = (
    questionsDF
    .selectExpr(
        # Key each record by its question id, cast to a string.
        "CAST(question_id AS STRING) AS key",
        # Payload carries the two required columns: question_id and body.
        # (The previous expression referenced `tags`, which is not part of the
        # schema the stream is read with.)
        "to_json(struct(question_id, body)) AS value"
    )
    .writeStream
    .format('kafka')
    .option('topic', 'questions')
    .option("kafka.bootstrap.servers", "localhost:9092")
    .queryName('questions-query')
    # Directory where the query stores its checkpoint data.
    .option('checkpointLocation', checkpoint_location)
    .start()
)

In [None]:
# Display the most recent progress report of the streaming query started above.
# NOTE(review): presumably empty/None until the first progress update exists - confirm.
questions_stream.lastProgress