## Publish data to Kafka stream

* Stream data from the file system and publish it to a Kafka topic

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col, udf, pandas_udf, PandasUDFType, struct
from scipy.stats import poisson
import pandas as pd
import os
from pyspark.sql.types import *

# Maven coordinate of the Kafka connector for Spark SQL; passed through
# `spark.jars.packages` so the 'kafka' stream format is available.
kafka_connector_package = "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0-preview"

# Build (or reuse) the session step by step instead of in one chained expression.
session_builder = SparkSession.builder.appName('kafka_test')
session_builder = session_builder.config("spark.jars.packages", kafka_connector_package)
spark = session_builder.getOrCreate()

In [2]:
# Display the version of the running Spark session.
spark.version

'3.0.0-preview'

In [3]:
# The notebook's working directory is the anchor for all project paths.
base_path = os.getcwd()

# Project root: three directory levels above the working directory.
# os.path is used instead of splitting on '/' so the result is correct
# whatever the platform's path separator is, and never collapses to an
# empty string when the working directory is shallow.
project_path = os.path.normpath(
    os.path.join(base_path, os.pardir, os.pardir, os.pardir)
)

# Directory of input files consumed by the file stream.
stream_input_path = os.path.join(project_path, 'data/questions-queue')

# Checkpoint directory handed to the streaming writer.
checkpoint_location = os.path.join(project_path, 'output/streaming-output/checkpoint/2')

In [4]:
# Columns of the incoming records as (column name, Spark type) pairs.
stream_columns = [
    ('question_id', LongType()),
    ('creation_date', TimestampType()),
    ('title', StringType()),
    ('r', IntegerType()),
]

# Explicit schema for the streaming reader, built from the pairs above.
stream_schema = StructType(
    [StructField(column_name, column_type) for column_name, column_type in stream_columns]
)

In [5]:
# Streaming DataFrame over the JSON files in `stream_input_path`, parsed with
# `stream_schema` and limited to 5 new files per trigger. It exposes a single
# string column named `value`, which the Kafka sink uses as the message payload.
streaming_query = (
    spark
    .readStream
    .schema(stream_schema)
    .option('maxFilesPerTrigger', 5)
    .json(stream_input_path)
    # Serialize the selected fields as JSON instead of casting the struct to
    # string: the cast yields a positional rendering without field names
    # (and its bracket style differs between Spark versions), which
    # consumers cannot parse back reliably.
    .selectExpr('to_json(struct(question_id, creation_date)) AS value')
)

In [6]:
# Display the DataFrame summary to confirm it has the single string column `value`.
streaming_query

DataFrame[value: string]

In [7]:
# Kafka sink settings: broker address, destination topic, and the directory
# used for the query's checkpoint data.
kafka_sink_options = {
    "kafka.bootstrap.servers": "localhost:9092",
    "topic": "myTop",
    "checkpointLocation": checkpoint_location,
}

# Writer targeting the Kafka sink in append mode.
stream_writer = streaming_query.writeStream.format('kafka').outputMode('append')

# Start the named streaming query that publishes the `value` column to the topic.
q = (
    stream_writer
    .queryName('my_stream_publish')
    .options(**kafka_sink_options)
    .start()
)

In [8]:
# Uncomment to stop the streaming query:
# q.stop()

In [9]:
# Uncomment to shut down the Spark session:
# spark.stop()