In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

from pyspark.sql import Window
from pyspark.sql.types import StructType, StructField, TimestampType, LongType, IntegerType, StringType, ArrayType
import os

In [2]:
# Maven coordinates of the Kafka connector that Spark pulls in at start-up.
# NOTE(review): the suffixes pin a Scala 2.11 / 2.4.0 build — presumably this has
# to match the installed Spark distribution; confirm before upgrading.
kafka_connector_package = "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0"

# Assemble the builder step by step, then obtain (or reuse) the session.
# NOTE(review): the app name reads 'kafka-stram', possibly a typo for
# 'kafka-stream'; it is kept unchanged here.
session_builder = SparkSession.builder.appName('kafka-stram')
session_builder = session_builder.config("spark.jars.packages", kafka_connector_package)

spark = session_builder.getOrCreate()

In [3]:
# Resolve the project directories relative to the notebook's working directory.
base_path = os.getcwd()

# The project root is taken to be three directory levels above the working
# directory. The climb is done with os.path / os.pardir instead of splitting on
# a literal '/', so it works with whatever path separator the platform uses
# (the string split produced an empty root on non-POSIX separators).
project_path = os.path.normpath(
    os.path.join(base_path, os.pardir, os.pardir, os.pardir)
)

# Directory of parquet files that serves as the streaming source.
stream_input_path = os.path.join(project_path, 'output', 'questions-transformed')

# Directory where the streaming query keeps its checkpoint data.
checkpoint_location = os.path.join(
    project_path, 'output', 'streaming-output', 'checkpoint', '1'
)

In [4]:
# Schema applied to the streaming parquet source: a long `question_id` and an
# array-of-strings `tags` column (both nullable, the StructType default).
schema = (
    StructType()
    .add('question_id', LongType())
    .add('tags', ArrayType(StringType()))
)

In [5]:
# Streaming DataFrame over the parquet files in `stream_input_path`, read with
# the explicit `schema` defined earlier in the notebook.
parquet_stream_reader = spark.readStream.format('parquet').schema(schema)

questionsDF = parquet_stream_reader.load(stream_input_path)

In [6]:
# Shape each question into the `key` / `value` columns written to Kafka:
# the id as a string key, and the whole record serialised as JSON for the value.
kafka_records = questionsDF.selectExpr(
    "CAST(question_id AS STRING) AS key",
    "to_json(struct(question_id, tags)) AS value",
)

# Sink settings for the Kafka writer, gathered in one place.
kafka_sink_options = {
    'topic': 'questions',
    'kafka.bootstrap.servers': 'localhost:9092',
    'checkpointLocation': checkpoint_location,
}

# Start the streaming query; `x` is the handle used to inspect it afterwards.
x = (
    kafka_records
    .writeStream
    .queryName('questions-query')
    .format('kafka')
    .options(**kafka_sink_options)
    .start()
)

In [8]:
# Show the most recent progress report of the streaming query `x`; the output
# below lists its batch id, row counts, timings, source and sink.
x.lastProgress

{'id': 'db41b592-a58b-4536-a7d1-de46c4d12f1c',
 'runId': '68e67463-d2c9-4186-99a6-6a6fdedc974e',
 'name': 'questions-query',
 'timestamp': '2019-05-11T15:33:06.196Z',
 'batchId': 1,
 'numInputRows': 0,
 'inputRowsPerSecond': 0.0,
 'processedRowsPerSecond': 0.0,
 'durationMs': {'getOffset': 2, 'triggerExecution': 2},
 'stateOperators': [],
 'sources': [{'description': 'FileStreamSource[file:/home/david/From-Simple-Transformations-to-Highly-Efficient-Jobs/output/questions-transformed]',
   'startOffset': {'logOffset': 0},
   'endOffset': {'logOffset': 0},
   'numInputRows': 0,
   'inputRowsPerSecond': 0.0,
   'processedRowsPerSecond': 0.0}],
 'sink': {'description': 'org.apache.spark.sql.kafka010.KafkaSourceProvider@185bc2d2'}}