# Structured Streaming: Reading Data from Apache Kafka and Writing to the Console
## --> Word Count Example

In [1]:
# findspark locates the local Spark installation and makes `pyspark` importable
# from this kernel; it has to run before any `pyspark` import below.
import findspark
findspark.init()

### 1. Importing spark-sql-kafka dependency and configuration

In [2]:
import os

# Ask spark-submit to download the Kafka source for Structured Streaming.
# `--packages` takes Maven coordinates in the form groupId:artifactId:version,
# so the version must be separated from the artifact by a colon (a hyphen makes
# the coordinate unresolvable and `format("kafka")` fails later on).
# The artifact suffix (_2.11) is the Scala version and 2.3.1 the Spark version;
# both should match the local Spark installation.
# This variable is read when the JVM is launched, so it must be set before the
# SparkSession is created.
os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1  pyspark-shell"

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [4]:
# Build (or reuse) a session that runs Spark locally with four worker threads.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("ReadFromKafka")
    .getOrCreate()
)

### 2. Creating a Kafka Source for Streaming Queries on the Topic helloWorld

#### Single topic

In [5]:
# Streaming DataFrame subscribed to a single Kafka topic on the local broker.
# NOTE: the "multiple topic" cell that follows rebinds `df`; when the notebook
# is run top to bottom, the later definition is the one used downstream.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "helloWorld")
    .load()
)

#### Multiple topics

In [6]:
# Streaming DataFrame subscribed to several Kafka topics at once; the
# "subscribe" option takes a comma-separated list of topic names.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "helloWorld, deneme")
    .load()
)

#### Select Kafka source data as key and value

In [7]:
# Keep only the record key and value, cast to strings so the message text can
# be processed with string functions.
df2 = df.selectExpr(
    "CAST(key AS STRING)",
    "CAST(value AS STRING)",
)

#### Count words using GroupBy function

In [8]:
# Split every message on single spaces and emit one row per word.
words = df2.select(
    explode(split(col("value"), " ")).alias("value")
)

# Running count per distinct word, most frequent first.
word_count = (
    words
    .groupBy("value")
    .count()
    .sort(desc("count"))
)

### 3. Starting Streaming

In [9]:
# Start the streaming query: print the full, updated word-count table to the
# console on every trigger ("complete" output mode).
query = (
    word_count.writeStream
    .format("console")
    .outputMode("complete")
    .start()
)

In [None]:
# Block this cell until the streaming query terminates (stopped or failed);
# interrupt the kernel to end the wait.
query.awaitTermination()