# Demo: reading from our Kafka server
This notebook reads from topic 'noam' which contains a few strings.
If it works, it means that all the parts are working together. It does NOT mean there will be no performance problems down the road.

In [0]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.types import *
import os,time

In [0]:
%%time 
# Create the Spark session for this notebook, or reuse one that already exists.
# The spark-sql-kafka package version listed under spark.jars.packages must
# match the specific Spark version you run!
spark = (
    SparkSession.builder
    .appName("streaming")
    .config("spark.kryoserializer.buffer.max", "512m")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0")
    .getOrCreate()
)


CPU times: user 905 µs, sys: 0 ns, total: 905 µs
Wall time: 12.4 ms


In [0]:
# Bootstrap address (host:port) of the Kafka broker. Set the environment
# variable KAFKA_BOOTSTRAP_SERVERS to point the notebook at another broker
# without editing this cell; the demo server is the default.
kafka_server = os.environ.get("KAFKA_BOOTSTRAP_SERVERS", "kafka96224.eastus.cloudapp.azure.com:29092")
# The topic name where the data is stored; override with KAFKA_TOPIC.
topic = os.environ.get("KAFKA_TOPIC", "noam")

In [0]:
# DDL schema applied to each message payload: a single string column.
SCHEMA = "name STRING"

# Batch (non-streaming) read of the topic through the Kafka source, starting
# from the earliest available offset. failOnDataLoss is switched off so that
# missing offsets do not abort the read (see the Spark Kafka integration guide).
static_df = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_server)
    .option("subscribe", topic)
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", False)
    .load()
)

# The Kafka `value` column arrives as raw bytes. Decode it as UTF-8 rather than
# US-ASCII: the result is identical for pure-ASCII payloads, but non-ASCII
# characters are preserved instead of becoming replacement characters.
# The decoded text is then parsed as one CSV record and flattened into columns.
retail_data = (
    static_df
    .select(f.from_csv(f.decode("value", "UTF-8"), schema=SCHEMA).alias("value"))
    .select("value.*")
)

In [0]:
%%time 
# Report how many parsed records were read, then preview the first five rows.
print("%d records in frame" % (retail_data.count(),))
retail_data.show(n=5)

5 records in frame
+----------------+
|            name|
+----------------+
|             one|
|             two|
|           three|
|four four1 four2|
|             six|
+----------------+

CPU times: user 8.98 ms, sys: 0 ns, total: 8.98 ms
Wall time: 7.57 s


In [0]:
# Show the raw Kafka records (key, value bytes, topic, partition, offset,
# timestamp) using show()'s defaults spelled out: up to 20 rows, long cells truncated.
static_df.show(n=20, truncate=True)

+----+--------------------+-----+---------+------+--------------------+-------------+
| key|               value|topic|partition|offset|           timestamp|timestampType|
+----+--------------------+-----+---------+------+--------------------+-------------+
|null|          [6F 6E 65]| noam|        0|     0|2023-06-16 14:15:...|            0|
|null|          [74 77 6F]| noam|        0|     1|2023-06-16 14:15:...|            0|
|null|    [74 68 72 65 65]| noam|        0|     2|2023-06-16 14:15:...|            0|
|null|[66 6F 75 72 20 6...| noam|        0|     3|2023-06-16 14:15:...|            0|
|null|          [73 69 78]| noam|        0|     4|2023-06-16 14:15:...|            0|
+----+--------------------+-----+---------+------+--------------------+-------------+

