<div style="text-align: center; line-height: 0; padding-top: 2px;">
  <img src="https://www.quantiaconsulting.com/logos/quantia_logo_orizz.png" alt="Quantia Consulting" style="width: 600px; height: 250px">
</div>

# Write a PySpark consumer for an Avro topic

An automatic data generator is continuously writing to the Kafka topic `ratings`.

`ratings` is an Avro topic: its messages are Avro-encoded.

Write a PySpark consumer for this topic.

## Getting Started

Let's start by importing the libraries and creating some useful variables.

In [None]:
%load_ext autotime

In [None]:
import os
import qcutils
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
import io
from avro.io import DatumReader, BinaryDecoder
from pyspark.sql.functions import *
import time
import json
import avro.schema
import struct
import requests 

# Extra packages (Avro, Kafka connectors, AWS/Hadoop) handed to the PySpark
# launcher. The variable is set before the session is built below.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages '
    'org.apache.spark:spark-avro_2.11:2.4.5,'
    'org.apache.spark:spark-streaming-kafka-0-10_2.11:2.4.5,'
    'org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5,'
    'org.apache.kafka:kafka-clients:2.4.1,'
    'com.amazonaws:aws-java-sdk:1.7.4,'
    'org.apache.hadoop:hadoop-aws:2.7.5'
    ' pyspark-shell'
)

# Local session using every available core; qcutils then applies its own
# initialisation to that session.
spark = SparkSession.builder.master("local[*]").appName("test").getOrCreate()
qcutils.init_spark_session(spark)

# Last expression of the cell: renders the session summary in the notebook.
spark

In [None]:
from pyspark.sql.functions import *
from confluent_kafka.avro.serializer.message_serializer import MessageSerializer
from confluent_kafka.avro.cached_schema_registry_client import CachedSchemaRegistryClient

# One deserializer per Schema Registry URL, created lazily on first use.
# Keeping the instance around preserves the registry client's schema cache,
# so schemas are not fetched again for every single record.
_AVRO_DESERIALIZERS = {}


@udf("string")
def from_avro(value, sr_url):
    """Decode one Confluent-framed Avro message using the Schema Registry.

    Args:
        value: raw message payload as read from the Kafka `value` column.
        sr_url: base URL of the Schema Registry holding the writer schemas.

    Returns:
        Whatever `MessageSerializer.decode_message` returns for the payload;
        Spark then stores it in a string column.
    """
    # NOTE(review): decode_message yields a Python object (presumably a dict
    # for record schemas) that Spark coerces to a string; wrap the result in
    # json.dumps if downstream consumers expect JSON -- confirm before changing.
    deserializer = _AVRO_DESERIALIZERS.get(sr_url)
    if deserializer is None:
        schema_registry = CachedSchemaRegistryClient({'url': sr_url})
        deserializer = MessageSerializer(schema_registry)
        _AVRO_DESERIALIZERS[sr_url] = deserializer
    return deserializer.decode_message(value)

In [None]:
topic = 'ratings'

# Kafka broker address and Schema Registry URL are read from the qcutils configuration.
kafka_host = qcutils.read_config_value("kafka.server")
kafka_port = qcutils.read_config_value("kafka.port")
servers = kafka_host + ":" + str(kafka_port)
sr_url = qcutils.read_config_value("kafka.schema_registry.url")

# Source: subscribe to the topic as a stream, starting from the earliest offsets.
kafka_source = spark.readStream.format("kafka")
kafka_source = kafka_source.option("kafka.bootstrap.servers", servers)
kafka_source = kafka_source.option("startingOffsets", "earliest")
kafka_source = kafka_source.option("subscribe", topic)
avro_df = kafka_source.load()

# Transform: decode the Avro payload of each record into a single column "v".
decoded_value = from_avro(col("value"), lit(sr_url)).alias("v")
output_df = avro_df.select(decoded_value)

# Sink: append the decoded records to Parquet files, with a checkpoint directory.
parquet_sink = output_df.writeStream.outputMode("append").format("parquet")
parquet_sink = parquet_sink.option("path", "/home/jovyan/data/pyspark/ratings.parquet")
parquet_sink = parquet_sink.option("checkpointLocation", "/home/jovyan/data/pyspark/checkpoint/ratings")
query = parquet_sink.start()

# For a time-boxed run, replace the call below with `time.sleep(30)` followed
# by `query.stop()`.
query.awaitTermination()

##### ![Quantia Tiny Logo](https://www.quantiaconsulting.com/logos/quantia_logo_tiny.png) 2020 Quantia Consulting, srl. All rights reserved.