In [6]:
import yaml

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StructType, StructField, LongType, IntegerType, StringType, FloatType, BooleanType
import pyspark.sql.functions as F

from SongSparkStreaming import SongSparkStreaming
from utils import *

In [7]:
# Load the streaming settings once; every constant below is derived from this file.
with open('streaming_config.yml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Kafka section of the config (broker list, starting offsets, per-topic settings).
kafka_config = config['kafka']
BOOTSTRAP_SERVERS = kafka_config['bootstrap_servers']

# Source topic handled by this notebook, and its entry under kafka.topics.
TOPIC = 'listen_events'
topic_config = kafka_config['topics'][TOPIC]
SCHEMA = topic_config['schema']

# Value later used for spark-submit's --packages flag and passed to SongSparkStreaming.
SPARK_JARS_PACKAGES = config['spark']['spark_jars_packages']

STARTING_OFFSETS = kafka_config['starting_offsets']
# Columns handed to hash_key() when the stream is transformed.
STREAMING_KEY_COLUMNS = topic_config['key_columns']

In [8]:
import os

# PySpark starts its JVM gateway by running spark-submit with the arguments in
# PYSPARK_SUBMIT_ARGS. When the variable is overridden, the argument list still
# has to end with the `pyspark-shell` resource (PySpark's own default value);
# without it spark-submit has no primary resource and the gateway fails to
# start on a fresh kernel. Append it only if the configured value does not
# already contain it, so a config that includes the token is left untouched.
# This must run before the first SparkSession is created in this kernel.
submit_args = f'--packages {SPARK_JARS_PACKAGES}'
if 'pyspark-shell' not in submit_args.split():
    submit_args = f'{submit_args} pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args

In [11]:

# Streaming helper for the source topic; all settings come from the constants
# defined in the configuration cell.
streaming_options = dict(
    topic=TOPIC,
    schema=SCHEMA,
    spark_app_name='listen_events_transformed',
    spark_jars_packages=SPARK_JARS_PACKAGES,
    kafka_bootstrap_servers=BOOTSTRAP_SERVERS,
    starting_offsets=STARTING_OFFSETS,
    fail_on_data_loss=False,
)
song_spark_streaming = SongSparkStreaming(**streaming_options)

# Source stream as returned by the helper's read().
events_raw = song_spark_streaming.read()

# Transform data.
# NOTE(review): transform_epcho_to_timestamp and hash_key are not defined in
# this notebook (presumably they come from `from utils import *`). Presumably
# the first converts the epoch column 'ts' to a timestamp and the second adds
# a 'key' column derived from STREAMING_KEY_COLUMNS - confirm in utils.
events_with_ts = transform_epcho_to_timestamp(events_raw, 'ts')
events_keyed = hash_key(events_with_ts, STREAMING_KEY_COLUMNS).drop('value')
# Cast the 'key' column to a string type before handing the frame to the writer.
df = events_keyed.withColumn('key', events_keyed['key'].cast(StringType()))

# Start writing to the "<TOPIC>_transformed" topic; the returned handle is
# stopped in the next cell.
kafka_writer = song_spark_streaming.write_to_kafka(df, topic=f'{TOPIC}_transformed')

# Blocks this cell until a streaming query on the session terminates (or the
# cell is interrupted).
song_spark_streaming.spark.streams.awaitAnyTermination()

25/01/17 12:40:46 INFO ResolveWriteToStream: Checkpoint root checkpoint resolved to file:/opt/workspace/checkpoint.
25/01/17 12:40:46 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/01/17 12:40:46 INFO MicroBatchExecution: Starting [id = a2a965d9-551b-413a-94be-3a6232f9dd05, runId = c72f6e9e-3651-4c47-bcb2-2f4eebd96ae3]. Use file:/opt/workspace/checkpoint to store the query checkpoint.
25/01/17 12:40:46 INFO MicroBatchExecution: Reading table [org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@2a97f80d] from DataSourceV2 named 'kafka' [org.apache.spark.sql.kafka010.KafkaSourceProvider@245012f7]


25/01/17 12:40:46 INFO MicroBatchExecution: Resuming at batch 52 with committed offsets {KafkaV2[Subscribe[listen_events]]: {"listen_events":{"0":50860}}} and available offsets {KafkaV2[Subscribe[listen_events]]: {"listen_events":{"0":50860}}}
25/01/17 12:40:46 INFO MicroBatchExecution: Stream started from {KafkaV2[Subscribe[listen_events]]: {"listen_events":{"0":50860}}}
25/01/17 12:40:46 INFO ConsumerConfig: ConsumerConfig values: 
	allow.auto.create.topics = true
	auto.commit.interval.ms = 5000
	auto.offset.reset = earliest
	bootstrap.servers = [broker:29092, localhost:9092]
	check.crcs = true
	client.dns.lookup = use_all_dns_ips
	client.id = consumer-spark-kafka-source-8a11d89a-63d0-4b34-a3b7-164401a2c988-1721993561-driver-0-3
	client.rack = 
	connections.max.idle.ms = 540000
	default.api.timeout.ms = 60000
	enable.auto.commit = false
	exclude.internal.topics = true
	fetch.max.bytes = 52428800
	fetch.max.wait.ms = 500
	fetch.min.bytes = 1
	group.id = spark-kafka-source-8a11d89a-63d

In [12]:
# Stop the writer handle returned by write_to_kafka() in the previous cell.
# That cell blocks on awaitAnyTermination(), so this one only runs after the
# call returns or the cell is interrupted.
# NOTE(review): presumably kafka_writer is a PySpark StreamingQuery - confirm
# against SongSparkStreaming.write_to_kafka.
kafka_writer.stop()

25/01/17 12:41:42 INFO DAGScheduler: Asked to cancel job group c72f6e9e-3651-4c47-bcb2-2f4eebd96ae3
25/01/17 12:41:42 INFO ConsumerCoordinator: [Consumer clientId=consumer-spark-kafka-source-8a11d89a-63d0-4b34-a3b7-164401a2c988-1721993561-driver-0-3, groupId=spark-kafka-source-8a11d89a-63d0-4b34-a3b7-164401a2c988-1721993561-driver-0] Revoke previously assigned partitions listen_events-0
25/01/17 12:41:42 INFO AbstractCoordinator: [Consumer clientId=consumer-spark-kafka-source-8a11d89a-63d0-4b34-a3b7-164401a2c988-1721993561-driver-0-3, groupId=spark-kafka-source-8a11d89a-63d0-4b34-a3b7-164401a2c988-1721993561-driver-0] Member consumer-spark-kafka-source-8a11d89a-63d0-4b34-a3b7-164401a2c988-1721993561-driver-0-3-3d2435ba-c23a-4633-89bc-dfcb83f25f99 sending LeaveGroup request to coordinator broker:29092 (id: 2147483646 rack: null) due to the consumer is being closed
25/01/17 12:41:42 INFO Metrics: Metrics scheduler closed
25/01/17 12:41:42 INFO Metrics: Closing reporter org.apache.kafka.c

25/01/17 12:43:54 INFO Metrics: Metrics scheduler closed
25/01/17 12:43:54 INFO Metrics: Closing reporter org.apache.kafka.common.metrics.JmxReporter
25/01/17 12:43:54 INFO Metrics: Metrics reporters closed
25/01/17 12:43:54 INFO AppInfoParser: App info kafka.consumer for consumer-spark-kafka-source-4274dba1-3653-46a3-92ab-afc5a04ecbf2-1721993561-executor-2 unregistered
25/01/17 12:48:54 INFO CachedKafkaProducer: Closing the KafkaProducer with id: 73a91837-54b3-4f2a-a004-f61ea521c572.
25/01/17 12:48:54 INFO KafkaProducer: [Producer clientId=producer-1] Closing the Kafka producer with timeoutMillis = 9223372036854775807 ms.
25/01/17 12:48:54 INFO Metrics: Metrics scheduler closed
25/01/17 12:48:54 INFO Metrics: Closing reporter org.apache.kafka.common.metrics.JmxReporter
25/01/17 12:48:54 INFO Metrics: Metrics reporters closed
25/01/17 12:48:54 INFO AppInfoParser: App info kafka.producer for producer-1 unregistered
25/01/17 13:07:51 INFO BlockManagerInfo: Removed broadcast_0_piece0 on f