In [6]:
import yaml

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StructType, StructField, LongType, IntegerType, StringType, FloatType, BooleanType
import pyspark.sql.functions as F

from SongSparkStreaming import SongSparkStreaming
from utils import *

In [7]:
# Load the streaming settings. The encoding is pinned to UTF-8 so the file is
# decoded the same way on every platform instead of depending on the locale's
# default text encoding.
with open('streaming_config.yml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Kafka connection and per-topic settings for the topic handled by this notebook.
BOOTSTRAP_SERVERS = config['kafka']['bootstrap_servers']
TOPIC = 'listen_events'
SCHEMA = config['kafka']['topics'][TOPIC]['schema']
# Value later handed to spark-submit's --packages option and to SongSparkStreaming.
SPARK_JARS_PACKAGES = config['spark']['spark_jars_packages']
STARTING_OFFSETS = config['kafka']['starting_offsets']
# Columns passed to hash_key() when the stream is transformed.
STREAMING_KEY_COLUMNS = config['kafka']['topics'][TOPIC]['key_columns']

In [8]:
import os

# PySpark reads PYSPARK_SUBMIT_ARGS when it launches the JVM gateway and hands
# the value to spark-submit as its argument list. That list has to end with the
# special `pyspark-shell` resource; without it spark-submit has no application
# resource and the gateway fails to start.
# NOTE(review): this must run before the Spark session is created (presumably
# inside SongSparkStreaming) - confirm.
os.environ['PYSPARK_SUBMIT_ARGS'] = f'--packages {SPARK_JARS_PACKAGES} pyspark-shell'

In [None]:

# Helper that owns the Spark session and the Kafka connection settings for the
# source topic.
song_spark_streaming = SongSparkStreaming(
    topic=TOPIC,
    schema=SCHEMA,
    spark_app_name='listen_events_transformed',
    spark_jars_packages=SPARK_JARS_PACKAGES,
    kafka_bootstrap_servers=BOOTSTRAP_SERVERS,
    starting_offsets=STARTING_OFFSETS,
    fail_on_data_loss=False,
)

# Transform data: read the stream, run the two helpers from utils over it
# (presumably epoch -> timestamp conversion of 'ts' and building a hashed 'key'
# from the configured key columns - verify against utils), then drop the
# 'value' column and cast 'key' to a string.
df = hash_key(
    transform_epcho_to_timestamp(song_spark_streaming.read(), 'ts'),
    STREAMING_KEY_COLUMNS,
)
df = df.drop('value').withColumn('key', F.col('key').cast(StringType()))

# Publish the transformed records to the "<TOPIC>_transformed" topic.
kafka_writer = song_spark_streaming.write_to_kafka(df, topic=f'{TOPIC}_transformed')

# Blocks this cell until any active streaming query terminates.
song_spark_streaming.spark.streams.awaitAnyTermination()

In [None]:
# Stop the handle returned by write_to_kafka(). The previous cell blocks on
# awaitAnyTermination(), so this cell can only run once that call has returned
# or been interrupted.
# NOTE(review): presumably kafka_writer is a started StreamingQuery - confirm
# against SongSparkStreaming.write_to_kafka.
kafka_writer.stop()