In [1]:
# Restore `params` and `secrets` from IPython's %store persistence; later cells
# index into both. NOTE(review): they must have been saved with `%store`
# beforehand (presumably by a separate setup notebook) — confirm.
%store -r params
%store -r secrets

In [2]:
import json
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.avro.functions import to_avro, from_avro
import os

In [3]:
# getOrCreate() hands back the already-running session when one exists, in
# which case only runtime SQL configuration set here takes effect.
spark = (
    SparkSession.builder
    .appName("Apply Engine")
    .getOrCreate()
)

24/03/31 13:07:25 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
def read_schema(schema_file):
    """Build a Spark ``StructType`` from a JSON schema file.

    Args:
        schema_file: Path to a file holding the JSON representation of a
            Spark schema.

    Returns:
        The result of ``StructType.fromJson`` applied to the parsed JSON.
    """
    with open(schema_file, 'r') as schema_fh:
        parsed = json.load(schema_fh)
    # The parsed JSON is fully in memory, so the file can be closed first.
    return StructType.fromJson(parsed)
        

In [5]:
# Directory the streaming CSV reader loads from.
mainframe_landing_dir = params["mainframe_landing"]
# JSON file describing the Spark schema (parsed by read_schema).
schema_file = params["spark_schema_file"]

In [6]:
# Spark schema applied to the incoming CSV files.
schema = read_schema(schema_file)

# Raw text of the Avro schema; get_avro_kafka_df passes it to to_avro().
with open(params["avro_schema_file"]) as avsc_file:
    avsc = avsc_file.read()


In [7]:
# Stream CSV files from the mainframe landing directory with an explicit schema.
#   header             - each file starts with a header row.
#   maxFilesPerTrigger - caps how many new files each micro-batch picks up.
#                        The option name is plural: the earlier spelling
#                        "maxFilePerTrigger" is not a recognized file-source
#                        option, so the limit of 2 was never applied.
#   cleanSource        - "archive" moves completed files into sourceArchiveDir.
df = (
    spark.readStream.format("csv")
    .option("cleanSource", "archive")
    .option("sourceArchiveDir", params["apply_archive"])
    .option("maxFilesPerTrigger", 2)
    .option("header", True)
    .schema(schema)
    .load(mainframe_landing_dir)
)

In [8]:
# Replace `dob` with its string form.
# NOTE(review): presumably the Avro schema declares dob as a string — confirm.
df = df.withColumn(
    "dob",
    df["dob"].cast(StringType()),
)

In [9]:
def get_avro_kafka_df(src_df, key):
    """Shape a DataFrame into the key/value layout used by the Kafka sink.

    Args:
        src_df: Source DataFrame; every column is packed into the value.
        key: SQL expression (typically a column name) used as the record key.

    Returns:
        A DataFrame with two columns: ``key`` cast to string, and ``value``
        holding all source columns as a struct encoded with ``to_avro`` using
        the notebook-level ``avsc`` schema text.
    """
    keyed = src_df.selectExpr(f"{key} as key", "(struct(*)) as value")
    string_key = keyed.key.cast(StringType())
    # avsc is read from the enclosing notebook scope, not passed in.
    avro_value = to_avro(keyed.value, avsc)
    return keyed.withColumn("key", string_key).withColumn("value", avro_value)

In [10]:
# Key each record by its `id` column; the value is the Avro-encoded row.
avro_df = get_avro_kafka_df(src_df=df, key="id")

In [11]:
# Checkpoint directory handed to the Kafka writeStream below.
write_checkpoint_location = params["checkpoints"]["apply_write"]

In [14]:
# Start the streaming write of the key/value DataFrame to Kafka over SASL_SSL
# with the PLAIN mechanism. Credentials come from `secrets` and are only
# interpolated into the JAAS config string, never hardcoded here.
sQuery = (
    avro_df.writeStream
    .queryName("apply-engine")
    .format("kafka")
    .outputMode("append")
    .option("kafka.bootstrap.servers", secrets["RED_KAFKA_SERVERS"])
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.sasl.mechanism", "PLAIN")
    .option(
        "kafka.sasl.jaas.config",
        "org.apache.kafka.common.security.plain.PlainLoginModule required username='{}' password='{}';".format(
            secrets["RED_KAFKA_USERNAME"],
            secrets["RED_KAFKA_PASSWORD"],
        ),
    )
    .option("topic", params["topic"]["red_landing"])
    .option("checkpointLocation", write_checkpoint_location)
    .start()
)
        

24/03/31 13:14:39 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                