In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, expr
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType, IntegerType, ArrayType

In [15]:
# Create (or reuse, via getOrCreate) the SparkSession used by every cell below.
# `SparkSession` lives in `pyspark.sql` and must be imported in the imports
# cell; without that import this cell raises NameError on a fresh kernel.
spark = (
    SparkSession.builder
    .appName("File Streaming Demo")
    .master("local[3]")  # local mode, master URL "local[3]"
    .config("spark.streaming.stopGracefullyOnShutdown", "true")
    .getOrCreate()
)

In [16]:
# Expected layout of the JSON customer records carried in the Kafka message
# value. Every field keeps the default nullability (nullable = true).
schema = (
    StructType()
    .add("id", StringType())
    .add("name", StringType())
    .add("age", IntegerType())
    .add("address", StringType())
    .add("email", StringType())
    .add("CreatedTime", LongType())
)

In [17]:
# Streaming source: read the "customer-data" topic from the Kafka broker at
# localhost:9092, starting from the earliest available offsets. The
# failOnDataLoss option is explicitly switched off.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "customer-data")
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", False)
    .load()
)

In [18]:
# Cast the Kafka `value` column to a string and parse it as JSON against
# `schema`; the parsed record is kept as a single struct column named "value".
value_df = kafka_df.select(
    from_json(
        col("value").cast("string"),
        schema,
    ).alias("value")
)

In [19]:
# Sanity check: the parsed stream should expose one `value` struct whose
# fields mirror `schema`.
value_df.printSchema()

root
 |-- value: struct (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- age: integer (nullable = true)
 |    |-- address: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- CreatedTime: long (nullable = true)



In [20]:
# Flatten the fields needed for the notification out of the `value` struct and
# derive an approximate birth year from the customer's age.
# The reference year comes from Spark's current_date() rather than a
# hard-coded 2023, so the derived value does not go stale as time passes.
notification_df = (
    value_df
    .select("value.id", "value.name", "value.age")
    .withColumn("BirthYear", expr("year(current_date()) - age"))
)

In [21]:
# Sanity check: expect the flat columns id, name, age plus the derived BirthYear.
notification_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- BirthYear: integer (nullable = true)



In [22]:
# Shape the rows as Kafka-style records: `key` is the customer id and `value`
# is a JSON document with Name, Age and BirthYear.
# BirthYear is taken from the column already computed on `notification_df`
# instead of repeating the year arithmetic here, so the two cannot drift apart.
kafka_target_filtered_df = notification_df.selectExpr(
    "id as key",
    """to_json(named_struct(
        'Name', name,
        'Age', age,
        'BirthYear', BirthYear)) as value""",
)

In [23]:
# kafka_target_filtered_df = kafka_target_filtered_df.filter("age > 30")
# Add a CSV with location data to do a join.

 #schema_csv=(StructType().add("location_code",StringType()).add("location_region",StringType()).add("location_country",StringType()))
# locations = spark.read.format("csv")\
#     .option("header", True)\
#     .option("header", True)\
#     .schema(schema_csv)\
#     .load("locations.csv").alias("locations")

#kafka_target_filtered_joined_df = kafka_target_filtered_df.join(location, kafka_target_filtered_df["location_code"]==locations["location_code"], "left")

# kafka_target_final_df = kafka_target_filtered_joined_df.groupBy("language_region")\
#     .agg(sum("favorite_count"), count("id"))

In [None]:
# Start the streaming query: append each micro-batch to the console sink
# without truncating long values, tracking progress in the checkpoint directory.
notification_writer_query = (
    kafka_target_filtered_df.writeStream
    .format("console")
    .outputMode("append")
    .option("truncate", "false")
    .option("checkpointLocation", "./checkpoints/checkpointTEST/")
    .start()
)

# Block this cell until the query ends. If the wait is interrupted (e.g. the
# kernel is interrupted) or the query fails, make sure the query is stopped so
# it does not keep running in the background and hold the checkpoint location
# when the cell is re-run.
try:
    notification_writer_query.awaitTermination()
finally:
    notification_writer_query.stop()

In [None]:
# notification_writer_query = kafka_target_filtered_df \
#         .writeStream \
#         .queryName("Notification Writer") \
#         .format("kafka") \
#         .option("kafka.bootstrap.servers", "localhost:9092") \
#         .option("topic", "notifications") \
#         .outputMode("append") \
#         .option("checkpointLocation", "./checkpoints/checkpoint-stream-kafka-to-kafka2/") \
#         .start()

In [None]:
# notification_writer_query.awaitTermination()