In [14]:
from pyspark.sql.functions import from_json, col, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql import SparkSession

In [15]:
def write_to_mysql(df, batch_id):
    """Append one micro-batch to the MySQL ``customer_search`` table and print it.

    Written to be passed to ``DataStreamWriter.foreachBatch``.

    Args:
        df: DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch supplied by ``foreachBatch``
            (not used here).

    The JDBC URL, user and password may be overridden with the environment
    variables ``MYSQL_URL``, ``MYSQL_USER`` and ``MYSQL_PASSWORD``; when they
    are unset the local development defaults below are used.
    """
    import os

    jdbc_url = os.environ.get("MYSQL_URL", "jdbc:mysql://127.0.0.1:3306/spark_db")
    jdbc_user = os.environ.get("MYSQL_USER", "root")
    jdbc_password = os.environ.get("MYSQL_PASSWORD", "root")

    # The batch is evaluated twice (JDBC write, then show); cache it so the
    # source is not re-read for the second action.
    df.persist()
    try:
        # Without an explicit mode the writer uses "errorifexists", which makes
        # every batch after the table has been created fail. Each micro-batch
        # must be appended instead.
        df.write \
            .format("jdbc") \
            .mode("append") \
            .option("driver", "com.mysql.cj.jdbc.Driver") \
            .option("url", jdbc_url) \
            .option("dbtable", "customer_search") \
            .option("user", jdbc_user) \
            .option("password", jdbc_password) \
            .save()
        df.show()
    finally:
        df.unpersist()

In [16]:
# Local Spark session (3 worker threads) with the MySQL JDBC connector jar
# attached so the foreachBatch sink can load the driver.
spark = (
    SparkSession.builder
    .master("local[3]")
    .appName("Stream Table Join Demo")
    .config("spark.streaming.stopGracefullyOnShutdown", "true")
    .config("spark.sql.shuffle.partitions", 2)
    .config("spark.jars", "mysql-connector-java-8.0.13.jar")
    .getOrCreate()
)

In [17]:
# Schema of the JSON search events; every field is read as a string
# (search_date is converted to a timestamp in a later cell).
search_schema = StructType([
    StructField(field_name, StringType())
    for field_name in (
        "id",
        "customer_id",
        "customer_name",
        "product_searched",
        "search_date",
        "country_name",
        "state",
    )
])

In [18]:
# Streaming read of the "product-customer-qty" topic from the local broker,
# starting at the earliest offsets and not failing on data loss.
kafka_source_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "product-customer-qty")
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", False)
    .load()
)

In [19]:
# Cast the Kafka `value` column to a string and parse it as JSON using
# search_schema; the parsed struct is exposed as a single column named "value".
value_df = kafka_source_df.select(
    from_json(col("value").cast("string"), search_schema).alias("value")
)

In [20]:
# Flatten the parsed struct into top-level columns and convert search_date
# from its string form into a timestamp.
prod_customer_df = (
    value_df
    .select("value.*")
    .withColumn(
        "search_date",
        to_timestamp(col("search_date"), "yyyy-MM-dd HH:mm:ss"),
    )
)

In [21]:
# Print the schema of the flattened stream to check the columns and the
# search_date conversion before wiring up the sink.
prod_customer_df.printSchema()

In [22]:
# Columns written to the sink, in output order.
output_df = prod_customer_df.select(*[
    col(column_name)
    for column_name in (
        "id",
        "customer_id",
        "customer_name",
        "product_searched",
        "search_date",
        "country_name",
        "state",
    )
])
# Disabled: optional integer casts that were chained onto the select above.
#     .withColumn("idNum", col("customer_id").cast(IntegerType()))
#     .withColumn("id",    col("id").cast(IntegerType()))

In [23]:
# To view the data in the console instead of writing to MySQL:
# notification_writer_query = output_df.writeStream \
#         .format("console") \
#         .outputMode("append") \
#         .option("truncate", "false") \
#         .option("checkpointLocation", "./checkpoints/cassandra-proj/") \
#         .start()

# # notification_writer_query.awaitTermination()

In [24]:
#Aggregations Val

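# (requires `count` from pyspark.sql.functions; the alias belongs on the column, not on the DataFrame)
# agg_output_df = output_df.groupBy("country_name", "product_searched")\
#       .agg(count("id").alias("search_qty"))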

# agg_search_locations_df = output_df.groupBy("name")\
#      .agg(sum("idNum"), count("id"))

In [25]:
# Start the streaming query: every minute each micro-batch of output_df is
# handed to write_to_mysql; progress is tracked under the checkpoint directory.
query = (
    output_df.writeStream
    .foreachBatch(write_to_mysql)
    .outputMode("update")
    .option("checkpointLocation", "./checkpoints/mysql-proj")
    .trigger(processingTime="1 minute")
    .start()
)

In [26]:
# Block this cell until the streaming query stops or fails; interrupt the
# kernel to end the stream.
query.awaitTermination()