In [1]:
from pyspark.sql.functions import from_json, col, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql import SparkSession

In [2]:
def write_to_cassandra(target_df, batch_id):
    """Append one streaming micro-batch to Cassandra and print it.

    Intended to be used as a ``foreachBatch`` sink callback.

    Args:
        target_df: The micro-batch DataFrame to persist.
        batch_id: Micro-batch identifier supplied by ``foreachBatch``
            (not used here).

    The batch is written to table ``users`` in keyspace ``spark_db`` in
    append mode and then displayed with ``show()``.
    """
    # Two actions (save + show) run on the same batch. Cache it so the
    # upstream plan is not recomputed for the second action.
    target_df.persist()
    try:
        (
            target_df.write
            .format("org.apache.spark.sql.cassandra")
            .option("keyspace", "spark_db")
            .option("table", "users")
            .mode("append")
            .save()
        )
        target_df.show()
    finally:
        # Always release the cached batch, even if the write or show fails.
        target_df.unpersist()

In [3]:
# Build (or reuse) the local Spark session, configured for the Cassandra
# connector at localhost:9042.
spark = (
    SparkSession.builder
    .master("local[3]")
    .appName("Stream Table Join Demo")
    .config("spark.streaming.stopGracefullyOnShutdown", "true")
    .config("spark.sql.shuffle.partitions", 2)
    # Cassandra connection settings
    .config("spark.cassandra.connection.host", "localhost")
    .config("spark.cassandra.connection.port", "9042")
    # Cassandra SQL extensions and a catalog registered under the name "lh"
    .config("spark.sql.extensions", "com.datastax.spark.connector.CassandraSparkExtensions")
    .config("spark.sql.catalog.lh", "com.datastax.spark.connector.datasource.CassandraCatalog")
    .getOrCreate()
)

In [4]:
# Schema of the JSON login events; both fields are read as plain strings.
login_schema = StructType(
    [
        StructField(name="created_time", dataType=StringType()),
        StructField(name="login_id", dataType=StringType()),
    ]
)

In [5]:
# Streaming source: the "logins" Kafka topic on localhost:9092, read from the
# earliest available offsets.
kafka_source_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "logins")
    .option("startingOffsets", "earliest")
    .load()
)

In [6]:
# Cast the Kafka `value` column to a string and parse it as JSON using
# login_schema; the parsed struct is exposed as a single column named "value".
value_df = kafka_source_df.select(
    from_json(col("value").cast("string"), login_schema).alias("value")
)

In [7]:
# Flatten the parsed struct into top-level columns, then convert the
# created_time string (pattern "yyyy-MM-dd HH:mm:ss") into a timestamp.
login_df = (
    value_df
    .select("value.*")
    .withColumn(
        "created_time",
        to_timestamp(col("created_time"), "yyyy-MM-dd HH:mm:ss"),
    )
)

In [8]:
# Static (batch) side of the join: the spark_db.users table in Cassandra.
user_df = (
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .option("keyspace", "spark_db")
    .option("table", "users")
    .load()
)

In [9]:
# Match each login event to its user row on login_id; keep only matches.
join_type = "inner"
join_expr = login_df["login_id"] == user_df["login_id"]

In [10]:
# Stream-to-static join. The stream side's login_id is dropped afterwards so
# only the copy coming from user_df remains in the result.
joined_df = (
    login_df
    .join(other=user_df, on=join_expr, how=join_type)
    .drop(login_df["login_id"])
)

In [11]:
# Keep only the columns that are written back, renaming the event timestamp
# created_time to last_login.
output_df = joined_df.select(
    col("login_id"),
    col("user_name"),
    col("created_time").alias("last_login"),
)

In [12]:
# Start the streaming query: once a minute, each micro-batch is handed to
# write_to_cassandra.
output_query = (
    output_df.writeStream
    .outputMode("update")
    .option("checkpointLocation", "./chekpoint/join-cassandra")
    .trigger(processingTime="1 minute")
    .foreachBatch(write_to_cassandra)
    .start()
)

In [None]:
# Block here while the streaming query runs; per the PySpark API this call
# returns only once the query has been stopped or has failed.
output_query.awaitTermination()

+--------+---------+----------+
|login_id|user_name|last_login|
+--------+---------+----------+
+--------+---------+----------+

