In [9]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Stop a session left over from a previous run of this cell so that the
# builder below creates a new session rather than reusing the old one.
if "spark" in globals():
    spark.stop()

# The Kafka connector version has to match the Spark version in use. It was
# pinned to 3.0.1, but this notebook later calls DataStreamWriter.toTable,
# which only exists from Spark 3.1 onwards, so the version is derived from the
# installed pyspark instead. The Scala suffix (2.12) is still fixed; adjust it
# if the Spark build uses a different Scala version.
KAFKA_PACKAGE = f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark.__version__}"

spark = (
    SparkSession.builder
    .appName("Testando")
    .config("spark.jars.packages", KAFKA_PACKAGE)
    .getOrCreate()
)

# Keep the notebook output readable: only errors are logged.
spark.sparkContext.setLogLevel("ERROR")

spark

In [10]:
# Open a streaming source on the Kafka topic, reading from the earliest
# available offsets.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("subscribe", "meu_topico_kafka")
    .option("startingOffsets", "earliest")
    .load()
)

In [11]:
# Show the columns exposed by the Kafka source before any parsing
# (key and value arrive as binary).
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [12]:
# Sample Kafka message value (JSON): {"transaction-ts": "2023-05-02T13:13:27Z-03", "transaction-id": "17ba24ea-90af-4a8b-9215-de469daef0f0", "product-id": 381}
from pyspark.sql.types import *
from pyspark.sql.functions import col, from_json, to_timestamp

# Layout of the JSON document carried in the Kafka message value.
schema = StructType([
    StructField("transaction-ts", TimestampType()),
    StructField("transaction-id", StringType()),
    StructField("product-id", IntegerType()),
])

# Decode key and value from binary to text, then parse the value text into a
# struct column that follows the schema above.
key_as_text = col("key").cast("string")
value_as_struct = from_json(col("value").cast("string"), schema).alias("value")
df = df.select(key_as_text, value_as_struct)
df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- transaction-ts: timestamp (nullable = true)
 |    |-- transaction-id: string (nullable = true)
 |    |-- product-id: integer (nullable = true)



In [13]:
# Flatten the parsed struct into top-level columns. The guard keeps this cell
# safe to re-run: once "value" has been expanded the column no longer exists,
# and a second select("value.*") would raise an AnalysisException.
if "value" in df.columns:
    df = df.select("value.*")

# "transaction-ts" is already typed as a timestamp by the from_json schema, so
# no to_timestamp() conversion is applied here.
df.printSchema()

root
 |-- transaction-ts: timestamp (nullable = true)
 |-- transaction-id: string (nullable = true)
 |-- product-id: integer (nullable = true)



In [42]:
# Print each micro-batch to the notebook output for a quick visual check.
stream = (
    df.writeStream
    .format("console")
    .outputMode("update")
    .start()
)

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-------------------+--------------------+----------+
|     transaction-ts|      transaction-id|product-id|
+-------------------+--------------------+----------+
|2023-05-03 13:15:12|074c94a4-83e8-4c0...|       997|
|2023-05-03 13:15:12|8bce3904-0b55-4c1...|       122|
|2023-05-03 13:15:12|45a4bcf4-0b80-4d1...|        52|
|2023-05-03 13:15:12|55f0c81e-6e07-4da...|       552|
|2023-05-03 13:15:12|a147584a-921b-435...|       140|
|2023-05-03 13:15:12|d1773c05-bd1e-405...|       314|
|2023-05-03 13:15:12|583aadc0-3333-47b...|       362|
|2023-05-03 13:15:12|a0109594-34a9-43c...|       238|
|2023-05-03 13:15:12|cf8954b4-f976-4dd...|       411|
|2023-05-03 13:15:12|b266b127-cf73-437...|       277|
|2023-05-03 13:15:12|fe1b7c65-1af2-497...|       879|
|2023-05-03 13:15:12|0fe898fa-0aac-4f7...|       732|
|2023-05-03 13:15:13|5e307343-25d5-4e8...|       953|
|2023-05-03 13:15:13|03d2fb6c-80b9-4b9.

                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+-------------------+--------------------+----------+
|     transaction-ts|      transaction-id|product-id|
+-------------------+--------------------+----------+
|2023-05-03 13:17:07|464804c7-2484-4ab...|       730|
|2023-05-03 13:17:07|0900eda8-8c06-4cf...|       445|
|2023-05-03 13:17:07|55678541-ca8a-42e...|       846|
|2023-05-03 13:17:07|7c36d943-3943-4ca...|       865|
|2023-05-03 13:17:07|87a7c0d3-0150-489...|       815|
|2023-05-03 13:17:07|a641f89d-4bd9-4ab...|       580|
+-------------------+--------------------+----------+

-------------------------------------------
Batch: 5
-------------------------------------------
+-------------------+--------------------+----------+
|     transaction-ts|      transaction-id|product-id|
+-------------------+--------------------+----------+
|2023-05-03 13:17:07|6a5f4572-dec9-483...|       132|
|2023-05-03 13:17:07|604a8dff-3b91-4b0...|       

In [43]:
# Stop the streaming query currently bound to `stream`.
stream.stop()

-------------------------------------------
Batch: 121
-------------------------------------------
+-------------------+--------------------+----------+
|     transaction-ts|      transaction-id|product-id|
+-------------------+--------------------+----------+
|2023-05-03 13:17:21|25d388a3-124f-420...|       324|
|2023-05-03 13:17:21|3f9fd171-107b-498...|       803|
|2023-05-03 13:17:21|b3dcb013-c9b4-46e...|       743|
|2023-05-03 13:17:21|a9db964f-881f-421...|       724|
|2023-05-03 13:17:21|85c819d1-3caa-41a...|       955|
|2023-05-03 13:17:21|6db9b58b-c9cf-432...|       728|
+-------------------+--------------------+----------+



In [66]:
# Write the parsed records as Parquet files, triggering a micro-batch every
# 5 seconds; progress is tracked in the checkpoint location.
stream = (
    df.writeStream
    .format("parquet")
    .option("path", "s3a://my-bucket/output/spark-streaming-output")
    .option("checkpointLocation", "s3a://logs/spark-streaming-checkpoint/teste")
    .trigger(processingTime="5 seconds")
    .start()
)

                                                                                

In [17]:
# Stop the streaming query currently bound to `stream`.
stream.stop()

In [14]:
# Stream the parsed records into the catalog table default.table_stream,
# stored as Parquet. toTable() starts the query itself, so there is no
# separate start() call.
stream = (
    df.writeStream
    .format("parquet")
    .outputMode("append")
    .option("path", "s3a://my-bucket/output/spark-streaming-output/table_stream")
    .option("checkpointLocation", "s3a://logs/spark-streaming-checkpoint/table_stream")
    .trigger(processingTime="5 seconds")
    .toTable("default.table_stream")
)

Hive Session ID = 526689a2-08f0-47c4-bac3-b288d6fdf273
                                                                                

In [16]:
# Check the new table result. The table is resolved once and the same
# DataFrame is reused for the preview and the row count, instead of looking
# the table up twice.
table_stream_df = spark.read.table("default.table_stream")
table_stream_df.show()
print(f"{table_stream_df.count()} registros")

+-------------------+--------------------+----------+
|     transaction-ts|      transaction-id|product-id|
+-------------------+--------------------+----------+
|2023-05-03 17:17:00|b34948c3-2a6d-432...|       529|
|2023-05-03 17:17:00|dc6918e7-d00d-430...|       603|
|2023-05-03 17:17:00|342bec5d-591a-403...|        15|
|2023-05-03 17:17:00|4e1d9210-ddc0-444...|       531|
|2023-05-03 17:17:00|61b1bb84-883e-444...|       480|
|2023-05-03 17:17:00|eff1af99-b00d-404...|       436|
|2023-05-03 17:17:00|9494387a-d400-44c...|       916|
|2023-05-03 17:17:00|f8f3c599-84d8-4f3...|       635|
|2023-05-03 17:17:00|6fc7a170-616a-4e5...|       237|
|2023-05-03 17:17:00|d760f445-0519-4da...|       384|
|2023-05-03 17:17:00|86d4cde4-a818-4ec...|       155|
|2023-05-03 17:17:00|fd026e79-ae96-420...|       570|
|2023-05-03 17:17:00|52695ded-687d-498...|       671|
|2023-05-03 17:17:00|fdf4a670-98a8-4d0...|       488|
|2023-05-03 17:17:00|c39b97f2-01a6-4a6...|       500|
|2023-05-03 17:17:00|1089a48

[Stage 119:=====>                                                 (2 + 20) / 22]

39235 registros


                                                                                

In [14]:
# Inspect the table metadata; truncate=False keeps long values fully visible.
table_details = spark.sql("describe formatted default.table_stream")
table_details.show(truncate=False)

+----------------------------+--------------------------------------------------------------+-------+
|col_name                    |data_type                                                     |comment|
+----------------------------+--------------------------------------------------------------+-------+
|transaction-ts              |timestamp                                                     |null   |
|transaction-id              |string                                                        |null   |
|product-id                  |int                                                           |null   |
|                            |                                                              |       |
|# Detailed Table Information|                                                              |       |
|Database                    |default                                                       |       |
|Table                       |table_stream                                        

In [18]:
# Stop every streaming query that is still running before shutting the session
# down, so that no query is left active when the session is stopped.
for active_query in spark.streams.active:
    active_query.stop()

spark.stop()