# Structured Streaming

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from dotenv import load_dotenv
import os

In [None]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

HOST_ADDRESS = os.getenv("HOST_ADDRESS")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY")

# Fail fast when a required setting is absent. Without this check an unset
# HOST_ADDRESS is interpolated into the endpoint as "http://None:9000" and the
# S3A connection only fails much later, with a misleading error.
_missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("HOST_ADDRESS", HOST_ADDRESS),
        ("MINIO_ACCESS_KEY", MINIO_ACCESS_KEY),
        ("MINIO_SECRET_KEY", MINIO_SECRET_KEY),
    )
    if not setting_value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
    )

conf = SparkConf()

conf.setAppName("Sample Structured Streaming")

# S3A (MinIO) access: endpoint on port 9000, static access/secret keys and
# path-style addressing.
conf.set("spark.hadoop.fs.s3a.endpoint", f"http://{HOST_ADDRESS}:9000")
conf.set("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
conf.set("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
# Configuration values are strings; pass "true" explicitly rather than relying
# on the Python bool being stringified to "True".
conf.set("spark.hadoop.fs.s3a.path.style.access", "true")
conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set(
    "spark.hadoop.fs.s3a.aws.credentials.provider",
    "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
)

# Delta Lake SQL extension and catalog.
conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Hive metastore reached over Thrift.
conf.set("hive.metastore.uris", "thrift://metastore:9083")

# Build (or reuse) the session with Hive support enabled.
spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()

## Path configuration for streaming

In [5]:
# Location where the streaming query keeps its checkpoint data.
checkpoint_directory = "s3a://landing/pacientes/checkpoint"

# Glob matching the JSON files that feed the stream.
input_directory = "s3a://landing/pacientes/*.json"

## Main function

In [7]:
if __name__ == "__main__":
    # Explicit schema applied to the incoming JSON files (all fields nullable).
    jsonschema = StructType([
        StructField("idpacientes", IntegerType(), True),
        StructField("nome", StringType(), True),
        StructField("situacao", StringType(), True),
    ])

    # Streaming DataFrame over the JSON files matched by input_directory.
    df = spark.readStream.schema(jsonschema).json(input_directory)

    def update_postgres(df, BatchId):
        """Append one micro-batch to the ``tb_pacientes`` PostgreSQL table.

        Intended to be used as a ``foreachBatch`` callback.

        Args:
            df: DataFrame holding the rows of the current micro-batch.
            BatchId: Identifier of the micro-batch (required by the
                ``foreachBatch`` callback signature; not used here).

        Raises:
            Exception: Any error raised by the JDBC write is logged and then
                re-raised. Swallowing it would let the callback return
                normally, so the batch would be treated as successfully
                processed and its rows would never reach the table.
        """
        # NOTE(review): the JDBC URL, user and password are hard-coded here;
        # consider moving them to the environment like the MinIO settings.
        try:
            (
                df.write.format("jdbc")
                .option("url", "jdbc:postgresql://172.21.121.140:5435/Adventureworks")
                .option("dbtable", "tb_pacientes")
                .option("user", "postgres")
                .option("password", "postgres")
                .option("driver", "org.postgresql.Driver")
                .mode("append")
                .save()
            )
        except Exception as e:
            print(f"Error during batch write: {str(e)}")
            # Propagate so the streaming query fails instead of silently
            # dropping the batch.
            raise

In [None]:
# Start the stream: every 5 seconds, hand each micro-batch to update_postgres,
# tracking progress under checkpoint_directory.
streamingQuery = (
    df.writeStream
    .foreachBatch(update_postgres)
    .outputMode("append")
    .trigger(processingTime="5 seconds")
    .option("checkpointLocation", checkpoint_directory)
    .start()
)

# Block this cell until the query stops or fails.
streamingQuery.awaitTermination()