In [0]:
%pip install great_expectations==1.10.0

In [0]:
#dbutils.library.restartPython()

In [0]:
%run "./01_config"

In [0]:
# Shared validation helpers; Bronze._write_stream_append calls
# gec.validate_and_insert_process_batch for every micro-batch.
import great_expectations_common as gec
# Debug aid: print the names the imported module exposes to this notebook.
print(dir(gec))

In [0]:
class Bronze:
    """Bronze-layer ingestion for the landing zone.

    Each ``consume_*`` method opens a ``cloudFiles`` streaming reader over one
    landing-zone folder, tags every row with ``load_time`` and ``source_file``
    and starts a streaming query that hands each micro-batch to
    ``gec.validate_and_insert_process_batch``.

    Relies on names defined outside this class: the notebook-global ``spark``
    session, ``Config`` (presumably provided by the ``01_config`` notebook --
    confirm) and the ``gec`` module alias.
    """

    def __init__(self, env):
        """Resolve paths and catalog names for ``env`` and switch the session to them.

        Args:
            env: Environment token embedded in the catalog name
                (``sbit_<env>_catalog``).
        """
        self.Conf = Config()
        self.env = env
        # NOTE(review): plain concatenation assumes Conf.landing and
        # Conf.project_dir already end with a path separator -- confirm in Config.
        self.landing_zone = self.Conf.landing + "raw"
        self.checkpoint_base = self.Conf.project_dir + "checkpoints"
        self.catalog = f"sbit_{env}_catalog"
        self.db_name = self.Conf.db_name
        spark.sql(f"USE {self.catalog}.{self.db_name}")

    def _read_landing(self, folder, schema, file_format, **format_options):
        """Build the cloudFiles streaming reader shared by every bronze feed.

        Args:
            folder: Sub-folder of ``self.landing_zone`` to read from.
            schema: DDL schema string applied to the incoming files.
            file_format: Value for the ``cloudFiles.format`` option.
            **format_options: Extra reader options (e.g. ``header``, ``multiLine``).

        Returns:
            A streaming DataFrame with ``load_time`` and ``source_file`` appended.
        """
        from pyspark.sql import functions as F

        reader = (spark.readStream
                    .format("cloudFiles")
                    .schema(schema)
                    .option("maxFilesPerTrigger", 1)
                    .option("cloudFiles.format", file_format))
        for option_name, option_value in format_options.items():
            reader = reader.option(option_name, option_value)

        return (reader
                    .load(self.landing_zone + "/" + folder)
                    .withColumn("load_time", F.current_timestamp())
                    .withColumn("source_file", F.col("_metadata.file_path")))

    def consume_user_registration(self, once: bool = True, processing_time: str = "5 seconds"):
        """Start the ingestion stream for the ``registered_users_bz`` CSV feed.

        Args:
            once: Forwarded to ``_write_stream_append`` to pick the trigger.
            processing_time: Trigger interval used when ``once`` is false.

        Returns:
            The handle returned by ``_write_stream_append``.
        """
        schema = "user_id string, device_id string, mac_address string, registration_timestamp double"
        df_stream = self._read_landing("registered_users_bz", schema, "csv", header="true")
        return self._write_stream_append(df_stream, "registered_users_bz", "registered_users_bz_ingestion_stream", "bronze_p1", once, processing_time)

    def consume_gym_logins(self, once: bool = True, processing_time: str = "5 seconds"):
        """Start the ingestion stream for the ``gym_logins_bz`` CSV feed.

        Args:
            once: Forwarded to ``_write_stream_append`` to pick the trigger.
            processing_time: Trigger interval used when ``once`` is false.

        Returns:
            The handle returned by ``_write_stream_append``.
        """
        schema = "mac_address string, gym bigint, login double, logout double"
        df_stream = self._read_landing("gym_logins_bz", schema, "csv", header="true")
        return self._write_stream_append(df_stream, "gym_logins_bz", "gym_logins_bz_ingestion_stream", "bronze_p1", once, processing_time)

    def consume_kafka_multiplex(self, once: bool = True, processing_time: str = "5 seconds"):
        """Start the ingestion stream for the ``kafka_multiplex_bz`` JSON feed.

        Each record is left-joined to the ``date_lookup`` table on the calendar
        date of its ``timestamp``, which adds the ``date`` and ``week_part``
        columns to the stream.

        Args:
            once: Forwarded to ``_write_stream_append`` to pick the trigger.
            processing_time: Trigger interval used when ``once`` is false.

        Returns:
            The handle returned by ``_write_stream_append``.
        """
        from pyspark.sql import functions as F

        schema = "key string, value string, topic string, partition bigint, offset bigint, timestamp timestamp"
        df_date_lookup = spark.table(f"{self.catalog}.{self.db_name}.date_lookup").select("date", "week_part")

        df_stream = (self._read_landing("kafka_multiplex_bz", schema, "json", multiLine="true")
                        .join(F.broadcast(df_date_lookup),
                              F.to_date(F.col("timestamp")) == F.col("date"),
                              "left"))
        return self._write_stream_append(df_stream, "kafka_multiplex_bz", "kafka_multiplex_bz_ingestion_stream", "bronze_p1", once, processing_time)

    def _write_stream_append(self, df, path, query_name, pool, once, processing_time):
        """Start a streaming query that passes each micro-batch to the ``gec`` handler.

        Args:
            df: Streaming DataFrame to write.
            path: Used both as the target table name and as the checkpoint
                sub-folder under ``self.checkpoint_base``.
            query_name: Name given to the streaming query.
            pool: Value set for the ``spark.scheduler.pool`` local property
                before the query is started.
            once: When true, use the ``availableNow`` trigger; otherwise use a
                processing-time trigger.
            processing_time: Interval for the processing-time trigger.

        Returns:
            The result of ``DataStreamWriter.start()``.
        """
        # Reuse the catalog resolved in __init__ rather than rebuilding the name.
        catalog_name = self.catalog
        schema_name = self.db_name
        table_name = path

        def process_batch(micro_df, batch_id):
            # NOTE(review): presumably validates the batch and inserts it into
            # the target table -- confirm in great_expectations_common.
            return gec.validate_and_insert_process_batch(
                df=micro_df,
                batch_id=batch_id,
                table_name=table_name,
                catalog=catalog_name,
                schema=schema_name,
            )

        stream_writer = (df.writeStream
            .foreachBatch(process_batch)
            .option("checkpointLocation", f"{self.checkpoint_base}/{path}")
            .queryName(query_name)
        )
        # Set the scheduler pool on this thread before the query is started.
        spark.sparkContext.setLocalProperty("spark.scheduler.pool", pool)
        if once:
            return stream_writer.trigger(availableNow=True).start()
        return stream_writer.trigger(processingTime=processing_time).start()

    def consume(self, once: bool = True, processing_time: str = "5 seconds"):
        """Start all three bronze ingestion streams and, if ``once``, wait for them.

        Args:
            once: When true, block until each of the three started queries has
                terminated.
            processing_time: Trigger interval forwarded to every stream.
        """
        import time
        start = int(time.time())
        print("\nStarting bronze layer consumption ...")

        # Keep the handles of the queries started here. Waiting on
        # spark.streams.active instead would block on unrelated streams in the
        # session and would skip a bronze query that has already stopped or
        # failed, so its error would never surface.
        queries = [
            self.consume_user_registration(once, processing_time),
            self.consume_gym_logins(once, processing_time),
            self.consume_kafka_multiplex(once, processing_time),
        ]

        if once:
            for query in queries:
                query.awaitTermination()

        print(f"Completed bronze layer consumtion {int(time.time()) - start} seconds")