In [0]:
#%pip install great_expectations==1.10.0

In [0]:
#dbutils.library.restartPython()

In [0]:
%run "./01_config"

In [0]:
# Project-local helper module; Bronze._write_stream_append below calls
# gec.validate_and_insert_process_batch for every streaming micro-batch.
import great_expectations_common as gec
# Print the names exposed by the imported module in the cell output.
print(dir(gec))

In [0]:
class Bronze:
    """Bronze-layer ingestion of Fitbit CSV files with Auto Loader.

    Every ``consume_*_bz`` method starts one Structured Streaming query that
    reads matching CSV files from the landing zone and hands each micro-batch
    to ``gec.validate_and_insert_process_batch`` for the target bronze table.

    Relies on names provided by the notebook environment: ``spark``,
    ``Config`` (loaded via ``%run "./01_config"``) and ``gec``.
    """

    def __init__(self, env):
        """Resolve storage locations and catalog/database names for ``env``.

        Args:
            env: Environment suffix used to build the catalog name
                ``fitbit_{env}_catalog``.
        """
        conf = Config()
        self.landing_zone = conf.landing + 'landing_zone'
        self.checkpoint_base = conf.checkpoint + 'checkpoints'
        self.initial = conf.medallion + "initial"
        self.bronze = conf.medallion + "bronze"
        self.silver = conf.medallion + "silver"
        self.gold = conf.medallion + "gold"
        self.catalog = f"fitbit_{env}_catalog"
        self.db_name = conf.db_name
        self.initialized = False
        self.env = env

    def _read_landing_csv(self, schema, file_glob, time_key_col=None):
        """Build the Auto Loader streaming reader shared by all bronze feeds.

        Args:
            schema: DDL schema string applied to the CSV files.
            file_glob: ``pathGlobFilter`` pattern selecting the feed's files.
            time_key_col: Timestamp column from which the ``timeKey`` column
                (formatted ``HH:mm:ss``) is derived; ``None`` adds no
                ``timeKey`` column.

        Returns:
            A streaming DataFrame with ``load_time`` and ``source_file``
            columns appended (after ``timeKey`` when it is requested).
        """
        from pyspark.sql import functions as F

        df_stream = (spark.readStream
                        .format("cloudFiles")
                        .schema(schema)
                        .option("maxFilesPerTrigger", 1)
                        .option("cloudFiles.format", "csv")
                        .option("header", "true")
                        .option("pathGlobFilter", file_glob)
                        .load(self.landing_zone)
                    )
        if time_key_col is not None:
            df_stream = df_stream.withColumn(
                "timeKey", F.date_format(time_key_col, 'HH:mm:ss'))
        return (df_stream
                    .withColumn("load_time", F.current_timestamp())
                    .withColumn("source_file", F.col("_metadata.file_path"))
                )

    def _consume_csv(self, table_name, schema, file_glob, once, processing_time,
                     time_key_col=None):
        """Start the ingestion stream for one bronze table.

        The query is named ``{table_name}_ingestion_stream`` and is scheduled
        in the ``bronze_p1`` pool.

        Returns:
            The started streaming query handle.
        """
        df_stream = self._read_landing_csv(schema, file_glob, time_key_col)
        return self._write_stream_append(
            df_stream, table_name, f"{table_name}_ingestion_stream",
            "bronze_p1", once, processing_time)

    def consume_calories_min_bz(self, once=True, processing_time="5 seconds"):
        """Start ingestion of ``calories_*.csv`` into ``calories_min_bz``."""
        schema = "user_id long, activity_minute timestamp, calories double, date date"
        return self._consume_csv("calories_min_bz", schema, "calories_*.csv",
                                 once, processing_time, "activity_minute")

    def consume_heartrate_sec_bz(self, once=True, processing_time="5 seconds"):
        """Start ingestion of ``heartrate_*.csv`` into ``heartrate_sec_bz``."""
        schema = "user_id long, time timestamp, value long, date date"
        return self._consume_csv("heartrate_sec_bz", schema, "heartrate_*.csv",
                                 once, processing_time, "time")

    def consume_intensities_min_bz(self, once=True, processing_time="5 seconds"):
        """Start ingestion of ``intensities_*.csv`` into ``intensities_min_bz``."""
        schema = "user_id long, activity_minute timestamp, intensity long, date date"
        return self._consume_csv("intensities_min_bz", schema, "intensities_*.csv",
                                 once, processing_time, "activity_minute")

    def consume_mets_min_bz(self, once=True, processing_time="5 seconds"):
        """Start ingestion of ``METs_*.csv`` into ``mets_min_bz``."""
        schema = "user_id long, activity_minute timestamp, mets long, date date"
        return self._consume_csv("mets_min_bz", schema, "METs_*.csv",
                                 once, processing_time, "activity_minute")

    def consume_sleep_min_bz(self, once=True, processing_time="5 seconds"):
        """Start ingestion of ``sleep_*.csv`` into ``sleep_min_bz``."""
        schema = "user_id long, activity_minute timestamp, value long, log_id long, date date"
        return self._consume_csv("sleep_min_bz", schema, "sleep_*.csv",
                                 once, processing_time, "activity_minute")

    def consume_steps_min_bz(self, once=True, processing_time="5 seconds"):
        """Start ingestion of ``steps_*.csv`` into ``steps_min_bz``."""
        schema = "user_id long, activity_minute timestamp, steps long, date date"
        return self._consume_csv("steps_min_bz", schema, "steps_*.csv",
                                 once, processing_time, "activity_minute")

    def consume_weight_daily_bz(self, once=True, processing_time="5 seconds"):
        """Start ingestion of ``weight_*.csv`` into ``weight_daily_bz``.

        Unlike the other feeds, no ``timeKey`` column is derived.
        """
        schema = "user_id long, date date, weight_kg double, weight_pounds double, fat double, bmi double, is_manual_report boolean, log_id long, activity_minute timestamp"
        return self._consume_csv("weight_daily_bz", schema, "weight_*.csv",
                                 once, processing_time)

    def _write_stream_append(self, df, path, query_name, pool, once, processing_time):
        """Start a streaming query that processes ``df`` batch by batch.

        Args:
            df: Streaming DataFrame to consume.
            path: Target table name; also the checkpoint sub-directory under
                ``self.checkpoint_base``.
            query_name: Name assigned to the streaming query.
            pool: Value set for the ``spark.scheduler.pool`` local property
                before the query is started.
            once: When true, use an ``availableNow`` trigger; otherwise use a
                processing-time trigger.
            processing_time: Trigger interval used when ``once`` is false.

        Returns:
            The started streaming query handle.
        """
        catalog_name = self.catalog
        schema_name = self.db_name
        table_name = path

        def process_batch(micro_df, batch_id):
            # Validation and the actual insert are delegated to the shared
            # helper module; its behavior is not visible from this notebook.
            return gec.validate_and_insert_process_batch(
                df=micro_df,
                batch_id=batch_id,
                table_name=table_name,
                catalog=catalog_name,
                schema=schema_name)

        stream_writer = (df.writeStream
            .foreachBatch(process_batch)
            .option("checkpointLocation", f"{self.checkpoint_base}/{path}")
            .queryName(query_name)
        )
        # Must be set before start() so the query's jobs use this pool.
        spark.sparkContext.setLocalProperty("spark.scheduler.pool", pool)
        if once:
            return stream_writer.trigger(availableNow=True).start()
        return stream_writer.trigger(processingTime=processing_time).start()

    def consume(self, once=True, processing_time="5 seconds"):
        """Start all bronze ingestion streams.

        Args:
            once: When true, block until every stream started here has
                terminated; otherwise return right after starting them.
            processing_time: Trigger interval forwarded to each stream.

        Raises:
            Whatever ``awaitTermination`` raises for a query started here
            (``once`` mode only).
        """
        import time
        start = int(time.time())
        print(f"\nStarting bronze layer consumption ...")

        starters = (
            self.consume_calories_min_bz,
            self.consume_heartrate_sec_bz,
            self.consume_intensities_min_bz,
            self.consume_mets_min_bz,
            self.consume_sleep_min_bz,
            self.consume_steps_min_bz,
            self.consume_weight_daily_bz,
        )
        queries = [start_stream(once, processing_time) for start_stream in starters]

        if once:
            # Await only the queries started above: the session-wide active
            # list may contain unrelated streams that never finish, and it no
            # longer contains a query of ours that has already stopped.
            for query in queries:
                query.awaitTermination()

        print(f"Completed bronze layer consumtion {int(time.time()) - start} seconds")