In [0]:
class GoldLayer:
    def __init__(self):
        self.catalog_name = "dev"
        self.gold_layer_schema_name = "swatch_consumption"
        self.silver_layer_schema_name = "swatch_staging"
        self.bronze_layer_schema_name = "swatch_raw"
        self.gym_summary_table_name = "gym_summary"
        self.workout_bpm_summary_table_name = "workout_bpm_summary"
        self.workout_bpm_table_name = "workout_bpm"
        self.user_profile_table_name = "user_profile"
        self.device_registration_table_name = 'device_registration'

In [0]:
from pyspark.sql.functions import *

class WorkoutBpmReport(GoldLayer):
    def __init__(self):
        super().__init__()

    users = spark.read.table("dev.swatch_staging.user_profile")

    def ingest_workout_bpm_summary(self):
        workout_bpm_stream_df = (spark
                                 .readStream
                                 .table(f"{self.catalog_name}.{self.silver_layer_schema_name}.{self.workout_bpm_table_name}")
                                 .withColumn('date',col("end_time").cast("date"))
                                 .withWatermark("end_time", "120 seconds")
                                 .groupBy("user_id", "workout_id","session_id","date")
                                 .agg(avg("heartrate").alias("avg_hr"),max("heartrate").alias("max_hr"),min("heartrate").alias("min_hr"),count("heartrate").alias("recordings"))
                                 
                                 .join(self.users, "user_id","inner")
                                 .select("user_id","date","workout_id","session_id","sex","city","state","avg_hr","max_hr","min_hr","recordings")
                                 )
        return workout_bpm_stream_df
    
    def upsert(self,final_df,batch_id):
        final_df.createOrReplaceTempView("workout_bpm_summary")
        merge_query = """
        MERGE into dev.swatch_consumption.workout_bpm_summary as T
        using workout_bpm_summary s
        on T.user_id = s.user_id and T.date = s.date and T.workout_id = s.workout_id and T.session_id = s.session_id
        when matched then update set *
        when not matched then insert *
        """
        final_df._jdf.sparkSession().sql(merge_query)
    
    def write_workout_bpm_summary(self,df):
        workout_bpm_summary_query = (df
                 .writeStream
                 .queryName("workout_bpm_summary_query")
                 .outputMode("update")
                 .option("checkpointLocation","/Volumes/dev/swatch_consumption/workout_bpm_summary_checkpoint")
                 .foreachBatch(self.upsert)
                 .start()
                 )
        return workout_bpm_summary_query
    
        




In [0]:
workout_bpm_report = WorkoutBpmReport()
df = workout_bpm_report.ingest_workout_bpm_summary()
workout_bpm_summary_query= workout_bpm_report.write_workout_bpm_summary(df)

<pyspark.sql.streaming.query.StreamingQuery at 0xffff5814cb10>

In [0]:
workout_bpm_summary_query.stop()

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-1172548824997626>, line 1[0m
[0;32m----> 1[0m workout_bpm_summary_query[38;5;241m.[39mstop()

[0;31mNameError[0m: name 'workout_bpm_summary_query' is not defined

In [0]:
%sql

select * from dev.swatch_consumption.workout_bpm_summary;

user_id,date,workout_id,session_id,sex,city,state,recordings,max_hr,min_hr,avg_hr
14633,2023-03-12,1,1,F,Gardena,CA,137,94.17238125635576,35.89084608323042,66.52234303183477
14633,2023-03-12,1,2,F,Gardena,CA,89,94.95989143953022,35.30786301539906,62.25988311813774


In [0]:
from pyspark.sql.functions import *

class GymSummaryReport(GoldLayer):
    def __init__(self):
        super().__init__()
    
    # static Table
    gym_logs = spark.read.table("dev.swatch_staging.user_profile")

    def ingest_gym_summary(self):
        # static Table
        device_registration_df = (spark
                                  .read
                                  .table(f"{self.catalog_name}.{self.bronze_layer_schema_name}.{self.device_registration_table_name}")
                                  )
        
        processed_df = (spark.read.table("dev.swatch_raw.gym_attendance")
                        .withColumn("minutes_in_gym",round((col("logout_time") - col("login_time"))/60))
                        .withColumn("login_time",col("login_time").cast("timestamp"))
                        .withColumn("logout_time",col("logout_time").cast("timestamp"))
                        .withColumn("date",col("logout_time").cast("date"))
                        .join(device_registration_df,"mac_address","inner")
                        .select(col("user_id"),device_registration_df.mac_address,col("gym_id"),col("login_time"),col("logout_time"),col("minutes_in_gym"),col("date"))
                        )
        
        completed_workout_stream_df = (spark
                                       .read
                                       .table('dev.swatch_staging.completed_workouts')
                                       .withColumn("exercise_time",round(col("end_time").cast("long") - col("start_time").cast("long"))/60)
                                       )
        
        join_condition = [processed_df.user_id == completed_workout_stream_df.user_id,processed_df.login_time <= completed_workout_stream_df.start_time , processed_df.logout_time >= completed_workout_stream_df.end_time]

        gym_workout_report = (completed_workout_stream_df
                              .join(processed_df,join_condition,"inner")
                              .select(processed_df.user_id,col("mac_address"),col("date"),col("workout_id"),col("minutes_in_gym"),col("exercise_time"))
                              .groupBy(processed_df.user_id,col("mac_address"),col("date"),col("workout_id"),col("minutes_in_gym"))
                              .agg(sum(col("exercise_time")).alias("minutes_exercising"))
                              )
        
        gym_workout_report.display()
        #spark.write.mode("override").table("dev.swatch_consumption.gym_summary")

In [0]:
gym_summary = GymSummaryReport()
gym_summary.ingest_gym_summary()

user_id,mac_address,date,workout_id,minutes_in_gym,minutes_exercising
12140,ae:ec:f6:48:ca:f7,2023-03-11,1,45.0,35.0
14508,df:f9:dc:5e:e2:a8,2023-03-12,1,43.0,35.0
12227,57:24:ac:8c:75:ea,2023-03-11,1,75.0,65.0
15149,de:c0:cd:a7:71:f4,2023-03-12,1,70.0,60.0
13937,dd:96:be:e9:1e:f4,2023-03-12,1,48.0,40.0
14232,dd:45:d2:37:a8:0e,2023-03-12,1,55.0,46.0
13559,36:1f:d9:d3:e8:0d,2023-03-11,1,75.0,65.0
13559,36:1f:d9:d3:e8:0d,2023-03-11,1,60.0,50.0
14633,1d:69:69:75:d0:aa,2023-03-12,1,40.0,30.0
13937,dd:96:be:e9:1e:f4,2023-03-12,1,71.0,60.0
