In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [0]:
# Bind the notebook-wide SparkSession: getOrCreate() returns the already
# active session when there is one and only builds a new session otherwise.
spark = SparkSession.builder.getOrCreate()

In [0]:
%sql
-- Drop the fact_hourly table if it exists; IF EXISTS makes this a no-op (no error) when the table is absent.
drop table if exists fact_hourly

In [0]:
# Recursively delete the fact_hourly directory under the Hive warehouse path.
# NOTE(review): presumably clears files left behind after the table drop so a
# later write starts from an empty location — confirm this is intended on every run.
dbutils.fs.rm("/user/hive/warehouse/fact_hourly", recurse=True)

In [0]:
@logs
def fact_hourly_weather():
    """Build the hourly weather fact DataFrame and report its dateKey range.

    Reads the ``dimcity``, ``dimdate`` and ``dimtime`` Delta tables and the
    ``weather_processed`` Delta table from the Hive warehouse on DBFS,
    inner-joins every weather reading to the three dimensions and keeps the
    dimension keys together with the weather measures and audit columns.

    Returns (before the ``@logs`` decorator is applied):
        tuple: ``(fact_hourly_df, start, end)``. ``start`` and ``end`` are
        ``datetime`` objects obtained by passing the smallest and largest
        ``dateKey`` of the result to ``datetime.fromtimestamp``.

    Raises:
        TypeError: raised by ``datetime.fromtimestamp`` when the joined
            result is empty, because min/max ``dateKey`` are then ``None``.
    """
    # Local import: nothing visible in the notebook binds `datetime`, and the
    # star import from pyspark.sql.functions should not be relied on for it.
    from datetime import datetime

    # loading the dimension tables
    dimcity_df = spark.read.format("delta").load("dbfs:/user/hive/warehouse/dimcity")
    dimdate_df = spark.read.format("delta").load("dbfs:/user/hive/warehouse/dimdate")
    dimtime_df = spark.read.format("delta").load("dbfs:/user/hive/warehouse/dimtime")

    # loading the cleaned weather data
    weather_df = spark.read.format("delta").load("dbfs:/user/hive/warehouse/weather_processed")

    # extracting the hour of day from the weather reading time
    weather_df = weather_df.withColumn("hour", hour(weather_df.date))

    # extracting the hour of day from dimtime so both sides join on the same unit
    dimtime_df = dimtime_df.withColumn("hour", hour(dimtime_df.startHour))

    # joining the tables
    # The date dimension is matched on the calendar date of the reading (the
    # same way fact_daily does it). Comparing the raw reading time with
    # CalendarDate would only match readings taken exactly at midnight.
    # NOTE(review): assumes CalendarDate holds one value per calendar day — confirm.
    fact_hourly_df = (
        weather_df
        .join(dimcity_df, weather_df.city_id == dimcity_df.id, "inner")
        .join(dimdate_df, to_date(weather_df.date) == dimdate_df.CalendarDate, "inner")
        .join(dimtime_df, weather_df.hour == dimtime_df.hour, "inner")
        .select(
            dimdate_df.dateKey,
            dimcity_df.id.alias("city_id"),
            dimtime_df.timeID.alias("time_id"),
            weather_df.temp,
            weather_df.temp_min,
            weather_df.temp_max,
            weather_df.pressure,
            weather_df.humidity,
            weather_df.visibility,
            weather_df.wind_speed,
            weather_df.wind_deg,
            weather_df.wind_gust,
            weather_df.clouds_all,
            weather_df.created_on,
            weather_df.created_by,
        )
    )

    # Fetch both ends of the dateKey range with a single Spark action instead
    # of running the whole join twice.
    key_range = fact_hourly_df.selectExpr(
        "min(dateKey) as min_key", "max(dateKey) as max_key"
    ).first()

    # NOTE(review): fromtimestamp() treats dateKey as a POSIX timestamp —
    # confirm dimdate.dateKey is stored that way.
    start = datetime.fromtimestamp(key_range[0])
    end = datetime.fromtimestamp(key_range[1])

    return fact_hourly_df, start, end

   
      
    
    

In [0]:
%sql
-- Drop the fact_daily table if it exists; IF EXISTS makes this a no-op (no error) when the table is absent.
drop table if exists fact_daily

In [0]:
# Recursively delete the fact_daily directory under the Hive warehouse path.
# NOTE(review): presumably clears files left behind after the table drop so a
# later write starts from an empty location — confirm this is intended on every run.
dbutils.fs.rm("/user/hive/warehouse/fact_daily", recurse=True)

In [0]:
@logs
def fact_daily():
    """Build the daily weather fact DataFrame and report its dateKey range.

    Reads the ``dimcity`` and ``dimdate`` Delta tables and the
    ``weather_processed`` Delta table from the Hive warehouse on DBFS,
    truncates each reading's ``date`` to a calendar date, inner-joins the
    readings to both dimensions and averages the weather measures per
    ``(dateKey, city_id)``.

    Returns (before the ``@logs`` decorator is applied):
        tuple: ``(fact_daily_df, start, end)``. ``start`` and ``end`` are
        ``datetime`` objects obtained by passing the smallest and largest
        ``dateKey`` of the result to ``datetime.fromtimestamp``.

    Raises:
        TypeError: raised by ``datetime.fromtimestamp`` when the aggregated
            result is empty, because min/max ``dateKey`` are then ``None``.
    """
    # Local import: nothing visible in the notebook binds `datetime`, and the
    # star import from pyspark.sql.functions should not be relied on for it.
    from datetime import datetime

    # loading the dimension tables (dimtime is not needed at daily grain)
    dimcity_df = spark.read.format("delta").load("dbfs:/user/hive/warehouse/dimcity")
    dimdate_df = spark.read.format("delta").load("dbfs:/user/hive/warehouse/dimdate")

    # loading the cleaned weather data
    weather_df = spark.read.format("delta").load("dbfs:/user/hive/warehouse/weather_processed")

    # extracting the calendar date so readings match dimdate.CalendarDate
    weather_df = weather_df.withColumn("date", to_date(weather_df.date))

    # joining the tables
    # Each measure is selected exactly once: a repeated column name here makes
    # the by-name aggregation below fail with an ambiguous-reference error.
    joined_df = (
        weather_df
        .join(dimcity_df, weather_df.city_id == dimcity_df.id, "inner")
        .join(dimdate_df, weather_df.date == dimdate_df.CalendarDate, "inner")
        .select(
            dimdate_df.dateKey,
            dimcity_df.id.alias("city_id"),
            weather_df.temp,
            weather_df.temp_min,
            weather_df.temp_max,
            weather_df.pressure,
            weather_df.humidity,
            weather_df.visibility,
            weather_df.wind_speed,
            weather_df.wind_deg,
            weather_df.wind_gust,
            weather_df.created_on,
            weather_df.created_by,
        )
    )

    # one row per (dateKey, city_id): measures are averaged over the day's
    # readings; the audit columns take the value of the first row Spark sees
    # in each group.
    fact_daily_df = joined_df.groupBy("dateKey", "city_id").agg(
        avg("temp").alias("temp"),
        avg("temp_min").alias("temp_min"),
        avg("temp_max").alias("temp_max"),
        avg("pressure").alias("pressure"),
        avg("humidity").alias("humidity"),
        avg("visibility").alias("visibility"),
        avg("wind_speed").alias("wind_speed"),
        avg("wind_deg").alias("wind_deg"),
        avg("wind_gust").alias("wind_gust"),
        first("created_on").alias("created_on"),
        first("created_by").alias("created_by"),
    )

    # Fetch both ends of the dateKey range with a single Spark action instead
    # of evaluating the aggregation twice.
    key_range = fact_daily_df.selectExpr(
        "min(dateKey) as min_key", "max(dateKey) as max_key"
    ).first()

    # NOTE(review): fromtimestamp() treats dateKey as a POSIX timestamp —
    # confirm dimdate.dateKey is stored that way.
    start = datetime.fromtimestamp(key_range[0])
    end = datetime.fromtimestamp(key_range[1])

    return fact_daily_df, start, end
    

    
    

    