In [0]:
%run /Users/075bei014.gokarna@pcampus.edu.np/MaintainingLog

In [0]:
from datetime import datetime, timedelta

from pyspark.sql.functions import lit, col, avg, max, concat
from pyspark.sql.types import IntegerType

In [0]:
%run /Users/075bei014.gokarna@pcampus.edu.np/Fact_Table_Load

# Forecast Pipeline

## Method
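**The next 5 hours are forecast as follows:**
  - The data for the last *x* hours is taken.
  - The next hour's values are forecast as the average of those last *x* hours.
  - Each of the following 4 hours is then forecast the same way, averaging over the most recent hours (including the hours already forecast), and so on.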

In [0]:
import time

def wait(seconds: int = 500) -> None:
    """Pause the notebook for ``seconds`` seconds (500 when not specified)."""
    time.sleep(seconds)

In [0]:
number_of_cities = 5
number_of_hours = 6
number_of_hours_to_forecast = 5
# Size of the averaging window in rows: one row per city per hour.
total_rows = number_of_cities*number_of_hours

# Measurement columns that are averaged per city to build a forecast row.
column_list = ["timezone", "temp", "temp_min", "temp_max", "pressure", "humidity", "visibility", "speed", "deg", "gust"]


def _next_hour_keys(max_date, max_time):
    """Return the ``(date_key, time_key)`` pair for the hour after ``max_time``.

    ``time_key`` wraps from 23 back to 0; when it wraps, ``date_key`` moves on
    to the next calendar day.

    NOTE(review): assumes ``date_key`` is an integer in YYYYMMDD form, as in
    the table output shown in this notebook -- confirm against the schema.
    """
    time_key = (max_time + 1) % 24
    if time_key != 0:
        return max_date, time_key
    # Use calendar arithmetic so month/year ends roll over correctly
    # (e.g. 20230630 -> 20230701 rather than the invalid 20230631).
    next_day = datetime.strptime(str(max_date), "%Y%m%d") + timedelta(days=1)
    return int(next_day.strftime("%Y%m%d")), time_key


while True:

    # Remove all previously forecasted data, as a fresh forecast is computed below
    spark.sql("DELETE FROM fact_hourly_table WHERE is_forecasted_data = 'true'")

    # Update the fact table (load_hourly is defined outside this cell --
    # presumably by the Fact_Table_Load notebook run above; verify)
    load_hourly()
    for _ in range(number_of_hours_to_forecast):

        df = spark.table('fact_hourly_table')

        # Newest rows first, so the window is the LAST `total_rows` rows.
        # Rows forecast in earlier iterations are part of the table and
        # therefore feed into the next hour's average.
        df_to_forecast = df.sort(col('date_key').desc(), col('time_key').desc()).limit(total_rows)

        df_grouped = df_to_forecast.groupBy('CityId').agg(
            *[avg(col(column)).alias(column) for column in column_list]
        )

        max_date = df_to_forecast.select(max(col("date_key"))).first()[0]
        if max_date is None:
            # Empty table: there is nothing to average, so skip this cycle.
            break
        max_time = df_to_forecast.filter(col("date_key") == max_date).select(max(col("time_key"))).first()[0]

        date_key, time_key = _next_hour_keys(max_date, max_time)
        is_forecasted_data = True

        df_grouped = df_grouped.withColumn("date_key", lit(date_key))
        df_grouped = df_grouped.withColumn("time_key", lit(time_key))
        df_final = df_grouped.withColumn("is_forecasted_data", lit(is_forecasted_data))

        # Same key layout as the existing rows: "<date_key> <time_key>-<CityId>"
        df_final = df_final.withColumn("p_key_hourly", concat(col("date_key"), lit(" "), col("time_key"), lit("-"), col("CityId")))
        # avg() yields a double; cast timezone back to an integer
        df_final = df_final.withColumn("timezone", col("timezone").cast(IntegerType()))

        LogTable.load('FACT', 'STARTED', 'fact_hourly_table', "Forecasted Data")
        df_final.distinct().write.format('delta').mode('append').option("mergeSchema", "true").saveAsTable('fact_hourly_table')
        LogTable.load('FACT', 'COMPLETED', 'fact_hourly_table', "Forecasted Data")

    print(f"Forecasted for next {number_of_hours_to_forecast} hours! Now waiting for new data to arrive :")
    wait(3600)



Forecasted for next 5 hours! Now waiting for new data to arrive :


## Verification

In [0]:
%sql
-- Verification: list every row and column of fact_hourly_table, including the is_forecasted_data flag.
SELECT * FROM fact_hourly_table;

p_key_hourly,date_key,time_key,CityId,timezone,temp,temp_min,temp_max,pressure,humidity,visibility,speed,deg,gust,is_forecasted_data
20230608 13-1282898,20230608,13,1282898,20700,28.79,28.79,28.79,1002.0,27.0,10000.0,3.22,357.0,3.25,False
20230608 13-1283582,20230608,13,1283582,20700,36.61,36.61,36.61,996.0,23.0,10000.0,3.84,116.0,6.91,False
20230608 13-1283378,20230608,13,1283378,20700,27.07,27.07,27.07,1000.0,22.0,10000.0,1.22,53.0,1.54,False
20230608 13-1282682,20230608,13,1282682,20700,28.8,28.12,28.8,1009.0,48.0,8000.0,3.6,200.0,,False
20230608 13-1282946,20230608,13,1282946,20700,28.64,27.96,28.64,1009.0,48.0,8000.0,3.6,200.0,,False
20230608 15-1282946,20230608,15,1282946,20700,25.86,24.96,25.86,1010.0,56.0,8000.0,3.6,90.0,,False
20230608 15-1283582,20230608,15,1283582,20700,34.69,34.69,34.69,998.0,27.0,10000.0,2.41,87.0,3.9,False
20230608 15-1283378,20230608,15,1283378,20700,25.36,25.36,25.36,1001.0,25.0,10000.0,1.41,45.0,1.62,False
20230608 15-1282898,20230608,15,1282898,20700,26.96,26.96,26.96,1004.0,29.0,10000.0,3.49,359.0,3.29,False
20230608 15-1282682,20230608,15,1282682,20700,26.02,25.12,26.02,1010.0,56.0,8000.0,3.6,90.0,,False
