In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.types import TimestampType

In [0]:
# Reuse the active Spark session if there is one; otherwise create a new one.
spark = SparkSession.builder.getOrCreate()

# Reading the CSV file

In [0]:
# Load the hourly weather CSV, using its first line as the column names.
df = spark.read.option("header", True).csv(
    "dbfs:/FileStore/shared_uploads/evashrestha99@gmail.com/Weather_hourly.csv"
)

In [0]:
# Tag every loaded row with the string "False" in a new is_forecasted_data
# column, marking it as an observed (not forecasted) reading.
df = df.withColumn("is_forecasted_data", lit("False"))

In [0]:
# Remove the dt and timezone columns in a single call.
df = df.drop("dt", "timezone")

In [0]:
# Render the prepared DataFrame as this cell's output.
display(df)

id,city_name,lon,lat,date,time,temp,temp_min,temp_max,pressure,humidity,visibility,wind_speed,wind_deg,wind_gust,clouds_all,is_forecasted_data
1282616,Wali?,83.76667,27.98333,6/8/2023 12:17,12:17:32,312.06,312.06,312.06,1001,14,10000,146,2.65,3.37,8,False
1282621,Upardang Gadhi,84.566666,27.766666,6/8/2023 12:17,12:17:32,310.8,310.8,310.8,1000,14,10000,210,4.1,4.28,9,False
1282635,Tulsipur,82.297256,28.130989,6/8/2023 12:17,12:17:32,310.78,310.78,310.78,1003,17,10000,227,6.24,5.87,5,False
1282665,Tikoli,84.5,27.633333,6/8/2023 12:17,12:17:32,315.97,315.97,315.97,1001,13,10000,206,3.95,3.55,2,False
1282666,?ikapur,81.133331,28.5,6/8/2023 12:17,12:17:32,314.91,314.91,314.91,1001,12,10000,252,3.95,2.73,0,False
1282616,Wali?,83.76667,27.98333,6/8/2023 13:18,13:18:39,312.44,312.44,312.44,1000,14,10000,155,2.75,3.24,12,False
1282621,Upardang Gadhi,84.566666,27.766666,6/8/2023 13:18,13:18:39,311.1,311.1,311.1,999,13,10000,208,3.79,4.64,12,False
1282635,Tulsipur,82.297256,28.130989,6/8/2023 13:18,13:18:39,311.66,311.66,311.66,1001,15,10000,231,5.56,6.25,35,False
1282665,Tikoli,84.5,27.633333,6/8/2023 13:18,13:18:39,316.94,316.94,316.94,998,12,10000,202,3.89,4.71,2,False
1282666,?ikapur,81.133331,28.5,6/8/2023 13:18,13:18:39,316.76,316.76,316.76,999,10,10000,219,3.05,2.59,0,False


# Retrieving the previous four hours of weather data

In [0]:
def get_previous_four_hours_weather(df, hours=4):
    """Return the most recent ``hours`` rows of ``df`` for every city.

    Rows are ranked within each ``city_name`` by ``time`` in descending order
    and only the top ``hours`` ranks are kept.

    Args:
        df: Spark DataFrame with at least ``city_name`` and ``time`` columns.
        hours: Number of newest rows to keep per city. Defaults to 4, the
            window size that was previously hard-coded.

    Returns:
        The rows of ``df`` ranked 1..``hours`` per city, with an extra
        ``row_num`` column holding each row's rank (1 = newest).

    Raises:
        ValueError: If ``hours`` is less than 1.
    """
    if hours < 1:
        raise ValueError("hours must be at least 1, got {!r}".format(hours))

    # NOTE(review): the ranking compares the ``time`` column as-is. If it holds
    # time-of-day strings (the CSV is read without a schema), the ordering is
    # lexical and ignores ``date`` - confirm before using multi-day data.
    newest_first = Window.partitionBy("city_name").orderBy(df["time"].desc())
    ranked = df.withColumn("row_num", row_number().over(newest_first))
    return ranked.filter(ranked["row_num"] <= hours)

# Forecasting the next hour's weather by averaging the previous hours

In [0]:
def forecast_next_hour_weather(result):
    """Build one forecast row per location by averaging the given rows.

    The rows are grouped by ``id``, ``city_name``, ``lon`` and ``lat``. For
    each group the output carries:

    * ``date``: ``max(date)`` of the group (the value is not moved forward),
    * ``time``: ``max(time)`` plus one hour via ``DATEADD``,
    * every weather metric: its average rounded to two decimals,
    * ``is_forecasted_data``: the string ``'True'``.

    The output columns follow the order of the weather frame used in this
    notebook (``..., visibility, wind_speed, wind_deg, wind_gust, clouds_all,
    is_forecasted_data``). ``DataFrame.union`` matches columns by position, so
    emitting ``wind_deg`` before ``wind_speed`` would write each forecast value
    into the other column.

    Args:
        result: Spark DataFrame holding the recent rows to average; it must
            contain the grouping columns, ``date``, ``time`` and the metric
            columns listed below.

    Returns:
        A Spark DataFrame with one aggregated row per group.
    """
    # Order matters: keep it identical to the source frame's column order.
    averaged_metrics = (
        "temp",
        "temp_min",
        "temp_max",
        "pressure",
        "humidity",
        "visibility",
        "wind_speed",
        "wind_deg",
        "wind_gust",
        "clouds_all",
    )

    # NOTE(review): max(date)/max(time) operate on the columns as-is; if they
    # are strings the maximum is lexical, and DATEADD relies on Spark's
    # implicit cast of ``time`` to a timestamp - confirm the resulting values.
    aggregations = [
        expr("max(date) as date"),
        expr("DATEADD(hour,1,max(time)) as time"),
    ]
    aggregations.extend(
        expr("round(avg({0}),2) as {0}".format(metric)) for metric in averaged_metrics
    )
    aggregations.append(expr("'True' as is_forecasted_data"))

    return result.groupBy("id", "city_name", "lon", "lat").agg(*aggregations)

In [0]:
# Forecast the coming hours iteratively: generate the forecast for the next hour, append it to the DataFrame, and repeat for each requested hour.

In [0]:
def forecast(df, hours):
    """Append ``hours`` rounds of forecast rows to ``df``.

    Each round takes the most recent rows per city, aggregates them into
    forecast rows, and appends those rows to the frame, so later rounds also
    average over earlier forecasts.

    The forecast frame is appended by column name rather than by position.
    Forecast columns whose names do not occur in ``df`` are first renamed, in
    order, to the ``df`` columns that the forecast frame lacks (for example a
    differently named flag column). A positional union would instead pair up
    columns purely by index and silently mix values whenever the two frames
    list the same columns in a different order.

    Args:
        df: Spark DataFrame of weather rows to extend.
        hours: Number of forecast rounds to run; 0 returns ``df`` unchanged.

    Returns:
        ``df`` followed by the forecast rows of every round.
    """
    for _ in range(hours):
        recent_rows = get_previous_four_hours_weather(df)
        forecasted_df = forecast_next_hour_weather(recent_rows)

        target_columns = df.columns
        forecast_columns = forecasted_df.columns
        missing_in_forecast = [c for c in target_columns if c not in forecast_columns]
        unknown_in_target = [c for c in forecast_columns if c not in target_columns]
        # Pair leftover names in order so both frames share one set of names.
        for old_name, new_name in zip(unknown_in_target, missing_in_forecast):
            forecasted_df = forecasted_df.withColumnRenamed(old_name, new_name)

        df = df.unionByName(forecasted_df)

    return df

In [0]:
# Number of forecast rounds to append to the observed data.
hours = 5
df = forecast(df, hours=hours)

In [0]:
# Render the DataFrame returned by forecast() as this cell's output.
display(df)

id,city_name,lon,lat,date,time,temp,temp_min,temp_max,pressure,humidity,visibility,wind_speed,wind_deg,wind_gust,clouds_all,is_forecasted_data
1282616,Wali?,83.76667,27.98333,6/8/2023 12:17,12:17:32,312.06,312.06,312.06,1001.0,14.0,10000.0,146.0,2.65,3.37,8.0,False
1282621,Upardang Gadhi,84.566666,27.766666,6/8/2023 12:17,12:17:32,310.8,310.8,310.8,1000.0,14.0,10000.0,210.0,4.1,4.28,9.0,False
1282635,Tulsipur,82.297256,28.130989,6/8/2023 12:17,12:17:32,310.78,310.78,310.78,1003.0,17.0,10000.0,227.0,6.24,5.87,5.0,False
1282665,Tikoli,84.5,27.633333,6/8/2023 12:17,12:17:32,315.97,315.97,315.97,1001.0,13.0,10000.0,206.0,3.95,3.55,2.0,False
1282666,?ikapur,81.133331,28.5,6/8/2023 12:17,12:17:32,314.91,314.91,314.91,1001.0,12.0,10000.0,252.0,3.95,2.73,0.0,False
1282616,Wali?,83.76667,27.98333,6/8/2023 13:18,13:18:39,312.44,312.44,312.44,1000.0,14.0,10000.0,155.0,2.75,3.24,12.0,False
1282621,Upardang Gadhi,84.566666,27.766666,6/8/2023 13:18,13:18:39,311.1,311.1,311.1,999.0,13.0,10000.0,208.0,3.79,4.64,12.0,False
1282635,Tulsipur,82.297256,28.130989,6/8/2023 13:18,13:18:39,311.66,311.66,311.66,1001.0,15.0,10000.0,231.0,5.56,6.25,35.0,False
1282665,Tikoli,84.5,27.633333,6/8/2023 13:18,13:18:39,316.94,316.94,316.94,998.0,12.0,10000.0,202.0,3.89,4.71,2.0,False
1282666,?ikapur,81.133331,28.5,6/8/2023 13:18,13:18:39,316.76,316.76,316.76,999.0,10.0,10000.0,219.0,3.05,2.59,0.0,False
