In [0]:
# Source table holding the raw hourly weather observations (catalog.schema.table).
WEATHER_HOURLY_TABLE = "curlybyte_solutions_rawdata_europe_grid_load.european_weather_raw.weather_hourly"

weather_raw = spark.table(WEATHER_HOURLY_TABLE)

In [0]:
# Quick look at the first five rows of the raw table.
weather_raw.show(n=5)

In [0]:
import reverse_geocode
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, DoubleType, StringType

# Map every distinct (lat, lon) pair to a country code on the driver, then
# attach that code to each weather row.

# Distinct coordinates. Rows with a null/NaN lat or lon are left out of the
# lookup because they cannot be reverse-geocoded; the left join below still
# keeps those weather rows, with a null country.
coordinates = (
    weather_raw
    .select("lat", "lon")
    .dropna(subset=["lat", "lon"])
    .distinct()
    .collect()
)
# Cast to plain Python floats so the values satisfy the DoubleType schema
# declared below regardless of the source column's numeric type.
coord_list = [(float(row["lat"]), float(row["lon"])) for row in coordinates]

# Country code for each coordinate. The zip below relies on the results coming
# back one per coordinate, in input order. Skip the call when there is nothing
# to look up.
country = (
    [loc["country_code"] for loc in reverse_geocode.search(coord_list)]
    if coord_list
    else []
)

# Small lookup DataFrame: coordinate -> country code.
coord_country_df = spark.createDataFrame(
    [(lat, lon, c) for (lat, lon), c in zip(coord_list, country)],
    schema=StructType([
        StructField("lat", DoubleType(), True),
        StructField("lon", DoubleType(), True),
        StructField("country", StringType(), True)
    ])
)

# Left join keeps every weather row, including those without a resolvable
# coordinate (their country stays null).
weather_with_country = weather_raw.join(coord_country_df, on=["lat", "lon"], how="left")

display(weather_with_country.orderBy("timestamp"))


In [0]:
# Weather variables to average across all grid points of a country.
columns_to_average = ["ssrd", "wind_speed", "temperature_c"]

# One mean_<column> aggregate per variable.
agg_exprs = [
    F.mean(column_name).alias(f"mean_{column_name}")
    for column_name in columns_to_average
]

# Per-country, per-timestamp means, sorted by timestamp.
country_time_mean = (
    weather_with_country
    .groupBy("country", "timestamp")
    .agg(*agg_exprs)
    .orderBy("timestamp")
)

display(country_time_mean)



In [0]:
# Country codes retained in the final dataset.
countries_europe = [
    'ES', 'PT', 'FR', 'DE', 'IT', 'GB', 'NL', 'BE', 'AT',
    'CH', 'PL', 'CZ', 'DK', 'SE', 'NO', 'FI', 'GR', 'IE',
    'RO', 'BG', 'HU', 'SK', 'SI', 'HR', 'EE', 'LT', 'LV',
]

# Keep only rows whose country code is in the list above; rows with a null
# country are dropped by this filter as well.
is_selected_country = F.col("country").isin(countries_europe)
country_time_mean = country_time_mean.where(is_selected_country)

display(country_time_mean)

In [0]:
# Persist the filtered country/timestamp means as a managed table, replacing
# any previous contents.
# NOTE(review): the table name is unqualified, so it is resolved against the
# session's current catalog/schema — confirm that is the intended location.
overwrite_writer = country_time_mean.write.mode("overwrite")
overwrite_writer.saveAsTable("weather_europe")

In [0]:
# Read the saved table back and print its first five rows as a sanity check.
(
    spark
    .table('weather_europe')
    .show(n=5)
)