In [0]:
# Cell purpose: eyeball the load_id column of weather_hourly to see whether
# its values carry any recognisable pattern (e.g. a country code).
print("INVESTIGATING WEATHER_HOURLY load_id COLUMN")
print("=" * 60)

# Components of the three-part table name used below
CATALOG = "curlybyte_solutions_rawdata_europe_grid_load"
WEATHER_SCHEMA = "european_weather_raw"

# Lazy handle on the hourly weather table
weather = spark.table(f"{CATALOG}.{WEATHER_SCHEMA}.weather_hourly")

# Columns shown in the row preview
preview_columns = ["timestamp", "lat", "lon", "temperature_c", "wind_speed", "load_id"]

print("\nSample weather_hourly data with load_id:")
display(weather.select(*preview_columns).limit(10))

# One lazy DataFrame of distinct load_ids, reused by every check below
distinct_load_ids = weather.select("load_id").distinct()

print("\nUnique load_id patterns:")
display(distinct_load_ids.limit(20))

# Pull up to 100 distinct values to the driver and print the first 20
print("\nChecking if load_id contains country information...")
sample_load_ids = distinct_load_ids.limit(100).toPandas()
print("\nFirst 20 unique load_ids:")
for position, load_id in enumerate(sample_load_ids["load_id"].head(20), start=1):
    print(f"  {position}. {load_id}")

# Full cardinality of the column
total_load_ids = distinct_load_ids.count()
print(f"\nTotal unique load_ids: {total_load_ids}")

In [0]:
from pyspark.sql import functions as F

# Cell purpose: summarise the lat/lon extent covered by each load_id and show
# a few raw rows per load_id, then print rough country bounding boxes for
# manual comparison. Relies on `weather` (loaded in an earlier cell) and `F`.
print("ANALYZING WEATHER COORDINATES")
print("=" * 60)

# Bounding box and row count per load_id
print("\nCoordinates by load_id:")
weather_coords = weather.groupBy('load_id').agg(
    F.min('lat').alias('min_lat'),
    F.max('lat').alias('max_lat'),
    F.min('lon').alias('min_lon'),
    F.max('lon').alias('max_lon'),
    F.count('*').alias('row_count')
)
display(weather_coords)

# Show sample data for each load_id
print("\nSample data for each load_id:")

# Every iteration below launches its own Spark job and renders its own table,
# so the number of load_ids inspected is capped. Raise the cap if needed.
MAX_LOAD_ID_SAMPLES = 20

# Fetch one extra row so we can tell whether the cap actually truncated.
load_id_rows = (
    weather.select('load_id')
    .distinct()
    .limit(MAX_LOAD_ID_SAMPLES + 1)
    .collect()
)
if len(load_id_rows) > MAX_LOAD_ID_SAMPLES:
    print(f"(more than {MAX_LOAD_ID_SAMPLES} load_ids found; showing the first {MAX_LOAD_ID_SAMPLES} only)")

for load_id_row in load_id_rows[:MAX_LOAD_ID_SAMPLES]:
    load_id = load_id_row['load_id']
    print(f"\n--- Load ID: {load_id} ---")
    # eqNullSafe so that a NULL load_id matches its own rows; a plain `==`
    # against NULL evaluates to NULL and would filter every row out.
    sample = weather.filter(F.col('load_id').eqNullSafe(load_id)).limit(5)
    display(sample.select('timestamp', 'lat', 'lon', 'temperature_c', 'wind_speed', 'load_id'))

# Check if coordinates match any known European regions
# (static reference text only; nothing is computed from the data here)
print("\nüó∫Ô∏è COORDINATE RANGES:")
print("=" * 60)
print("European coordinate reference:")
print("  Germany: lat 47-55¬∞N, lon 6-15¬∞E")
print("  France: lat 42-51¬∞N, lon -5-8¬∞E")
print("  Spain: lat 36-43¬∞N, lon -9-3¬∞E")
print("  Poland: lat 49-55¬∞N, lon 14-24¬∞E")
print("  Scandinavia: lat 55-71¬∞N, lon 5-31¬∞E")

In [0]:
# Cell purpose: reverse-geocode the distinct (lat, lon) pairs of weather_hourly
# (or a 1000-pair sample when there are more than 10,000) into country codes
# and expose the result as the Spark DataFrame `mapping_df`.
import importlib
import subprocess
import sys
import time

print("MAPPING WEATHER COORDINATES TO COUNTRIES")
print("=" * 60)

# Load weather table (re-declared here so the cell can run on its own)
CATALOG = "curlybyte_solutions_rawdata_europe_grid_load"
WEATHER_SCHEMA = "european_weather_raw"

weather = spark.table(f"{CATALOG}.{WEATHER_SCHEMA}.weather_hourly")
print("‚úì Weather table loaded")

# Step 1: Check how many UNIQUE coordinates we have first
print("\nStep 1: Counting unique coordinates...")
unique_coords = weather.select("lat", "lon").distinct()
unique_coords_count = unique_coords.count()
print(f"Unique lat/lon pairs: {unique_coords_count:,}")

if unique_coords_count > 10000:
    print("‚ö†Ô∏è Too many unique coordinates! Let's use a smarter approach...")

    # Sample coordinates instead (much faster)
    print("\nStep 2: Sampling coordinates for mapping...")
    coord_sample = unique_coords.limit(1000).collect()
    print(f"Working with {len(coord_sample)} sample coordinates")
else:
    print("‚úì Manageable number of coordinates!")
    coord_sample = unique_coords.collect()

# Step 3: Make sure reverse_geocode is importable; install only when missing.
print("\nStep 3: Installing reverse_geocode...")
try:
    import reverse_geocode
except ImportError:
    print("reverse_geocode not found; installing with pip...")
    # Run pip through the current interpreter so the package lands in the
    # environment this notebook actually imports from, not whichever `pip`
    # happens to be first on PATH.
    # NOTE(review): on Databricks a `%pip install` cell is presumably the
    # preferred mechanism - confirm for this workspace.
    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'install', 'reverse_geocode', '--break-system-packages']
    )
    # Let the import system see the freshly installed package.
    importlib.invalidate_caches()
    import reverse_geocode

# Step 4: Map coordinates to countries
print("\nStep 4: Mapping coordinates to countries...")
# Rows with a NULL lat or lon cannot be geocoded (float(None) raises), so
# they are skipped instead of aborting the whole cell.
coord_list = [
    (float(row['lat']), float(row['lon']))
    for row in coord_sample
    if row['lat'] is not None and row['lon'] is not None
]
skipped = len(coord_sample) - len(coord_list)
if skipped:
    print(f"Skipped {skipped} coordinate pair(s) with missing lat/lon")

print("Processing geocoding (this may take 1-2 minutes)...")
start = time.time()

# Nothing to look up when the table yielded no usable coordinates.
countries = reverse_geocode.search(coord_list) if coord_list else []

elapsed = time.time() - start
print(f"‚úì Geocoding complete in {elapsed:.1f} seconds")

# Step 5: Create mapping DataFrame
print("\nStep 5: Creating coordinate-to-country mapping...")
# The "country" column holds the `country_code` field of each geocoder result.
mapping_data = [(coord[0], coord[1], loc['country_code']) for coord, loc in zip(coord_list, countries)]
# Explicit schema: inference from column names alone fails on an empty list.
mapping_df = spark.createDataFrame(mapping_data, "lat double, lon double, country string")

display(mapping_df.limit(20))

print(f"\nMapping created: {len(mapping_data)} coordinates mapped to countries")

In [0]:
# Cell purpose: summarise which countries the geocoded sample landed in and
# compare them with the countries present in the master power-grid table.
# Relies on `mapping_df` produced by the mapping cell.
print("ANALYZING COORDINATE-TO-COUNTRY MAPPING")
print("=" * 60)

# Number of sampled coordinates per country, most frequent first
print("\nCountries found in sample:")
country_counts = mapping_df.groupBy("country").count().sort("count", ascending=False)
display(country_counts)

# Distinct countries in the sample (lazy; reused for the count and the list)
weather_countries = mapping_df.select("country").distinct()
total_countries = weather_countries.count()
print(f"\nTotal unique countries in sample: {total_countries}")

# Distinct countries in the master dataset
print("\nCountries in our master dataset:")
master_table = spark.table("workspace.default.power_grid_master")
master_countries = master_table.select("country").distinct()
display(master_countries.orderBy("country"))

# Intersect the two country sets on the driver
print("\nChecking overlap between weather coords and our data...")
master_country_list = [record["country"] for record in master_countries.collect()]
weather_country_list = [record["country"] for record in weather_countries.collect()]

overlap = set(master_country_list).intersection(weather_country_list)
print(f"Overlapping countries: {len(overlap)}")
print(f"Countries: {sorted(overlap)}")

# Closing prompt: lay out the follow-up options for the reader
closing_lines = (
    "\n" + "=" * 60,
    "NEXT STEPS:",
    "=" * 60,
    "We have a working coordinate-to-country mapping!",
    "\nOptions:",
    "1. ‚úÖ Apply this mapping to ALL weather data (will take ~30 min)",
    "2. ‚è© Skip weather for now, proceed with current dataset",
    "\nYour choice?",
)
for closing_line in closing_lines:
    print(closing_line)