In [0]:
from pyspark.sql import functions as F
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Exploration cell ---------------------------------------------------------
# First look at workspace.default.electricity_and_weather_europe: size, schema,
# a 10-row sample, exact-duplicate rows, the min/max of every date/time-like
# column, and the distinct values of every country-like column.
# Relies on the Databricks-provided `spark` session and `display` helper.
print("EXPLORING NEW DATASET: electricity_and_weather_europe")
print("="*60)

# Load the table
df = spark.table("workspace.default.electricity_and_weather_europe")

# count() is a Spark action that scans the table on every call, so run it once
# and reuse the result in sections 1 and 5.
total_rows = df.count()

print("\n1. BASIC INFO:")
print("-"*60)
print(f"Total rows: {total_rows:,}")
print(f"Total columns: {len(df.columns)}")

print("\n2. COLUMN NAMES AND TYPES:")
print("-"*60)
df.printSchema()

print("\n3. SAMPLE DATA (first 10 rows):")
print("-"*60)
display(df.limit(10))

print("\n4. COLUMN LIST:")
print("-"*60)
print(df.columns)

print("\n5. CHECK FOR DUPLICATES:")
print("-"*60)
print(f"Total rows: {total_rows:,}")
print(f"Distinct rows: {df.distinct().count():,}")

print("\n6. DATE RANGE:")
print("-"*60)
# Columns are selected purely by name: 'date' or 'time' anywhere in the name.
date_cols = [name for name in df.columns if 'date' in name.lower() or 'time' in name.lower()]
print(f"Time columns found: {date_cols}")
for name in date_cols:
    # Label each result so several date/time columns can be told apart.
    print(f"\n{name}:")
    df.select(
        F.min(name).alias('min_date'),
        F.max(name).alias('max_date')
    ).show()

print("\n7. COUNTRIES:")
print("-"*60)
# Same name-based selection, for columns containing 'country'.
country_cols = [name for name in df.columns if 'country' in name.lower()]
for name in country_cols:
    print(f"\nUnique values in {name}:")
    df.select(name).distinct().orderBy(name).show(30, truncate=False)

In [0]:
# --- Data-quality cell --------------------------------------------------------
# Reports per-column null counts, exact-duplicate rows, the min/max of the
# 'index' column, rows per country, and summary statistics for four key columns.
print("STEP 2: DATA QUALITY CHECK")
print("="*60)

# Load data
df = spark.table("workspace.default.electricity_and_weather_europe")

# The row count feeds both the null percentages and the duplicate check;
# count() is a Spark action, so compute it once.
total_rows = df.count()

print("\n1. MISSING VALUES CHECK:")
print("-"*60)

# Count nulls for each column
from pyspark.sql import functions as F

# F.when(...) without otherwise() yields null for non-matching rows and
# F.count skips nulls, so each aggregate counts only the null cells.
null_counts = df.select([
    F.count(F.when(F.col(c).isNull(), c)).alias(c)
    for c in df.columns
])

# Convert to pandas for easier viewing (one row per source column after .T)
null_df = null_counts.toPandas().T
null_df.columns = ['null_count']
null_df['null_pct'] = (null_df['null_count'] / total_rows) * 100
null_df = null_df[null_df['null_count'] > 0].sort_values('null_pct', ascending=False)

print(f"\nColumns with missing values:")
if len(null_df) > 0:
    display(null_df.head(20))
else:
    print("✓ No missing values found!")

print("\n2. DUPLICATE CHECK:")
print("-"*60)
distinct_rows = df.distinct().count()
duplicates = total_rows - distinct_rows
# Guard the percentage so an empty table does not raise ZeroDivisionError.
duplicate_pct = (duplicates / total_rows * 100) if total_rows else 0.0
print(f"Total rows: {total_rows:,}")
print(f"Distinct rows: {distinct_rows:,}")
print(f"Duplicates: {duplicates:,} ({duplicate_pct:.2f}%)")

print("\n3. TIME COVERAGE:")
print("-"*60)
# 'index' is treated as the time column here; count('index') counts its
# non-null values.
time_stats = df.select(
    F.min('index').alias('start_date'),
    F.max('index').alias('end_date'),
    F.count('index').alias('total_records')
).collect()[0]

print(f"Start date: {time_stats['start_date']}")
print(f"End date: {time_stats['end_date']}")
print(f"Total records: {time_stats['total_records']:,}")

# Rows per country, largest first
print("\n4. COUNTRY COVERAGE:")
print("-"*60)
country_counts = df.groupBy('country').count().orderBy(F.desc('count'))
display(country_counts)

print("\n5. BASIC STATISTICS (Key Columns):")
print("-"*60)
key_cols = ['Actual_Load', 'net_imports', 'mean_temperature_c', 'mean_wind_speed']
df.select(key_cols).summary().show()

In [0]:
# --- Feature-analysis cell ----------------------------------------------------
# Pulls the Spark frame `df` (loaded by an earlier cell) into pandas, derives
# Total_Generation / Supply / Imbalance / Imbalance_Pct, and prints summary
# statistics for generation, supply-vs-demand balance, and weather columns.
print("STEP 3: FEATURE ANALYSIS")
print("="*60)

# Convert to Pandas for easier analysis
df_pandas = df.toPandas()  # ← Using 'df' not 'df_renamed'
n_rows = len(df_pandas)

print("\n1. GENERATION FEATURES:")
print("-"*60)

# Get all generation columns (identified by the '__Actual_Aggregated' suffix)
generation_cols = [col for col in df_pandas.columns if '__Actual_Aggregated' in col]
print(f"Total generation types: {len(generation_cols)}")

# Calculate total generation.
# NOTE: sum(axis=1) skips NaN, so a missing generation value counts as 0 MW.
df_pandas['Total_Generation'] = df_pandas[generation_cols].sum(axis=1)

print(f"\nTotal Generation statistics:")
print(f"  Mean: {df_pandas['Total_Generation'].mean():,.2f} MW")
print(f"  Median: {df_pandas['Total_Generation'].median():,.2f} MW")
print(f"  Max: {df_pandas['Total_Generation'].max():,.2f} MW")

# Top generation sources by average contribution
print(f"\nTop 10 Generation Sources (by average MW):")
gen_means = df_pandas[generation_cols].mean().sort_values(ascending=False).head(10)
for col, val in gen_means.items():
    col_name = col.replace('__Actual_Aggregated', '')
    print(f"  {col_name:.<50} {val:>10,.2f} MW")

print("\n2. SUPPLY vs DEMAND BALANCE:")
print("-"*60)

# Calculate supply
df_pandas['Supply'] = df_pandas['Total_Generation'] + df_pandas['net_imports']
df_pandas['Imbalance'] = df_pandas['Supply'] - df_pandas['Actual_Load']
# Dividing by a zero load would produce +/-inf and poison the mean printed
# below; mask zero loads to NaN so those rows are skipped instead.
nonzero_load = df_pandas['Actual_Load'].where(df_pandas['Actual_Load'] != 0)
df_pandas['Imbalance_Pct'] = (df_pandas['Imbalance'] / nonzero_load) * 100

print(f"Average Actual_Load: {df_pandas['Actual_Load'].mean():>12,.2f} MW")
print(f"Average Total_Generation: {df_pandas['Total_Generation'].mean():>12,.2f} MW")
print(f"Average net_imports: {df_pandas['net_imports'].mean():>12,.2f} MW")
print(f"Average Supply: {df_pandas['Supply'].mean():>12,.2f} MW")
print(f"Average Imbalance: {df_pandas['Imbalance'].mean():>12,.2f} MW ({df_pandas['Imbalance_Pct'].mean():.2f}%)")

# Check for deficits (rows whose Imbalance is NaN fall into none of the three)
deficit_count = (df_pandas['Imbalance'] < 0).sum()
surplus_count = (df_pandas['Imbalance'] > 0).sum()
balanced_count = (df_pandas['Imbalance'] == 0).sum()

print(f"\nSupply-Demand Balance:")
print(f"  Deficit (Supply < Demand): {deficit_count:>8,} ({deficit_count/n_rows*100:>5.2f}%)")
print(f"  Surplus (Supply > Demand): {surplus_count:>8,} ({surplus_count/n_rows*100:>5.2f}%)")
print(f"  Balanced (Supply = Demand): {balanced_count:>8,} ({balanced_count/n_rows*100:>5.2f}%)")

if deficit_count > 0:
    deficits = df_pandas[df_pandas['Imbalance'] < 0]
    print(f"\n  Deficit Statistics:")
    print(f"    Mean deficit: {deficits['Imbalance'].mean():,.2f} MW")
    print(f"    Worst deficit: {deficits['Imbalance'].min():,.2f} MW")
    print(f"    Countries with deficits: {deficits['country'].nunique()}")

print("\n3. WEATHER FEATURES:")
print("-"*60)
weather_cols = ['mean_ssrd', 'mean_wind_speed', 'mean_temperature_c']
weather_stats = df_pandas[weather_cols].describe()
print(weather_stats)

In [0]:
# --- Blackout-definition cell -------------------------------------------------
# Uses df_pandas (with the Imbalance / Supply columns built by the feature
# analysis cell) to print the imbalance distribution, rank countries by deficit
# severity, compare three candidate "blackout" thresholds, and draw a 2x2
# diagnostic figure.
print("\nSTEP 4: BLACKOUT DEFINITION ANALYSIS")
print("="*60)

print("\n1. IMBALANCE DISTRIBUTION:")
print("-"*60)

# Percentiles of imbalance
percentiles = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]
print("\nImbalance Percentiles:")
for p in percentiles:
    val = df_pandas['Imbalance'].quantile(p)
    # The p-quantile is the value that p*100 % of records fall below, so the
    # 1st percentile marks the worst 1 % of deficits (not the worst 99 %).
    print(f"  {p*100:>5.1f}% ({p*100:.1f}% of records are below): {val:>10,.2f} MW")

print("\n2. DEFICIT SEVERITY BY COUNTRY:")
print("-"*60)
deficit_by_country = df_pandas[df_pandas['Imbalance'] < 0].groupby('country').agg({
    'Imbalance': ['mean', 'min', 'count']
}).round(2)
deficit_by_country.columns = ['Mean_Deficit_MW', 'Worst_Deficit_MW', 'Deficit_Count']
# Ascending sort puts the most negative (largest) mean deficit first.
deficit_by_country = deficit_by_country.sort_values('Mean_Deficit_MW')
print(deficit_by_country.head(10))

print("\n3. POTENTIAL BLACKOUT DEFINITIONS:")
print("-"*60)

n_rows = len(df_pandas)

# Option 1: Extreme deficit (imbalance below the 5th percentile)
threshold_extreme = df_pandas['Imbalance'].quantile(0.05)
blackout_extreme = (df_pandas['Imbalance'] < threshold_extreme).sum()
print(f"\nOption 1: Extreme Deficit (< {threshold_extreme:.0f} MW)")
print(f"  Would flag: {blackout_extreme:,} records ({blackout_extreme/n_rows*100:.2f}%)")

# Option 2: Very severe deficit (< -10,000 MW)
blackout_10k = (df_pandas['Imbalance'] < -10000).sum()
print(f"\nOption 2: Severe Deficit (< -10,000 MW)")
print(f"  Would flag: {blackout_10k:,} records ({blackout_10k/n_rows*100:.2f}%)")

# Option 3: Deficit worse than -50% of load.
# Zero loads are masked to NaN so the ratio never becomes +/-inf; NaN rows
# compare False below and are therefore not flagged.
nonzero_load = df_pandas['Actual_Load'].where(df_pandas['Actual_Load'] != 0)
df_pandas['Deficit_Pct_of_Load'] = (df_pandas['Imbalance'] / nonzero_load) * 100
blackout_50pct = (df_pandas['Deficit_Pct_of_Load'] < -50).sum()
print(f"\nOption 3: Deficit > 50% of Load")
print(f"  Would flag: {blackout_50pct:,} records ({blackout_50pct/n_rows*100:.2f}%)")

# Visualize
import matplotlib.pyplot as plt
import seaborn as sns

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Plot 1: Imbalance distribution (NaN dropped: hist cannot auto-range over NaN)
axes[0, 0].hist(df_pandas['Imbalance'].dropna(), bins=100, edgecolor='black')
axes[0, 0].axvline(x=0, color='red', linestyle='--', linewidth=2, label='Balance')
axes[0, 0].axvline(x=threshold_extreme, color='orange', linestyle='--', linewidth=2, label='5th percentile')
axes[0, 0].set_xlabel('Imbalance (MW)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Supply-Demand Imbalance Distribution')
axes[0, 0].legend()

# Plot 2: Deficit % distribution, clipped to the [-100, 50] window
axes[0, 1].hist(df_pandas['Deficit_Pct_of_Load'].dropna(), bins=100, range=(-100, 50), edgecolor='black')
axes[0, 1].axvline(x=0, color='red', linestyle='--', linewidth=2, label='Balance')
axes[0, 1].axvline(x=-50, color='orange', linestyle='--', linewidth=2, label='-50%')
axes[0, 1].set_xlabel('Deficit as % of Load')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Deficit Percentage Distribution')
axes[0, 1].legend()

# Plot 3: Deficit by country
deficit_counts = df_pandas[df_pandas['Imbalance'] < 0].groupby('country').size().sort_values(ascending=False)
axes[1, 0].barh(deficit_counts.index[:15], deficit_counts.values[:15])
# barh draws the first entry at the bottom; flip so the top country is on top.
axes[1, 0].invert_yaxis()
axes[1, 0].set_xlabel('Number of Deficit Hours')
axes[1, 0].set_ylabel('Country')
axes[1, 0].set_title('Top 15 Countries by Deficit Frequency')

# Plot 4: Time series sample (first 1000 records).
# NOTE(review): these are the first 1000 rows in whatever order toPandas()
# returned them; they are not sorted by country or time here.
sample = df_pandas.head(1000)
axes[1, 1].plot(sample.index, sample['Actual_Load'], label='Actual Load', alpha=0.7)
axes[1, 1].plot(sample.index, sample['Supply'], label='Supply', alpha=0.7)
axes[1, 1].fill_between(sample.index, sample['Actual_Load'], sample['Supply'],
                         where=(sample['Supply'] < sample['Actual_Load']),
                         alpha=0.3, color='red', label='Deficit')
axes[1, 1].set_xlabel('Record Index')
axes[1, 1].set_ylabel('MW')
axes[1, 1].set_title('Load vs Supply (First 1000 records)')
axes[1, 1].legend()

plt.tight_layout()
plt.show()

print("\n" + "="*60)
print("RECOMMENDATION:")
print("="*60)
print("Since generation data is incomplete, define 'blackout risk' as:")
print("  🎯 Imbalance < 5th percentile (extreme deficit)")
print(f"  🎯 Threshold: {threshold_extreme:.0f} MW")
print(f"  🎯 This flags ~5% of data ({blackout_extreme:,} records)")
print("\nThis represents UNUSUALLY BAD deficits, not just routine missing data.")

In [0]:
# --- Compare three candidate failure definitions ------------------------------
# Adds three 0/1 flag columns to df_pandas (extreme deficit, rapid load change,
# peak stress) and prints how often each fires and how often they coincide.
print("COMPARING ALL FAILURE DEFINITIONS")
print("="*60)

# Option 1: Extreme Deficit (current) - imbalance below its 5th percentile
df_pandas['blackout_deficit'] = (df_pandas['Imbalance'] < df_pandas['Imbalance'].quantile(0.05)).astype(int)

# Option 2: Rapid Change
# diff() compares each row with the previous row of the same country, so the
# rows must be in time order first; the order returned by toPandas() is not
# guaranteed to be chronological. The result is assigned back by index label,
# which leaves df_pandas' own row order untouched.
time_ordered = df_pandas.sort_values(['country', 'index'])
df_pandas['load_change'] = time_ordered.groupby('country')['Actual_Load'].diff()
df_pandas['load_change_pct'] = (df_pandas['load_change'].abs() / df_pandas['Actual_Load']) * 100
# The first record of each country has no predecessor (NaN) and is not flagged.
df_pandas['blackout_rapid'] = (df_pandas['load_change_pct'] > 15).astype(int)

# Option 3: Peak Stress - load above 90 % of the country's own observed maximum
df_pandas['max_load_by_country'] = df_pandas.groupby('country')['Actual_Load'].transform('max')
df_pandas['load_pct_of_max'] = (df_pandas['Actual_Load'] / df_pandas['max_load_by_country']) * 100
df_pandas['blackout_peak'] = (df_pandas['load_pct_of_max'] > 90).astype(int)

print("\nFailure Type Frequencies:")
print(f"  Extreme Deficit:  {df_pandas['blackout_deficit'].sum():>8,} ({df_pandas['blackout_deficit'].mean()*100:>5.2f}%)")
print(f"  Rapid Change:     {df_pandas['blackout_rapid'].sum():>8,} ({df_pandas['blackout_rapid'].mean()*100:>5.2f}%)")
print(f"  Peak Stress:      {df_pandas['blackout_peak'].sum():>8,} ({df_pandas['blackout_peak'].mean()*100:>5.2f}%)")

print("\nOverlap Analysis:")
# The flags are 0/1 integers, so bitwise & acts as a logical AND per row.
overlap_all = (df_pandas['blackout_deficit'] & df_pandas['blackout_rapid'] & df_pandas['blackout_peak']).sum()
overlap_deficit_rapid = (df_pandas['blackout_deficit'] & df_pandas['blackout_rapid']).sum()
overlap_deficit_peak = (df_pandas['blackout_deficit'] & df_pandas['blackout_peak']).sum()
overlap_rapid_peak = (df_pandas['blackout_rapid'] & df_pandas['blackout_peak']).sum()

print(f"  All 3 conditions:      {overlap_all:>8,}")
print(f"  Deficit + Rapid:       {overlap_deficit_rapid:>8,}")
print(f"  Deficit + Peak:        {overlap_deficit_peak:>8,}")
print(f"  Rapid + Peak:          {overlap_rapid_peak:>8,}")

In [0]:
from pyspark.sql import functions as F
import pandas as pd

# --- Join Forecasted_Load onto the main dataset -------------------------------
# Left-joins the raw load_forecast table onto electricity_and_weather_europe by
# (index, country), reports the match rate, converts the result to pandas
# (df_pandas_complete) and prints absolute / percentage forecast-error stats.
print("STEP 1: ADDING FORECASTED_LOAD FROM ORIGINAL DATASET")
print("="*60)

# Load the main dataset
df = spark.table("workspace.default.electricity_and_weather_europe")
main_rows = df.count()
print(f"Main dataset loaded: {main_rows:,} rows")

# Load the original load_forecast table
CATALOG = "curlybyte_solutions_rawdata_europe_grid_load"
GRID_SCHEMA = "european_grid_raw__v2"

load_forecast = spark.table(f"{CATALOG}.{GRID_SCHEMA}.load_forecast")

print(f"\n1. Original load_forecast table:")
print(f"   Rows: {load_forecast.count():,}")
print(f"   Columns: {load_forecast.columns}")

# Show sample
print(f"\n2. Sample forecast data:")
display(load_forecast.limit(5))

# Select only needed columns
load_forecast_clean = load_forecast.select(
    F.col('index'),
    F.col('country'),
    F.col('Forecasted_Load')
)

# Join with current dataset
print(f"\n3. Joining with electricity_and_weather_europe...")

df_with_forecast = df.join(
    load_forecast_clean,
    on=['index', 'country'],
    how='left'
)

# Every count() on the joined frame re-executes the join, so run it once.
joined_rows = df_with_forecast.count()

print(f"\n4. Joined dataset:")
print(f"   Rows: {joined_rows:,}")
print(f"   Columns: {len(df_with_forecast.columns)}")

# A left join keeps every left row; it can only grow when a key matches more
# than one forecast row, which would double-count load records downstream.
if joined_rows != main_rows:
    print(f"   WARNING: row count changed from {main_rows:,} to {joined_rows:,} - "
          "load_forecast has duplicate (index, country) keys")

# Check match rate (a null Forecasted_Load is treated as "no match")
forecast_nulls = df_with_forecast.filter(F.col('Forecasted_Load').isNull()).count()
unmatched_pct = (forecast_nulls / joined_rows * 100) if joined_rows else 0.0
print(f"   Matched rows: {joined_rows - forecast_nulls:,}")
print(f"   Unmatched rows: {forecast_nulls:,} ({unmatched_pct:.2f}%)")

# Convert to pandas for analysis
df_pandas_complete = df_with_forecast.toPandas()

# Calculate forecast error.
# NOTE: rows with Forecasted_Load == 0 give an infinite percentage here.
df_pandas_complete['Forecast_Error'] = abs(df_pandas_complete['Actual_Load'] - df_pandas_complete['Forecasted_Load'])
df_pandas_complete['Forecast_Error_Pct'] = (df_pandas_complete['Forecast_Error'] / df_pandas_complete['Forecasted_Load']) * 100

print(f"\n5. Forecast Error Statistics:")
print(f"   Mean error: {df_pandas_complete['Forecast_Error'].mean():,.2f} MW")
print(f"   Mean error %: {df_pandas_complete['Forecast_Error_Pct'].mean():.2f}%")
print(f"   Median error %: {df_pandas_complete['Forecast_Error_Pct'].median():.2f}%")
print(f"   95th percentile: {df_pandas_complete['Forecast_Error_Pct'].quantile(0.95):.2f}%")

print("\n" + "="*60)
print("SUCCESS! Forecasted_Load added to dataset")
print("="*60)

In [0]:
# --- Clean unusable forecast values -------------------------------------------
# Reports zero / null / tiny Forecasted_Load values in df_pandas_complete,
# drops rows whose forecast is missing or not above the minimum threshold, and
# recomputes the forecast-error columns on the remaining rows.
print("FIXING DATA QUALITY ISSUES")
print("="*60)

# Forecasts at or below this value (MW) are treated as unusable. Naming it
# keeps the diagnostic below and the filter itself in agreement.
MIN_VALID_FORECAST_MW = 10

# Check for problematic forecast values
print("\n1. Checking Forecasted_Load issues:")
zero_forecasts = (df_pandas_complete['Forecasted_Load'] == 0).sum()
null_forecasts = df_pandas_complete['Forecasted_Load'].isna().sum()
very_small_forecasts = (df_pandas_complete['Forecasted_Load'] < 1).sum()
below_threshold_forecasts = (df_pandas_complete['Forecasted_Load'] <= MIN_VALID_FORECAST_MW).sum()

print(f"   Zero forecasts: {zero_forecasts:,}")
print(f"   Null forecasts: {null_forecasts:,}")
print(f"   Very small (<1 MW) forecasts: {very_small_forecasts:,}")
# This is the population the filter below actually removes (plus the nulls).
print(f"   At or below {MIN_VALID_FORECAST_MW} MW forecasts: {below_threshold_forecasts:,}")

# Clean the data: Remove rows with problematic forecasts
print("\n2. Cleaning data...")
original_count = len(df_pandas_complete)

df_pandas_complete = df_pandas_complete[
    (df_pandas_complete['Forecasted_Load'].notna()) &
    (df_pandas_complete['Forecasted_Load'] > MIN_VALID_FORECAST_MW)  # Keep only reasonable forecasts
].copy()

removed_count = original_count - len(df_pandas_complete)
print(f"   Removed {removed_count:,} problematic rows")
print(f"   Remaining: {len(df_pandas_complete):,} rows")

# Recalculate forecast error (now safe: every remaining denominator is > 0)
df_pandas_complete['Forecast_Error'] = abs(df_pandas_complete['Actual_Load'] - df_pandas_complete['Forecasted_Load'])
df_pandas_complete['Forecast_Error_Pct'] = (df_pandas_complete['Forecast_Error'] / df_pandas_complete['Forecasted_Load']) * 100

print(f"\n3. CLEAN Forecast Error Statistics:")
print(f"   Mean error: {df_pandas_complete['Forecast_Error'].mean():,.2f} MW")
print(f"   Mean error %: {df_pandas_complete['Forecast_Error_Pct'].mean():.2f}%")
print(f"   Median error %: {df_pandas_complete['Forecast_Error_Pct'].median():.2f}%")
print(f"   95th percentile: {df_pandas_complete['Forecast_Error_Pct'].quantile(0.95):.2f}%")

print("\n" + "="*60)
print("DATA CLEANED! Ready for Step 2")
print("="*60)

In [0]:
# --- Build the four failure-type target columns -------------------------------
# Adds supply/imbalance helper columns and four 0/1 targets to
# df_pandas_complete: blackout_deficit, blackout_forecast_deviation,
# blackout_rapid and blackout_peak.
print("STEP 2: CREATING ALL 4 FAILURE TYPE DEFINITIONS")
print("="*60)

# Calculate necessary columns for all failure types

# Get generation columns and calculate totals
# (sum(axis=1) skips NaN, so a missing generation value counts as 0 MW)
generation_cols = [col for col in df_pandas_complete.columns if '__Actual_Aggregated' in col]
df_pandas_complete['Total_Generation'] = df_pandas_complete[generation_cols].sum(axis=1)
df_pandas_complete['Supply'] = df_pandas_complete['Total_Generation'] + df_pandas_complete['net_imports']
df_pandas_complete['Imbalance'] = df_pandas_complete['Supply'] - df_pandas_complete['Actual_Load']

# Failure Type 1: Extreme Deficit - imbalance below its 5th percentile
df_pandas_complete['blackout_deficit'] = (df_pandas_complete['Imbalance'] < df_pandas_complete['Imbalance'].quantile(0.05)).astype(int)

# Failure Type 2: Forecast Deviation - absolute forecast error above 10 %
df_pandas_complete['blackout_forecast_deviation'] = (df_pandas_complete['Forecast_Error_Pct'] > 10).astype(int)

# Failure Type 3: Rapid Change
# diff() compares each row with the previous row of the same country, so the
# rows must be in time order first; the order returned by toPandas() is not
# guaranteed to be chronological. Assigning the result back aligns on index
# labels, so df_pandas_complete keeps its own row order.
# NOTE(review): if rows were removed by an earlier cleaning step, consecutive
# rows may be more than one time step apart - confirm this is acceptable.
time_ordered = df_pandas_complete.sort_values(['country', 'index'])
df_pandas_complete['load_change'] = time_ordered.groupby('country')['Actual_Load'].diff()
df_pandas_complete['load_change_pct'] = (df_pandas_complete['load_change'].abs() / df_pandas_complete['Actual_Load']) * 100
df_pandas_complete['blackout_rapid'] = (df_pandas_complete['load_change_pct'] > 15).astype(int)

# Failure Type 4: Peak Stress - load above 90 % of the country's observed max
df_pandas_complete['max_load_by_country'] = df_pandas_complete.groupby('country')['Actual_Load'].transform('max')
df_pandas_complete['load_pct_of_max'] = (df_pandas_complete['Actual_Load'] / df_pandas_complete['max_load_by_country']) * 100
df_pandas_complete['blackout_peak'] = (df_pandas_complete['load_pct_of_max'] > 90).astype(int)

print("\nAll 4 failure types created!")
print("="*60)

In [0]:
# --- Compare the four targets, save the table, print a summary ----------------
# Prints frequency and pairwise overlap of the four blackout_* flags, writes
# df_pandas_complete to a Delta table (overwrite), verifies the write, and
# prints usage notes plus a dataset summary computed from the saved data.
print("STEP 3: COMPARING ALL FAILURE TYPES")
print("="*60)

print("\nFailure Type Frequencies:")
print(f"  1. Extreme Deficit:        {df_pandas_complete['blackout_deficit'].sum():>8,} ({df_pandas_complete['blackout_deficit'].mean()*100:>5.2f}%)")
print(f"  2. Forecast Deviation:     {df_pandas_complete['blackout_forecast_deviation'].sum():>8,} ({df_pandas_complete['blackout_forecast_deviation'].mean()*100:>5.2f}%)")
print(f"  3. Rapid Change:           {df_pandas_complete['blackout_rapid'].sum():>8,} ({df_pandas_complete['blackout_rapid'].mean()*100:>5.2f}%)")
print(f"  4. Peak Stress:            {df_pandas_complete['blackout_peak'].sum():>8,} ({df_pandas_complete['blackout_peak'].mean()*100:>5.2f}%)")

print("\nOverlap Analysis:")
# The flags are 0/1 integers, so bitwise & acts as a logical AND per row.
overlap_all = (df_pandas_complete['blackout_deficit'] &
               df_pandas_complete['blackout_forecast_deviation'] &
               df_pandas_complete['blackout_rapid'] &
               df_pandas_complete['blackout_peak']).sum()
overlap_def_fore = (df_pandas_complete['blackout_deficit'] & df_pandas_complete['blackout_forecast_deviation']).sum()
overlap_def_rapid = (df_pandas_complete['blackout_deficit'] & df_pandas_complete['blackout_rapid']).sum()
overlap_def_peak = (df_pandas_complete['blackout_deficit'] & df_pandas_complete['blackout_peak']).sum()
overlap_fore_rapid = (df_pandas_complete['blackout_forecast_deviation'] & df_pandas_complete['blackout_rapid']).sum()
overlap_fore_peak = (df_pandas_complete['blackout_forecast_deviation'] & df_pandas_complete['blackout_peak']).sum()
overlap_rapid_peak = (df_pandas_complete['blackout_rapid'] & df_pandas_complete['blackout_peak']).sum()

print(f"  All 4 conditions:              {overlap_all:>8,}")
print(f"  Deficit + Forecast:            {overlap_def_fore:>8,}")
print(f"  Deficit + Rapid:               {overlap_def_rapid:>8,}")
print(f"  Deficit + Peak:                {overlap_def_peak:>8,}")
print(f"  Forecast + Rapid:              {overlap_fore_rapid:>8,}")
print(f"  Forecast + Peak:               {overlap_fore_peak:>8,}")
print(f"  Rapid + Peak:                  {overlap_rapid_peak:>8,}")

print("\n" + "="*60)
print("SAVING COMPLETE DATASET FOR TEAM")
print("="*60)

# Convert back to Spark DataFrame
df_complete_spark = spark.createDataFrame(df_pandas_complete)

# Save as Delta table (mode="overwrite" replaces any existing table)
table_name = "workspace.default.electricity_weather_forecast_complete"
df_complete_spark.write.format("delta").mode("overwrite").saveAsTable(table_name)

print(f"\nSaved to: {table_name}")
print(f"  Rows: {df_complete_spark.count():,}")
print(f"  Columns: {len(df_complete_spark.columns)}")

# Verify the table was saved
print(f"\nVerifying save...")
test_load = spark.table(table_name)
print(f"  Verified: {test_load.count():,} rows in saved table")

print("\n" + "="*60)
print("HOW YOUR COLLEAGUES CAN ACCESS IT:")
print("="*60)
print("""
# In a new notebook, they can load it with:
df = spark.table('workspace.default.electricity_weather_forecast_complete')

# Or convert to pandas:
df_pandas = df.toPandas()

# Available columns include:
# - All original features (generation, weather, etc.)
# - Forecasted_Load (joined from original dataset)
# - Forecast_Error and Forecast_Error_Pct
# - All 4 blackout target variables:
#   * blackout_deficit
#   * blackout_forecast_deviation
#   * blackout_rapid
#   * blackout_peak
""")

# Derive the summary from the data that was just saved instead of hard-coding
# figures that silently go stale when the source tables change.
period_start = df_pandas_complete['index'].min()
period_end = df_pandas_complete['index'].max()
n_countries = df_pandas_complete['country'].nunique()
mean_forecast_error_pct = df_pandas_complete['Forecast_Error_Pct'].mean()
# Match rate is measured on the joined Spark frame from the forecast-join
# cell, i.e. before any unusable forecasts were dropped.
joined_rows = df_with_forecast.count()
forecast_match_pct = ((joined_rows - forecast_nulls) / joined_rows * 100) if joined_rows else 0.0

print("\n" + "="*60)
print("DATASET SUMMARY:")
print("="*60)
print(f"  Time period: {period_start} to {period_end}")
print(f"  Countries: {n_countries} European countries")
print(f"  Total records: {len(df_pandas_complete):,}")
print(f"  Features: {len(df_pandas_complete.columns)}")
print(f"  Forecast match rate: {forecast_match_pct:.1f}%")
print(f"  Average forecast error: {mean_forecast_error_pct:.2f}%")

print("\n" + "="*60)
print("RECOMMENDATION FOR MODELING:")
print("="*60)

In [0]:
# --- Diagnostic cell: locate the saved Delta table ----------------------------
# Lists the tables in workspace.default, checks whether the saved table exists,
# loads it when present (otherwise lists the current and all databases), and
# reports which related DataFrames are still defined in this session.
print("FINDING YOUR SAVED TABLE")
print("="*60)

# Everything registered under workspace.default
print("\n1. Tables in workspace.default:")
tables_workspace = spark.sql("SHOW TABLES IN workspace.default").toPandas()
display(tables_workspace)

# Existence check for the table written by the save step
table_name = "workspace.default.electricity_weather_forecast_complete"
table_exists = spark.catalog.tableExists(table_name)
print(f"\n2. Does table exist? {table_exists}")

if not table_exists:
    # Missing: show where the session is pointing and what else is available
    print("\n3. Table not found. Let's check other locations...")

    print(f"\nCurrent database: {spark.catalog.currentDatabase()}")

    print(f"\nAll databases:")
    databases = spark.sql("SHOW DATABASES").toPandas()
    display(databases)
else:
    # Present: load it and report its size
    print("\n3. Table found! Loading...")
    test_df = spark.table(table_name)
    print(f"   Rows: {test_df.count():,}")
    print(f"   Columns: {len(test_df.columns)}")

# In-memory objects from earlier cells, if they are still defined
print("\n4. Available DataFrames in memory:")
pandas_status = len(df_pandas_complete) if 'df_pandas_complete' in locals() else 'Not found'
spark_status = 'Found' if 'df_complete_spark' in locals() else 'Not found'
print(f"   df_pandas_complete: {pandas_status}")
print(f"   df_complete_spark: {spark_status}")

In [0]:
# --- Inspect the columns of the saved table -----------------------------------
# Loads the saved Delta table and reports, group by group, which expected
# columns (forecast, engineered, target) are present in it.
print("CHECKING SAVED TABLE CONTENTS")
print("="*60)

# Markers used in every presence report below (the originals were mis-encoded
# and printed as garbage characters).
YES_MARK = '✅ YES'
NO_MARK = '❌ NO'

# Load the saved table
df_saved = spark.table('workspace.default.electricity_weather_forecast_complete')

# Convert to pandas to see all columns
# NOTE(review): this pulls the whole table to the driver although only the
# column names are used in this cell; df_saved.columns would be enough if
# nothing later needs df_saved_pandas.
df_saved_pandas = df_saved.toPandas()

print(f"\n1. TOTAL COLUMNS: {len(df_saved_pandas.columns)}")

print(f"\n2. ORIGINAL FEATURES (from electricity_and_weather_europe):")
# "Original" = everything that is neither a blackout_* target nor one of the
# known derived columns listed here.
original_features = [col for col in df_saved_pandas.columns if not col.startswith('blackout')
                     and col not in ['Forecasted_Load', 'Forecast_Error', 'Forecast_Error_Pct',
                                     'Total_Generation', 'Supply', 'Imbalance', 'Imbalance_Pct',
                                     'load_change', 'load_change_pct', 'max_load_by_country', 'load_pct_of_max']]
print(f"   Count: {len(original_features)}")
print(f"   Examples: {original_features[:5]}")

print(f"\n3. ADDED FROM FORECAST TABLE:")
forecast_features = ['Forecasted_Load', 'Forecast_Error', 'Forecast_Error_Pct']
for feat in forecast_features:
    exists = feat in df_saved_pandas.columns
    print(f"   {feat}: {YES_MARK if exists else NO_MARK}")

print(f"\n4. ENGINEERED FEATURES (from failure analysis):")
engineered_features = {
    'Total_Generation': 'Sum of all generation types',
    'Supply': 'Total_Generation + net_imports',
    'Imbalance': 'Supply - Actual_Load',
    'Imbalance_Pct': 'Imbalance as % of load',
    'load_change': 'Hour-to-hour load change',
    'load_change_pct': 'Load change as %',
    'max_load_by_country': 'Historical max per country',
    'load_pct_of_max': 'Current load as % of max'
}
for feat, description in engineered_features.items():
    exists = feat in df_saved_pandas.columns
    print(f"   {feat}: {YES_MARK if exists else NO_MARK} - {description}")

print(f"\n5. TARGET VARIABLES:")
target_features = {
    'blackout_deficit': 'Extreme supply deficit (5th percentile)',
    'blackout_forecast_deviation': 'Forecast error >10%',
    'blackout_rapid': 'Rapid load change >15%',
    'blackout_peak': 'Load >90% of max capacity'
}
for feat, description in target_features.items():
    exists = feat in df_saved_pandas.columns
    print(f"   {feat}: {YES_MARK if exists else NO_MARK} - {description}")

print(f"\n6. COMPLETE COLUMN LIST:")
print(df_saved_pandas.columns.tolist())

print("\n" + "="*60)
print("SUMMARY:")
print("="*60)
print(f"Total columns saved: {len(df_saved_pandas.columns)}")
# Readiness is judged only by the presence of the forecast-deviation target.
print(f"Ready for modeling: {YES_MARK if 'blackout_forecast_deviation' in df_saved_pandas.columns else NO_MARK}")

In [0]:
# --- Build and save the minimal dataset: original columns + Forecasted_Load ---
# Left-joins Forecasted_Load onto electricity_and_weather_europe by
# (index, country), reports the match rate, writes the result to a Delta table
# (overwrite), and verifies the write. No engineered or target columns are added.
print("CREATING CLEAN DATASET - ONLY ADDING FORECASTED_LOAD")
print("="*60)

# Step 1: Load original dataset
df_original = spark.table("workspace.default.electricity_and_weather_europe")
original_rows = df_original.count()
original_col_count = len(df_original.columns)
print(f"\n1. Original dataset loaded: {original_rows:,} rows, {original_col_count} columns")

# Step 2: Load forecast data
CATALOG = "curlybyte_solutions_rawdata_europe_grid_load"
GRID_SCHEMA = "european_grid_raw__v2"
load_forecast = spark.table(f"{CATALOG}.{GRID_SCHEMA}.load_forecast")

# Select only needed columns
load_forecast_clean = load_forecast.select(
    F.col('index'),
    F.col('country'),
    F.col('Forecasted_Load')
)

print(f"\n2. Forecast data loaded: {load_forecast_clean.count():,} rows")

# Step 3: Join - ONLY add Forecasted_Load
df_clean = df_original.join(
    load_forecast_clean,
    on=['index', 'country'],
    how='left'
)

# Every count() on df_clean re-executes the join, so run it once and reuse it.
clean_rows = df_clean.count()
clean_col_count = len(df_clean.columns)

print(f"\n3. Joined dataset:")
print(f"   Rows: {clean_rows:,}")
print(f"   Columns: {clean_col_count} (should be {original_col_count + 1})")

# A left join keeps every left row; it can only grow when a key matches more
# than one forecast row, which would duplicate load records in the saved table.
if clean_rows != original_rows:
    print(f"   WARNING: row count changed from {original_rows:,} to {clean_rows:,} - "
          "load_forecast has duplicate (index, country) keys")

# Check match rate (a null Forecasted_Load is treated as "no match")
forecast_nulls = df_clean.filter(F.col('Forecasted_Load').isNull()).count()
unmatched_pct = (forecast_nulls / clean_rows * 100) if clean_rows else 0.0
print(f"   Matched rows: {clean_rows - forecast_nulls:,}")
print(f"   Unmatched rows: {forecast_nulls:,} ({unmatched_pct:.2f}%)")

# Show columns
print(f"\n4. Columns in clean dataset:")
print(df_clean.columns)

# Step 4: Save clean version (mode="overwrite" replaces any existing table)
print(f"\n5. Saving clean dataset...")
table_name_clean = "workspace.default.electricity_weather_with_forecast"
df_clean.write.format("delta").mode("overwrite").saveAsTable(table_name_clean)

print(f"\n✅ Saved to: {table_name_clean}")
print(f"   Rows: {clean_rows:,}")
print(f"   Columns: {clean_col_count}")

# Verify by reading the table back
print(f"\n6. Verification:")
test_df = spark.table(table_name_clean)
print(f"   Loaded: {test_df.count():,} rows")
display(test_df.limit(5))

print("\n" + "="*60)
print("CLEAN DATASET READY!")
print("="*60)
# Column counts are taken from the frames above rather than hard-coded, so the
# message stays correct if the source table gains or loses columns.
print(f"""
Your colleagues can load it with:
df = spark.table('{table_name_clean}')

This dataset contains:
✅ All original {original_col_count} columns from electricity_and_weather_europe
✅ ONLY 1 added column: Forecasted_Load
❌ NO engineered features
❌ NO target variables

Total: {clean_col_count} columns ({original_col_count} original + 1 forecast)
""")