In [0]:
# Load the training table and print its schema (column names and types).
TRAIN_TABLE = "workspace.default.train_set"
train_df = spark.table(TRAIN_TABLE)
train_df.printSchema()

In [0]:
# Preview the first five rows of the training table.
preview_rows = train_df.limit(5)
display(preview_rows)

In [0]:
# Report the table's dimensions; count() runs a Spark job over the table.
row_total = train_df.count()
column_total = len(train_df.columns)
print("Rows:", row_total)
print("Columns:", column_total)


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window


In [0]:
# Row count plus the smallest and largest "index" value, in one aggregation.
span_exprs = [
    F.count("*").alias("n_rows"),
    F.min("index").alias("start_time"),
    F.max("index").alias("end_time"),
]
train_df.select(*span_exprs).show(truncate=False)

# Distinct countries in alphabetical order (at most 100 rows printed).
distinct_countries = train_df.select("country").distinct().orderBy("country")
distinct_countries.show(100, truncate=False)


In [0]:
# Count NULLs per column: F.when() yields a value only for NULL rows, and
# F.count() tallies non-null values, so each aggregate is that column's
# number of missing entries.
null_count_exprs = []
for col_name in train_df.columns:
    is_missing = F.col(col_name).isNull()
    null_count_exprs.append(F.count(F.when(is_missing, col_name)).alias(col_name))

null_counts = train_df.select(null_count_exprs)

display(null_counts)


In [0]:
# Unpivot the one-row null_counts frame into (column_name, null_count) rows
# with Spark SQL's stack(); column references are backtick-quoted.
column_names = train_df.columns
stack_args = ", ".join(f"'{name}', `{name}`" for name in column_names)
stack_sql = f"stack({len(column_names)}, {stack_args}) as (column_name, null_count)"

nulls_long = null_counts.select(F.expr(stack_sql)).orderBy(F.desc("null_count"))

display(nulls_long)

In [0]:
# Missing-value report: per-column NULL count plus its share of all rows.

# Step 1: tally NULLs per column in a single aggregation.
null_counts = train_df.select(
    [
        F.count(F.when(F.col(name).isNull(), name)).alias(name)
        for name in train_df.columns
    ]
)

# Step 2: unpivot the one-row result into (column_name, null_count) rows.
stack_pairs = ", ".join(f"'{name}', `{name}`" for name in train_df.columns)
stack_sql = (
    f"stack({len(train_df.columns)}, {stack_pairs}) as (column_name, null_count)"
)
nulls_long = null_counts.select(F.expr(stack_sql))

# Step 3: express each count as a percentage of all rows, rounded to two
# decimals and suffixed with a percent sign (the column becomes a string).
total_rows = train_df.count()
pct_value = F.round((F.col("null_count") / total_rows) * 100, 2)

nulls_pct_with_sign = (
    nulls_long
    .withColumn("percent_missing", F.concat(pct_value, F.lit("%")))
    .orderBy(F.desc("null_count"))
)

# Step 4: show the report, columns with the most NULLs first.
display(nulls_pct_with_sign)


In [0]:
# Minimum, maximum and mean of the target column over the whole table.
target_col = "grid_stress_score"

train_df.select(
    F.min(target_col),
    F.max(target_col),
    F.avg(target_col).alias("avg_grid_stress"),
).show()

In [0]:
# Per-country summary of the target column, highest average first.
country_stat_exprs = [
    F.count("*").alias("n_rows"),
    F.avg("grid_stress_score").alias("avg_grid_stress"),
    F.min("grid_stress_score").alias("min_grid_stress"),
    F.max("grid_stress_score").alias("max_grid_stress"),
]

risk_by_country = (
    train_df
    .groupBy("country")
    .agg(*country_stat_exprs)
    .orderBy(F.desc("avg_grid_stress"))
)

display(risk_by_country)


In [0]:
import pandas as pd
import matplotlib.pyplot as plt


# Pull a 10% sample (no replacement, fixed seed) of the target into pandas.
pdf = (
    train_df
    .select("grid_stress_score")
    .sample(withReplacement=False, fraction=0.1, seed=42)
    .toPandas()
)

# 30-bin histogram of the sampled target values.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(pdf["grid_stress_score"], bins=30)
ax.set_title("Distribution of Grid Stress Score")
ax.set_xlabel("Grid Stress Score")
ax.set_ylabel("Count")
plt.show()

In [0]:
# Pairwise correlation between a handful of hand-picked features and the
# target (grid_stress_score):
#   Actual_Load, net_imports, mean_wind_speed,
#   mean_temperature_c, reserve_margin_ml
cols_to_check = [
    "Actual_Load",
    "net_imports",
    "mean_wind_speed",
    "mean_temperature_c",
    "reserve_margin_ml",
    "grid_stress_score",
]

# Work on a 5% sample (fixed seed) so the pandas frame stays small.
sample_pd = (
    train_df
    .select(cols_to_check)
    .sample(withReplacement=False, fraction=0.05, seed=42)
    .toPandas()
)

# Last expression of the cell: the correlation matrix is rendered as output.
sample_pd.corr()


In [0]:
# First five rows, ordered by "index", of load and stress for one country.
country = "AT"

is_selected_country = F.col("country") == country

country_ts = (
    train_df
    .where(is_selected_country)
    .orderBy("index")
    .select("index", "Actual_Load", "grid_stress_score")
    .limit(5)
)

display(country_ts)


In [0]:
# ==========================================
#        FULL GRID STRESS EDA SUITE
# ==========================================

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Apply matplotlib's "ggplot" style sheet; it affects figures created after this call.
plt.style.use("ggplot")

# Helper: sample Spark to pandas
def to_pd(df, frac=0.05, seed=42):
    """Return a random sample of a Spark DataFrame as a pandas DataFrame.

    Args:
        df: Object exposing Spark's ``sample(...)`` and ``toPandas()`` methods.
        frac: Fraction of rows to sample, without replacement.
        seed: Sampling seed. The default (42) is the value that used to be
            hard-coded, so existing calls behave exactly as before.

    Returns:
        Whatever ``toPandas()`` returns for the sampled frame.
    """
    sampled = df.sample(withReplacement=False, fraction=frac, seed=seed)
    return sampled.toPandas()


# ==========================================
# 1. Distribution of Grid Stress Score
# ==========================================
# Sample the target column into pandas using the helper's default fraction.
pdf = to_pd(train_df.select("grid_stress_score"))

# 30-bin histogram of the sampled stress scores.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(pdf["grid_stress_score"], bins=30, color="steelblue")
ax.set_title("Distribution of Grid Stress Score")
ax.set_xlabel("Stress Score")
ax.set_ylabel("Count")
plt.show()


# ==========================================
# 2. Time Series (Load & Stress) for One Country
# ==========================================
country = "AT"

# First 3000 rows (ordered by "index") of load and stress for one country.
pdf2 = (
    train_df
    .filter(F.col("country") == country)
    .orderBy("index")
    .select("index", "Actual_Load", "grid_stress_score")
    .limit(3000)
    .toPandas()
)

# Parse "index" as datetimes so matplotlib draws a proper time axis.
pdf2["index"] = pd.to_datetime(pdf2["index"])

# Load and stress score are different quantities, so each series gets its
# own y-axis; on a shared axis the series with the smaller range is
# flattened into an unreadable line.
fig, ax_load = plt.subplots(figsize=(14, 4))
ax_stress = ax_load.twinx()

# Explicit colours: both axes would otherwise start at the same cycle colour.
(load_line,) = ax_load.plot(
    pdf2["index"], pdf2["Actual_Load"], color="tab:blue", label="Actual Load"
)
(stress_line,) = ax_stress.plot(
    pdf2["index"], pdf2["grid_stress_score"],
    color="tab:red", alpha=0.7, label="Grid Stress",
)

# \u2013 is an en dash; the escape keeps the title immune to the file-encoding
# round trip that had turned it into mojibake.
ax_load.set_title(f"Load & Grid Stress Over Time \u2013 {country}")
ax_load.set_xlabel("Time")
ax_load.set_ylabel("Actual Load")
ax_stress.set_ylabel("Grid Stress Score")
ax_stress.grid(False)  # avoid a second, misaligned grid from the twin axis

# One legend covering the lines of both axes.
ax_load.legend(handles=[load_line, stress_line])
plt.show()


# ==========================================
# 3. Scatter Plots (relationships)
# ==========================================
pdf3 = to_pd(train_df.select("Actual_Load", "grid_stress_score", "mean_wind_speed"))

# One entry per scatter plot: (x column, x-axis label, title, extra scatter kwargs).
scatter_specs = [
    ("Actual_Load", "Actual Load", "Load vs Grid Stress", {}),
    ("mean_wind_speed", "Mean Wind Speed", "Wind Speed vs Grid Stress", {"color": "green"}),
]

# Each feature is plotted against the stress score in its own figure.
for x_col, x_label, plot_title, extra_kwargs in scatter_specs:
    plt.figure(figsize=(6, 4))
    plt.scatter(pdf3[x_col], pdf3["grid_stress_score"], alpha=0.3, **extra_kwargs)
    plt.xlabel(x_label)
    plt.ylabel("Grid Stress Score")
    plt.title(plot_title)
    plt.show()


# ==========================================
# 4. Correlation Heatmap (top numerical features)
# ==========================================
# Numerical columns included in the correlation matrix; the target is last.
cols = [
    "Actual_Load",
    "net_imports",
    "mean_wind_speed",
    "mean_ssrd",
    "mean_temperature_c",
    "reserve_margin_ml",
    "forecast_load_error",
    "load_rel_error",
    "P10_net",
    "P90_net",
    "grid_stress_score",
]

# Correlations are computed in pandas on a sample of the selected columns.
pdf_corr = to_pd(train_df.select(cols))
corr_matrix = pdf_corr.corr()

# Annotated heatmap with coefficients shown to two decimals.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


# ==========================================
# 5. Country Comparison of Stress
# ==========================================
# Mean stress score per country, computed in Spark and sorted highest first.
avg_stress_by_country = train_df.groupBy("country").agg(
    F.avg("grid_stress_score").alias("avg_stress")
)
country_risk = avg_stress_by_country.orderBy(F.desc("avg_stress")).toPandas()

# Bar chart with one bar per country.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(country_risk["country"], country_risk["avg_stress"], color="darkred")
ax.set_title("Average Grid Stress by Country")
ax.set_xlabel("Country")
ax.set_ylabel("Average Stress Score")
plt.xticks(rotation=45)  # tilt the country labels
plt.show()

# ==========================================
# \U0001F389 is the party-popper emoji; the escape avoids the encoding round
# trip that had turned the literal into mojibake.
print("\U0001F389 EDA suite complete!")


In [0]:
from pyspark.sql import functions as F

df = spark.table("workspace.default.train_set_timebins_lags")

# Generation columns are identified by the "__Actual_" marker in their name.
# A bare "Actual" substring test would also pick up any other column that
# merely contains the word (e.g. an "Actual_Load"-style column, if present).
gen_cols = [c for c in df.columns if "__Actual_" in c]

print("Number of generation-related columns:", len(gen_cols))

# agg() rejects an empty expression list, so fail early with a clear message.
if not gen_cols:
    raise ValueError(
        "No column name contains '__Actual_' in "
        "workspace.default.train_set_timebins_lags"
    )

# Percentage of NULL values per country for every generation column:
# (number of NULL rows / number of rows in the group) * 100.
missing_per_country = (
    df.groupBy("country")
      .agg(*[
            (F.sum(F.col(c).isNull().cast("int")) / F.count("*") * 100)
            .alias(f"{c}_missing_pct")
            for c in gen_cols
      ])
)

# One row per country, so the result is small enough to collect into pandas.
missing_per_country_df = missing_per_country.toPandas()

missing_per_country_df.head()


In [0]:
# Row-wise mean over every "*_missing_pct" column gives one average
# missing percentage per country.
pct_columns = [
    name for name in missing_per_country_df.columns
    if name.endswith("_missing_pct")
]
missing_per_country_df["avg_missing_pct"] = (
    missing_per_country_df[pct_columns].mean(axis=1)
)

# Rank countries from best (least missing) to worst (most missing).
country_avg = missing_per_country_df[["country", "avg_missing_pct"]]
ranking = country_avg.sort_values("avg_missing_pct")

ranking


In [0]:
# Colour the ranking with the reversed red-yellow-green colormap; the
# styled table is the cell's output.
(
    ranking
    .style
    .background_gradient(cmap="RdYlGn_r")
)
