In [0]:
# Read the table "workspace.default.train_set_imputed" through the `spark`
# object. `spark` is not created anywhere in this notebook — presumably it is
# the session injected by the notebook runtime; confirm before running elsewhere.
train_set = spark.table("workspace.default.train_set_imputed")

In [0]:
# Materialise the table as a pandas DataFrame for local analysis and plotting.
df_train = train_set.toPandas()

# A notebook cell only echoes its *last* expression, so intermediate results
# have to be displayed explicitly or they are silently discarded.
# `display` is provided by the notebook runtime (IPython / Databricks).
display(df_train.head())
display(df_train.describe())

# info() prints its report directly to stdout and returns None.
df_train.info()

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Countries to plot, one subplot per country. A single-element list such as
# ['ES'] also works: the lone Axes is wrapped in a list further down.
selected = ['ES', 'FR', 'DE', 'PT', 'IT']

# Keep only the selected countries; .copy() so the derived columns below are
# added to an independent frame instead of a slice of df_train.
df_daily = df_train.loc[df_train['country'].isin(selected)].copy()

# Calendar day and calendar month of every observation.
df_daily['date'] = pd.to_datetime(df_daily['index']).dt.date
df_daily['month'] = pd.to_datetime(df_daily['date']).dt.to_period('M')

# Mean score per (day, country).
df_daily_mean = (
    df_daily
    .groupby(['date', 'country'])['grid_stress_score']
    .mean()
    .reset_index()
)

# Mean score per (month, country); month_start gives each period a timestamp
# (its first instant) so it can share the time axis with the daily series.
df_monthly_mean = (
    df_daily
    .groupby(['month', 'country'])['grid_stress_score']
    .mean()
    .reset_index()
    .assign(month_start=lambda frame: frame['month'].dt.to_timestamp())
)

n_countries = len(selected)
fig, axes = plt.subplots(
    nrows=n_countries,
    ncols=1,
    figsize=(15, 5 * n_countries),
    sharex=True,
)

# For a 1x1 grid plt.subplots returns a bare Axes rather than an array;
# wrap it so the loop below can always iterate.
axes = [axes] if n_countries == 1 else axes

for ax, country in zip(axes, selected):
    data_daily = df_daily_mean.loc[df_daily_mean['country'] == country]
    data_monthly = df_monthly_mean.loc[df_monthly_mean['country'] == country]
    sns.lineplot(
        data=data_daily, x='date', y='grid_stress_score',
        label='Daily Mean', ax=ax,
    )
    sns.lineplot(
        data=data_monthly, x='month_start', y='grid_stress_score',
        label='Monthly Mean', color='red', ax=ax,
    )
    ax.set_title(f'Grid Stress Score (Daily & Monthly Mean) - {country}')
    ax.legend()

fig.tight_layout()
fig.savefig('../images/grid_stress_score_vs_time.png')
plt.show()

In [0]:
import numpy as np

# Stress bands on grid_stress_score. The bands are half-open so every
# non-missing score falls into exactly one of them, including the boundary
# values 33 and 66:
#   LOW:    score < 33
#   MEDIUM: 33 <= score < 66
#   HIGH:   score >= 66
# Missing (NaN) scores are excluded from every band.
LOW_UPPER_BOUND = 33
MEDIUM_UPPER_BOUND = 66

# One (country, n_low, n_medium, n_high) tuple per selected country.
results = []

for c in selected:
    # Scores for this country; an absent country yields an empty Series and
    # therefore three zero counts.
    scores = df_daily.loc[df_daily['country'] == c, 'grid_stress_score'].dropna()

    # Vectorised comparisons instead of a Python loop over every row.
    low_count = int((scores < LOW_UPPER_BOUND).sum())
    medium_count = int(
        ((scores >= LOW_UPPER_BOUND) & (scores < MEDIUM_UPPER_BOUND)).sum()
    )
    high_count = int((scores >= MEDIUM_UPPER_BOUND).sum())

    # Save counts for final grouped plot
    results.append((c, low_count, medium_count, high_count))

labels = [r[0] for r in results]  # countries
low_vals = [r[1] for r in results]
med_vals = [r[2] for r in results]
high_vals = [r[3] for r in results]

x = np.arange(len(labels))  # positions for countries
width = 0.25  # bar width

plt.figure(figsize=(16, 6))

# Three bars per country, side by side around the country's tick position.
plt.bar(x - width, low_vals, width, label='Low stress', color='tab:blue', alpha=0.6)
plt.bar(x,         med_vals, width, label='Medium stress', color='tab:green', alpha=0.6)
plt.bar(x + width, high_vals, width, label='High stress', color='tab:red', alpha=0.6)

plt.xticks(x, labels, rotation=45, size=14)
plt.ylabel("Count", size=14)
plt.title("Grid Stress Score Categories per Country", size=16)
# Log scale keeps small bands visible next to much larger ones.
plt.yscale("log")
plt.legend(fontsize=13)

plt.tight_layout()
plt.savefig("../images/grid_stress_score_all_countries.png")
plt.show()

In [0]:
import seaborn as sns

# Histogram of grid_stress_score over every row of df_train (all countries),
# using 20 bins. histplot returns the Axes it drew on, so the labels and the
# saved file are set through that object.
hist_ax = sns.histplot(df_train['grid_stress_score'], bins=20)
hist_ax.set_title("Grid Stress Score Distribution", size=16)
hist_ax.set_xlabel("Grid Stress Score", size=14)
hist_ax.set_ylabel("Count", size=14)
hist_ax.figure.savefig("../images/grid_stress_score_distribution.png")
plt.show()