In [None]:
import os
from pathlib import Path

import duckdb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# For plotting
from sklearn.decomposition import PCA


The dataset loaded below is Eurostat's simplified energy balances table (`nrg_bal_c`).

In [None]:
# Directory holding the Eurostat downloads. Set the EUROSTAT_DATA_DIR environment
# variable to run this notebook on another machine; the default is the original location.
DATA_DIR = Path(os.environ.get('EUROSTAT_DATA_DIR', '/Users/couch/Eurostat_data'))

# The file is tab-separated; the first column still packs several comma-separated
# codes together and is split apart in the cleaning cell.
dataset = pd.read_csv(DATA_DIR / 'nrg_bal' / 'estat_nrg_bal_c.tsv', sep='\t')
dataset.head()

In [None]:
# Inspect the raw column labels before any cleaning is applied.
dataset.columns

In [None]:
# data cleaning
# data cleaning
# The first TSV column packs five comma-separated codes under a header that contains
# a literal backslash. A raw string keeps that backslash without relying on the
# invalid "\T" escape sequence (which emits a warning on current Python versions).
key_column = r'freq,nrg_bal,siec,unit,geo\TIME_PERIOD'
dimension_columns = ['freq', 'nrg_bal', 'siec', 'unit', 'geo']

# Guarded so that re-running this cell on an already-cleaned frame is a no-op
# instead of a KeyError on the dropped column.
if key_column in dataset.columns:
    dataset[dimension_columns] = dataset[key_column].str.split(',', expand=True)
    dataset = dataset.drop(columns=[key_column])

# Header labels can carry surrounding whitespace; strip it so years read '1990', not '1990 '.
dataset.columns = dataset.columns.str.strip()

# Cells that are exactly ': ' become 0 and cells that are exactly ': m' become NaN.
# NOTE(review): ': ' looks like a "not available" marker; mapping it to 0 makes
# missing data indistinguishable from a true zero downstream - confirm this is intended.
dataset = dataset.replace({': ': 0, ': m': np.nan})

# Convert every year column that is actually present (all-digit labels) rather than
# a hardcoded 1990-2022 list, so newer years are not left behind as text.
# Anything that is not a plain number is coerced to NaN.
year_columns = [col for col in dataset.columns if str(col).isdigit()]
dataset[year_columns] = dataset[year_columns].apply(pd.to_numeric, errors='coerce')
dataset.columns

In [None]:
# Preview the first rows of the frame after the cleaning cell.
dataset.head()

In [None]:
# Summary statistics (count, mean, spread, quantiles) for the numeric columns.
dataset.describe()

In [None]:
# Distinct codes found in the 'siec' column (filtered on 'E7000' further down).
dataset['siec'].unique()

In [None]:
# Distinct codes found in the 'geo' column.
# NOTE(review): may include aggregate regions as well as individual countries - verify.
dataset['geo'].unique()

In [None]:
# Distinct codes found in the 'freq' column.
dataset['freq'].unique()

In [None]:
# Distinct codes found in the 'nrg_bal' column (filtered on 'TO_EHG' further down).
dataset['nrg_bal'].unique()

In [None]:
# NOTE(review): repeats the earlier preview; the cells in between only inspect
# `dataset` and do not modify it, so this output is unchanged.
dataset.head()

In [None]:
# (rows, columns) of the frame.
dataset.shape

In [None]:
# Keep only the rows whose 'siec' code is E7000 (presumably electricity, going by
# the variable name), then report how many rows remain and which 'nrg_bal' codes they carry.
elec_only = dataset.loc[dataset['siec'].eq('E7000')]
print(len(elec_only))
print(elec_only['nrg_bal'].unique())

In [None]:
# Narrow the E7000 rows down to the 'TO_EHG' balance item and display the result.
elec_total_output = elec_only.loc[elec_only['nrg_bal'].eq('TO_EHG')]
elec_total_output

In [None]:
# Column labels available on the filtered frame (used to pick the year columns to melt).
elec_total_output.columns

In [None]:
# Experimental plots
year_cols = [str(year) for year in range(1990, 2024)]  # or whatever range is available

long_df = elec_total_output.melt(
    id_vars=['geo', 'unit'],  # optionally keep 'siec' or 'nrg_bal' if you need
    value_vars=year_cols,
    var_name='year',
    value_name='value'
)

plt.figure(figsize=(12, 6))
sns.lineplot(data=long_df, x='year', y='value', hue='geo')
plt.title('Electricity Usage Over Time by Country')
plt.ylabel(f'Value ({long_df["unit"].unique()[0]})')
plt.gca().invert_yaxis()
plt.xlabel('Year')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Note: This dataset was only for the EU. Since Brexit, the UK is no longer part of the EU, it has values of 0 after
# a certain point.
UK_only = dataset[(dataset['geo']=='UK') & (dataset['siec']=='E7000')]
UK_only_long = UK_only.melt(
    id_vars=['geo', 'unit'],
    value_vars=year_cols,
    var_name='year',
    value_name='value'
)

plt.figure(figsize=(12, 6))
sns.lineplot(data=UK_only_long, x='year', y='value')
plt.title('Electricity Usage Over Time in the UK')
plt.ylabel(f'Value ({UK_only_long["unit"].unique()[0]})')
plt.xlabel('Year')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
BE_only = dataset[(dataset['geo']=='BE') & (dataset['siec']=='E7000')]
BE_only_long = BE_only.melt(
    id_vars=['geo', 'unit'],
    value_vars=year_cols,
    var_name='year',
    value_name='value'
)
BE_only_long

In [None]:
BE_only = dataset[(dataset['geo']=='BE') & (dataset['siec']=='E7000')]
BE_only_long = BE_only.melt(
    id_vars=['geo', 'unit'],
    value_vars=year_cols,
    var_name='year',
    value_name='value'
)

plt.figure(figsize=(12, 6))
sns.lineplot(data=BE_only_long, x='year', y='value')
plt.title('Electricity Usage Over Time in Belgium')
plt.ylabel(f'Value ({BE_only_long["unit"].unique()[0]})')
plt.xlabel('Year')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
sweden_only = dataset[(dataset['geo']=='SE') & (dataset['siec']=='E7000')]
sweden_only_long = sweden_only.melt(
    id_vars=['geo', 'unit'],
    value_vars=year_cols,
    var_name='year',
    value_name='value'
)
plt.figure(figsize=(12, 6))
sns.lineplot(data=sweden_only_long, x='year', y='value')
plt.title('Electricity Usage Over Time in Sweden')
plt.ylabel(f'Value ({sweden_only_long["unit"].unique()[0]})')
plt.xlabel('Year')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


In [None]:
# Supposed to be countries by change in percentage of electricity usage per year but kind of mid graph tbh
# Year-over-year percentage change per country. The chart is hard to read because
# series with small base values produce very large swings.
long_df['value'] = pd.to_numeric(long_df['value'], errors='coerce')

# Group by unit as well as geo so a change is never computed between rows of
# different units. A percentage change does not depend on the unit, so any
# duplicate series for one country coincide.
long_df = long_df.sort_values(['geo', 'unit', 'year'])
long_df['pct_change'] = (
    long_df.groupby(['geo', 'unit'])['value']
    # fill_method=None: do not forward-fill gaps (the implicit padding is deprecated
    # and would report a spurious 0% change across missing years).
    .pct_change(fill_method=None)
    .mul(100)
    # A change measured from a base of 0 is +/-inf; drop it so it cannot blow up the y scale.
    .replace([np.inf, -np.inf], np.nan)
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=long_df, x='year', y='pct_change', hue='geo', ax=ax)
ax.set_title('Percentage Change in Electricity Usage Over Time by Country')
ax.set_ylabel('Percentage Change (%)')
ax.set_xlabel('Year')
ax.tick_params(axis='x', rotation=90)
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [None]:
# Build a country-by-year matrix of electricity values and cluster the countries with KMeans.
# StandardScaler and KMeans are imported in the notebook's top import cell.

# Work on one unit only, so the per-year total is not a sum of the same quantity
# expressed in several units. Values are coerced to numeric in case the melted
# column is still text.
cluster_input = long_df.assign(value=pd.to_numeric(long_df['value'], errors='coerce'))
if cluster_input['unit'].nunique() > 1:
    cluster_input = cluster_input[cluster_input['unit'] == cluster_input['unit'].iloc[0]]

# First, aggregate total electricity usage per country-year
agg_df = cluster_input.groupby(['geo', 'year'], as_index=False)['value'].sum()

# Pivot: countries as rows, years as columns
country_year_matrix = agg_df.pivot(index='geo', columns='year', values='value')

# Country-year combinations absent from the data become 0.
# NOTE(review): 0 is then treated as a real observation by the scaler - consider interpolation.
country_year_matrix = country_year_matrix.fillna(0)

# Standardise each year column (zero mean, unit variance across countries).
scaler = StandardScaler()
country_year_scaled = scaler.fit_transform(country_year_matrix)

# Keep row labels (country names)
country_year_scaled_df = pd.DataFrame(country_year_scaled, index=country_year_matrix.index, columns=country_year_matrix.columns)

# Choose number of clusters (e.g., 3-5 is a good start).
# n_init is pinned because its default differs between scikit-learn releases;
# together with random_state this keeps the labels reproducible across versions.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
country_clusters = kmeans.fit_predict(country_year_scaled)

# Add cluster labels to a new column
country_year_matrix['cluster'] = country_clusters


In [None]:
# Performing PCA to cluster countries based on their electricity usage patterns
pca = PCA(n_components=2)
coords = pca.fit_transform(country_year_scaled)

plt.figure(figsize=(8, 6))
sns.scatterplot(x=coords[:, 0], y=coords[:, 1], hue=country_clusters, palette='Set2')
plt.title('Country Clusters Based on Electricity Usage Patterns')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend(title='Cluster')
plt.tight_layout()
plt.show()


In [None]:
# PCA but labelled
pca = PCA(n_components=2)
coords = pca.fit_transform(country_year_scaled)

# Turn into DataFrame with country names and cluster labels
pca_df = pd.DataFrame({
    'PC1': coords[:, 0],
    'PC2': coords[:, 1],
    'country': country_year_matrix.index,
    'cluster': country_clusters
})

plt.figure(figsize=(10, 8))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='cluster', palette='Set2', s=100)

for _, row in pca_df.iterrows():
    plt.text(
        row['PC1'] + 0.1,
        row['PC2'],
        row['country'],
        fontsize=9,
        rotation=45  
    )

plt.title('PCA of Countries by Electricity Usage (Labeled)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Cluster')
plt.tight_layout()
plt.show()


In [None]:
# List of countries by cluster
cluster_groups = pca_df.groupby('cluster')['country'].apply(list)

print(cluster_groups)
