<a href="https://colab.research.google.com/github/connor-waldron3/Electricity-Mix/blob/main/Energy_Mix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Global Electricity Energy Sources

Tracking the energy sources used for electricity generation globally.

Data Source: **[Our World in Data](https://ourworldindata.org/energy)** (*Last accessed Aug 7th, 2024 03:16*)

> Ember (2024) – with major processing by Our World in Data. “Electricity generation from bioenergy – Ember and Energy Institute” [dataset]. Ember, “Yearly Electricity Data” [original data].





In [None]:

import pandas as pd
import numpy as np

# Read the two CSV exports; both paths are relative to the notebook's working directory.
electricity_df = pd.read_csv(filepath_or_buffer='electricity-prod-source-stacked.csv')
population_df = pd.read_csv(filepath_or_buffer='population-and-demography.csv')

In [None]:
# Keep only the join keys and the population estimate, and give the latter a short name.
population_column = 'Population - Sex: all - Age: all - Variant: estimates'
population_df_subset = (
    population_df
    .loc[:, ['Code', 'Year', population_column]]
    .rename(columns={population_column: 'Population'})
)

# Distinct values of the population export's 'Entity' column.
unique_countries = population_df['Entity'].unique()

# Inner join: a row survives only when both sources contain the same Code/Year pair.
df = electricity_df.merge(population_df_subset, on=['Code', 'Year'], how='inner')

**Electricity Data Cleaning & Selection**
*   Rename columns
*   Elimination of non-country entities from Dataset
*   Select a timeframe where all countries have valid values (2000 - 2023)
*   Replace NaN values in the Dataset with 0  

In [None]:
# Aggregate entities (regions, organisations, income groups) found in the 'Entity'
# column. They are excluded so that every remaining row describes a single country.
non_countries = [
    'ASEAN (Ember)', 'Africa', 'Africa (EI)', 'Africa (Ember)', 'Asia', 'Asia (Ember)', 'Asia Pacific (EI)', 'CIS (EI)', 'Europe', 'Europe (EI)',
    'Europe (Ember)', 'European Union (27)', 'G20 (Ember)', 'G7 (Ember)', 'Latin America and Caribbean (Ember)', 'Low-income countries',
    'Lower-middle-income countries', 'Middle East (EI)', 'Middle East (Ember)', 'Non-OECD (EI)', 'North America', 'North America (EI)',
    'North America (Ember)', 'OECD (EI)', 'OECD (Ember)', 'Oceania', 'Oceania (Ember)', 'South America', 'South and Central America (EI)',
    'Upper-middle-income countries', 'World', 'High-income countries'
    ]

# Keep country rows from the year 2000 onwards whose population is at least 5 million.
# NOTE(review): the population threshold is evaluated per row (i.e. per year), so a
# country that crosses 5 million after 2000 enters the data in a later year - confirm
# this is intended, because later cells label each country's first year as "2000".
is_country = ~df['Entity'].isin(non_countries)
data = df[is_country & (df['Year'] >= 2000) & (df['Population'] >= 5_000_000)]

simple_column_names = {
    'Entity': 'Country'
    , 'Other renewables excluding bioenergy - TWh (adapted for visualization of chart electricity-prod-source-stacked)':  'Other Renewables'
    , 'Electricity from bioenergy - TWh (adapted for visualization of chart electricity-prod-source-stacked)':            'Bioenergy'
    , 'Electricity from solar - TWh (adapted for visualization of chart electricity-prod-source-stacked)':                'Solar'
    , 'Electricity from wind - TWh (adapted for visualization of chart electricity-prod-source-stacked)':                 'Wind'
    , 'Electricity from hydro - TWh (adapted for visualization of chart electricity-prod-source-stacked)':                'Hydro'
    , 'Electricity from nuclear - TWh (adapted for visualization of chart electricity-prod-source-stacked)':              'Nuclear'
    , 'Electricity from oil - TWh (adapted for visualization of chart electricity-prod-source-stacked)':                  'Oil'
    , 'Electricity from gas - TWh (adapted for visualization of chart electricity-prod-source-stacked)':                  'Gas'
    , 'Electricity from coal - TWh (adapted for visualization of chart electricity-prod-source-stacked)':                 'Coal'
}

# One pass instead of the previous rename -> fillna -> copy -> drop -> fillna sequence:
# shorten the column names, drop the 'Code' join key, replace missing values with 0
# and order the rows by country and year with a fresh 0..n-1 index.
country_data = (
    data
    .rename(columns=simple_column_names)
    .drop(columns='Code')
    .fillna(0)
    .sort_values(by=['Country', 'Year'])
    .reset_index(drop=True)
)

**Data Preparation**

*   Grouping of different energy sources into Energy Type Groups.

*   Calculation of aggregation columns and definition of Energy Type Groups (Renewables, Fossils and Nuclear).



In [None]:
# Source columns grouped by energy type; `energy_columns` is the full set.
fossils = ['Oil', 'Coal', 'Gas']
renewables = ['Solar', 'Hydro', 'Wind', 'Bioenergy', 'Other Renewables']
energy_columns = ['Nuclear', 'Coal', 'Oil', 'Solar', 'Gas', 'Hydro', 'Wind', 'Bioenergy', 'Other Renewables']


def _share_of_total(group_column):
    """Build a callable returning `group_column` as a percentage of 'Total Energy' (2 decimals)."""
    return lambda frame: (frame[group_column] / frame['Total Energy'] * 100).round(2)


# `assign` evaluates the entries in order, so the share callables can read the
# group totals created by the entries before them.
derived_columns = {
    'Fossil Fuel Energy': country_data[fossils].sum(axis=1),
    'Renewable Energy': country_data[renewables].sum(axis=1),
    'Nuclear Energy': country_data['Nuclear'],
    'Total Energy': country_data[energy_columns].sum(axis=1),
    'Fossil Fuel Share': _share_of_total('Fossil Fuel Energy'),
    'Renewable Share': _share_of_total('Renewable Energy'),
    'Nuclear Share': _share_of_total('Nuclear Energy'),
}
country_data = country_data.assign(**derived_columns)

# Print the dtype of 'Total Energy' for the first and the last year of the timeframe.
for year in (2000, 2023):
    print(country_data.loc[country_data['Year'] == year, 'Total Energy'].dtype)


**Calculation of national energy per capita in kWh/yr**


> Electric energy per capita [ in watt-hour ] = Total population electricity consumption [in kW·h/yr] × 1,000 /population.


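Source: [Wikipedia](https://en.wikipedia.org/wiki/List_of_countries_by_electricity_consumption)

In this notebook total generation is stored in TWh, so it is multiplied by 1,000,000,000 (1 TWh = 10⁹ kWh) and divided by the population to obtain kWh per person.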



In [None]:
# 'Total Energy' is a sum of TWh columns; the factor 1e9 converts it to kWh
# before dividing by the head count.
total_kwh = country_data['Total Energy'].mul(1_000_000_000)
country_data['Energy Per Capita'] = total_kwh.div(country_data['Population']).round(3)

# Discard rows whose per-capita value is exactly 0.
country_data = country_data.loc[country_data['Energy Per Capita'].ne(0)]

## Cluster Analysis Data Preparation

To simplify the cluster analysis, only the first and last years are considered for evaluation due to largely consistent development in many countries.

In [None]:
# First and last year with data, per country.
year_by_country = country_data.groupby('Country')['Year']
min_year = year_by_country.min().reset_index()
max_year = year_by_country.max().reset_index()

# Keep only the rows belonging to those two years.
min_data = country_data.merge(min_year, on=['Country', 'Year'], how='inner')
max_data = country_data.merge(max_year, on=['Country', 'Year'], how='inner')

# One row per country: first-year columns get the suffix ' 2000', last-year columns ' 2023'.
# NOTE(review): the suffixes are fixed labels; a country's actual first/last year is
# whatever min()/max() returned above and may differ from 2000/2023.
filtered_country_data = min_data.merge(max_data, on='Country', suffixes=(' 2000', ' 2023'))


def _relative_change(frame, column):
    """Relative change of `column` between the two snapshots, rounded to 2 decimals, times 100."""
    first = frame[f'{column} 2000']
    last = frame[f'{column} 2023']
    # NOTE(review): rounding happens before the multiplication, so the result is a
    # whole-number percentage (possibly with float noise) - confirm this is intended.
    return ((last - first) / first).round(2) * 100


for measure in ('Total Energy', 'Energy Per Capita'):
    filtered_country_data[f'{measure} Increase'] = _relative_change(filtered_country_data, measure)

# Shares are already percentages, so their change is a plain difference in percentage points.
share_changes = {
    'Fossil Fuel Share Increase': 'Fossil Fuel Share',
    'Renewables Share Increase': 'Renewable Share',
    'Nuclear Share Increase': 'Nuclear Share',
}
for target_column, share_column in share_changes.items():
    filtered_country_data[target_column] = (
        filtered_country_data[f'{share_column} 2023'] - filtered_country_data[f'{share_column} 2000']
    ).round(2)

In [None]:
# Per-country table used for clustering: the country name plus the nine features.
# `.copy()` makes df_2 an independent frame, so the later column assignments
# (cluster labels) no longer trigger pandas' SettingWithCopyWarning.
df_2 = filtered_country_data[
    [ 'Country',
      'Total Energy Increase',
      'Energy Per Capita 2023',
      'Energy Per Capita Increase',
      'Fossil Fuel Share 2023',
      'Fossil Fuel Share Increase',
      'Renewable Share 2023',
      'Renewables Share Increase',
      'Nuclear Share 2023',
      'Nuclear Share Increase', ]
].copy()

# Numeric feature matrix handed to the scalers / clustering algorithms.
clustering_data = df_2.drop(columns='Country')

clustering_data

**Aggregate the Dataframe** per country for cluster analysis.



In [None]:
# Outdated attempt that aggregated the increments of each year.
# The code below is disabled by wrapping it in a string literal: none of it runs,
# so `agg_country_data` is NOT created by this cell.
# NOTE(review): later cells in this notebook still read `agg_country_data`; unless it
# is defined somewhere outside the visible cells they raise NameError on a fresh kernel.

'''
def calculate_growth(df):
  first_value = df.iloc[0]
  last_value = df.iloc[-1]
  if first_value == 0:
    return 1
  return(((last_value - first_value) / first_value) * 100).round(2)

def calculate_share(df, year, column):
    value = df.loc[df['Year'] == year, column].values[0]
    total_value = df.loc[df['Year'] == year, 'Total Energy'].values[0]
    if total_value == 0 or np.isnan(total_value):
        return 0
    return (value / total_value) * 100

country_data['Initial Year'] = country_data.groupby('Country')['Year'].transform('min')
country_data['Current Year'] = country_data.groupby('Country')['Year'].transform('max')

country_data['Initial Renewables Share'] = country_data.apply(lambda x: calculate_share(country_data[country_data['Country'] == x['Country']], x['Initial Year'], 'Renewable Energy'), axis=1)
country_data['Current Renewables Share'] = country_data.apply(lambda x: calculate_share(country_data[country_data['Country'] == x['Country']], x['Current Year'], 'Renewable Energy'), axis=1)

country_data['Initial Fossils Share'] = country_data.apply(lambda x: calculate_share(country_data[country_data['Country'] == x['Country']], x['Initial Year'], 'Fossil Fuel Energy'), axis=1)
country_data['Current Fossils Share'] = country_data.apply(lambda x: calculate_share(country_data[country_data['Country'] == x['Country']], x['Current Year'], 'Fossil Fuel Energy'), axis=1)

country_data['Initial Nuclear Share'] = country_data.apply(lambda x: calculate_share(country_data[country_data['Country'] == x['Country']], x['Initial Year'], 'Nuclear Energy'), axis=1)
country_data['Current Nuclear Share'] = country_data.apply(lambda x: calculate_share(country_data[country_data['Country'] == x['Country']], x['Current Year'], 'Nuclear Energy'), axis=1)

country_data['Initial Energy Per Capita'] = country_data.apply(lambda x: calculate_share(country_data[country_data['Country'] == x['Country']], x['Initial Year'], 'Energy Per Capita'), axis=1)
country_data['Current Energy Per Capita'] = country_data.apply(lambda x: calculate_share(country_data[country_data['Country'] == x['Country']], x['Current Year'], 'Energy Per Capita'), axis=1)

country_data['Total Increase Percent'] = country_data.groupby('Country')['Total Energy'].transform(calculate_growth)

country_data['Total Increase'] = country_data.groupby('Country')['Total Energy'].transform(lambda x: x.diff().round(2)).fillna(0)

country_data['Renewables Increase'] = country_data.groupby('Country')['Renewable Energy'].transform(lambda x: x.diff().round(2)).fillna(0)

country_data['Fossils Increase'] = country_data.groupby('Country')['Fossil Fuel Energy'].transform(lambda x: x.diff().round(2)).fillna(0)

country_data['Nuclear Increase'] = country_data.groupby('Country')['Nuclear Energy'].transform(lambda x: x.diff().round(2)).fillna(0)

country_data.replace([np.inf, -np.inf], 0, inplace=True)

agg_country_data = country_data.groupby('Country').agg({
    'Total Energy': 'max',
    'Fossil Fuel Energy': 'max',
    'Population in Mio': 'max',
    'Initial Energy Per Capita': 'min',
    'Current Energy Per Capita': 'max',
    'Renewable Energy': 'max',
    'Nuclear Energy': 'max',
    'Total Increase': 'sum',
    'Renewables Increase': 'sum',
    'Fossils Increase': 'sum',
    'Nuclear Increase': 'sum',
    'Total Increase Percent': 'max',
    'Initial Fossils Share': 'min',
    'Current Fossils Share': 'max',
    'Initial Renewables Share': 'min',
    'Current Renewables Share': 'max',
    'Initial Nuclear Share': 'min',
    'Current Nuclear Share': 'max',
}).reset_index()

agg_country_data.replace([np.inf, -np.inf], 0, inplace=True)

agg_country_data
'''

# DBSCAN Clustering Analysis

Tune `eps` and `min_samples` to obtain a reasonable number of clusters with as few outliers (noise points) as possible.

In [None]:
from sklearn.datasets import make_blobs
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

import numpy as np

import matplotlib.pyplot as plt

# Standardise the features, then run DBSCAN on the scaled matrix.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(clustering_data)

dbscan = DBSCAN(eps=0.5, min_samples=2)
labels = dbscan.fit_predict(scaled_data)

unique_labels = set(labels)

# DBSCAN assigns the label -1 to noise points; it is not counted as a cluster.
noise_mask = labels == -1
n_noise = int(noise_mask.sum())
n_clusters = len(unique_labels) - int(noise_mask.any())

print(f'Estimated number of clusters: {n_clusters}')
print(f'Estimated number of noise points: {n_noise}')

In [None]:
# Attach each country's DBSCAN label (-1 marks noise points).
# `assign` builds a new frame instead of writing into df_2 in place; df_2 was created
# by column selection, and an in-place write on it raises SettingWithCopyWarning.
df_2 = df_2.assign(**{'Cluster Label': labels})

df_2

In [None]:

# Summarise every DBSCAN cluster: its size, the centroid of its core samples and the
# member country closest to that centroid. Centroid and distances are computed on the
# df_2 feature values as stored (not on the standardised matrix used for clustering).
#
# Fixes compared with the previous version of this cell:
#   * `clustering_data` is no longer given a 'Cluster Label' column - that column
#     leaked into every later feature matrix built from `clustering_data`;
#   * distances are computed on numeric columns only (subtracting the centroid from a
#     frame that still contains the 'Country' strings fails);
#   * the undefined name `numeric_cluster_points` is now defined;
#   * the result is shown with `display` instead of the unavailable `ace_tools` module.

# Identifier / cluster-assignment columns that are not features. 'Cluster' is only
# present if the hierarchical-clustering cells were run before this one.
non_feature_columns = ['Country', 'Cluster Label', 'Cluster']

# Core-sample membership is the same for every cluster, so build the mask once.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[dbscan.core_sample_indices_] = True

cluster_info = []

for label in sorted(unique_labels):
    if label == -1:
        # -1 is DBSCAN's noise label, not a cluster.
        continue

    class_member_mask = (labels == label)
    cluster_points = df_2[class_member_mask]
    core_cluster_points = df_2[class_member_mask & core_samples_mask]

    numeric_cluster_points = cluster_points.drop(columns=non_feature_columns, errors='ignore')
    numeric_core_cluster_points = core_cluster_points.drop(columns=non_feature_columns, errors='ignore')

    # Centroid = mean of the cluster's core samples, one value per feature.
    centroid = numeric_core_cluster_points.mean(axis=0)

    # Euclidean distance of every member to the centroid; the nearest member is
    # reported as the cluster's representative.
    offsets = (numeric_cluster_points - centroid).to_numpy(dtype=float)
    distances = np.linalg.norm(offsets, axis=1)
    closest_point = cluster_points.iloc[int(np.argmin(distances))]

    cluster_info.append({
        'Cluster': label,
        'Size': len(cluster_points),
        'Centroid': centroid,
        'Representative Country': closest_point['Country'],
        'Representative Point': closest_point
    })

cluster_info_df = pd.DataFrame(cluster_info)

display(cluster_info_df)


## Table

Create dataset for table analysis, taking into account only the **first & last year** where data is available.

In [None]:
# For every row, the first / last year available for that row's country
# (`transform` keeps the result aligned with country_data's index).
earliest_year_per_country = country_data.groupby('Country')['Year'].transform('min')
latest_year_per_country = country_data.groupby('Country')['Year'].transform('max')

earliest_country_data = country_data[country_data['Year'] == earliest_year_per_country]
latest_country_data = country_data[country_data['Year'] == latest_year_per_country]

comparison_columns = [
    'Country', 'Fossil Fuel Energy', 'Renewable Energy', 'Nuclear Energy',
    'Total Energy', 'Fossil Fuel Share', 'Renewable Share'
]

# One row per country: ' Min' columns hold the first-year values, ' Max' the last-year values.
before_after_comparison = pd.merge(
    earliest_country_data[comparison_columns],
    latest_country_data[comparison_columns],
    on='Country',
    how='inner',
    suffixes=(' Min', ' Max')
)

# NOTE(review): the energy columns are sums of TWh columns, so the values below are
# TWh although the labels say "KWh". The labels are left unchanged because another
# cell selects these columns by name.
table_data = pd.DataFrame(
    {

        'Country': before_after_comparison['Country'],
        'Total Energy Increase in KWh': (before_after_comparison['Total Energy Max'] - before_after_comparison['Total Energy Min']).round(2),
        'Total Energy Consumption in KWh': before_after_comparison['Total Energy Max'].round(2),
        'Fossil Fuel Increase in KWh': (before_after_comparison['Fossil Fuel Energy Max'] - before_after_comparison['Fossil Fuel Energy Min']).round(2),
        'Renewable Energy Increase in KWh': (before_after_comparison['Renewable Energy Max'] - before_after_comparison['Renewable Energy Min']).round(2),
        'Nuclear Energy Increase in KWh': (before_after_comparison['Nuclear Energy Max'] - before_after_comparison['Nuclear Energy Min']).round(2),
        # Change of the fossil share in percentage points (previously this divided the
        # total energy by the share, which is not a share change).
        'Fossil Fuel Share Increase in %': (before_after_comparison['Fossil Fuel Share Max'] - before_after_comparison['Fossil Fuel Share Min']).round(2),
        # Multiplied by 100 so the value is a percentage, as the label states
        # (previously a 0-1 fraction).
        'Renewable Energy Share of Total %': (before_after_comparison['Renewable Energy Max'] / before_after_comparison['Total Energy Max'] * 100).round(2),
        'Total Energy Increase in %': ((before_after_comparison['Total Energy Max'] - before_after_comparison['Total Energy Min']) / before_after_comparison['Total Energy Min']*100).round(2)
    }
)


percentage_columns = [col for col in table_data.columns if '%' in col]

# Rank by absolute increase of total energy and number the rows from 1.
sorted_table_data = table_data.sort_values(by='Total Energy Increase in KWh', ascending=False).reset_index(drop=True)

sorted_table_data.index = sorted_table_data.index + 1

display(sorted_table_data)

# Timeplot






From the data in the table, let's select some interesting countries to observe in a time chart:

* Top 5 countries with max increased total energy consumption
* Top 5 countries with max increased renewable energy consumption
* Top 5 countries with max increased fossil fuel energy consumption
* Top 5 countries with max increased nuclear energy consumption

**Plotting the timechart**
*   Create a visibility dictionary to toggle visibility based on Renewable Energy Type
*   Create Traces for each observation and assign them to the visibility dictionary
*   Create one button for each item in the visibility dictionary to be able to toggle visibility

In [None]:
import pandas as pd

def get_top_countries(data, column, n=5):
    """Return the 'Country' values of the `n` rows with the largest `column`.

    Args:
        data: DataFrame with a 'Country' column and the column to rank by.
        column: Name of the column used for the descending sort.
        n: Number of rows to keep (default 5).

    Returns:
        A single-column DataFrame ('Country') with a fresh 0-based index,
        ordered from the largest to the smallest value of `column`.
    """
    ranked = data.sort_values(by=column, ascending=False)
    top_rows = ranked.head(n)
    return top_rows[['Country']].reset_index(drop=True)

# Columns of `table_data` used to rank the countries.
columns = [
    'Total Energy Increase in KWh',
    'Renewable Energy Increase in KWh',
    'Fossil Fuel Increase in KWh',
    'Nuclear Energy Increase in KWh'
]

# Top countries for every ranking column, one small frame per column.
top_countries_list = [get_top_countries(table_data, column) for column in columns]

# Merge the rankings; a country appearing in several rankings is kept once.
timeplot_countries = pd.concat(top_countries_list).drop_duplicates().reset_index(drop=True)

timeplot_countries_list = timeplot_countries['Country'].tolist()
# The list is already de-duplicated. Copying it (instead of passing it through set())
# keeps the ranking order and makes the printed output identical on every run.
unique_countries = list(timeplot_countries_list)

timeplot_countries_data = country_data[country_data['Country'].isin(unique_countries)]

print('Countries returned for analysis: ', unique_countries)


In [None]:
import plotly.graph_objects as go

fig = go.Figure()

analysis_columns = ['Total Energy', 'Renewable Energy', 'Fossil Fuel Energy', 'Nuclear Energy']

# visibility_dict[col][i] is True when trace i plots `col`; the buttons below pass
# these lists to plotly to switch between the energy types.
visibility_dict = {col: [] for col in analysis_columns}

for country in timeplot_countries_list:
    country_specific_data = country_data[country_data['Country'] == country]
    n_points = len(country_specific_data)
    for col in analysis_columns:
        # Label only the last point of the line. The list must be as long as this
        # country's own data; it was previously sized by the whole `country_data`
        # frame, which placed the label beyond the plotted points.
        point_labels = [''] * n_points
        if n_points:
            point_labels[-1] = f'{country} {col}'

        fig.add_trace(go.Scatter(
            x=country_specific_data['Year'], y=country_specific_data[col],
            mode='lines+text',
            name=f'{country}',
            visible=col == 'Total Energy',  # only 'Total Energy' is shown initially
            text=point_labels,
            textposition='top right',
            # The plotted columns are sums of TWh columns, hence TWh (not KWh).
            hovertemplate=(
                '%{y} TWh'
            )
        ))
        for key in visibility_dict:
            visibility_dict[key].append(key == col)

visibility_all = [True] * len(fig.data)

# One button per energy type plus an 'All' button that shows every trace.
buttons = [
    {
        'args': [{'visible': visibility_dict[col]},  {'title': f'<b>Global {col} over Time in TWh</b>'}],
        'label': col,
        'method': 'update'
    }
    for col in analysis_columns
]

buttons.append({
    'args': [{'visible': visibility_all},  {'title': '<b>Global Total Energy over Time in TWh</b>'}],
    'label': 'All',
    'method': 'update'
})

In [None]:
# Layout of the time chart: title, axes, legend and the button row built in the
# previous cell (`buttons`).
# The plotted columns are sums of TWh columns, so the title and the y axis are
# labelled TWh (they previously said KWh).
fig.update_layout(
    autosize=True, height=800,
    title='<b>Global Total Energy over Time in TWh</b>',
    xaxis_title='Year',
    yaxis=dict(
        title='Energy in TWh',
        fixedrange=True,
        rangemode='nonnegative',
        range=[0, None],
    ),
    legend_title_text='Country',
    hovermode='x unified',
    updatemenus=[
        {
            'type': 'buttons',
            'showactive': True,
            'buttons': buttons,
            'direction': 'left',
            'x': 0.0,
            'xanchor': 'left',
            'y': 1.0,
            'yanchor': 'top'
        }
    ]
)

fig.show()





# SciPy Cluster Analysis

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import matplotlib.pyplot as plt

# Use the feature columns only. An earlier cell may have added the DBSCAN
# 'Cluster Label' column to `clustering_data`; feeding a previous clustering result
# into this one would distort the distances, so it is dropped when present.
feature_frame = clustering_data.drop(columns=['Cluster Label', 'Cluster'], errors='ignore')
X = feature_frame.columns.tolist()

scaler = StandardScaler()
scaled_data = scaler.fit_transform(feature_frame)

# Other linkage methods tried: 'single', 'complete', 'average', 'centroid', 'median', 'weighted'.
# Method with best results: 'ward', max_d = 12
linked_data = linkage(scaled_data, method='ward')

# Cut the tree at this linkage distance to obtain flat cluster ids.
max_d = 12
clusters = fcluster(linked_data, max_d, criterion='distance')

filtered_country_data['Cluster'] = clusters  # add back to the table with country names for inspection

scaled_df = pd.DataFrame(scaled_data, columns=X)
scaled_df['Cluster'] = clusters


**Describing the Clusters**

In [65]:
# Attach the hierarchical cluster id to every country. `assign` returns a new frame,
# which avoids the SettingWithCopyWarning raised by writing into df_2 in place.
df_2 = df_2.assign(Cluster=clusters)

numerical_data = df_2.drop(columns='Country')


def concatenate_countries(names):
    """Join the country names of one cluster into a single comma-separated string."""
    return ', '.join(names)


# Features that are averaged per cluster.
mean_columns = [
    'Total Energy Increase',
    'Energy Per Capita 2023',
    'Energy Per Capita Increase',
    'Fossil Fuel Share 2023',
    'Fossil Fuel Share Increase',
    'Renewable Share 2023',
    'Renewables Share Increase',
    'Nuclear Share 2023',
    'Nuclear Share Increase',
]

# One row per cluster: member list followed by the mean of every feature.
cluster_summary = df_2.groupby('Cluster').agg({
    'Country': concatenate_countries,
    **{column: 'mean' for column in mean_columns}
})

cluster_summary['Number of Countries'] = df_2.groupby('Cluster')['Country'].count()

cluster_summary



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,Country,Total Energy Increase,Energy Per Capita 2023,Energy Per Capita Increase,Fossil Fuel Share 2023,Fossil Fuel Share Increase,Renewable Share 2023,Renewables Share Increase,Nuclear Share 2023,Nuclear Share Increase,Number of Countries
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,"Afghanistan, Australia, Austria, Brazil, Centr...",72.902439,2897.095951,28.536585,27.85,-17.6,71.163902,18.002195,0.986098,-0.401951,41
2,"Belarus, Czechia, Finland, Hungary, Pakistan, ...",21.875,5110.480625,15.25,37.3325,-23.39125,26.93125,12.7575,35.735,10.63375,8
3,"Belgium, Bulgaria, Canada, France, Germany, Ja...",17.583333,11577.85975,9.0,39.580833,-4.701667,37.764167,14.066667,22.655,-9.365833,12
4,"Algeria, Argentina, Azerbaijan, Bolivia, Burki...",135.98,1798.74656,54.36,80.7076,4.9824,18.9492,-4.927,0.343,-0.0556,50
5,"Angola, Bangladesh, Benin, Cambodia, China, Et...",961.727273,1629.896091,605.272727,45.804545,-3.627273,53.777273,3.320909,0.418182,0.306364,11


Cluster analysis using Ward's method finds 5 distinct groups of countries:

* **Cluster 1**: Diverse Economies with Strong Renewable Energy
Focus (Australia, Brazil, Denmark, Italy, the Netherlands, Spain, and the UK, along with many others from various continents.)

* **Cluster 2**: Eastern European and Post-Soviet States with Nuclear Emphasis (Belarus, Czechia, Finland, Hungary, and Ukraine)

* **Cluster 3**: Highly Developed Economies with a Mature Energy Mix (Canada, France, Germany, Japan, the United States, and South Korea)

* **Cluster 4**: Emerging Economies with High Fossil Fuel Dependence (India, Indonesia, Nigeria, and South Africa, as well as several smaller economies in Africa and Asia)

* **Cluster 5**: Rapidly Growing, Low-Income Nations with Emerging Energy Needs (China, Bangladesh, Vietnam, and Ethiopia)


In [None]:
import seaborn as sns

# Feature columns only: drop the identifier and the hierarchical cluster id, and also
# the DBSCAN 'Cluster Label' column if an earlier cell added it to df_2 (it is a
# clustering result, not a feature).
numeric_columns = (
    df_2
    .drop(columns=['Country', 'Cluster'])
    .drop(columns=['Cluster Label'], errors='ignore')
)

# Rescale every feature to [-1, 1] so the cluster means share one colour scale.
scaler = MinMaxScaler(feature_range=(-1, 1))
scaled_data = scaler.fit_transform(numeric_columns)

# Reuse df_2's index so the 'Cluster' assignment below lines up row by row.
scaled_df = pd.DataFrame(scaled_data, columns=numeric_columns.columns, index=numeric_columns.index)

scaled_df['Cluster'] = df_2['Cluster']

plt.figure(figsize=(14, 10))  # Adjust figsize to make the heatmap larger
cluster_means = scaled_df.groupby('Cluster').mean()
sns.heatmap(cluster_means, annot=True, cmap='viridis')
plt.title("Heatmap of Cluster Centers")
plt.show()

In [None]:
import seaborn as sns

# One boxplot per feature, grouped by cluster. 'Cluster' itself is skipped:
# plotting the grouping column against itself carries no information.
feature_names = [col for col in scaled_df.columns if col != 'Cluster']

for col in feature_names:
    plt.figure(figsize=(6, 6))
    sns.boxplot(x='Cluster', y=col, data=scaled_df)
    plt.title(f"Boxplot of {col} by Cluster")
    plt.show()

In [None]:
# Descriptive statistics of every feature per cluster.
# NOTE(review): the previous code read `culster_data`, a name that no visible cell
# defines (NameError). The per-country table df_2, which carries the 'Cluster'
# column, is used instead - confirm this is the intended input.
cluster_profiles = (
    df_2
    .drop(columns=['Country', 'Cluster Label'], errors='ignore')
    .groupby('Cluster')
    .describe()
)
print(cluster_profiles)

In [None]:
# import seaborn as sns

# Mean of every scaled feature per cluster, drawn as an annotated heatmap.
cluster_means = scaled_df.groupby('Cluster').mean()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(14, 10))  # large canvas for readability
sns.heatmap(cluster_means, annot=True, cmap='viridis', ax=heatmap_ax)
heatmap_ax.set_title("Heatmap of Cluster Centers")
plt.show()

**Visualising the Clusters**

In [None]:
# Time chart of the countries that belong to small clusters (3 members or fewer).
# NOTE(review): `agg_country_data` and its 'Cluster' column are not created by any
# live cell above this one (the aggregation code is disabled and the KMeans cell that
# assigns 'Cluster' comes later), so on a fresh kernel this cell raises NameError -
# confirm which clustering result should be used here.
cluster_counts = agg_country_data['Cluster'].value_counts()

small_clusters = cluster_counts[cluster_counts <= 3].index

countries_in_small_clusters = agg_country_data[agg_country_data['Cluster'].isin(small_clusters)]

country_names_in_small_clusters = countries_in_small_clusters['Country'].unique()

fig = go.Figure()

analysis_columns = ['Total Energy', 'Renewable Energy', 'Fossil Fuel Energy', 'Nuclear Energy']

# visibility_dict[col][i] is True when trace i plots `col`; used by the buttons below.
visibility_dict = {col: [] for col in analysis_columns}

for country in country_names_in_small_clusters:
    country_specific_data = country_data[country_data['Country'] == country]
    n_points = len(country_specific_data)
    for col in analysis_columns:
        # Label only the last point of the line. The list must be as long as this
        # country's own data; it was previously sized by the whole `country_data`
        # frame, which placed the label beyond the plotted points.
        point_labels = [''] * n_points
        if n_points:
            point_labels[-1] = f'{country} {col}'

        fig.add_trace(go.Scatter(
            x=country_specific_data['Year'], y=country_specific_data[col],
            mode='lines+text',
            name=f'{country}',
            visible=col == 'Total Energy',  # only 'Total Energy' is shown initially
            text=point_labels,
            textposition='top right',
            # The plotted columns are sums of TWh columns, hence TWh (not KWh).
            hovertemplate=(
                '%{y} TWh'
            )
        ))
        for key in visibility_dict:
            visibility_dict[key].append(key == col)

visibility_all = [True] * len(fig.data)

# One button per energy type plus an 'All' button that shows every trace.
buttons = [
    {
        'args': [{'visible': visibility_dict[col]},  {'title': f'<b>Global {col} over Time in TWh</b>'}],
        'label': col,
        'method': 'update'
    }
    for col in analysis_columns
]

buttons.append({
    'args': [{'visible': visibility_all},  {'title': '<b>Global Total Energy over Time in TWh</b>'}],
    'label': 'All',
    'method': 'update'
})

fig.update_layout(
    autosize=True, height=800,
    title='<b>Global Total Energy over Time in TWh</b>',
    xaxis_title='Year',
    yaxis=dict(
        title='Energy in TWh',
        fixedrange=True,
        rangemode='nonnegative',
        range=[0, None],
    ),
    legend_title_text='Country',
    hovermode='x unified',
    updatemenus=[
        {
            'type': 'buttons',
            'showactive': True,
            'buttons': buttons,
            'direction': 'left',
            'x': 0.0,
            'xanchor': 'left',
            'y': 1.0,
            'yanchor': 'top'
        }
    ]
)

fig.show()

In [None]:
# Dendrogram of the hierarchical clustering stored in `linked_data`.
plt.figure(figsize=(10, 7))
dendrogram(linked_data, leaf_rotation=90., leaf_font_size=8.)
# `linked_data` is built with method='ward'; the title previously said 'weighted'.
plt.title('Hierarchical Clustering Dendrogram (ward linkage)')
plt.xlabel("Sample index")
plt.ylabel("Distance")
plt.show()

In [None]:
# Pairwise scatter plots of the features, coloured by hierarchical cluster.
# NOTE(review): the previous code plotted `metrics`, a name that no visible cell
# defines (NameError). The per-country feature table with its 'Cluster' column is
# used instead - confirm this is the intended input.
pairplot_data = df_2.drop(columns=['Country', 'Cluster Label'], errors='ignore')

sns.pairplot(pairplot_data, hue='Cluster', palette='viridis', diag_kind='kde')
plt.show()

# Sklearn Cluster Analysis

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# KMeans clustering of the aggregated per-country table.
# NOTE(review): `agg_country_data` is not created by any live cell visible in this
# notebook (its aggregation code is disabled), so this cell raises NameError on a
# fresh kernel unless the frame is defined elsewhere - confirm.

# Every column except the identifier is a feature. 'Cluster' is excluded as well, so
# re-running this cell does not feed the previous assignment back in as a feature.
features = agg_country_data.columns.drop(['Country', 'Cluster'], errors='ignore').tolist()

X = agg_country_data[features]

print("NaNs before transformation:")
print(X.isna().sum())

# Negative values are clipped to 0 before log1p, which is undefined below -1.
X_log_transformed = np.log1p(X.clip(lower=0))

print("NaNs after log transformation:")
print(pd.DataFrame(X_log_transformed, columns=features).isna().sum())

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_log_transformed)

print("NaNs after scaling:")
print(pd.DataFrame(X_scaled, columns=features).isna().sum())

# Elbow curve: within-cluster sum of squares for k = 1..39. KMeans cannot use more
# clusters than samples, so the range is capped at the number of rows.
k_values = range(1, min(40, len(X_scaled) + 1))
sse = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)
    kmeans.fit(X_scaled)
    sse.append(kmeans.inertia_)

plt.plot(k_values, sse)
plt.xlabel('Number of clusters')
plt.ylabel('SSE')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.show()

kmeans = KMeans(n_clusters=6, n_init=10, random_state=0)
agg_country_data['Cluster'] = kmeans.fit_predict(X_scaled)

# Undo both transformations: first the standardisation, then log1p (via expm1).
# Undoing only the scaling would report the centers in log space.
cluster_centers = np.expm1(scaler.inverse_transform(kmeans.cluster_centers_))
cluster_centers_df = pd.DataFrame(cluster_centers, columns=features)
print("Cluster Centers:\n", cluster_centers_df)

print("Cluster Assignments:\n", agg_country_data[['Country', 'Cluster']])

sns.pairplot(agg_country_data, hue='Cluster', palette='viridis', vars=features)
plt.show()

for feature in features:
    plt.figure()
    sns.boxplot(x='Cluster', y=feature, data=agg_country_data)
    plt.title(f'{feature} by Cluster')
    plt.show()

# cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
# cluster_centers_df = pd.DataFrame(cluster_centers, columns=features)
# print("Cluster Centers:\n", cluster_centers_df)

# print("Cluster Assignments:\n", agg_country_data[['Country', 'Cluster']])



In [None]:
# Show the cluster assigned to every country.
agg_country_data.loc[:, ['Country', 'Cluster']]