In [12]:
import pandas as pd
import numpy as np
import hvplot.pandas  # noqa
import os
import sys
import re
pd.options.plotting.backend = 'holoviews'

## Utils

In [129]:
# Function to replace a substring in a string column
def process_values(value):
    if isinstance(value, str):
        value = re.sub(r'[\r\n]+', ' - ', value).strip('"\' \)\()')

    return np.nan if not value else value


# Clean columns content
def process_columns(column_name):
    # return column_name

    # BASIC
    column_name = re.sub(r'([A-Z]+)', r' \1', column_name.lower())
    column_name = re.sub(r'(\s+|\.)', r' ', column_name).strip()
    column_name = re.sub(r'(\s+)', r'_', column_name)
    
    return column_name

# FORMAT DF:
def process_df(df):
    # PROCESSING
    # STRING VALUES (might set to NaN)
    df = df.applymap(process_values)
    # Uniformize NaN values
    df = df.replace(['', 'nan'], np.nan)

    # Drop all columns having only NaN
    df = df.dropna(axis=1, how='all')

    # Replace column names using the format function
    df = df.rename(columns=process_columns)

    # Filter value if exists
    if ('value' in df.columns):
        df = df.dropna(subset=['value'])
    
    # Change year type to int
    if ('year' in df.columns):
        df['year'] = df['year'].astype(int)   

    return df

def func(x):
    if x['ember_region'] == 'Middle East':
        return 1.
    else:
        return np.nan

def lfunc(x):
    res = ''
    if x['eu'] == 1.:
        res += 'eu,'
    if x['oecd'] == 1.:
        res += 'oecd,'
    if x['g20'] == 1.:
        res += 'g20,'
    if x['g7'] == 1.:
        res += 'g7,'
    if x['middle_east'] == 1.:
        res += 'middle_east,'  
    if res == '':
        return np.nan
    
    return res[:-1]

## Load and Process Data

In [182]:
df_ember = pd.read_csv("../../../data/_raw/ember/ember_file_elec_all_year.csv")
df_ember = process_df(df_ember)
df_ember.head()

Unnamed: 0,area,country_code,year,area_type,continent,ember_region,eu,oecd,g20,g7,category,subcategory,variable,unit,value,yoy_absolute_change,yoy_%_change
0,Afghanistan,AFG,2000,Country,Asia,Asia,,,,,Capacity,Aggregate fuel,Clean,GW,0.19,,
1,Afghanistan,AFG,2000,Country,Asia,Asia,,,,,Capacity,Aggregate fuel,Fossil,GW,0.03,,
2,Afghanistan,AFG,2000,Country,Asia,Asia,,,,,Capacity,Aggregate fuel,Gas and Other Fossil,GW,0.03,,
3,Afghanistan,AFG,2000,Country,Asia,Asia,,,,,Capacity,Aggregate fuel,"Hydro, Bioenergy and Other Renewables",GW,0.19,,
4,Afghanistan,AFG,2000,Country,Asia,Asia,,,,,Capacity,Aggregate fuel,Renewables,GW,0.19,,


In [183]:
df_info = pd.read_csv("../../../data/_info/__INFO_UN_M49_en.csv")
df_info = process_df(df_info)
df_info.head()

Unnamed: 0,global_code,global_name,region_code,region_name,sub-region_code,sub-region_name,intermediate_region_code,intermediate_region_name,country_or_area,m49_code,iso-alpha2_code,iso-alpha3_code,least_developed_countries_(ldc),land_locked_developing_countries_(lldc),small_island_developing_states_(sids)
0,1,World,2.0,Africa,15.0,Northern Africa,,,Algeria,12,DZ,DZA,,,
1,1,World,2.0,Africa,15.0,Northern Africa,,,Egypt,818,EG,EGY,,,
2,1,World,2.0,Africa,15.0,Northern Africa,,,Libya,434,LY,LBY,,,
3,1,World,2.0,Africa,15.0,Northern Africa,,,Morocco,504,MA,MAR,,,
4,1,World,2.0,Africa,15.0,Northern Africa,,,Sudan,729,SD,SDN,x,,


In [184]:
# Left joining
df_ember = df_ember.merge(df_info, left_on='country_code', right_on='iso-alpha3_code', how='left')

In [185]:
# Filter columns
selected_columns = ['area', 'country_code', 'year', 'area_type', 'region_name', 'ember_region', 'sub-region_name', 'eu', 'oecd', 'g20', 'g7', 'category', 'subcategory', 'variable', 'unit', 'value']
df_ember = df_ember[selected_columns]

# Create new column "middle_east"
df_ember['middle_east'] = df_ember.apply(func, axis=1)

# Delete area rows of type region
df_ember = df_ember.drop(df_ember_elec_generation[df_ember['area_type']=='Region'].index).reset_index(drop=True)
df_ember = df_ember.drop(df_ember[df_ember['subcategory']=='Total'].index).reset_index(drop=True)
df_ember = df_ember[df_ember.variable.str.contains(' and ')==False]

# Create new columns
df_ember['group'] = df_ember.apply(lfunc, axis=1)
df_ember['source'] = 'ember'

# Filter columns
df_ember.drop(columns=['eu', 'oecd', 'g20', 'g7', 'middle_east', 'area_type', 'ember_region'], inplace=True)

# Rename columns
df_ember.rename(columns={'area' : 'country', 'country_code' : 'country_code_a3', 'category' : 'type'}, inplace=True)

df_ember.head()

  df_ember = df_ember.drop(df_ember_elec_generation[df_ember['area_type']=='Region'].index).reset_index(drop=True)


Unnamed: 0,country,country_code_a3,year,region_name,sub-region_name,type,subcategory,variable,unit,value,group,source
0,Afghanistan,AFG,2000,Asia,Southern Asia,Capacity,Aggregate fuel,Clean,GW,0.19,,ember
1,Afghanistan,AFG,2000,Asia,Southern Asia,Capacity,Aggregate fuel,Fossil,GW,0.03,,ember
4,Afghanistan,AFG,2000,Asia,Southern Asia,Capacity,Aggregate fuel,Renewables,GW,0.19,,ember
5,Afghanistan,AFG,2000,Asia,Southern Asia,Capacity,Fuel,Hydro,GW,0.19,,ember
6,Afghanistan,AFG,2000,Asia,Southern Asia,Capacity,Fuel,Other Fossil,GW,0.03,,ember


## Electricity generation & capacity

In [186]:
df_ember_elec_gen_cap = df_ember[(df_ember['type'].isin(['Electricity generation', 'Capacity']))&(df_ember['unit']!='%')].reset_index()
df_ember_elec_gen_cap = df_ember_elec_gen_cap.groupby(['country', 'country_code_a3', 'year', 'region_name', 'sub-region_name', 'group', 'variable', 'type', 'unit', 'source'])['value'].sum().reset_index()
df_ember_elec_gen_cap.head()

Unnamed: 0,country,country_code_a3,year,region_name,sub-region_name,group,variable,type,unit,source,value
0,Argentina,ARG,2000,Americas,Latin America and the Caribbean,g20,Bioenergy,Capacity,GW,ember,0.07
1,Argentina,ARG,2000,Americas,Latin America and the Caribbean,g20,Bioenergy,Electricity generation,TWh,ember,0.36
2,Argentina,ARG,2000,Americas,Latin America and the Caribbean,g20,Clean,Capacity,GW,ember,9.71
3,Argentina,ARG,2000,Americas,Latin America and the Caribbean,g20,Clean,Electricity generation,TWh,ember,34.88
4,Argentina,ARG,2000,Americas,Latin America and the Caribbean,g20,Coal,Capacity,GW,ember,0.38


In [175]:
df_ember_elec_gen_cap.to_csv('../../../data/_processed/ember_electricity_generation_capacity.csv', index=False)

### Electricity generation & capacity visualization

In [203]:
df_ember_elec_gen_cap.hvplot(x='year', by=['country', 'type', 'variable', 'unit'], groupby =['country', 'type', 'variable'], widget_location='left_top')

## Electricity imports

In [188]:
df_ember_elec_imports = df_ember[(df_ember['type']=='Electricity imports')].reset_index()
df_ember_elec_imports = df_ember_elec_imports.groupby(['country','country_code_a3', 'year', 'region_name', 'sub-region_name', 'group', 'type', 'unit', 'source'])['value'].sum().reset_index()
df_ember_elec_imports.head()

Unnamed: 0,country,country_code_a3,year,region_name,sub-region_name,group,type,unit,source,value
0,Argentina,ARG,2000,Americas,Latin America and the Caribbean,g20,Electricity imports,TWh,ember,1.22
1,Argentina,ARG,2001,Americas,Latin America and the Caribbean,g20,Electricity imports,TWh,ember,1.76
2,Argentina,ARG,2002,Americas,Latin America and the Caribbean,g20,Electricity imports,TWh,ember,5.84
3,Argentina,ARG,2003,Americas,Latin America and the Caribbean,g20,Electricity imports,TWh,ember,5.03
4,Argentina,ARG,2004,Americas,Latin America and the Caribbean,g20,Electricity imports,TWh,ember,3.47


In [199]:
df_ember_elec_imports.to_csv('../../../data/_processed/ember_electricity_imports.csv', index=False)

### Electricity imports visualization

In [201]:
df_ember_elec_imports.hvplot(x='year', by=['country', 'type', 'unit'], groupby =['country'], widget_location='left_top')

### Electricity Demand

In [192]:
df_ember_elec_demand = df_ember[(df_ember['type']=='Electricity demand')].reset_index()
df_ember_elec_demand = df_ember_elec_demand.groupby(['country','country_code_a3', 'year', 'region_name', 'sub-region_name', 'group', 'type', 'subcategory', 'unit', 'source'])['value'].sum().reset_index()
df_ember_elec_demand.head()

Unnamed: 0,country,country_code_a3,year,region_name,sub-region_name,group,type,subcategory,unit,source,value
0,Argentina,ARG,2000,Americas,Latin America and the Caribbean,g20,Electricity demand,Demand,TWh,ember,86.47
1,Argentina,ARG,2000,Americas,Latin America and the Caribbean,g20,Electricity demand,Demand per capita,MWh,ember,2.33
2,Argentina,ARG,2001,Americas,Latin America and the Caribbean,g20,Electricity demand,Demand,TWh,ember,88.25
3,Argentina,ARG,2001,Americas,Latin America and the Caribbean,g20,Electricity demand,Demand per capita,MWh,ember,2.36
4,Argentina,ARG,2002,Americas,Latin America and the Caribbean,g20,Electricity demand,Demand,TWh,ember,86.99


In [197]:
df_ember_elec_demand.to_csv('../../../data/_processed/ember_electricity_demand.csv', index=False)

### Electricity demand visualization

In [202]:
df_ember_elec_demand.hvplot(x='year', by=['country', 'subcategory', 'unit'], groupby =['country', 'subcategory'], widget_location='left_top')

## Power sector emissions

In [195]:
df_ember_power_secor_emmissions = df_ember[(df_ember['type']=='Power sector emissions')].reset_index()
df_ember_power_secor_emmissions = df_ember_power_secor_emmissions.groupby(['country','country_code_a3', 'year', 'region_name', 'sub-region_name', 'group', 'type', 'subcategory', 'unit', 'source'])['value'].sum().reset_index()
df_ember_power_secor_emmissions.head()

Unnamed: 0,country,country_code_a3,year,region_name,sub-region_name,group,type,subcategory,unit,source,value
0,Argentina,ARG,2000,Americas,Latin America and the Caribbean,g20,Power sector emissions,Aggregate fuel,mtCO2,ember,27.4
1,Argentina,ARG,2000,Americas,Latin America and the Caribbean,g20,Power sector emissions,CO2 intensity,gCO2/kWh,ember,312.49
2,Argentina,ARG,2000,Americas,Latin America and the Caribbean,g20,Power sector emissions,Fuel,mtCO2,ember,26.64
3,Argentina,ARG,2001,Americas,Latin America and the Caribbean,g20,Power sector emissions,Aggregate fuel,mtCO2,ember,23.65
4,Argentina,ARG,2001,Americas,Latin America and the Caribbean,g20,Power sector emissions,CO2 intensity,gCO2/kWh,ember,262.34


In [198]:
df_ember_power_secor_emmissions.to_csv('../../../data/em_processed/ember_power_secor_emmissions.csv', index=False)

### Power sector emissions

In [196]:
df_ember_power_secor_emmissions.hvplot(x='year', by=['country', 'subcategory', 'unit'], groupby =['country', 'subcategory'], widget_location='left_top')