In [15]:
import pandas as pd
import numpy as np
import os
import sys
import re

## Utils

In [16]:
# Normalize a single cell value (clean up strings; empty values become NaN)
def process_values(value):
    """Normalize a single cell value.

    Strings have every run of carriage returns / line feeds replaced by
    ``' - '`` and then lose any leading or trailing double quotes, single
    quotes, spaces, backslashes and parentheses. Strings that end up empty,
    and ``None``, are returned as ``np.nan``. Any other value is returned
    unchanged.

    Parameters
    ----------
    value : object
        A single cell value of any type.

    Returns
    -------
    object
        The cleaned string, ``np.nan`` for empty strings and ``None``, or
        the original value otherwise.
    """
    if value is None:
        return np.nan

    if not isinstance(value, str):
        # Only strings are cleaned. A plain truthiness test here would turn
        # legitimate falsy data such as 0, 0.0 or False into NaN.
        return value

    # The strip set is written with an explicitly escaped backslash so it
    # holds the characters " ' space \ ) ( without invalid escape sequences.
    cleaned = re.sub(r'[\r\n]+', ' - ', value).strip('"\' \\)(')
    return cleaned if cleaned else np.nan


# Normalize a column name
def process_columns(column_name):
    """Return a normalized version of a column label.

    The label is lowercased, dots and whitespace runs become single
    underscores, and a fixed sequence of rewrites harmonizes country, region
    and date related names (for example ``countryiso3code`` becomes
    ``country_iso3`` and ``lastupdated`` becomes ``last_updated``).

    Parameters
    ----------
    column_name : str
        Original column label.

    Returns
    -------
    str
        The normalized label.
    """
    # BASIC
    # NOTE(review): the label is lowercased before the uppercase pattern is
    # applied, so this first substitution cannot match anything. It is kept
    # unchanged; the later rewrites match concatenated lowercase names such
    # as `countryiso3`.
    name = re.sub(r'([A-Z]+)', r' \1', column_name.lower())
    spaced = re.sub(r'(\s+|\.)', r' ', name).strip()
    name = re.sub(r'(\s+)', r'_', spaced)

    # COUNTRY AND REGION (applied in this exact order)
    rewrites = (
        (r'(iso[2-3])code', r'\1'),
        (r'(country)(iso[2-3])', r'\1_\2'),
        (r'(country)(region)', r'\2'),
        (r'(?<=region)(?=type|name)', '_'),
        (r'(?<=region)(?=id)', '_'),
        (r'(?<=type)(?=id|name)', '_'),
    )
    for pattern, replacement in rewrites:
        name = re.sub(pattern, replacement, name)

    name = name.replace('country_code', 'country_iso3')

    # DATE RELATED
    return re.sub(r'(?<=last)(?=updated)', '_', name)

# FORMAT DF:
def process_df(df):
    """Clean a DataFrame and return the cleaned copy.

    Steps, in order:

    1. Apply ``process_values`` to every cell.
    2. Replace the strings ``''`` and ``'nan'`` with ``np.nan``.
    3. Drop columns that contain only NaN.
    4. Rename the columns with ``process_columns``.
    5. If a ``value`` column exists, drop rows where it is NaN.
    6. If a ``year`` column exists, cast it to ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    # PROCESSING
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. The lookup is done on the class so that a column named
    # 'map' cannot be mistaken for the method on older pandas versions.
    elementwise = df.map if hasattr(type(df), 'map') else df.applymap

    # STRING VALUES (might set to NaN)
    df = elementwise(process_values)
    # Uniformize NaN values
    df = df.replace(['', 'nan'], np.nan)

    # Drop all columns having only NaN
    df = df.dropna(axis=1, how='all')

    # Replace column names using the format function
    df = df.rename(columns=process_columns)

    # Filter value if exists
    if 'value' in df.columns:
        df = df.dropna(subset=['value'])

    # Change year type to int
    if 'year' in df.columns:
        # assign() returns a new frame instead of writing a column into the
        # result of dropna(), which may emit a SettingWithCopyWarning.
        df = df.assign(year=df['year'].astype(int))

    return df

## Load and Process Data

### All data

In [17]:
# Read the raw IEA CSV and clean it in one step
df_iea = process_df(pd.read_csv("../../../data/_raw/iea/iea_api_eei.csv"))
df_iea.head()

Unnamed: 0,country,tab,flow,enduse,activity,unit,year,value
69,Australia,Freight transport,SHARE_COUNTRY_EM,Freight trains,,%,2013,0.9
70,Australia,Freight transport,SHARE_COUNTRY_EM,Trucks,,%,2013,8.65
71,Australia,Freight transport,SHARE_FOSSIL,Domestic freight ships,,%,2013,88.52
72,Australia,Freight transport,SHARE_FOSSIL,Freight trains,,%,2013,94.43
73,Australia,Freight transport,SHARE_FOSSIL,Trucks,,%,2013,99.21


In [18]:
# Read the UN M49 reference table and clean it in one step
df_info = process_df(pd.read_csv("../../../data/_info/__INFO_UN_M49_en.csv"))
df_info.head()

Unnamed: 0,global_code,global_name,region_code,region_name,sub-region_code,sub-region_name,intermediate_region_code,intermediate_region_name,country_or_area,m49_code,iso-alpha2_code,iso-alpha3_code,least_developed_countries_(ldc),land_locked_developing_countries_(lldc),small_island_developing_states_(sids)
0,1,World,2.0,Africa,15.0,Northern Africa,,,Algeria,12,DZ,DZA,,,
1,1,World,2.0,Africa,15.0,Northern Africa,,,Egypt,818,EG,EGY,,,
2,1,World,2.0,Africa,15.0,Northern Africa,,,Libya,434,LY,LBY,,,
3,1,World,2.0,Africa,15.0,Northern Africa,,,Morocco,504,MA,MAR,,,
4,1,World,2.0,Africa,15.0,Northern Africa,,,Sudan,729,SD,SDN,x,,


In [20]:
# Mapping data with info_m49_en

# Columns kept after the join, in display order
selected_columns = ['country', 'iso-alpha3_code', 'year', 'tab', 'flow', 'enduse', 'region_name', 'sub-region_name', 'activity', 'unit', 'value']

# Left join on the country name, then keep only the selected columns.
# NOTE(review): rows whose `country` has no exact match in `country_or_area`
# get NaN in the columns coming from df_info — worth checking the coverage.
df_merged = df_iea.merge(
    df_info,
    left_on='country',
    right_on='country_or_area',
    how='left',
)[selected_columns]
df_merged.head()

Unnamed: 0,country,iso-alpha3_code,year,tab,flow,enduse,region_name,sub-region_name,activity,unit,value
0,Australia,AUS,2013,Freight transport,SHARE_COUNTRY_EM,Freight trains,Oceania,Australia and New Zealand,,%,0.9
1,Australia,AUS,2013,Freight transport,SHARE_COUNTRY_EM,Trucks,Oceania,Australia and New Zealand,,%,8.65
2,Australia,AUS,2013,Freight transport,SHARE_FOSSIL,Domestic freight ships,Oceania,Australia and New Zealand,,%,88.52
3,Australia,AUS,2013,Freight transport,SHARE_FOSSIL,Freight trains,Oceania,Australia and New Zealand,,%,94.43
4,Australia,AUS,2013,Freight transport,SHARE_FOSSIL,Trucks,Oceania,Australia and New Zealand,,%,99.21


In [22]:
# Distinct values of the `flow` column in the merged frame
df_merged.loc[:, 'flow'].unique()

array(['SHARE_COUNTRY_EM', 'SHARE_FOSSIL', 'E_FINAL', 'ENERGY_EFF_CARBON',
       'ENERGY_EFF_ENERGY'], dtype=object)

### Final energy

In [23]:
# Keep the E_FINAL flow, sum `value` per grouping key, tag the rows with their
# source and type, and rename the columns for the exported file.
# NOTE(review): groupby drops rows whose key columns are NaN by default, so
# countries without a match in the M49 table would be left out — confirm this
# is intended.
df_final_energy = (
    df_merged[df_merged['flow']=='E_FINAL']
    .drop(columns=['activity'])
    .groupby(['country', 'iso-alpha3_code', 'year', 'tab', 'region_name', 'sub-region_name', 'unit'])['value']
    .sum()
    .reset_index()
    .assign(source='iea', type='final_energy')
    .rename(columns={'iso-alpha3_code':'country_code_a3', 'tab':'sector', 'sub-region_name':'subregion_name'})
)
df_final_energy.to_csv('../../../data/_processed/iea_final_energy.csv', index=False)
df_final_energy.head()

Unnamed: 0,country,country_code_a3,year,sector,region_name,subregion_name,unit,value,source,type
0,Albania,ALB,2000,Industry and services,Europe,Southern Europe,PJ,25.12,iea,final_energy
1,Albania,ALB,2000,Passenger transport,Europe,Southern Europe,PJ,20.14,iea,final_energy
2,Albania,ALB,2000,Residential,Europe,Southern Europe,PJ,15.75,iea,final_energy
3,Albania,ALB,2001,Industry and services,Europe,Southern Europe,PJ,25.65,iea,final_energy
4,Albania,ALB,2001,Passenger transport,Europe,Southern Europe,PJ,21.17,iea,final_energy
