In [1]:
import pandas as pd
import numpy as np
import hvplot.pandas  # noqa
import os
import sys
import re
pd.options.plotting.backend = 'holoviews'

## Utils

In [2]:
# Function to replace a substring in a string column
def process_values(value):
    if isinstance(value, str):
        value = re.sub(r'[\r\n]+', ' - ', value).strip('"\' \)\()')

    return np.nan if not value else value


# Clean columns content
def process_columns(column_name):
    # return column_name

    # BASIC
    column_name = re.sub(r'([A-Z]+)', r' \1', column_name.lower())
    column_name = re.sub(r'(\s+|\.)', r' ', column_name).strip()
    column_name = re.sub(r'(\s+)', r'_', column_name)

    # COUNTRY AND REGION
    column_name = re.sub(r'(iso[2-3])code', r'\1', column_name)
    column_name = re.sub(r'(country)(iso[2-3])', r'\1_\2', column_name)
    column_name = re.sub(r'(country)(region)', r'\2', column_name)
    column_name = re.sub(r'(?<=region)(?=type|name)', '_', column_name)
    column_name = re.sub(r'(?<=region)(?=id)', '_', column_name)
    column_name = re.sub(r'(?<=type)(?=id|name)', '_', column_name)

    column_name = column_name.replace('country_code', 'country_iso3')

    # DATE RELATED
    column_name = re.sub(r'(?<=last)(?=updated)', '_', column_name)

    return column_name

# FORMAT DF:
def process_df(df):
    # PROCESSING
    # STRING VALUES (might set to NaN)
    df = df.applymap(process_values)
    # Uniformize NaN values
    df = df.replace(['', 'nan'], np.nan)

    # Drop all columns having only NaN
    df = df.dropna(axis=1, how='all')

    # Replace column names using the format function
    df = df.rename(columns=process_columns)

    # Filter value if exists
    if ('value' in df.columns):
        df = df.dropna(subset=['value'])
    
    # Change year type to int
    if ('year' in df.columns):
        df['year'] = df['year'].astype(int)   

    return df

## Load and Process Data

In [3]:
df_edgar_tot_sect = pd.read_csv("../../data/_raw/edgar/edgar_file_em_tot_co2eq_sect.csv")
df_edgar_tot_sect = process_df(df_edgar_tot_sect)
df_edgar_tot_sect.head()

Unnamed: 0,sector,edgar_country_iso3,country,year,value
0,Buildings,ABW,Aruba,1970,0.041855
1,Other industrial combustion,ABW,Aruba,1970,0.00071
2,Other sectors,ABW,Aruba,1970,0.020696
3,Power Industry,ABW,Aruba,1970,0.034778
4,Transport,ABW,Aruba,1970,0.004197


In [4]:
df_info = pd.read_csv("../../data/_info/__INFO_UN_M49_en.csv")
df_info = process_df(df_info)
df_info.head()

Unnamed: 0,global_code,global_name,region_code,region_name,sub-region_code,sub-region_name,intermediate_region_code,intermediate_region_name,country_or_area,m49_code,iso-alpha2_code,iso-alpha3_code,least_developed_countries_(ldc),land_locked_developing_countries_(lldc),small_island_developing_states_(sids)
0,1,World,2.0,Africa,15.0,Northern Africa,,,Algeria,12,DZ,DZA,,,
1,1,World,2.0,Africa,15.0,Northern Africa,,,Egypt,818,EG,EGY,,,
2,1,World,2.0,Africa,15.0,Northern Africa,,,Libya,434,LY,LBY,,,
3,1,World,2.0,Africa,15.0,Northern Africa,,,Morocco,504,MA,MAR,,,
4,1,World,2.0,Africa,15.0,Northern Africa,,,Sudan,729,SD,SDN,x,,


In [6]:
# Mapping data with info_m49_en

# Left joining
df_edgar_tot_sect_region = df_edgar_tot_sect.merge(df_info, left_on='edgar_country_iso3', right_on='iso-alpha3_code', how='left')

# Filter columns
selected_columns = ['year', 'sector', 'edgar_country_iso3', 'country', 'value', 'region_name', 'sub-region_name']
df_edgar_tot_sect_region = df_edgar_tot_sect_region[selected_columns]

# Add unit column 
df_edgar_tot_sect_region['unit'] = 'mtcoc2eq'

df_edgar_tot_sect_region.head()

Unnamed: 0,year,sector,edgar_country_iso3,country,value,region_name,sub-region_name,unit
0,1970,Buildings,ABW,Aruba,0.041855,Americas,Latin America and the Caribbean,mtcoc2eq
1,1970,Other industrial combustion,ABW,Aruba,0.00071,Americas,Latin America and the Caribbean,mtcoc2eq
2,1970,Other sectors,ABW,Aruba,0.020696,Americas,Latin America and the Caribbean,mtcoc2eq
3,1970,Power Industry,ABW,Aruba,0.034778,Americas,Latin America and the Caribbean,mtcoc2eq
4,1970,Transport,ABW,Aruba,0.004197,Americas,Latin America and the Caribbean,mtcoc2eq


## Data Visualization

In [19]:
df_edgar_tot_sect_region.hvplot(x='year', by=['country','sector'], groupby =['country', 'sector'], widget_location='left_top', title='Total emission of CO2eq(Mtonne)')

In [25]:
df_edgar_tot_sect_region.hvplot(x='year', by=['sector'], groupby =['country'], widget_location='left_top', title='Total emission of CO2eq(Mtonne)')

In [27]:
df_vis = df_edgar_tot_sect_region.groupby(['year', 'sector', 'region_name'])[['value']].sum().reset_index()
df_vis.hvplot(x='year', by=['sector', 'region_name'], groupby =['sector', 'region_name'], widget_location='left_top', title='Total emission of CO2eq(Mtonne)')

In [28]:
df_vis.hvplot(x='year', by=['sector'], groupby =['region_name'], widget_location='left_top', title='Total emission of CO2eq(Mtonne)')

In [24]:
df_vis = df_edgar_tot_sect_region.groupby(['year', 'edgar_country_iso3', 'country'])[['value']].sum().reset_index()
df_vis.hvplot(x='year', by='country', groupby =['country'], widget_location='left_top', title='Total emission of CO2eq(Mtonne)')

In [13]:
df_vis = df_edgar_tot_sect_region.groupby(['year', 'region_name'])[['value']].sum().reset_index()
df_vis.hvplot(x='year', by='region_name', groupby =['region_name'], widget_location='left_top', title='Total emission of CO2eq(Mtonne)')

In [14]:
df_vis = df_edgar_tot_sect_region.groupby(['year', 'sub-region_name'])[['value']].sum().reset_index()
df_vis.hvplot(x='year', by='sub-region_name', groupby =['sub-region_name'], widget_location='left_top', title='Total emission of CO2eq(Mtonne)')

In [16]:
df_vis = df_edgar_tot_sect_region.groupby(['year', 'sector'])[['value']].sum().reset_index()
df_vis.hvplot(x='year', by='sector', groupby =['sector'], widget_location='left_top', title='Total emission of CO2eq(Mtonne)')