In [286]:
import pandas as pd
import numpy as np
import hvplot.pandas  # noqa
pd.options.plotting.backend = 'holoviews'
import os
import re

## Functions

In [1]:
# Column labels that exclude() rejects by exact match.
exclude_columns = ['iso3166_numeric']

def first_non_null(column):
    """Fill every missing entry of a Series with its first non-null value.

    Parameters
    ----------
    column : pandas.Series
        Series whose missing entries should be filled.

    Returns
    -------
    pandas.Series
        A new Series in which each null entry is replaced by the value found
        at ``column.first_valid_index()``. When the Series contains no valid
        value at all (empty or entirely null) an unmodified copy is returned.
    """
    first_valid = column.first_valid_index()
    if first_valid is None:
        # Nothing to propagate. Indexing with None (what the previous code
        # did in this situation) raises instead of returning the column.
        return column.copy()
    return column.fillna(column.loc[first_valid])

def process_values(value):
    """Clean a single cell value.

    Strings have each run of carriage returns / line feeds replaced by
    ``' - '`` and are then stripped of leading and trailing double quotes,
    single quotes, spaces, backslashes and parentheses. A string that ends
    up empty, and ``None``, become ``np.nan``. Every other value is returned
    unchanged.

    Numeric zero and ``False`` are deliberately preserved: the earlier
    truthiness test (``not value``) turned them into NaN, which made
    legitimate zero measurements look like missing data.

    NOTE(review): stripping parentheses also removes a closing ')' that
    belongs to the text (e.g. 'Virgin Islands (U.S.)' loses its last
    character). Behaviour kept as-is; confirm whether this is wanted.
    """
    if isinstance(value, str):
        # Same character set as before, written with a valid escape for the
        # backslash ('\)' and '\(' are invalid escape sequences in Python).
        cleaned = re.sub(r'[\r\n]+', ' - ', value).strip('"\' \\()')
        return cleaned if cleaned else np.nan
    if value is None:
        return np.nan
    return value



def exclude(column_name):
    """Return whether a column label is flagged for exclusion.

    A label is excluded when its string form starts with ``'dataflag'`` or
    when it is listed in the module-level ``exclude_columns``.

    Returns
    -------
    bool
        ``True`` for excluded labels, ``False`` otherwise. (Previously the
        function fell off the end and returned ``None`` for kept labels.)
    """
    # The two conditions are independent, so their order does not matter.
    if str(column_name).startswith('dataflag'):
        return True
    return column_name in exclude_columns

def process_columns(column_name):
    """Normalise a raw column label to the naming scheme used in this notebook.

    The label is converted to a string, lowercased, and dots / whitespace
    runs are collapsed to single underscores. A fixed, ordered list of
    rewrite rules is then applied (each rule sees the output of the previous
    one), e.g. ``'EDGAR Country Code'`` -> ``'edgar_country_iso3'`` and
    ``'sourceNote'`` -> ``'source_note'``.

    Parameters
    ----------
    column_name : hashable
        Original column label. Non-string labels are coerced with ``str``
        instead of failing on ``.lower()``.

    Returns
    -------
    str
        The normalised label.
    """
    # BASIC
    # The previous version also ran r'([A-Z]+)' over the lowercased text; that
    # pattern could never match, so it is dropped. camelCase splitting is
    # intentionally NOT introduced: the rules below rely on fused lowercase
    # names such as 'countryiso3code'.
    column_name = str(column_name).lower()
    column_name = re.sub(r'(\s+|\.)', r' ', column_name).strip()
    column_name = re.sub(r'(\s+)', r'_', column_name)

    # COUNTRY AND REGION
    column_name = re.sub(r'(iso[2-3])code', r'\1', column_name)
    column_name = re.sub(r'(country)(iso[2-3])', r'\1_\2', column_name)
    column_name = re.sub(r'(country)(region)', r'\2', column_name)
    column_name = re.sub(r'(?<=region)(?=type|name)', '_', column_name)
    column_name = re.sub(r'(?<=region)(?=id)', '_', column_name)
    column_name = re.sub(r'(?<=type)(?=id|name)', '_', column_name)

    # WB specific
    column_name = re.sub(r'(?<=source)(?=organization|note)', '_', column_name)
    column_name = re.sub(r'(?<=data)(?=availability)', '_', column_name)

    # EIA specific
    if column_name == 'period':
        column_name = 'year'

    # UNIT
    column_name = re.sub(r'(?<=unit)(?=name)', '_', column_name)

    # MISC
    if column_name == 'var':
        column_name = 'variable'

    # UID
    if column_name == 'code':
        column_name = 'uid'
    # Substring replacement: also rewrites prefixed names ('edgar_country_code').
    column_name = column_name.replace('country_code', 'country_iso3')

    # DATE RELATED
    column_name = re.sub(r'(?<=last)(?=updated)', '_', column_name)

    return column_name

# FORMAT DF:
def process_df(df):
    """Clean a raw DataFrame and normalise its columns.

    Steps, in order:
      1. Clean every cell with ``process_values``; treat ``''`` and ``'nan'``
         as missing.
      2. Drop columns that are entirely missing.
      3. Rename columns with ``process_columns``.
      4. Drop rows without a ``value`` (when that column exists).
      5. Split a ``date`` column of the form ``YYYY-MM-DD`` into ``year`` and
         ``month``; when no row matches that pattern, ``date`` is simply
         renamed to ``year``.
      6. Cast ``year`` (and ``month``, if present) to ``int``.
      7. Copy ``unit`` to ``unit_name`` when only ``unit`` exists.
      8. WB specific: fold ``country_id`` into ``country_iso3`` and drop
         ``country_id`` / ``country_value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. It is not modified; every step works on a new frame.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.
    """
    # PROCESSING
    # STRING VALUES (might set to NaN)
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; fall back to applymap on older versions. The check is
    # made on the class so a column called 'map' cannot be mistaken for it.
    if hasattr(pd.DataFrame, 'map'):
        df = df.map(process_values)
    else:
        df = df.applymap(process_values)
    # Uniformize NaN values
    df = df.replace(['', 'nan'], np.nan)

    # Drop all columns having only NaN
    df = df.dropna(axis=1, how='all')

    # Replace column names using the format function
    df = df.rename(columns=process_columns)

    # Filter value if exists
    if 'value' in df.columns:
        df = df.dropna(subset=['value'])

    # DATE: Split the 'date' using regex if it matches the pattern
    # Date column format: YYYY-MM-DD OR YYYY
    if 'date' in df.columns:
        date_parts = df['date'].astype(str).str.extract(r'^(\d{4})-(\d{2})-\d{2}$')
        # assign() builds a new frame, so nothing is written into the result
        # of dropna() above (which can trigger SettingWithCopyWarning).
        df = df.assign(year=date_parts[0], month=date_parts[1])
        # Drop all columns having only NaN AGAIN due to previously added columns
        df = df.dropna(axis=1, how='all')

        if 'year' in df.columns:
            df = df.drop(columns=['date'])
        else:
            # Date is in YYYY format
            df = df.rename(columns={'date': 'year'})

    if 'year' in df.columns:
        df = df.astype({'year': int})
        if 'month' in df.columns:
            df = df.astype({'month': int})

    # Unit
    if 'unit' in df.columns and 'unit_name' not in df.columns:
        df = df.assign(unit_name=df['unit'])

    # WB SPECIFIC
    # Manage inconsistent country/region iso2 or iso3 in indicators
    if 'country_id' in df.columns and 'country_value' in df.columns:
        if 'country_iso3' in df.columns:
            iso3 = df['country_iso3'].fillna(df['country_id'])
        else:
            iso3 = df['country_id']
        df = df.assign(country_iso3=iso3).drop(columns=['country_id', 'country_value'])

    return df

In [288]:
csv_file_path = "../../data/all.csv"
# With the default NA handling pandas reads the literal text "NA" as a missing
# value, which would blank out a two-letter code such as Namibia's "NA" in the
# alpha-2 column. Only empty fields are treated as missing here.
world = pd.read_csv(csv_file_path, keep_default_na=False, na_values=[''])
world.head()

Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,


## edgar_file_em_tot_co2eq_sect

In [289]:
# Raw EDGAR table with one value per sector, country and year.
csv_file_path = "../../data/_raw/edgar/edgar_file_em_tot_co2eq_sect.csv"
df_0 = pd.read_csv(filepath_or_buffer=csv_file_path)
df_0.head(n=5)

Unnamed: 0,Sector,EDGAR Country Code,Country,year,value
0,Buildings,ABW,Aruba,1970,0.041855
1,Other industrial combustion,ABW,Aruba,1970,0.00071
2,Other sectors,ABW,Aruba,1970,0.020696
3,Power Industry,ABW,Aruba,1970,0.034778
4,Transport,ABW,Aruba,1970,0.004197


In [290]:
# Run both tables through the shared cleaning / renaming pipeline.
df_0, world = process_df(df_0), process_df(world)

### Data analysis
* In the EDGAR data, the CO2eq emissions of Serbia and Montenegro are reported together under a single code (`SCG`).
* The Netherlands Antilles (`ANT`) include Saba, Sint Eustatius, Sint Maarten, Aruba (until 1986), Bonaire and Curaçao.

In [291]:
# Names from the world table whose alpha-3 code never occurs in the EDGAR data.
world.loc[~world["alpha-3"].isin(df_0["edgar_country_iso3"]), "name"].drop_duplicates().tolist()

['Åland Islands',
 'American Samoa',
 'Andorra',
 'Antarctica',
 'Bonaire, Sint Eustatius and Saba',
 'Bouvet Island',
 'British Indian Ocean Territory',
 'Christmas Island',
 'Cocos (Keeling) Islands',
 'Curaçao',
 'French Southern Territories',
 'Guam',
 'Guernsey',
 'Heard Island and McDonald Islands',
 'Holy See',
 'Isle of Man',
 'Jersey',
 'Liechtenstein',
 'Marshall Islands',
 'Mayotte',
 'Micronesia (Federated States of',
 'Monaco',
 'Montenegro',
 'Montserrat',
 'Nauru',
 'Niue',
 'Norfolk Island',
 'Northern Mariana Islands',
 'Palestine, State of',
 'Pitcairn',
 'Saint Barthélemy',
 'Saint Martin (French part',
 'San Marino',
 'Serbia',
 'Sint Maarten (Dutch part',
 'South Georgia and the South Sandwich Islands',
 'South Sudan',
 'Svalbard and Jan Mayen',
 'Tokelau',
 'Tuvalu',
 'United States Minor Outlying Islands',
 'Virgin Islands (U.S.',
 'Wallis and Futuna']

In [292]:
# EDGAR codes that have no matching alpha-3 code in the world table.
df_0.loc[~df_0["edgar_country_iso3"].isin(world["alpha-3"]), "edgar_country_iso3"].drop_duplicates().tolist()

['AIR', 'ANT', 'SCG', 'SEA']

In [293]:
# Lookup table: alpha-3 code -> region name, taken from the world table.
dict_iso_region = {
    iso3: region
    for iso3, region in zip(world["alpha-3"], world["region"])
}

In [294]:
## add region column
# Series.map leaves codes that are missing from dict_iso_region as NaN (here
# the AIR / SEA entries) instead of raising KeyError for any code that is not
# in a hand-maintained exception list.
df_0["region"] = df_0["edgar_country_iso3"].map(dict_iso_region)
# EDGAR-only codes without an entry in the world table get their region by hand.
df_0.loc[df_0["edgar_country_iso3"] == "SCG", "region"] = "Europe"
df_0.loc[df_0["edgar_country_iso3"] == "ANT", "region"] = "Americas"
df_0.head()

Unnamed: 0,sector,edgar_country_iso3,country,year,value,region
0,Buildings,ABW,Aruba,1970,0.041855,Americas
1,Other industrial combustion,ABW,Aruba,1970,0.00071,Americas
2,Other sectors,ABW,Aruba,1970,0.020696,Americas
3,Power Industry,ABW,Aruba,1970,0.034778,Americas
4,Transport,ABW,Aruba,1970,0.004197,Americas


#### Total emission of CO2eq (kilotonne) by sector and region

In [295]:
# One curve per country; the sector and region are picked with the widgets.
df_0.hvplot(
    x='year',
    by='country',
    groupby=['sector', 'region'],
    widget_location='left_top',
    title='Total emission of CO2eq(kilotonne)',
)

#### Total emission of CO2eq (kilotonne) by sector and country

In [296]:
# Sector and country are used both as widget dimensions and as the curve grouping.
df_0.hvplot(
    x='year',
    by=['sector', 'country'],
    groupby=['sector', 'country'],
    widget_location='left_top',
    title='Total emission of CO2eq(kilotonne)',
)

#### Total emission of CO2eq (kilotonne) by country and by sector

In [297]:
# The country is picked with the widget; curves are split by country and sector.
df_0.hvplot(
    x='year',
    by=['country', 'sector'],
    groupby=['country'],
    widget_location='left_top',
    title='Total emission of CO2eq by kilotonne',
)

## edgar_file_em_tot_co2eq

In [298]:
# Read the EDGAR totals table and clean it in one step.
csv_file_path = "../../data/_raw/edgar/edgar_file_em_tot_co2eq.csv"
df_1 = process_df(pd.read_csv(csv_file_path))
df_1.head()

Unnamed: 0,edgar_country_iso3,country,year,value
0,AIR,International Aviation,1970,171.869542
1,SEA,International Shipping,1970,375.595206
2,AFG,Afghanistan,1970,17.142304
3,ALB,Albania,1970,8.117479
4,DZA,Algeria,1970,55.065044


In [299]:
# Series.map leaves codes that are missing from dict_iso_region as NaN (here
# AIR = International Aviation and SEA = International Shipping) instead of
# raising KeyError for any code that is not in a hand-maintained exception list.
df_1["region"] = df_1["edgar_country_iso3"].map(dict_iso_region)
# EDGAR-only codes without an entry in the world table get their region by hand.
df_1.loc[df_1["edgar_country_iso3"] == "SCG", "region"] = "Europe"
df_1.loc[df_1["edgar_country_iso3"] == "ANT", "region"] = "Americas"
df_1.head()

Unnamed: 0,edgar_country_iso3,country,year,value,region
0,AIR,International Aviation,1970,171.869542,
1,SEA,International Shipping,1970,375.595206,
2,AFG,Afghanistan,1970,17.142304,Asia
3,ALB,Albania,1970,8.117479,Europe
4,DZA,Algeria,1970,55.065044,Africa


In [300]:
# Sum the emissions per region and year.
# The column is selected explicitly: in sum('value') the string was bound to
# the first parameter of GroupBy.sum (numeric_only), not used as a column name.
# as_index=False returns region / year as ordinary columns, so no in-place
# reset_index is needed. Rows without a region are left out by groupby.
df_1_region = df_1.groupby(["region", "year"], as_index=False)["value"].sum()
df_1_region.head()

Unnamed: 0,region,year,value
0,Africa,1970,1091.67149
1,Africa,1971,1104.806622
2,Africa,1972,1155.278159
3,Africa,1973,1231.434852
4,Africa,1974,1254.568828


#### Total emission of CO2eq (kilotonne) by country

In [301]:
# Yearly totals; the country is picked with the widget.
df_1.hvplot(
    x='year',
    by='country',
    groupby=['country'],
    widget_location='left_top',
    title='Total emission of CO2eq(kilotonne)',
)

#### Total emission of CO2eq (kilotonne) by region

In [302]:
# Yearly totals of the aggregated frame; the region is picked with the widget.
df_1_region.hvplot(
    x='year',
    by='region',
    groupby=['region'],
    widget_location='left_top',
    title='Total emission of CO2eq(kilotonne)',
)