In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import ipywidgets as widgets
from ipywidgets import interact 
import hvplot.pandas  # noqa
pd.options.plotting.backend = 'holoviews'
import os
import io
import re

## Functions

In [2]:
exclude_columns = [
    'iso3166_numeric'
    
]

# Function to fill each column with the first non-null value
def first_non_null(column):
    first_non_null = column.first_valid_index()
    return column.fillna(column[first_non_null])

# Function to replace a substring in a string column
def process_values(value):   
    if isinstance(value, str):
        value = re.sub(r'[\r\n]+', ' - ', value).strip('"\' \)\()')
    
    return np.nan if not value else value


def exclude(column_name):
    if column_name in exclude_columns:
        return True
    if str(column_name).startswith('dataflag'):
        return True

def process_columns(column_name):
    #return column_name
    
    # BASIC 
    column_name = re.sub(r'([A-Z]+)', r' \1', column_name.lower())
    column_name = re.sub(r'(\s+|\.)', r' ', column_name).strip()
    column_name = re.sub(r'(\s+)', r'_', column_name)   
    
    # COUNTRY AND REGION 
    column_name = re.sub(r'(iso[2-3])code', r'\1', column_name)
    column_name = re.sub(r'(country)(iso[2-3])', r'\1_\2', column_name)
    column_name = re.sub(r'(country)(region)', r'\2', column_name)
    column_name = re.sub(r'(?<=region)(?=type|name)', '_', column_name)
    column_name = re.sub(r'(?<=region)(?=id)', '_', column_name)
    column_name = re.sub(r'(?<=type)(?=id|name)', '_', column_name)

    
    # WB specific
    column_name = re.sub(r'(?<=source)(?=organization|note)', '_', column_name)
    column_name = re.sub(r'(?<=data)(?=availability)', '_', column_name)
    
    # EIA specific
    if (column_name=='period'):
        column_name = column_name.replace('period','year')   
            
    # UNIT
    column_name = re.sub(r'(?<=unit)(?=name)', '_', column_name)
    
    # MISC
    if (column_name=='var'):
        column_name = column_name.replace('var','variable')
            
    # UID
    if (column_name=='code'):
        column_name = re.sub(r'code', 'uid', column_name) 
    #column_name = column_name.replace('country_id','country_iso2')  
    column_name = column_name.replace('country_code','country_iso3')  
 
    # DATE RELATED    
    column_name = re.sub(r'(?<=last)(?=updated)', '_', column_name) 
    
    return column_name

# FORMAT DF: 
def process_df(df):
    # PROCESSING 
    # STRING VALUES (might set to NaN)
    df = df.applymap(process_values) 
    # Uniformize NaN values
    df = df.replace(['', 'nan'], np.nan)  
                
    # Drop all columns having only NaN
    df = df.dropna(axis=1, how='all')  

    # Replace column names using the format function
    df = df.rename(columns=process_columns)

    # Filter value if exists
    if ('value' in df.columns):
        df = df.dropna(subset=['value'])

    # DATE: Split the 'date' using regex if it matches the pattern
    # Date column format: YYYY-MM-DD OR YYYY
    if ('date' in df.columns):
        df[['year', 'month']] = df['date'].astype(str).str.extract(r'^(\d{4})-(\d{2})-\d{2}$').fillna(np.nan)
        # Drop all columns having only NaN AGAIN due to previously added columns
        df = df.dropna(axis=1, how='all') 
        
        if ('year' in df.columns):
            df = df.drop(columns=['date'])
        else:
            # Date is in YYYY format
            df = df.rename(columns={'date': 'year'})   
    
    if ('year' in df.columns):
            df['year']=df['year'].astype(int)
            if ('month' in df.columns):
                df['month']=df['month'].astype(int)    
    
    # Unit
    if ('unit' in df.columns and not 'unit_name' in df.columns ):
        df['unit_name']=df['unit']  
    
    # WB SPECIFIC    
    # Manage inconsistent country/region iso2 or iso3 in indicators
    if ('country_id' in df.columns and 'country_value' in df.columns ):
        if ('country_iso3' in df.columns):
            df['country_iso3'] = df['country_iso3'].fillna(df['country_id'])
        else:
            df['country_iso3']=df['country_id']
        df = df.drop(columns=['country_id','country_value'])
            
    return df

## edgar_file_em_tot_co2eq_sect

In [3]:
csv_file_path = "../../data/_raw/edgar/edgar_file_em_tot_co2eq_sect.csv"
df_0 = pd.read_csv(csv_file_path)
df_0.head()

Unnamed: 0,Sector,EDGAR Country Code,Country,year,value
0,Buildings,ABW,Aruba,1970,0.041855
1,Other industrial combustion,ABW,Aruba,1970,0.00071
2,Other sectors,ABW,Aruba,1970,0.020696
3,Power Industry,ABW,Aruba,1970,0.034778
4,Transport,ABW,Aruba,1970,0.004197


In [4]:
# Uniformize NaN
df_0 = df_0.replace(['', 'nan'], np.nan)
# Drop all columns having only NaN
df_0 = df_0.dropna(axis=1, how='all')  

# Replace column names using the format function
df_0 = df_0.rename(columns=process_columns)

# Filter value if exists
if ('value' in df_0.columns):
    df_0 = df_0.dropna(subset=['value'])

In [5]:
df_0.head()

Unnamed: 0,sector,edgar_country_iso3,country,year,value
0,Buildings,ABW,Aruba,1970,0.041855
1,Other industrial combustion,ABW,Aruba,1970,0.00071
2,Other sectors,ABW,Aruba,1970,0.020696
3,Power Industry,ABW,Aruba,1970,0.034778
4,Transport,ABW,Aruba,1970,0.004197


In [6]:
df_0.hvplot(x='year', groupby =['sector', 'country'], widget_location='left_top', title='Total emission of CO2eq by kilotonne')

In [7]:
df_0.hvplot(x='year', by='sector', groupby =['country'], widget_location='left_top', title='Total emission of CO2eq by kilotonne')

## edgar_file_em_tot_co2eq

In [8]:
csv_file_path = "../../data/_raw/edgar/edgar_file_em_tot_co2eq.csv"
df_1 = pd.read_csv(csv_file_path)
# Uniformize NaN
df_1 = df_1.replace(['', 'nan'], np.nan)
# Drop all columns having only NaN
df_1 = df_1.dropna(axis=1, how='all')  

# Replace column names using the format function
df_1 = df_1.rename(columns=process_columns)

# Filter value if exists
if ('value' in df.columns):
    df_1 = df_1.dropna(subset=['value'])
df_1.head()

NameError: name 'df' is not defined

In [None]:
df_1.hvplot(x='year', groupby =['country'], widget_location='left_top', title='Total emission of CO2eq by kilotonne')