In [13]:
import pandas as pd
import numpy as np
import os
from importlib.machinery import SourceFileLoader

path_dependencies = '/Users/ejoiner/OneDrive - rff/Documents/RFF Organization/Research Documents/WCPD/ECP/_code/compilation/_dependencies/dep_ecp'
path_ghg = '/Users/ejoiner/OneDrive - rff/ecp/ecp_dataset/source_data/ghg_inventory/raw'

ecp_general = SourceFileLoader('general_func', path_dependencies+'/ecp_v3_gen_func.py').load_module()


In [14]:
# Country names
iea_wb_map = {'Australi':'Australia', 
            'Bosniaherz':'Bosnia and Herzegovina',
            'Brunei':'Brunei Darussalam', 
            'Congo':'Congo, Rep.', 
            'Congorep':'Congo, Dem. Rep.',
            'Costarica':'Costa Rica',
            'Coteivoire':"Cote d'Ivoire", 
            'Czech':'Czech Republic',
            'Dominicanr':'Dominican Republic',
            'Egypt':'Egypt, Arab Rep.', 
            'Elsalvador':'El Salvador',
            'Eqguinea':'Equatorial Guinea',
            'Eswatini':'Lesotho', 
            'Hongkong':'Hong Kong, SAR', 
            'Iran':'Iran, Islamic, Rep.', 
            'Korea':'Korea, Rep.', 
            'Koreadpr':'Korea, Dem. Rep.', 
            'Kyrgyzstan':'Kyrgyz Republic', 
            'Lao':'Lao PDR', 
            'Luxembou':'Luxembourg',
            'Nethland':'Netherlands',
            'Northmaced':'North Macedonia',
            'Nz':'New Zealand',
            'Philippine':'Philippines',
            'Russia':'Russian Federation',
            'Saudiarabi':'Saudi Arabia', 
            'Slovakia':'Slovak Republic',
            'Southafric':'South Africa',
            'Srilanka':'Sri Lanka', 
            'Ssudan':'South Sudan', 
            'Switland':'Switzerland', 
            'Syria':'Syrian Arab Republic', 
            'Turkmenist':'Turkmenistan',
            'Uae':'United Arab Emirates',
            'Uk':'United Kingdom',
            'Usa':'United States',
            'Venezuela':'Venezuela, RB',
            'Yemen':'Yemen, Rep.'}

In [77]:
gas = "CH4"

df = pd.read_table(path_ghg+'/national/IEA/iea_energy_ghg_emissions/2024_edition/WORLD_GHG.TXT',
                        sep = " ", names=["jurisdiction", "Product", "year", "FLOWname", "gas", "Value", "add_drop"])
    # restrict to gas
df = df[df['gas'] == gas]
    
df.rename(columns= {"Value": gas}, inplace = True)
    
df.drop(columns = ["add_drop", "gas"], inplace = True)
    
      # Filter out memo items (aggregates)
memoAggregates = ['OECDAM', 'OECDAO', 'OECDEUR', 'OECDTOT', 'OTHERAFRIC' 'OTHERASIA' 'OTHERLATIN',
                      'IEATOT', 'ANNEX2NA', 'ANNEX2EU', 'ANNEX2AO', 'ANNEX2', 'MG7', 'AFRICA',
                      'UNAFRICA', 'MIDEAST', 'EURASIA', 'LATAMER', 'ASIA', 'CHINAREG', 'NOECDTOT',
                      'IEAFAMILY', 'WORLDAV', 'WORLDMAR', 'WORLD', 'UNAMERICAS', 'UNASIATOT',
                      'UNEUROPE', 'UNOCEANIA', 'EU28', 'ANNEX1', 'ANNEX1EIT', 'NONANNEX1', 'ANNEXB',
                      'MYUGO', 'MFSU15', 'MG8', 'MG20', 'OPEC', 'MASEAN', 'EU27_2020', 'MBURKINAFA',
                      'MCHAD', 'MMAURITANI', 'MPALESTINE', 'MMALI', 'MGREENLAND', 'FSUND']

df = df[~df.jurisdiction.isin(memoAggregates)]
df["jurisdiction"] = df["jurisdiction"].str.capitalize()

        # Country names replacement
df["jurisdiction"].replace(to_replace=iea_wb_map, inplace=True)


        # Add Flow codes to dataframe
flowCodes = pd.read_csv('/Users/ejoiner/OneDrive - rff/Documents/RFF Organization/Research Documents/WCPD/ECP/_raw/_aux_files/iea_ukds_FLOWcodes.csv',
                                usecols=[0,1])
df = df.merge(flowCodes, on='FLOWname', how='left')

# Add ipcc codes
ipccCodes = pd.read_csv('/Users/ejoiner/OneDrive - rff/Documents/RFF Organization/Research Documents/WCPD/ECP/_raw/_aux_files/ipcc2006_iea_code_update.csv')
ipccCodes.rename(columns={"Product ": "Product"}, inplace=True)

df = df.merge(ipccCodes, on=["Product", "FLOWname"], how='left')

df.rename(columns= {"IPCC_CODE": "ipcc_code", "IPCC_CODE2 ": "ipcc_code2", "IPCC_CODE3": "ipcc_code3"}, inplace = True)

df["Source"] = "IEA"

df = df.replace({"Product": {"TOTAL": "Total", "OIL": "Oil", "COAL": "Coal", "NATGAS": "Natural gas", "BIOPROD": "Bioprod", "OTHER": "Other"},
                 "CH4": {"..": "0", "x": "0", "c": "0"}})

df["CH4"] = df["CH4"].astype(str)
    

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["jurisdiction"].replace(to_replace=iea_wb_map, inplace=True)


In [78]:
#EDGAR DATA 
# format ipcc_code and year columns
edgar_ghg = pd.read_csv('/Users/ejoiner/OneDrive - rff/ecp/ecp_dataset/source_data/ghg_inventory/processed/ghg_national_total_ipcc.csv')

edgar_ghg = edgar_ghg[["jurisdiction", "year", "ipcc_code", gas]]

edgar_ghg["Source"] = "EDGAR"
## remove all energy related values, as those are covered by the IEA data

edgar_ghg = edgar_ghg[~edgar_ghg['ipcc_code'].astype(str).str.startswith('1')]

edgar_ghg["CH4"] = edgar_ghg["CH4"].astype(str)


In [None]:
combined_nat = pd.concat([df, edgar_ghg], ignore_index=True)

In [35]:
# CREATE WCPD 

gases = ["CO2"] # "CH4", "N2O", "FGASES" 

path_wcpd = '/Users/ejoiner/OneDrive - rff/Documents/RFF Organization/Research Documents/WCPD/WorldCarbonPricingDatabase/_dataset/data'
ecp_cov_fac = SourceFileLoader('coverage_factors', path_dependencies+'/ecp_v3_coverageFactors.py').load_module()
ecp_overlap = SourceFileLoader('overlap', path_dependencies+'/ecp_v3_overlap.py').load_module()

wcpd = {}

for gas in gases: 

    # LOAD WCPD DATAFRAMES

    wcpd_ctry = ecp_general.concatenate(path_wcpd+"/"+gas+"/national")
    wcpd_subnat = ecp_general.concatenate(path_wcpd+"/"+gas+"/subnational")
    wcpd_all = pd.concat([wcpd_ctry, wcpd_subnat]).sort_values(by=["jurisdiction", "year"])

    wcpd_all = wcpd_all.drop_duplicates(['jurisdiction', 'year', 'ipcc_code', 'Product']) # duplicates from WCPD need to be corrected

    # ADD COLUMN WITH IEA SECTOR CODES
    ipcc_iea_map = pd.read_csv("/Users/ejoiner/OneDrive - rff/Documents/RFF Organization/Research Documents/WCPD/ECP/_raw/_aux_files/ipcc2006_iea_category_codes.csv", 
                    usecols=["ipcc_code", "FLOW"])
    
    # goal should be to replace "FLOW" with IEA codes 
    
    ipcc_iea_map.columns = ["ipcc_code", "iea_code"]


    wcpd_all = wcpd_all.merge(ipcc_iea_map, on=["ipcc_code"], how="left")

    # LISTS OF JURISDICTION NAMES

    ctry_names = list(wcpd_ctry.jurisdiction.unique())
    subnat_names = list(wcpd_subnat.jurisdiction.unique())

    std_ctry_names = [x.replace(".", "").replace(",", "").replace(" ", "_") for x in ctry_names]
    countries_dic = dict(zip(ctry_names, std_ctry_names))

    std_subnat_names = [x.replace(".", "").replace(",", "").replace(" ", "_") for x in subnat_names]
    subnat_dic = dict(zip(subnat_names, std_subnat_names))

    if len(wcpd_all[wcpd_all.duplicated(['jurisdiction', 'year', 'ipcc_code', 'Product'], keep=False)] != 0):
        print("The dataset contains duplicates!")

    # ADD COVERAGE FACTORS 

    wcpd_all = ecp_cov_fac.coverageFactors(wcpd_all, gas)

    # MECHANISM OVERLAP 
    overlap = pd.read_csv("/Users/ejoiner/OneDrive - rff/Documents/RFF Organization/Research Documents/WCPD/WorldCarbonPricingDatabase/_raw/overlap/overlap_mechanisms_"+gas+".csv")

    wcpd_all = ecp_overlap.overlap(wcpd_all, overlap)

    wcpd[gas] = wcpd_all


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inst_df_ids.loc[:, "overlap_"+i[0]+"_"+i[1]] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inst_df_ids.loc[:, "overlap_"+i[0]+"_"+i[1]+"_ids"] = inst_df_ids.loc[:, scheme_columns[i[0]]] + inst_df_ids.loc[:, scheme_columns[i[1]]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inst_df_ids.loc[:,

In [79]:
inventory_gas_nat = wcpd_all[wcpd_all["jurisdiction"].isin(ctry_names)][
        ["jurisdiction", "year", "ipcc_code", "iea_code", "Product"]].copy()

inventory_gas_nat[["iea_code", "Product"]] = inventory_gas_nat[["iea_code", "Product"]].fillna("NA")


In [80]:
# MERGE IEA DATA 
 
inventory_gas_nat = inventory_gas_nat.merge(
        df,
        on=["jurisdiction", "year", "ipcc_code", "Product"],
        how="left"
    )

# MERGE EDGAR DATA 

inventory_gas_nat = inventory_gas_nat.merge(
        edgar_ghg,
        on=["jurisdiction", "year", "ipcc_code"],
        how="left"
    )

inventory_gas_nat["CH4"] = inventory_gas_nat["CH4_x"].fillna("")  + inventory_gas_nat["CH4_y"].fillna("") 
inventory_gas_nat["Source"] = inventory_gas_nat["Source_x"].fillna("") + inventory_gas_nat["Source_y"].fillna("")
inventory_gas_nat = inventory_gas_nat.drop(columns = ["CH4_x","CH4_y","Source_x", "Source_y"])