In [5]:
import pandas as pd
import numpy as np
import os
import re
import requests
from pathlib import Path
import os
import glob
from pathlib import Path
from itertools import chain
from zipfile import ZipFile
from io import BytesIO

In [None]:
# concatenate function from gen func file 

def concatenate(indir):
    """Read every ``*.csv`` file directly inside ``indir`` and stack them row-wise.

    Parameters
    ----------
    indir : str or os.PathLike
        Directory to scan (not recursive).

    Returns
    -------
    pandas.DataFrame
        Rows of all CSV files concatenated vertically, in sorted file-path
        order. Each file keeps its own row index, so index labels can repeat
        across files.

    Raises
    ------
    FileNotFoundError
        If ``indir`` is not an existing directory.
    ValueError
        If ``indir`` contains no ``*.csv`` files.
    """
    directory = os.fspath(indir)
    if not os.path.isdir(directory):
        raise FileNotFoundError(f"Not a directory: {directory!r}")

    # Glob with a full pattern instead of os.chdir(): changing the working
    # directory is a process-wide side effect that breaks any later call made
    # with a relative path. glob.escape() keeps special characters in the
    # directory name (e.g. '[' or '*') from being read as wildcards.
    pattern = os.path.join(glob.escape(directory), "*.csv")

    # glob returns files in arbitrary order; sorting makes the row order
    # reproducible across machines and runs.
    csv_paths = sorted(glob.glob(pattern))
    if not csv_paths:
        raise ValueError(f"No CSV files found in {directory!r}")

    frames = [pd.read_csv(csv_path, header=0) for csv_path in csv_paths]

    # axis=0 stacks the frames vertically.
    return pd.concat(frames, axis=0)


In [None]:
## Concordance between the IPCC codes used in the EDGAR data and the IPCC codes used in WCPD.
## Each list below holds the EDGAR-side codes (IPCC_CODE_2006_FOR_STANDARD_REPORT) confirmed for one gas.

# EDGAR N2O -- confirmed IPCC_CODE_2006_FOR_STANDARD_REPORT values.
# NOTE(review): "1A4" was listed twice here; the duplicate has been removed.
# Confirm that the second entry was not meant to be a different code.
N2O_confirmed = ["1A1A", "1A1BC", "1A2", "1A3A", "1A3B", "1A3C",
                 "1A3D", "1A3E", "1A4", "1A5", "1B1", "1B2",
                 "2B", "2G", "3A2", "3C1", "3C4", "3C5", "3C6",
                 "4B", "4C", "4D", "5A", "5B"]

# EDGAR CH4 -- confirmed IPCC_CODE_2006_FOR_STANDARD_REPORT values.
CH4_confirmed = ["1A1A", "1A1BC", "1A2", "1A3A", "1A3B", "1A3C",
                 "1A3D", "1A3E", "1A4", "1A5", "1B1", "1B2",
                 "2B", "2C", "3A1", "3A2", "3C1", "3C7",
                 "4A", "4B", "4C", "4D", "5B"]


In [7]:
# Root of the WCPD dataset; the loading code reads <gas>/national and
# <gas>/subnational CSV folders beneath it.
# Set the WCPD_DATA_DIR environment variable to use a different checkout;
# when it is unset, the original hard-coded location is used.
path_wcpd = Path(os.environ.get(
    "WCPD_DATA_DIR",
    '/Users/ejoiner/OneDrive - rff/Documents/RFF Organization/Research Documents/WCPD/WorldCarbonPricingDatabase/_dataset/data',
))

In [None]:
# Look at the IPCC codes that appear in the WCPD coverage files, per gas.

def _load_wcpd_policy_codes(base_dir, gas):
    """Load the WCPD files for one gas and list the IPCC codes under a pricing policy.

    Parameters
    ----------
    base_dir : str or os.PathLike
        WCPD data root containing ``<gas>/national`` and ``<gas>/subnational``.
    gas : str
        Gas folder name, e.g. ``"N2O"`` or ``"CH4"``.

    Returns
    -------
    tuple
        ``(national, subnational, combined, priced, codes)`` where
        ``national`` / ``subnational`` are the raw concatenated frames,
        ``combined`` is their union sorted by jurisdiction and year with
        missing ``Product`` set to ``'NA'`` and duplicate
        (jurisdiction, year, ipcc_code, Product) rows dropped,
        ``priced`` keeps only rows with ``tax == 1`` or ``ets == 1``, and
        ``codes`` is the list of unique ``ipcc_code`` values in ``priced``.
    """
    # Load WCPD data
    national = concatenate(f"{base_dir}/{gas}/national")
    subnational = concatenate(f"{base_dir}/{gas}/subnational")

    # Clean and deduplicate. 'Product' is filled first so that missing values
    # are compared as the literal 'NA' when duplicates are dropped.
    combined = (
        pd.concat([national, subnational])
        .sort_values(by=["jurisdiction", "year"])
        .assign(Product=lambda frame: frame["Product"].fillna('NA'))
        .drop_duplicates(subset=["jurisdiction", "year", "ipcc_code", "Product"])
    )

    # If tax = 1 or ets = 1, summarize the IPCC codes reported in the coverage files
    priced = combined[(combined["tax"] == 1) | (combined["ets"] == 1)]
    codes = priced["ipcc_code"].unique().tolist()

    return national, subnational, combined, priced, codes


# N2O
(wcpd_n2o_ctry,
 wcpd_n2o_subnat,
 wcpd_n2o_all,
 wcpd_n2o_all_policy,
 wcpd_n2o_ipcc_codes) = _load_wcpd_policy_codes(path_wcpd, "N2O")

# CH4
(wcpd_ch4_ctry,
 wcpd_ch4_subnat,
 wcpd_ch4_all,
 wcpd_ch4_all_policy,
 wcpd_ch4_ipcc_codes) = _load_wcpd_policy_codes(path_wcpd, "CH4")


Building policy features data frame for N2O


In [None]:

def match_ipcc_substrings(wcpd_codes, confirmed_codes, max_len=None):
    """
    Match each WCPD IPCC code against a confirmed code list by successive truncation.

    Every code is shortened one character at a time from the right (i.e. its
    prefixes of length max_len .. 1 are taken) and each prefix is looked up in
    confirmed_codes.

    Returns a DataFrame with one row per input code and the columns:
      - ipcc_code: original code (converted to str)
      - code_len_{L}: prefix of length L (for L = max_len .. 1); None when the
        code is shorter than L
      - match_len_{L}: 1 if code_len_{L} is in confirmed_codes else 0
      - first_matched: the first (longest) prefix found in confirmed_codes or None
      - matched_len: length of the first matched prefix or 0
      - matched_confirmed_code: same value as first_matched
      - matched_was_original: True when the untruncated code itself matched

    wcpd_codes: iterable of strings, a Series, or a single-column/single-row DataFrame
    confirmed_codes: iterable of strings
    max_len: optional int to force maximum column length (defaults to longest wcpd code)

    An empty wcpd_codes yields an empty DataFrame that still has the columns above.
    Raises ValueError if wcpd_codes is a DataFrame with more than one row AND
    more than one column.
    """
    # Normalize inputs. squeeze() is deliberately avoided: it collapses a
    # one-element Series/DataFrame to a scalar, which has no .astype().
    if isinstance(wcpd_codes, pd.DataFrame):
        if min(wcpd_codes.shape) > 1:
            raise ValueError(
                "wcpd_codes must be one-dimensional (a single column or a single row)"
            )
        raw_codes = wcpd_codes.to_numpy().ravel().tolist()
    elif isinstance(wcpd_codes, pd.Series):
        raw_codes = wcpd_codes.tolist()
    else:
        raw_codes = list(wcpd_codes)
    codes = [str(code) for code in raw_codes]

    confirmed_set = {str(code) for code in confirmed_codes}

    # Determine the maximum length to generate columns for; default=0 keeps
    # empty input from failing.
    if max_len is None:
        max_len = max((len(code) for code in codes), default=0)
    max_len = int(max_len)
    lengths = range(max_len, 0, -1)  # longest prefix first

    rows = []
    for code in codes:
        code_len = len(code)
        row = {"ipcc_code": code}
        first_matched = None
        matched_len = 0
        for length in lengths:
            # Codes shorter than `length` have no prefix of that length. Such
            # padding cells are None and can never count as a match, even if
            # confirmed_codes happens to contain an empty string.
            trunc = code[:length] if length <= code_len else None
            is_match = 1 if (trunc is not None and trunc in confirmed_set) else 0
            row[f"code_len_{length}"] = trunc
            row[f"match_len_{length}"] = is_match
            if is_match and first_matched is None:
                first_matched = trunc
                matched_len = length
        row["first_matched"] = first_matched
        row["matched_len"] = matched_len
        row["matched_confirmed_code"] = first_matched
        # True only if the match used the full, untruncated original code.
        row["matched_was_original"] = (matched_len == code_len and matched_len > 0)
        rows.append(row)

    # Column order: original, code_len.., match_len.., then the summary columns.
    # Passing it to the constructor also gives empty input a stable schema.
    col_order = (
        ["ipcc_code"]
        + [f"code_len_{length}" for length in lengths]
        + [f"match_len_{length}" for length in lengths]
        + ["first_matched", "matched_len", "matched_confirmed_code", "matched_was_original"]
    )
    return pd.DataFrame(rows, columns=col_order)

# Example usage with objects already in the notebook:
# each gas's WCPD codes are matched against that same gas's confirmed EDGAR codes.

ch4_matches = match_ipcc_substrings(wcpd_ch4_ipcc_codes, CH4_confirmed)
n2o_matches = match_ipcc_substrings(wcpd_n2o_ipcc_codes, N2O_confirmed)