## Extracting Climate Finance Tools

The goal of this project is to identify climate finance laws and tools from around the world. We now have a grouping of laws themselves, news articles, press releases, and company plans that may mention these types of initatives. The next step is to comb through them to find what we need. 

### Loading the data

In [1]:
import os
import pandas as pd
import PyPDF2
import fitz  # PyMuPDF - better for detecting formatting like strikethrough
from tqdm.notebook import tqdm
from nltk import sent_tokenize
# from deep_translator import single_detection  # Old API-based method
from langdetect import detect, DetectorFactory  
DetectorFactory.seed = 0  # For consistent results
import re
import time

In [2]:
canada_folder = 'data/bndc'

In [3]:
def load_files(folder): 
    """
    Load files from a folder.
    
    Args:
        folder: Path to folder containing files
    """
    df = []
    files = os.listdir(folder)
    for file in tqdm(files, desc=f"Loading files from {folder}"): 
        file_path = os.path.join(folder, file)
        ext = os.path.splitext(file)[-1].lower()
        try:
            if ext == '.pdf':
                text = ""
                # Use PyPDF2 to extract text from PDF files
                with open(file_path, 'rb') as f:
                    reader = PyPDF2.PdfReader(f)
                    for page in reader.pages:
                        try:
                            text += page.extract_text() or ""
                        except Exception:
                            continue
            else:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: 
                    text = f.read()
            df.append({'file': file, 'text': text})
        except Exception as e:
            df.append({'file': file, 'text': '', 'error': str(e)})
    return pd.DataFrame(df)

In [4]:
df = load_files(canada_folder)

Loading files from data/bndc:   0%|          | 0/8 [00:00<?, ?it/s]

**Ok this is the problem. The PDF extraction is wrong**

Corrected.

In [5]:
df

Unnamed: 0,file,text
0,china.pdf,ThePeople’sRepublicofChina\nFirstBiennialTrans...
1,canada_ndc.pdf,1 CANADA’S 2021 NATIONALLY DETERMINED CONTRIB...
2,canada.pdf,\n \n \n \n \n \n \nCanada’s First Biennial ...
3,china_ndc.pdf,\n(UNOFFICIAL TRANSLATION ) \n \n \n \nChina’...
4,eu.pdf,\nEN EN \n \n \n EUROPEAN \nCOMMISSION \...
5,brasil.pdf,FIRST \nBIENNIAL \nTRANSPARENCY \nREPORT\nOF B...
6,brasil_ndc.pdf,BRAZIL’S NDC\nNational determination \nto cont...
7,eu_ndc.pdf,Update of the \nNDC of the European Union an...


In [6]:
from IPython.display import display, HTML

# Display the 'eu.pdf' text in a scrollable, large box for easy scrolling
canada = df[df['file'] == 'canada.pdf'].text.values
if len(canada) > 0:
    display(HTML(f'''
        <div style="max-height:600px; overflow:auto; border:1px solid #ccc; padding:10px; font-family:monospace; white-space:pre-wrap;">
            {canada[0].replace('<', '&lt;').replace('>', '&gt;')}
        </div>
    '''))
else:
    print("No text found for canada.pdf")

In [5]:
def filter_english_sentences(text):
    """
    Filter text to keep only English sentences.
    Detects language of each sentence and removes non-English ones.
    Returns filtered English-only text.
    Uses langdetect (local, free, no API needed!)
    """
    # Handle empty or invalid text
    if not text or not isinstance(text, str) or len(text.strip()) == 0:
        print('  ⚠️  Empty or invalid text, skipping')
        return None
    
    # Tokenize text into sentences
    sentences = sent_tokenize(text)
    print(f'  📝 Text split into {len(sentences)} sentences')
    
    english_sentences = []
    
    for i, sentence in enumerate(sentences):
        sentence = sentence.strip()
        
        # Skip empty sentences
        if not sentence:
            continue
        
        # Skip very short sentences (likely headers, page numbers, etc.)
        if len(sentence) < 10:
            english_sentences.append(sentence)  # Keep short text as-is
            continue
        
        try:
            # Detect language of this sentence using langdetect (100% local, no API!)
            detected_lang = detect(sentence)
            
            if detected_lang == 'en':
                english_sentences.append(sentence)
                print(f"    ✅ Sentence {i+1}/{len(sentences)}: English - keeping")
            else:
                print(f"    ❌ Sentence {i+1}/{len(sentences)}: {detected_lang} - removing")
        except Exception as e:
            # If detection fails, keep the sentence (conservative approach)
            print(f"    ⚠️  Sentence {i+1}/{len(sentences)}: Detection error - keeping")
            print(f"        Error details: {str(e)}")
            english_sentences.append(sentence)
    
    # Join the English sentences back together
    filtered_text = ' '.join(english_sentences)
    print(f'  ✅ Filtered to {len(english_sentences)}/{len(sentences)} sentences')
    
    return filtered_text

In [6]:
import time

# Simplified processing - just filter for English sentences
def process_text(text):
    """
    Process text to keep only English sentences.
    Returns filtered English-only text.
    """
    return filter_english_sentences(text)

def process_with_retry(text, max_retries=3, delay=2):
    """Process text with automatic retry on API errors"""
    for attempt in range(max_retries):
        try:
            filtered_text = process_text(text)
            return filtered_text
        except Exception as e:
            print(f"  ⚠️  Error (attempt {attempt + 1}/{max_retries}): {str(e)[:100]}")
            if attempt < max_retries - 1:
                print(f"  ⏳ Waiting {delay} seconds before retry...")
                time.sleep(delay)
            else:
                print(f"  ❌ Failed after {max_retries} attempts")
                return None

def is_processing_complete(original_text, filtered_text):
    """Check if processing is complete by verifying filtered text exists"""
    if not original_text or not isinstance(original_text, str):
        return True  # Can't process invalid text
    if not filtered_text or not isinstance(filtered_text, str):
        return False  # No filtered text exists
    
    # If we have filtered text, processing is complete
    return True


In [7]:
def process_dataframe_simple(df, save_path='can_df_filtered.csv'):
    """
    Process dataframe with automatic saving - filters for English sentences only.
    """
    
    # Keep the original df with fresh text data
    working_df = df.copy()
    
    # Load existing filtered results from checkpoint (if available)
    try:
        checkpoint_df = pd.read_csv(save_path, escapechar='\\')
        print(f"📂 Loaded checkpoint from {save_path}")
        
        # Merge only the 'filtered' column from checkpoint
        if 'filtered' in checkpoint_df.columns:
            # Create a mapping of file -> filtered text
            filtered_map = dict(zip(checkpoint_df['file'], checkpoint_df['filtered']))
            working_df['filtered'] = working_df['file'].map(filtered_map)
        else:
            working_df['filtered'] = None
            
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print("🆕 Starting fresh (no checkpoint found or checkpoint corrupted)")
        working_df['filtered'] = None

    for idx, row in working_df.iterrows():
        text = row['text']
        existing_filtered = row.get('filtered') if pd.notna(row.get('filtered')) else None
        
        # Check if already processed
        if existing_filtered and is_processing_complete(text, existing_filtered):
            print(f"\n⏭️  Skipping file: {row['file']} (already processed)")
            continue

        print(f"\n🔄 Processing file: {row['file']}")
        
        # Skip .DS_Store and other non-PDF files
        if not row['file'].endswith('.pdf'):
            print(f"  ⏭️  Skipping non-PDF file")
            working_df.at[idx, 'filtered'] = None
            continue

        # Process the text to filter for English sentences
        filtered = process_with_retry(text)

        if filtered:
            # Save the filtered text
            working_df.at[idx, 'filtered'] = filtered
            working_df.to_csv(save_path, index=False, escapechar='\\')
            print(f"  ✅ Filtered text saved!")
        else:
            # No progress made at all
            print(f"  ❌ Processing failed - will retry next time")
            if existing_filtered is None:
                working_df.at[idx, 'filtered'] = None
                working_df.to_csv(save_path, index=False, escapechar='\\')

        # Small delay between requests to avoid rate limiting
        time.sleep(0.5)

    print("\n🎉 All done!")
    return working_df


In [8]:
# Run the corrected processing function
df_filtered = process_dataframe_simple(df, save_path='data/bndc.csv')


📂 Loaded checkpoint from data/bndc.csv

⏭️  Skipping file: china.pdf (already processed)

⏭️  Skipping file: canada_ndc.pdf (already processed)

⏭️  Skipping file: canada.pdf (already processed)

⏭️  Skipping file: china_ndc.pdf (already processed)

⏭️  Skipping file: eu.pdf (already processed)

⏭️  Skipping file: brasil.pdf (already processed)

⏭️  Skipping file: brasil_ndc.pdf (already processed)

⏭️  Skipping file: eu_ndc.pdf (already processed)

🎉 All done!


____

In [9]:
df_filtered

Unnamed: 0,file,text,filtered
0,china.pdf,\n\n\n\n\n\nForeword\n- 1 -\n\nClimate change ...,Foreword\n- 1 -\n\nClimate change is a common ...
1,canada_ndc.pdf,"1 \n\n\n\nThrough this submission, the Governm...","1 \n\n\n\nThrough this submission, the Governm..."
2,canada.pdf,\n \n\n\n\n\n\n\n\n\n\n \nDeveloped in accord...,Developed in accordance with \nthe Paris Agre...
3,china_ndc.pdf,\n\n\n\n \n\n\n \n \n\n\n\n\n\n\n\n\n\n\n \n1...,1 This is an unofficial translation. In case o...
4,eu.pdf,Climate Action\n \n \n \n \n \n\n \n \n \n \n ...,Climate Action\n \n \n \n \n \n\n \n \n \n \n ...
5,brasil.pdf,FIRST \n\n\n\nOF BRAZIL \n\n\n\nMINISTRY OF SC...,FIRST \n\n\n\nOF BRAZIL \n\n\n\nMINISTRY OF SC...
6,brasil_ndc.pdf,\nNational determination \nto contribute and t...,National determination \nto contribute and tra...
7,eu_ndc.pdf,\nUpdate of the \nNDC of the European Union ...,Update of the \nNDC of the European Union and...


In [10]:
df_filtered.iloc[0].text



In [192]:
from transformers import AutoTokenizer

In [193]:
tokenizer = AutoTokenizer.from_pretrained("nomic-ai/nomic-embed-text-v1")

In [199]:
def tokenize_and_chunk(row, tokenizer, max_tokens=512, overlap=64, text_col='text'):
    """
    Splits row[text_col] into maximally uniform chunks, exactly of length max_tokens
    (except possibly the last chunk), with specified overlap between consecutive chunks.
    """
    text = row[text_col]
    tokens = tokenizer.encode(text, add_special_tokens=False)
    total_tokens = len(tokens)
    chunks = []
    start_idx = 0

    if total_tokens <= max_tokens:
        # Only one chunk needed
        chunk_tokens = tokens
        chunk_text = tokenizer.decode(chunk_tokens)
        chunk_row = row.to_dict()
        chunk_row[text_col] = chunk_text
        chunks.append(chunk_row)
    else:
        # Multiple chunks with overlap
        while start_idx < total_tokens:
            end_idx = start_idx + max_tokens
            chunk_tokens = tokens[start_idx:end_idx]
            chunk_text = tokenizer.decode(chunk_tokens)
            chunk_row = row.to_dict()
            chunk_row[text_col] = chunk_text
            chunks.append(chunk_row)
            # Advance start index: new chunk starts after removing overlap
            if end_idx >= total_tokens:
                break
            start_idx += max_tokens - overlap

    return chunks

In [200]:
print("Chunking documents...")

chunked_rows = []
for _, row in df.iterrows():
    # Only chunk over the 'translated' column, but preserve 'file' and 'country' info
    translated_text = row.get('text', None)
    if not isinstance(translated_text, str) or not translated_text.strip():
        continue  # skip rows where translated text is not a valid string
    chunk_input = row.copy()
    chunk_input['text'] = translated_text
    try:
        chunks = tokenize_and_chunk(chunk_input, tokenizer, text_col='text')
    except TypeError:
        continue  # skip this row if tokenize_and_chunk fails
    for chunk in chunks:
        # store chunk and remember source
        chunk_record = {
            'file': row.get('file', None),
            'chunk_text': chunk['text']
        }
        chunked_rows.append(chunk_record)

chunked_df = pd.DataFrame(chunked_rows)
chunked_df = chunked_df.drop_duplicates(subset=['chunk_text']).reset_index(drop=True)

Chunking documents...


In [201]:
chunked_df

Unnamed: 0,file,chunk_text
0,china.pdf,thepeople ’ srepublicofchina firstbiennialtran...
1,china.pdf,##2022. the preparationandsubmissionofthe1btrh...
2,china.pdf,"##nfcccsecretariat. chinawill, asalways, joinh..."
3,china.pdf,.................................................
4,china.pdf,.................................................
...,...,...
2100,eu_ndc.pdf,developing the nationally determined contribut...
2101,eu_ndc.pdf,global warming potential on a 100 timescale in...
2102,eu_ndc.pdf,", assumptions, definitions, methodologies, dat..."
2103,eu_ndc.pdf,"through domestic measures only, without contri..."


In [202]:
chunked_df.file.value_counts()

file
canada.pdf        776
china.pdf         619
brasil.pdf        275
eu.pdf            223
china_ndc.pdf      83
brasil_ndc.pdf     63
canada_ndc.pdf     47
eu_ndc.pdf         19
Name: count, dtype: int64

**This is where I stopped! There is a problem with the chunking logic. I am getting a lot more chunks for canada than I am for the other countries, especially the EU docs.**

### Calling an LLM

In [224]:
from dotenv import load_dotenv
import openai
# Load environment variables from .env file
load_dotenv()

# Get OpenRouter API key from environment variables
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')
if not OPENROUTER_API_KEY:
    raise ValueError("Please set OPENROUTER_API_KEY in your .env file or environment variables")

In [225]:
client = openai.OpenAI(
    api_key=OPENROUTER_API_KEY,
    base_url="https://openrouter.ai/api/v1"
)

In [274]:
SYSTEM_PROMPT = """You are a specialized analyst trained to identify and extract climate finance policy instruments from policy documents, laws, regulations, and related texts. You will receive documents in chunks and must identify qualifying instruments in each chunk. You are very discerning and very stingy about what you qualify as climate finance policy instruments. If you get it wrong, and have things in there that are not climate finance policy instruments, you will be fired. 

## Your Task
Identify and extract policy instruments that meet ALL four criteria below:

### Inclusion Criteria (ALL must be met)

1. **Public Policy Authority**
   - Must be a formal policy: law, regulation, decree, guideline, program, financing decision, or investment decision
   - Must originate from public authorities: national/subnational legislators, governments, central banks, regulatory agencies, enforcement agencies, or state-owned entities

2. **Financial Flow Focus**
   - Must directly influence financial flows at scale (structuring, channeling, mobilizing, or redirecting)
   - Must target the behavior of financial entities and individuals making finance-related decisions
   - Must apply to financial actors or financial markets

3. **Scale Requirement**
   - Must involve financial flows "at scale" (not isolated, small transactions)

4. **Explicit Climate Purpose**
   - Must have an EXPLICIT stated purpose to materially impact:
     - **Mitigation**: reducing GHG emissions or enhancing GHG sinks
     - **Adaptation**: reducing vulnerability or increasing resilience to climate impacts
     - **Loss & Damage**: compensating for climate-related losses and damages

### Exclusion Rules

**EXCLUDE if the policy:**
- Lacks explicit financial flow implications (e.g., technical standards, emissions limits, vehicle quality rules, pollution disclosure) UNLESS combined with a financial mechanism (e.g., tradable permits, subsidies)
- Only creates internal government structures (committees, advisory bodies) WITHOUT immediate financial market consequences
- Primarily targets other environmental goals (biodiversity reserves, toxic substance bans, non-GHG air standards) even if climate co-benefits exist
- Could theoretically affect climate finance but lacks explicit environmental/climate purpose (e.g., general consumer protection laws, generic financial advisor training requirements)
- It is on the subnational level. If we are talking about the EU, policies in member states are acceptable. However, in any other jurisdiction, or within member states, we only want national level policies.

**INCLUDE if:**
- Not climate-exclusive BUT has explicit purpose to systematically affect climate-related financial flows (e.g., ESG disclosure requirements that explicitly include climate data)

### Disaggregation Rules

When analyzing laws, plans, or comprehensive policy documents:

1. **Split into separate instruments when:**
   - A law contains multiple articles establishing distinct programs, funds, or incentive mechanisms
   - Different sections create different financial mechanisms
   - Each program/fund operates independently

2. **Group together when:**
   - Multiple articles modify or detail the SAME mechanism (e.g., several clauses all describing one fund's operations)
   - Provisions are interdependent parts of a single instrument

3. **Ignore:**
   - General preambles, vision statements, objectives, and targets that don't themselves create/modify a policy instrument
   - Procedural clauses without financial implications

4. **Default approach:**
   - When uncertain, err on the side of SPLITTING rather than lumping
   - Be specific: don't name an aggregated set; identify the individual instruments

## Output Format

For each chunk analyzed, return a JSON array containing all identified instruments. Each instrument should be a JSON object with these fields. If you cannot identify the type or authority, those pieces can be left blank. the most important thing is to return policy names.:
```json
[
  {
    "policy_name": "Specific name of the program/mechanism/instrument",
    "type": "Type of instrument (e.g., subsidy, tax credit, grant program, loan guarantee, disclosure requirement, green bond framework)",
    "authority": "Issuing body or agency"
  }
]
```

**Field Guidelines:**
- `policy_name`: Required. Use the specific instrument name, not the parent law title
- `type`: Optional. Leave as empty string `""` if not findable
- `authority`: Optional. Leave as empty string `""` if not findable

**If no qualifying instruments are found in the chunk**, return an empty array:
```json
[]
```

You will process documents chunk by chunk. Analyze only what is provided in each chunk without making assumptions about content in other chunks."""

In [275]:
def get_user_prompt(chunk):
    return f"""
You are an expert in climate finance law and policy. Your goal is to identify climate finance policy instruments. Only returned NAMED policy instruments at the national level. Be discerning as to whether a policy instrument is climate finance but do air on the side of including more.  

For example, if a sentence said, "The government has implemented a policy to encourage green investments", you should return nothing. If the previous sentence said, "The government has implemented the Green Investment Act", then you would return "Green Investment Act".

That said, it doesn't have to be named in capital letters as such. Here is an example paragraph and what should be extracted: 

Example: In 2024, the Government of Canada announced a plan to deliver Made-in-Canada sustainable investment guidelines to help investors, lenders, and other stakeholders navigating the path to net-zero by identifying “green” and “transition” activities. Additionally, the Government of Canada also announced that it proposes to amend the Canada Business Corporation
Act to mandate climate-related financial disclosures by large, federally incorporated private companies.

Response:
{{\n",
  \"has_policy_instruments\": true,\n",
  \"policy_instruments\": [\"Made-in-Canada sustainable investment guidelines\", \"Canada Business Corporation Act\"]\n",
}}\n",

Even if the policy instrument is named, but there is not enough context to identify it as related to climate finance, you should return nothing.

Remember, you are looking for only policy instruments at the national or ultra-national level. So, an Eu policy but also a policy in a member state, like Germany. On the other hand, a policy in Quebec, for example, or Bavaria, should not be counted because those are subnational. 

Here is the text to analyze:
{chunk}

Please identify the climate finance policy instruments in the text. Respond only with a JSON object in one of the following two formats:

If there are no climate finance policy instruments in the text, respond with:
{{
  "has_policy_instruments": false,
  "policy_instruments": []
}}

If there are climate finance policy instruments in the text, respond with:
{{
  "has_policy_instruments": true,
  "policy_instruments": ["name of instrument 1"; "name of instrument 2"; ...]
}}

Do not provide any reasoning or any other information. Only respond with the JSON object as specified above.
"""

In [276]:
chunked_df

Unnamed: 0,file,chunk_text
0,china.pdf,thepeople ’ srepublicofchina firstbiennialtran...
1,china.pdf,##2022. the preparationandsubmissionofthe1btrh...
2,china.pdf,"##nfcccsecretariat. chinawill, asalways, joinh..."
3,china.pdf,.................................................
4,china.pdf,.................................................
...,...,...
2100,eu_ndc.pdf,developing the nationally determined contribut...
2101,eu_ndc.pdf,global warming potential on a 100 timescale in...
2102,eu_ndc.pdf,", assumptions, definitions, methodologies, dat..."
2103,eu_ndc.pdf,"through domestic measures only, without contri..."


In [277]:
chunk_1 = get_user_prompt(chunked_df['chunk_text'].iloc[40])

In [278]:
chunk_1


'\nYou are an expert in climate finance law and policy. Your goal is to identify climate finance policy instruments. Only returned NAMED policy instruments at the national level. Be discerning as to whether a policy instrument is climate finance but do air on the side of including more.  \n\nFor example, if a sentence said, "The government has implemented a policy to encourage green investments", you should return nothing. If the previous sentence said, "The government has implemented the Green Investment Act", then you would return "Green Investment Act".\n\nThat said, it doesn\'t have to be named in capital letters as such. Here is an example paragraph and what should be extracted: \n\nExample: In 2024, the Government of Canada announced a plan to deliver Made-in-Canada sustainable investment guidelines to help investors, lenders, and other stakeholders navigating the path to net-zero by identifying “green” and “transition” activities. Additionally, the Government of Canada also anno

In [279]:
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": chunk_1}
    ]
)

In [280]:
response_slim = response.choices[0].message.content
response_slim

'{\n  "has_policy_instruments": false,\n  "policy_instruments": []\n}'

In [281]:
def call_llm(line):
    this_prompt = get_user_prompt(line)
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": this_prompt}])
    return response

In [282]:
results = []

In [283]:
# Make an empty CSV with the correct columns; only run if responses_df.csv does not exist or is empty

import os

responses_csv_path = "responses_df.csv"
if (not os.path.exists(responses_csv_path)) or (os.stat(responses_csv_path).st_size == 0):
    # Detect correct columns from chunked_df if available, else hardcode fallback
    expected_columns = ['file', 'country', 'chunk_text', 'response']
    import pandas as pd
    empty_df = pd.DataFrame(columns=expected_columns)
    empty_df.to_csv(responses_csv_path, index=False)


In [285]:
# Load the previously processed chunks
responses_df = pd.read_csv("responses_df.csv")

# Ensure both sides are strings for reliable comparison
if 'chunk_text' in responses_df.columns:
    responses_df['chunk_text'] = responses_df['chunk_text'].astype(str)
chunked_df['chunk_text'] = chunked_df['chunk_text'].astype(str)

# Find *exactly* which chunks from chunked_df['chunk_text'] are not present in responses_df['chunk_text']
# This handles any possible reordering or subset issues

# Mark all previously processed chunks for fast lookup (all unique, as set)
processed_chunks_set = set(responses_df['chunk_text'].unique())

# Find the indexes of chunks that are not yet processed
unprocessed_mask = ~chunked_df['chunk_text'].isin(processed_chunks_set)
to_process_df = chunked_df[unprocessed_mask]

total = len(chunked_df)
remaining = len(to_process_df)
print(f"Total chunks: {total} | Already processed/skip: {total-remaining} | To process: {remaining}")

for curr_idx, (i, row) in enumerate(to_process_df.iterrows(), start=1):
    chunk = row['chunk_text']
    print(f"Processing {curr_idx}/{remaining} (index in original: {i})", flush=True)
    llm_response = call_llm(chunk)
    response_text = llm_response.choices[0].message.content
    new_row = {
        'file': row['file'],
        'country': row.get('country', None),
        'chunk_text': chunk,
        'response': response_text
    }
    results.append(new_row)
    # Save the new response after each processing
    # Append and save to responses_df
    responses_df = pd.concat(
        [responses_df, pd.DataFrame([new_row])],
        ignore_index=True
    )
    responses_df.to_csv("responses_df.csv", index=False)


Total chunks: 2105 | Already processed/skip: 820 | To process: 1285
Processing 1/1285 (index in original: 820)
Processing 2/1285 (index in original: 821)
Processing 3/1285 (index in original: 822)
Processing 4/1285 (index in original: 823)
Processing 5/1285 (index in original: 824)
Processing 6/1285 (index in original: 825)
Processing 7/1285 (index in original: 826)
Processing 8/1285 (index in original: 827)
Processing 9/1285 (index in original: 828)
Processing 10/1285 (index in original: 829)
Processing 11/1285 (index in original: 830)
Processing 12/1285 (index in original: 831)
Processing 13/1285 (index in original: 832)
Processing 14/1285 (index in original: 833)
Processing 15/1285 (index in original: 834)
Processing 16/1285 (index in original: 835)
Processing 17/1285 (index in original: 836)
Processing 18/1285 (index in original: 837)
Processing 19/1285 (index in original: 838)
Processing 20/1285 (index in original: 839)
Processing 21/1285 (index in original: 840)
Processing 22/128

In [286]:
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,file,country,chunk_text,response
0,china.pdf,,thepeople ’ srepublicofchina firstbiennialtran...,"{\n ""has_policy_instruments"": false,\n ""poli..."
1,china.pdf,,##2022. the preparationandsubmissionofthe1btrh...,"{\n ""has_policy_instruments"": false,\n ""poli..."
2,china.pdf,,"##nfcccsecretariat. chinawill, asalways, joinh...","{\n ""has_policy_instruments"": false,\n ""poli..."
3,china.pdf,,.................................................,"{\n ""has_policy_instruments"": false,\n ""poli..."
4,china.pdf,,.................................................,"{\n ""has_policy_instruments"": false,\n ""poli..."
...,...,...,...,...
2100,eu_ndc.pdf,,developing the nationally determined contribut...,"{\n ""has_policy_instruments"": false,\n ""poli..."
2101,eu_ndc.pdf,,global warming potential on a 100 timescale in...,"{\n ""has_policy_instruments"": false,\n ""poli..."
2102,eu_ndc.pdf,,", assumptions, definitions, methodologies, dat...","{\n ""has_policy_instruments"": false,\n ""poli..."
2103,eu_ndc.pdf,,"through domestic measures only, without contri...","{\n ""has_policy_instruments"": false,\n ""poli..."


In [287]:
# extract true/false (has_policy_instruments) from response for each row

import json

def extract_true_false(response_str):
    try:
        response_json = json.loads(response_str)
        return response_json.get("has_policy_instruments")
    except Exception:
        return None

# Add a new column to results_df with the extracted true/false values
results_df['has_policy_instruments'] = results_df['response'].apply(extract_true_false)
results_df.head()


Unnamed: 0,file,country,chunk_text,response,has_policy_instruments
0,china.pdf,,thepeople ’ srepublicofchina firstbiennialtran...,"{\n ""has_policy_instruments"": false,\n ""poli...",False
1,china.pdf,,##2022. the preparationandsubmissionofthe1btrh...,"{\n ""has_policy_instruments"": false,\n ""poli...",False
2,china.pdf,,"##nfcccsecretariat. chinawill, asalways, joinh...","{\n ""has_policy_instruments"": false,\n ""poli...",False
3,china.pdf,,.................................................,"{\n ""has_policy_instruments"": false,\n ""poli...",False
4,china.pdf,,.................................................,"{\n ""has_policy_instruments"": false,\n ""poli...",False


In [288]:
true_df = results_df[results_df['has_policy_instruments'] == True]

In [289]:
true_df.iloc[0].response

'{\n  "has_policy_instruments": true,\n  "policy_instruments": ["green finance system"]\n}'

In [290]:
# Extract a comma-separated string of policy instruments from the 'response' column in true_df, removing brackets

def extract_policy_instruments_str(response_str):
    try:
        response_json = json.loads(response_str)
        instr_list = response_json.get("policy_instruments", [])
        # Join list into comma-separated string, or return empty string if not a list
        if isinstance(instr_list, list):
            return ', '.join(str(instr) for instr in instr_list)
        else:
            return str(instr_list)
    except Exception:
        return None

# Add a new column 'policy_instruments' (as string) to true_df
true_df['policy_instruments'] = true_df['response'].apply(extract_policy_instruments_str)

# Display the first few rows with policy instrument strings
true_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  true_df['policy_instruments'] = true_df['response'].apply(extract_policy_instruments_str)


Unnamed: 0,file,country,chunk_text,response,has_policy_instruments,policy_instruments
194,china.pdf,,"##roval, based onextensivestudiesandpubliccons...","{\n ""has_policy_instruments"": true,\n ""polic...",True,green finance system
211,china.pdf,,. we'veseena76 % cleanheatingrateacrosstheregi...,"{\n ""has_policy_instruments"": true,\n ""polic...",True,Resource Tax Law of the People's Republic of C...
214,china.pdf,,". in2022alone, 2. 593millionchargingpilesand67...","{\n ""has_policy_instruments"": true,\n ""polic...",True,catalogue of corporate income tax incentives f...
215,china.pdf,,##dgetallocationforstate - ownedcapitaloperati...,"{\n ""has_policy_instruments"": true,\n ""polic...",True,implementation plan for carbon peaking in the ...
222,china.pdf,,##sinks chinahasformulatedandimplementedthewet...,"{\n ""has_policy_instruments"": true,\n ""polic...",True,Wetland Protection Law of the People's Republi...
...,...,...,...,...,...,...
2072,brasil_ndc.pdf,,##i ). in addition to modeling the second phas...,"{\n ""has_policy_instruments"": true,\n ""polic...",True,"restore the amazon program, amazon fund, natio..."
2073,brasil_ndc.pdf,,brazil ’ s ecological transformation plan and ...,"{\n ""has_policy_instruments"": true,\n ""polic...",True,sustainable investment platform for ecological...
2082,brasil_ndc.pdf,,2 ; “ the ultimate objective of this conventio...,"{\n ""has_policy_instruments"": true,\n ""polic...",True,"national climate plan, ecological transformati..."
2087,eu_ndc.pdf,,the unfccc website : https : / / unfccc. int /...,"{\n ""has_policy_instruments"": true,\n ""polic...",True,"multiannual financial framework for 2021-2027,..."


In [291]:
true_df.policy_instruments.iloc[15]

'national climate change adaptation strategy'

In [292]:
def split_policy_instruments(instr_str):
    # Split only on commas followed by a space and then a capital letter
    # This identifies the start of a new policy name
    # Example: "National Plan of Energy 2030, National Energy Matrix 2030"
    # Will split into: ["National Plan of Energy 2030", "National Energy Matrix 2030"]
    # But won't split: "Regulation (EU) 2021, 1060" (no capital after comma+space)
    if not isinstance(instr_str, str) or not instr_str.strip():
        return []
    pattern = r',\s+(?=[A-Z])'
    items = [item.strip() for item in re.split(pattern, instr_str) if item.strip()]
    return items

# Build a list of dicts: each with file, country, policy_instrument
rows = []
for idx, row in true_df.iterrows():
    file = row['file']
    country = row['country']
    policy_str = row['policy_instruments']
    policy_list = split_policy_instruments(policy_str)
    for pol in policy_list:
        if pol.strip():  # skip empty
            rows.append({
                "file": file,
                "country": country,
                "policy_instrument": pol.strip()
            })

policy_instruments_df = pd.DataFrame(rows)
policy_instruments_df

Unnamed: 0,file,country,policy_instrument
0,china.pdf,,green finance system
1,china.pdf,,Resource Tax Law of the People's Republic of C...
2,china.pdf,,catalogue of corporate income tax incentives f...
3,china.pdf,,implementation plan for carbon peaking in the ...
4,china.pdf,,Wetland Protection Law of the People's Republi...
...,...,...,...
888,brasil_ndc.pdf,,"restore the amazon program, amazon fund, natio..."
889,brasil_ndc.pdf,,sustainable investment platform for ecological...
890,brasil_ndc.pdf,,"national climate plan, ecological transformati..."
891,eu_ndc.pdf,,"multiannual financial framework for 2021-2027,..."


In [293]:
import difflib

def mark_first_instance(df, policy_col='policy_instrument', threshold=0.8):
    """
    Adds a column 'first_instance' to the DataFrame, which is 1 for the first (canonical) occurrence 
    of a fuzzy-unique policy, and 0 for subsequent fuzzy duplicates.
    Args:
        df: DataFrame with a column of policy strings (policy_col)
        policy_col: string, name of policy column
        threshold: float (0-1), similarity ratio above which two entries are considered the same
    Returns:
        DataFrame with additional 'first_instance' column (1/0)
    """
    # Keep track of seen policies (canonical representatives)
    unique_policies = []
    first_instance_list = []
    for pol in df[policy_col]:
        pol_clean = pol.strip().lower()
        found = False
        for i, existing in enumerate(unique_policies):
            existing_clean = existing.strip().lower()
            # quick exact containment or equality
            if pol_clean == existing_clean or pol_clean in existing_clean or existing_clean in pol_clean:
                found = True
                # Prefer the longer string as more descriptive
                if len(pol) > len(existing):
                    unique_policies[i] = pol
                break
            # fuzzy
            ratio = difflib.SequenceMatcher(None, pol_clean, existing_clean).ratio()
            if ratio >= threshold:
                found = True
                # Prefer the longer string as more descriptive
                if len(pol) > len(existing):
                    unique_policies[i] = pol
                break
        if not found:
            unique_policies.append(pol)
            first_instance_list.append(1)  # First unique occurrence
        else:
            first_instance_list.append(0)  # Duplicate (fuzzy)
    df = df.copy()
    df['first_instance'] = first_instance_list
    return df


In [294]:
marked_df = mark_first_instance(policy_instruments_df)
marked_df.head()

Unnamed: 0,file,country,policy_instrument,first_instance
0,china.pdf,,green finance system,1
1,china.pdf,,Resource Tax Law of the People's Republic of C...,1
2,china.pdf,,catalogue of corporate income tax incentives f...,1
3,china.pdf,,implementation plan for carbon peaking in the ...,1
4,china.pdf,,Wetland Protection Law of the People's Republi...,1


In [295]:
marked_df.first_instance.value_counts()

first_instance
1    576
0    317
Name: count, dtype: int64

In [317]:
china_df = marked_df[marked_df['file'].isin(['china.pdf', 'china_ndc.pdf'])]
brasil_df = marked_df[marked_df['file'].isin(['brasil.pdf', 'brasil_ndc.pdf'])]
canada_df = marked_df[marked_df['file'].isin(['canada.pdf', 'canada_ndc.pdf'])]
europe_df = marked_df[marked_df['file'].isin(['eu.pdf', 'eu_ndc.pdf'])]

This means there are 254 policy instruments in Brazil. Let's see what they are

In [318]:
#print a list of the 254 policy instruments identified in Brazil, where the first_instance is 1
canada_list = canada_df[canada_df['first_instance'] == 1]['policy_instrument'].tolist()



In [319]:
len(canada_list)

308

In [320]:
canada_list

['pan-Canadian framework on clean growth and climate change',
 'Strengthened Climate Plan',
 'Canada Greener Homes Grant Initiative',
 "Canada Infrastructure Bank's $10 billion growth plan",
 'zero - emissions vehicles program, active transportation fund, carbon pricing',
 'federal carbon pricing system, fuel charge, output-based pricing system',
 'Clean Fuel Standard, federal GHG offset system, net-zero challenge for large emitters',
 'net-zero challenge for large emitters, net-zero accelerator fund, investment tax credit for carbon capture, utilization and storage projects, clean fuels fund, hydrogen strategy for Canada, accelerated capital cost allowance for clean energy equipment, national emissions reduction target for fertilizers',
 '$185 million agricultural climate solutions program, $165 million agricultural clean technology program, $200 million to launch immediate, on-farm climate action',
 'sectoral workforce solutions program, future skills initiative',
 'Canadian Net-Zero

Below are the results that came from running this pipeline on all of the canadian data

In [315]:
can_doc_list = ['CO2 emission credit system',
'Heavy-duty Vehicle and Engine Greenhouse Gas Emission Regulations',
'Greenhouse Gas Emission Standards',
'Output-Based Pricing System Regulations',
'Compensation and Compliance Units',
'Compensation and Issuance of Surplus Credits',
'Canadian Greenhouse Gas Offset Credit System Regulations',
'Regulations Limiting Carbon Dioxide Emissions from Natural Gas-fired Generation of Electricity',
'Reduction of Carbon Dioxide Emissions from Coal-fired Generation of Electricity Regulations',
'Règlement sur le système de tarification fondé sur le rendement',
'Pan-Canadian Framework on Clean Growth and Climate Change',
'Vancouver Declaration on Clean Growth and Climate Change',
'Western Climate Initiative cap-and-trade system, carbon levy',
'Ending Coal for Cleaner Air Act',
'Canadian Energy Strategy',
'Canada Infrastructure Bank, legislation to allow local entities to develop renewable-energy sourced electricity generation',
'Low Carbon Economy Fund',
"Ontario's Climate Change Action Plan",
'Industrial',
'Clean and Energy Technology (ICE) Venture Fund II',
'B.C. Climate Leadership Plan',
'B.C. carbon tax',
'B.C. revenue-neutral carbon tax',
'Renewable Electricity Program',
'Climate Change Innovation and Technology Framework, carbon levy',
'Climate Change Strategy',
'Climate Change Mitigation and Low-carbon Economy Act, cap and trade program',
'PACC 2013-2020',
'zero-emission vehicle (ZEV) standard, cap-and-trade system for greenhouse gas emissions allowances',
'Climate Change Adaptation Program, carbon pricing mechanism',
'Renewable Portfolio Standard',
'Nova Scotia Climate Action Plan',
'Alternative Land Use Services program, made-in-PEI approach to carbon pricing',
'2016 PEI Energy Strategy',
'Management of Greenhouse Gas Act',
'Good Energy Residential Incentives Program',
'Yukon Biomass Strategy',
'Pan-Territorial Adaptation Strategy',
'Northern Adaptation Strategy',
'Climate Change Strategic Framework',
'Nunavut Energy Retrofit Program',
'Canada Foundation for Sustainable Development Technology',
'Canada Emission Reduction Incentives Agency Act',
'Règlement sur la réduction des émissions de dioxyde de carbone — secteur de l’électricité thermique au charbon',
'Greenhouse Gas Pollution Pricing Act',
'GGPPA',
'CFR compliance obligations',
'CFR credits',
'EVAS regulations',
'Canada Green Buildings strategy',
'Canadian Net-Zero Emissions Accountability Act',
'OBPS, benchmark criteria',
'CIB Growth Plan',
'Canada Infrastructure Bank Act',
'CIB ZEBs initiative',
'Pan-Canadian Approach to Pricing Carbon Pollution',
'Investing in Canada Plan',
'CIB acceleration investments',
'Renewable Fuels Regulations',
'Règlement sur les carburants renouvelables',
'Requirements Pertaining to Gasoline',
'Diesel Fuel and Heating Distillate Oil',
"Exigences relatives à l'essence, au carburant diesel et au mazout de chauffage",
'Compliance Unit Trading System',
'Regulations Respecting Reduction in the Release of Methane and Certain Volatile Organic Compounds (Upstream Oil and Gas Sector)',
'Leak Detection and Repair Program',
'Fuel Charge Regulations',
'Règlement limitant les émissions de dioxyde de carbone provenant de la production d’électricité thermique au gaz naturel',
'Strengthening Environmental Protection for a Healthier Canada Act',
'Canadian Environmental Protection Act, 1999',
'Emissions Reduction Fund',
'Treasury Board Directive on Transfer Payments',
'Emissions Reduction Fund—Onshore Program',
'2030 Emissions Reduction Plan',
'Canada Greener Homes Loan program',
'Indigenous Leadership Fund',
'Regional Strategic Initiatives',
'Smart Renewables and Electrification Pathways Program',
'Nature Smart Climate Solutions Fund',
'Federal GHG Offset System',
'Agricultural Climate Solutions: On-Farm Climate Action Fund',
'Canada’s GHG Offset Credit System',
'Green Municipal Fund',
'Clean Fuel Regulations',
'Règlement sur les combustibles propres',
'Compliance Credits',
'CO2e-Emission-Reduction Project',
'Compliance-Credit Transfer System',
'Compliance-Credit Clearance Mechanism',
'Marché de compensation des unités de conformité',
'Registered Emission-Reduction Funding Program',
'Clean Resource Innovation Network',
'Canadian Emissions Reduction Innovation Network',
'Landfill Methane Recovery and Destruction offset protocol',
'Investing in Canada Infrastructure Program',
'Low-Carbon Economy Challenge Fund',
'Canada Community Building Fund',
'Clean Fuels Fund',
'Energy Innovation Program',
'Livestock Feed Management protocol',
'Agricultural Climate Solutions Living Labs',
'Energy Efficient Buildings program',
'Green and Inclusive Community Buildings program',
'Zero Emission Vehicle Infrastructure Program',
'Inflation Reduction Act',
'Output-Based Pricing System (OBPS)',
'Strategic Innovation Fund - Net Zero Accelerator',
'Emissions Reductions Fund',
'EDC-BMO Sustainable Finance Guarantee',
'Sustainable bond framework',
'Partnership for Carbon Accounting Financials (PCAF) guidance',
'Statement on International Public Support for the Clean Energy Transition',
'BMO-EDC Sustainable Finance Guarantee',
'Green Bond Framework',
'Social Bonds',
'Transition Bonds',
'Climate Change Policy',
'Transparency and Disclosure Policy',
'TCFD aligned Climate-Related Disclosures',
'EDC Net Zero 2050',
'PACTA',
'Bill C-12',
'Loi sur la responsabilité en matière de carboneutralité',
'GCWood program',
'Buy Clean Strategy',
'Low-Carbon Building Materials Innovation Hub',
'Passenger Automobile and Light Truck Greenhouse Gas Emission Regulations',
'Fleet Requirements — CO2 Equivalent Emissions',
'Combined Fleet Requirements — Zero-emission Vehicles',
'Early Compliance Units — Zero-emission Vehicles of the 2024 and 2025 model years',
'Registered Charging Station Installation Project',
'A Healthy Environment and a Healthy Economy',
'National Housing Co-Investment Fund',
'sales targets for zero-emission vehicles',
'Incentives for Zero-Emission Vehicles (iZEV) program',
'Atlantic Loop',
'Climate Action Incentive',
'Climate Action Incentive Fund',
'Canada Emergency Business Account',
'Low-carbon and Zero-emissions Fuels Fund',
'Canada’s Hydrogen Strategy',
'Food Waste Challenge',
'Large Employer Emergency Financing Facility',
'Sustainable Finance Action Council, green bond',
'Natural Climate Solutions for Agriculture Fund',
'Disaster Mitigation and Adaptation Fund',
'Partnership stream of the Low-Carbon Economy Fund',
'Clean Energy for Rural and Remote Communities program',
'Wataynikaneyap Power Project',
'National Inuit Climate Change Strategy',
'Paris Agreement',
'Canada-European Union Comprehensive Economic and Trade Agreement (CETA)',
'Comprehensive and Progressive Agreement for Trans-Pacific Partnership (CPTPP)',
'Canada-United States-Mexico Agreement (CUSMA)',
'Powering Past Coal Alliance (PPCA)',
'2015 $2.65 billion climate finance commitment',
'Climate and Clean Air Coalition',
'Net-Zero Advisory Body',
'Carbon Pollution Pricing',
'Clean Fuel Standard - Liquid Fuels',
'Pan-Canadian Framework on Clean Growth and Climate Change — Progress Report',
'PEI Agriculture Climate Solutions Program']
 

In [328]:
can_df = pd.read_csv('data/canada_df.csv')

In [330]:
can_df_list = can_df['Document Title'].tolist()

In [338]:
# Combine can_doc_list, can_df_list, and canada_list, removing both exact and fuzzy duplicates,
# and build a dataframe with columns "name" and "source".
# If a name appears in multiple lists, include all sources in the "source" column,
# but keep only the first spelling of the name.
# Print when exact or fuzzy duplicates are detected/merged.

from fuzzywuzzy import process
import pandas as pd

threshold = 90  # similarity threshold for considering two items as duplicates

sources = [
    (can_doc_list, 'all_docs'),
    (can_df_list, 'CPR'),
    (canada_list, 'BNDC')
]

unique_names = []
name_sources = []

# Use lowercase stripped names for fast exact matching, but preserve first instance for the canonical name
unique_names_lookup = {}  # {stripped_lower: (idx, canonical_spelling)}

def find_similar_index(doc, candidates, threshold=90):
    if not candidates:
        return None
    match, score = process.extractOne(doc, candidates)
    if score >= threshold:
        return candidates.index(match)
    return None

for src_list, src_name in sources:
    for doc in src_list:
        doc_norm = doc.strip().lower()

        # Exact duplicate check
        if doc_norm in unique_names_lookup:
            idx, canonical = unique_names_lookup[doc_norm]
            if src_name not in name_sources[idx]:
                name_sources[idx] += f", {src_name}"
            # Print exact duplicate found/merged
            print(f"Exact duplicate found: '{doc}' is exactly the same as '{canonical}' (after normalization), sources merged: {name_sources[idx]}")
            continue

        # Fuzzy duplicate check against canonical spellings
        idx = find_similar_index(doc, unique_names, threshold)
        if idx is not None:
            if src_name not in name_sources[idx]:
                name_sources[idx] += f", {src_name}"
            print(f"Fuzzy duplicate found: '{doc}' is considered the same as '{unique_names[idx]}' (score >= {threshold}), sources merged: {name_sources[idx]}")
        else:
            idx = len(unique_names)
            unique_names.append(doc)
            name_sources.append(src_name)
        unique_names_lookup[doc_norm] = (idx, unique_names[idx])

can_combined_df = pd.DataFrame({'name': unique_names, 'source': name_sources})


Fuzzy duplicate found: 'Emissions Reduction Fund—Onshore Program' is considered the same as 'Emissions Reduction Fund' (score >= 90), sources merged: all_docs
Fuzzy duplicate found: 'Low-Carbon Economy Challenge Fund' is considered the same as 'Low Carbon Economy Fund' (score >= 90), sources merged: all_docs
Fuzzy duplicate found: 'Emissions Reductions Fund' is considered the same as 'Emissions Reduction Fund' (score >= 90), sources merged: all_docs
Fuzzy duplicate found: 'BMO-EDC Sustainable Finance Guarantee' is considered the same as 'EDC-BMO Sustainable Finance Guarantee' (score >= 90), sources merged: all_docs
Fuzzy duplicate found: 'Climate Action Incentive Fund' is considered the same as 'Climate Action Incentive' (score >= 90), sources merged: all_docs
Fuzzy duplicate found: 'Partnership stream of the Low-Carbon Economy Fund' is considered the same as 'Low Carbon Economy Fund' (score >= 90), sources merged: all_docs
Fuzzy duplicate found: 'National Inuit Climate Change Strategy

In [341]:
provinces_and_territories = ['Alberta', 'British Columbia', 'Manitoba', 'New Brunswick', 'Newfoundland and Labrador', 'Nova Scotia', 'Ontario', 'Prince Edward Island', 'Quebec', 'Saskatchewan', 'Yukon', 'Northwest Territories']

In [342]:
import re

def has_province_or_territory(name, provinces_and_territories):
    # Case insensitive, strict word boundaries
    for region in provinces_and_territories:
        if re.search(rf'\b{re.escape(region.lower())}\b', name.lower()):
            return True
    return False

can_combined_df = can_combined_df[~can_combined_df['name'].apply(lambda x: has_province_or_territory(str(x), provinces_and_territories))].reset_index(drop=True)


In [343]:
len(can_combined_df)

383

In [344]:
only_CPR = can_combined_df[can_combined_df['source'] == 'CPR']

In [345]:
only_CPR

Unnamed: 0,name,source
151,The Canadian Environmental Protection Act 1999...,CPR
152,National Building Code (2010),CPR
153,Green Construction through Wood (GCWood) Program,CPR
154,Ozone-depleting Substances and Halocarbon Alte...,CPR
155,Green Construction through Wood Program - APPL...,CPR
156,National Energy Code for Buildings 2020,CPR
157,Regulatory Framework for an Oil and Gas Sector...,CPR
158,Order Prohibiting Certain Activities in Arctic...,CPR
159,Locomotive Emissions Regulations (SOR/2017-121),CPR
160,Municipal Energy Roadmap,CPR


In [346]:
#save the combined dataframe
can_combined_df.to_csv('data/canada_combined_analysis.csv', index=False)

In [347]:
#subset to the cells that contain the word 'haiti'
haiti_df = can_combined_df[can_combined_df['name'].str.contains('haiti')].reset_index(drop=True)

In [348]:
haiti_df


Unnamed: 0,name,source
0,climate adaptation and economic development of...,BNDC
1,strengthening national systems to improve mana...,BNDC
