# Climate justice experiments
The goal of this notebook is to find out (in a very rough first experiment kind of way) how well different model set-ups are able to pick up on fairly subtle concepts such as different justice theories. 

## Loading data
Two options here:

1. Grab the file which is also used for our internal tools (vibe checker, argilla). This is a feather file that has already had a bunch of weighted sampling done to it to make it somewhat more representative of diverse perspectives. We assume this is stored in data/processed/passages_dataset.feather.
2. Load in the whole open data repository. This takes a while to do and the lack of sampling probably makes it worse (plus, you might end up with paragraphs that aren't in the vibe checker yet). I'm using option one here, but leaving in the code for two, as it doesn't require you to bother the data science team for a data file.

Either way, for these experiments, we're only really interested in a small subset, so loading the whole dataset, only to then subset it to a few dozen is sort of overkill. So at the end of this, I'll write the subset to a little csv to use instead (as long as create_new_sample_csv is set to True).

If you want to re-use the code without changing the data **and** you have a finance_subset.csv already, just set reload_data to False.

Finally, the subset is created based on some title keywords. For testing purposes, we might want to add a set of random docs. This is controlled with add_random_docs. False means don't add anything, an integer means add that nr of passages. A number between 0 and 1 will take a nr of random passages equal to that fraction of the length of the title-based subset.


In [None]:
reload_data = False
data_source = "feather"  #'feather' or 'open_data'
create_new_sample_csv = False
add_random_docs = 0.3

In [None]:
import pandas as pd

if not reload_data:
    title_df = pd.read_csv("../data/processed/finance_subset.csv", encoding="utf-8")
    df = title_df.copy()
    print("Read from CSV")

### Option 1: Feather file
This is easy mode! First, let's load in the full dataset, then create a title-based subset and, if desired, add a random subset to that.

In [None]:
if data_source == "feather" and reload_data:
    df = pd.read_feather("../data/processed/passages_dataset.feather")
    print(f"Loaded feather file with {df.shape[0]} rows")
    df.head()

In [None]:
# Create a title-based subset
# Note that we're being a bit more lenient here than with the full open data
# Because we're drawing from a smaller subset
if data_source == "feather" and reload_data:
    title_col = "document_metadata.document_title"
    ts_col = "document_metadata.publication_ts"

    # Parse the publication timestamps *before* taking any subset, so that
    # title_df (which is sliced from df below) carries real datetimes too.
    # Unparseable values become NaT and are dropped by the date filter.
    if ts_col in df.columns:
        df[ts_col] = pd.to_datetime(df[ts_col], errors="coerce")

    # Keep passages whose document title contains any of the finance-related
    # keywords (case-insensitive; missing titles never match).
    # The 'monetary' keyword is currently disabled.
    include_pattern = "financial|invest|fund|budget"
    title_df = df[df[title_col].str.contains(include_pattern, case=False, na=False)]
    print(f"Titles with matching keywords: {title_df.shape}")

    # Seems we're getting some noise from carbon budget and one massive German law
    exclude_pattern = "carbon budget|federal budget for the budget year"
    title_df = title_df[
        ~title_df[title_col].str.contains(exclude_pattern, case=False, na=False)
    ]

    print(title_df.shape)

    # Filter for recent documents (the column was converted to datetime above)
    if ts_col in title_df.columns:
        title_df = title_df[title_df[ts_col] >= "2019-01-01"]
        print(f"Taking only recent: {title_df.shape}")

    # Filter out null values for key columns
    title_df = title_df[
        title_df["document_metadata.import_id"].notna()
        & title_df["document_metadata.source_url"].notna()
    ]

    print("Created df based on title keywords")
    print(title_df.shape)

In [None]:
# Analyze keyword contributions
# Candidate title keywords whose individual contribution we want to inspect
keywords = [
    "financ",
    "financial",
    "finances",
    "financing",
    "fund",
    "funding",
    "invest",
    "investment",
    "investing",
    "monetary",
    "budget",
    "carbon budget",
]

# Count, per keyword, how many rows have a title containing it
# (case-insensitive; rows without a title count as no match).
document_titles = df["document_metadata.document_title"]
for keyword in keywords:
    matches = document_titles.str.contains(keyword, case=False, na=False)
    hits = matches.sum()
    print(f"'{keyword}': {hits:,} hits")

In [None]:
# Add random sample
if data_source == "feather" and add_random_docs:
    # Work out how many random passages to add:
    #   True       -> half the size of the title-based subset
    #   0 < x < 1  -> that fraction of the title-based subset
    #   otherwise  -> that absolute number of passages
    # `is True` (not `== True`) so that the integer 1 means "one passage".
    if add_random_docs is True:
        N = int(0.5 * len(title_df))
    elif add_random_docs < 1:
        N = int(add_random_docs * len(title_df))
    else:
        N = int(add_random_docs)

    # Sampling without replacement cannot return more rows than df holds
    N = min(N, len(df))

    # Sample random rows from the original df
    sampled_passages = df.sample(n=N, random_state=42)

    # Concat, drop duplicates, reset index
    title_df = (
        pd.concat([title_df, sampled_passages], ignore_index=True)
        .drop_duplicates(subset=["text_block.index", "document_metadata.import_id"])
        .reset_index(drop=True)
    )

    print("Added random passages -- new size:")
    print(title_df.shape)

### Option 2: Open data
This is harder. Loading the whole open data takes a few minutes too.

In [None]:
import duckdb
import pandas as pd
import tqdm as notebook_tqdm
from huggingface_hub import snapshot_download

# Using the public version of the repo for now
REPO_NAME = "ClimatePolicyRadar/all-document-text-data"
REPO_URL = f"https://huggingface.co/datasets/{REPO_NAME}"
DATA_CACHE_DIR = "../../cache"

REVISION = "main"  # Use this to set a commit hash. Recommended!

In [None]:
# Download the parquet files of the open data repository into the local cache.
# Only needed for the open data route; the feather route never reads them,
# so skip the (large) download in that case.
if reload_data and data_source == "open_data":
    snapshot_download(
        repo_id=REPO_NAME,
        repo_type="dataset",
        local_dir=DATA_CACHE_DIR,
        revision=REVISION,
        allow_patterns=["*.parquet"],
    )

In [None]:
# Only used if data_source is 'open_data' to speed things up as it is so large
def create_db(rebuild=True):
    """Open the local DuckDB database and make sure the `open_data` table exists.

    The table holds a fixed selection of columns read from the parquet files
    in `DATA_CACHE_DIR`, plus a few indexes on commonly filtered columns.

    Args:
        rebuild: When True (the default, matching the previous behaviour) any
            existing `open_data` table is dropped and rebuilt from the parquet
            files. Pass False to reuse a table that is already present in
            `data.db`; it is then only built when missing.

    Returns:
        The open DuckDB connection to `data.db`.
    """
    db = duckdb.connect("data.db")  # Create a persistent database

    # Authenticate (only needed if loading a private dataset)
    # You'll need to log in using `huggingface-cli login` in your terminal first
    # db.execute("CREATE SECRET hf_token (TYPE HUGGINGFACE, PROVIDER credential_chain);")

    if rebuild:
        # Drop the existing table if it exists (necessary if you want to update the fields)
        db.execute("DROP TABLE IF EXISTS open_data")

    # Check if table exists (always False right after a rebuild drop)
    table_exists = (
        db.execute(
            "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = 'open_data'"
        ).fetchone()[0]
        > 0
    )

    if not table_exists:
        # Create a persistent table with only the columns we need
        db.execute(
            """
            CREATE TABLE open_data AS 
            SELECT 
                "document_metadata.geographies",
                "document_metadata.corpus_type_name",
                "document_metadata.publication_ts",
                "document_metadata.import_id",
                "document_metadata.translated",
                "document_metadata.source_url",
                "document_metadata.document_title",
                "text_block.text",
                "text_block.language",
                "text_block.type",
                "text_block.index",
                "text_block.page_number"
            FROM read_parquet('{}/*.parquet')
        """.format(DATA_CACHE_DIR)
        )

        # Create indexes for common query patterns
        indexes = {
            "idx_language": "text_block.language",
            "idx_corpus_type": "document_metadata.corpus_type_name",
            "idx_publication_ts": "document_metadata.publication_ts",
            "idx_text_type": "text_block.type",
        }
        for index_name, column in indexes.items():
            db.execute(f'CREATE INDEX {index_name} ON open_data("{column}")')

    return db

In [None]:
# Build the title-based subset straight in DuckDB: recent documents whose
# title mentions finance/investment/financing and which have both an
# import id and a source url.
if data_source == "open_data" and reload_data:
    db = create_db()
    title_query = """
    SELECT *
    FROM open_data
    WHERE (
        LOWER("document_metadata.document_title") LIKE '%finance%'
        OR LOWER("document_metadata.document_title") LIKE '%investment%'
        OR LOWER("document_metadata.document_title") LIKE '%financing%'
    )
    AND "document_metadata.publication_ts" >= '2020-01-01'
    AND "document_metadata.import_id" IS NOT NULL
    AND "document_metadata.source_url" IS NOT NULL
        """
    title_relation = db.sql(title_query)
    title_df = title_relation.to_df()
    print("Created df based on title keywords")
    print(title_df.shape)

In [None]:
# Add a random sample of passages from the open data table.
# NOTE: relies on `db` and `title_df` created by the open-data subset cell.
if data_source == "open_data" and add_random_docs:
    # Work out how many random passages to add:
    #   True       -> half the size of the title-based subset
    #   0 < x < 1  -> that fraction of the title-based subset
    #   otherwise  -> that absolute number of passages
    # This must be a single if/elif/else chain: with two separate `if`s the
    # True case fell through to the final `else` and ended up as N == 1.
    if add_random_docs is True:
        N = int(0.5 * len(title_df))
    elif add_random_docs < 1:
        N = int(add_random_docs * len(title_df))
    else:
        N = int(add_random_docs)

    sampled_passages = db.execute(f"""
        SELECT *
        FROM open_data
        USING SAMPLE reservoir({N} ROWS)
        REPEATABLE(42)
    """).df()

    # Concat, drop duplicates, reset index.
    # drop_duplicates(inplace=True) returns None, so chain the non-inplace
    # form and assign the result back instead.
    title_df = (
        pd.concat([title_df, sampled_passages], ignore_index=True)
        .drop_duplicates(subset=["text_block.index", "document_metadata.import_id"])
        .reset_index(drop=True)
    )

    print("Added random passages -- new size:")
    print(title_df.shape)

### Create csv
Regardless of which method was chosen to get the data, we can save it to a csv (if `reload_data` is False, a previously saved csv has already been loaded at the top of the notebook instead).

In [None]:
# From here on we only work with the subset. Take the copy regardless of
# whether the CSV is written: otherwise `df` would still be the full dataset
# (feather route) or would not exist at all (open data route).
df = title_df.copy()

if create_new_sample_csv:
    title_df.to_csv(
        "../data/processed/finance_subset.csv", index=False, encoding="utf-8"
    )
    print("Saved CSV")

# Let's shorten the column names
# (keep only the part after the last dot, e.g. "text_block.text" -> "text")
df.columns = [col.split(".")[-1] for col in df.columns]

print(f"Total nr of paras: {df.shape[0]}")
print(f"Total nr of documents: {len(df['document_title'].unique())}")
df.head()

 # Connecting to the LLMs and concept store
The LLM classifier we import from src is built using pydantic. That gives us a lot of flexibility and [known models](https://ai.pydantic.dev/api/models/base/#pydantic_ai.models.KnownModelName) to choose from, as long as they are covered by our API key

In [None]:
import os

import boto3  # AWS

# Remember to login with aws sso login --profile labs
# Read the (encrypted) OpenAI key from SSM and expose it via the environment.
session = boto3.Session(region_name="eu-west-1", profile_name="labs")
ssm = session.client("ssm")
parameter_response = ssm.get_parameter(Name="OPENAI_API_KEY", WithDecryption=True)
api_key = parameter_response["Parameter"]["Value"]
os.environ["OPENAI_API_KEY"] = api_key
print("Connected with Open AI API key")

In [None]:
import nest_asyncio  # Since we're working on notebookes, we need to handle the async bits

from knowledge_graph.classifier.large_language_model import LLMClassifier
from knowledge_graph.concept import Concept
from knowledge_graph.wikibase import WikibaseSession

nest_asyncio.apply()

wikibase = WikibaseSession()
async with WikibaseSession() as wikibase:
    finance = await wikibase.get_concept_async("Q1343")

    print(finance)

classifier = LLMClassifier(finance, model_name="gpt-4-turbo")
sentence = "This money is directed towards climate mitiation"

classifier.predict(sentence)

In [None]:
sentence = "We are spending 1 million USD on climate action"
classifier.predict(sentence)

In [None]:
sentence = "We are spending 1 million USD on solar panels"
classifier.predict(sentence)
# Nothing so model doesn't understand mitigation?

In [None]:
classifier = LLMClassifier(finance, model_name="gpt-4o")
sentence = "We are spending 1 million USD on solar panels"

classifier.predict(sentence)

In [None]:
classifier = LLMClassifier(finance, model_name="gpt-4-turbo")
sentence = "We are spending 1 million USD on solar panels"

classifier.predict(sentence)

In [None]:
async with WikibaseSession() as wikibase:
    mitigation = await wikibase.get_concept_async("Q1344")

    print(mitigation)

classifier = LLMClassifier(mitigation, model_name="gpt-4o")
sentence = "We are spending 1 million USD on solar panels"

classifier.predict(sentence)

The tech works! But it is pretty discouraging that even a fairly expensive model can't get a simple example of climate finance... 

Let's explore the basic functionality
-  By default, the LLM classifier takes all the info it can find from the concept and puts this in the prompt. But we can also create a custom concept. 
- Plus, we can run in batches also. Sentences without a predicted span return an empty list. Still a bit unclear to me if batch prediction with nest_asyncio might lead to problems, but it seems OK so far.
- Let's also explore the default prompt and play around with that a tiny bit. 



In [None]:
# custom test concept
new_concept = Concept(
    preferred_label="Finance flow",
    alternative_labels=["flow of finance", "financial transfer", "monetary flow"],
    description="Money is either being spent or transferred between actors",
    definition="A movement of monetary value between entities over time",
)

classifier = LLMClassifier(new_concept, model_name="gpt-4-turbo")

# batch predict
sentences = [
    "This sentence is about climate finance flows",
    "We are spending 100 million USD on solar panels",
    "This sentence is about money but it's not going anywhere",
]

# One result per input sentence. Use a plain loop for the printing: a list
# comprehension here would also make the cell echo a list of Nones.
for sentence_spans in classifier.predict_batch(sentences):
    print(f"\n{sentence_spans}")

Generic finance flows might be an easier starting point, under the assumption that the bottleneck is climate knowledge of the models. 

In [None]:
# Keep in mind the difference between the template prompt and the fully filled out system prompt
print("PROMPT TEMPLATE:")
print(classifier.system_prompt_template)
print("\n\nSYSTEM PROMPT:")
print(classifier.system_prompt)

## Let's try some concepts in a larger sample
We'll create a smaller dataframe. Ones with only a short amount of text are unlikely to have enough information, so let's pick out slightly longer ones and only in English.

In [None]:
# First, let's figure out how long this really takes
import time

# Keep only English passages that have text and are longer than 60
# characters; very short passages rarely carry enough information.
df = df.dropna(subset=["text"])
df = df[(df["language"] == "en") & (df["text"].str.len() > 60)]

# Sampling without replacement cannot return more rows than are left
small_df = df.sample(n=min(250, len(df)), random_state=42)

# Show the first few passages. A plain loop, so that the cell does not echo
# a list of Nones the way a print-comprehension does.
for passage_text in small_df["text"].head():
    print(f"{passage_text}\n")

In [None]:
# custom test concept
new_concept = Concept(
    preferred_label="Finance flow",
    alternative_labels=[
        "flow of finance",
        "financial transfer",
        "establish fund",
        "monetary flow",
        "investing USD",
        "loan to",
        "subsidy totalling",
        "financial flow",
        "payment to",
        "paying taxes",
        "spending euros on",
        "raising money for",
        "add to the financial budget",
    ],
    description="Something of monetary value is either being spent or transferred between actors.",
    definition="""
    A finance flow is an economic flow that reflects the creation, transformation, exchange, transfer, or extinction of economic value and involves changes in ownership of goods and/or financial assets, the provision of services, or the provision of labor and capital.
    Leave out generic descriptions or financial actors. Include only statements where it is clear that actual money or something of monetary value is being exchanged, or will be exchanged in the future.
    Only label the flow itself (i.e. how much is being spent on what).
    """,
)

# (No WikibaseSession is needed here: the concept is defined locally.)
classifier = LLMClassifier(new_concept, model_name="gpt-4o")

print("Starting prediction")
# perf_counter is a monotonic clock meant for measuring elapsed time
t0 = time.perf_counter()
# Pass a plain list, like the other predict_batch calls in this notebook
prediction_spans = classifier.predict_batch(small_df["text"].astype(str).tolist())
tseconds = time.perf_counter() - t0
print(f"Finished in {tseconds:.2f} seconds")
print(f"That's {tseconds / len(small_df):.3f} s per passage\n\nPositive examples:")

# Passages without a predicted span yield an empty list, so the inner loop
# simply does nothing for them.
for prediction in prediction_spans:
    for p in prediction:
        print(p.text)
        print(f"=> {p.labelled_text}")
        print()

From this (and a few similar runs) we conclude:
- this is an OK starting point
- 4o mini seems good enough to get consistent results. Less luck with 4 nano and 3.5. 
- Giving enough examples that closely match actual language in texts is useful
- Even so, the model is not super strict (which we might want it to be). In particular, it still includes potential finance or the *need* for finance.
- the model is better at sentence classification; not great at pulling out the actual flow in the sentence (e.g. GW instead of the power purchase agreements), though it's not terrible at it either. 
- 4o is the exception here, getting OK at actually classifying the relevant part of the passage, especially with an additional nudge in the concept description ("Only label the flow itself (i.e. how much is being spent on what)").

In [None]:
# Let's compare a few different prompts and models to each other more formally.
async with WikibaseSession() as wikibase:
    finance = await wikibase.get_concept_async("Q1343")

setups = {
    # "4o-mini": LLMClassifier(new_concept,
    #                         model_name='gpt-4o-mini',
    #                         system_prompt_template=
    #                         classifier.system_prompt_template  # Current template
    # ),
    # "4turbo": LLMClassifier(new_concept,
    #                     model_name='gpt-4-turbo',
    #                     system_prompt_template=
    #                     classifier.system_prompt_template  # Current template
    # ),
    # "concept_store_4o": LLMClassifier(finance,
    #                         model_name = 'gpt-4o',
    #                         system_prompt_template=
    #                         classifier.system_prompt_template
    # ),
    # "4o":LLMClassifier(new_concept,
    #                        model_name='gpt-4o',
    #                        system_prompt_template= classifier.system_prompt_template
    # ),
    "best_effort": LLMClassifier(
        new_concept,
        model_name="gpt-4o",
        system_prompt_template="""
    You are a specialist analyst, tasked with identifying mentions of 
    concepts in policy documents. You will mark up references to concepts with 
    XML tags.

    First, carefully review the following description of the concept:

    <concept_description>
    {concept_description}
    </concept_description>

    Instructions:

    1. Read through each passage carefully, thinking about the concept and different ways it is used in documents.
    2. Identify any mentions of the concept, including direct references and related terms.
    3. Surround each identified mention with <concept> tags.
    4. If a passage contains multiple instances, each one should be tagged separately.
    5. If a passage does not contain any instances, it should be reproduced exactly as given, without any additional tags.
    6. If an entire passage refers to the concept without specific mentions, the entire passage should be wrapped in a <concept> tag.
    7. The input text must be reproduced exactly, down to the last character, only adding concept tags.
    8. Double check that you have tagged all financial flows and that every tagged part is describing an actual financial flow, including the monetary value, if any is given.
    """,
    ),
    "examples_added": LLMClassifier(
        new_concept,
        model_name="gpt-4o",
        system_prompt_template="""
    You are a specialist analyst, tasked with identifying mentions of 
    concepts in policy documents. You will mark up references to concepts with 
    XML tags.

    First, carefully review the following description of the concept:

    <concept_description>
    {concept_description}
    </concept_description>

    then read the annotation guidelines and examples
    <guidelines>
    Rules
    - Include when the text describes money or financial assets moving (e.g. payments, loans, investments, disbursements, repayments).
    - Exclude when the text only states amounts held, owed, or valued at a point in time (these are stocks, not flows).
    - Include both one-off transactions and ongoing streams (e.g. monthly payments, yearly disbursements).
    - Exclude metaphorical/non-financial uses of “flow” (e.g. “flow of information”).

    Positive Examples (label as “financial flow”)
    “The government confirmed $10 million to level up towns.”
    “Foreign direct investment inflows totalling €2 billion in 2023.”
    “The charity disbursed £500,000 to local projects.”
    “Additional funds shall be made available to provide grants to States”

    Negative Examples (do NOT label as “financial flow”)
    “The company’s assets are worth $5 billion.” (stock/valuation)
    “GDP increased by $2 billion” (not flowing between parties)
    “Information flows quickly in digital markets.” (not financial)

    Borderline / Ambiguous Cases
    “Interest accrued on deposits.” → Only label if text explicitly describes interest being paid or transferred.
    “The value of foreign reserves increased by $10 billion.” → Could be due to flows or valuation changes; only label if the movement (purchase/sale/transfer) is clear from the text or context.
    </guidelines>

    Instructions:

    1. Read through each passage carefully, thinking about the concept and different ways it is used in documents.
    2. Identify any mentions of the concept, including direct references and related terms.
    3. Surround each identified mention with <concept> tags.
    4. If a passage contains multiple instances, each one should be tagged separately.
    5. If a passage does not contain any instances, it should be reproduced exactly as given, without any additional tags.
    6. If an entire passage refers to the concept without specific mentions, the entire passage should be wrapped in a <concept> tag.
    7. The input text must be reproduced exactly, down to the last character, only adding concept tags.
    8. Double check that you have tagged all financial flows and that every tagged part is describing an actual financial flow, including the monetary value, if any is given.
    """,
    ),
}

In [None]:
# Let's write some helper functions to make my life easier
# and then use that to set up some experiments
# First, let's create a function to get tagged passages in a readable format
def get_tagged_passages(classifier, texts):
    """Run a classifier over `texts` and collect the passages that got tagged.

    Args:
        classifier: Object with a `predict_batch(texts)` method that returns
            one collection of spans per input text.
        texts: Sized sequence of passage strings.

    Returns:
        A list with one dict per tagged passage, holding the original "text"
        and its "tagged_spans" (the `labelled_text` of every span). Passages
        without spans are left out. If anything goes wrong, the error is
        printed and an empty list is returned, so that one failing setup does
        not abort a whole comparison run.
    """
    results = []
    # Use predict_batch instead of individual predict calls
    try:
        all_spans = classifier.predict_batch(texts)

        # Match spans with their original texts
        for text, spans in notebook_tqdm.tqdm(
            zip(texts, all_spans), total=len(texts), desc="Processing tagged passages"
        ):
            if spans:  # Only include texts that got tagged
                results.append(
                    {
                        "text": text,
                        "tagged_spans": [span.labelled_text for span in spans],
                    }
                )
        return results
    except Exception as exc:
        # Catch Exception rather than everything: a bare `except:` would also
        # swallow KeyboardInterrupt, making a long (paid) run impossible to
        # stop. Report what went wrong instead of hiding it.
        print(f"ERROR for classifier {classifier}: {exc!r}")
        return []


def compare_setups(setups, sample_texts):
    """Run every classifier setup over the same texts.

    Args:
        setups: dict of {setup_name: classifier}
        sample_texts: list of texts to test

    Returns:
        dict of {setup_name: {"n_tagged": int, "tagged_passages": list}},
        where "tagged_passages" is the output of `get_tagged_passages` for
        that setup and "n_tagged" is its length.
    """
    results = {}
    progress = notebook_tqdm.tqdm(setups.items(), desc="Comparing setups")

    for name, setup_classifier in progress:
        passages = get_tagged_passages(setup_classifier, sample_texts)
        results[name] = {
            "n_tagged": len(passages),
            "tagged_passages": passages,
        }

    return results


def create_comparison_table(results, texts=None):
    """Create a comparison table that includes both quantitative and qualitative data.

    Args:
        results: dict of {setup_name: {"tagged_passages": [...]}}, where each
            tagged passage is a dict with "text" and "tagged_spans" keys
            (the shape produced by `compare_setups`).
        texts: The texts that were classified, one table row each. When
            omitted, the notebook-level `sample_texts` is used, which keeps
            existing `create_comparison_table(results)` calls working.

    Returns:
        A DataFrame with a "text" column and, per setup, a
        "<setup>_tagged" column (1 if the text was tagged, else 0) and a
        "<setup>_spans" column (list of tagged spans, empty if none).
    """
    if texts is None:
        # Backwards-compatible fallback to the global defined in the notebook
        texts = sample_texts
    texts = list(texts)

    # Initialize dictionary to store the data
    comparison_data = {
        "text": texts,  # Original texts
    }

    # Add columns for each setup
    for setup_name, result in results.items():
        # Create a dictionary mapping texts to their tagged spans
        text_to_spans = {
            p["text"]: p["tagged_spans"] for p in result["tagged_passages"]
        }

        # Add both the binary tag and the actual spans
        comparison_data[f"{setup_name}_tagged"] = [
            1 if text in text_to_spans else 0 for text in texts
        ]
        comparison_data[f"{setup_name}_spans"] = [
            text_to_spans.get(text, []) for text in texts
        ]

    # Create DataFrame only once at the end
    return pd.DataFrame(comparison_data)

In [None]:
# Run the comparison on a fixed random sample of 150 English passages
english_passages = df[df["language"] == "en"]
sampled_passages_text = english_passages.sample(n=150, random_state=42)["text"]
# Make sure every passage is a plain string before handing it to the classifiers
sample_texts = list(map(str, sampled_passages_text.tolist()))
results = compare_setups(setups, sample_texts)

In [None]:
# Build the comparison table once, then summarise it
comparison_table = create_comparison_table(results)

# The 0/1 "was this passage tagged" columns, one per setup
tagged_columns = [
    column for column in comparison_table.columns if column.endswith("_tagged")
]
tagged_flags = comparison_table[tagged_columns]

# Show summary statistics
print("Total passages tagged by each setup:")
print(tagged_flags.sum())

# Show passages where setups disagree
# (a row with more than one distinct flag value means at least two setups differ)
print("\nPassages where setups disagree:")
has_disagreement = tagged_flags.nunique(axis=1) > 1
disagreements = comparison_table[has_disagreement]

# For each disagreeing passage, show the different spans identified
for _, row in disagreements.iterrows():
    print("\nOriginal text:", row["text"])
    for setup in setups:
        if row[f"{setup}_tagged"]:
            print(f"{setup} identified:", row[f"{setup}_spans"])
        else:
            print(f"{setup} identified: [NO TAGS]")

In [None]:
comparison_table[
    (comparison_table["best_effort_tagged"] + comparison_table["examples_added_tagged"])
    > 0
].to_csv("examples_add.csv")

- GPT 4 mini only gives errors. 4o-mini gives fairly consistent outputs but it's better at finding "finance vibes" than "finance flows". 4o is much more strictly adhering to the description and relies less on the examples. Markedly better overall. 
- general template is pretty general, but sometimes this means it picks up on useful grey areas
- unsure why the small change in the template to be non-climate specific led to such different results.
- "tagging every instance" seems to be interpreted differently between models & might be responsible for some of the over-enthusiasm. 
- double-checking step at the end might be useful? Particularly because it encourages the model to include the actual flow & currencies. 


# From small examples to medium size
We got this working for a few hundred passages at a time. Now that we have an idea of what works, let's try to predict (in batches) for a much larger dataframe so we can do a more formal evaluation.

In [None]:
# Create the subset
medium_df = df.sample(n=3000, random_state=420)
medium_df.head()

In [None]:
# Make some final tweaks to create our true best effort classifier
# Leaving out the alt labels as they seem to bias it to short snippets
# Add in examples, but a slightly more varied bunch than earlier
# And sharpen the language to be more in line with CPI's understanding of a finance flow

guidelines = """

Rules
    - Include when the text describes money or financial assets moving (e.g. payments, loans, investments, disbursements, repayments).
    - Include both one-off transactions and ongoing streams (e.g. monthly payments, yearly disbursements).
    - Exclude when the text only states amounts held, owed, or valued at a point in time (these are stocks, not flows).
    - Exclude metaphorical/non-financial uses of “flow” (e.g. “flow of information”).

    Positive Examples (label as “financial flow”)
    “The government confirmed $10 million to level up towns.”
    “Foreign direct investment inflows totalling €2 billion in 2023.”
    “The charity disbursed £500,000 to local projects.”
    “Additional funds shall be made available to provide grants to States”
    "GCF funding for capacity building at similar levels to last year"

    Negative Examples (do NOT label as “financial flow”)
    “The company’s assets are worth $5 billion.” (stock/valuation)
    “GDP increased by $2 billion” (not flowing between parties)
    “Information flows quickly in digital markets.” (not financial)
"""

final_concept = Concept(
    preferred_label="Finance flow",
    wikibase_id="Q1829",  # Placeholder - needed for Argilla
    # alternative_labels = ['flow of finance', 'financial transfer', 'establish fund', 'monetary flow', 'investing USD', 'loan to', 'subsidy totalling', 'payment to', 'paying taxes', 'spending euros on', 'raising money for', 'add to the financial budget'],
    description="Either money or something of monetary value is transferred between two or more organisations, people or countries.",
    definition=f"""
    A finance flow is an economic flow that reflects the creation, transformation, exchange, transfer, or extinction of economic value and involves changes in ownership of goods and/or financial assets, the provision of services, or the provision of labor and capital.
    ideally, a financial flow describes four elements: 
    1. the source (who is sending the financial asset, such as a bank or organisation); 
    2. the financial instrument or mechanism (how it is being sent, such as a grant, loan or a subsidy);
    3. the use or destination (the purpose for which the asset will be used, which is often expressed as the recipient organisation or their sectoral categorisation); and 
    4. the value (which can, but does not need to be, expressed in monetary terms directly). 
    It is acceptable if not all of these elements are present, but where they are, all of these elements should be considered part of the same financial flow.

    Guidelines for annotation are:
    {guidelines}

    """,
)

classifier = LLMClassifier(
    final_concept,
    model_name="gpt-4o",
    system_prompt_template="""
    You are a specialist analyst, tasked with identifying mentions of concepts in policy documents. 
    These documents are mostly drawn from a climate and development context.
    You will mark up references to concepts with XML tags.

    First, carefully review the following description of the concept:

    <concept_description>
    {concept_description}
    </concept_description>

    Instructions:

    1. Read through each passage carefully, thinking about the concept and different ways it can be used in documents.
    2. Identify any mentions of the concept, including references that are not included as an example, but which match the definition and guidelines.
    3. Surround each identified mention with <concept> tags.
    4. If a passage contains multiple instances, each one should be tagged separately.
    5. If a passage does not contain any instances, it should be reproduced exactly as given, without any additional tags.
    6. If an entire passage refers to the concept without specific mentions, the entire passage should be wrapped in a <concept> tag.
    7. The input text must be reproduced exactly, down to the last character, only adding concept tags.
    8. Double check that you have tagged all financial flows and that every tagged part is describing an actual financial flow, including the source, destination, instrument and value, if any is given.
    """,
)

In [None]:
# Lil safety mechanism to stop me burning through our credits
predict_the_medium_sample = False

In [None]:
import pickle
from datetime import date

from knowledge_graph.labelled_passage import LabelledPassage

# Either load a previously cached prediction run, or run the classifier over
# medium_df in slices of 100 rows and cache the result under today's date.
# Each row's full metadata is attached to its passage because the predictions
# are meant to be uploaded to argilla / the vibe checker.
if not predict_the_medium_sample:
    date = date.today().isoformat()

    with open(
        "../data/processed/2025-09-18_llm_finance_predictions.pickle", "rb"
    ) as file:
        batch_labelled_passages = pickle.load(file)
else:
    batch_labelled_passages = []
    for batch in range(0, len(medium_df), 100):
        batch_dataset = medium_df.iloc[batch : batch + 100]
        batch_texts = batch_dataset["text"].tolist()
        batch_spans = classifier.predict_batch(batch_texts)
        batch_labelled_passages += [
            LabelledPassage(
                text=text, spans=spans, metadata=batch_dataset.iloc[i].to_dict()
            )
            for i, (text, spans) in enumerate(zip(batch_texts, batch_spans))
        ]
    # Flip the switch back so re-running this cell does not predict again
    predict_the_medium_sample = False
    date = date.today().isoformat()

    with open(f"../data/processed/{date}_llm_finance_predictions.pickle", "wb") as file:
        pickle.dump(batch_labelled_passages, file)


batch_labelled_passages[0:5]

## Writing to vibe checker and Argilla
Both of these take a list of `LabelledPassage` objects.
Every prediction has already been wrapped in such an object above.

In [None]:
# Just to show how it works: classify a single sentence and wrap the result
# in a LabelledPassage, then show its JSON form.
from knowledge_graph.labelled_passage import LabelledPassage

sentence = "This sentence is about climate finance flows"

# NOTE(review): this rebinds the notebook-level `classifier`, replacing the one
# built earlier with final_concept and the custom system prompt. Re-run that
# cell before predicting in bulk again.
classifier = LLMClassifier(new_concept, model_name="gpt-4o")

spans = classifier.predict(sentence)
labelled_passage = LabelledPassage(text=sentence, spans=spans)
labelled_passage.model_dump_json()

In [None]:
# Before assembling the upload set (positives plus some share of negatives),
# count how many passages received at least one tag.
n = sum(1 for passage in batch_labelled_passages if len(passage.spans) > 0)
print(f"{n} passages with at least one tag")

In [None]:
# Build the upload sample: up to N_POSITIVES passages with at least one tagged
# span, plus up to N_NEGATIVES passages without any match, shuffled together.
import random

N_POSITIVES = 350
# NOTE(review): the previous comment here said 150 while the code drew 200 --
# confirm the intended size.
N_NEGATIVES = 200

# Fixed seed so the same passages are drawn on every run
random.seed(42)

positives = [lp for lp in batch_labelled_passages if len(lp.spans) > 0]
negatives = [lp for lp in batch_labelled_passages if len(lp.spans) == 0]

# min() guards against asking random.sample for more items than exist
pos_sample = random.sample(positives, min(N_POSITIVES, len(positives)))
neg_sample = random.sample(negatives, min(N_NEGATIVES, len(negatives)))

selected_passages = pos_sample + neg_sample
random.shuffle(selected_passages)

print(f"Positives sample: {len(pos_sample)} out of {len(positives)}")
print(f"Negatives sample: {len(neg_sample)} out of {len(negatives)}")

In [None]:
from datetime import datetime


def fix_pub_ts(md):
    """Normalise the publication timestamp entries of a metadata dict in place.

    For each of the keys "publication_ts" and
    "document_metadata.publication_ts" that is present in ``md``, the value is
    parsed with ``pd.to_datetime`` and replaced by:

    * its ISO 8601 string when it parses to a timestamp,
    * ``None`` when it is missing (None / NaN / NaT) or cannot be parsed
      (the offending raw value is printed so it can be inspected),
    * ``str(parsed)`` for any other parse result.

    Args:
        md: metadata dict to update; keys other than the two above are untouched.

    Returns:
        None. ``md`` is modified in place.
    """
    for k in ("publication_ts", "document_metadata.publication_ts"):
        if k not in md:
            continue
        raw = md[k]
        try:
            parsed = pd.to_datetime(raw, errors="raise")
        except (ValueError, TypeError, OverflowError):
            # Best effort: show the value that could not be parsed and store a
            # real null rather than the string "None".
            print(raw)
            md[k] = None
            continue
        if parsed is None or parsed is pd.NaT:
            # Missing input comes back as None or NaT. NaT is a datetime
            # subclass, so it must be caught before the isoformat() branch or
            # it would be stored as the string "NaT".
            md[k] = None
        elif isinstance(parsed, (pd.Timestamp, datetime)):
            md[k] = parsed.isoformat()
        else:
            md[k] = str(parsed)


# Normalise the publication timestamps in place on every sampled passage whose
# metadata is a dict; passages with any other kind of metadata are skipped.
for lp in selected_passages:
    if not isinstance(lp.metadata, dict):
        continue
    fix_pub_ts(lp.metadata)

In [None]:
# Serialise the sample as JSON Lines: one passage per line, joined with "\n"
# (so there is no trailing newline after the last record).
out_path = "../data/processed/finance_flow_labelled_passages_add_id.jsonl"
with open(out_path, mode="w", encoding="utf-8") as f:
    serialised = (passage.model_dump_json() for passage in selected_passages)
    f.write("\n".join(serialised))
out_path

In [None]:
# Also save as a csv -- but first clean up the file a little.
# The JSONL was written as UTF-8, so read it back with the same encoding rather
# than relying on the platform default.
with open(out_path, "r", encoding="utf-8") as f:
    df_passage = pd.read_json(f, lines=True)
df_passage.head()

In [None]:
# Expand the per-passage metadata dicts into columns of their own.
df_meta = pd.DataFrame(df_passage["metadata"].to_list())
# DataFrame.drop returns a new frame; the result has to be assigned for the
# nested "metadata" column to actually be removed.
df_passage = df_passage.drop(columns="metadata")
df_passage = pd.concat([df_passage, df_meta], axis=1)
df_passage.head()

In [None]:
# Adding in the spans is a bit more of a pain as some of them are None:
# collect the labelled text of every non-None span, one list per passage.
spans_list = [
    [match["labelled_text"] for match in row if match is not None]
    for row in df_passage["spans"]
]

# One column per span position, placed in front of the existing columns
spans_df = pd.DataFrame(spans_list)
df_passage = pd.concat([spans_df, df_passage], axis=1)
# `date` is the ISO date string set by the prediction cell earlier on
df_passage.to_csv(
    f"../data/out/{date}_finance_flows.csv", index=False, encoding="utf-8"
)
df_passage.head()

In [None]:
# Harrison will do: JSONL of labelled passages => S3 bucket => vibe checker

In [None]:
# Deep-copy every passage: a plain list.copy() would share the LabelledPassage
# objects, so the metadata clean-up in the following cells would also rewrite
# the passages held in selected_passages.
modified_passages = [lp.model_copy(deep=True) for lp in selected_passages]

In [None]:
# Metadata seems like it's causing problems in the upload
import math

import numpy as np


def sanitize(v):
    """Recursively convert a metadata value into plain, upload-safe Python.

    * dicts are rebuilt with string keys and sanitised values;
    * lists, tuples and sets become lists of sanitised items;
    * NaN / +-inf floats (Python or numpy) become None, other numpy floats
      become Python floats;
    * numpy integers become Python ints;
    * anything exposing ``tolist()`` (e.g. numpy arrays) is converted with it
      and sanitised again;
    * scalar missing markers (None, pd.NaT, pd.NA) become None;
    * every other value is returned unchanged.

    Args:
        v: the value to clean.

    Returns:
        The cleaned value.
    """
    # Containers are handled before any pd.isna() call: on a list or array
    # pd.isna() returns an element-wise array whose truth value is ambiguous,
    # which would raise instead of recursing into the container.
    if isinstance(v, dict):
        return {str(k): sanitize(val) for k, val in v.items()}
    if isinstance(v, (list, tuple, set)):
        return [sanitize(x) for x in v]
    if isinstance(v, (float, np.floating)):
        return None if (math.isnan(v) or math.isinf(v)) else float(v)
    if isinstance(v, np.integer):
        return int(v)
    if hasattr(v, "tolist"):  # numpy arrays and remaining numpy scalars
        return sanitize(v.tolist())
    missing = pd.isna(v)  # catches None, pd.NaT, pd.NA
    # Only trust a scalar answer; array-likes without tolist() pass through
    if isinstance(missing, (bool, np.bool_)) and missing:
        return None
    return v


# Replace each passage's metadata with a sanitised copy that leaves out the
# "text" entry; passages whose metadata is not a dict are left alone.
for lp in modified_passages:
    if not isinstance(lp.metadata, dict):
        continue
    md = {}
    for k, v in lp.metadata.items():
        if k == "text":
            continue
        md[k] = sanitize(v)
    lp.metadata = md

In [None]:
# Mapping from the metadata keys found on the passages (left) to the key names
# they are renamed to before upload (right). The loop below drops any metadata
# key that does not appear on the left-hand side. The commented-out entries are
# target names that currently have no source key mapped to them.
field_mapper = {
    "text": "text",
    "text_block_id": "text_block-text_block_id",
    "language": "text_block-language",
    "type": "text_block-type",
    "type_confidence": "text_block-type_confidence",
    "page_number": "text_block-page_number",
    "coords": "text_block-coords",
    "document_id": "document_id",
    "document_name": "document_name",
    "document_source_url": "document_source_url",
    "document_content_type": "document_content_type",
    "document_md5_sum": "document_md5_sum",
    "languages": "languages",
    "translated": "translated",
    "has_valid_text": "has_valid_text",
    "pipeline_metadata": "pipeline_metadata",
    "name": "document_metadata-name",
    "document_title": "document_metadata-document_title",
    "description": "document_metadata-description",
    "import_id": "document_metadata-import_id",
    "slug": "document_metadata-slug",
    "family_import_id": "document_metadata-family_import_id",
    "family_slug": "document_metadata-family_slug",
    "publication_ts": "document_metadata-publication_ts",
    "date": "document_metadata-date",
    "source_url": "document_metadata-source_url",
    "download_url": "document_metadata-download_url",
    "corpus_import_id": "document_metadata-corpus_import_id",
    "corpus_type_name": "document_metadata-corpus_type_name",
    "collection_title": "document_metadata-collection_title",
    "collection_summary": "document_metadata-collection_summary",
    # "document_metadata-type",
    "source": "document_metadata-source",
    "category": "document_metadata-category",
    "geography": "document_metadata-geography",
    "geographies": "document_metadata-geographies",
    # "document_metadata-languages",
    "metadata": "document_metadata-metadata",
    "document_description": "document_description",
    "document_cdn_object": "document_cdn_object",
    "document_slug": "document_slug",
    "md5sum": "pdf_data-md5sum",
    "dimensions": "pdf_data_page_metadata-dimensions",
    # "pdf_data_page_metadata-page_number",
    "detected_title": "_html_data-detected_title",
    "detected_date": "_html_data-detected_date",
    # "_html_data-has_valid_text",
    "parser_metadata": "pipeline_metadata-parser_metadata",
    "index": "text_block-index",
    "world_bank_region": "world_bank_region",
}

# Rename each passage's metadata keys via field_mapper. Keys without a mapping
# are considered superfluous and are left out of the new metadata.
for passage in modified_passages:
    passage.metadata = {
        field_mapper[old_key]: value
        for old_key, value in passage.metadata.items()
        if old_key in field_mapper
    }

In [None]:
# Spot-check the cleaned, renamed metadata of the first passage
modified_passages[0].metadata

In [None]:
# Argilla: look in the github; has an example

from knowledge_graph.labelling import ArgillaSession

# Open an Argilla session; credentials are expected to come from the .env file
session = ArgillaSession()  # .env credentials!

# If we need to create a new workspace:
# workspace_to_create = Workspace(name="finance-experiments")
# workspace = workspace_to_create.create()

# If it already exists:
# NOTE(review): presumably this returns nothing usable when the workspace does
# not exist -- confirm, and create it with the snippet above in that case.
workspace = session.client.workspaces(name="finance-experiments")

# Turn the cleaned passages into an Argilla dataset for final_concept in that
# workspace
argilla_data = session.labelled_passages_to_dataset(
    labelled_passages=modified_passages, concept=final_concept, workspace=workspace
)

In [None]:
# Give the Argilla user "sion" access to the "finance-experiments" workspace
user = session.client.users("sion")
workspace = session.client.workspaces("finance-experiments")

added_user = user.add_to_workspace(workspace)