In [0]:
from dotenv import load_dotenv
import pandas as pd
from service import Service
load_dotenv()
from typing import List, Literal, Optional
from pydantic import BaseModel, Field
import json
import math
from typing import Dict, Any, List
from tqdm import tqdm
import pandas as pd

from service import Service
from consts import DEFAULT_LLM_MODEL
service = Service()
model_name = DEFAULT_LLM_MODEL


# System message sent with every 6.1 validation request.
# The numbered list uses plain ASCII markers; stray emoji variation selectors (U+FE0F) that
# followed "1", "3" and "4" have been removed so the numbering reads consistently.
system_prompt = '''You are a public financial management (PFM) diagnostic analyst.  
Your job is to identify whether a passage provides evidence for Bottleneck 6.1:  
“Ad hoc, political, and fragmented funding channels contribute to ineffective and inefficient delivery.”

Judge only what is in the text.  
1. Detect **structural fragmentation** — multiple or parallel funding/management channels, off-budget or ad-hoc allocations, donor vs government systems, mid-year or discretionary funding, different rules or processes, etc.  
2. Detect **delivery failure or inefficiency** — unpredictability, duplication, coordination failure, arrears, start-stop projects, or high administrative costs.  
3. Apply PFM knowledge: fragmentation is a *diagnostic feature* that normally implies inefficiency, but mark evidence only when cues are clear and quotable.  
4. Follow the output schema exactly; quote trigger spans from the text; abstain if uncertain.

Be precise, conservative, and evidence-based.
'''

In [0]:
df = pd.read_excel('bottleneck_final_summarized_for_review.xlsx', sheet_name='6.1')
df.shape

In [0]:
df['Review Result (Relevant, Irrelevant)'].value_counts()

In [0]:
# Reload the 6.1 review sheet and keep only expert-reviewed rows, restricted to the columns
# that the validation loop unpacks positionally (so the order of `rel_cols` matters).
REVIEW_COL = 'Review Result (Relevant, Irrelevant)'
rel_cols = ['node_id','chunk_id', 'extended_context','extracted_evidence', REVIEW_COL,
            'Reason for irrelevant or unsure ']
df = pd.read_excel('bottleneck_final_summarized_for_review.xlsx', sheet_name='6.1')
# A single notna() filter is sufficient (it was previously applied twice). `.copy()` makes
# `df` an independent frame so that adding label columns later does not write into a slice.
df = df.loc[df[REVIEW_COL].notna(), rel_cols].copy()

In [0]:
# Broader PFM challenge the bottleneck is filed under; fills the {challenge} slot of PROMPT_TEMPLATE.
challenge = '''Unreliable, delayed and fragmented funding for delivery'''
# One-line statement of bottleneck 6.1; fills the {bottleneck} slot of PROMPT_TEMPLATE and is
# also passed as the bottleneck name when building the structured-summary prompt.
bottleneck = '''6.1 Ad hoc, political and fragmented funding channels contribute to ineffective and inefficient delivery'''
extended_definition = '''
Extended definition: The funding for service delivery and projects might be fragmented due to the existence of different funding sources or channels, such as budget general funds (general non-earmarked funding from the budget) from different ministries/agencies, earmarked funds from transfers coming from other governments, institutions and organizations and from earmarked government revenues, as well as funding coming from donors. Dealing with multiple funding can be challenging, especially if there is not an integrated and consolidated planning and management of such funding (these funds being managed with their corresponding parallel systems). Different management stages and procedures, non-consolidated information about the expected, distributed and consumed funds, as well as different levels of discretion in regard the use of the funds can make difficult a rational policy planning (due to uncertainty and lack of predictability of funding) and implementation, in addition to a more costly administration and higher risks of corruption and funds diversion.     

Political discretion on the management of specific funds may create relevant obstacles to adequate planning and execution of service delivery and projects, as it may imply significant levels of uncertainty in regard to the allocated resources. Changing priorities, biases towards new projects and other elements may reduce the predictability and effective availability of funding for specific services and projects, making it difficult for agencies and departments to plan effectively and sustain long-term projects. This volatile nature of funding can lead to start-stop patterns in project execution, for example, where projects are initiated but then stalled due to sudden withdrawals or delays in funding.   

The atomization and fragmentation of funding, with different parallel management (with their own stages and processes), reporting mechanisms and information systems contributes to administrative burdens and inefficient delivery, as well as making more difficult ensure a sound coordination, planning and execution of projects. These parallel systems may exist, for example, because of donors' contributions to the financing of specific services or projects, in order to fulfill their own financial management and control and reporting requirements, but at the cost of hindering client government’s systems and the integrated management of funds and service and project planning and coordination.  For example, donor’s use of Ethiopia’s public financial management and procurement systems significantly declined between the middle and end of the decade of 2010, with more than 50% of donor funding routed through parallel systems , creating complexity and lack of integration of services funding. Alike, in Uganda and Indonesia, the existence of multiple fragmented sources of financing created major challenges for local government planning, budget formulation and execution in regard of education policies.   

In addition, the effects of the fragmentation of the different funding channels can foster, and be amplified, by the lack of adequate coordination between sectors, levels of government and partners, resulting in a duplication of efforts, and inefficiencies on the allocation, channeling and execution of resources.  For example, different agencies might end up competing for the same funds or, conversely, some funds might remain underutilized due to lack of awareness or coordination among potential beneficiaries, or, for policies planned and executed across different agencies and levels of government, poor coordination may end up with an inefficient policy development and allocation of resources (not coordinated planned actions and prepared budgets towards common/shared policy goals), as well as with delays and/or shortages in regard the distribution of funds. For instance, in Rwanda108, the funding and the implementation of gender-based violence policies managed by many agencies across the government (at least four ministries) and district governments, largely relying on donors funding, and with non-public funded services provided by outside of government institutions, organizations and communities, resulting challenging in terms of policy integration and optimal management of resources. Particularly relevant and frequent might be the case of fragmented and uncoordinated different funding mechanisms across levels of governments. In Nigeria health financing streams were highly fragmented across and within levels of government, with health facilities receiving both federal and state resources, contributory health insurance schemes being managed by separate small pools, with the Basic Healthcare Provision Fund creates even more funding pools managed by state and insurance agencies, setting up a very complex system between different levels of governments and operators. 
'''

In [0]:
PROMPT_TEMPLATE = """
You are a PFM diagnostic coder. Your task is to judge whether the INPUT TEXT
contains evidence for a Bottleneck classified under the larger topic: {challenge}.

The specific bottleneck is defined as: {bottleneck}.

An extended definition with more context is given below:

{extended_definition}

Output MUST follow the JSON schema you are given (no extra keys, no prose).

––––– SCOPE LOCK (IMPORTANT) –––––
Treat fragmentation strictly as **funding/financial-management channel fragmentation**.
Program/administrative separation is irrelevant **unless** it creates a **distinct funding flow or a
parallel financial control stream** (e.g., separate accounts, off-budget handling, different release/disbursement rules).

––––– DEFINITIONS (OPERABLE) –––––
• Fragmentation cues (diagnostic features) — funding/finance anchored:
  STRONG (any ONE is sufficient for feature-based evidence):
    - off-budget flows (outside the main budget/TSA/IFMIS); parallel accounts/systems
    - donor vs government **follow different rules/processes** (allocation/release/procurement/reporting)
    - political/ad-hoc channels bypassing normal process (e.g., block/discretionary funds, mid-year approvals outside ADP/MTEF)
    - multiple **financing pools** for the same service managed by different agencies
    - explicit bypass of government systems; pooled fund dissolved → parallel handling
  MODERATE (need TWO together if no STRONG):
    - earmarked/tied funds with separate handling
    - vertical (intergovernmental) fragmentation with **distinct pools** (federal/state/insurer)
    - procedural divergence across **funding streams** (release/reporting/procurement/IFMIS)
    - volatility explicitly revealing separate handling (e.g., “moved off-budget”, “bypassed TSA”)

• Delivery failure / inefficiency (consequences):
  - higher administrative costs/inefficiency/duplication
  - unpredictability/volatility in funding; start–stop projects
  - coordination failures (duplication, underutilization, competition for funds)
  - delays/arrears/cash shortfalls
  - corruption/leakage risk

––––– HARD NEGATIVES (EXCLUDE) –––––
If the text is ONLY about any of the following (with no qualifying funding-channel cue), set hard_negative=true and decision="irrelevant":
  - wage-bill/personnel/payroll control (PSC, hiring discretion, casual workers, **separate budget vote/line**) **on-budget**
  - generic program-budget/accountability structure (e.g., “draws from multiple subprograms”) without distinct funding channels/systems
  - capacity/procurement/audit findings without alternate channels
  - revenue politics (divisible pool shares, opposing OSR reforms) **without** off-budget/ad-hoc fees/parallel accounts
  - donor dependence/withdrawal **without** off-budget/parallel handling or different rules

––––– TRIGGER SPAN REQUIREMENT –––––
Before labeling Relevant, extract verbatim **finance trigger spans** from the INPUT TEXT (not the context) that reference money-flow/control terms such as:
  “off-budget”, “TSA”, “IFMIS”, “parallel account/system”, “pooled fund/pool”, “allocation”, “release”, “warrant”,
  “disbursement”, “on-budget”, “block/discretionary fund”, “mid-year approval”, “ADP/MTEF”, “donor-funded vs government-funded (different rules)”.

––––– ACCEPTANCE RULES –––––
1) is_fragmentation_evidence = true  iff
   ((STRONG >= 1) OR (MODERATE >= 2)) AND trigger_spans (finance words) nonempty AND hard_negative = false.
2) is_fragmentation_plus_failure = true  iff
   is_fragmentation_evidence = true AND failure_present = true.
3) If no explicit **finance** phrase can be quoted for trigger_spans, set decision to "abstain" (even if you suspect relevance).
4) Always fill trigger_spans and failure_spans with verbatim phrases from the INPUT TEXT that justify your labels.

––––– OUTPUT FORMAT –––––
Return ONLY JSON that conforms to the schema provided below. Do not include commentary.

––––– INPUT TEXT –––––
{input_text}

This input text is extracted from the following extended source context. Use it only for corroboration (do not quote from it):
{extended_context}
"""




# Closed vocabularies used by the BottleneckEvidence response model.

# Cues for which, per PROMPT_TEMPLATE, a single occurrence is enough for feature-based evidence.
# NOTE(review): "separate_administration" is listed here, while PROMPT_TEMPLATE's scope lock says
# program/administrative separation is irrelevant unless it creates a distinct funding flow —
# confirm this value is intended to be a strong cue.
StrongCue = Literal[
    "off_budget",
    "parallel_systems",
    "separate_administration",
    "different_rules_processes_across_streams",
    "political_ad_hoc_allocation",
    "mid_year_approval_outside_cycle",
    "block_allocation_outside_process",
    "multiple_financing_pools_different_agencies",
]

# Cues for which, per PROMPT_TEMPLATE, two are needed when no strong cue is present.
# NOTE(review): PROMPT_TEMPLATE lists "explicit bypass of government systems; pooled fund
# dissolved" under STRONG, but "pooled_fund_dissolved_or_bypass" can only be reported as a
# moderate cue with this schema — confirm which side is authoritative.
ModerateCue = Literal[
    "earmarked_or_tied_grants",
    "vertical_fragmentation_intergovernmental",
    "procedural_divergence_reporting_procurement_ifmis",
    "pooled_fund_dissolved_or_bypass",
    "volatility_revealing_fragmentation",
]

# Categories of delivery failure / inefficiency; they mirror the consequence list in PROMPT_TEMPLATE.
FailureType = Literal[
    "inefficiency_higher_admin_costs",
    "unpredictability_funding_volatility",
    "coordination_failure_duplication_underutilization_competition",
    "delays_start_stop",
    "arrears_or_cash_shortfalls",
    "corruption_or_leakage_risk",
]

# Coarse classification of the kind of fragmentation; "other" is the model's default.
Subtype = Literal[
    "donor_gov_parallel",
    "domestic_vs_donor_different_rules",
    "political_ad_hoc_intragovernment",
    "vertical_intergovernmental_fragmentation",
    "program_administration_fragmentation",
    "other",
]

# Overall verdict reported alongside the two boolean flags of BottleneckEvidence.
Decision = Literal[
    "relevant_feature",                 # feature-based evidence satisfied
    "relevant_feature_and_failure",     # fragmentation + failure satisfied
    "irrelevant",                       # hard negative or no cues
    "abstain",                          # insufficient quotable spans or uncertain
]

# Response model passed to service.execute() for the 6.1 validation calls.
# NOTE(review): the class docstring and Field descriptions are presumably surfaced to the model
# through the generated schema, so treat them as prompt text rather than as comments — confirm
# against Service.execute before rewording them.
# NOTE(review): nothing here enforces that `decision` agrees with the two boolean verdicts;
# the downstream cells read only `is_fragmentation_evidence` and `is_fragmentation_plus_failure`.
class BottleneckEvidence(BaseModel):
    """
    Structured judgment for PFM Bottleneck 6.1.
    Follows precision-first rules: strong/moderate cues, hard negatives, and span extraction.
    """
    # Primary decisions
    # The three fields below are required (no default); every other field except `rationale` has a default.
    is_fragmentation_evidence: bool = Field(
        ..., description="Feature-based verdict (diagnostic). True iff STRONG>=1 or MODERATE>=2 and trigger_spans present and not hard_negative."
    )
    is_fragmentation_plus_failure: bool = Field(
        ..., description="Consequence-based verdict. True iff is_fragmentation_evidence and failure_present."
    )
    decision: Decision = Field(
        ..., description='Overall decision among {"relevant_feature","relevant_feature_and_failure","irrelevant","abstain"}.'
    )

    # Evidence for fragmentation
    # Cue lists default to empty; `hard_negative` defaults to False and `subtype` to "other".
    strong_cues: List[StrongCue] = Field(default_factory=list, description="Which strong cues were found.")
    moderate_cues: List[ModerateCue] = Field(default_factory=list, description="Which moderate cues were found.")
    hard_negative: bool = Field(False, description="True if the passage matches any exclusion rule.")
    trigger_spans: List[str] = Field(
        default_factory=list,
        description="Verbatim phrases from the text that support fragmentation cues (must be substrings of the input).",
    )
    subtype: Subtype = Field("other", description="Subtype of fragmentation, if any.")

    # Evidence for failure/inefficiency
    # NOTE(review): the "must be substrings of the input" requirement on the span fields is stated
    # only in the descriptions; no validator in this class checks it.
    failure_present: bool = Field(False, description="True if explicit or clearly implied delivery failure/inefficiency is present.")
    failure_types: List[FailureType] = Field(default_factory=list, description="Categorization of the failure/inefficiency mentioned.")
    failure_spans: List[str] = Field(
        default_factory=list,
        description="Verbatim phrases supporting failure/inefficiency (must be substrings of the input).",
    )

    # Required free-text justification.
    rationale: str = Field(
        ...,
        description="2-3 sentence justification referencing the trigger_spans/failure_spans; avoid speculation; no new facts.",
    )


In [0]:
#node_id, chunk_id, extended_context, extracted_evidence, expert_classification, reason_for_expert_irrelevant = df.sample().iloc[0].values.tolist()

In [0]:
# Peek at the most recent model response, if there is one. `output` is only assigned inside the
# validation loop further down, so a bare reference here raised NameError on a fresh kernel
# (Restart & Run All). Looking the name up defensively shows nothing until a response exists.
globals().get("output")

In [0]:


# Accumulator for the validation pass: one tuple per reviewed row (row fields + model output).
# NOTE(review): the name `l` is rebound later in this notebook for the evidence-extraction pass,
# so re-running the metrics cell after that pass would read a different list.
l = []

In [0]:
# Validate every reviewed row with the 6.1 prompt and collect the structured verdicts in `l`.
# Resume from however many rows are already collected instead of a hard-coded offset: the
# label columns written back onto `df` afterwards need exactly one result per row, which a
# fixed start of 150 cannot deliver on a fresh run. Re-running this cell after an
# interruption simply continues where the previous run stopped.
start, end = len(l), df.shape[0]

for item in tqdm(df.values.tolist()[start:end]):
    # Positional unpacking relies on the column order fixed by `rel_cols`.
    node_id, chunk_id, extended_context, extracted_evidence, expert_classification, reason_for_expert_irrelevant = item
    prompt = PROMPT_TEMPLATE.format(
        challenge=challenge,
        bottleneck=bottleneck,
        extended_definition=extended_definition,
        input_text=extracted_evidence,
        extended_context=extended_context,
    )

    output = service.execute(
        prompt=prompt,
        model=DEFAULT_LLM_MODEL,
        response_model=BottleneckEvidence,
        temperature=0.0,
        system_message=system_prompt
    )
    # Keep the original row fields next to the model output; the metrics cell indexes from the end.
    l.append((node_id, chunk_id, extended_context, extracted_evidence, expert_classification, reason_for_expert_irrelevant, output))
    

In [0]:
def relevance_boolean(x):
    """Return True when an expert review label means "Relevant".

    The comparison ignores surrounding whitespace and letter case, so hand-entered
    spreadsheet values such as "Relevant " or "relevant" are not silently counted as
    negatives. Any other label, and any non-string value (e.g. NaN for a blank cell),
    yields False.

    Args:
        x: Raw cell value from the review-result column.

    Returns:
        bool: True only for a string that equals "relevant" after normalisation.
    """
    return isinstance(x, str) and x.strip().lower() == 'relevant'

In [0]:
# Line up the expert label with both model verdicts. Each entry of `l` ends with
# (..., expert_classification, reason, output), hence the indices -3 and -1.
m = [
    (
        relevance_boolean(entry[-3]),
        entry[-1].is_fragmentation_evidence,
        entry[-1].is_fragmentation_plus_failure,
    )
    for entry in l
]
df_results = pd.DataFrame(m, columns = ['expert_label', 'model1_label', 'model2_label'])


def _precision(frame, predicted_col):
    """Fraction of rows flagged in `predicted_col` that the expert also marked relevant (0 if none are flagged)."""
    flagged = frame.query(f"{predicted_col} == True")
    n_flagged = flagged.shape[0]
    if n_flagged > 0:
        return flagged.query("expert_label == True").shape[0] / n_flagged
    return 0


precision_m1 = _precision(df_results, 'model1_label')
precision_m2 = _precision(df_results, 'model2_label')

print(f"Precision (Model 1): {precision_m1:.2f}")
print(f"Precision (Model 2): {precision_m2:.2f}")

In [0]:
# Copy both model verdict columns onto the reviewed frame; this requires one result per row of `df`.
for label_col in ('model1_label', 'model2_label'):
    df[label_col] = df_results[label_col].values.tolist()

In [0]:
df.sample(5)

In [0]:
df.to_csv('6_1_revised_extarction_validation_results.csv', index=False)

### Run on the prefiltered list of 600 chunks

In [0]:
from enum import Enum

In [0]:
# Load the chunk table that carries the prefilter verdict and keep only rows whose
# `prefilter_results` value is truthy in the boolean mask.
df_chunks = pd.read_csv('per_pfr_chunks_with_prefilter_results.csv')
prefilter_mask = df_chunks['prefilter_results']
df_rel_chunks = df_chunks.loc[prefilter_mask]

In [0]:
# The prefilter was run on the full chunk, so these rows do not yet have an extracted evidence
# text. The models defined below are used to obtain that candidate evidence first.
class ConfidenceLevel(str, Enum):
    # Three-level confidence scale. Mixing in `str` makes each member compare equal to its plain string value.
    strong = "strong"
    borderline = "borderline"
    weak = "weak"

# Shared base for evidence-extraction response models: contributes the optional `confidence` field.
class BottleneckBase(BaseModel):
    # Optional; defaults to None when no confidence level is reported.
    confidence: Optional[ConfidenceLevel] = Field(
        None,
        description=(
            "How confidently the extracted evidence supports the bottleneck. "
            "Choose 'strong' if the evidence clearly and directly supports the bottleneck, "
            "'borderline' if it is somewhat relevant but may be open to interpretation, "
            "and 'weak' if the evidence is tenuous, ambiguous, or only indirectly related."
        )
    )

# Response model for the evidence-extraction pass over prefiltered chunks.
# The class name appears to be carried over from a bottleneck 2.1 variant of this workflow and is
# kept because the extraction loop passes it as `response_model`. The field descriptions
# previously asked for evidence of fragmented *policy design* (bottleneck 2.1), contradicting
# the 6.1 funding-channel prompt built by make_bottleneck_prompt; they now describe 6.1.
class Bottleneck_2_1(BottleneckBase):
    # Optional; None (or an empty string) means no evidence was found for the chunk.
    extracted_evidence: Optional[str] = Field(
        None,
        description=(
            "Verbatim excerpt from the text that provides concrete evidence of ad hoc, political, or fragmented funding channels. "
            "Look for examples of multiple uncoordinated funding mechanisms, parallel or off-budget management systems (especially donor-run), "
            "discretionary or politically influenced allocations, or poor coordination of funding across agencies or levels of government. "
            "Do not extract vague critiques of funding levels or general governance weakness without explicit reference to how funding is channelled or managed. "
            "Use only direct text from the source; do not paraphrase or infer."
        )
    )
    # Optional short justification tied to the quoted excerpt.
    reasoning: Optional[str] = Field(
        None,
        description=(
            "Brief explanation of how the extracted text illustrates ad hoc, political, or fragmented funding channels. "
            "The reasoning should clarify why the excerpt demonstrates fragmented or uncoordinated funding, and avoid interpretation beyond the quoted material."
        )
    )

def make_bottleneck_prompt(text: str) -> str:
    """Build the evidence-extraction prompt for the funding-fragmentation bottleneck.

    The prompt fixes the role, challenge and bottleneck descriptions and appends the
    excerpt to analyse at the end.

    Args:
        text: Document excerpt (chunk) to search for evidence; interpolated verbatim.

    Returns:
        The prompt string. It is returned exactly as written below, including the source
        indentation and the leading/trailing newlines (no strip or dedent is applied).
    """

    return f"""
        You are analyzing a public finance document to identify specific bottlenecks affecting development outcomes.
        
        The context for your analysis is as follows:
        
        Role of Public Finance: Effective Resource Mobilization & Distribution
        role_description: Governments need to raise and allocate and influence private financial resources in support of the pursuit of their policy objectives, ensuring both that sufficient resources are available when needed, and that these are allocated according to needs and cost-effectiveness criteria. How governments do this has important distributional impacts and can influence public and private behaviour towards achievement of objectives.
        
        PFM Challenge: Unreliable, delayed and fragmented funding for delivery
        → challenge_description: Assesses how predictable, timely, and well-coordinated funding flows are, and whether fragmentation or delays impede delivery
        
        Specific Bottleneck: Ad hoc, Political and Fragmented Funding Channels
        → bottleneck description: Governments and public sector entities often rely on multiple, uncoordinated funding mechanisms—such as general funds, earmarked revenues, donor funding, and intergovernmental transfers. These mechanisms often lack integration, leading to volatility, administrative duplication, and fragmented service delivery. Discretionary or politically influenced allocation, parallel management systems (especially from donors), and poor coordination across agencies or levels of government compound inefficiencies. Common issues include delays, incomplete disbursements, excessive reporting burdens, and poor alignment of funding with long-term plans or shared objectives.
        
        ---
        
        Your task:
        
        - Carefully read the excerpt below.
        - Extract direct evidence from the text that clearly supports the presence of the specific bottleneck listed above.
        - Only extract text that is explicitly present in the excerpt.
        - Do not infer, assume, or include information that is not stated.
        - If you find no clear evidence, return null.
        
        For each piece of extracted evidence, briefly explain your reasoning (i.e., why this excerpt indicates the bottleneck), and indicate if the match is ambiguous.
        
        Text to analyze:
        
        {text}
        """

In [0]:
DEFAULT_SYSTEM_PROMPT = '''You are a public finance expert working with a multilateral development institution.

    Your task is to carefully read and analyze Public Finance Review (PFR) documents, budget support documents, or other fiscal diagnostics and project reports produced by institutions such as the World Bank or IMF.

    You are trained to identify and extract supporting evidence for specific Public Financial Management (PFM) bottlenecks, based on a predefined set of challenges and bottlenecks.

    The evidence may appear in explicit or implicit form. You should be attentive to:
    - Descriptions of institutional weakness or fragmentation
    - Observations about policy design vs implementation
    - Statements about fiscal sustainability, resource adequacy, or funding flows
    - Structural, capacity-related, or political-economy constraints
    - Trends, examples, or observations—both qualitative and quantitative

    You will return evidence *only if it clearly supports the described bottleneck*, otherwise leave it blank.

    Be concise but specific. Use quotes from the document when possible or summarize tightly if quoting is impractical.
    '''
# Extract candidate 6.1 evidence from every prefiltered chunk; chunks for which the model
# returns no evidence are dropped.
# NOTE(review): this rebinds `l`, discarding the validation results collected earlier in the
# notebook. The name is kept because the next cell builds `ddf` from `l`.
l = []

# iterrows() is a generator without a length, so `total` is passed explicitly; otherwise tqdm
# can only show a running count with no percentage or ETA.
for row_index, row in tqdm(df_rel_chunks.iterrows(), total=len(df_rel_chunks)):
    # The frame is expected to have exactly four columns; only the chunk text is used here.
    node_id, chunk_id, text, _prefilter_tag = row
    prompt = make_bottleneck_prompt(text=text)
    result = service.execute(
        prompt=prompt,
        model=DEFAULT_LLM_MODEL,
        response_model=Bottleneck_2_1,
        temperature=0.0,
        system_message=DEFAULT_SYSTEM_PROMPT
    )
    if result.extracted_evidence:
        # Keep the original row values and append the extracted excerpt as a fifth field.
        l.append(row.values.tolist() + [result.extracted_evidence])
        
        

In [0]:
cols =  ['node_id', 'chunk_id', 'chunk', 'prefilter_tag', 'extracted_evidence']
ddf = pd.DataFrame(l, columns =cols)

In [0]:
ddf.sample(5)

In [0]:
new_output = []

In [0]:
# Run the revised validation model on the chunks for which evidence was extracted.
# Resume from the number of results already collected rather than a hard-coded offset: the
# label columns added to `ddf` afterwards need exactly one entry of `new_output` per row, which
# a fixed start of 300 cannot provide on a fresh run.
start, end = len(new_output), ddf.shape[0]

for item in tqdm(ddf.values.tolist()[start:end]):
    # Column order follows `cols`: the full chunk serves as the extended context here.
    node_id, chunk_id, extended_context, _prefilter_tag, extracted_evidence = item
    prompt = PROMPT_TEMPLATE.format(
        challenge=challenge,
        bottleneck=bottleneck,
        extended_definition=extended_definition,
        input_text=extracted_evidence,
        extended_context=extended_context,
    )

    output = service.execute(
        prompt=prompt,
        model=DEFAULT_LLM_MODEL,
        response_model=BottleneckEvidence,
        temperature=0.0,
        system_message=system_prompt
    )
    new_output.append((node_id, chunk_id, extended_context, extracted_evidence, output))
    

In [0]:
ddf['model1_label'] = [item[-1].is_fragmentation_evidence for item in new_output]
ddf['model2_label'] = [item[-1].is_fragmentation_plus_failure for item in new_output]

In [0]:
ddf.sample(5)

In [0]:
ddf.to_csv('prefiltered_6_1_extraction_validation_results.csv', index=False)

In [0]:
ddf.model1_label.value_counts()

### Add summaries

In [0]:
ddf=pd.read_csv('prefiltered_6_1_extraction_validation_results.csv')

In [0]:
# Load the full chunk table and the per-document metadata used by the helper functions below.
# NOTE(review): this rebinds `df_chunks`, which earlier in the notebook held the prefilter results file.
df_chunks = pd.read_csv('per_pfr_chunks.csv')
df_docs =  pd.read_csv('per_pfr_document_data.csv')
# Lookup keyed by each row's first two values, mapping to its third value.
# Presumably (node_id, chunk_id) -> chunk text — TODO confirm the CSV column order.
d_chunks = {(x[0], x[1]):x[2] for x in df_chunks.values.tolist()}

In [0]:
def get_extended_context(node_id, chunk_id, n=2):
    """Join a chunk with its neighbouring chunks into a single context string.

    Args:
        node_id: Document identifier; first component of the `d_chunks` key.
        chunk_id: Position of the central chunk; neighbours are looked up at `chunk_id + offset`.
        n: Number of chunks to include on each side of the central chunk (default 2).

    Returns:
        The chunks at offsets -n..n joined by blank lines. A neighbour that is missing from
        `d_chunks` contributes an empty string, so its separator is still emitted.
    """
    # The window used to be hard-coded to [-2, -1, 0, 1, 2], which silently ignored `n`.
    return '\n\n'.join(d_chunks.get((node_id, chunk_id + offset), '') for offset in range(-n, n + 1))

def get_metadata(node_id):
    """Look up descriptive metadata for a document in `df_docs`.

    Args:
        node_id: Document identifier matched against the `node_id` column.

    Returns:
        A 4-tuple (cntry_name, doc_name, admin_rgn_name, ent_topic_text) taken from the
        first matching row, or four empty strings when no row matches or the lookup fails.
    """
    blank = ('', '', '', '')
    fields = ['cntry_name', 'doc_name', 'admin_rgn_name', 'ent_topic_text']
    try:
        matches = df_docs.loc[df_docs['node_id'] == node_id, fields]
        # First matching row as a plain tuple; `blank` when there is no match.
        return next(matches.itertuples(index=False, name=None), blank)
    except Exception:
        # Still best-effort (e.g. a missing column), but unlike a bare `except:` this no longer
        # swallows KeyboardInterrupt / SystemExit during long loops.
        return blank


In [0]:
get_metadata(5669851)

In [0]:
ddf['extended_context'] = ddf.apply(lambda x: get_extended_context(x['node_id'], x['chunk_id']), axis=1)

In [0]:
ddf.sample(3)

In [0]:
examples = [
    "Student loan repayment rates in Lesotho and Tanzania: There is an announced policy on loan repayment but in practice repayments are not collected with much effort or consistency, so a larger share of public financing for education goes to post-secondary education than the policy requires.",
    "Liberia: Budget credibility remains weak. Liberia has struggled to implement budgets as planned – aggregate expenditure outturns have significantly deviated from approved budgets, indicating limited credibility. Public spending has been highly volatile (rising sharply during aid-fueled booms and then contracting), undermining a stable counter-cyclical policy.",
    "Political commitment to gender equality: According to a recent survey of 12 developing countries, almost all policy-makers (96%) state that 'schools should promote gender equality' yet at the same time almost half (47%) also believe that 'mothers working is bad for their children'. This demonstrates the often large gap between policy rhetoric and policy commitment regarding whether girls should have the same opportunities as boys.",
    "UNFPA estimate that $42 billion of investment is needed to end gender-based violence in 132 priority countries by 2030, of which only $9bn has been identified, leaving a funding gap of $32.5 billion.",
    "Despite evidence that primary health care is the most cost-effective and equitable way to make progress towards UHC, government health spending devoted to primary health care is low across all income groups: 33% in LICs, 36% in LMICs, 34% in UMICs and 36% in HICs (Hanson et al., 2022).",
    "Albania projected a 17% funding gap against its GBV strategy costs.",
    "In Belgium, cuts in the federal budget had resulted in disparities in the GBV policies issued at the regional level, as well as the reduction of funding of the voluntary sector."
]

# NOTE(review): these two values describe a different bottleneck ("Inadequate Commitment of
# Political and Technical Leadership") than the 6.1 funding-fragmentation bottleneck this
# notebook processes. In the cells visible here, `bottleneck_name` is only passed to
# make_summary_from_structure_prompt (whose prompt text does not interpolate it) and
# `bottleneck_description` is not passed anywhere (the summary loop passes `extended_definition`).
# Presumably leftovers from another bottleneck's notebook — confirm before reusing them.
bottleneck_name = 'Inadequate Commitment of Political and Technical Leadership'
bottleneck_description = (
                    "This bottleneck applies when there is a clear lack of sustained commitment by political or technical leaders to implement approved policies. "
                    "This includes delays, resistance, or failure to act when reforms threaten the status quo, require politically difficult trade-offs, "
                    "or demand resource shifts that are not followed through despite stated priorities. "
                    "Examples include: approved reforms not being enacted, persistent underfunding of a priority despite commitments, or misalignment between stated goals and actual budget execution. "
                    "Do **not** classify general governance weakness, vague statements, or budget/funding gaps **unless** directly tied to political/technical unwillingness or inaction. "
                    "Be careful to distinguish from other bottlenecks like 2.1 (coordination failures), 5.2 (disconnect between budgets and policy), or 6.3 (weak execution)."
                )


# Response model for the first summarisation call: structured facts pulled from the evidence.
# `country` and `key_constraint` are required; every other field is optional and defaults to None.
# The `reference_outcome` description previously asked what outcome is affected "by the lack of
# leadership commitment" (wording for a different bottleneck); it now refers to the bottleneck
# under analysis, whatever that is, so it matches the prompt it is used with.
class StructuredSummaryFields(BaseModel):
    country: str = Field(..., description="Country referenced in the extarcted evidence.")
    issue_area: Optional[str] = Field(None, description="Main sector or topic affected by the bottleneck.")
    reference_to_policy_or_program: Optional[str] = Field(None, description="Any specific or general policy referenced in the context of the extracted evidence.")
    reference_outcome: Optional[str] = Field(None, description="What development or sectoral outcome is affected by the bottleneck? This could include unmet policy goals, reduced service delivery, poor implementation, or failure to achieve intended change. This is from the reference extracted evidence for the bottleneck")
    key_constraint: str = Field(..., description="Describe what constraint is being evidenced.")
    observed_consequence: Optional[str] = Field(None, description="Any outcome or effect of the constraint.")
    metric_or_statistic: Optional[str] = Field(None, description="Quantitative detail if present (e.g., a funding gap, percentage).")
    closest_sdg: Optional[str] = Field(None, description="If directly relevant, mention the closest Sustainable Development Goal (SDG). Leave blank if nothing is directly relevant. IF present and relevant then report this in the form  in this example format'SDG 16: Peace, Justice and Strong Institutions'")
    closest_sdg_target: Optional[str] = Field(None, description="If directly relevant, mention the closest Sustainable Development Goal (SDG) target. Leave blank if nothing is directly relevant. If relevant use the format in the following example: 'Eliminate all harmful practices, such as child, early and forced marriage and female genital mutilation' ")

# Response model for the second summarisation call: a single required free-text summary.
class StylizedSummary(BaseModel):
    summary_text: str = Field(..., description="A short, stylized summary suitable for inclusion in a public finance report.")


def make_structured_summary_prompt(
    context_text: str,
    extracted_evidence: str,
    bottleneck_name: str,
    bottleneck_description: str,
    metadata_tuple: tuple,
    examples: list[str]
) -> str:
    """Build the prompt that asks the model for structured summary fields.

    Args:
        context_text: Surrounding document text shown as the full context.
        extracted_evidence: Validated excerpt, quoted inside triple double-quotes.
        bottleneck_name: Bottleneck title shown in bold and repeated before the quote.
        bottleneck_description: Definition text shown after the title.
        metadata_tuple: Exactly four values: country, document title, region, topics.
        examples: Example summaries, rendered as a "- " bullet list at the end.

    Returns:
        The assembled prompt with leading and trailing whitespace stripped.

    Raises:
        ValueError: If `metadata_tuple` does not unpack into exactly four values.
    """
    doc_country, doc_title, doc_region, doc_topics = metadata_tuple
    bullet_examples = "\n".join(["- {}".format(sample) for sample in examples])

    prompt = f"""
    You are a public finance expert working at a multilateral development bank.

    Your task is to extract structured components that support stylized, policy-relevant summaries
    of public financial management (PFM) bottlenecks.

    You are given:
    - Context text from a fiscal diagnostic report
    - A validated excerpt from that context which supports a known PFM bottleneck referred to as the extracted evidence
    - Document metadata (e.g., country, region, topics)
    - A bottleneck definition (e.g., inadequate leadership commitment, funding fragmentation)

    ---

    Document:
    - Country: {doc_country}
    - Title of document: {doc_title}
    - Region: {doc_region}
    - Topics: {doc_topics}

    Bottleneck:
    **{bottleneck_name}** → {bottleneck_description}

    ---

    Here is the full context from the document:
    {context_text}

    Here is the specific quote that was validated as evidence (extracted evidence) for the bottleneck titled {bottleneck_name}:
    \"\"\"{extracted_evidence}\"\"\"

    ---

    Your task is to extract the following structured fields:
    - Country
    - Issue Area (e.g., health, education, fiscal management, GBV, etc.)
    - Reference to policy or program (if applicable)
    - Key constraint (explain what’s blocking implementation or delivery)
    - Observed consequence (if mentioned)
    - Metric or statistic (if present)

    Only extract what is clearly grounded in the text. Do not invent or infer missing details.

    ---

    To guide your understanding of the level of detail and relevance we expect, here are some example summaries that your structured fields will ultimately help produce:

    {bullet_examples}
    """
    return prompt.strip()

def make_summary_from_structure_prompt(
    extracted_evidence: str,
    structured: StructuredSummaryFields,
    bottleneck_name: str,
    examples: list[str]
) -> str:
    """Build the prompt that turns structured fields into a short report-style summary.

    Args:
        extracted_evidence: Validated excerpt; stated to be the main content of the summary.
        structured: Structured fields from the previous step, embedded as indented JSON.
        bottleneck_name: Accepted for interface compatibility; not interpolated into the prompt.
        examples: Example summaries whose tone should be matched, rendered as "- " bullets.

    Returns:
        The assembled prompt with leading and trailing whitespace stripped.
    """
    tone_examples = "\n".join(["- {}".format(sample) for sample in examples])

    prompt = f"""
You are writing a concise, report-style summary of a public finance implementation bottleneck. The main content for this summary is the extracted evidence from document chunks: {extracted_evidence}

In order to frame the summary -- you can refer to information in the structured input as well as some sample examples as shown below. 

Match the tone following examples:

{tone_examples}

---

Use the structured information below to generate a clear, concise summary (1–5 sentences).
The summary should:
- Mention the **country**
- Describe the **constraint** clearly
- Refer to the **policy or program** (if relevant)
- Include **consequence or impact**
- Refer to the **outcome** (if relevant)
- Mention **quantitative figures** if present
- Do not remove any information that is already present. Keep all existing details but structure it in form anf tone of the examples shown.
- Do not add **extra commentary** unless this information is already present in the input information. 
- Do not explicitly include SDGs unless mentioned in the extracted evidence. 
- Do not use **any** language **suggesting causal effects and conclusions if not explicitly present** in the input context 

Structured input:
{structured.model_dump_json(indent=2)}
"""
    return prompt.strip()



In [0]:
ddf_selected = ddf[ddf.model2_label]

In [0]:
# For every row of `ddf_selected`, ask the model for structured fields and then for a stylized
# summary built from those fields; collect one flat record per row.
final_output = []
# iterrows() is a generator without a length, so `total` is passed to give tqdm a real progress bar.
for row_index, row in tqdm(ddf_selected.iterrows(), total=len(ddf_selected)):
    extracted_evidence = row['extracted_evidence']
    node_id = row['node_id']
    chunk_id = row['chunk_id']
    context_text = row['extended_context']
    metadata_tuple = get_metadata(node_id)

    structured_output = service.execute(
        prompt=make_structured_summary_prompt(
            context_text=context_text,
            extracted_evidence=extracted_evidence,
            bottleneck_name=bottleneck,
            bottleneck_description=extended_definition,
            metadata_tuple=metadata_tuple,
            examples=examples
        ),
        model=DEFAULT_LLM_MODEL,
        response_model=StructuredSummaryFields
    )

    summary_result = service.execute(
        prompt=make_summary_from_structure_prompt(
            extracted_evidence=extracted_evidence,
            structured=structured_output,
            # Use the same 6.1 bottleneck as the structured step. The previous value was the
            # name of an unrelated bottleneck ("Inadequate Commitment ...").
            bottleneck_name=bottleneck,
            examples=examples
        ),
        model=DEFAULT_LLM_MODEL,
        response_model=StylizedSummary
    )
    final_summary = summary_result.summary_text
    # Row identifiers and texts first, then every structured field as its own key.
    result = {
        'node_id': node_id,
        'chunk_id': chunk_id,
        'extended_context': context_text,
        'extracted_evidence': extracted_evidence,
        'final_summary': final_summary,
        **structured_output.model_dump(),
    }
    final_output.append(result)

In [0]:
df_final = pd.DataFrame(final_output)

In [0]:
df_final.to_csv('prefiltered_6_1_extraction_validation_results_with_summaries.csv', index=False)