In [0]:
from dotenv import load_dotenv
import pandas as pd
from service import Service
load_dotenv()
from typing import List, Literal, Optional
from pydantic import BaseModel, Field
import json
import math
from typing import Dict, Any, List
from tqdm import tqdm
import pandas as pd
# Field order of each row unpacked by the evaluation loop further down:
# node_id, chunk_id, extended_context, extracted_evidence, expert_classification, reason_for_expert_irrelevant
# (Kept as a comment: as a bare expression these names raise NameError on a fresh
# kernel, which stops this cell before `service` and `model_name` are created.)
from service import Service
from consts import DEFAULT_LLM_MODEL
# Shared Service instance; the evaluation loop passes it to classify_with_panel,
# which issues every LLM request through service.execute(...).
service = Service()
# Model identifier taken from the project constant.
# NOTE(review): model_name is not referenced in the visible cells, which pass
# DEFAULT_LLM_MODEL directly — confirm whether it is still needed.
model_name = DEFAULT_LLM_MODEL



In [0]:
# Load the expert-reviewed sheet for bottleneck 6.1 and keep only the rows that
# carry a review verdict, restricted to the six columns the evaluation loop unpacks.
review_col = 'Review Result (Relevant, Irrelevant)'
rel_cols = ['node_id', 'chunk_id', 'extended_context', 'extracted_evidence', review_col,
            'Reason for irrelevant or unsure ']  # trailing space kept as-is; presumably matches the sheet header — verify
df = pd.read_excel('bottleneck_final_summarized_for_review.xlsx', sheet_name='6.1')
# A single filter is enough: selecting columns cannot reintroduce missing verdicts,
# so the second notna() pass of the original was a no-op.
df = df.loc[df[review_col].notna(), rel_cols]

In [0]:
# Challenge family label; substituted into the prompt templates as {challenge}.
challenge = '''Unreliable, delayed and fragmented funding for delivery'''
# The specific bottleneck being evaluated; substituted into the templates as {bottleneck}.
bottleneck = '''6.1 Ad hoc, political and fragmented funding channels contribute to ineffective and inefficient delivery'''
# Long-form definition of bottleneck 6.1; substituted into the prompt templates as
# {extended_definition}. The text is sent to the model verbatim, so it is left untouched.
# NOTE(review): the text itself opens with "Extended definition:" and the templates print
# the same label right before the placeholder, so the label appears twice in the rendered prompt.
extended_definition = '''
Extended definition: The funding for service delivery and projects might be fragmented due to the existence of different funding sources or channels, such as budget general funds (general non-earmarked funding from the budget) from different ministries/agencies, earmarked funds from transfers coming from other governments, institutions and organizations and from earmarked government revenues, as well as funding coming from donors. Dealing with multiple funding can be challenging, especially if there is not an integrated and consolidated planning and management of such funding (these funds being managed with their corresponding parallel systems). Different management stages and procedures, non-consolidated information about the expected, distributed and consumed funds, as well as different levels of discretion in regard the use of the funds can make difficult a rational policy planning (due to uncertainty and lack of predictability of funding) and implementation, in addition to a more costly administration and higher risks of corruption and funds diversion.     

Political discretion on the management of specific funds may create relevant obstacles to adequate planning and execution of service delivery and projects, as it may imply significant levels of uncertainty in regard to the allocated resources. Changing priorities, biases towards new projects and other elements may reduce the predictability and effective availability of funding for specific services and projects, making it difficult for agencies and departments to plan effectively and sustain long-term projects. This volatile nature of funding can lead to start-stop patterns in project execution, for example, where projects are initiated but then stalled due to sudden withdrawals or delays in funding.   

The atomization and fragmentation of funding, with different parallel management (with their own stages and processes), reporting mechanisms and information systems contributes to administrative burdens and inefficient delivery, as well as making more difficult ensure a sound coordination, planning and execution of projects. These parallel systems may exist, for example, because of donors' contributions to the financing of specific services or projects, in order to fulfill their own financial management and control and reporting requirements, but at the cost of hindering client government’s systems and the integrated management of funds and service and project planning and coordination.  For example, donor’s use of Ethiopia’s public financial management and procurement systems significantly declined between the middle and end of the decade of 2010, with more than 50% of donor funding routed through parallel systems , creating complexity and lack of integration of services funding. Alike, in Uganda and Indonesia, the existence of multiple fragmented sources of financing created major challenges for local government planning, budget formulation and execution in regard of education policies.   

In addition, the effects of the fragmentation of the different funding channels can foster, and be amplified, by the lack of adequate coordination between sectors, levels of government and partners, resulting in a duplication of efforts, and inefficiencies on the allocation, channeling and execution of resources.  For example, different agencies might end up competing for the same funds or, conversely, some funds might remain underutilized due to lack of awareness or coordination among potential beneficiaries, or, for policies planned and executed across different agencies and levels of government, poor coordination may end up with an inefficient policy development and allocation of resources (not coordinated planned actions and prepared budgets towards common/shared policy goals), as well as with delays and/or shortages in regard the distribution of funds. For instance, in Rwanda108, the funding and the implementation of gender-based violence policies managed by many agencies across the government (at least four ministries) and district governments, largely relying on donors funding, and with non-public funded services provided by outside of government institutions, organizations and communities, resulting challenging in terms of policy integration and optimal management of resources. Particularly relevant and frequent might be the case of fragmented and uncoordinated different funding mechanisms across levels of governments. In Nigeria health financing streams were highly fragmented across and within levels of government, with health facilities receiving both federal and state resources, contributory health insurance schemes being managed by separate small pools, with the Basic Healthcare Provision Fund creates even more funding pools managed by state and insurance agencies, setting up a very complex system between different levels of governments and operators. 
'''

In [0]:
#node_id, chunk_id, extended_context, extracted_evidence, expert_classification, reason_for_expert_irrelevant = df.sample().iloc[0].values.tolist()

In [0]:
def relevance_boolean(x):
    """Return True when an expert review label means "Relevant".

    The labels are read from a spreadsheet review column, so the comparison
    ignores surrounding whitespace and letter case; an exact ``== 'Relevant'``
    would silently count ``' relevant '`` or ``'RELEVANT'`` as not relevant.

    Args:
        x: The raw cell value (normally a string).

    Returns:
        bool: True for a "Relevant" label; False for any other string and for
        non-string values such as None or NaN.
    """
    if not isinstance(x, str):
        return False
    return x.strip().casefold() == 'relevant'

In [0]:


# Closed vocabularies used by the BottleneckEvidence schema (the Judge's output).
#
# The cue names follow the STRONG / MODERATE lists spelled out in PROMPT_ADVOCATE and the
# vocabulary PanelOpinion validates, so that cues reported by the Advocate and Skeptic can
# be carried into the Judge's structured answer unchanged. The previous lists here were a
# different, older vocabulary: they contained "separate_administration" (which the scope
# lock in SYSTEM_PROMPT_BASE declares irrelevant) and ranked the bypass/pooled-fund cue as
# MODERATE although the prompts list it as STRONG.
StrongCue = Literal[
    "off_budget",
    "parallel_systems",
    "donor_vs_gov_different_rules",
    "political_ad_hoc_channel",
    "mid_year_outside_cycle",
    "multiple_financing_pools",
    "bypass_gov_systems_or_pooled_dissolved",
]

ModerateCue = Literal[
    "earmarked_tied_with_separate_handling",
    "vertical_fragmentation_distinct_pools",
    "procedural_divergence_across_streams",
    "volatility_revealing_separate_handling",
]

# Kinds of delivery failure / inefficiency a passage may report.
FailureType = Literal[
    "inefficiency_higher_admin_costs",
    "unpredictability_funding_volatility",
    "coordination_failure_duplication_underutilization_competition",
    "delays_start_stop",
    "arrears_or_cash_shortfalls",
    "corruption_or_leakage_risk",
]

# Coarse category of the fragmentation; "other" is the schema default.
Subtype = Literal[
    "donor_gov_parallel",
    "domestic_vs_donor_different_rules",
    "political_ad_hoc_intragovernment",
    "vertical_intergovernmental_fragmentation",
    "program_administration_fragmentation",
    "other",
]

# Overall verdict values.
Decision = Literal[
    "relevant_feature",                 # feature-based evidence satisfied
    "relevant_feature_and_failure",     # fragmentation + failure satisfied
    "irrelevant",                       # hard negative or no cues
    "abstain",                          # insufficient quotable spans or uncertain
]

# Structured output requested from the Judge call in classify_with_panel.
# The class docstring and every Field description are kept as runtime text (pydantic
# may surface them in the schema handed to the model — verify against Service), so
# reviewer notes live in `#` comments instead.
class BottleneckEvidence(BaseModel):
    """
    Structured judgment for PFM Bottleneck 6.1.
    Follows precision-first rules: strong/moderate cues, hard negatives, and span extraction.
    """
    # Primary decisions
    # The grouping "(STRONG>=1 or MODERATE>=2) and ..." is spelled out with parentheses so
    # it reads the same as acceptance rule 1 in PROMPT_JUDGE; without them the and/or
    # precedence is ambiguous (hard_negative could be read as applying to MODERATE only).
    is_fragmentation_evidence: bool = Field(
        ..., description="Feature-based verdict (diagnostic). True iff (STRONG>=1 or MODERATE>=2) and trigger_spans present and not hard_negative."
    )
    is_fragmentation_plus_failure: bool = Field(
        ..., description="Consequence-based verdict. True iff is_fragmentation_evidence and failure_present."
    )
    decision: Decision = Field(
        ..., description='Overall decision among {"relevant_feature","relevant_feature_and_failure","irrelevant","abstain"}.'
    )

    # Evidence for fragmentation
    strong_cues: List[StrongCue] = Field(default_factory=list, description="Which strong cues were found.")
    moderate_cues: List[ModerateCue] = Field(default_factory=list, description="Which moderate cues were found.")
    hard_negative: bool = Field(False, description="True if the passage matches any exclusion rule.")
    trigger_spans: List[str] = Field(
        default_factory=list,
        description="Verbatim phrases from the text that support fragmentation cues (must be substrings of the input).",
    )
    subtype: Subtype = Field("other", description="Subtype of fragmentation, if any.")

    # Evidence for failure/inefficiency
    failure_present: bool = Field(False, description="True if explicit or clearly implied delivery failure/inefficiency is present.")
    failure_types: List[FailureType] = Field(default_factory=list, description="Categorization of the failure/inefficiency mentioned.")
    failure_spans: List[str] = Field(
        default_factory=list,
        description="Verbatim phrases supporting failure/inefficiency (must be substrings of the input).",
    )

    # Free-text justification; required (no default).
    rationale: str = Field(
        ...,
        description="2-3 sentence justification referencing the trigger_spans/failure_spans; avoid speculation; no new facts.",
    )


In [0]:
# System message shared by the three panel roles; classify_with_panel appends
# "\nROLE: <NAME>" to it for each call. Built line by line so that each instruction
# can be read (and diffed) on its own; the resulting string starts and ends with "\n".
_SYSTEM_PROMPT_LINES = (
    "You are a public financial management (PFM) diagnostic analyst.",
    "Task: determine if a passage evidences Bottleneck 6.1: “Ad hoc, political, and fragmented funding channels contribute to ineffective and inefficient delivery.”",
    "Scope lock: Treat fragmentation strictly as funding/financial-management channel fragmentation. Program/administrative separation is irrelevant unless it creates a distinct funding flow or parallel financial control stream.",
    "Always quote verbatim trigger spans from the INPUT TEXT. Be precise and conservative.",
)
SYSTEM_PROMPT_BASE = "\n" + "\n".join(_SYSTEM_PROMPT_LINES) + "\n"

# Task framing common to all three roles. Placeholders are filled with str.format:
# {challenge}, {bottleneck}, {extended_definition}, {input_text}, {extended_context}.
PROMPT_SHARED_HEADER = """
Bottleneck family: {challenge}
Specific bottleneck: {bottleneck}

Extended definition:
{extended_definition}

INPUT TEXT:
{input_text}

Extended source context (for corroboration only; do not quote from it):
{extended_context}
"""

# Role instructions for the recall-oriented panelist.
_ADVOCATE_INSTRUCTIONS = """
ROLE = ADVOCATE
Goal: make the best good-faith case for relevance (maximize recall) while staying within the scope lock.

What to look for:
- Funding-channel fragmentation cues:
  STRONG: off-budget; parallel accounts/systems; donor vs government with different rules/processes; political/ad-hoc channels (block/mid-year) bypassing normal cycle; multiple financing pools managed by different agencies; bypass of government systems/pooled fund dissolved→parallel handling.
  MODERATE: earmarked/tied with separate handling; vertical IG fragmentation with distinct pools; procedural divergence across funding streams (release/reporting/procurement/IFMIS); volatility revealing separate handling (“moved off-budget”, “bypassed TSA”).
- Delivery failure/inefficiency: higher admin costs, unpredictability/start–stop, coordination failure/duplication/underutilization/competition, delays/arrears/cash shortfalls, corruption/leakage.

Output JSON strictly as the given schema. Quote finance trigger spans from the INPUT TEXT.
"""

# Role instructions for the precision-oriented panelist.
_SKEPTIC_INSTRUCTIONS = """
ROLE = SKEPTIC
Goal: test and narrow (maximize precision). Accept only funding-channel fragmentation. Reject hard negatives.

Hard negatives (irrelevant unless a qualifying funding-channel cue exists):
- Wage-bill/personnel/payroll control (PSC, casual workers, “separate budget vote/line”) on-budget
- Program-budget/accountability structure without distinct funding channels/systems
- Capacity/procurement/audit issues without alternate channels
- Revenue politics (divisible pool shares, OSR) without off-budget/ad-hoc fees/parallel accounts
- Donor dependence/withdrawal without off-budget/parallel handling or different rules

Require a finance trigger span (“off-budget”, “TSA/IFMIS”, “parallel account/system”, “pool/pooled fund”, “allocation/release/warrant/disbursement”, “block/discretionary”, “mid-year approval”, “ADP/MTEF”, “donor-funded vs government-funded (different rules)”).
If no such span exists, push for abstain/irrelevant.

Output JSON strictly as the given schema. Quote finance trigger spans from the INPUT TEXT.
"""

# Judge preamble: acceptance rules, ending on the "TASK INFO" rule line (no trailing
# newline, because the shared header that follows begins with one).
_JUDGE_RULES = """
ROLE = JUDGE
You will see the original task info, the INPUT TEXT, and JSON opinions from an Advocate and a Skeptic.
Reconcile and emit the final structured judgment using the BottleneckEvidence schema and the acceptance rules:

Acceptance rules:
1) is_fragmentation_evidence = true iff ((STRONG >= 1) OR (MODERATE >= 2)) AND trigger_spans (finance words) nonempty AND hard_negative = false.
2) is_fragmentation_plus_failure = true iff is_fragmentation_evidence AND failure_present = true.
3) If no explicit finance phrase can be quoted for trigger_spans, decision = "abstain".
4) Always fill trigger_spans and failure_spans with verbatim phrases from the INPUT TEXT that justify your labels.
5) Keep scope lock: fragmentation means funding/financial-management channels.

TASK INFO
---------"""

# Judge tail: slots for the two serialized panel opinions.
_JUDGE_OPINIONS = """
ADVOCATE JSON:
{advocate_json}

SKEPTIC JSON:
{skeptic_json}
"""

# Advocate and Skeptic read the task header first, then their role instructions.
PROMPT_ADVOCATE = PROMPT_SHARED_HEADER + _ADVOCATE_INSTRUCTIONS
PROMPT_SKEPTIC = PROMPT_SHARED_HEADER + _SKEPTIC_INSTRUCTIONS

# The Judge reads its rules first, then the same task header, then both opinions.
# Takes two extra placeholders: {advocate_json} and {skeptic_json}.
PROMPT_JUDGE = _JUDGE_RULES + PROMPT_SHARED_HEADER + _JUDGE_OPINIONS

# Closed vocabularies validated on PanelOpinion (the Advocate / Skeptic output).
# Assigning these names rebinds any earlier module-level StrongCue / ModerateCue /
# FailureType; a class created before this point keeps the alias it was built with.

# Cues that count as fragmentation evidence on their own.
StrongCue = Literal[
    "off_budget", "parallel_systems", "donor_vs_gov_different_rules",
    "political_ad_hoc_channel", "mid_year_outside_cycle",
    "multiple_financing_pools", "bypass_gov_systems_or_pooled_dissolved",
]

# Weaker cues (the Judge's acceptance rule asks for at least two of them).
ModerateCue = Literal[
    "earmarked_tied_with_separate_handling", "vertical_fragmentation_distinct_pools",
    "procedural_divergence_across_streams", "volatility_revealing_separate_handling",
]

# Kinds of delivery failure / inefficiency.
FailureType = Literal[
    "inefficiency_higher_admin_costs", "unpredictability_funding_volatility",
    "coordination_failure_duplication_underutilization_competition",
    "delays_start_stop", "arrears_or_cash_shortfalls", "corruption_or_leakage_risk",
]

# Structured opinion returned by the Advocate and the Skeptic calls in
# classify_with_panel; both opinions are serialized to JSON and shown to the Judge.
# Documented with `#` comments only: a class docstring could become part of the
# schema description sent to the model (verify against Service) and change behavior.
class PanelOpinion(BaseModel):
    # The panelist's own verdict; required. Same four values as the Judge's decision.
    proposed_decision: Literal["relevant_feature","relevant_feature_and_failure","irrelevant","abstain"] = Field(...)
    # Fragmentation cues found, by tier; both default to an empty list.
    strong_cues: List[StrongCue] = Field(default_factory=list)
    moderate_cues: List[ModerateCue] = Field(default_factory=list)
    trigger_spans: List[str] = Field(default_factory=list, description="Verbatim finance-related spans from INPUT TEXT.")
    # Exclusion rules the passage matched; the five flags parallel the five
    # "Hard negatives" bullets in PROMPT_SKEPTIC. Empty list when none apply.
    hard_negative_flags: List[Literal[
        "wagebill_personnel_on_budget_only",
        "program_admin_without_finance_split",
        "procurement_capacity_audit_only",
        "revenue_politics_without_alt_channel",
        "donor_volatility_without_parallelization"
    ]] = Field(default_factory=list)
    # Delivery failure / inefficiency evidence; defaults describe "none found".
    failure_present: bool = False
    failure_types: List[FailureType] = Field(default_factory=list)
    failure_spans: List[str] = Field(default_factory=list)
    # Free-text justification; required (no default).
    rationale: str = Field(..., description="2–4 sentences, reference quoted spans, no speculation.")

def classify_with_panel(
    service,
    challenge: str,
    bottleneck: str,
    extended_definition: str,
    input_text: str,
    extended_context: str = "",
    model: str = DEFAULT_LLM_MODEL,
    system_prompt_base: str = SYSTEM_PROMPT_BASE,
    temperature: float = 0.0,
):
    """Classify one passage with an Advocate -> Skeptic -> Judge panel.

    Three sequential calls are made through ``service.execute``: the Advocate and
    the Skeptic each return a ``PanelOpinion``; both opinions are serialized to JSON
    and embedded in the Judge prompt, and the Judge returns a ``BottleneckEvidence``.

    Args:
        service: Object exposing ``execute(prompt=, model=, response_model=,
            temperature=, system_message=)``.
        challenge: Text for the ``{challenge}`` placeholder.
        bottleneck: Text for the ``{bottleneck}`` placeholder.
        extended_definition: Text for the ``{extended_definition}`` placeholder.
        input_text: Passage to classify (``{input_text}``).
        extended_context: Surrounding source text (``{extended_context}``).
        model: Model identifier forwarded to every call.
        system_prompt_base: System message; ``"\\nROLE: <NAME>"`` is appended per call.
        temperature: Sampling temperature forwarded to every call.

    Returns:
        tuple: ``(final, advocate, skeptic)`` — the Judge's result followed by the
        two panel opinions, exactly as returned by ``service.execute``.

    Notes:
        Exceptions raised by ``service.execute`` are not caught here. The Judge's
        booleans are returned as produced; the acceptance rules stated in
        ``PROMPT_JUDGE`` are not re-checked in code.
    """
    import json  # function-local, so the function does not rely on notebook cell order

    # Placeholder values shared by all three templates.
    task_fields = {
        "challenge": challenge,
        "bottleneck": bottleneck,
        "extended_definition": extended_definition,
        "input_text": input_text,
        "extended_context": extended_context,
    }

    def _ask(role, template, response_model, **extra_fields):
        # One panel call: fill the template and tag the system message with the role.
        return service.execute(
            prompt=template.format(**task_fields, **extra_fields),
            model=model,
            response_model=response_model,
            temperature=temperature,
            system_message=system_prompt_base + "\nROLE: " + role,
        )

    def _to_json(opinion):
        # pydantic v2 deprecates .dict() in favour of .model_dump(); use the new
        # method when it exists and fall back to .dict() on pydantic v1.
        dump = getattr(opinion, "model_dump", None) or opinion.dict
        return json.dumps(dump(), ensure_ascii=False)

    advocate = _ask("ADVOCATE", PROMPT_ADVOCATE, PanelOpinion)
    skeptic = _ask("SKEPTIC", PROMPT_SKEPTIC, PanelOpinion)
    final = _ask(
        "JUDGE",
        PROMPT_JUDGE,
        BottleneckEvidence,
        advocate_json=_to_json(advocate),
        skeptic_json=_to_json(skeptic),
    )

    return final, advocate, skeptic


In [0]:
# Result accumulator for the evaluation loop: one 9-tuple per processed row
# (the six row fields followed by the judge result, advocate opinion and skeptic opinion).
# Re-running this cell rebinds `l` to a fresh empty list.
l = []

In [0]:
# Run the panel over rows [start, end) of the reviewed sheet and collect the outputs in `l`.
# NOTE(review): start = 100 skips the first 100 rows — presumably resuming an earlier
# partial run; confirm before computing metrics on `l`.
start = 100
end = df.shape[0]
rows = df.values.tolist()[start:end]
for item in tqdm(rows):
    # Column order is fixed by rel_cols in the loading cell.
    (node_id, chunk_id, extended_context, extracted_evidence,
     expert_classification, reason_for_expert_irrelevant) = item
    panel_outputs = classify_with_panel(
        service,
        challenge=challenge,
        bottleneck=bottleneck,
        extended_definition=extended_definition,
        input_text=extracted_evidence,
        extended_context=extended_context,
        model=DEFAULT_LLM_MODEL,
        system_prompt_base=SYSTEM_PROMPT_BASE,
        temperature=0.0,
    )
    final, advocate, skeptic = panel_outputs
    # Stored record: the six row fields, then judge result, advocate and skeptic opinions.
    l.append((*item, final, advocate, skeptic))

In [0]:
def relevance_boolean(x):
    """Return True when an expert review label means "Relevant".

    The labels are read from a spreadsheet review column, so the comparison
    ignores surrounding whitespace and letter case; an exact ``== 'Relevant'``
    would silently count ``' relevant '`` or ``'RELEVANT'`` as not relevant.

    Args:
        x: The raw cell value (normally a string).

    Returns:
        bool: True for a "Relevant" label; False for any other string and for
        non-string values such as None or NaN.
    """
    if not isinstance(x, str):
        return False
    return x.strip().casefold() == 'relevant'

In [0]:
# Compare the judge's two verdicts with the expert label and report precision for each.
# Positions are counted from the end of each stored record:
#   -5 -> expert_classification, -3 -> the judge's BottleneckEvidence result.
EXPERT_POS, FINAL_POS = -5, -3
m = [
    (
        relevance_boolean(record[EXPERT_POS]),
        record[FINAL_POS].is_fragmentation_evidence,
        record[FINAL_POS].is_fragmentation_plus_failure,
    )
    for record in l
]
df_results = pd.DataFrame(m, columns=['expert_label', 'model1_label', 'model2_label'])


def _precision(frame, label_col):
    # Share of rows flagged True in `label_col` that the expert also marked relevant;
    # 0 when nothing was flagged, so the division is never by zero.
    flagged = frame[frame[label_col] == True]  # noqa: E712 - explicit comparison kept on purpose
    n_flagged = flagged.shape[0]
    if n_flagged == 0:
        return 0
    confirmed = flagged[flagged['expert_label'] == True]  # noqa: E712
    return confirmed.shape[0] / n_flagged


# Model 1 = feature-based verdict; Model 2 = fragmentation-plus-failure verdict.
precision_m1 = _precision(df_results, 'model1_label')

precision_m2 = _precision(df_results, 'model2_label')

print(f"Precision (Model 1): {precision_m1:.2f}")
print(f"Precision (Model 2): {precision_m2:.2f}")

In [0]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """
    Compute the cosine similarity between two vectors.

    Inputs are converted to float arrays before any arithmetic: with integer
    inputs, np.dot would otherwise run in fixed-width integer arithmetic and
    can overflow silently, producing a wrong similarity.

    Args:
        vec1 (list or np.ndarray): First vector
        vec2 (list or np.ndarray): Second vector
    Returns:
        float: Cosine similarity as a Python float; 0.0 when either vector
        has zero norm.
    """
    v1 = np.asarray(vec1, dtype=float)
    v2 = np.asarray(vec2, dtype=float)
    norm_v1 = np.linalg.norm(v1)
    norm_v2 = np.linalg.norm(v2)
    # Cosine is undefined for a zero vector; report "no similarity" instead of dividing by 0.
    if norm_v1 == 0 or norm_v2 == 0:
        return 0.0
    # float(...) keeps the return type the same on both paths (Python float, not np.float64).
    return float(np.dot(v1, v2) / (norm_v1 * norm_v2))


In [0]:
import os
import numpy as np
from openai import OpenAI

# Module-level OpenAI client used by get_embedding; constructed with default arguments.
# NOTE(review): presumably picks up its credentials from the environment populated by
# load_dotenv() in the first cell — confirm.
client = OpenAI()

def get_embedding(text, model="text-embedding-ada-002"):
    """
    Request an embedding for `text` from the module-level OpenAI client and
    return the first result's vector as a NumPy array.
    """
    api_response = client.embeddings.create(model=model, input=text)
    # Only the first entry of the response is used.
    first_entry = api_response.data[0]
    return np.array(first_entry.embedding)

def cosine_similarity(vec1, vec2):
    """
    Compute cosine similarity between two vectors.

    Inputs are converted to float arrays first: with integer inputs np.dot
    would run in fixed-width integer arithmetic and can overflow silently.
    Returns a Python float, and 0.0 when either vector has zero norm.
    """
    a = np.asarray(vec1, dtype=float)
    b = np.asarray(vec2, dtype=float)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    # Cosine is undefined for a zero vector; report "no similarity" instead of dividing by 0.
    if norm_a == 0 or norm_b == 0:
        return 0.0
    # float(...) keeps the return type the same on both paths (Python float, not np.float64).
    return float(np.dot(a, b) / (norm_a * norm_b))

def similarity_between_texts(text1, text2):
    """
    Embed both texts (text1 first, then text2) with get_embedding and return
    the cosine similarity of the two embeddings.
    """
    embeddings = [get_embedding(text) for text in (text1, text2)]
    return cosine_similarity(embeddings[0], embeddings[1])


In [0]:

# Smoke check: embed two short sentences and print their cosine similarity.
# Each run calls get_embedding twice (once per text).
text_a = "Public financial management reform in Africa."
text_b = "Budget and expenditure systems improvement in African countries."

similarity = similarity_between_texts(text_a, text_b)
print(f"Cosine similarity: {similarity:.4f}")


In [0]:
# NOTE(review): v1 and v2 are not defined at module level in the visible cells (they only
# appear as locals inside cosine_similarity), so this scratch cell raises NameError unless
# they were created elsewhere — confirm or remove.
cosine_similarity(v1, v2)

In [0]:
# Scratch inputs for the similarity checks in the following cells:
# q is a one-word query, t1 and t2 are two free-text profile summaries.
q = 'poverty'
t1 = "Trang Nguyen is a professional with expertise in poverty reduction, shared prosperity, and sustainable development. Trang's work focuses on providing policy-oriented diagnostics, technical assistance, and improved communication on reform options. Trang has experience in various projects related to poverty assessment, fiscal policy, and economic growth. Trang's work involves building engagement with governments and stakeholders to promote growth and job creation. Trang has also contributed to the operationalization of the twin goals of poverty reduction and shared prosperity in the Western Balkans. Trang has supported the generation of new data and analytics on SOGI inclusion. Trang's location and additional details are not provided in the given data"
t2 = "FIRST_NAME is a transport specialist with a strong background in urban transport, railway systems, and sustainable transport solutions.  Expertise encompasses economic analysis, policy development, project management, and data analysis.  Has worked on various projects related to improving transport infrastructure and services, focusing on efficiency, safety, and sustainability. Experience includes working on BRT systems, integrated urban transport planning, railway connectivity, economic corridor development, multimodal logistics, and institutional reform in the transport sector.\n"


In [0]:
# Similarity between the one-word query and the first profile (value shown as cell output).
similarity_between_texts(q,t1)

In [0]:
# Similarity between the one-word query and the second profile (value shown as cell output).
similarity_between_texts(q,t2)

In [0]:
# Similarity between the two profiles themselves (value shown as cell output).
similarity_between_texts(t1,t2)

In [0]:
# NOTE(review): same call as an earlier scratch cell; v1 and v2 are not defined at module
# level in the visible cells, so this raises NameError unless they were created elsewhere.
cosine_similarity(v1, v2)

In [0]:
# Embed the first profile, then compare it with v1.
vec1 = get_embedding(t1)
# NOTE(review): v1 is not defined at module level in the visible cells — this line raises
# NameError unless v1 was created elsewhere; confirm the intended second operand.
cosine_similarity(vec1, v1)