In [0]:
from dotenv import load_dotenv
import pandas as pd
from service import Service
load_dotenv()
from typing import List, Literal, Optional
from pydantic import BaseModel, Field
import json
import math
from typing import Dict, Any, List
from tqdm import tqdm
import pandas as pd
from service import Service
from consts import DEFAULT_LLM_MODEL
# Project service wrapper; the cells below call service.execute(...) with a prompt,
# a model name and a pydantic response_model.
service = Service()
# NOTE(review): model_name is assigned here, but the visible cells pass DEFAULT_LLM_MODEL
# directly, so this alias is not read anywhere in the code shown.
model_name = DEFAULT_LLM_MODEL

In [0]:
# Columns kept for the review experiment. The classification loops unpack rows
# positionally, so the order of this list matters.
rel_cols = [
    'node_id',
    'chunk_id',
    'extracted_evidence',
    'chunk',
    'Review_status',
    'Reason',
]

# Load the review workbook and narrow it to the relevant columns in one step.
df = pd.read_excel('bottleneck_2_1_3_1_for_review.xlsx')[rel_cols]

In [0]:

# System message for the single-pass classifier (passed as system_message to service.execute).
# NOTE(review): this scope lock targets macro-fiscal *institutional* fragmentation and puts
# sectoral policy overlaps out of scope, while PROMPT_TEMPLATE codes sectoral *policy*
# fragmentation and lists macro-institutional fragmentation as a hard negative. The two
# prompts appear to describe different bottlenecks — confirm which one is intended.
system_prompt = """
You are a public financial management (PFM) diagnostic analyst.
Task: determine if a passage evidences **fragmented institutional responsibilities and weak coordination for macro-fiscal management**.
Scope lock: Treat fragmentation strictly as institutional or mandate fragmentation across macro-fiscal agencies
(e.g., Ministry of Finance, Treasury, Debt Management Office, Central Bank, donor-financing units).
Budget, debt, and monetary functions that operate in silos or without coordination qualify.
Sectoral policy overlaps, intergovernmental financing issues, fragmented data systems, or generic coordination problems are outside scope.
Always quote verbatim trigger spans from the INPUT TEXT. Be precise and conservative.
"""


# User-prompt template for the single-pass classifier.
#
# This is deliberately a plain `str.format` template and NOT an f-string: the five
# placeholders ({extended_definition}, {challenge}, {bottleneck}, {input_text},
# {extended_context}) are filled per row by `PROMPT_TEMPLATE.format(...)` in the
# classification loop. An f-string would try to read those names when this cell runs
# (NameError on a fresh kernel) or would freeze stale kernel values into the template.
#
# NOTE(review): the hard-negatives heading below says "6.1" while the response schema's
# docstring refers to "2.1" — confirm which bottleneck code this prompt belongs to.
PROMPT_TEMPLATE = """
You are a PFM diagnostic coder. Your task is to judge whether the INPUT TEXT
contains evidence for the Bottleneck: Policy Incoherence & Fragmentation:

Here is an extended definition with examples for context: 
{extended_definition}

Additionally use:
- Hierarchical Category: {challenge}
- Bottleneck Name: {bottleneck}

Output MUST follow the JSON schema you are given (no extra keys, no prose).

––––– CODING GOAL –––––
We use TWO complementary lenses:
A) Feature-based (policy design): Evidence that **multiple policies/strategies/programs/frameworks** are **conflicting, duplicative, or uncoordinated** (within a sector or across sectors).
B) Consequence-based: The above **plus** a stated consequence (duplication, inefficiency, stranded assets, distortions, etc.).

––––– DEFINITIONS (OPERABLE) –––––
• Policy-fragmentation cues (diagnostic features):
  STRONG (any ONE is sufficient for feature-based evidence):
    - conflicting_or_overlapping_policies              # two+ policies/strategies/plans at odds or duplicative
    - parallel_strategies_or_plans                     # parallel (siloed) frameworks addressing same goal
    - uncoordinated_programs_same_objective            # separate programs w/ same objective run unaligned
    - misaligned_targets_across_policies               # targets/eligibility/criteria clash across policies
    - policy_frameworks_ignore_interdependencies       # explicit statement that frameworks omit complementarities

  MODERATE (need TWO together if no STRONG):
    - explicit_policy_terms_present                    # words like policy/strategy/plan/framework/program named
    - cross_sector_interface_ignored                   # policy design misses cross-sector linkages (e.g., energy↔transmission; agri↔climate)
    - causal_link_stated                               # text states how incoherence → consequence
    - concrete_consequence_present                     # duplication, stranded assets, inefficiency, etc.
    - sufficient_specificity                           # named country/sector/institution/policy/program (not generic)

• Consequences (examples):
  - inefficiency/duplication/overlap
  - stranded assets / bottlenecks / market distortions
  - missed synergies / poor outcomes
  - higher-than-optimal costs / waste

––––– HARD NEGATIVES (EXCLUDE from 6.1) –––––
Mark hard_negative = true and decision = "irrelevant" if the text is ONLY about:
  A) Macro-institutional fragmentation (2.1): MoF/Treasury/DMO/Central Bank roles, donor–government financing integration, budget formulation vs macro planning misalignment.
  B) Intergovernmental financing/delivery (6.2/7.6): devolution/transfer incoherence; federal–provincial/local overlaps in spending responsibilities.
  C) Data/MIS fragmentation (9.2): fragmented data systems, non-interoperable reporting, reconciliation issues.
  D) Policy vacuum: absence of any framework (ad hoc because there is none), with no conflicting/parallel policies.
  E) Operational-only lack of coordination: execution/communication failures within a single existing framework.
  F) Generic/Vague: abstract statement that could apply anywhere; no concrete actors/policies/sectors or mechanisms.

––––– ACCEPTANCE RULES –––––
1) is_policy_fragmentation_evidence = true  iff
   (len(STRONG) >= 1  OR  len(MODERATE) >= 2)
   AND trigger_spans is nonempty AND hard_negative = false.
2) is_policy_fragmentation_plus_failure = true  iff
   is_policy_fragmentation_evidence = true AND failure_present = true.
3) If no explicit phrase can be quoted for trigger_spans, set decision to "abstain" (even if you suspect relevance).
4) Always fill trigger_spans and failure_spans with verbatim phrases from the INPUT TEXT that justify your labels.

––––– OUTPUT FORMAT –––––
Return ONLY JSON that conforms to the schema provided below. Do not include commentary.

––––– INPUT TEXT –––––
{input_text}

––––– EXTENDED CONTEXT –––––
{extended_context}
"""


# Hierarchical category label; filled into the prompt's {challenge} placeholder.
# NOTE(review): the panel loop later in this notebook passes these same three
# policy-fragmentation values to the funding-channel (6.1) prompts — confirm that is intended.
challenge = 'Incoherence and Fragmentation of Policy'

# Bottleneck name; filled into the prompt's {bottleneck} placeholder.
bottleneck = 'Fragmented, Inconsistent and Uncoordinated Policies Across or Within Sectors'

# Long-form definition with country examples; filled into {extended_definition}.
extended_definition = '''

Fragmentation manifests both within and across sectors when policy design does not consider interdependencies, complementarities, or broader strategic frameworks. In Malawi, despite strong consensus on a national Health Benefits Package, donor funding has remained locked into vertical, disease-specific programs rather than aligning with the unified strategy, creating duplication and missed synergies (Connolly et al., 2024). Similarly, in Kenya, overlapping and uncoordinated youth skills initiatives have created inefficiencies and blurred accountability for results. In Pakistan’s energy sector, renewable energy transition policies, such as the Alternative and Renewable Energy Policy (2019), were initially developed in isolation from the broader National Power Policy frameworks, leading to incoherent strategies. Only recently, through the 2021 National Electricity Policy and the 2023 Integrated Generation Capacity Expansion Plan, has Pakistan moved toward a more coordinated energy sector approach (World Bank, 2021).

Poor prioritization compounds these challenges, diverting limited resources away from high-impact investments. In Indonesia, infrastructure investments in education outpaced reforms in teaching quality, limiting improvements in learning outcomes (World Bank, 2018). Similarly, in Ghana, agricultural strategies have remained disconnected from the country’s climate goals, undermining climate resilience efforts. In Liberia, unbalanced funding across education levels persists, although universal basic education is publicly endorsed, political incentives in the Senate lead to disproportionate budget allocations favoring universities at the expense of primary education. In Uganda, classic studies such as Reinikka and Svensson (2001) found that as much as 87% of capitation grants intended for primary schools were diverted before reaching their intended beneficiaries—a problem echoed by similar findings in Tanzania using Public Expenditure Tracking Surveys (World Bank, 2006).

The consequences of fragmentation are profound and widespread. In Vietnam, while generous feed-in tariffs successfully catalyzed solar deployment, inadequate parallel investment in transmission infrastructure created bottlenecks that now limit the sector’s expansion (World Bank, 2019). In Kenya, although strong public procurement frameworks helped triple power generation capacity since 1990, poor alignment between supply expansion and demand projections led to stranded assets and market distortions.  

'''

In [0]:


# Closed vocabularies for the structured response of the policy-fragmentation classifier.

# STRONG cues: values mirror the STRONG cue names listed in PROMPT_TEMPLATE.
StrongPolicyCue = Literal[
    "conflicting_or_overlapping_policies",
    "parallel_strategies_or_plans",
    "uncoordinated_programs_same_objective",
    "misaligned_targets_across_policies",
    "policy_frameworks_ignore_interdependencies",
]

# MODERATE cues: values mirror the MODERATE cue names listed in PROMPT_TEMPLATE.
ModeratePolicyCue = Literal[
    "explicit_policy_terms_present",
    "cross_sector_interface_ignored",
    "causal_link_stated",
    "concrete_consequence_present",
    "sufficient_specificity",
]

# Categories of stated consequence.
# NOTE(review): FailureType, Subtype and Decision are re-bound by a later cell in this notebook.
FailureType = Literal[
    "inefficiency_or_duplication",
    "stranded_assets_or_bottlenecks",
    "market_distortions",
    "missed_synergies_or_poor_outcomes",
    "higher_costs_or_waste",
]

# Where the fragmentation sits.
Subtype = Literal[
    "within_sector",
    "cross_sector",
    "other",
]

# Overall verdict returned by the model.
Decision = Literal[
    "relevant_feature",                 # feature-based evidence satisfied
    "relevant_feature_and_failure",     # policy fragmentation + consequence satisfied
    "irrelevant",                       # hard negative or no cues
    "abstain",                          # insufficient quotable spans or uncertain
]

# Flat set of required boolean flags nested under BottleneckEvidence.signals.
# Every field is required (no defaults), so the model must answer each one.
class EvidenceSignals(BaseModel):
    multiple_policies_or_programs: bool
    cross_sector_or_within_sector_policy_conflict: bool
    donor_government_financing_issue: bool
    intergovernmental_overlap: bool
    data_systems_fragmentation: bool
    explicit_policy_terms_present: bool
    causal_link_present: bool
    concrete_consequence_present: bool
    specificity_ok: bool

# Allowed values for BottleneckEvidence.non_2_1_reason (why a passage was judged not relevant).
# NOTE(review): the first option describes sectoral policy fragmentation, which is what
# PROMPT_TEMPLATE treats as the *target* bottleneck — confirm this list matches the prompt in use.
AltReason = Literal[
    "Sectoral policy incoherence or fragmentation across/within sectors (multiple uncoordinated policies or strategies)",
    "Intergovernmental financing or delivery incoherence between levels of government (federal, provincial, local overlaps, incomplete devolution)",
    "Fragmented or non-interoperable data and information systems (multiple MIS, reporting or reconciliation gaps)",
    "Policy vacuum or absence of a guiding framework (no conflicting frameworks exist, ad-hoc decisions due to lack of policy)",
    "Operational-only coordination failure within one framework (implementation problem, not institutional fragmentation)",
    "Generic or vague statement without concrete macro-institutional context (applies universally, lacks specificity)",
    "Other or unclear reason for irrelevance"
]


# Response schema passed as response_model to service.execute in the single-pass loop.
# NOTE(review): a later cell defines another class with this same name; once that cell
# runs, the name BottleneckEvidence refers to the later (funding-channel) schema.
class BottleneckEvidence(BaseModel):
    """
    Structured judgment for PFM Bottleneck 2.1 (Policy incoherence/fragmentation).
    Mirrors your precision-first pattern: strong/moderate cues, hard negatives, span extraction.
    """
    # Primary decisions
    # The two booleans below are the labels the metrics cell scores against the expert label.
    is_policy_fragmentation_evidence: bool = Field(
        ..., description="Feature-based verdict. True iff STRONG>=1 or MODERATE>=2 and trigger_spans present and not hard_negative."
    )
    is_policy_fragmentation_plus_failure: bool = Field(
        ..., description="Consequence-based verdict. True iff is_policy_fragmentation_evidence and failure_present."
    )
    decision: Decision = Field(
        ..., description='Overall decision among {"relevant_feature","relevant_feature_and_failure","irrelevant","abstain"}.'
    )

    # Evidence for policy fragmentation
    strong_cues: List[StrongPolicyCue] = Field(default_factory=list, description="Which strong policy cues were found.")
    moderate_cues: List[ModeratePolicyCue] = Field(default_factory=list, description="Which moderate policy cues were found.")
    # NOTE(review): the description text below looks truncated (unbalanced ")" and a
    # double space) — confirm the intended list of exclusion rules.
    hard_negative: bool = Field(False, description="True if the passage matches any exclusion rule  vacuum / operational / generic / other).")
    trigger_spans: List[str] = Field(
        default_factory=list,
        description="Verbatim phrases from the INPUT TEXT that support policy fragmentation cues (must be substrings).",
    )
    subtype: Subtype = Field("other", description="Within-sector or cross-sector fragmentation, if determinable.")

    # Evidence for consequence/failure
    failure_present: bool = Field(False, description="True if explicit or clearly implied consequence is present.")
    failure_types: List[FailureType] = Field(default_factory=list, description="Categorization of the consequence mentioned.")
    failure_spans: List[str] = Field(
        default_factory=list,
        description="Verbatim phrases supporting consequence (must be substrings).",
    )

    # Optional reason for irrelevance; defaults to None when not supplied.
    non_2_1_reason: Optional[AltReason] = Field(
        default=None,
        description=(
            "If the passage is judged not relevant to Bottleneck 2.1, specify the reason — "
            "for example: sectoral policy incoherence (fragmented or uncoordinated policies across/within sectors); "
            "intergovernmental financing or delivery incoherence between government levels; "
            "fragmented or non-interoperable data systems; "
            "absence of macro-fiscal institutional content (policy vacuum); "
            "operational-only coordination failure; or overly generic evidence."
        ),
    )

    # Signals + auditability
    signals: EvidenceSignals = Field(
        ..., description="Binary features to support audits and calibrations."
    )

    # Justification (this schema defines no separate confidence field)
    rationale: str = Field(
        ...,
        description="2–4 sentence justification referencing trigger_spans/failure_spans; avoid speculation; no new facts.",
    )



In [0]:
# Accumulator for per-row results of the single-pass run; the loop cell appends one tuple per row.
l = []

In [0]:
# Single-pass run: classify every review row from position 100 to the end of df.
start, end = 100, df.shape[0]
rows = df.values.tolist()[start:end]
for node_id, chunk_id, extracted_evidence, extended_context, expert_label, expert_reason in tqdm(rows):
    # Fill the user prompt for this row.
    template_fields = dict(
        challenge=challenge,
        bottleneck=bottleneck,
        extended_definition=extended_definition,
        input_text=extracted_evidence,
        extended_context=extended_context,
    )
    prompt = PROMPT_TEMPLATE.format(**template_fields)

    # Ask the model for a schema-constrained verdict.
    output = service.execute(
        prompt=prompt,
        model=DEFAULT_LLM_MODEL,
        response_model=BottleneckEvidence,
        temperature=0.0,
        system_message=system_prompt,
    )

    # Keep inputs, expert annotation and model output together; the metrics cell indexes
    # this tuple from the end (-3 = expert label, -1 = model output).
    record = (node_id, chunk_id, extended_context, extracted_evidence, expert_label, expert_reason, output)
    l.append(record)
    

In [0]:
def relevance_boolean(x):
    """Return True when the expert label, stringified and lower-cased, equals 'yes'.

    NOTE(review): a later cell redefines this function to test for 'Relevant' instead;
    confirm which spelling the Review_status column actually uses.
    """
    normalized = str(x).lower()
    return normalized == 'yes'

In [0]:
def _precision(results, predicted_col):
    """Fraction of rows flagged True in `predicted_col` that the expert also marked True (0 if none flagged)."""
    flagged = results.query(f"{predicted_col} == True")
    if flagged.shape[0] > 0:
        confirmed = results.query(f"{predicted_col} == True and expert_label == True")
        return confirmed.shape[0] / flagged.shape[0]
    return 0


# One row per classified item: expert verdict plus the two model verdicts.
# Each tuple in `l` is indexed from the end: -3 is the expert label, -1 the model output.
m = [
    (
        relevance_boolean(record[-3]),
        record[-1].is_policy_fragmentation_evidence,
        record[-1].is_policy_fragmentation_plus_failure,
    )
    for record in l
]
df_results = pd.DataFrame(m, columns=['expert_label', 'model1_label', 'model2_label'])

# Model 1 = feature-based verdict, Model 2 = feature + failure verdict.
precision_m1 = _precision(df_results, "model1_label")
precision_m2 = _precision(df_results, "model2_label")

print(f"Precision (Model 1): {precision_m1:.2f}")
print(f"Precision (Model 2): {precision_m2:.2f}")

In [0]:


# Closed vocabularies for the funding-channel fragmentation schema defined just below.
# NOTE(review): FailureType, Subtype and Decision re-bind names already defined by the
# earlier policy-fragmentation schema cell; StrongCue, ModerateCue and FailureType are
# re-bound yet again, with different values, in a later cell.

# STRONG funding-channel cues.
StrongCue = Literal[
    "off_budget",
    "parallel_systems",
    "separate_administration",
    "different_rules_processes_across_streams",
    "political_ad_hoc_allocation",
    "mid_year_approval_outside_cycle",
    "block_allocation_outside_process",
    "multiple_financing_pools_different_agencies",
]

# MODERATE funding-channel cues.
ModerateCue = Literal[
    "earmarked_or_tied_grants",
    "vertical_fragmentation_intergovernmental",
    "procedural_divergence_reporting_procurement_ifmis",
    "pooled_fund_dissolved_or_bypass",
    "volatility_revealing_fragmentation",
]

# Categories of delivery failure / inefficiency.
FailureType = Literal[
    "inefficiency_higher_admin_costs",
    "unpredictability_funding_volatility",
    "coordination_failure_duplication_underutilization_competition",
    "delays_start_stop",
    "arrears_or_cash_shortfalls",
    "corruption_or_leakage_risk",
]

# Kind of funding-channel fragmentation.
Subtype = Literal[
    "donor_gov_parallel",
    "domestic_vs_donor_different_rules",
    "political_ad_hoc_intragovernment",
    "vertical_intergovernmental_fragmentation",
    "program_administration_fragmentation",
    "other",
]

# Overall verdict returned by the model.
Decision = Literal[
    "relevant_feature",                 # feature-based evidence satisfied
    "relevant_feature_and_failure",     # fragmentation + failure satisfied
    "irrelevant",                       # hard negative or no cues
    "abstain",                          # insufficient quotable spans or uncertain
]

# Response schema used as the judge's response_model in classify_with_panel.
# NOTE(review): this re-defines BottleneckEvidence, shadowing the policy-fragmentation
# model of the same name defined in an earlier cell; the field names differ
# (is_fragmentation_* here vs is_policy_fragmentation_* there).
class BottleneckEvidence(BaseModel):
    """
    Structured judgment for PFM Bottleneck 6.1.
    Follows precision-first rules: strong/moderate cues, hard negatives, and span extraction.
    """
    # Primary decisions
    # The two booleans below are the labels the panel metrics cell scores against the expert label.
    is_fragmentation_evidence: bool = Field(
        ..., description="Feature-based verdict (diagnostic). True iff STRONG>=1 or MODERATE>=2 and trigger_spans present and not hard_negative."
    )
    is_fragmentation_plus_failure: bool = Field(
        ..., description="Consequence-based verdict. True iff is_fragmentation_evidence and failure_present."
    )
    decision: Decision = Field(
        ..., description='Overall decision among {"relevant_feature","relevant_feature_and_failure","irrelevant","abstain"}.'
    )

    # Evidence for fragmentation
    strong_cues: List[StrongCue] = Field(default_factory=list, description="Which strong cues were found.")
    moderate_cues: List[ModerateCue] = Field(default_factory=list, description="Which moderate cues were found.")
    hard_negative: bool = Field(False, description="True if the passage matches any exclusion rule.")
    trigger_spans: List[str] = Field(
        default_factory=list,
        description="Verbatim phrases from the text that support fragmentation cues (must be substrings of the input).",
    )
    subtype: Subtype = Field("other", description="Subtype of fragmentation, if any.")

    # Evidence for failure/inefficiency
    failure_present: bool = Field(False, description="True if explicit or clearly implied delivery failure/inefficiency is present.")
    failure_types: List[FailureType] = Field(default_factory=list, description="Categorization of the failure/inefficiency mentioned.")
    failure_spans: List[str] = Field(
        default_factory=list,
        description="Verbatim phrases supporting failure/inefficiency (must be substrings of the input).",
    )

    # Free-text justification (required).
    rationale: str = Field(
        ...,
        description="2-3 sentence justification referencing the trigger_spans/failure_spans; avoid speculation; no new facts.",
    )


In [0]:
# Base system message shared by all three panel roles; classify_with_panel appends a
# "\nROLE: <NAME>" suffix to it for each call.
SYSTEM_PROMPT_BASE = """
You are a public financial management (PFM) diagnostic analyst.
Task: determine if a passage evidences Bottleneck 6.1: “Ad hoc, political, and fragmented funding channels contribute to ineffective and inefficient delivery.”
Scope lock: Treat fragmentation strictly as funding/financial-management channel fragmentation. Program/administrative separation is irrelevant unless it creates a distinct funding flow or parallel financial control stream.
Always quote verbatim trigger spans from the INPUT TEXT. Be precise and conservative.
"""

# Task header prepended to the advocate and skeptic prompts. It is a str.format template
# with five placeholders: challenge, bottleneck, extended_definition, input_text, extended_context.
PROMPT_SHARED_HEADER = """
Bottleneck family: {challenge}
Specific bottleneck: {bottleneck}

Extended definition:
{extended_definition}

INPUT TEXT:
{input_text}

Extended source context (for corroboration only; do not quote from it):
{extended_context}
"""

# Advocate role: shared header plus recall-oriented instructions. The appended text has
# no placeholders, so only the header's five fields are filled by .format().
PROMPT_ADVOCATE = PROMPT_SHARED_HEADER + """
ROLE = ADVOCATE
Goal: make the best good-faith case for relevance (maximize recall) while staying within the scope lock.

What to look for:
- Funding-channel fragmentation cues:
  STRONG: off-budget; parallel accounts/systems; donor vs government with different rules/processes; political/ad-hoc channels (block/mid-year) bypassing normal cycle; multiple financing pools managed by different agencies; bypass of government systems/pooled fund dissolved→parallel handling.
  MODERATE: earmarked/tied with separate handling; vertical IG fragmentation with distinct pools; procedural divergence across funding streams (release/reporting/procurement/IFMIS); volatility revealing separate handling (“moved off-budget”, “bypassed TSA”).
- Delivery failure/inefficiency: higher admin costs, unpredictability/start–stop, coordination failure/duplication/underutilization/competition, delays/arrears/cash shortfalls, corruption/leakage.

Output JSON strictly as the given schema. Quote finance trigger spans from the INPUT TEXT.
"""

# Skeptic role: shared header plus precision-oriented instructions and the hard-negative
# list. The appended text has no placeholders of its own.
PROMPT_SKEPTIC = PROMPT_SHARED_HEADER + """
ROLE = SKEPTIC
Goal: test and narrow (maximize precision). Accept only funding-channel fragmentation. Reject hard negatives.

Hard negatives (irrelevant unless a qualifying funding-channel cue exists):
- Wage-bill/personnel/payroll control (PSC, casual workers, “separate budget vote/line”) on-budget
- Program-budget/accountability structure without distinct funding channels/systems
- Capacity/procurement/audit issues without alternate channels
- Revenue politics (divisible pool shares, OSR) without off-budget/ad-hoc fees/parallel accounts
- Donor dependence/withdrawal without off-budget/parallel handling or different rules

Require a finance trigger span (“off-budget”, “TSA/IFMIS”, “parallel account/system”, “pool/pooled fund”, “allocation/release/warrant/disbursement”, “block/discretionary”, “mid-year approval”, “ADP/MTEF”, “donor-funded vs government-funded (different rules)”).
If no such span exists, push for abstain/irrelevant.

Output JSON strictly as the given schema. Quote finance trigger spans from the INPUT TEXT.
"""

# Judge role: a standalone str.format template (it does not reuse PROMPT_SHARED_HEADER).
# On top of the five task placeholders it takes {advocate_json} and {skeptic_json},
# the serialized opinions of the two earlier roles.
PROMPT_JUDGE = """
ROLE = JUDGE
You will see the original task info, the INPUT TEXT, and JSON opinions from an Advocate and a Skeptic.
Reconcile and emit the final structured judgment using the BottleneckEvidence schema and the acceptance rules:

Acceptance rules:
1) is_fragmentation_evidence = true iff ((STRONG >= 1) OR (MODERATE >= 2)) AND trigger_spans (finance words) nonempty AND hard_negative = false.
2) is_fragmentation_plus_failure = true iff is_fragmentation_evidence AND failure_present = true.
3) If no explicit finance phrase can be quoted for trigger_spans, decision = "abstain".
4) Always fill trigger_spans and failure_spans with verbatim phrases from the INPUT TEXT that justify your labels.
5) Keep scope lock: fragmentation means funding/financial-management channels.

TASK INFO
---------
Bottleneck family: {challenge}
Specific bottleneck: {bottleneck}

Extended definition:
{extended_definition}

INPUT TEXT:
{input_text}

Extended source context (for corroboration only; do not quote from it):
{extended_context}

ADVOCATE JSON:
{advocate_json}

SKEPTIC JSON:
{skeptic_json}
"""

# Cue vocabularies used by PanelOpinion (the advocate/skeptic response schema).
# NOTE(review): these re-bind StrongCue / ModerateCue / FailureType with a different value
# set than the earlier cell that the funding-channel BottleneckEvidence class was
# annotated with. The panelists and the judge therefore appear to use different cue
# vocabularies (e.g. "donor_vs_gov_different_rules" exists only here,
# "separate_administration" only in the earlier set) — confirm this is intended.
StrongCue = Literal[
    "off_budget",
    "parallel_systems",
    "donor_vs_gov_different_rules",
    "political_ad_hoc_channel",
    "mid_year_outside_cycle",
    "multiple_financing_pools",
    "bypass_gov_systems_or_pooled_dissolved"
]

ModerateCue = Literal[
    "earmarked_tied_with_separate_handling",
    "vertical_fragmentation_distinct_pools",
    "procedural_divergence_across_streams",
    "volatility_revealing_separate_handling"
]

# Same values as the FailureType defined for the funding-channel schema earlier.
FailureType = Literal[
    "inefficiency_higher_admin_costs",
    "unpredictability_funding_volatility",
    "coordination_failure_duplication_underutilization_competition",
    "delays_start_stop",
    "arrears_or_cash_shortfalls",
    "corruption_or_leakage_risk",
]

# Response schema for the advocate and skeptic calls in classify_with_panel. Only
# proposed_decision and rationale are required; every other field has a default.
class PanelOpinion(BaseModel):
    proposed_decision: Literal["relevant_feature","relevant_feature_and_failure","irrelevant","abstain"] = Field(...)
    strong_cues: List[StrongCue] = Field(default_factory=list)
    moderate_cues: List[ModerateCue] = Field(default_factory=list)
    trigger_spans: List[str] = Field(default_factory=list, description="Verbatim finance-related spans from INPUT TEXT.")
    # Flag names correspond to the hard-negative bullets listed in PROMPT_SKEPTIC.
    hard_negative_flags: List[Literal[
        "wagebill_personnel_on_budget_only",
        "program_admin_without_finance_split",
        "procurement_capacity_audit_only",
        "revenue_politics_without_alt_channel",
        "donor_volatility_without_parallelization"
    ]] = Field(default_factory=list)
    failure_present: bool = False
    failure_types: List[FailureType] = Field(default_factory=list)
    failure_spans: List[str] = Field(default_factory=list)
    rationale: str = Field(..., description="2–4 sentences, reference quoted spans, no speculation.")

def classify_with_panel(
    service,
    challenge: str,
    bottleneck: str,
    extended_definition: str,
    input_text: str,
    extended_context: str = "",
    model: str = DEFAULT_LLM_MODEL,
    system_prompt_base: str = SYSTEM_PROMPT_BASE,
    temperature: float = 0.0,
):
    """
    Classify one passage with a three-call panel: advocate, skeptic, then judge.

    Args:
        service: object exposing execute(prompt, model, response_model, temperature, system_message).
        challenge: text for the {challenge} placeholder of the prompts.
        bottleneck: text for the {bottleneck} placeholder.
        extended_definition: text for the {extended_definition} placeholder.
        input_text: passage to classify ({input_text}).
        extended_context: surrounding source text ({extended_context}); defaults to "".
        model: model identifier forwarded to every call.
        system_prompt_base: system message; a role suffix is appended per call.
        temperature: sampling temperature forwarded to every call.

    Returns:
        Tuple (final, advocate, skeptic): the judge's BottleneckEvidence result followed
        by the two PanelOpinion results, as returned by service.execute.
    """
    import json

    # Placeholders common to all three prompt templates.
    task_fields = {
        "challenge": challenge,
        "bottleneck": bottleneck,
        "extended_definition": extended_definition,
        "input_text": input_text,
        "extended_context": extended_context,
    }

    def _ask(template, role_suffix, response_model, **extra_fields):
        # One LLM call: fill the template, tag the system message with the role.
        return service.execute(
            prompt=template.format(**task_fields, **extra_fields),
            model=model,
            response_model=response_model,
            temperature=temperature,
            system_message=system_prompt_base + role_suffix,
        )

    # The two panelists see only the task; they run in this fixed order.
    advocate = _ask(PROMPT_ADVOCATE, "\nROLE: ADVOCATE", PanelOpinion)
    skeptic = _ask(PROMPT_SKEPTIC, "\nROLE: SKEPTIC", PanelOpinion)

    # The judge additionally receives both opinions serialized as JSON.
    final = _ask(
        PROMPT_JUDGE,
        "\nROLE: JUDGE",
        BottleneckEvidence,
        advocate_json=json.dumps(advocate.dict(), ensure_ascii=False),
        skeptic_json=json.dumps(skeptic.dict(), ensure_ascii=False),
    )

    return final, advocate, skeptic


In [0]:
# Reset the accumulator for the panel run; this discards any results still held from the single-pass run.
l = []

In [0]:
# Panel run: classify every review row from position 100 to the end of df with the
# advocate/skeptic/judge panel.
# NOTE(review): challenge / bottleneck / extended_definition are the policy-fragmentation
# values assigned earlier in the notebook, while the panel prompts target Bottleneck 6.1 —
# confirm that reuse is intended.
start, end = 100, df.shape[0]
for item in tqdm(df.values.tolist()[start:end]):
    # Unpack positionally in rel_cols order: 'extracted_evidence' (3rd column) comes
    # before 'chunk' (4th column, used as the extended context). Reversing these two
    # would send the whole chunk as INPUT TEXT and the evidence as context.
    node_id, chunk_id, extracted_evidence, extended_context, expert_classification, reason_for_expert_irrelevant = item
    final, advocate, skeptic = classify_with_panel(
        service,
        challenge = challenge,
        bottleneck = bottleneck,
        extended_definition = extended_definition,
        input_text = extracted_evidence,
        extended_context = extended_context,
        model = DEFAULT_LLM_MODEL,
        system_prompt_base = SYSTEM_PROMPT_BASE,
        temperature = 0.0,
    )
    # Tuple layout is relied on by the metrics cell (-5 = expert label, -3 = judge output).
    l.append((node_id, chunk_id, extended_context, extracted_evidence, expert_classification, reason_for_expert_irrelevant,
            final, advocate, skeptic))

In [0]:
def relevance_boolean(x):
    """Return the result of comparing the expert label to the exact string 'Relevant'.

    NOTE(review): this shadows an earlier definition that tests for 'yes'
    (case-insensitive); confirm which spelling the Review_status column actually uses.
    """
    is_relevant = (x == 'Relevant')
    return is_relevant

In [0]:
def _precision(results, predicted_col):
    """Fraction of rows flagged True in `predicted_col` that the expert also marked True (0 if none flagged)."""
    flagged = results.query(f"{predicted_col} == True")
    if flagged.shape[0] > 0:
        confirmed = results.query(f"{predicted_col} == True and expert_label == True")
        return confirmed.shape[0] / flagged.shape[0]
    return 0


# One row per classified item: expert verdict plus the judge's two verdicts.
# Each tuple in `l` is indexed from the end: -5 is the expert label, -3 the judge output.
m = [
    (
        relevance_boolean(record[-5]),
        record[-3].is_fragmentation_evidence,
        record[-3].is_fragmentation_plus_failure,
    )
    for record in l
]
df_results = pd.DataFrame(m, columns=['expert_label', 'model1_label', 'model2_label'])

# Model 1 = feature-based verdict, Model 2 = feature + failure verdict.
precision_m1 = _precision(df_results, "model1_label")
precision_m2 = _precision(df_results, "model2_label")

print(f"Precision (Model 1): {precision_m1:.2f}")
print(f"Precision (Model 2): {precision_m2:.2f}")

In [0]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """
    Cosine of the angle between two vectors.

    Both inputs are first converted with np.array, so plain lists are accepted.

    Args:
        vec1 (list or np.ndarray): First vector
        vec2 (list or np.ndarray): Second vector
    Returns:
        Cosine similarity, or 0.0 when either vector has zero norm.
    """
    a, b = np.array(vec1), np.array(vec2)
    len_a = np.linalg.norm(a)
    len_b = np.linalg.norm(b)
    # Only divide when both lengths are non-zero; otherwise similarity is defined as 0.0.
    if len_a != 0 and len_b != 0:
        return np.dot(a, b) / (len_a * len_b)
    return 0.0


In [0]:
import os
import numpy as np
from openai import OpenAI

# Module-level OpenAI client used by get_embedding below. No credentials are passed here;
# presumably they come from the environment (load_dotenv() runs in the first cell) — TODO confirm.
client = OpenAI()

def get_embedding(text, model="text-embedding-ada-002"):
    """
    Embed `text` through the module-level OpenAI client.

    Returns the first embedding in the response as a NumPy array.
    """
    api_result = client.embeddings.create(model=model, input=text)
    first_item = api_result.data[0]
    return np.array(first_item.embedding)

def cosine_similarity(vec1, vec2):
    """
    Cosine of the angle between two vectors; 0.0 when either vector has zero norm.

    NOTE(review): this re-defines the cosine_similarity from the previous cell, without
    the explicit np.array conversion.
    """
    len_1 = np.linalg.norm(vec1)
    len_2 = np.linalg.norm(vec2)
    # Only divide when both lengths are non-zero.
    if len_1 != 0 and len_2 != 0:
        return np.dot(vec1, vec2) / (len_1 * len_2)
    return 0.0

def similarity_between_texts(text1, text2):
    """
    Embed both texts (text1 first, then text2) and return their cosine similarity.
    """
    embeddings = [get_embedding(text) for text in (text1, text2)]
    return cosine_similarity(embeddings[0], embeddings[1])


In [0]:

# Smoke test of the embedding pipeline on two related sentences.
text_a, text_b = (
    "Public financial management reform in Africa.",
    "Budget and expenditure systems improvement in African countries.",
)

similarity = similarity_between_texts(text_a, text_b)
print(f"Cosine similarity: {similarity:.4f}")


In [0]:
# NOTE(review): v1 and v2 are not assigned in any visible cell of this notebook, so this
# raises NameError on a fresh kernel — confirm where they came from, or remove the cell.
cosine_similarity(v1, v2)

In [0]:
# Ad-hoc retrieval check: a one-word query and two profile summaries to compare against it.
# NOTE(review): t1 contains what looks like a real person's name — confirm it is safe to
# keep in a shared notebook.
q = 'poverty'
t1 = "Trang Nguyen is a professional with expertise in poverty reduction, shared prosperity, and sustainable development. Trang's work focuses on providing policy-oriented diagnostics, technical assistance, and improved communication on reform options. Trang has experience in various projects related to poverty assessment, fiscal policy, and economic growth. Trang's work involves building engagement with governments and stakeholders to promote growth and job creation. Trang has also contributed to the operationalization of the twin goals of poverty reduction and shared prosperity in the Western Balkans. Trang has supported the generation of new data and analytics on SOGI inclusion. Trang's location and additional details are not provided in the given data"
t2 = "FIRST_NAME is a transport specialist with a strong background in urban transport, railway systems, and sustainable transport solutions.  Expertise encompasses economic analysis, policy development, project management, and data analysis.  Has worked on various projects related to improving transport infrastructure and services, focusing on efficiency, safety, and sustainability. Experience includes working on BRT systems, integrated urban transport planning, railway connectivity, economic corridor development, multimodal logistics, and institutional reform in the transport sector.\n"


In [0]:
# Similarity between the one-word query and the first profile text.
similarity_between_texts(q,t1)

In [0]:
# Similarity between the one-word query and the second profile text.
similarity_between_texts(q,t2)

In [0]:
# Similarity between the two profile texts themselves.
similarity_between_texts(t1,t2)

In [0]:
# NOTE(review): v1 and v2 are not assigned in any visible cell of this notebook, so this
# raises NameError on a fresh kernel — confirm where they came from, or remove the cell.
cosine_similarity(v1, v2)

In [0]:
# Embed the first profile text and compare it with v1.
vec1 = get_embedding(t1)
# NOTE(review): v1 is not assigned in any visible cell of this notebook, so this raises
# NameError on a fresh kernel — confirm what v1 was meant to hold.
cosine_similarity(vec1, v1)