In [0]:
# Install the third-party packages this notebook needs via shell escapes.
# NOTE(review): versions are unpinned, so re-runs may pick up different releases.
# NOTE(review): Databricks documents `%pip` for notebook-scoped installs; confirm
# whether `!pip` is intentional here.
!pip install instructor
!pip install azure.identity openai
!pip install tqdm
# Restart the Python process after installing (presumably so the packages
# installed above are picked up by the imports in the following cells).
dbutils.library.restartPython()

In [0]:
from random import sample
from tqdm import tqdm
from time import sleep


from consts import pf_challenges, llm_model
from models.registry import MODEL_REGISTRY, VALIDATION_MODEL_REGISTRY
from prompts import make_bottleneck_prompt, make_validation_prompt
from azure_service import service



In [0]:
# Restrict the run to this subset of bottleneck IDs.
ACTIVE_BOTTLENECK_IDS = ["2.1", "3.1"]

# Flatten the selected bottlenecks into standalone records. Each record carries
# the parent challenge's context fields followed by the bottleneck's own fields;
# the bottleneck's fields are unpacked last, so they win on any key collision.
active_bottlenecks = [
    {
        "challenge_name": parent["challenge_name"],
        "challenge_id": parent["challenge_id"],
        "challenge_description": parent["challenge_description"],
        "role_of_public_finance": parent["role_of_public_finance"],
        "role_description": parent["role_description"],
        **bottleneck,
    }
    for parent in pf_challenges
    for bottleneck in parent["bottlenecks"]
    if bottleneck["bottleneck_id"] in ACTIVE_BOTTLENECK_IDS
]

In [0]:
# Pull the chunk table from Spark into a pandas DataFrame.
df_chunks = spark.read.table("prd_mega.sboost4.per_pfr_chunks").toPandas()

# Convert every row into a dict. Columns are read by position:
# 0 -> node_id, 1 -> chunk_id, 2 -> text.
# NOTE(review): this relies on the table's column order — confirm against the schema.
chunk_data = []
for row in df_chunks.values.tolist():
    chunk_data.append({'node_id': row[0], 'chunk_id': row[1], 'text': row[2]})

In [0]:
# Run evidence extraction for every chunk against each active bottleneck and,
# when a validation model is registered for that bottleneck, validate the
# extracted evidence with a second call to the service.
#
# Output: `results_2_1_3_1`, a list with one dict per (chunk, bottleneck) pair
# for which the extraction returned non-empty evidence. Pairs with no evidence,
# and pairs whose processing raised, produce no row.

# Throttling: pause THROTTLE_SECONDS once every THROTTLE_EVERY_N_CHUNKS chunks.
THROTTLE_EVERY_N_CHUNKS = 10
THROTTLE_SECONDS = 3

results_2_1_3_1 = []

for i, chunk in enumerate(tqdm(chunk_data)):
    # Fires on i == 0 as well, so the run starts with one pause.
    # Presumably this keeps the request rate under a service limit — TODO confirm.
    if i % THROTTLE_EVERY_N_CHUNKS == 0:
        sleep(THROTTLE_SECONDS)

    results = []
    for b in active_bottlenecks:
        # The extraction model is mandatory (a missing entry raises KeyError and
        # stops the run); the validation model is optional and may be None.
        model_cls = MODEL_REGISTRY[b["bottleneck_id"]]
        validation_cls = VALIDATION_MODEL_REGISTRY.get(b["bottleneck_id"])
        try:
            prompt = make_bottleneck_prompt(
                text=chunk["text"],
                role_of_public_finance=b["role_of_public_finance"],
                role_description=b["role_description"],
                challenge_name=b["challenge_name"],
                challenge_description=b["challenge_description"],
                bottleneck_name=b["bottleneck_name"],
                bottleneck_description=b["bottleneck_description"]
            )

            result = service.execute(
                prompt=prompt,
                model=llm_model,
                response_model=model_cls
            )

            # Nothing extracted for this pair: no row is recorded.
            if not result.extracted_evidence:
                continue

            # Only build the validation prompt and call the service when a
            # validation model exists. Previously the prompt was always built,
            # so a prompt builder that cannot handle `validation_model_cls=None`
            # would raise, and the broad `except` below would silently drop an
            # otherwise successful extraction.
            validation = None
            if validation_cls:
                validation_prompt = make_validation_prompt(
                    extracted_evidence=result.extracted_evidence,
                    reasoning=result.reasoning,
                    role_of_public_finance=b["role_of_public_finance"],
                    role_description=b["role_description"],
                    challenge_name=b["challenge_name"],
                    challenge_description=b["challenge_description"],
                    bottleneck_name=b["bottleneck_name"],
                    bottleneck_description=b["bottleneck_description"],
                    validation_model_cls=validation_cls
                )

                validation = service.execute(
                    prompt=validation_prompt,
                    model=llm_model,
                    response_model=validation_cls
                )

            # Any additional fields the validation model sets are merged into
            # the row after the explicitly mapped ones.
            # NOTE(review): the exclude set names "validation_confidence", but
            # the attribute read below is `validation.confidence`; if the model
            # field is called `confidence`, it is not excluded and will also
            # appear in the row under the key "confidence" — confirm intent.
            extra_validation_fields = (
                validation.model_dump(exclude_unset=True, exclude={
                    "validation_reasoning", "validation_confidence", "is_valid"
                }) if validation else {}
            )

            results.append({
                "node_id": chunk["node_id"],
                "chunk_id": chunk["chunk_id"],
                "chunk": chunk["text"],
                "challenge_id": b["challenge_id"],
                "bottleneck_id": b["bottleneck_id"],
                "was_processed": True,
                "extracted_evidence": result.extracted_evidence,
                "reasoning": result.reasoning,
                "extraction_confidence": result.confidence,
                "validation_reasoning": validation.validation_reasoning if validation else None,
                "validation_confidence": validation.confidence if validation else None,
                "is_valid": validation.is_valid if validation else None,
                **extra_validation_fields
            })

        except Exception as e:
            # Best-effort: report the failure and move on to the next bottleneck.
            print(f"Error for bottleneck {b['bottleneck_id']}: {e}")

    results_2_1_3_1.extend(results)
