# prompt_builder

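> Build structured LLM prompts that ask for typed answers (with confidence scores) to a list of questions about unstructured source text.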

In [None]:
#| default_exp prompt_builder

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from typing import List, Dict
from llm_data_extractor.models import Question, AnswerType
import sys, logging
import ast
import json

In [None]:
#| exporti

log = logging.getLogger(__name__)


def setup_logging(level=logging.INFO):
    """Reset the root logger so it carries exactly one stream handler.

    Every handler currently attached to the root logger is dropped, the root
    level is set to ``level``, and a single ``StreamHandler`` using a
    timestamp | level | thread | logger-name | message layout is attached.

    Args:
        level: Logging level applied to the root logger.
    """
    # Prefer the interpreter's original stdout (sys.__stdout__) so records
    # bypass the replacement stream IPython installs in notebooks; when the
    # attribute is absent, fall back to the current sys.stdout.
    target_stream = getattr(sys, "__stdout__", sys.stdout)

    handler = logging.StreamHandler(target_stream)
    handler.setFormatter(
        logging.Formatter(
            "%(asctime)s | %(levelname)s | %(threadName)s | %(name)s | %(message)s"
        )
    )

    root_logger = logging.getLogger()
    root_logger.handlers.clear()
    root_logger.setLevel(level)
    root_logger.addHandler(handler)

# setup_logging(level=logging.INFO)

In [None]:
#| export

def build_prompt(questions: List[Question], source_text: str) -> str:
    """
    Assemble the complete extraction prompt that is sent to the LLM.

    The prompt consists of three newline-joined sections: a preamble with the
    instructions and the (whitespace-stripped) source text, one block per
    question, and a closing description of the JSON envelope to respond with.

    Args:
        questions: Question objects to ask; each one contributes its ``id``,
            its ``text`` and a format hint from ``_get_format_instruction``.
        source_text: The unstructured text the answers must be drawn from.

    Returns:
        The prompt as a single string.
    """
    preamble = [
        "You are a data extraction assistant. Extract the requested information from the provided text.",
        "",
        "INSTRUCTIONS:",
        "- Answer each question precisely based only on the information in the text",
        "- If information is not available, respond with 'NULL'",
        "- Follow the exact format requirements for each question",
        "- Provide a confidence score (0.0-1.0) for each answer",
        "- Answer must follow format. No other notes or text can be included with the response json.",
        "",
        "TEXT TO ANALYZE:",
        "---",
        source_text.strip(),
        "---",
        "",
        "QUESTIONS TO ANSWER:",
    ]

    # Questions are numbered from 1; each block ends with an empty line so
    # consecutive questions are visually separated in the prompt.
    question_lines = [
        line
        for number, question in enumerate(questions, start=1)
        for line in (
            f"Question {number} (ID: {question.id}):",
            f"Q: {question.text}",
            f"Expected format: {_get_format_instruction(question)}",
            'Response format: {"answer": your_answer, "confidence": 0.0-1.0}',
            "",
        )
    ]

    closing = [
        "Respond with a JSON object containing all answers:",
        "{",
        '  "responses": [',
        '    {"question_id": "...", "answer": "...", "confidence": 0.0-1.0},',
        '    ...',
        "  ]",
        "}",
    ]

    return "\n".join(preamble + question_lines + closing)


def _get_format_instruction(question: Question) -> str:
    """Describe, in plain text, the answer format expected for ``question``.

    The description is chosen from ``question.answer_type`` and refined with
    the optional constraints in ``question.answer_config``:

    * BOOLEAN -> ``"true or false"``
    * ENUM    -> the allowed ``values``
    * INTEGER -> optional ``min`` / ``max`` bounds
    * FLOAT   -> ``"Decimal number"``
    * DATE    -> ``format`` (default ``YYYY-MM-DD``)
    * LIST    -> ``item_type`` (default ``string``)
    * any other type is treated as free text with an optional ``max_length``

    Args:
        question: The question to describe. ``answer_config`` may be a
            mapping, a JSON-encoded object string, or ``None``.

    Returns:
        A short human-readable format instruction.

    Raises:
        json.JSONDecodeError: If ``answer_config`` is a non-blank string that
            is not valid JSON.
    """
    # Log the actual instance (not the Question class) with lazy %-args.
    log.debug("Called _get_format_instruction() with %r", question)
    answer_type = question.answer_type
    answer_config = question.answer_config
    log.debug("Answer Config: %s (%s)", answer_config, type(answer_config))

    if isinstance(answer_config, str):
        if answer_config.strip():
            log.debug("Converting Answer Config to Dict")
            answer_config = json.loads(answer_config)
        else:
            # A blank string carries no constraints; json.loads would raise.
            answer_config = None

        if isinstance(answer_config, dict):
            log.debug("Converted")
        else:
            # Valid JSON that is not an object (e.g. "null", "[1]") has no
            # usable keys, so fall back to "no constraints".
            log.debug("Could not convert")
            answer_config = {}

    if answer_config is None:
        # Missing config means "no constraints" instead of an AttributeError
        # on the .get() calls below.
        answer_config = {}

    if answer_type == AnswerType.BOOLEAN:
        return "true or false"

    if answer_type == AnswerType.ENUM:
        # ``or []`` also covers an explicit ``"values": null``.
        valid_values = answer_config.get("values") or []
        return f"Select one from the list: [{', '.join(str(v) for v in valid_values)}]"

    if answer_type == AnswerType.INTEGER:
        min_val = answer_config.get("min")
        max_val = answer_config.get("max")
        # Compare against None explicitly so a bound of 0 is still reported.
        if min_val is not None and max_val is not None:
            return f"Integer between {min_val} and {max_val}"
        if min_val is not None:
            return f"Integer >= {min_val}"
        if max_val is not None:
            return f"Integer <= {max_val}"
        return "Integer"

    if answer_type == AnswerType.FLOAT:
        return "Decimal number"

    if answer_type == AnswerType.DATE:
        date_format = answer_config.get("format", "YYYY-MM-DD")
        return f"Date in format: {date_format}"

    if answer_type == AnswerType.LIST:
        item_type = answer_config.get("item_type", "string")
        return f"List of {item_type} values, comma-separated"

    # STRING (and any unrecognised type): free text, optionally length-capped.
    max_length = answer_config.get("max_length")
    if max_length:
        return f"Text (max {max_length} characters)"
    return "Text"

In [None]:
# Sample unstructured source text containing [EMAIL] / [PERSON] placeholder
# tokens. Presumably kept as a manual test input for build_prompt; it is not
# referenced anywhere else in this view and this cell has no export directive.
example_text = """
Hello
I’m a concerned husband trying to see if I’m able to talk my wife into treatment to come off of benzos, vyvanse , and other miscellaneous drugs that are frequently mixed daily with alcohol Condition , THC gummies and occasionally kratom. She is not herself and making very serious and impulsive decisions that are badly affecting our family. I need to remain anonymous and make sure all communication is done via [EMAIL] to avoid alerting [PERSON] in case we do an intervention. I have Aetna Insurance PPO and wondering what the cost of 30 days of treatment would be if she were to opt to receive help."""

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()