# LLM to summarize each document for the given question 

In [1]:
import prompts

### Calling the LLM

In [2]:
import os
from dotenv import load_dotenv
import openai
# Read a local .env file (if any) into the process environment first,
# so the key lookup below can see values defined there.
load_dotenv()

# The OpenRouter API key is required; stop right away when it is missing or empty.
OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY')
if not OPENROUTER_API_KEY:
    raise ValueError("Please set OPENROUTER_API_KEY in your .env file or environment variables")

In [3]:
# Point the OpenAI client at the OpenRouter endpoint, authenticating with the
# OpenRouter key read from the environment.
client = openai.OpenAI(base_url="https://openrouter.ai/api/v1", api_key=OPENROUTER_API_KEY)

## Putting this into a pipeline

Now that we have seen this work in isolation, let's operationalize it so that summaries are saved for each of the policies of interest.

In [4]:
def perform_summarization(doc_prompts, model="gpt-4o-mini"):
    """
    Send each prompt to the chat-completions endpoint and collect the replies.

    Every prompt is sent as its own request through the module-level
    ``client``, paired with ``prompts.SYSTEM_PROMPT`` as the system message.
    A progress line is printed after each request completes.

    Args:
        doc_prompts: Iterable of user-prompt strings, one per question.
        model: Model identifier passed to the API. Defaults to
            "gpt-4o-mini", the value that used to be hard-coded here.

    Returns:
        list: The message content of the first choice of every response,
        in the same order as ``doc_prompts``.
    """
    summaries = []
    for idx, prompt in enumerate(doc_prompts):
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": prompts.SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
        )
        # Only the first choice is read; the request does not ask for more.
        summaries.append(response.choices[0].message.content)
        # idx is the zero-based position in doc_prompts, not a question number.
        print(f'Assessed question: question_{idx}_prompt')
    return summaries

In [5]:
all_docs_path = "../data/scraped_documents/"
# Keep only the entries of the documents folder whose name ends in .txt.
all_docs = [name for name in os.listdir(all_docs_path) if name.endswith('.txt')]

In [6]:
def get_all_prompts(doc_text):
    """
    Build the five summarization prompts (questions 2 through 6) for one document.

    Each prompt comes from ``prompts.summarizing_prompt`` called with the
    document text, a topic label and that question's JSON schema; questions
    3 and 6 additionally pass their note.

    Returns:
        list: The prompts in question order (2, 3, 4, 5, 6).
    """
    build = prompts.summarizing_prompt
    return [
        build(doc_text, "sectoral focus", prompts.question_2_json_schema),
        build(doc_text, "subject of intervention", prompts.question_3_json_schema, prompts.question_3_note),
        build(doc_text, "market failure", prompts.question_4_json_schema),
        build(doc_text, "type of instrument", prompts.question_5_json_schema),
        build(doc_text, "metadata and logistical details", prompts.question_6_json_schema, prompts.question_6_note),
    ]


## Perform summarization

**TODO:** Fix this so it works off unique IDs instead of names.

In [15]:
import tiktoken

# Root folder under which one sub-folder of summaries is written per document.
summaries_base_path = "../data/summaries/"

# Token budget used when sizing a prompt.
MAX_TOKENS = 128_000
# Tokens held back from the budget for prompt/system messages, etc.
SAFETY_MARGIN = 1_024

def prompt_token_count(prompt, encoding="gpt-4"):
    """
    Return how many tokens ``prompt`` encodes to.

    Despite its name, ``encoding`` is handed to
    ``tiktoken.encoding_for_model``, so it is treated as a model name. When
    that lookup raises ``KeyError`` the "cl100k_base" encoding is used instead.
    """
    try:
        tokenizer = tiktoken.encoding_for_model(encoding)
    except KeyError:
        tokenizer = tiktoken.get_encoding("cl100k_base")
    token_ids = tokenizer.encode(prompt)
    return len(token_ids)

def available_tokens_for_doc(doc_prompts, max_tokens=MAX_TOKENS, safety_margin=SAFETY_MARGIN, encoding="gpt-4"):
    """
    Compute how many tokens are left once the largest prompt is accounted for.

    The result is ``max_tokens`` minus the token count of the longest entry
    in ``doc_prompts`` minus ``safety_margin``. An empty or falsy
    ``doc_prompts`` contributes zero overhead. Nothing clamps the result, so
    it is negative whenever the largest prompt plus the margin exceeds
    ``max_tokens``.
    """
    if not doc_prompts:
        largest_prompt = 0
    else:
        # Whatever doc_prompts contains is counted in full, including any
        # document text already embedded in those prompts.
        largest_prompt = max(prompt_token_count(p, encoding) for p in doc_prompts)
    return max_tokens - largest_prompt - safety_margin

def truncate_text_to_token_limit(text, max_tokens, encoding="gpt-4"):
    """
    Truncate text so its tokenized length is <= max_tokens.

    Args:
        text: The string to truncate.
        max_tokens: Maximum number of tokens to keep. Values below zero are
            treated as zero.
        encoding: Passed to ``tiktoken.encoding_for_model`` (so effectively a
            model name); falls back to "cl100k_base" if that raises KeyError.

    Returns:
        str: ``text`` unchanged when it already fits, otherwise the decoded
        form of its first ``max_tokens`` tokens.
    """
    try:
        enc = tiktoken.encoding_for_model(encoding)
    except KeyError:
        enc = tiktoken.get_encoding("cl100k_base")
    # A negative limit would turn the slice below into "drop the last N
    # tokens", which leaves the text far larger than any budget. Clamp it so
    # an exhausted budget yields an empty document instead.
    limit = max(0, max_tokens)
    tokens = enc.encode(text)
    if len(tokens) > limit:
        print(f"Truncating base document from {len(tokens)} tokens to {limit} tokens.")
        tokens = tokens[:limit]
        text = enc.decode(tokens)
    return text

# Token cost of the prompt templates on their own, measured by building the
# prompts around an empty document. Measuring it on prompts that already embed
# the full document would count the document twice: documents that fit would
# be truncated needlessly and very long ones would get a negative budget.
# NOTE(review): assumes prompts.summarizing_prompt accepts an empty document
# string and only builds text from it; confirm against the prompts module.
template_prompts = get_all_prompts("")
# The budget does not depend on the document, so compute it once. Clamp at
# zero so a negative value can never reach the truncation slice.
max_allowed_tokens = max(
    0,
    available_tokens_for_doc(template_prompts, max_tokens=MAX_TOKENS, safety_margin=SAFETY_MARGIN),
)

for doc in all_docs:
    doc_name_no_ext = os.path.splitext(doc)[0]
    doc_summary_folder = os.path.join(summaries_base_path, doc_name_no_ext)
    # The folder is only created after summarization has returned, so its
    # presence is used as the "already processed" marker.
    if os.path.exists(doc_summary_folder):
        print(f"Skipping {doc}: summary already exists for {doc_name_no_ext}.")
        continue

    print("working on: ", doc)
    # Use a context manager so the file handle is closed right after reading.
    with open(os.path.join(all_docs_path, doc), "r") as doc_file:
        doc_text = doc_file.read()

    # Step 1: Truncate the document to the budget left after the templates
    doc_text = truncate_text_to_token_limit(doc_text, max_allowed_tokens)

    # Step 2: Build the prompts from the (possibly truncated) text and run them
    doc_prompts = get_all_prompts(doc_text)
    doc_summaries = perform_summarization(doc_prompts)

    # Create subfolder for document summary
    os.makedirs(doc_summary_folder, exist_ok=True)

    # Save each summary as a separate file inside the subfolder.
    # Numbering starts at 2 because the prompts cover questions 2 through 6.
    for idx, summary in enumerate(doc_summaries, start=2):
        summary_filename = f"question_{idx}_summary.txt"
        summary_path = os.path.join(doc_summary_folder, summary_filename)
        with open(summary_path, "w") as f:
            f.write(summary)


Skipping OYQNb4RUEiT0b8Hk.txt: summary already exists for OYQNb4RUEiT0b8Hk.
Skipping ldyDWFana7Mtg9DA.txt: summary already exists for ldyDWFana7Mtg9DA.
Skipping FfzBsvXzD9fkY2WQ.txt: summary already exists for FfzBsvXzD9fkY2WQ.
Skipping Y4A9q1tIiEMnRZWG.txt: summary already exists for Y4A9q1tIiEMnRZWG.
Skipping hUDZml18YmFdtiCU.txt: summary already exists for hUDZml18YmFdtiCU.
Skipping hYFpxqkLQQDZxafo.txt: summary already exists for hYFpxqkLQQDZxafo.
Skipping OP35ITBZ63LXSMj9.txt: summary already exists for OP35ITBZ63LXSMj9.
Skipping fbjFEgjb9FLcJ2e6.txt: summary already exists for fbjFEgjb9FLcJ2e6.
Skipping K3RMAeovpdI1sbWI.txt: summary already exists for K3RMAeovpdI1sbWI.
Skipping aeoKi1dhWO8Tmoyi.txt: summary already exists for aeoKi1dhWO8Tmoyi.
working on:  vZaY5DRdtKwXlKQZ.txt
Assessed question: question_0_prompt
Assessed question: question_1_prompt
Assessed question: question_2_prompt
Assessed question: question_3_prompt
Assessed question: question_4_prompt
Skipping dD9SoORj2oSp