# Structured Chunking

from https://blog.langchain.dev/a-chunk-by-any-other-name/

In [119]:
# Required for csv to markdown conversion
!pip install tabulate

[autoreload of langchain failed: Traceback (most recent call last):
  File "/Users/dvdblk/miniconda/envs/hack4good/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/Users/dvdblk/miniconda/envs/hack4good/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 455, in superreload
    if not append_obj(module, old_objects, name, obj):
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/dvdblk/miniconda/envs/hack4good/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 423, in append_obj
    in_module = hasattr(obj, "__module__") and obj.__module__ == module.__name__
                ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/dvdblk/miniconda/envs/hack4good/lib/python3.11/site-packages/langchain/agents/__init__.py", line 93, in __getattr__
    raise AttributeError(f"{name} does not exist")
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ImportError: __module__

In [1]:
from dotenv import load_dotenv

### Workaround for local package imports

To import from app.preprocessing package in the root of the repo we need to CD into the root.

Note: this code is only necessary in a Jupyter notebook. In a regular python script, the imports work fine.

In [2]:
import os
import sys

# This assumes your working directory is in the experiments/<name>/ dir
current_dir = os.getcwd()
repo_root = os.path.abspath(os.path.join(current_dir, "..", ".."))
app_path = os.path.join(current_dir, "app")

# Check if app/ dir exists in current working directory along with .env (to make this cell re-runnable)
if os.path.exists(app_path) and os.path.exists(os.path.join(current_dir, ".env")):
    # Re-running the cell must not keep appending the same entry to sys.path
    if app_path not in sys.path:
        sys.path.append(app_path)
    print("app/ dir and .env found in current working directory, keeping CWD as is.")
else:
    os.chdir(repo_root)
    print("app/ dir and/or .env not found in current working directory, changing CWD to assumed repo root.")

# Load .env
load_dotenv()

app/ dir and/or .env not found in current working directory, changing CWD to assumed repo root.


True

## Load structured document 

Steps to get `Document` object from a pdf path:
1. create an `AdobeExtractAPIManager` instance
2. call its `get_document` method with the path to the PDF: `document = adobe_manager.get_document(path)`
3. call `DocumentSplitter().document_to_chunks(document)` to split the document into chunks

In [22]:
from app.preprocessing.adobe.manager import AdobeExtractAPIManager

# Adobe credentials are read from the environment; extraction results are kept under extract_dir_path
adobe_manager = AdobeExtractAPIManager(
    os.getenv("ADOBE_CLIENT_ID"),
    os.getenv("ADOBE_CLIENT_SECRET"),
    extract_dir_path="data/interim/000-adobe-extract/",
)

####
# Edit the path below to point to a PDF file on your local machine
####
pdf_path = "/Users/dvdblk/Downloads/pdf_files_complete/UK_10.pdf"
document = adobe_manager.get_document(pdf_path)

2023-11-20 23:31:11.134 app.preprocessing.adobe.manager INFO     Initialized AdobeExtractAPIManager (with extract_dir_path=data/interim/000-adobe-extract/)


#### Visualize the sections of the document

In [23]:
def print_hierarchy(section, level=0):
    """Print `section` and every nested subsection as a tab-indented outline.

    Each line shows the section type, title, starting page and id, indented
    with one tab per nesting level (starting at `level`). Sections are printed
    parent first, then children in their stored order.
    """
    pending = [(section, level)]
    while pending:
        node, depth = pending.pop()
        indent = "\t" * depth
        print(indent, node.section_type, node.title, node.starting_page, node.id)
        # Push children reversed so they are popped (and printed) in their original order
        children = list(node.subsections)
        pending.extend((child, depth + 1) for child in reversed(children))

# Print document section title hierarchy
print_hierarchy(document)

 document None None 
	 H1 Skills for Jobs: Lifelong Learning for Opportunity and Growth Presented to Parliament by the Secretary of State for Education by Command of Her Majesty 2 1
	 H1 Contents 4 2
	 H1 Foreword by the Secretary of State for Education 6 3
		 H2 Rt. Hon. Gavin Williamson CBE MP 7 3.1
	 H1 Executive Summary 8 4
		 H2 Lifetime Skills Guarantee 8 4.1
		 H2 The case for change 9 4.2
		 H2 Our plan for reform 11 4.3
		 H2 Summary of decisions 12 4.4
			 H3 Putting employers at the heart of post-16 skills: 12 4.4.1
			 H3 Providing the advanced technical and higher technical skills the nation needs: 13 4.4.2
			 H3 A flexible Lifetime Skills Guarantee: 13 4.4.3
			 H3 Responsive providers supported by effective accountability, governance, and intervention 14 4.4.4
			 H3 Supporting outstanding teaching 15 4.4.5
	 H1 Chapter 1: Putting employers at the heart of post-16 skills 16 5
		 H2 A tailored plan to meet local skills needs 18 5.1
		 H2 Investing in local skills priorit

## Create chunks from sections

In [24]:
from app.preprocessing.adobe.splitter import DocumentSplitter

# Get chunks
docs = DocumentSplitter().document_to_chunks(document)

In [25]:
# Average number of characters per chunk
total_chunk_chars = sum(len(chunk.page_content) for chunk in docs)
avg_chunk_len = total_chunk_chars / len(docs)
avg_chunk_len

2735.6481481481483

## Structured chunking methods below

In [26]:
def print_result(response_obj):
    """Pretty-print a QA response.

    Reads `response_obj["source_documents"]` (objects exposing `metadata` and
    `page_content`) and `response_obj["result"]` (a string). Prints every
    source chunk with a 1-based index first, then the final answer.
    """
    print("SOURCES: \n")
    for position, chunk in enumerate(response_obj["source_documents"], start=1):
        print(f"Chunk #{position}")
        print("Source Metadata: ", chunk.metadata)
        print("Source Text:", chunk.page_content, sep="\n")
        print("\n")
    print("RESULT: \n")
    print(response_obj["result"] + "\n\n")

In [27]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA


# Chat model (temperature=0) reused by later cells
llm = ChatOpenAI(
    model="gpt-3.5-turbo-1106",
    temperature=0,
    request_timeout=15,
)

# Embed the chunks and index them in a FAISS vector store
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(docs, embedding=embeddings)

# NOTE: despite its name, `retriever` is a RetrievalQA chain wrapping the vector store retriever
qa_chain_options = {
    "llm": llm,
    "chain_type": "stuff",
    "retriever": vectorstore.as_retriever(),
    "return_source_documents": True,
}
retriever = RetrievalQA.from_chain_type(**qa_chain_options)

In [None]:
query = "Are there any mentions of skills or technologies?"
response = retriever({"query": query})
print_result(response)

SOURCES: 

Chunk #1
Source Metadata:  {'H1': '4. Talent and Skills'}
Source Text:
Vision: The UK has a large, varied base of skilled, technical and entrepreneurial talent which is agile and quickly responds to the needs of industry, academia and government. This includes talent in STEM, digital and data, commercialisation and national security.


Chunk #2
Source Metadata:  {'H1': '8. Access to Physical and Digital Infrastructure'}
Source Text:
Vision: Accessibility and coordination of infrastructure attracts talent and investment, establishes anchors for innovation clusters and enables companies to scale. The UK has diverse, agile and resilient facilities to support its technology choices and works with partners globally to deliver major science and technology projects.


Chunk #3
Source Metadata:  {'H2': 'Outcomes – by 2030 we will have:', 'H1': '4. Talent and Skills'}
Source Text:
● Created an agile and responsive skills system, which delivers the skills needed to support a world-cla

# Summarizing each section

(structured chunking ends with the cell above)

In [28]:
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.pydantic_v1 import BaseModel, Field
from typing import Optional

"""
    Your answer will be exactly in the following format:
    ```json
    {{
        "summary": "<your summary here>"
    }}
    ```
"""

class SectionSummaryOutput(BaseModel):
    """Contains summary of a given section"""
    summary: Optional[str] = Field(None, description="the summary of the section")

# If we pass in a model explicitly, we need to make sure it supports the OpenAI function-calling API.
#
# Chat prompt used to summarise one document section at a time.
# Template variables: {section_title} and {section_text} (both in the first human message).
# Doubled braces ({{ }}) are escapes that render as literal braces in the final messages.
# NOTE(review): the placeholders are inserted without surrounding quotes, so the rendered
# "json" block is not strictly valid JSON - confirm whether that is intended.
prompt_template_structured = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
You're an expert policy analyst that is analyzing an economic policy document. Your goal is to summarize a given section text of a document in no more than 15-20 sentences. Don't make a longer summary than the original text.

The section text will be given to you in the following json format:
```json
{{
    "section": {{
        "title": "<section title>",
        "text": "<section text to summarize>"
    }}
}}
```

Make sure to follow these rules while summarizing (as if your life depended on it):
1. absolutely make sure that you don't skip any mentions of technologies, skills, capabilities or investments related to any of these topics: Advanced Computing, Battery Technologies, Semiconductors, Clean Energy.
2. pay attention to the intention of the section especially with regards to sentiment towards adoption or promotion of any skills.
3. if the text mentions or discusses policy initiatives related to inclusion, health, digital, green resilience make sure to include them in the summary.
4. mention any discussion of funding, investments or budget allocations.
5. in the summary, make sure to mention whether there is a certain future need for any skills or technologies
6. mention any explicit skill needs that are mentioned in the text.
7. if the section is a table of contents or an index, just return "table of contents" as the summary
8. if the entire section contains only publication citations, don't summarize it just return "references" as the summary.
            """,
        ),
        (
            "human",
            """
Here is the section json of a document to summarize:
```json
{{
    "section": {{
        "title": {section_title},
        "text": {section_text}
    }}
}}
```
            """,
        ),
        # Final nudge so the model answers through the structured (function-call) output
        ("human", "Tip: Make sure to answer in the correct format"),
    ]
)

In [29]:
from langchain.chains.openai_functions import create_structured_output_runnable
import json

def get_all_document_sections(document):
    """Flatten the section tree below `document` into a single list.

    Sections are returned parent first, followed by all of that parent's
    descendants, before moving on to the next sibling. `document` itself is
    not included. A node whose `subsections` is empty or None contributes
    nothing.
    """
    flattened = []
    for child in document.subsections or ():
        flattened.append(child)
        flattened += get_all_document_sections(child)
    return flattened


# Every section of the document, flattened (parents before their children)
all_sections = get_all_document_sections(document)

# Runnable that returns a SectionSummaryOutput instance instead of a raw chat message
runnable = create_structured_output_runnable(SectionSummaryOutput, llm, prompt_template_structured)

# Collected (section, SectionSummaryOutput) pairs, in the order of `all_sections`
section_summaries = []

for section in all_sections:
    title = section.title
    text = "\n".join(p.text for p in section.paragraphs)

    if not text:
        # Nothing to summarise: store an empty summary without calling the model
        section_summaries.append((section, SectionSummaryOutput(summary=None)))
        print("Not querying section with no text:", section.title)
        print("Saved summary as None")
        print("\n\n")
        continue

    # Response now has no content, it's the pydantic object instead
    response = runnable.invoke({"section_title": title, "section_text": text})
    pretty_resp = json.dumps(response.dict(), indent=2)

    section_summaries.append((section, response))

    print(section.section_type, section.title, section.starting_page, section.id)
    print(pretty_resp)
    print("\n\n")


H1 Skills for Jobs: Lifelong Learning for Opportunity and Growth Presented to Parliament by the Secretary of State for Education by Command of Her Majesty 2 1
{
  "summary": "references"
}



H1 Contents 4 2
{
  "summary": "table of contents"
}



H1 Foreword by the Secretary of State for Education 6 3
{
  "summary": "The section emphasizes the need for skills development to support the nation's recovery and growth post-coronavirus pandemic and Brexit. It highlights the introduction of a Lifetime Skills Guarantee by the Prime Minister to ensure access to education and training throughout individuals' lives. The document aims to address the undervaluation of further education and technical skills, emphasizing the importance of these pathways for successful careers. There is a focus on closing the skills gap in areas such as technicians, engineers, and health and social care professionals to improve productivity and international competitiveness. The White Paper outlines plans to strengt

KeyboardInterrupt: 

In [21]:
for line in document.text.split("\n"):
    print(line)

Essential Digital Skills Methodology
Sample
Fieldwork dates
Fieldwork methodology
Caveats to changing framework
•
•
•
•
•
•
•
•
•
•
•
UK Representivity and Population Estimates
https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/employmentandemployeetypes/datasets/summary
https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates/bulletins/annual midyearpopulationestimates/mid2020
oflabourmarketstatistics
Impairment classification
Ethnicity classification
Employment classification
Internet access question
Impact of higher cost of living question
A new question was asked to participants in the 2022 survey to understand how they perceive that the ongoing cost of living crisis in the UK has impacted their use of the internet and other online activities:
Given the increasing cost of living in the UK driven by higher prices on goods and services such as food, energy and fuel, which of the following apply to you?
1. I will have to give up interne

### PDFTriage style OpenAI functions

1. initial prompt question (context=document section hierarchy + summaries)


The following is an implementation of OpenAI functions with LangChain (https://python.langchain.com/docs/modules/chains/how_to/openai_functions#getting-structured-outputs):

In [12]:
from typing import Optional

from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_openai_fn_runnable,
    create_structured_output_chain,
    create_structured_output_runnable,
)
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.pydantic_v1 import BaseModel, Field

In [67]:
class Person(BaseModel):
    """Identifying information about a person."""

    name: str = Field(..., description="The person's name")
    age: int = Field(..., description="The person's age")
    fav_food: Optional[str] = Field(None, description="The person's favorite food")

# If we pass in a model explicitly, we need to make sure it supports the OpenAI function-calling API.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a world class algorithm for extracting information in structured formats.",
        ),
        (
            "human",
            "Use the given format to extract information from the following input: {input}",
        ),
        ("human", "Tip: Make sure to answer in the correct format"),
    ]
)

runnable = create_structured_output_runnable(Person, llm, prompt)
runnable.invoke({"input": "Sally is 13"})

In [15]:
from typing import Sequence


class People(BaseModel):
    """Identifying information about all people in a text."""

    people: Sequence[Person] = Field(..., description="The people in the text")


runnable = create_structured_output_runnable(People, llm, prompt)
runnable.invoke(
    {
        "input": "Sally is 13, Joey just turned 12 and loves spinach. Caroline is 10 years older than Sally."
    }
)

People(people=[Person(name='Sally', age=13, fav_food=None), Person(name='Joey', age=12, fav_food='spinach'), Person(name='Caroline', age=23, fav_food=None)])

## Tools as OpenAI Functions

https://python.langchain.com/docs/modules/agents/tools/tools_as_openai_functions

In [100]:
from langchain.schema import HumanMessage
from langchain.tools import format_tool_to_openai_function, BaseTool
from typing import Type, Sequence
from langchain.prompts import HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain.callbacks.manager import (
    AsyncCallbackManagerForToolRun,
    CallbackManagerForToolRun,
)


# Fetch Section tool
class FetchSectionsSchema(BaseModel):
    reasoning: str = Field(description="the reasoning behind the selection of a section to fetch")
    section_ids: Sequence[str] = Field(description="the exact ID(s) of the section(s) to fetch")

class FetchSectionsTool(BaseTool):
    """Tool that returns the full text of the requested document sections.

    Sections are looked up in the notebook-level `section_summaries` list, so
    the summarisation cell has to be run before this tool is used.
    """

    name = "fetch_sections"
    description = "fetches an entire section or sections from a document that might contain an answer to the question"
    args_schema: Type[FetchSectionsSchema] = FetchSectionsSchema

    def _run(
        self,
        reasoning: str,
        section_ids: Sequence[str],
        run_manager: Optional[CallbackManagerForToolRun] = None,
        **kwargs,
    ) -> Sequence[dict]:
        """Use the tool.

        Args:
            reasoning: the model's justification for its selection (not used here).
            section_ids: exact IDs of the sections to fetch.
            run_manager: optional callback manager (not used here).

        Returns:
            One dict per matching section with the keys "title", "id" and
            "text", in the order the sections appear in `section_summaries`.
        """
        # A bare string would otherwise be matched by substring ("6" in "6.1"),
        # silently returning sections that were never requested.
        if isinstance(section_ids, str):
            section_ids = [section_ids]
        wanted_ids = set(section_ids)

        # get full section text from document
        return [
            {
                "title": section.title_clean,
                "id": section.id,
                "text": "\n".join([p.text for p in section.paragraphs]),
            }
            for section, _ in section_summaries
            if section.id in wanted_ids
        ]


# Fetch pages tool
class FetchPagesSchema(BaseModel):
    page_numbers: Sequence[int] = Field(description="the page numbers to fetch")

# NOTE(review): no `_run` is defined on this class, so it looks like an unfinished stub;
# it is also commented out of the `tools` list below. TODO implement before enabling it.
class FetchPagesTool(BaseTool):
    name = "fetch_pages"
    description = "useful when you need to fetch specific pages from a document, for example to fetch multiple sections at a time"
    args_schema: Type[FetchPagesSchema] = FetchPagesSchema

# Create langchain tools
tools = [
    FetchSectionsTool(),
    #FetchPagesTool()
]
# Transform to openai functions
openai_functions = [format_tool_to_openai_function(t) for t in tools]

"""
<< Example structure of the document >>
```json
{{
    "document": {{
        "title": <title of the document>,
        "sections": [
            {{
                "title": <title of the section>,
                "pages": <list of page numbers this section spans over>,
                "summary": <brief summary of the section>,
                "sections": <list of nested sections in this section, same structure as above>
            }}
        ]
    }}
}}
```

"""
# Structured metadata prompt (initial for most questions)
structured_metadata_system_prompt = SystemMessagePromptTemplate.from_template(
"""
You're an expert policy analyst that needs to find the appropriate sections of an economic policy document that answers the given question.
Your task is to look at the summaries in the following structural metadata json and find the appropriate section IDs of the document that might contain the answer.

Strictly adhere to these rules under all circumstances:
1. if you can't find the answer from the summaries, just fetch the most relevant sections to the question
2. for questions that are similar to "what is the document about?" or "what is the summary of the document?": try to fetch initial or final sections with "summary" or "conclusion" in their title.
3. always make sure to return all sections or subsections that might be relevant to the question as their respective IDs
""")

structured_metadata_prompt = HumanMessagePromptTemplate.from_template("""
<< Question >>
{question}

<< Document >>
{document_structural_metadata}
""")
structured_metadata_prompt

HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['document_structural_metadata', 'question'], template='\n<< Question >>\n{question}\n\n<< Document >>\n{document_structural_metadata}\n'))

In [101]:
def document_to_structured_metadata(section):
    """Convert a document (or one of its sections) to nested structural metadata.

    For the root node (`section_type == "document"`) the result is
    `{"document": {"title": ..., "sections": [...]}}`. For any other section
    the result is a dict with "title", "id", "pages" (sorted) and "summary",
    plus a "sections" list when the section has subsections.

    Summaries are read from the notebook-level `section_summaries` list.

    Raises:
        ValueError: if a section has no entry in `section_summaries`.
    """
    # Check if the document is the root node
    if section.section_type == "document":
        # Use the node that was passed in, not the notebook-level `document` variable
        return {
            "document": {
                "title": section.title,
                "sections": [
                    document_to_structured_metadata(subsection)
                    for subsection in section.subsections
                ],
            }
        }

    # find section from section summaries
    match = next((pair for pair in section_summaries if pair[0] == section), None)
    if match is None:
        raise ValueError(
            f"No summary found for section {section.id!r}; run the summarisation cell first"
        )
    section, summary_response = match

    result = {
        "title": section.title_clean,
        "id": section.id,
        "pages": sorted(section.pages),
        "summary": summary_response.summary,
    }
    # Only add the "sections" key when there is at least one subsection
    if subsections := [document_to_structured_metadata(subsection) for subsection in section.subsections]:
        result["sections"] = subsections

    return result


print(json.dumps(document_to_structured_metadata(document), indent=2))


{
  "document": {
    "title": null,
    "sections": [
      {
        "title": "CONTENTS",
        "id": "1",
        "pages": [
          2
        ],
        "summary": "table of contents"
      },
      {
        "title": "EXECUTIVE SUMMARY",
        "id": "2",
        "pages": [
          4,
          5,
          6
        ],
        "summary": "The section discusses the government's goal of creating a modern class of technicians to address skills shortages and the aging workforce in the UK economy. It investigates the role of technicians in the space industry, their duties, required skills, and how employers obtain them. The report focuses on the space sector's upstream and downstream activities, highlighting the sector's significant contribution to the UK GDP and employment. It also emphasizes the rapid growth and high labor productivity of the space sector. The data collected indicates that most technicians are employed by upstream manufacturers and possess qualifications such

In [102]:
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser

# `bind` returns a new runnable instead of modifying the model in place, so its result
# must be kept; otherwise the agent below calls the model without any functions attached.
llm_with_fns = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0, request_timeout=15).bind(
    functions=openai_functions
)

# System instructions followed by the question + structural metadata message
fns_prompt = ChatPromptTemplate.from_messages(
    [
        structured_metadata_system_prompt,
        structured_metadata_prompt,
    ]
)

fns_agent = (
    fns_prompt
    | llm_with_fns
)

# fns_agent.invoke({
#     "document_structural_metadata": document_to_structured_metadata(document),
#     "question": "Which section of the document mentions funding?"
# })

In [103]:
# This works fairly well
question = "Does the document discuss specific degrees, qualifications or professions with regard to Advanced Computing, if so, how?"

# Ask the model which sections to fetch; with `functions` supplied it can answer
# with a `fetch_sections` function call instead of plain text.
# NOTE(review): the metadata dict is formatted into the prompt as-is (not JSON-encoded) -
# confirm this is the intended rendering.
fns_response_message = llm.predict_messages(
    [
        structured_metadata_system_prompt.format(),
        structured_metadata_prompt.format(
            document_structural_metadata=document_to_structured_metadata(document),
            question=question
        )
    ],
    functions=openai_functions,
)

In [104]:
fns_response_message

AIMessage(content='', additional_kwargs={'function_call': {'name': 'fetch_sections', 'arguments': '{"reasoning":"The document discusses the qualifications and roles of technicians in the UK space industry, particularly in the upstream and downstream segments. The section \'RESULTS I: THE CURRENT TECHNICIAN WORKFORCE: SIZE, ROLES, QUALIFICATIONS, AND ORIGINS\' provides detailed information about the qualifications possessed by technicians and the types of technician roles within the space industry. Additionally, the section \'QUALIFICATIONS\' discusses the qualifications typically possessed by technicians working in the space industry. These sections are likely to contain information about specific degrees, qualifications, or professions with regard to Advanced Computing.","section_ids":["6","6.1","6.2","6.3"]}'}})

In [105]:
import json

# Refine all sections into one answer if there are more than 1 section returned by the chain above
def parse_function_output(response) -> "str | list":
    """Execute the function call contained in a chat response, if there is one.

    Args:
        response: a chat message exposing `content` and `additional_kwargs`.

    Returns:
        The output of the requested tool when the message has empty content
        and a function call; otherwise the message content unchanged.

    Raises:
        ValueError: if the requested tool name is not in the `tools` list.
    """
    # Get the function call
    fn_call = response.additional_kwargs.get("function_call")

    # Plain-text answer (or no function call at all): return the content as is
    if response.content != "" or fn_call is None:
        return response.content

    # Get the attributes of the function call
    tool_name = fn_call["name"]
    tool_args = json.loads(fn_call["arguments"])

    # Get the correct tool from the tools list
    tool = next((candidate for candidate in tools if candidate.name == tool_name), None)
    if tool is None:
        raise ValueError(f"Model requested unknown tool {tool_name!r}")

    return tool._run(**tool_args)

# Fetched sections
fetched_sections = parse_function_output(fns_response_message)

In [106]:
fetched_sections

[{'title': ' RESULTS I:THE CURRENT TECHNICIAN WORKFORCE: SIZE, ROLES, QUALIFICATIONS,AND ORIGINS',
  'id': '6',
  'text': 'This section of the report outlines what the research carried out for this project reveals about issues such as: the size of the technician workforce; the types of roles that are typically undertaken by technicians in the space industry and the kinds of duties that are associated with those roles; the kind (level, and subject-matter) of qualifications those technicians typically possess; and how organisations in the space industry have until now gone about satisfying their need for technicians, in particular the balance they have struck between recruitment and (various forms of) training as a means of acquiring the technicians they need.'},
 {'title': 'THE SIZE OF THE TECHNICIAN WORKFORCE',
  'id': '6.1',
  'text': 'The evidence gathered for this study indicates that the vast majority of the technicians who are employed in the case study organisations work on upstr

### Map re-rank chain

In [62]:
# Map re-rank all sections into one answer and return the id of the sections
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers.openai_functions import PydanticOutputFunctionsParser
from langchain.prompts import PromptTemplate
from langchain.pydantic_v1 import BaseModel, Field
from langchain.schema.prompt_template import format_document
from langchain.utils.openai_functions import convert_pydantic_to_openai_function

# Chain to apply to each individual document. Chain
# provides an answer to the question based on the document
# and scores it's confidence in the answer.
map_prompt = PromptTemplate.from_template(
    "You're a world class policy analyst that is analyzing an economic policy document. Your goal is to answer the question based on the given context. "
    "\n\n<< Context >>:\n\n{context}\n\n<<Question>>: {question}"
)


class AnswerAndScore(BaseModel):
    """Return the answer to the question and a relevance score."""

    answer: str = Field(
        description="The answer to the question, which is based ONLY on the provided context."
    )
    # Keyword was misspelled ("decsription"), which left this field without a description
    score: float = Field(
        description="A 0.0-1.0 relevance score, where 1.0 indicates the provided context answers the question completely and 0.0 indicates the provided context does not answer the question at all."
    )


function = convert_pydantic_to_openai_function(AnswerAndScore)
map_chain = (
    map_prompt
    | ChatOpenAI().bind(
        temperature=0, functions=[function], function_call={"name": "AnswerAndScore"}
    )
    | PydanticOutputFunctionsParser(pydantic_schema=AnswerAndScore)
).with_config(run_name="Map")

# Final chain, which after answer and scoring based on
# each doc return the answer with the highest score
def top_answer(scored_answers):
    """Return the `answer` of the entry with the highest `score`."""
    best = max(scored_answers, key=lambda candidate: candidate.score)
    return best.answer


document_prompt = PromptTemplate.from_template("{page_content}")
map_rerank_chain = (
    (
        lambda x: [
            {
                "context": section_json_str,
                "question": x["question"],
            }
            for section_json_str in x["fetched_sections"]
        ]
    )
    | map_chain.map()
    | top_answer
).with_config(run_name="Map rerank")

In [63]:
# Runs the map-rerank chain
# map_rerank_chain.invoke({"fetched_sections": fetched_sections, "question": question})

'Section 6.2.1.1 talks about Machinists.'

### Refine chain

Recursive summary with intermediate steps over all the answers while keeping the relevant sections

In [111]:
class RefineIO(BaseModel):
    intermediate_answer: str = Field(description="your previous intermediate answer that might need to be refined with the additional context")
    section_ids: Sequence[str] = Field(description="the exact ID(s) of the sections that were used to generate the intermediate answer")

refine_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            You're a world class policy analyst that is analyzing an economic policy document by going section over section. Your goal is to answer the question based on the given section text and intermediate_answer.

            """
        ),
        (
            "human",
            """
            Here is the intermediate_answer you generated along with the section IDs that were used to generate it: \n{refine_io}
            Use the given format to refine your previous intermediate_answer with the following section: \n{section}
            Here is the question that you need to answer in intermediate_answer: {question}
            """
        ),
        ("human", "Tip: Make sure to answer in the correct format"),
    ]
)


refine_runnable = create_structured_output_runnable(RefineIO, llm, refine_prompt)

# Run the refine runnable for each fetched section while feeding the previous answer as the intermediate answer
initial_refine_io = RefineIO(intermediate_answer="", section_ids=[])
for section in fetched_sections:
    refine_io = refine_runnable.invoke({
        "refine_io": initial_refine_io.json(),
        "section": section,
        "question": question
    })
    print(refine_io)
    initial_refine_io = refine_io

refine_result = initial_refine_io

intermediate_answer='The document discusses the qualifications possessed by technicians in the space industry, including the level and subject-matter of qualifications. It does not specifically mention degrees or professions related to Advanced Computing.' section_ids=['6']
intermediate_answer='The document discusses the qualifications possessed by technicians in the space industry, including the level and subject-matter of qualifications. It does not specifically mention degrees or professions related to Advanced Computing. The evidence gathered for this study indicates that the vast majority of the technicians who are employed in the case study organisations work on upstream manufacturing projects. All 16 of the organisations that are involved, in whole or part, in upstream manufacturing activities – namely, the 11 manufacturers of components and sub-systems, the two space primes, and the three other organisations that combine upstream manufacturing with some downstream service provi

In [112]:
print("Original question: ", question)
print("Final text output", refine_result.intermediate_answer)
print("Identified relevant sections IDs: ", refine_result.section_ids)

Original question:  Does the document discuss specific degrees, qualifications or professions with regard to Advanced Computing, if so, how?
Final text output The document discusses the qualifications possessed by technicians in the space industry, including the level and subject-matter of qualifications. It does not specifically mention degrees or professions related to Advanced Computing. The evidence gathered for this study indicates that the vast majority of the technicians who are employed in the case study organisations work on upstream manufacturing projects. All 16 of the organisations that are involved, in whole or part, in upstream manufacturing activities \\u2013 namely, the 11 manufacturers of components and sub-systems, the two space primes, and the three other organisations that combine upstream manufacturing with some downstream service provision \\u2013 employ technicians. On average, in the space primes and specialist upstream manufacturers, technicians account for aro

---
#### Utils

In [240]:
for section, _ in section_summaries:
    if section.title == "5. RESULTS II:THE FUTURE TECHNICIAN WORKFORCE":
        for para in section.paragraphs:
            print(para.text)
            print()

Having discussed the origins of the case study organisations’ current technicians, we move on now to consider how the organisations in question propose to satisfy their future need for technicians.That is to say, we shall consider in this section the workforce planning strategies adopted by those space companies that employ technicians.This is an interesting and important issue, for a number of reasons.The first is that, as noted above, the increasingly difficulty of recruiting experienced technicians means that the approach most commonly adopted hitherto, namely recruitment-plus-upgrading, may not be as sustainable in the future as it was in the past. Second, many of the case study organisations, including seven of the 12 who are either currently training apprentices or are planning to do so – are growing, often very rapidly, and require increasing numbers of technicians. Of course, this reflects the more general point – made above – that the space industry is growing rapidly, both in

In [113]:
from pathlib import Path

# Batch-extract text from a numbered range of PDFs through `adobe_manager`
# and write each result to `<output_dir>/<pdf stem>.txt`.
#
# NOTE(review): `input_dir` is an absolute, machine-specific path — point it at
# your own copy of the PDFs before re-running this cell.
input_dir = "/Users/dvdblk/Downloads/pdf_files_complete/"
# Relative path: resolved against the current working directory.
output_dir = "data/processed/adobe-extract/"
input_pdfs = [f"UK_{i:02}.pdf" for i in range(86, 125)]

# Loop-invariant, so create the output directory once, up front. Doing it
# before any extraction also means an unwritable output location fails
# immediately instead of after extraction calls have already been made.
os.makedirs(output_dir, exist_ok=True)

for input_pdf in input_pdfs:
    try:
        doc = adobe_manager.get_document(os.path.join(input_dir, input_pdf))

        # Convert to text
        document_text = doc.text

        # Same base name as the PDF, with a .txt suffix
        output_basename = Path(input_pdf).stem + ".txt"

        # Explicit UTF-8: extracted text contains non-ASCII characters (curly
        # quotes, dashes), and the platform default encoding is not guaranteed
        # to be able to encode them.
        with open(os.path.join(output_dir, output_basename), "w", encoding="utf-8") as f:
            f.write(document_text)

        print(f"Processed file: {input_pdf}")
    except Exception as e:
        # Best-effort batch: report the failure and continue with the next file.
        print(f"Error processing file: {input_pdf}")
        print(e)

2023-11-19 17:29:36.521 app.preprocessing.adobe.manager INFO     Calling Adobe Extract API
Error processing file: UK_86.pdf





























































































































































Processed file: UK_87.pdf
Processed file: UK_88.pdf








Error processing file: UK_89.pdf
'NoneType' object has no attribute 'endswith'








Error processing file: UK_90.pdf
unsupported operand type(s) for +: 'NoneType' and 'str'
Processed file: UK_91.pdf




Processed file: UK_92.pdf
























Processed file: UK_93.pdf
2023-11-19 17:29:43.172 app.preprocessing.adobe.manager INFO     Calling Adobe Extract API


INFO:app.preprocessing.adobe.manager:Calling Adobe Extract API


Processed file: UK_94.pdf
Error processing file: UK_95.pdf
[Errno 2] No such file or directory: '/Users/dvdblk/Downloads/pdf_files_complete/UK_95.pdf'
Error processing file: UK_96.pdf
[Errno 2] No such file or directory: '/Users/dvdblk/Downloads/pdf_files_complete/UK_96.pdf'
Processed file: UK_97.pdf
Processed file: UK_98.pdf
Processed file: UK_99.pdf
Processed file: UK_100.pdf
Processed file: UK_101.pdf
Processed file: UK_102.pdf
Processed file: UK_103.pdf
Error processing file: UK_104.pdf
[Errno 2] No such file or directory: '/Users/dvdblk/Downloads/pdf_files_complete/UK_104.pdf'








Processed file: UK_105.pdf
2023-11-19 17:31:12.976 app.preprocessing.adobe.manager INFO     Calling Adobe Extract API


INFO:app.preprocessing.adobe.manager:Calling Adobe Extract API






Processed file: UK_106.pdf
Error processing file: UK_107.pdf
[Errno 2] No such file or directory: '/Users/dvdblk/Downloads/pdf_files_complete/UK_107.pdf'
Error processing file: UK_108.pdf
[Errno 2] No such file or directory: '/Users/dvdblk/Downloads/pdf_files_complete/UK_108.pdf'
Error processing file: UK_109.pdf
[Errno 2] No such file or directory: '/Users/dvdblk/Downloads/pdf_files_complete/UK_109.pdf'
Processed file: UK_110.pdf
Processed file: UK_111.pdf




Processed file: UK_112.pdf
Error processing file: UK_113.pdf
Error tokenizing data. C error: Expected 4 fields in line 3, saw 5

Error processing file: UK_114.pdf
unsupported operand type(s) for +: 'NoneType' and 'str'
Error processing file: UK_115.pdf
Error tokenizing data. C error: Expected 2 fields in line 5, saw 3

























Processed file: UK_116.pdf
2023-11-19 17:31:31.057 app.preprocessing.adobe.manager INFO     Calling Adobe Extract API


INFO:app.preprocessing.adobe.manager:Calling Adobe Extract API


Error processing file: UK_117.pdf
Service returned a usage error with description: description =Either quota for this operation is not available or Free Tier quota is exhausted. Please visit (www.adobe.com/go/pdftoolsapi_home) to start using Free Tier quota or (www.adobe.com/go/pdftoolsapi_err_quota) to upgrade to paid credentials.; requestTrackingId=f7505c74-fd92-4508-92e7-f390cdd058cd; statusCode=429; errorCode=UNKNOWN




Processed file: UK_118.pdf
Error processing file: UK_119.pdf
'NoneType' object has no attribute 'endswith'
2023-11-19 17:31:34.404 app.preprocessing.adobe.manager INFO     Calling Adobe Extract API


INFO:app.preprocessing.adobe.manager:Calling Adobe Extract API


Error processing file: UK_120.pdf
Service returned a usage error with description: description =Either quota for this operation is not available or Free Tier quota is exhausted. Please visit (www.adobe.com/go/pdftoolsapi_home) to start using Free Tier quota or (www.adobe.com/go/pdftoolsapi_err_quota) to upgrade to paid credentials.; requestTrackingId=f75575bf-57e2-48ce-9c70-f2b32bd7d63d; statusCode=429; errorCode=UNKNOWN
Processed file: UK_121.pdf
Processed file: UK_122.pdf
Processed file: UK_123.pdf
Processed file: UK_124.pdf
