# Form Parsing

In [2]:
import os, json
from llama_parse import LlamaParse
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import (
    VectorStoreIndex
    , StorageContext
    , load_index_from_storage
)
from llama_index.core.workflow import (
    StartEvent
    , StopEvent
    , Workflow
    , step
    , Event
    , Context
)
from helper import get_llama_cloud_api_key, get_llama_cloud_base_url, extract_html_content
from IPython.display import display, HTML
from llama_index.utils.workflow import draw_all_possible_flows


In [3]:
# Jupyter already runs an asyncio event loop; nest_asyncio patches it so that
# libraries which start their own loop internally can be called from notebook
# cells (presumably needed for the synchronous LlamaParse calls below -- confirm).
import nest_asyncio
nest_asyncio.apply()

In [6]:
# LlamaCloud credentials/endpoint come from the local `helper` module rather
# than being hard-coded in the notebook.
# NOTE(review): presumably these read environment variables / a .env file -- confirm in helper.py.
llama_cloud_api_key = get_llama_cloud_api_key()
llama_cloud_base_url = get_llama_cloud_base_url()

In [19]:
# Parse the blank application form into markdown, steering the parser towards
# listing the fields a candidate has to fill in.
parser = LlamaParse(
    api_key=llama_cloud_api_key
    , base_url=llama_cloud_base_url
    , result_type="markdown"
    , content_guideline_instruction="This is a job application form. Create a list of all the fields that need to be filled in."
    , formatting_instruction="Return a bulleted list of the fields ONLY."
)
parsed_documents = parser.load_data("data/fake_application_form.pdf")
# load_data hands back a list; guard against an empty one so a failed parse
# surfaces as a clear error instead of an IndexError on `[0]`.
if not parsed_documents:
    raise RuntimeError("LlamaParse returned no documents for data/fake_application_form.pdf")
# Only the first parsed document is used by the following cells.
result = parsed_documents[0]

Started parsing the file under job_id 158f0f4a-a1ec-40d6-83da-750b5a2ee94f


In [20]:
# str() of the parsed document: a short summary (doc id + truncated text).
print(result)

Doc ID: 60bda80e-cc6e-4b32-894a-c67b44a8ceb7
Text: # Big Tech Co. Job Application Form  # Position: Senior Web
Developer C3  Thanks for applying to Big Tech Co.! We are humbled that
you would consider working here.  Please fill in the following form to
help us get started.  |First Name|Last Name| |---|---| |Email|Phone|
|Linkedin|Project Portfolio| |Degree|Graduation Date| |Current Job
title|Cur...


In [21]:
# Full markdown produced by the parser; print() keeps the line breaks readable.
print(result.text)

# Big Tech Co. Job Application Form

# Position: Senior Web Developer C3

Thanks for applying to Big Tech Co.! We are humbled that you would consider working here.

Please fill in the following form to help us get started.

|First Name|Last Name|
|---|---|
|Email|Phone|
|Linkedin|Project Portfolio|
|Degree|Graduation Date|
|Current Job title|Current Employer|
|Technical Skills|Technical Skills|
|Describe why you’re a good fit for this position|Describe why you’re a good fit for this position|
|Do you have 5 years of experience in React?|Do you have 5 years of experience in React?|


In [22]:
# Local Ollama model used for the exploratory cells below.
# NOTE(review): RAGWorkflow.set_up uses "llama3.2:3b" instead -- confirm the difference is intentional.
llm = Ollama(model="llama3.2:1b")

In [23]:
# Ask the LLM to turn the parsed markdown form into a JSON object of the shape
# {"fields": [...]}. The doubled braces are f-string escapes for literal { }.
raw_json = llm.complete(
    f"""
    This is a parsed form.
    Convert it into a JSON object containing only the list 
    of fields to be filled in, in the form {{ fields: [...] }}. 
    <form>{result.text}</form>. 
    Return JSON ONLY, no markdown."""
)

In [24]:
# Inspect the raw completion text before attempting to parse it as JSON.
raw_json.text

'{ "fields": ["First Name", "Last Name", "Email", "Phone", "Linkedin", "Degree", "Graduation Date", "Current Job title", "Technical Skills", "Describe why you’re a good fit for this position", "Do you have 5 years of experience in React?" ] }'

In [25]:
# Decode the completion and keep only the list stored under "fields".
# json.loads raises if the model returned anything other than valid JSON
# (e.g. wrapped the answer in a markdown code fence).
fields = json.loads(raw_json.text)["fields"]
fields

['First Name',
 'Last Name',
 'Email',
 'Phone',
 'Linkedin',
 'Degree',
 'Graduation Date',
 'Current Job title',
 'Technical Skills',
 'Describe why you’re a good fit for this position',
 'Do you have 5 years of experience in React?']

In [26]:
class ParseFormEvent(Event):
    """Event returned by ``RAGWorkflow.set_up`` to trigger ``parse_form``."""
    # Value of the `application_form` argument passed to the workflow run
    # (a path to the form PDF in this notebook).
    application_form: str

class QueryEvent(Event):
    """One question to ask the resume query engine; sent once per form field."""
    query: str  # natural-language question built from the field name
    field: str  # name of the form field the question is about

class ResponseEvent(Event):
    """Answer produced by ``RAGWorkflow.ask_question`` for a single form field."""
    # Declared explicitly (mirroring QueryEvent) because ask_question passes
    # `field=` and fill_in_application reads `r.field`.
    field: str  # name of the form field this answer belongs to
    response: str  # text answer returned by the query engine

In [None]:
class RAGWorkflow(Workflow):
    """Fill in a job application form using facts retrieved from a resume.

    Flow of events:
        StartEvent -> set_up -> ParseFormEvent -> parse_form
        -> QueryEvent (one per form field) -> ask_question
        -> ResponseEvent (one per form field) -> fill_in_application -> StopEvent

    Run with ``await RAGWorkflow(...).run(resume_file=..., application_form=...)``.
    """
    # Directory where the resume index is persisted between runs.
    storage_dir = "./storage"
    llm: Ollama
    # NOTE(review): this attribute holds the object returned by
    # `index.as_query_engine(...)`, not the index itself; annotation kept as-is.
    query_engine: VectorStoreIndex
    embed_model: OllamaEmbedding

    @staticmethod
    def _extract_fields(raw_text: str) -> list:
        """Pull the non-empty ``fields`` list out of an LLM completion.

        Only the span from the first ``{`` to the last ``}`` is decoded, so text
        around the JSON object (such as a markdown code fence) is tolerated.

        Raises:
            ValueError: if no JSON object can be decoded, or it has no
                non-empty ``fields`` list.
        """
        start, end = raw_text.find("{"), raw_text.rfind("}")
        if start == -1 or end <= start:
            raise ValueError(f"LLM did not return a JSON object: {raw_text!r}")
        try:
            payload = json.loads(raw_text[start:end + 1])
        except json.JSONDecodeError as err:
            raise ValueError(f"LLM returned invalid JSON: {raw_text!r}") from err

        fields = payload.get("fields") if isinstance(payload, dict) else None
        if not isinstance(fields, list) or not fields:
            raise ValueError(f"LLM response has no non-empty 'fields' list: {raw_text!r}")
        # QueryEvent.field is declared as str, so normalise every entry.
        return [str(field) for field in fields]

    @step
    async def set_up(self, ctx: Context, ev: StartEvent) -> ParseFormEvent:
        """Create the LLM, embedding model and resume query engine.

        Loads the index persisted under ``storage_dir`` when that directory
        exists; otherwise parses the resume, builds the index and persists it.

        Raises:
            ValueError: if ``resume_file`` or ``application_form`` was not
                passed to ``run``.
        """
        # getattr with a default turns a missing run() argument into the
        # intended ValueError instead of an attribute lookup error on the event.
        resume_file = getattr(ev, "resume_file", None)
        application_form = getattr(ev, "application_form", None)
        if not resume_file:
            raise ValueError("Resume file is required")
        if not application_form:
            raise ValueError("Application form is required")

        # define LLM
        self.llm = Ollama(model="llama3.2:3b")
        self.embed_model = OllamaEmbedding(model_name="nomic-embed-text")

        # NOTE: an existing storage_dir is reused regardless of which
        # resume_file was passed; delete it to index a different resume.
        if os.path.exists(self.storage_dir):
            storage_context = StorageContext.from_defaults(persist_dir=self.storage_dir)
            index = load_index_from_storage(storage_context, embed_model=self.embed_model)
        else:
            # Same credentials/endpoint as the parser in parse_form.
            documents = LlamaParse(
                api_key=llama_cloud_api_key
                , base_url=llama_cloud_base_url
                , result_type="markdown"
                , content_guideline_instruction="This is a resume, gather related facts together and format it as bullet points with headers"
            ).load_data(resume_file)

            index = VectorStoreIndex.from_documents(
                documents=documents
                , embed_model=self.embed_model
            )
            index.storage_context.persist(persist_dir=self.storage_dir)

        self.query_engine = index.as_query_engine(llm=self.llm, similarity_top_k=5)

        return ParseFormEvent(application_form=application_form)

    @step
    async def parse_form(self, ctx: Context, ev: ParseFormEvent) -> QueryEvent:
        """Parse the application form and send one QueryEvent per field.

        Events are dispatched with ``ctx.send_event`` and the step itself
        returns ``None``; the number of fields is stored in the context under
        ``"total_fields"`` for ``fill_in_application``.

        Raises:
            ValueError: if the form yields no document or no usable field list.
        """
        parser = LlamaParse(
            api_key=llama_cloud_api_key
            , base_url=llama_cloud_base_url
            , result_type="markdown"
            , content_guideline_instruction="This is a job application form. Create a list of all the fields that need to be filled in."
            , formatting_instruction="Return a bulleted list of the fields ONLY."
        )

        documents = parser.load_data(ev.application_form)
        if not documents:
            raise ValueError(f"No content could be parsed from {ev.application_form!r}")
        result = documents[0]

        raw_json = self.llm.complete(
            f"""
            This is a parsed form.
            Convert it into a JSON object containing only the list 
            of fields to be filled in, in the form {{ fields: [...] }}. 
            <form>{result.text}</form>.
            Return JSON ONLY, no markdown."""
        )
        print(raw_json)
        fields = self._extract_fields(raw_json.text)
        print(f"fields: {fields}")

        # Publish the expected count before fanning out, so the collecting
        # step can never observe a response without knowing the total.
        await ctx.set("total_fields", len(fields))

        for field in fields:
            ctx.send_event(
                QueryEvent(
                    field=field
                    , query=f"How would you answer this question about the candidate? {field}"
                )
            )

        return

    @step
    async def ask_question(self, ctx: Context, ev: QueryEvent) -> ResponseEvent:
        """Answer one field's question from the resume index."""
        # Use the async query API so a pending query does not block the event
        # loop while the other per-field queries are waiting.
        response = await self.query_engine.aquery(
            f"This is a question about the specific resume we have in our database: {ev.query}"
        )
        return ResponseEvent(field=ev.field, response=response.response)

    @step
    async def fill_in_application(self, ctx: Context, ev: ResponseEvent) -> StopEvent:
        """Collect every per-field answer, then ask the LLM to fill in the form.

        Returns ``None`` until ``total_fields`` ResponseEvents have arrived;
        the final call returns a StopEvent carrying the LLM completion.
        """
        total_fields = await ctx.get("total_fields")

        # The overall time limit is set on the workflow itself
        # (RAGWorkflow(timeout=...)), so no timeout is passed here.
        responses = ctx.collect_events(ev, [ResponseEvent] * total_fields)
        if responses is None:
            return None  # still waiting for the remaining responses

        response_list = "\n".join(
            "Field: " + r.field + "\n" + "Response: " + r.response for r in responses
        )

        result = self.llm.complete(
            f"""
            You are given a list of fields in an application form and responses to
            questions about those fields from a resume. Combine the two into a list of
            fields and succinct, factual answers to fill in those fields.

            <responses>
            {response_list}
            </responses>
            """
        )
        return StopEvent(result=result)

            

In [None]:
w = RAGWorkflow(verbose=False, timeout=3000)
result = await w.run(
    resume_file="data/fake_resume.pdf"
    , application_form="data/fake_application_form.pdf"
)
print(result)

In [None]:
# Render the workflow's step/event graph to an HTML file and embed it inline.
WORKFLOW_FILE = "workflows/form_parsing_workflow.html"
# Make sure the target directory exists so writing the HTML file cannot fail
# on a fresh checkout.
os.makedirs(os.path.dirname(WORKFLOW_FILE), exist_ok=True)
draw_all_possible_flows(w, filename=WORKFLOW_FILE)
html_content = extract_html_content(WORKFLOW_FILE)
# isolated=True asks the frontend to render the HTML in its own frame.
display(HTML(html_content), metadata=dict(isolated=True))