# Blocking out the functionality for doc tech
This API will:
- receive an arbitrary query from the user as a textual transcript from speech to text
- decide which of a fixed set of commands is relevant
- return a structured response based on which command is relevant

Some things this will do, for instance:
- text to page number
- text to command for scrolling
- text to which PDF to open

In [None]:
!pip install langgraph langchain-openai

Collecting langgraph
  Downloading langgraph-0.2.35-py3-none-any.whl.metadata (13 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-core<0.4,>=0.2.39 (from langgraph)
  Downloading langchain_core-0.3.10-py3-none-any.whl.metadata (6.3 kB)
Collecting langgraph-checkpoint<3.0.0,>=2.0.0 (from langgraph)
  Downloading langgraph_checkpoint-2.0.1-py3-none-any.whl.metadata (4.6 kB)
Collecting openai<2.0.0,>=1.40.0 (from langchain-openai)
  Downloading openai-1.51.2-py3-none-any.whl.metadata (24 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.4,>=0.2.39->langgraph)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting langsmith<0.2.0,>=0.1.125 (from langchain-core<0.4,>=0.2.39->langgraph)
  Downloading langsmith-0.1.134-py3-none-a

In [None]:
import os
from google.colab import userdata

# Copy the value stored under the Colab secret 'OpenAIAPIKey' into the
# OPENAI_API_KEY environment variable.
# NOTE(review): presumably this is where langchain-openai picks the key up — confirm.
os.environ['OPENAI_API_KEY'] = userdata.get('OpenAIAPIKey')

# Note:
I'm thinking I'll first explore what type of thing I need to do

In [None]:
from typing import TypedDict
from langgraph.graph import StateGraph, START, END
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# Action determiner: structured-output schema with one boolean flag per
# command the assistant can be asked to perform.
# (Documented with `#` comments on purpose: this class is handed to
# `with_structured_output`, so its body is left untouched.)
class Action(TypedDict):
    scroll_up: bool
    scroll_down: bool
    next_page: bool
    previous_page: bool
    snap_page: bool
    find_fig: bool
    find_pdf: bool
    non_determ: bool

# Routing prompt: a system instruction listing the candidate actions, followed
# by the conversation supplied under the "messages" key.
# The document-search bullet is named `find_pdf` so that it matches the key of
# the `Action` schema the model has to fill in.
# NOTE(review): `next_page` / `previous_page` exist on `Action` but are not
# described in this prompt — confirm whether they should be listed.
action_parse_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """Decide if the user wants one of the following actions performed:
            - `scroll_up`: scroll up a bit on the pdf
            - `scroll_down`: scroll down a bit on the pdf
            - `snap_page`: snap to a specific page of a pdf
            - `find_fig`: find a specific figure
            - `find_pdf`: find a specific doc
            - `non_determ`: no valid action is discernable
            These are mutually exclusive. One should be true, the rest should be false.
            """,
        ),
        ("placeholder", "{messages}"),
    ]
)

# Routing chain: the prompt feeds gpt-4o (temperature 0), whose reply is
# constrained to the `Action` schema.
_action_llm = ChatOpenAI(model="gpt-4o", temperature=0)
action_parser = action_parse_prompt | _action_llm.with_structured_output(Action)

# Quick manual check: one assistant greeting followed by a sample user request.
user_query = 'hey doctor show me a document with that one diagram about a horse'
_action_demo_messages = [
    ("ai", "my name is doc tech, what action would you like me to perform?"),
    ("user", user_query),
]
action_parser.invoke({"messages": _action_demo_messages})

{'scroll_up': False,
 'scroll_down': False,
 'next_page': False,
 'previous_page': False,
 'snap_page': False,
 'find_fig': True,
 'find_pdf': False,
 'non_determ': False}

# Note:
ok now I need to handle each of these cases.

In [None]:
# Follow-up schema for the snap-to-page action: a single integer field.
class SnapPage(TypedDict):
    snap_page: int

# System instruction, then the conversation supplied under "messages".
_snap_page_system_turn = (
    "system",
    """Parse out the specific page of the pdf the user wants to snap to.
            """,
)
snap_page_parse_prompt = ChatPromptTemplate.from_messages(
    [_snap_page_system_turn, ("placeholder", "{messages}")]
)

# gpt-4o at temperature 0, with its reply constrained to `SnapPage`.
_snap_page_llm = ChatOpenAI(model="gpt-4o", temperature=0)
snap_page_parser = snap_page_parse_prompt | _snap_page_llm.with_structured_output(SnapPage)

# Manual check: the current page is told to the model through the assistant turn.
user_query = 'go up two pages'
current_page = 6
_snap_page_demo_messages = [
    ("ai", f"my name is doc tech, what page would you like to snap to. You are currently on page {current_page}"),
    ("user", user_query),
]
snap_page_parser.invoke({"messages": _snap_page_demo_messages})

{'snap_page': 8}

In [None]:
# Follow-up schema for figure search: one free-text description field.
class FigDesc(TypedDict):
    figure_description: str

# System instruction, then the conversation supplied under "messages".
_fig_desc_system_turn = (
    "system",
    """The user wants to find a figure. Extract a description of the figure the user needs.
            """,
)
fig_desc_parse_prompt = ChatPromptTemplate.from_messages(
    [_fig_desc_system_turn, ("placeholder", "{messages}")]
)

# gpt-4o at temperature 0, with its reply constrained to `FigDesc`.
_fig_desc_llm = ChatOpenAI(model="gpt-4o", temperature=0)
fig_desc_parser = fig_desc_parse_prompt | _fig_desc_llm.with_structured_output(FigDesc)

# Manual check with a disfluent, speech-to-text style request.
user_query = 'hey doc tech, yeah, i uh, i need a diagram of a snow a uh a snow globe with no yea with out a top'
_fig_desc_demo_messages = [
    ("ai", "my name is doc tech, describe the figure you want me to find."),
    ("user", user_query),
]
fig_desc_parser.invoke({"messages": _fig_desc_demo_messages})

{'figure_description': 'a diagram of a snow globe without a top'}

In [None]:
# Follow-up schema for document search: one free-text description field.
class DocDesc(TypedDict):
    doc_description: str

# System instruction, then the conversation supplied under "messages".
doc_desc_parse_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """The user wants to find a document. Extract a description of the document the user needs.
            """,
        ),
        ("placeholder", "{messages}"),
    ]
)

# gpt-4o at temperature 0, with its reply constrained to `DocDesc`.
doc_desc_parser = doc_desc_parse_prompt | ChatOpenAI(
    model="gpt-4o", temperature=0
).with_structured_output(DocDesc)

# Manual check. The assistant turn asks for a *document* (it previously said
# "figure", copied over from the figure parser), so the conversation agrees
# with the system instruction.
user_query = 'hey doc tech, yeah, i uh, i need a uh status report for twenty 24'
doc_desc_parser.invoke({"messages": [("ai", "my name is doc tech, describe the document you want me to find."),("user", user_query)]})

{'doc_description': 'status report for 2024'}

# Note:
I think it would be super useful to be able to understand if the user wants to be on the same pdf or not. So I'll make a parser for that general logic.

In [None]:
# Schema for the "stay on the current document?" check: a single boolean.
class SameDoc(TypedDict):
    same_doc: bool

# System instruction, then the conversation supplied under "messages".
# NOTE(review): the system text is a sentence fragment; it relies on the
# `same_doc` field name to tell the model what to answer.
_same_doc_system_turn = (
    "system",
    """if the user explicitly said they want to do something on this document.
            """,
)
same_doc_parse_prompt = ChatPromptTemplate.from_messages(
    [_same_doc_system_turn, ("placeholder", "{messages}")]
)

# gpt-4o at temperature 0, with its reply constrained to `SameDoc`.
_same_doc_llm = ChatOpenAI(model="gpt-4o", temperature=0)
same_doc_parser = same_doc_parse_prompt | _same_doc_llm.with_structured_output(SameDoc)

# Manual check: the request ends with "on this".
user_query = 'find that one colorful figure somewhere on this'
_same_doc_demo_messages = [
    ("ai", "my name is doc tech, describe a figure you want me to find."),
    ("user", user_query),
]
same_doc_parser.invoke({"messages": _same_doc_demo_messages})

{'same_doc': True}

# Note:
ok we're making headway. Now I need a way to search a GroundX (GX) bucket for specific documents/figures.

Bucket ID: 11795

In [None]:
!pip install groundx-python-sdk --upgrade
!pip install openai

Collecting groundx-python-sdk
  Downloading groundx_python_sdk-1.3.27-py3-none-any.whl.metadata (33 kB)
Collecting cryptography<43.0.0,>=42.0.5 (from groundx-python-sdk)
  Downloading cryptography-42.0.8-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (5.3 kB)
Downloading groundx_python_sdk-1.3.27-py3-none-any.whl (359 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 359.4/359.4 kB 8.8 MB/s eta 0:00:00
Downloading cryptography-42.0.8-cp39-abi3-manylinux_2_28_x86_64.whl (3.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.9/3.9 MB 53.2 MB/s eta 0:00:00
Installing collected packages: cryptography, groundx-python-sdk
  Attempting uninstall: cryptography
    Found existing installation: cryptography 43.0.1
    Uninstalling cryptography-43.0.1:
      Successfully uninstalled cryptography-43.0.1
Successfully installed cryptography-42.0.8 groundx-python-sdk-1.3.27


In [None]:
from pprint import pprint
from groundx import Groundx
from google.colab import userdata

# GroundX client, authenticated with the value stored under the Colab secret
# 'GroundXAPIKey_daniel.warfield'.
groundx = Groundx(
    api_key=userdata.get('GroundXAPIKey_daniel.warfield')
)

In [None]:
from pprint import pprint
from groundx import Groundx

# GroundX bucket that every search in this notebook runs against.
bucket_id = 11795

def gx_search_figure(query):
    """Search the GroundX bucket and locate the first match for a figure query.

    Args:
        query: Free-text description of the figure to look for.

    Returns:
        A ``(source_url, page_number)`` tuple built from the first search
        result; ``page_number`` is read from that result's first bounding box.
        ``(None, None)`` is returned when the search yields no results, and
        ``page_number`` is ``None`` when the result carries no bounding boxes.
    """
    response = groundx.search.content(
        id=bucket_id,
        query=query
    )

    results = response.body['search']['results']
    if not results:
        # Nothing matched: report "not found" rather than raising IndexError.
        return None, None

    # NOTE(review): assumes results come back ranked best-first — confirm.
    top_hit = results[0]

    boxes = top_hit['boundingBoxes']
    page_number = boxes[0]['pageNumber'] if boxes else None

    return top_hit['sourceUrl'], page_number

gx_search_figure('Theres a colorful block puzzle thing and I cant figure it out')

('https://upload.groundx.ai/file/11795/babys-first-blocks.pdf', 2)

In [None]:
from pprint import pprint
from groundx import Groundx

# GroundX bucket that every search in this notebook runs against.
bucket_id = 11795

def gx_search_document(query):
    """Search the GroundX bucket and return the source URL of the first match.

    Args:
        query: Free-text description of the document to look for.

    Returns:
        The ``sourceUrl`` of the first search result, or ``None`` when the
        search yields no results.
    """
    response = groundx.search.content(
        id=bucket_id,
        query=query
    )

    results = response.body['search']['results']
    if not results:
        # Nothing matched: report "not found" rather than raising IndexError.
        return None

    # NOTE(review): assumes results come back ranked best-first — confirm.
    return results[0]['sourceUrl']

gx_search_document('build document of a coffee table')

'https://upload.groundx.ai/file/11795/lack-coffee-table-black-brown__aa-472482-3-2.pdf'

# Tying it all together
given a query:
- decide on the action to take
- do any processing that's required
- return a response

In [None]:
!pip install langgraph langchain-openai
!pip install groundx-python-sdk --upgrade
!pip install openai

Collecting langgraph
  Downloading langgraph-0.2.35-py3-none-any.whl.metadata (13 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-core<0.4,>=0.2.39 (from langgraph)
  Downloading langchain_core-0.3.10-py3-none-any.whl.metadata (6.3 kB)
Collecting langgraph-checkpoint<3.0.0,>=2.0.0 (from langgraph)
  Downloading langgraph_checkpoint-2.0.1-py3-none-any.whl.metadata (4.6 kB)
Collecting openai<2.0.0,>=1.40.0 (from langchain-openai)
  Downloading openai-1.51.2-py3-none-any.whl.metadata (24 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.4,>=0.2.39->langgraph)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting langsmith<0.2.0,>=0.1.125 (from langchain-core<0.4,>=0.2.39->langgraph)
  Downloading langsmith-0.1.134-py3-none-a

In [None]:
import os
from google.colab import userdata
# Copy both API keys out of Colab secrets into environment variables so that
# later code can read them from os.environ.
os.environ['GROUNDX_API_KEY'] = userdata.get('GroundXAPIKey_daniel.warfield')
os.environ['OPENAI_API_KEY'] = userdata.get('OpenAIAPIKey')

In [None]:
from groundx import Groundx
from google.colab import userdata
from typing import TypedDict
from langgraph.graph import StateGraph, START, END
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
import os

# Setting up API keys
# The GroundX client is built from the GROUNDX_API_KEY environment variable;
# the lookup raises KeyError if that variable has not been set.
groundx = Groundx(
    api_key=os.environ['GROUNDX_API_KEY']
)

#===============================================================================
# Action Parsing
#===============================================================================

# Action determiner: structured-output schema with one boolean flag per
# command the assistant can be asked to perform.
# (Documented with `#` comments on purpose: this class is handed to
# `with_structured_output`, so its body is left untouched.)
class Action(TypedDict):
    scroll_up: bool
    scroll_down: bool
    next_page: bool
    previous_page: bool
    snap_page: bool
    find_fig: bool
    find_pdf: bool
    non_determ: bool

# Routing prompt: a system instruction listing the candidate actions, followed
# by the conversation supplied under the "messages" key.
# The document-search bullet is named `find_pdf` so that it matches the key of
# the `Action` schema and the flag that `handle_query` checks.
# NOTE(review): `next_page` / `previous_page` exist on `Action` but are not
# described in this prompt — confirm whether they should be listed.
action_parse_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """Decide if the user wants one of the following actions performed:
            - `scroll_up`: scroll up a small amount within one page of the pdf
            - `scroll_down`: scroll down a small amount within one page of the pdf
            - `snap_page`: snap to a specific page of a pdf
            - `find_fig`: find a specific figure, table, image, or specific item.
            - `find_pdf`: find a specific doc
            - `non_determ`: no valid action is discernable
            These are mutually exclusive. One should be true, the rest should be false.
            note: you can use snap_page to go to a page relative to the current page.
            note: blanket questions should default to find figure, unless they're obviously about a document
            """,
        ),
        ("placeholder", "{messages}"),
    ]
)

# Routing chain: the prompt feeds gpt-4o (temperature 0), whose reply is
# constrained to the `Action` schema.
_action_llm = ChatOpenAI(model="gpt-4o", temperature=0)
action_parser = action_parse_prompt | _action_llm.with_structured_output(Action)

#===============================================================================
# Snap Page Parsing
#===============================================================================

# Follow-up schema for the snap-to-page action: a single integer field.
class SnapPage(TypedDict):
    snap_page: int

# System instruction, then the conversation supplied under "messages".
_snap_page_system_turn = (
    "system",
    """Parse out the specific page of the pdf the user wants to snap to.
            """,
)
snap_page_parse_prompt = ChatPromptTemplate.from_messages(
    [_snap_page_system_turn, ("placeholder", "{messages}")]
)

# gpt-4o at temperature 0, with its reply constrained to `SnapPage`.
_snap_page_llm = ChatOpenAI(model="gpt-4o", temperature=0)
snap_page_parser = snap_page_parse_prompt | _snap_page_llm.with_structured_output(SnapPage)

#===============================================================================
# Figure Description Parsing
#===============================================================================

# Follow-up schema for figure search: one free-text description field.
class FigDesc(TypedDict):
    figure_description: str

# System instruction, then the conversation supplied under "messages".
_fig_desc_system_turn = (
    "system",
    """The user wants to find a figure. Extract a description of the figure the user needs.
            """,
)
fig_desc_parse_prompt = ChatPromptTemplate.from_messages(
    [_fig_desc_system_turn, ("placeholder", "{messages}")]
)

# gpt-4o at temperature 0, with its reply constrained to `FigDesc`.
_fig_desc_llm = ChatOpenAI(model="gpt-4o", temperature=0)
fig_desc_parser = fig_desc_parse_prompt | _fig_desc_llm.with_structured_output(FigDesc)

#===============================================================================
# Document Description Parsing
#===============================================================================

# Follow-up schema for document search: one free-text description field.
class DocDesc(TypedDict):
    doc_description: str

# System instruction, then the conversation supplied under "messages".
doc_desc_parse_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """The user wants to find a document. Extract a description of the document the user needs.
            """,
        ),
        ("placeholder", "{messages}"),
    ]
)

# gpt-4o at temperature 0, with its reply constrained to `DocDesc`.
doc_desc_parser = doc_desc_parse_prompt | ChatOpenAI(
    model="gpt-4o", temperature=0
).with_structured_output(DocDesc)

# Smoke test of the parser; the result is not stored. The assistant turn asks
# for a *document* (it previously said "figure", copied over from the figure
# parser), so the conversation agrees with the system instruction.
user_query = 'hey doc tech, yeah, i uh, i need a uh status report for twenty 24'
doc_desc_parser.invoke({"messages": [("ai", "my name is doc tech, describe the document you want me to find."),("user", user_query)]})

#===============================================================================
# Search for Figure
#===============================================================================

# GroundX bucket that every search below runs against.
bucket_id = 11795

def gx_search_figure(query):
    """Search the GroundX bucket and locate the first match for a figure query.

    Args:
        query: Free-text description of the figure to look for.

    Returns:
        A ``(source_url, page_number)`` tuple built from the first search
        result; ``page_number`` is read from that result's first bounding box.
        ``(None, None)`` is returned when the search yields no results, and
        ``page_number`` is ``None`` when the result carries no bounding boxes.
    """
    response = groundx.search.content(
        id=bucket_id,
        query=query
    )

    results = response.body['search']['results']
    if not results:
        # Nothing matched: report "not found" rather than raising IndexError.
        return None, None

    # NOTE(review): assumes results come back ranked best-first — confirm.
    top_hit = results[0]

    boxes = top_hit['boundingBoxes']
    page_number = boxes[0]['pageNumber'] if boxes else None

    return top_hit['sourceUrl'], page_number

#===============================================================================
# Search for Documents
#===============================================================================

def gx_search_document(query):
    """Search the GroundX bucket and return the source URL of the first match.

    Args:
        query: Free-text description of the document to look for.

    Returns:
        The ``sourceUrl`` of the first search result, or ``None`` when the
        search yields no results.
    """
    response = groundx.search.content(
        id=bucket_id,
        query=query
    )

    results = response.body['search']['results']
    if not results:
        # Nothing matched: report "not found" rather than raising IndexError.
        return None

    # NOTE(review): assumes results come back ranked best-first — confirm.
    return results[0]['sourceUrl']

#===============================================================================
# Endpoint
#===============================================================================

def handle_query(query, context):
    """Route a transcribed user query to an action and gather follow-up data.

    Args:
        query: The user's request as text.
        context: Viewer state; it is interpolated verbatim into the snap-page
            prompt (the examples in this file pass ``{'current_page': <int>}``).

    Returns:
        The dict produced by ``action_parser`` (the action flags) extended
        with two keys: ``pdf`` (a source URL or ``None``) and ``page`` (a page
        number or ``None``).
    """
    # getting action that should be performed
    response = action_parser.invoke({"messages": [("ai", "my name is doc tech, what action would you like me to perform?"),("user", query)]})
    response['pdf'] = None
    response['page'] = None

    # doing follow up as necessary
    # Flags are read with .get(): the model's reply does not always contain
    # every key of the schema, and a missing flag should count as "not chosen".
    if response.get('snap_page'):
        parsed = snap_page_parser.invoke({"messages": [("ai", f"my name is doc tech, what page would you like to snap to. Current state: {context}"),("user", query)]})
        # The parser yields {'snap_page': <int>}; unwrap it so `page` holds a
        # page number here just as it does on the find_fig path.
        response['page'] = parsed['snap_page']
    elif response.get('find_fig'):
        # The raw query (not a parsed description) is used as the search text.
        response['pdf'], response['page'] = gx_search_figure(query)
    elif response.get('find_pdf'):
        response['pdf'] = gx_search_document(query)

    return response

In [None]:
# Demo: a blanket question about a chart, with the viewer state set to page 6.
handle_query('Show me the portfolio overlap of the s&p 500', {'current_page': 6})

{'scroll_up': False,
 'scroll_down': False,
 'next_page': False,
 'previous_page': False,
 'snap_page': False,
 'find_fig': True,
 'find_pdf': False,
 'non_determ': False,
 'pdf': 'https://upload.groundx.ai/file/11795/dashboard-sp-500-factor.pdf',
 'page': 5}

In [None]:
# Demo: a vague description of an item, with the viewer state set to page 1.
handle_query('Theres a colorful block puzzle thing and I cant figure it out?', {'current_page': 1})

{'scroll_up': False,
 'scroll_down': False,
 'snap_page': False,
 'find_fig': True,
 'find_pdf': False,
 'non_determ': False,
 'pdf': 'https://upload.groundx.ai/file/11795/babys-first-blocks.pdf',
 'page': 2}

In [None]:
# Demo: a plain navigation command that needs no follow-up search.
handle_query('Scroll up', {'current_page': 1})

{'scroll_up': True,
 'scroll_down': False,
 'snap_page': False,
 'find_fig': False,
 'find_pdf': False,
 'non_determ': False,
 'pdf': None,
 'page': None}

In [None]:
# Demo: an explicit request for a document, phrased the way speech-to-text might transcribe it.
handle_query('Pull up the S and P five hundred document', {'current_page': 1})

{'scroll_up': False,
 'scroll_down': False,
 'snap_page': False,
 'find_fig': False,
 'find_pdf': True,
 'non_determ': False,
 'pdf': 'https://upload.groundx.ai/file/11795/dashboard-sp-500-factor.pdf',
 'page': None}

In [None]:
# Demo: a how-to question that names neither a figure nor a document explicitly.
handle_query('How do I attatch the shelf on this table?', {'current_page': 1})

{'scroll_up': False,
 'scroll_down': False,
 'snap_page': False,
 'find_fig': True,
 'find_pdf': False,
 'non_determ': False,
 'pdf': 'https://upload.groundx.ai/file/11795/lack-coffee-table-black-brown__aa-472482-3-2.pdf',
 'page': 7}