In [1]:
# RESTART KERNEL AFTER INSTALLATION
!pip install pypdf

In [1]:
# GET HELPER FUNCTIONS NEEDED FOR MATCHING ENGINE LATER IN NB
import os
import urllib.request

# Create the local package directory. exist_ok=True makes re-running the cell a
# no-op here and avoids a separate check-then-create step.
os.makedirs("utils", exist_ok=True)

# NOTE(review): the files are fetched from the `main` branch, so their contents
# can change between runs -- consider pinning the URL to a specific commit.
url_prefix = "https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/language/use-cases/document-qa/utils"
files = ["__init__.py", "matching_engine.py", "matching_engine_utils.py"]

# Download each helper module into utils/ so it can later be imported as
# `utils.<module>`; existing local copies are overwritten.
for fname in files:
    urllib.request.urlretrieve(f"{url_prefix}/{fname}", filename=f"utils/{fname}")

In [2]:
import json
import textwrap

# Utils
import time
import uuid
from typing import List

import numpy as np
import vertexai

# Vertex AI
from google.cloud import aiplatform

print(f"Vertex AI SDK version: {aiplatform.__version__}")

# LangChain
import langchain

print(f"LangChain version: {langchain.__version__}")

from typing_extensions import TypeAlias
from langchain.chains import RetrievalQA
from langchain.document_loaders import GCSDirectoryLoader
from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Import custom Matching Engine packages
from utils.matching_engine import MatchingEngine
from utils.matching_engine_utils import MatchingEngineUtils

2024-04-25 23:10:19.355304: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Vertex AI SDK version: 1.35.0
LangChain version: 0.0.323


In [3]:
# DEFINED FUNCTION IN PY FILE TO CONVERT PDF TO TEXT
from helpers import extract_full_pdf_text

In [4]:
# IPython's `!` capture returns the command's output as a list of lines;
# the first line is then taken as the project id.
PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]
REGION = "us-central1" # CHANGE TO REGION YOU'VE BEEN USING

# Initialize Vertex AI SDK
vertexai.init(project=PROJECT_ID, location=REGION)

In [5]:
# Utility functions for Embeddings API with rate limiting
def rate_limit(max_per_minute):
    """Generator that paces its caller to roughly `max_per_minute` steps per minute.

    The first `next()` prints "Waiting" and returns immediately. Every later
    `next()` measures the time since the previous `next()` returned and, if
    that is shorter than 60 / max_per_minute seconds, prints a "." and sleeps
    for the remainder. The generator never terminates on its own.

    Args:
        max_per_minute: Allowed number of steps per minute. The division is
            evaluated on the first `next()`, so a value of 0 raises
            ZeroDivisionError there rather than at creation time.

    Yields:
        None on every step.
    """
    min_interval = 60 / max_per_minute
    print("Waiting")
    while True:
        handed_back_at = time.time()
        yield
        # Time still owed before the caller may proceed with its next step
        remaining = min_interval - (time.time() - handed_back_at)
        if remaining > 0:
            print(".", end="")
            time.sleep(remaining)


class CustomVertexAIEmbeddings(VertexAIEmbeddings):
    """VertexAIEmbeddings variant that embeds documents in small, throttled batches."""

    # Request ceiling per minute; handed to rate_limit() for pacing.
    requests_per_minute: int
    # Number of texts sent in a single get_embeddings call.
    num_instances_per_batch: int

    # Overriding embed_documents method
    def embed_documents(self, texts: List[str]):
        """Embed `texts` batch by batch, pausing between batches as needed.

        Args:
            texts: The documents to embed.

        Returns:
            One embedding (the `.values` of each response item) per input
            text, in input order. An empty input yields an empty list without
            calling the API.

        Raises:
            ValueError: If `num_instances_per_batch` is smaller than 1. The
                previous slicing loop never consumed its input in that case
                and therefore never terminated.
        """
        batch_size = self.num_instances_per_batch
        if batch_size < 1:
            raise ValueError(
                f"num_instances_per_batch must be >= 1, got {batch_size}"
            )

        limiter = rate_limit(self.requests_per_minute)
        results = []
        docs = list(texts)

        # Working in batches because the API accepts maximum 5
        # documents per request to get embeddings
        for start in range(0, len(docs), batch_size):
            chunk = self.client.get_embeddings(docs[start : start + batch_size])
            results.extend(chunk)
            # Advance the limiter after every request; it sleeps when the
            # request came back faster than the allowed interval.
            next(limiter)

        return [r.values for r in results]

In [6]:
# VALUES NEED TO MATCH THOSE FROM TAX_RAG NB
# Region handed to the Matching Engine helpers below
ME_REGION = "us-central1"
# Name of the Matching Engine index
ME_INDEX_NAME = "tax-rag-me-index-test"
# GCS bucket name; used as the vector store's bucket
ME_EMBEDDING_DIR = "tax-rag-me-bucket-test"
# NOTE(review): presumably the embedding vector size; not referenced in the
# cells visible in this notebook -- confirm it is still needed.
ME_DIMENSIONS = 768

In [7]:
# Throttling settings for the embeddings client
EMBEDDING_QPM = 100
EMBEDDING_NUM_BATCH = 5

# Text model wrapped for LangChain.
# NOTE(review): confirm this model version is still served in your project.
llm = VertexAI(
    model_name="text-bison@001",
    max_output_tokens=1024,
    temperature=0.2,
    top_k=40,
    top_p=0.8,
    verbose=True,
)

# Embeddings client wrapped for LangChain, batched and rate limited
embeddings = CustomVertexAIEmbeddings(
    num_instances_per_batch=EMBEDDING_NUM_BATCH,
    requests_per_minute=EMBEDDING_QPM,
)

In [8]:
# Utility wrapper constructed from the project id, region and index name.
# NOTE(review): `mengine` is not referenced again in the cells visible here --
# confirm whether this cell is still needed.
mengine = MatchingEngineUtils(PROJECT_ID, ME_REGION, ME_INDEX_NAME)

In [9]:
# SAVED MATCHING ENGINE INDEX/ENDPOINT IN LAST NB; WANT SAME TO HIT THE MODEL NOW
# `%store -r` reloads the named variables from IPython's persistent store, so
# they must have been saved with `%store` before this cell can succeed.
%store -r ME_INDEX_ID ME_INDEX_ENDPOINT_ID

In [10]:
# Build the Matching Engine vector store, backed by the text embedding model
me = MatchingEngine.from_components(
    embedding=embeddings,
    project_id=PROJECT_ID,
    region=ME_REGION,
    # Only the bucket segment is passed on: anything after a "/" is dropped
    gcs_bucket_name=ME_EMBEDDING_DIR.split("/")[0],
    index_id=ME_INDEX_ID,
    endpoint_id=ME_INDEX_ENDPOINT_ID,
)

In [11]:
# Smoke test: query the vector store directly, before any LLM is involved.
# The question is written as adjacent string literals (one sentence each),
# which Python joins into a single string.
me.similarity_search(
    (
        "Our company is a software company with its primary offering being an online data aggregation platform. "
        "Users can issue their employees various proprietary surveys, which gauge such employees' satisfaction with their workplace environment. "
        "The platform aggregates and displays the responses through easy-to-read visual aides (such as a 1-100 ranged score). "
        "We know that software companies often qualify as 'qualified small businesses' under Section 1202, which allows for a nearly unlimited exclusion of capital gain on stock (qualified small business stock or 'QSBS') in the company. "
        "However, separate counsel has told us that a 'consulting' business may be excluded under Section 1202(e)(3). "
        "Our online platform provides general 'best practices' for certain ranges of survey responses, but does not tailor any of such advice for individual clients. "
        "Would we be disqualified as a consulting business for purposes of Section 1202/QSBS?"
    ),
    k=2,
)



Waiting


[Document(page_content="The invoices provided to Company's clients for the services it performs represent billing for implementation services and embedded advice. Company does not separately bill for advice and counsel. Taxpayer represents that more than 80% of Company's assets are used in its data migration and management business.\n\nTaxpayer sold all of its shares in Company on Date 2.\n\nLaw and Analysis\n\nSection 1202(a)(1) of the Code provides that in the case of a taxpayer other than a corporation, gross income does not include 50 percent of any gain from the sale or exchange of qualified small business stock held for more than 5 years.\n\nSection 1202(a)(3) provides that in the case of qualified small business stock acquired after February 17, 2009, and on or before September 27, 2010, the exclusion is 75 percent.\n\nSection 1202(a)(4) provides that in the case of qualified small business stock acquired after September 27, 2010, the exclusion is 100 percent.", metadata={'sourc

THEN TEST THE SEARCH/LLM INTEGRATION THROUGH LANGCHAIN

In [12]:
# Retrieval settings: number of results to request and the search-distance
# threshold handed to the retriever
NUMBER_OF_RESULTS = 10
SEARCH_DISTANCE_THRESHOLD = 0.6

# Wrap the vector store as a LangChain retriever
retriever = me.as_retriever(
    search_type="similarity",
    search_kwargs=dict(
        k=NUMBER_OF_RESULTS,
        search_distance=SEARCH_DISTANCE_THRESHOLD,
    ),
)

In [14]:
## CUSTOMIZE DEFAULT RETRIEVAL PROMPT TEMPLATE
# The template has two placeholders: {question}, which appears both before and
# after the context block, and {context}, which appears once between the
# "=============" rules.
# NOTE(review): the SYSTEM line refers to "research papers", while the documents
# queried in this notebook look like IRS written determinations -- confirm the
# wording is intentional.
template = """SYSTEM: You are an intelligent assistant helping the users with their questions on research papers.

Question: {question}

Strictly Use the following pieces of context to answer the question at the end. Think step-by-step and then answer.
Whenever you answer cite the document name of the context, a one to two sentence synopsis of the document and an explanation of how it is relevant to the question.
If the context is empty, just say "there does not seem to be a relevant document in our database."

=============
{context}
=============

Question: {question}
Helpful Answer:"""

In [15]:
# CONFIGURE RETRIEVALQA CHAIN
# Prompt built from the custom template; it is filled with the retrieved
# context and the user's question.
qa_prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

# The LLM synthesizes an answer from the search-index results; the source
# documents are returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": qa_prompt},
    return_source_documents=True,
    verbose=True,
)

In [16]:
# # ENABLE VERBOSE LOGGING FOR DEBUGGING
# # Enable for troubleshooting
# # FOR HITTING MODEL DISPLAY PURPOSES COMMENTING OUT FOR NOW; UNCOMMENT IF YOU WANT TO DEBUG YOUR OWN
# qa.combine_documents_chain.verbose = True
# qa.combine_documents_chain.llm_chain.verbose = True
# qa.combine_documents_chain.llm_chain.llm.verbose = True

In [17]:
# FUNCTION TO FORMAT THE RESULT
def formatter(result):
    """Print a QA result: the query, each source document, then the response.

    Args:
        result: Mapping with "query" and "result" entries and, optionally, a
            "source_documents" sequence whose items expose `.metadata` (a
            mapping) and `.page_content` (text).

    Returns:
        None; everything is written to stdout.
    """
    dotted_rule = "." * 80
    dashed_rule = "-" * 80
    # (label, metadata key) pairs, printed in this order when the key is present
    optional_fields = (
        ("Matching Score", "score"),
        ("Document Source", "source"),
        ("Document Name", "document_name"),
    )

    print(f"Query: {result['query']}")
    print(dotted_rule)
    if "source_documents" in result:
        for position, doc in enumerate(result["source_documents"]):
            print(dashed_rule)
            print(f"REFERENCE #{position}")
            print(dashed_rule)
            for label, key in optional_fields:
                if key in doc.metadata:
                    print(f"{label}: {doc.metadata[key]}")
            print(dotted_rule)
            print(f"Content: \n{wrap(doc.page_content)}")
    print(dotted_rule)
    print(f"Response: {wrap(result['result'])}")
    print(dotted_rule)


def wrap(s):
    return "\n".join(textwrap.wrap(s, width=120, break_long_words=False))


def ask(query, qa=qa, k=NUMBER_OF_RESULTS, search_distance=SEARCH_DISTANCE_THRESHOLD):
    """Run `query` through the QA chain and print the formatted result.

    Args:
        query: The question to send to the chain.
        qa: The chain to use; defaults to the `qa` object bound when this
            function was defined.
        k: Value written to the retriever's "k" search setting.
        search_distance: Value written to the retriever's "search_distance"
            search setting.

    Returns:
        Whatever `formatter` returns for the chain's result.

    Note:
        The two search settings are written onto `qa.retriever.search_kwargs`
        and stay in place after the call.
    """
    qa.retriever.search_kwargs.update(search_distance=search_distance, k=k)
    return formatter(qa({"query": query}))

In [18]:
# Same question as the direct vector-store test, now through the full RAG chain.
# The question is written as adjacent string literals (one sentence each),
# which Python joins into a single string.
ask(
    (
        "Our company is a software company with its primary offering being an online data aggregation platform. "
        "Users can issue their employees various proprietary surveys, which gauge such employees' satisfaction with their workplace environment. "
        "The platform aggregates and displays the responses through easy-to-read visual aides (such as a 1-100 ranged score). "
        "We know that software companies often qualify as 'qualified small businesses' under Section 1202, which allows for a nearly unlimited exclusion of capital gain on stock (qualified small business stock or 'QSBS') in the company. "
        "However, separate counsel has told us that a 'consulting' business may be excluded under Section 1202(e)(3). "
        "Our online platform provides general 'best practices' for certain ranges of survey responses, but does not tailor any of such advice for individual clients. "
        "Would we be disqualified as a consulting business for purposes of Section 1202/QSBS?"
    ),
    k=2,
)





[1m> Entering new RetrievalQA chain...[0m
Waiting

[1m> Finished chain.[0m
Query: Our company is a software company with its primary offering being an online data aggregation platform. Users can issue their employees various proprietary surveys, which gauge such employees' satisfaction with their workplace environment. The platform aggregates and displays the responses through easy-to-read visual aides (such as a 1-100 ranged score). We know that software companies often qualify as 'qualified small businesses' under Section 1202, which allows for a nearly unlimited exclusion of capital gain on stock (qualified small business stock or 'QSBS') in the company. However, separate counsel has told us that a 'consulting' business may be excluded under Section 1202(e)(3). Our online platform provides general 'best practices' for certain ranges of survey responses, but does not tailor any of such advice for individual clients. Would we be disqualified as a consulting business for purposes

SO, LOOKING THROUGH THE MODEL'S ANSWER, WE CAN SEE THAT THE CORE ARCHITECTURE IS SOUND. THE VECTOR SEARCH FEEDS CONTEXT INTO THE LLM PROMPT, AND THE LLM CORRECTLY CITES THE RESULTS AND AVOIDS GIVING HALLUCINATED ANSWERS WHEN IT IS NOT CLEAR HOW THE RESULTS ANSWER THE QUESTION.

FURTHER, WHILE IT DECLINES TO GIVE A DEFINITIVE ANSWER, THE CASES THEMSELVES ARE RELEVANT PRECEDENT FOR THE QUESTION. THE ANSWER IS NOT AS USEFUL AS WE WOULD LIKE, BUT FOR A FIRST PASS, AVOIDING AN INCORRECT ANSWER WHILE DISPLAYING RELEVANT SEARCH RESULTS IS AT LEAST HELPFUL.

AS A NEXT STEP (WITH A BIGGER MODELING BUDGET), I WOULD BE INTERESTED IN USING LARGER CHUNKS OF THE PDFS; GIVEN THAT THE CASES ARE RELEVANT, I'D LIKE TO SEE WHETHER THE MODEL CAN GIVE MORE DEFINITIVE ANSWERS IF EXPOSED TO THE ENTIRETY (OR A LARGER FRACTION) OF THE WRITTEN DETERMINATIONS WITH MEANINGFUL PRECEDENTIAL VALUE.

AS A FINAL STEP, I ALSO BUILT OUT FIRST-PASS FUNCTIONALITY FOR UPLOADING A PDF TO THE MODEL AND ASKING IT TO RETURN SIMILAR CASES. FURTHER ITERATIONS OF THE MODEL COULD INCORPORATE THIS AND MORE INVOLVED QUESTIONS ABOUT PRECEDENT BASED ON THE CASE.

In [19]:
def pdf_similarity_search(filename):
    """Search the vector store using the text of the PDF at `filename` as the query.

    The text is obtained with `extract_full_pdf_text` (imported from helpers)
    and passed to `me.similarity_search` with its default settings.
    """
    pdf_text = extract_full_pdf_text(filename)
    return me.similarity_search(pdf_text)

In [20]:
# VECTOR SEARCH
# A RENAMED PDF; EXPERIMENT WITH WHICHEVER YOU PREFER FROM THE PULLED ONES
pdf_similarity_search("example_case.pdf")

Waiting


[Document(page_content='Dear --------------:\n\nThis is in response to a letter sent on your behalf by your representatives dated ----------- ------------. In the letter, your representatives requested a ruling on your behalf that Company is engaged in a qualified trade or business as defined in section 1202(e)(3) of the Internal Revenue Code (Code) for purposes of qualifying for the exclusion of gain under section 1202(a)(1) and is not engaged in a trade or business involving the performance of services in the field of consulting within the meaning of section 1202(e)(3)(A).\n\nFACTS\n\nCompany was formed in Year 1 and elected classification as an association to be taxed as a C corporation. It is owned by A individual shareholders, one of whom is Taxpayer. On Date 1, Company elected to be classified as an S corporation but Company had more than one class of stock in violation of section 1.361-1(l)1 of the Regulations. Thus, Taxpayer represents that its S corporation election was immedi

In [21]:
# FULL MODEL ASK
# Extract the PDF text first, then embed it in the question sent through the QA chain
example_case_text = extract_full_pdf_text("example_case.pdf")
ask(f"what is a case similar to {example_case_text}?")



[1m> Entering new RetrievalQA chain...[0m
Waiting

[1m> Finished chain.[0m
Query: what is a case similar to   
Internal Revenue Service  Department of the Treasury  
Washington, DC 20224  
Number: 202342013  
Release Date: 10/20/2023  
Index Number:  1202.00 -00 
 
----------------------  
----------------------------  
-----------------------------  
 
 
 
 
 
   
Third Party Communication: None  
Date of Communication: Not Applicable  
Person To Contact:  
---------------------- , ID No. -----------------  
Telephone Number:  
--------------------  
Refer Reply To:  
CC:ITA:08  
PLR-120748 -22 
Date: 
July 24, 2023  
 
 
 
TY: -------  
 
Legend:  
 
Taxpayer = ------------------------------------------------  
Company = ------------------------  
Date 1 = -----------------------  
Date 2 = ------------------  
Year 1 = -------  
A = -- 
 
 
Dear ----------------- : 
 
This is in response to a letter sent on your behalf by your representatives dated -----------
------------ .  

SO, WE CAN SEE THE PDF INTEGRATION WORKS WELL. THE MODEL FOUND THE CORRECT CASE (I.E. THE EXACT ONE WE USED FOR TESTING PURPOSES) AND ADEQUATELY DESCRIBED WHY THE TWO ARE RELEVANT TO EACH OTHER. WHILE WE FED IT AN EASY ONE (I.E. ONE ALREADY IN OUR DB), THE FUNCTIONALITY IS CORRECT FOR MORE COMPLICATED EXAMPLES LATER.