In [None]:
# Install Vertex AI LLM SDK
! pip install --user --upgrade google-cloud-aiplatform==1.47.0 langchain==0.1.14 langchain-google-vertexai==0.1.3 typing_extensions==4.9.0

In [None]:
import os
from tabulate import tabulate
import pdfplumber
from operator import itemgetter
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import vertexai
from langchain_google_vertexai import (
    VertexAI,
    VertexAIEmbeddings,
    VectorSearchVectorStore,
)
from google.cloud import aiplatform
# Report the installed SDK version (the install cell pins google-cloud-aiplatform==1.47.0)
print(f"Vertex AI SDK version: {aiplatform.__version__}")

In [None]:
# Folder scanned for the PDF files to index (passed to pdf_loader below)
DIRECTORY_PATH = './PDFs/'
# Google Cloud project and region handed to vertexai.init(); replace the placeholders
PROJECT_ID = "<project_id>"  # @param {type:"string"}
REGION = "<region>"  # @param {type:"string"}

In [None]:
class pdf_loader:
    """Load every PDF in a directory into one LangChain ``Document`` per page.

    Each page is rebuilt top-to-bottom from its words and tables: words that
    fall inside a detected table are dropped from the running text and the
    table itself is emitted as an HTML ``<table>``. The trailing token of a
    page is interpreted as its printed page number, and only pages belonging
    to a numbered run that starts at page 1 are kept.
    """

    def __init__(self, directory_path) -> None:
        """Collect the PDF files found directly inside ``directory_path``.

        Args:
            directory_path: Folder to scan; a trailing slash is optional.
        """
        self.directory_path = directory_path
        # Sorted so the document order does not depend on os.listdir() ordering;
        # non-PDF entries (checkpoints, .DS_Store, sub-folders) are skipped
        # because pdfplumber cannot open them.
        self.pdfs = sorted(
            os.path.join(directory_path, name)
            for name in os.listdir(directory_path)
            if name.lower().endswith('.pdf')
            and os.path.isfile(os.path.join(directory_path, name))
        )

    def check_bboxes(self, word, table_bbox):
        """Return True when ``word`` lies strictly inside ``table_bbox``.

        Args:
            word: Mapping with ``x0``, ``top``, ``x1`` and ``bottom`` keys.
            table_bbox: Sequence ``(x0, top, x1, bottom)``.
        """
        box_x0, box_top, box_x1, box_bottom = table_bbox[:4]
        return (
            word['x0'] > box_x0
            and word['top'] > box_top
            and word['x1'] < box_x1
            and word['bottom'] < box_bottom
        )

    def format_table(self, table):
        """Render ``table`` (a list of rows) as an HTML table string.

        Empty cells in the first row are filled with the nearest non-empty
        cell to their left (the first cell is the initial fill value). The
        input is not modified.
        """
        if not table or not table[0]:
            # Nothing to forward-fill; let tabulate render whatever is there.
            return str(tabulate(table, tablefmt='html'))
        header = list(table[0])  # copy so the caller's rows are left untouched
        label = header[0]
        for position, cell in enumerate(header):
            if cell:
                label = cell
            else:
                header[position] = label
        return str(tabulate([header, *table[1:]], tablefmt='html'))

    def clean_content(self, x):
        """Normalise page text and terminate it with the ``####`` marker.

        Whitespace is collapsed, a leading token containing ``IRC:`` is
        removed, and the last token is always removed (``load`` treats it as
        the printed page number). Empty or whitespace-only input yields
        ``' ####'``.
        """
        tokens = x.split()
        start = 1 if tokens and 'IRC:' in tokens[0] else 0
        return ' '.join(tokens[start:-1]) + ' ####'

    def clean_documents(self, documents):
        """Keep only pages that belong to a numbered run starting at page 1.

        A run starts at a document whose ``page`` metadata equals 1 and ends
        at the first document without an integer page number or at the first
        document from a different ``source``. Every document is emitted at
        most once, in its original order.
        """
        final_docs = []
        in_run = False
        current_source = None
        for document in documents:
            source = document.metadata.get('source')
            if source != current_source:
                # A run must not bleed from one PDF into the next.
                current_source = source
                in_run = False
            page = document.metadata['page']
            if not isinstance(page, int):
                in_run = False
                continue
            if page == 1:
                in_run = True
            if in_run:
                final_docs.append(document)
        return final_docs

    @staticmethod
    def _page_number(text):
        """Return the trailing token of ``text`` as an int, or None if it is not a number."""
        tokens = text.split()
        # isdecimal() only accepts strings int() can parse (isdigit() also
        # accepts characters such as superscripts, which int() rejects).
        if tokens and tokens[-1].isdecimal():
            return int(tokens[-1])
        return None

    def _load_page(self, page, doc_name):
        """Build the Document for one pdfplumber page of the PDF named ``doc_name``."""
        found_tables = page.find_tables()
        table_bboxes = [table.bbox for table in found_tables]
        tables = [{'table': table.extract(), 'top': table.bbox[1]} for table in found_tables]
        non_table_words = [
            word for word in page.extract_words()
            if not any(self.check_bboxes(word, bbox) for bbox in table_bboxes)
        ]
        # Group words and tables into rows by their vertical position.
        clusters = pdfplumber.utils.cluster_objects(
            non_table_words + tables, itemgetter('top'), tolerance=5
        )
        parts = []
        for cluster in clusters:
            # A row may contain both words and tables; handle each item by its
            # own kind so neither is lost when they share a row.
            words = [item['text'] for item in cluster if 'text' in item]
            if words:
                parts.append(' '.join(words))
            parts.extend(
                self.format_table(item['table']) for item in cluster if 'table' in item
            )
        # TODO: some pages are horizontal (rotated); they get no special handling yet.
        doc_page = ' '.join(parts)
        return Document(
            metadata={'source': doc_name, 'page': self._page_number(doc_page)},
            page_content=self.clean_content(doc_page),
        )

    def load(self):
        """Parse all collected PDFs and return the cleaned list of page Documents."""
        documents = []
        for file in self.pdfs:
            doc_name = os.path.splitext(os.path.basename(file))[0]
            # Context manager guarantees the file handle is released.
            with pdfplumber.open(file) as pdf:
                for page in pdf.pages:
                    documents.append(self._load_page(page, doc_name))
        return self.clean_documents(documents)

In [None]:
# Point the Vertex AI SDK at the configured project and region
vertexai.init(
    location=REGION,
    project=PROJECT_ID,
)

In [None]:
# Loading Documents
loader = pdf_loader(DIRECTORY_PATH)
documents = loader.load()
# Show a count and a short preview instead of dumping every page into the cell output
print(f"Loaded {len(documents)} pages")
documents[:3]

In [None]:
# Child splitter: breaks text on newlines into pieces of at most 50 characters, no overlap
child_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,
    chunk_overlap=0,
    separators=['\n'],
    length_function=len,
)
# Parent splitter: breaks text on the '####' marker into pieces of at most
# 2000 characters with a 100-character overlap
parent_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=100,
    separators=['####'],
    length_function=len,
)

In [None]:
# LangChain wrapper around the Vertex AI text-embedding model
embeddings = VertexAIEmbeddings(
    model_name="textembedding-gecko@003",
)

In [None]:
#Configure parameters to create Matching Engine index
ME_REGION = "<region>"
ME_INDEX_NAME = f"{PROJECT_ID}-me-index"  # @param {type:"string"}
ME_EMBEDDING_DIR = f"{PROJECT_ID}-me-bucket"  # @param {type:"string"}
ME_DIMENSIONS = 768  # when using Vertex AI PaLM Embedding

#Make a Google Cloud Storage bucket for your Matching Engine index
! set -x && gsutil mb -p $PROJECT_ID -l us-central1 gs://$ME_EMBEDDING_DIR

#Create Index 
# NOTE : This operation can take upto 30 seconds
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name="langchain-index",
    dimensions=768,
    approximate_neighbors_count=150,
    distance_measure_type="DOT_PRODUCT_DISTANCE",
    index_update_method="STREAM_UPDATE",  # allowed values BATCH_UPDATE , STREAM_UPDATE
)
if my_index:
    print(my_index.name)

# Create an endpoint
index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=f"langchain-index-endpoint", public_endpoint_enabled=True
)
if index_endpoint:
    print(f"Index endpoint resource name: {index_endpoint.name}")
    print(
        f"Index endpoint public domain name: {index_endpoint.public_endpoint_domain_name}"
    )

# Deploy Index to endpoint
# NOTE : This operation can take upto 20 minutes
my_index_endpoint = index_endpoint.deploy_index(
    index=my_index, deployed_index_id="langchain_index_endpoint_deployed_index"
)

my_index_endpoint.deployed_indexes


In [None]:
# Wrap the deployed index and its endpoint in a LangChain vector store
me = VectorSearchVectorStore.from_components(
    embedding=embeddings,
    project_id=PROJECT_ID,
    region=ME_REGION,
    index_id=my_index.name,
    endpoint_id=index_endpoint.name,
    # Only the part of the bucket setting before any "/" is passed as the bucket name
    gcs_bucket_name=ME_EMBEDDING_DIR.split("/")[0],
    stream_update=True,
)

In [None]:
# LangChain handle on the Vertex AI text-generation model
llm = VertexAI(
    model_name="text-bison@002",
    # Sampling settings
    temperature=0.2,
    top_k=40,
    top_p=0.8,
    # Upper bound on the length of each generated answer
    max_output_tokens=1024,
    verbose=True,
)

----------------------------------------------