## Creating an index from PDF files

In [None]:
#All necessary imports
from google.cloud import aiplatform
from vertexai.preview.language_models import TextEmbeddingModel
from google.cloud import bigquery
from google.cloud import aiplatform
from google.cloud import aiplatform_v1
from google.protobuf import struct_pb2
import grpc
from google.cloud.aiplatform.matching_engine._protos import match_service_pb2
from google.cloud.aiplatform.matching_engine._protos import (
    match_service_pb2_grpc,)
import time

In [None]:
#Declaring Variables
# Replace every "<...>" placeholder below before running the later cells.
project_id = "<enter project id>"
dataset_id = "<enter the dataset id of BQ>"
table_id = "<enter the table id of BQ>"
location = "<enter location of project>"
processor_id = "<enter OCR processor id>"
processor_version = "<enter OCR processor version>"
mime_type = "<enter mime type of your file format, for pdf- application/pdf>"
# Table name in "project.dataset.table" form, built from the three ids above.
full_table_id = f"{project_id}.{dataset_id}.{table_id}"

In [None]:
#Read all pdf files
import glob
import json

# glob returns matches in arbitrary, filesystem-dependent order. The position
# of each chunk derived from this list is later used as its id both in
# BigQuery and in the index file, so sort the paths to make that numbering
# reproducible from one run to the next.
files = sorted(glob.glob("<enter location of pdf files available>"))
files

In [None]:
#Function to generate embeddings for the text using gecko model
def text_embedding(vocab_list, batch_size=5, model=None):
    """
    Text embedding with a Large Language Model.

    The strings are sent to the model in consecutive batches and the
    per-batch results are concatenated in input order. Results are
    collected in a list local to the call, so calling the function more
    than once never mixes the output of different calls.

    Args:

    vocab_list : List
                 Contains list of strings
    batch_size : int
                 Number of strings sent per request (default 5, the
                 value this notebook has always used)
    model : object, optional
            Anything exposing get_embeddings(list_of_strings). When
            omitted, "textembedding-gecko@001" is loaded once and
            reused for every batch.

    Returns:

    final_embedding : List
                      One embedding object per input string, in input
                      order (the notebook reads each object's
                      ``.values``). Empty when vocab_list is empty.

    Raises:

    ValueError : If batch_size is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if not vocab_list:
        # Nothing to embed: skip loading the model and calling the service.
        return []
    if model is None:
        # Load once instead of once per batch.
        model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

    final_embedding = []
    for start in range(0, len(vocab_list), batch_size):
        batch = vocab_list[start:start + batch_size]
        final_embedding.extend(model.get_embeddings(batch))
    return final_embedding

In [None]:
def chunk_sent(doc_text, chunk_size=2000):
    """
    This function splits an entire document into consecutive chunks
    of at most ``chunk_size`` whitespace-separated words. Runs of
    whitespace are collapsed, because each chunk is rebuilt by joining
    its words with single spaces.

    Args:

    doc_text : String
               Contains entire document text
    chunk_size : int
                 Maximum number of words per chunk (default 2000)

    Returns:

    chunked_sent : List
                   List of strings containing chunked
                   sentences; empty when doc_text has no words

    Raises:

    ValueError : If chunk_size is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    word_list = doc_text.split()
    # Stepping by chunk_size yields exactly one start offset per chunk.
    return [
        " ".join(word_list[start:start + chunk_size])
        for start in range(0, len(word_list), chunk_size)
    ]

In [None]:
# Build the corpus: OCR every file, split its text into chunks and collect
# all chunks in one flat list (in file order).
# NOTE(review): process_document_ocr_sample is not defined in the visible part
# of this notebook; it must be defined or imported before this cell runs.
vocab_list = []
for file_path in files:
    doc_text = process_document_ocr_sample(
        project_id,
        location,
        processor_id,
        processor_version,
        file_path,
        mime_type,
    )

    # Split this document's text and append its chunks to the corpus
    chunked_sent = chunk_sent(doc_text)
    vocab_list += chunked_sent

# Embed the whole corpus; embedding_list[i] belongs to vocab_list[i]
embedding_list = text_embedding(vocab_list)

In [None]:
#Code to store the index and its corresponding value in BigQuery
bq_client = bigquery.Client()
destination = bigquery.table.Table.from_string(full_table_id)

# One row per chunk: "Index" is the chunk's position in vocab_list (stored as
# a string) and "Value" is the chunk text.
bq_rows = [
    {"Index": str(index), "Value": str(value)}
    for index, value in enumerate(vocab_list)
]

# Send the rows in batches instead of one request per row. The batch is kept
# small because every row carries a whole text chunk.
BQ_INSERT_BATCH_SIZE = 100
for batch_start in range(0, len(bq_rows), BQ_INSERT_BATCH_SIZE):
    batch = bq_rows[batch_start:batch_start + BQ_INSERT_BATCH_SIZE]
    # insert_rows_json reports failures through its return value rather than
    # by raising, so an unchecked call loses rows silently.
    insert_errors = bq_client.insert_rows_json(destination, batch)
    if insert_errors:
        raise RuntimeError(
            f"BigQuery rejected rows in the batch starting at {batch_start}: "
            f"{insert_errors}"
        )

In [None]:
#Creating the JSONL file to create Index
# One JSON object per line: "id" is the embedding's position in embedding_list
# (as a string) and "embedding" is that embedding's vector.
with open("index_file.json", "w") as f:
    for i, embedding in enumerate(embedding_list):
        val_dict = {"id": str(i), "embedding": embedding.values}
        # write(), not writelines(): the payload is a single string.
        f.write(json.dumps(val_dict) + "\n")

In [None]:
#Creating Index in Vertex Matching Engine
# NOTE(review): `location` is the same variable that is passed to the OCR call
# earlier in this notebook -- confirm that one value is valid for both services.
aiplatform.init(project=project_id, location=location)
# NOTE(review): the JSONL file is written locally by the previous cell; copying
# it to the gs:// path given in contents_delta_uri is not shown in this notebook.
# dimensions presumably has to equal the length of each embedding vector
# written to the JSONL file -- TODO confirm for the model in use.
index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name="<enter a display name for the index>",
    contents_delta_uri="<gs path of jsonl file created above>",
    dimensions=768,
    approximate_neighbors_count=150,
    distance_measure_type="COSINE_DISTANCE",
    leaf_node_embedding_count=500,
    leaf_nodes_to_search_percent=7,
    description="<description of your index created>",
)

## Creating an Endpoint and deploying the index on the endpoint

In [None]:
#Deploying to an endpoint
#Create Endpoint
# Region-derived API host name; ENDPOINT and PARENT are only defined here and
# are not read by any other cell visible in this notebook.
REGION = "<enter region where you want to create endpoint>"
ENDPOINT = "{}-aiplatform.googleapis.com".format(REGION)

PROJECT_ID = project_id
PARENT = "projects/{}/locations/{}".format(PROJECT_ID, REGION)

# IPython shell escape (only works inside a notebook/IPython session): the
# command's output lines are captured as a list, and the first line is kept.
PROJECT_NUMBER = !gcloud projects list --filter="PROJECT_ID:'{PROJECT_ID}'" --format='value(PROJECT_NUMBER)'
PROJECT_NUMBER = PROJECT_NUMBER[0]

NETWORK_NAME = "<VPC network name>"

# Network resource path built from the project number (not the project id).
VPC_NETWORK_NAME = "projects/{}/global/networks/{}".format(PROJECT_NUMBER, NETWORK_NAME)
VPC_NETWORK_NAME

In [None]:
#Creating an endpoint
# DISPLAY_NAME is used for both the display name and the description, and is
# reused later to derive the deployed index id.
DISPLAY_NAME="<Endpoint display Name>"
index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=DISPLAY_NAME,
    description=DISPLAY_NAME,
    project = project_id,
    network=VPC_NETWORK_NAME,
    location = REGION
    #IMPORTANT if you want to use a public endpoint you need to use aiplatform_v1beta1 when query or inserting vectors
    # https://cloud.google.com/vertex-ai/docs/matching-engine/deploy-index-public
    # NOTE: when enabling the line below, also add a trailing comma after
    # `location = REGION` above, otherwise the call no longer parses.
    # public_endpoint_enabled=True
)

In [None]:
# Re-attach to an existing index and endpoint by resource name. This rebinds
# `index` and `index_endpoint`, replacing the objects assigned by the creation
# cells above.
INDEX_RESOURCE_NAME = "projects/<project number>/locations/<region>/indexes/<index id>"
index = aiplatform.MatchingEngineIndex(index_name=INDEX_RESOURCE_NAME)

ENDPOINT_RESOURCE_NAME = "projects/<project number>/locations/<region>/indexEndpoints/<endpoint id>"
index_endpoint = aiplatform.MatchingEngineIndexEndpoint(index_endpoint_name=ENDPOINT_RESOURCE_NAME)

In [None]:
#Deploy index to the created endpoint
# The deployed index id is DISPLAY_NAME with '-' replaced by '_'. Other
# characters (e.g. spaces) are passed through unchanged.
# NOTE(review): confirm the resulting id satisfies the service's naming rules,
# and use the same id as `deployed_index_id` when querying the endpoint.
deployed_index = index_endpoint.deploy_index(
    index=index, deployed_index_id=DISPLAY_NAME.replace('-','_')
)

## Inference

In [None]:
#inference embedding function
from google.cloud import aiplatform
from vertexai.preview.language_models import TextEmbeddingModel
def text_embedding(vocab_list):
    """
    Embed the given strings with the gecko text-embedding model.

    This definition rebinds the name of the batched helper defined
    earlier in the notebook; here the whole list is sent in one
    get_embeddings call.

    Args:

    vocab_list : List
                 Contains list of strings

    Returns:

    The result of get_embeddings for vocab_list
    """
    gecko = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")
    return gecko.get_embeddings(vocab_list)

In [None]:
# Embed the user question and ask the deployed index for its nearest neighbour.
# `query` is a one-element list; query[0] is reused later in the prompt.
query = ["Write a Query"]
query_embeddings = text_embedding(query)
# NOTE(review): deployed_index_id is a placeholder here; it presumably has to
# be the id that was passed to deploy_index -- confirm before running.
response = index_endpoint.match(
    deployed_index_id="<deployed_index_id_name>",
    queries=[query_embeddings[0].values],
    num_neighbors=1
)

In [None]:
# Perform a query to fetch value from BQ against the index
client = bigquery.Client()

# Neighbours found for the first (and only) query embedding; may be empty.
neighbors = response[0] if response else []

# Default used when there is no neighbour or no matching row, so the prompt
# built afterwards can still be formatted.
context = ""
if neighbors:
    # Kept under its own name so the `index` handle to the Matching Engine
    # index is not overwritten.
    matched_index_id = str(neighbors[0].id)

    # - The table id is back-quoted so project ids containing '-' parse.
    # - The matched id is bound as a query parameter; the previous text
    #   compared the Index column with itself, which matches every row.
    # - Rows were inserted with Index as a string; CAST keeps the comparison
    #   valid whether the column is declared STRING or INT64.
    # - Value is selected because that is the column read from each row.
    QUERY = (
        f"SELECT Value FROM `{full_table_id}` "
        "WHERE CAST(`Index` AS STRING) = @matched_index_id "
        "LIMIT 1"
    )
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter(
                "matched_index_id", "STRING", matched_index_id
            ),
        ]
    )
    query_job = client.query(QUERY, job_config=job_config)  # API request
    rows = query_job.result()  # Waits for query to finish
    for row in rows:
        context = row.Value

## Generating text based on the semantic search context

In [None]:
#Generating Text
from vertexai.preview.language_models import TextGenerationModel

# The prompt embeds the chunk fetched from BigQuery (`context`) and the user's
# question (`query[0]`). The step count in the first line matches the two
# numbered steps that follow.
prompt=f"""
Follow exactly these 2 steps:
1. Read the context below and aggregate this data
Context : {context}
2. Answer the question using only this context
User query: {query[0]}

If you don't have any context and are unsure of the answer, reply that you don't know about this topic.
"""

model = TextGenerationModel.from_pretrained('text-bison@001')
# NOTE: this rebinds `response`, which previously held the match result.
response = model.predict(
        prompt,
        temperature=0.2,
        top_k=40,
        top_p=.8,
        max_output_tokens=1024,
)
print(f"Response: \n{response.text}")