In [None]:
!pip install --quiet --upgrade google_cloud_firestore google_cloud_aiplatform langchain langchain-google-vertexai langchain_community langchain_experimental pymupdf

In [None]:
import IPython

# Look up the running IPython application and ask its kernel to shut down,
# passing True as the restart argument.
IPython.Application.instance().kernel.do_shutdown(True)

In [None]:
import vertexai
from vertexai.language_models import TextEmbeddingModel
from vertexai.generative_models import GenerativeModel

import pickle
from IPython.display import display, Markdown

from langchain_google_vertexai import VertexAIEmbeddings
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_experimental.text_splitter import SemanticChunker

from google.cloud import firestore
from google.cloud.firestore_v1.vector import Vector
from google.cloud.firestore_v1.base_vector_query import DistanceMeasure

In [None]:
import vertexai

# Ask the gcloud CLI for the configured project. The `!` capture is indexed on the
# next line, and its first entry is kept as the project ID.
PROJECT_ID = ! gcloud config get-value project
PROJECT_ID = PROJECT_ID[0]
# Location handed to vertexai.init below; blank here and exposed as a notebook form field.
LOCATION = "" # @param {type:"string"}

print(PROJECT_ID)

# Initialize the Vertex AI SDK with the project and location chosen above.
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [None]:
from langchain_google_vertexai import VertexAIEmbeddings

# Initialize a specific version of the embeddings model ("text-embedding-004").
embedding_model = VertexAIEmbeddings(model_name="text-embedding-004")

In [None]:
!curl -LO github.com/cloudlabguru/gcp-cloudskillboost/blob/main/Create%20and%20Deploy%20a%20RAG%20application%20with%20vector%20search%20in%20Firestore/nyc_food_safety_manual.pdf

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader
# Load the downloaded PDF. Later cells iterate over `data` item by item and read
# each item's .page_content.
data = PyMuPDFLoader("nyc_food_safety_manual.pdf").load()

In [None]:
def clean_page(page):
  """Return the text of ``page`` with extraction artifacts removed.

  The substitutions are applied in order to ``page.page_content``:
  hyphen + newline pairs are dropped (re-joining the split text), remaining
  newlines become spaces, the \\x02 and \\x03 control characters are removed,
  and two fixed letter-spaced header strings are stripped.

  Args:
    page: Any object exposing a ``page_content`` string attribute.

  Returns:
    The cleaned text as a new string.
  """
  # Order matters: "-\n" must be handled before the generic "\n" replacement.
  substitutions = (
    ("-\n", ""),
    ("\n", " "),
    ("\x02", ""),
    ("\x03", ""),
    ("fo d P R O T E C T I O N  T R A I N I N G  M A N U A L", ""),
    ("N E W  Y O R K  C I T Y  D E P A R T M E N T  O F  H E A L T H  &  M E N T A L  H Y G I E N E", ""),
  )
  text = page.page_content
  for old, new in substitutions:
    text = text.replace(old, new)
  return text

In [None]:
# Run every loaded page through clean_page, keeping the original page order.
cleaned_pages = list(map(clean_page, data))

In [None]:
from langchain_experimental.text_splitter import SemanticChunker

# Build a SemanticChunker that uses the embedding model created earlier
doc_chunker = SemanticChunker(embeddings=embedding_model)

# Only the first five cleaned pages are chunked here
docs = doc_chunker.create_documents(
    cleaned_pages[:5]
)

# Keep just the text of each resulting chunk
chunked_content = [chunk.page_content for chunk in docs]

In [None]:
# Embed every chunk of text with the embedding model created earlier.
chunked_embeddings = embedding_model.embed_documents(chunked_content)

In [None]:
!curl -LO github.com/cloudlabguru/gcp-cloudskillboost/blob/main/Create%20and%20Deploy%20a%20RAG%20application%20with%20vector%20search%20in%20Firestore/chunked_content.pkl
!curl -LO github.com/cloudlabguru/gcp-cloudskillboost/blob/main/Create%20and%20Deploy%20a%20RAG%20application%20with%20vector%20search%20in%20Firestore/chunked_embeddings.pkl

chunked_content = pickle.load(open("chunked_content.pkl", "rb"))
chunked_embeddings = pickle.load(open("chunked_embeddings.pkl", "rb"))

In [None]:
# Create a Firestore client for the configured project and get a reference to the
# 'food-safety' collection used by the cells below.
db = firestore.Client(project=PROJECT_ID)
collection = db.collection('food-safety')

In [None]:
from google.cloud import firestore
# This cell re-creates the client and collection reference already defined in the
# previous cell. Pass the project explicitly so both definitions agree and the
# later one does not silently fall back to whatever default project is ambient.
db = firestore.Client(project=PROJECT_ID)
collection = db.collection("food-safety")

In [None]:
# Store each chunk with its embedding as one document named doc_<position>.
for index, (text, vector) in enumerate(zip(chunked_content, chunked_embeddings)):
    payload = {"content": text, "embedding": Vector(vector)}
    collection.document(f"doc_{index}").set(payload)

In [None]:
import subprocess

# Create the vector index on the `embedding` field of the food-safety collection.
# The original shell line passed the literal text "PROJECT_ID" as the project. An
# argument list is used because the JSON braces in --field-config get in the way
# of IPython's {var} interpolation on `!` lines.
index_result = subprocess.run(
    [
        "gcloud", "firestore", "indexes", "composite", "create",
        f"--project={PROJECT_ID}",
        "--collection-group=food-safety",
        "--query-scope=COLLECTION",
        '--field-config=vector-config={"dimension":"768","flat": "{}"},field-path=embedding',
    ],
    capture_output=True,
    text=True,
)
# Print gcloud's output. As with the `!` form, a non-zero exit status does not raise,
# so re-running the cell does not abort the notebook.
print(index_result.stdout, index_result.stderr, sep="")

In [None]:
def search_vector_database(query: str, limit: int = 5) -> list:
  """Return the content of the stored documents nearest to ``query``.

  The query is embedded with ``embedding_model`` and compared against the
  ``embedding`` field of ``collection`` using Euclidean distance.

  Args:
    query: Text to search for.
    limit: Maximum number of nearest documents to request (default 5,
      the value that was previously hard-coded).

  Returns:
    A list holding the ``content`` field of each result, in the order the
    query streams them.
  """
  query_embedding = embedding_model.embed_query(query)
  vector_query = collection.find_nearest(
    vector_field="embedding",
    query_vector=Vector(query_embedding),
    distance_measure=DistanceMeasure.EUCLIDEAN,
    limit=limit,
  )
  # Keep only the text content of each streamed result.
  return [result.to_dict()["content"] for result in vector_query.stream()]

In [None]:
# Example query: the returned list of matching contents is shown as the cell output.
search_vector_database("How should I store food?")