In [1]:
!pip install --quiet --upgrade google_cloud_firestore google_cloud_aiplatform langchain langchain-google-vertexai langchain_community langchain_experimental pymupdf


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.4/149.4 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m73.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.4/90.4 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m85.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.0/209.0 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m79.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.8/131.8 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import vertexai
from vertexai.language_models import TextEmbeddingModel
from vertexai.generative_models import GenerativeModel

import pickle
from IPython.display import display, Markdown

from langchain_google_vertexai import VertexAIEmbeddings
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_experimental.text_splitter import SemanticChunker

from google.cloud import firestore
from google.cloud.firestore_v1.vector import Vector
from google.cloud.firestore_v1.base_vector_query import DistanceMeasure

In [3]:
PROJECT_ID = "qwiklabs-gcp-01-5813c5344fea"  # Replace this with your own project ID
LOCATION = "us-central1"

# Initialise the Vertex AI SDK with the project and location configured above.
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [4]:
# Embedding client shared by the chunking, document-embedding and query-embedding steps below.
embedding_model = VertexAIEmbeddings(model_name="text-embedding-004")  # or use the latest model


In [5]:
# Copy the food-safety manual PDF from Cloud Storage into the current working directory.
!gcloud storage cp gs://partner-genai-bucket/genai069/nyc_food_safety_manual.pdf .


Copying gs://partner-genai-bucket/genai069/nyc_food_safety_manual.pdf to file://./nyc_food_safety_manual.pdf

Average throughput: 121.9MiB/s


In [6]:
# Read the downloaded PDF. `data` holds whatever PyMuPDFLoader.load() returns
# (presumably one document per PDF page -- TODO confirm).
loader = PyMuPDFLoader("nyc_food_safety_manual.pdf")
data = loader.load()

In [7]:
def clean_page(page):
    """Return the text of ``page`` with line breaks and boilerplate removed.

    ``page`` is any object exposing a ``page_content`` string. The
    substitutions below are applied one after another, in order:

    1. ``"-\\n"`` is dropped, re-joining words hyphenated across a line break.
    2. Every remaining newline becomes a single space.
    3. The control characters ``\\x02`` and ``\\x03`` are removed.
    4. The two spaced-out header/footer banners are removed.
    """
    # Order matters: hyphenated line breaks must be handled before the
    # generic newline replacement turns their "\n" into a space.
    substitutions = (
        ("-\n", ""),
        ("\n", " "),
        ("\x02", ""),
        ("\x03", ""),
        ("fo d P R O T E C T I O N  T R A I N I N G  M A N U A L", ""),
        ("N E W  Y O R K  C I T Y  D E P A R T M E N T  O F  H E A L T H  &  M E N T A L  H Y G I E N E", ""),
    )

    text = page.page_content
    for old, new in substitutions:
        text = text.replace(old, new)
    return text


In [11]:
from langchain.schema import Document

# `cleaned_pages` was used below without being defined in any earlier cell, so
# a fresh "Restart Kernel -> Run All" stopped here with a NameError. Build it
# explicitly by cleaning every entry the PDF loader returned.
cleaned_pages = [clean_page(page) for page in data]

# Wrap the cleaned strings in Document objects.
# Only the first 5 entries of `cleaned_pages` are chunked in this cell.
cleaned_docs = [Document(page_content=page) for page in cleaned_pages[:5]]

# Split the documents with SemanticChunker, then keep just the text of each chunk.
semantic_chunker = SemanticChunker(embeddings=embedding_model)
chunked_docs = semantic_chunker.split_documents(cleaned_docs)
chunked_content = [doc.page_content for doc in chunked_docs]


In [12]:
# Embed every chunk text produced by the chunking cell.
# NOTE(review): the following cell reassigns `chunked_embeddings` from a
# downloaded pickle file, replacing the value computed here.
chunked_embeddings = embedding_model.embed_documents(chunked_content)

In [13]:
!gsutil cp gs://partner-genai-bucket/genai069/chunked_content.pkl .
!gsutil cp gs://partner-genai-bucket/genai069/chunked_embeddings.pkl .

chunked_content = pickle.load(open("chunked_content.pkl", "rb"))
chunked_embeddings = pickle.load(open("chunked_embeddings.pkl", "rb"))


Copying gs://partner-genai-bucket/genai069/chunked_content.pkl...
/ [0 files][    0.0 B/280.7 KiB]                                                / [1 files][280.7 KiB/280.7 KiB]                                                
Operation completed over 1 objects/280.7 KiB.                                    
Copying gs://partner-genai-bucket/genai069/chunked_embeddings.pkl...
/ [1 files][  1.8 MiB/  1.8 MiB]                                                
Operation completed over 1 objects/1.8 MiB.                                      


In [18]:
import uuid

import numpy as np
from google.cloud import firestore
from google.cloud.firestore_v1.vector import Vector

# Reuse the PROJECT_ID defined in the configuration cell near the top of the
# notebook rather than repeating the literal here, so the Vertex AI and
# Firestore clients can never point at different projects.
db = firestore.Client(project=PROJECT_ID)


In [19]:
# Reference to the "food-safety" collection; the write loop stores chunks in it
# and search_vector_database() queries it.
collection = db.collection("food-safety")


In [20]:
# zip() silently stops at the shorter input, which would hide a mismatch
# between the chunk texts and their embeddings -- fail loudly instead.
if len(chunked_content) != len(chunked_embeddings):
    raise ValueError(
        f"Got {len(chunked_content)} chunks but {len(chunked_embeddings)} embeddings"
    )

for position, (content, embedding) in enumerate(zip(chunked_content, chunked_embeddings)):
    # Derive the document ID deterministically from the chunk's position and
    # text. With random uuid4 IDs every re-run of this cell added a second
    # copy of each chunk to the collection; with a stable ID a re-run simply
    # overwrites the same documents.
    doc_id = uuid.uuid5(uuid.NAMESPACE_OID, f"{position}:{content}")
    doc_ref = collection.document(str(doc_id))

    # Convert the embedding to a Firestore Vector (values pass through float32).
    vector_embedding = Vector(np.array(embedding).astype(np.float32).tolist())

    doc_ref.set({
        "content": content,
        "embedding": vector_embedding
    })

In [None]:
# Create the vector index
#gcloud firestore indexes composite create \
# --collection-group="food-safety" \
#  --query-scope=COLLECTION \
#  --field-config='field-path=embedding,vector-config={"dimension":"768", "flat": "{}"}' \
#  --database="(default)" \
#  --project="qwiklabs-gcp-01-5813c5344fea"

# List the vector indexes
#gcloud firestore indexes composite list --database="(default)"

In [31]:
def search_vector_database(query: str, limit: int = 5) -> str:
    """Return the stored chunks most similar to ``query`` as one context string.

    The query is embedded with the notebook-level ``embedding_model`` and
    compared, using cosine distance, against the ``embedding`` field of the
    documents in the notebook-level ``collection``.

    Args:
        query: Natural-language question to search for.
        limit: Maximum number of nearest neighbours to retrieve. Defaults to
            5, the value that used to be hard-coded, so existing calls behave
            exactly as before.

    Returns:
        The ``content`` field of every retrieved document, each followed by a
        blank line (``"\\n\\n"``), concatenated in the order the query returned
        them. An empty string when no documents are returned.

    Raises:
        KeyError: If a retrieved document has no ``content`` field.
    """
    # 1. Generate the embedding of the query. The float32 round-trip matches
    #    the conversion applied to embeddings in the Firestore write loop.
    query_embedding = embedding_model.embed_query(query)
    query_vector = Vector(np.array(query_embedding).astype(np.float32).tolist())

    # 2. Get the nearest neighbours from the collection. Calling get() on the
    #    result of find_nearest() retrieves the document snapshots.
    snapshots = collection.find_nearest(
        "embedding",  # name of the vector field to compare against
        query_vector,
        distance_measure=DistanceMeasure.COSINE,
        limit=limit,
    ).get()

    # 3. Call to_dict() on each snapshot to load its data and combine the
    #    contents into a single string. join() avoids repeated string
    #    concatenation inside a loop.
    return "".join(snapshot.to_dict()["content"] + "\n\n" for snapshot in snapshots)

# Try the retrieval function with a sample question and print the combined context it returns.
query = "How should I store food?"
result = search_vector_database(query)
print(result)

 Store foods away from dripping condensate , at least six inches above the floor and with enough space between items to encourage air circulation. Freezer Storage Freezing is an excellent method for prolonging the shelf life of foods. By keeping foods frozen solid, the bacterial growth is minimal at best. However, if frozen foods are thawed and then refrozen, then harmful bacteria can reproduce to dangerous levels when thawed for the second time. In addition to that, the quality of the food is also affected. Never refreeze thawed foods, instead use them immediately. Keep the following rules in mind for freezer storage:  Use First In First Out method of stock rotation. All frozen foods should be frozen solid with temperature at 0°F or lower. Always use clean containers that are clearly labeled and marked, and have proper and secure lids. Allow adequate spacing between food containers to allow for proper air circulation. Never use the freezer for cooling hot foods. * * Tip: When receivin