In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 文件問答與檢索增強生成

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/doggy8088/generative-ai/blob/main/search/retrieval-augmented-generation/examples/rag_google_documentation.zh.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/doggy8088/generative-ai/blob/main/search/retrieval-augmented-generation/examples/rag_google_documentation.zh.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/doggy8088/generative-ai/main/search/retrieval-augmented-generation/examples/rag_google_documentation.zh.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
</table>


---

* 作者：Gabe Rives-Corbett

---

此筆記本展示如何執行檢索增強生成，並進行基本的自動化評估。它展示了區塊大小、重疊和上下文長度對模型輸出的影響。此筆記本將建立一個問答系統，可讓你根據 Google Cloud Generative AI 文件尋找資訊。


## 開始使用


### 安裝函式庫


In [None]:
# Quietly install/upgrade the pinned Vertex AI SDK into the user site-packages of this kernel.
%pip install -q --upgrade --user google-cloud-aiplatform==1.36.1

### 重新啟動目前的執行階段

要在此 Jupyter 執行階段中使用新安裝的套件，你必須重新啟動執行階段。你可以執行下列Cell來執行此項操作，如此將重新啟動目前的Kernel。


In [2]:
# Restart kernel after installs so that your environment can access the new packages
import IPython
import time  # NOTE(review): `time` is not used in this cell — confirm whether it can be dropped

# Ask the running kernel to shut down; the True argument requests a restart.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

<div class="alert alert-block alert-warning">
<b>⚠️ Kernel將重新啟動。請等待它完成，再繼續執行下一個步驟。⚠️</b>
</div>


### 認證你的 notebook 環境 (僅限 Colab) 

如果你是在 Google Colab 上執行這個筆記本，你將需要認證你的環境。為執行這項工作，請執行下列新的Cell。如果你使用的是 [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench)，則不需要執行這個步驟。


In [None]:
import sys

# The authentication call is skipped unless the google.colab module has
# already been loaded into this interpreter.
if "google.colab" in sys.modules:
    # Authenticate user to Google Cloud
    from google.colab import auth

    auth.authenticate_user()

### 匯入函式庫


In [None]:
import requests
import itertools
import numpy as np
import pandas as pd
import numpy.linalg
import vertexai

from google.api_core import retry
from vertexai.language_models import TextEmbeddingModel, TextGenerationModel
from tqdm.auto import tqdm
from bs4 import BeautifulSoup, Tag

tqdm.pandas()

## 設定筆記本環境

### 設定以下常數以反映你的環境


In [6]:
# Define project information
# NOTE(review): "[your-project-id]" looks like a placeholder — set it to a real project ID before running.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Initialize Vertex AI SDK
# Every later model call in this notebook relies on this project/location pair.
vertexai.init(project=PROJECT_ID, location=LOCATION)

## 從 Google Cloud 文件中擷取文字


從文字檔擷取 Google 文件網址清單


In [9]:
# Download the newline-separated list of documentation URLs that forms the corpus.
url = "https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/search/retrieval-augmented-generation/examples/URLs.txt"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)

# Fail loudly on an error response. Previously `content` was only bound when the
# status was exactly 200, so any other status surfaced as a confusing NameError.
response.raise_for_status()
content = response.text

# Strip surrounding whitespace and skip blank lines so no empty URL is fetched later.
URLS = [line.strip() for line in content.splitlines() if line.strip()]

解析 HTML 和提取相關純文字區段


In [35]:
# Given a Google documentation URL, retrieve a list of all text chunks within h2 sections
def get_sections(url: str, timeout: float = 30.0) -> list[str]:
    """Fetch a documentation page and return its text grouped by ``<h2>`` section.

    The first entry is the introduction: every ``<p>`` found inside the
    ``devsite-article-body`` div before the first ``<h2>``. It is followed by
    one entry per ``<h2>`` on the page, built from the ``<p>`` and ``<ul>``
    siblings that follow the header up to the next ``<h2>``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        One space-joined string per section. Entries may be empty strings when
        a section contains no matching elements.
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")

    sections = []

    # Introduction: paragraphs in the article body that precede the first <h2>.
    # A page without the expected body div simply contributes no introduction.
    body_div = soup.find("div", class_="devsite-article-body")
    if body_div is not None:
        intro_paragraphs = []
        for child in body_div.find_all(True):
            if child.name == "h2":
                break
            if child.name == "p":
                intro_paragraphs.append(child.get_text().strip())
        # Appended after the loop so the introduction is kept even when the
        # page has no <h2> at all.
        sections.append(" ".join(intro_paragraphs))

    for header in soup.find_all("h2"):
        paragraphs = []
        for sibling in header.next_siblings:
            # Bare strings between tags carry no section content of their own.
            if not isinstance(sibling, Tag):
                continue
            if sibling.name == "h2":
                break
            if sibling.name in {"p", "ul"}:
                paragraphs.append(sibling.get_text().strip())
        # Appended unconditionally: the final section has no following <h2>, so
        # appending only when the next header is met would drop it.
        sections.append(" ".join(paragraphs))
    return sections

In [None]:
# Flatten the sections of every URL into a single list, dropping empty strings.
all_text = [t for url in URLS for t in get_sections(url) if t]

請注意，大多數區段相對較短，但有些長達數千個字元


In [None]:
# Length of each section as reported by len(), plotted as a histogram.
text_lengths = [len(t) for t in all_text]
pd.DataFrame(text_lengths).hist()

## 建立向量儲存

開始初始化模型


In [None]:
# Load the pretrained embedding model and text-generation model by versioned name.
# Both objects are used as globals by the helper functions defined below.
embeddings_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")
text_model = TextGenerationModel.from_pretrained("text-bison@001")

為向量相似度和區塊擷取建立一些輔助函式


In [None]:
# Separates seq into multiple chunks in the specified size with the specified overlap
def split_overlap(seq, size, overlap):
    """Split ``seq`` into chunks of at most ``size`` items overlapping by ``overlap``.

    Consecutive chunks start ``size - overlap`` items apart. The final chunk
    may be shorter than ``size``; it is kept so that no trailing content is
    lost (zipping strided slices would silently discard it).

    Args:
        seq: A string (or a sequence of strings) to split.
        size: Maximum number of items per chunk.
        overlap: Number of items shared by neighbouring chunks. Must be smaller
            than ``size`` whenever ``seq`` actually needs to be split.

    Returns:
        A list of string chunks. When ``len(seq) <= size`` the result is
        ``[seq]`` unchanged.

    Raises:
        ValueError: If ``seq`` needs splitting and ``overlap >= size``, since
            the chunk window could then never advance.
    """
    if len(seq) <= size:
        return [seq]

    step = size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than size ({size}) to split a sequence"
        )

    chunks = []
    for start in range(0, len(seq), step):
        chunks.append("".join(seq[start : start + size]))
        # Stop once a chunk reaches the end, so no redundant tail chunks that
        # lie entirely inside the previous chunk are emitted.
        if start + size >= len(seq):
            break
    return chunks


# Build a cosine-similarity scorer bound to one query vector, ready to hand to Pandas `apply`
def get_similarity_fn(query_vector):
    """Return a one-argument function that scores a vector against ``query_vector``.

    Args:
        query_vector: The vector every later argument is compared with.

    Returns:
        A callable taking one vector and returning the dot product of that
        vector and ``query_vector`` divided by the product of their norms.
    """

    def cosine_to_query(row):
        dot_product = np.dot(row, query_vector)
        norm_product = numpy.linalg.norm(row) * numpy.linalg.norm(query_vector)
        return dot_product / norm_product

    return cosine_to_query


# Embed one text with the notebook-level embeddings model, wrapped in retry logic
@retry.Retry(timeout=300.0)
def get_embeddings(text):
    """Return the embedding values for a single ``text``.

    The text is sent to ``embeddings_model`` as a one-element batch and the
    ``values`` of the first returned embedding are handed back. The call is
    wrapped in ``retry.Retry`` with a 300 second timeout.
    """
    batch = embeddings_model.get_embeddings([text])
    return batch[0].values

建立向量儲存，我們正在使用 Pandas DataFrame


In [None]:
def create_vector_store(texts, chunk_size, overlap):
    """Chunk ``texts`` and embed every chunk into a two-column DataFrame.

    Args:
        texts: Iterable of strings to index.
        chunk_size: Chunk size forwarded to ``split_overlap``.
        overlap: Overlap forwarded to ``split_overlap``.

    Returns:
        A DataFrame with a ``texts`` column holding the chunks and an
        ``embeddings`` column holding one NumPy array per chunk.
    """
    # Chunk each text, then flatten the per-text chunk lists into one list
    chunks = list(
        itertools.chain.from_iterable(
            split_overlap(text, chunk_size, overlap) for text in texts
        )
    )

    store = pd.DataFrame()
    store["texts"] = chunks

    # Embed chunk by chunk with a progress bar, then wrap each result as an array
    raw_embeddings = store["texts"].progress_apply(get_embeddings)
    store["embeddings"] = raw_embeddings.apply(np.array)

    return store

In [None]:
# Baseline chunking parameters passed to create_vector_store below.
CHUNK_SIZE = 400
OVERLAP = 50

# Build the baseline store; this embeds every chunk of all_text.
vector_store = create_vector_store(all_text, CHUNK_SIZE, OVERLAP)

In [None]:
# Preview the first rows of the store (chunk text and its embedding).
vector_store.head()

## 搜尋向量儲存並用來生成


如果我們把問題只丟給基礎模型，它會出現幻覺。


In [None]:
# Baseline: send the question to the model with no retrieved context.
text_model.predict(
    "How long will a stable model version of text-bison be available?"
).text

讓我們透過從向量儲存中擷取文字來解決此問題，並告訴模型使用它們。

將查詢轉換為嵌入向量並搜尋相似的向量，從向量儲存中找出相關文字以插入提示中。


In [None]:
def get_context(question, vector_store, num_docs):
    """Return the stored texts most similar to ``question``, joined by spaces.

    Args:
        question: Query text; it is embedded with ``get_embeddings``.
        vector_store: DataFrame with ``texts`` and ``embeddings`` columns.
        num_docs: How many of the best-scoring texts to keep.

    Returns:
        A single string containing the selected texts separated by spaces,
        ordered from the highest similarity score to the lowest.
    """
    # Embed the search query
    query_vector = np.array(get_embeddings(question))

    # Score every stored vector against the query and keep the labels of the
    # num_docs best rows, best first
    similarities = vector_store["embeddings"].apply(get_similarity_fn(query_vector))
    top_matched = similarities.sort_values(ascending=False).head(num_docs).index

    # Select by label so the ranked order is preserved. Filtering with
    # `index.isin` would return the rows in storage order and lose the ranking.
    top_texts = vector_store.loc[top_matched, "texts"]

    # Return a string with the top matches
    context = " ".join(top_texts.values)
    return context

建立一個包含情境和問題的提示。指示 LLM 僅使用所提供的內容來回答問題


In [None]:
def answer_question(question, vector_store, num_docs=10, print_prompt=False):
    """Answer ``question`` using only text retrieved from ``vector_store``.

    Args:
        question: The question to answer.
        vector_store: Store passed to ``get_context`` for retrieval.
        num_docs: Number of retrieved texts placed in the prompt.
        print_prompt: When true, print the full prompt before calling the model.

    Returns:
        The text of the model's reply, generated with ``temperature=0``.
    """
    context = get_context(question, vector_store, num_docs)

    # The prompt tells the model to stay within the context and to admit when
    # the answer is not there
    qa_prompt = f"""Your mission is to answer questions based on a given context. Remember that before you give an answer, you must check to see if it complies with your mission.
Context: ```{context}```
Question: ***{question}***
Before you give an answer, make sure it is only from information in the context. If the information is not in the context, just reply "I don't know the answer to that". Think step by step.
Answer: """

    if print_prompt:
        print(qa_prompt)

    return text_model.predict(qa_prompt, temperature=0).text

觀察完整生成的提示，上下文已被嵌入其中。即使輸入的上下文相當凌亂，模型現在也能給出符合事實的回答。


In [None]:
# Same question as the baseline, now with retrieval; print_prompt=True also prints the prompt.
answer_question(
    "How long will a stable model version of text-bison be available?",
    vector_store,
    print_prompt=True,
)

In [None]:
# Repeat the call without printing the prompt so only the answer is displayed.
answer_question(
    "How long will a stable model version of text-bison be available?", vector_store
)

## 自動化評估

這個 RAG 實作的效果取決於區塊大小、區塊之間的重疊、作為上下文輸入的文字區塊數量，以及提示本身。我們建立一個簡單的提示來評估問題的答案，藉此調整參數並了解這些調整帶來的影響。


In [None]:
def eval_answer(question, answer, context):
    """Ask the text model to grade ``answer`` to ``question`` against ``context``.

    Args:
        question: The question that was asked.
        answer: The answer being graded.
        context: The context text the answer should be based on.

    Returns:
        The model's reply converted with ``int``; the prompt asks for a number
        from 0 to 5.

    Raises:
        ValueError: If the model's reply cannot be parsed by ``int``.
    """
    eval_prompt = f"""Your mission is to evaluate answers to questions based on a given context. Remember that before you give an answer, you must check to see if it complies with your mission.

Context: ```{context}```
Question: ***{question}***
Answer: "{answer}"

Respond only with a number from 0 to 5. Think step by step. If the provided answer is not in the context, reply 5 if it is "I don't know the answer to that" otherwise reply 0.
Relevance: """

    # A single output token plus stop sequences cuts the reply off after the integer
    generation_kwargs = dict(
        temperature=0, max_output_tokens=1, stop_sequences=[".", " "]
    )
    score_text = text_model.predict(eval_prompt, **generation_kwargs).text
    return int(score_text)

輸入數個問題並取得評估結果


In [None]:
# Evaluation questions fed to answer_question / eval_answer below.
# NOTE(review): "in put" and "verison" look like typos inside the query strings;
# they are left untouched because they change what is embedded — confirm whether intentional.
questions = [
    "What release stage is the RLHF tuning feature?",
    "Can I generate hate speech with text bison?",
    "What format should my batch prediction in put be in?",
    "How can I get the number of tokens?",
    "How do I create a custom style model?",
    "What is the dimensionality of the vector created by the multimodal model?",
    "How long will a stable model verison be available?",
]

In [None]:
# Answer every question with the baseline store (answer_question defaults to num_docs=10).
answers = [answer_question(q, vector_store) for q in questions]
# Retrieve the context again with the same num_docs=10 so it can be shown to the grader.
contexts = [get_context(q, vector_store, 10) for q in questions]
# Flag answers that contain the "I don't know" phrase.
idks = ["I don't know" in a for a in answers]
# One (question, answer, context, score, idk) tuple per question.
evals = [
    (question, answer, context, eval_answer(question, answer, context), idk)
    for question, answer, context, idk in zip(questions, answers, contexts, idks)
]

In [None]:
# Show the evaluation tuples as a table with named columns.
pd.DataFrame(evals, columns=["question", "answer", "context", "score", "idk"])

現在調整參數，看看效能有什麼差異


In [None]:
def eval_on_params(chunk_size, overlap, num_docs):
    """Rebuild the vector store with the given settings and grade every question.

    Args:
        chunk_size: Chunk size passed to ``create_vector_store``.
        overlap: Chunk overlap passed to ``create_vector_store``.
        num_docs: Number of retrieved chunks used both to generate each answer
            and to build the context it is graded against.

    Returns:
        A DataFrame with one row per entry of ``questions`` and the columns
        ``question``, ``answer``, ``context``, ``score`` and ``idk``.
    """
    vector_store = create_vector_store(all_text, chunk_size, overlap)
    # Pass num_docs through: otherwise answers are generated from the default
    # 10 chunks while being graded against a context of num_docs chunks.
    answers = [answer_question(q, vector_store, num_docs) for q in questions]
    contexts = [get_context(q, vector_store, num_docs) for q in questions]
    # Flag answers that contain the "I don't know" phrase
    idks = ["I don't know" in a for a in answers]
    evals = [
        (question, answer, context, eval_answer(question, answer, context), idk)
        for question, answer, context, idk in zip(questions, answers, contexts, idks)
    ]
    return pd.DataFrame(
        evals, columns=["question", "answer", "context", "score", "idk"]
    )

較小的 chunk 大小需要較長的時間來產生嵌入向量


In [None]:
# Evaluate with chunk_size=100, overlap=0, num_docs=5.
smaller_context_df = eval_on_params(100, 0, 5)

In [None]:
# Display the results for the smaller-context run.
smaller_context_df

較大的上下文大小產生了更多「不知道」的回答。將 LLM 組合進系統時，請仔細考慮如何衡量系統中各個組成部分的效能。


In [None]:
# Evaluate with chunk_size=1000, overlap=200, num_docs=15.
larger_context_df = eval_on_params(1000, 200, 15)

In [None]:
# Display the results for the larger-context run.
larger_context_df