In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 使用 Retrieval Augmented Generation (RAG) 與 Codey API

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/doggy8088/generative-ai/blob/main/language/code/code_retrieval_augmented_generation.zh.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> 在 Colab 中執行
    </a>
  </td>
  <td>
    <a href="https://github.com/doggy8088/generative-ai/blob/main/language/code/code_retrieval_augmented_generation.zh.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      在 GitHub 上檢視
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/doggy8088/generative-ai/main/language/code/code_retrieval_augmented_generation.zh.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      在 Vertex AI Workbench 中開啟
    </a>
  </td>
</table>


| | |
|-|-|
|作者 | [Lavi Nigam](https://github.com/lavinigam-gcp), [Polong Lin](https://github.com/polong-lin) |


### 目標

本筆記本將展示如何透過引入外部知識來擴增 Codey API 的輸出。我們將示範一個 Code Retrieval Augmented Generation (RAG) 範例，並以 [Google Cloud 的 Generative AI GitHub 儲存庫](https://github.com/GoogleCloudPlatform/generative-ai) 作為外部知識來源。本筆記本使用 [Vertex AI PaLM API for Code](https://cloud.google.com/vertex-ai/docs/generative-ai/code/code-models-overview)、[Embeddings for Text API](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings)、FAISS 向量儲存庫和 [LangChain 🦜️🔗](https://python.langchain.com/en/latest/)。

### 總覽

以下簡述我們將涵蓋的內容。

索引建立：

1. 遞迴列出 GitHub 儲存庫中的檔案(.ipynb)
2. 從檔案中擷取程式碼和 Markdown
3. 將每個程式碼字串進行分塊並產生嵌入，然後初始化向量儲存庫

執行階段：

4. 使用者輸入提示或詢問問題作為提示
5. 嘗試零樣本 (zero-shot) 提示
6. 以 RAG Chain 執行提示並比較結果。我們使用 **code-bison** 產生回應，但也可以使用 **code-gecko** 和 **codechat-bison** 

### 成本

本教學課程使用 Google Cloud 的計費元件：

- Vertex AI PaLM API，由 Google Cloud 提供

瞭解 [Vertex AI 價格](https://cloud.google.com/vertex-ai/pricing) 並使用 [定價計算器](https://cloud.google.com/products/calculator/)，根據預計使用量估算成本。

**注意：** 本範例使用本機向量儲存庫 (FAISS)；在正式環境中，建議改用可高度擴充的向量儲存庫，例如 [Vertex AI Matching Engine](https://cloud.google.com/vertex-ai/docs/vector-search/overview)，或搭配 pgvector 擴充功能的 [AlloyDB for PostgreSQL](https://cloud.google.com/alloydb/docs/ai/work-with-embeddings) 或 [Cloud SQL for PostgreSQL](https://cloud.google.com/sql/docs/postgres/features)。


### 安裝函式庫


In [None]:
!pip install --upgrade --user -q google-cloud-aiplatform langchain==0.0.332 faiss-cpu==1.7.4

### 重新啟動執行階段


In [None]:
# Restart the kernel after the installs so that the environment can access the
# newly installed packages. This cell deliberately takes the kernel down; wait
# for it to come back before running the cells below.
import IPython

# Grab the running IPython application and ask its kernel to shut down
# (the True argument is passed as-is to do_shutdown).
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

### 驗證你的筆記本環境 (僅限 Colab)


如果你在 Google Colab 上運行這個筆記本，你將需要驗證你的環境。為此，請執行以下單元格。如果你使用的是 Vertex AI Workbench，則不需要執行這個步驟。


In [None]:
import sys

# The "google.colab" module is only present in sys.modules when running on
# Colab, so this block is a no-op everywhere else.
if "google.colab" in sys.modules:
    # Authenticate the current user to Google Cloud
    from google.colab import auth

    auth.authenticate_user()

### 匯入函式庫


In [None]:
from typing import List
import nbformat
import requests
import time

# LangChain
from langchain.llms import VertexAI
from langchain.embeddings import VertexAIEmbeddings

from langchain.schema.document import Document

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import Language
from langchain.vectorstores import FAISS

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Vertex AI
from google.cloud import aiplatform
import vertexai

# Show which version of the Vertex AI SDK is active in this kernel.
print(f"Vertex AI SDK version: {aiplatform.__version__}")

In [None]:
# Initialize the Vertex AI SDK for this notebook.
# Replace the placeholder below with your own Google Cloud project ID before running.
PROJECT_ID = "YOUR_PROJECT_ID"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

vertexai.init(project=PROJECT_ID, location=LOCATION)

# Code generation model, reused by both the plain prompts and the RAG chain below.
code_llm = VertexAI(
    model_name="code-bison@002",
    max_output_tokens=2048,
    temperature=0.1,
    verbose=False,
)

接下來我們需要建立一個 GitHub 個人存取權杖 (personal access token)，以便列出儲存庫中的所有檔案。

- 依照[此連結](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens)的說明建立 GitHub token，勾選 repo->public_repo 範圍，並更新下方的 `GITHUB_TOKEN` 變數。


In [None]:
# Provide a GitHub personal access token; it is sent as a Bearer token when
# listing the repository contents.
# WARNING: do not commit or share this notebook with a real token filled in.
GITHUB_TOKEN = "YOUR_GITHUB_TOKEN"  # @param {type:"string"}
# Repository to index, in "owner/name" form.
GITHUB_REPO = "GoogleCloudPlatform/generative-ai"  # @param {type:"string"}

# 索引建立

我們將使用 Google Cloud Generative AI GitHub 儲存庫作為資料來源。首先列出儲存庫中的所有 Jupyter Notebook 檔案，並儲存在文字檔案中。

如果你已執行一次並產生了輸出的文字檔，則可以跳過步驟(#1)。

### 1. 遞迴列出 GitHub 儲存庫中的檔案(.ipynb)


In [None]:
def crawl_github_repo(
    url: str,
    is_sub_dir: bool,
    access_token: str = GITHUB_TOKEN,
    timeout: float = 30.0,
) -> List[str]:
    """Recursively list the ``.py`` and ``.ipynb`` files of a GitHub repository.

    Args:
        url: For the top-level call, the repository in ``owner/name`` form.
            For recursive calls (``is_sub_dir=True``), the API URL of the
            directory to list, taken from the parent listing's ``url`` field.
        is_sub_dir: ``False`` for the repository root, ``True`` when ``url``
            is already a full API URL.
        access_token: GitHub token sent as a Bearer token. The default is the
            value ``GITHUB_TOKEN`` had when this function was defined.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The ``html_url`` of every matching file, in listing order.
        ``__init__.py`` files and directories starting with ``.`` are skipped.

    Raises:
        requests.exceptions.RequestException: On HTTP errors or timeouts.
    """
    ignore_list = ["__init__.py"]
    wanted_suffixes = (".py", ".ipynb")

    api_url = url if is_sub_dir else f"https://api.github.com/repos/{url}/contents"

    headers = {
        "Accept": "application/vnd.github.v3+json",
        "Authorization": f"Bearer {access_token}",
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(api_url, headers=headers, timeout=timeout)
    response.raise_for_status()  # Check for any request errors

    files = []

    for item in response.json():
        if item["type"] == "file":
            if item["name"] not in ignore_list and item["name"].endswith(
                wanted_suffixes
            ):
                files.append(item["html_url"])
        elif item["type"] == "dir" and not item["name"].startswith("."):
            # Forward the caller's token (and timeout); previously the recursion
            # silently fell back to the default token for every sub-directory.
            sub_files = crawl_github_repo(item["url"], True, access_token, timeout)
            # Brief pause between directory listings to go easy on the API.
            time.sleep(0.1)
            files.extend(sub_files)

    return files

In [None]:
# Crawl the repository starting from its root (is_sub_dir=False).
code_files_urls = crawl_github_repo(GITHUB_REPO, False, GITHUB_TOKEN)

# Persist the URLs, one per line, so the crawl does not have to be repeated
# every time the notebook is run.
with open("code_files_urls.txt", "w") as url_file:
    url_file.writelines(url + "\n" for url in code_files_urls)

len(code_files_urls)

In [None]:
# Preview the first ten collected URLs as a quick sanity check.
code_files_urls[0:10]

### 2. 從 Jupyter notebook 抽取程式碼。

你也可以包含 .py 檔案、shell 指令碼等。


In [None]:
def extract_python_code_from_ipynb(github_url, cell_type="code", timeout=30.0):
    """Download a notebook from GitHub and concatenate the source of its cells.

    Args:
        github_url: Browser URL of the notebook
            (``https://github.com/<owner>/<repo>/blob/<ref>/<path>``).
        cell_type: Only cells whose ``cell_type`` equals this value are kept
            (``"code"`` by default).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The selected cell sources joined with newlines, or ``None`` when the
        notebook has no non-empty cell of the requested type.

    Raises:
        requests.exceptions.RequestException: On HTTP errors or timeouts.
    """
    # Rewrite only the first occurrence of each marker, so a file path that
    # itself contains "github.com" or "/blob/" is left intact.
    raw_url = github_url.replace("github.com", "raw.githubusercontent.com", 1).replace(
        "/blob/", "/", 1
    )

    response = requests.get(raw_url, timeout=timeout)
    response.raise_for_status()  # Check for any request errors

    # Convert to notebook format v4 so that `notebook.cells` exists even for
    # notebooks saved in an older format (NO_CONVERT would leave those without
    # a top-level `cells` list).
    notebook = nbformat.reads(response.text, as_version=4)

    python_code = None

    for cell in notebook.cells:
        if cell.cell_type != cell_type:
            continue
        if not python_code:
            # First non-empty matching cell starts the accumulated text.
            python_code = cell.source
        else:
            python_code += "\n" + cell.source

    return python_code


def extract_python_code_from_py(github_url, timeout=30.0):
    """Download a Python source file from GitHub and return its text.

    Args:
        github_url: Browser URL of the file
            (``https://github.com/<owner>/<repo>/blob/<ref>/<path>``).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The body of the HTTP response as text.

    Raises:
        requests.exceptions.RequestException: On HTTP errors or timeouts.
    """
    # Rewrite only the first occurrence of each marker, so a file path that
    # itself contains "github.com" or "/blob/" is left intact.
    raw_url = github_url.replace("github.com", "raw.githubusercontent.com", 1).replace(
        "/blob/", "/", 1
    )

    response = requests.get(raw_url, timeout=timeout)
    response.raise_for_status()  # Check for any request errors

    return response.text

In [None]:
# Reload the URL list written by the crawl step, so this cell also works when
# the crawl itself was skipped in this session.
with open("code_files_urls.txt") as f:
    code_files_urls = f.read().splitlines()
len(code_files_urls)

In [None]:
# Build one Document per notebook, holding the concatenated source of its code cells.
code_strings = []

for file_index, file_url in enumerate(code_files_urls):
    # Only notebooks are indexed here; other collected files are ignored.
    if not file_url.endswith(".ipynb"):
        continue

    content = extract_python_code_from_ipynb(file_url, "code")
    if not content:
        # The extractor yields nothing for notebooks without code cells;
        # Document requires string content, so skip these instead of failing.
        continue

    code_strings.append(
        Document(
            page_content=content,
            # file_index is the position in the full URL list, not in code_strings.
            metadata={"url": file_url, "file_index": file_index},
        )
    )

### 3. 為每個程式碼字串分割區塊並產生嵌入，並初始化向量儲存

我們需要將程式碼分割成可使用的區塊，以便 LLM 可以用於產生程式碼。因此，使用正確的區塊分割方法和區塊大小至關重要。


In [None]:
# Utility functions for Embeddings API with rate limiting
def rate_limit(max_per_minute):
    """Generator that spaces out successive ``next()`` calls.

    Each ``next()`` after the first sleeps for whatever is left of the
    ``60 / max_per_minute`` second interval, measured from the moment the
    previous ``next()`` returned. The first ``next()`` only prints
    ``Waiting`` and never sleeps. A dot is printed for every sleep.

    Args:
        max_per_minute: Maximum number of ``next()`` calls allowed per minute.

    Yields:
        ``None`` each time the caller is allowed to proceed.
    """
    min_interval = 60 / max_per_minute
    print("Waiting")
    while True:
        started = time.time()
        yield
        # Time still owed before the next request may go out.
        remaining = min_interval - (time.time() - started)
        if remaining > 0:
            print(".", end="")
            time.sleep(remaining)


class CustomVertexAIEmbeddings(VertexAIEmbeddings):
    """VertexAIEmbeddings variant that embeds documents in rate-limited batches."""

    # Upper bound on embedding requests issued per minute.
    requests_per_minute: int
    # Number of texts sent to the embeddings client in a single request.
    num_instances_per_batch: int

    def embed_documents(self, texts: List[str]):
        """Embed ``texts`` batch by batch, throttled by ``rate_limit``.

        Args:
            texts: The strings to embed.

        Returns:
            One embedding vector (the ``values`` of each client result) per
            input text, in input order.
        """
        throttle = rate_limit(self.requests_per_minute)
        batch_size = self.num_instances_per_batch
        remaining = list(texts)
        embeddings = []

        while remaining:
            # Work in batches: per the original author, the API accepts at
            # most 5 documents per embeddings request.
            batch = remaining[:batch_size]
            remaining = remaining[batch_size:]
            embeddings.extend(self.client.get_embeddings(batch))
            # Wait, if needed, before the next request is allowed.
            next(throttle)

        return [embedding.values for embedding in embeddings]

In [None]:
# Split the collected code into chunks using a Python-aware splitter,
# with the chunk size and overlap configured below.
text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=2000, chunk_overlap=200
)


texts = text_splitter.split_documents(code_strings)
print(len(texts))

# Initialize the embeddings client with the rate limit (requests per minute)
# and batch size consumed by CustomVertexAIEmbeddings.embed_documents.
EMBEDDING_QPM = 100
EMBEDDING_NUM_BATCH = 5
embeddings = CustomVertexAIEmbeddings(
    requests_per_minute=EMBEDDING_QPM,
    num_instances_per_batch=EMBEDDING_NUM_BATCH,
    model_name="textembedding-gecko@latest",
)

# Create the FAISS index from the embedded code chunks
db = FAISS.from_documents(texts, embeddings)

# Initialize the retriever used by the RAG chain; k is the number of chunks
# requested per query.
retriever = db.as_retriever(
    search_type="similarity",  # Also test "similarity", "mmr"
    search_kwargs={"k": 5},
)

retriever

# 執行階段

### 4. 使用者輸入提示或以提問作為提示


In [None]:
# The question that is sent both to the bare model and to the RAG chain below.
user_question = "Create a Python function that takes a prompt and predicts using langchain.llms interface with Vertex AI text-bison model"

In [None]:
# Define prompt templates


# Zero-shot prompt template: the question only, with no retrieved context.
# NOTE(review): prompt_prompt_zero_shot is not referenced by the cells visible
# in this notebook (they pass the raw question to the model) — confirm whether
# it should be applied there.
prompt_zero_shot = """
    You are a proficient python developer. Respond with the syntactically correct & concise code for to the question below.

    Question:
    {question}

    Output Code :
    """

prompt_prompt_zero_shot = PromptTemplate(
    input_variables=["question"],
    template=prompt_zero_shot,
)


# RAG prompt template: the question plus a {context} placeholder, together
# with rules that steer the format of the generated code.
prompt_RAG = """
    You are a proficient python developer. Respond with the syntactically correct code for to the question below. Make sure you follow these rules:
    1. Use context to understand the APIs and how to use it & apply.
    2. Do not add license information to the output code.
    3. Do not include colab code in the output.
    4. Ensure all the requirements in the question are met.

    Question:
    {question}

    Context:
    {context}

    Helpful Response :
    """

prompt_RAG_tempate = PromptTemplate(
    template=prompt_RAG, input_variables=["context", "question"]
)

# Retrieval QA chain wiring the code model, the RAG prompt and the retriever
# together; return_source_documents=True asks the chain to also return the
# retrieved documents.
qa_chain = RetrievalQA.from_llm(
    llm=code_llm,
    prompt=prompt_RAG_tempate,
    retriever=retriever,
    return_source_documents=True,
)

### 5. 嘗試零範例提示


In [None]:
# Baseline: send the raw question straight to the code model, without any
# retrieved context.
# NOTE(review): the zero-shot PromptTemplate defined earlier is not applied
# here — confirm whether that is intended.
response = code_llm.predict(text=user_question, max_output_tokens=2048, temperature=0.1)
print(response)

### 6. 使用 RAG Chain 運行提示並比較結果
為了產生回應，我們使用 code-bison，但也可以使用 code-gecko 和 codechat-bison


In [None]:
# Ask the same question through the RAG chain and print the generated answer,
# which is stored under the "result" key of the chain output.
results = qa_chain({"query": user_question})
print(results["result"])

### 讓我們試試另一個提示


In [None]:
# Second example question, first asked of the bare model (no retrieval).
user_question = "Create python function that takes text input and returns embeddings using Langchain with VertexAI textembedding-gecko model"


response = code_llm.predict(text=user_question, max_output_tokens=2048, temperature=0.1)
print(response)

In [None]:
# Same second question through the RAG chain, for comparison with the output above.
results = qa_chain({"query": user_question})
print(results["result"])