In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 說明：使用 LangChain 🦜🔗 進行零售業的 SEO 優化產品說明產生

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/doggy8088/generative-ai/blob/main/language/use-cases/description-generation/product_description_generator_attributes_to_text.zh.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory 標誌"><br>在 Colab 中執行
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/doggy8088/generative-ai/blob/main/language/use-cases/description-generation/product_description_generator_attributes_to_text.zh.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub 標誌"><br>在 GitHub 上查看
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/doggy8088/generative-ai/main/language/use-cases/description-generation/product_description_generator_attributes_to_text.zh.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI 標誌"><br>在 Vertex AI Workbench 中開啟
    </a>
  </td>
</table>


| | |
|-|-|
|作者(們) | [Anant Nawalgaria](https://github.com/anantnawal) |


## 簡介

本筆記本示範如何使用 [LangChain](https://python.langchain.com/docs/get_started/introduction.html) 與 Vertex AI LLM 一同解決許多大型零售商面臨的問題：根據產品屬性或規格 (通常由供應商提供)，自動產生資訊豐富、符合 SEO 且具有創意潛能的產品說明和標題。此自動產生說明流程能顯著節省成本與時間。

本教學課程示範如何從原始產品屬性和元資料開始，使用 LLM 產生完整、準確、符合 SEO 且安全的說明。你還將學習如何使用 LLM 驗證說明。此外，你將學習如何使用檢索擴充式產生功能，藉由 k-NN (k 近鄰) 嵌入式搜尋進一步改善提示內容。最後，示範如何根據品牌的寫作風格調整說明，即使各產品的風格不盡相同。

執行本教學課程的注意事項：

可在此處下載用於此示範的免費公開資料集範例 (授權個人或商業使用)：[請按一下此處](https://data.world/promptcloud/product-details-on-flipkart-com)。

### 目標

在本教學課程中，你將學習如何使用 LangChain 與 PaLM API 從現有產品屬性產生產品說明。你將演練以下範例：

* 基礎模型的零次提示以產生說明
* 對已在自訂語料庫上完成參數高效微調 (parameter-efficient tuning) 的基礎模型進行零次提示
* 利用 Vertex AI 嵌入 (embeddings) 找出相似範例並納入提示中，對未微調 (以及已微調) 的模型進行少次提示。本教學課程在本機記憶體中完成此作業，但日後可擴大規模改用 [Vertex AI Matching Engine](https://cloud.google.com/vertex-ai/docs/matching-engine/overview) (一種全代管、可擴充的向量資料庫)
* 使用提示設計的 LLM 零次提示來檢查所產生產品說明的安全性、真實性和品質，並針對其評估結果提出理由說明
* 使用 n-gram 重疊指標 (例如 BLEU 和 ROUGE) 評估大量提示的品質，以及使用 PaLM API 與可能的範例和負面範例進行語意相似度檢查 (使用嵌入式) 
* 使用基本和進階 LangChain 結構，例如提示範本、LLM 鏈、順序 LLM 鏈 (用於多個輸入和輸出順序提示，其中一項輸出的結果作為輸入提供給另一項輸出)、k-NN 檢索器以及自訂 LLM 類別以使用 Vertex AI 和 LangChain。

你還將看到使用基於 k-NN 的 Vertex AI LLM 嵌入式語意相似度計算的少次提示，可以在 BLEU/ROUGE 和語意相似度中提升效能指標。此外，你可以利用 Vertex Generative AI 圖片標題描述服務加入額外的產品屬性，進而豐富產品說明內容。

### 成本

本教學課程使用 Google Cloud 的可開立帳單元件：
- Vertex AI Generative AI Studio

瞭解 [Vertex AI 價格](https://cloud.google.com/vertex-ai/pricing)，並使用 [價格計算器](https://cloud.google.com/products/calculator/) 依照預期使用量來產生成本估計。


## 開始使用


### 安裝 Vertex AI SDK 和其他依賴項


In [None]:
# Install pinned versions of the notebook's dependencies into the user site-packages.
!pip3 install --user --upgrade  pydantic==1.10.9 \
                                keras-nlp==0.5.2 \
                                tensorflow==2.12.0 \
                                scikit-learn==1.2.2 \
                                lark==1.1.5 \
                                langchain==0.0.323 \
                                google-cloud-aiplatform==1.35.0 \
                                rouge-score==0.1.2


重新啟動核心以重新載入你剛安裝的套件。你可能會看到一則快顯式警告，你可以選擇關閉它。


In [None]:
# Automatically restart the kernel after installs so that your environment can access the new packages
import IPython

# Grab the running IPython application and ask its kernel to shut down;
# the True argument requests a restart rather than a plain shutdown.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

### 驗證你的筆記本環境
* 如果你使用 **Colab** 來執行這個筆記本，取消註解下面的Cell，然後繼續。
* 如果你使用 **Vertex AI Workbench**，請參閱[設定說明](https://github.com/doggy8088/generative-ai/tree/main/setup-env)。


In [None]:
# Google Cloud project and region used when initialising the Vertex AI SDK.
# Replace the placeholder with your own project ID before running.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
REGION = "us-central1"

import vertexai

# Initialise the Vertex AI SDK with the project and location defined above.
vertexai.init(project=PROJECT_ID, location=REGION)

### 匯入函式庫


In [None]:
import json
import pprint
import time
import warnings

import keras_nlp
import pandas as pd
import tensorflow as tf
from langchain.chains import LLMChain, SequentialChain
from langchain.embeddings.base import Embeddings
from langchain.llms.base import LLM
from langchain.prompts import PromptTemplate
from langchain.retrievers import KNNRetriever
from sklearn.model_selection import train_test_split
from vertexai.language_models import TextEmbeddingModel, TextGenerationModel

warnings.filterwarnings("ignore")

### 供程式碼其他部分使用的輔助函式/類別
這些函式和類別示範如何使用基本的 LangChain 建構元件，在需要時建立自訂的 LLM 模型與嵌入 (例如來自 TF Hub 的模型)。LangChain 也原生支援 Vertex AI LLM (用於文字產生與嵌入)。


In [None]:
# Request budget handed to the embedding wrapper to throttle embedding calls.
REQUESTS_PER_MINUTE = 16
# Shared pretty-printer; the wide line width keeps long descriptions readable.
pp = pprint.PrettyPrinter(width=200)


# Builds dynamic few-shot examples with an embedding-based k-NN lookup
def compute_fewshot(
    query,
    retriever,
    ixed_df,
    delimiter="\n",
    input_label="input:",
    output_label="output:",
    k=3,
):
    """
    Build a few-shot example string from the nearest neighbours of a query.

    The retriever is asked for the documents most relevant to ``query``; the
    first ``k`` of them are used. Each document's ``page_content`` (a product
    attribute string) is looked up in the index of ``ixed_df`` to fetch the
    matching value of its ``description`` column.

    Args:
        query: Product attribute text to find similar products for.
        retriever: Object exposing ``get_relevant_documents(query)`` that
            returns documents with a ``page_content`` attribute.
        ixed_df: DataFrame indexed by product attribute text, with a
            ``description`` column.
        delimiter: Text placed between each label and its content.
        input_label: Label emitted before each example's attributes.
        output_label: Label emitted before each example's description.
        k: Maximum number of retrieved neighbours to include (default 3).

    Returns:
        A newline-joined string alternating input and output entries, or an
        empty string when nothing is retrieved. With the default labels:

            input:
            Color:Grey
            Brand:Adidas
            output:
            Helps you protect your heels while running!

    Raises:
        KeyError: If a retrieved ``page_content`` is not in the index.
    """
    examples = []
    for spec in retriever.get_relevant_documents(query)[:k]:
        description = ixed_df.loc[spec.page_content, "description"]
        if isinstance(description, pd.Series):
            # The index label is duplicated, so .loc returned every matching
            # row. Keep the first one instead of formatting the Series repr
            # into the prompt.
            description = description.iloc[0]
        examples.append("{}{}{}".format(input_label, delimiter, spec.page_content))
        examples.append("{}{}{}".format(output_label, delimiter, description))
    return "\n".join(examples)


# Parses a product row into a clean, uniform list of key:value lines
def extract_tags(x, delimiter=":"):
    """
    Flatten one product row into newline-separated ``key<delimiter>value`` lines.

    The product name, brand, discounted price and category (with brackets and
    double quotes stripped) come first, followed by every entry found under
    ``product_specification`` in the row's ``product_specifications`` field.
    That field is decoded as JSON after its ``=>`` separators are replaced
    with ``:``. Entries lacking a ``key`` are labelled ``other detail``.

    Returns an empty string when the specifications text contains ``nil``.

    E.g. output
       Color:White
       discounted price:200
    """
    name = x["product_name"]
    brand = x["brand"]
    price = x["discounted_price"]
    category = x["product_category_tree"]
    for symbol in '[]"':
        category = category.replace(symbol, "")

    lines = [
        f"Product name{delimiter}{name}",
        f"brand{delimiter}{brand}",
        f"discounted price{delimiter}{price}",
        f"Product category{delimiter}{category}",
    ]

    raw_specs = x["product_specifications"]
    if "nil" in raw_specs:
        return ""

    specs = json.loads(raw_specs.replace("=>", ":"))["product_specification"]
    # A lone specification arrives as a single mapping rather than a list.
    if not isinstance(specs, list):
        specs = [specs]

    lines.extend(
        f'{attr.get("key", "other detail")}{delimiter}{attr.get("value", "")}'
        for attr in specs
    )
    return "\n".join(lines)


# Scores generated text against reference text with BLEU, ROUGE-N and,
# optionally, embedding-based semantic similarity
def compute_quality_metrics_batch(
    references, predictions, rouge_n_order=2, embedding=None
):
    """
    Print batch quality metrics for predictions measured against references.

    Computes the ROUGE-N F1 score (n-gram order ``rouge_n_order``) and the
    BLEU score with keras_nlp, passing ``references`` first and
    ``predictions`` second, and pretty-prints both. When ``embedding`` is
    supplied, both batches are embedded with ``embedding.embed_documents``
    and their cosine similarity is printed as well.

    Nothing is returned; all results are printed.
    """
    rouge_metric = keras_nlp.metrics.RougeN(order=rouge_n_order)
    bleu_metric = keras_nlp.metrics.Bleu()
    rouge_f1 = rouge_metric(references, predictions)["f1_score"]
    bleu_score = bleu_metric(references, predictions)
    pp.pprint("Bleu:" + str(bleu_score.numpy()))
    pp.pprint("Rouge:" + str(rouge_f1.numpy()))

    # Semantic similarity is optional: skip it when no embedder is given.
    if not embedding:
        return

    prediction_vectors = embedding.embed_documents(predictions)
    reference_vectors = embedding.embed_documents(references)
    cosine = tf.keras.metrics.CosineSimilarity(axis=1)
    cosine.update_state(prediction_vectors, reference_vectors)
    pp.pprint("Semantic Similarity:" + str(cosine.result().numpy()))


class VertexLLM(LLM):
    """
    LangChain LLM wrapper around a Vertex AI text generation model.

    Each prompt is forwarded to ``model.predict`` together with the keyword
    arguments captured at construction time (for example ``temperature`` or
    ``max_output_tokens``). No rate limiting is applied by this class.
    """

    # Vertex AI model that produces the completions.
    model: TextGenerationModel
    # Keyword arguments forwarded verbatim to ``model.predict`` on every call.
    predict_kwargs: dict

    def __init__(self, model, verbose=False, **predict_kwargs):
        """
        Args:
            model: The Vertex AI text generation model to wrap.
            verbose: Passed through to the LangChain base class.
            **predict_kwargs: Generation parameters sent with every request.
        """
        super().__init__(model=model, verbose=verbose, predict_kwargs=predict_kwargs)

    @property
    def _llm_type(self):
        """Identifier LangChain uses for this LLM implementation."""
        return "vertex"

    def _call(self, prompt, stop=None, run_manager=None, **kwargs):
        """
        Generate text for ``prompt`` and return it as a string.

        ``stop``, ``run_manager`` and any extra keyword arguments are accepted
        so the signature matches what the LangChain base class may pass, but
        they are not forwarded to the model.
        """
        result = self.model.predict(prompt, **self.predict_kwargs)
        return str(result)

    @property
    def _identifying_params(self):
        """Generation parameters that distinguish this LLM configuration."""
        return dict(self.predict_kwargs)


def rate_limit(max_per_minute):
    """
    Generator that paces its caller to at most ``max_per_minute`` steps.

    The first ``next()`` returns immediately and starts the clock. Each later
    ``next()`` sleeps for whatever is left of the ``60 / max_per_minute``
    second period since the previous ``next()`` returned, so successive
    calls are at least one period apart.

    Elapsed time is measured with ``time.monotonic`` so that wall-clock
    adjustments cannot shorten or lengthen the pause.

    Raises:
        ZeroDivisionError: On the first ``next()`` if ``max_per_minute`` is 0.
    """
    period = 60 / max_per_minute
    while True:
        before = time.monotonic()
        yield
        remaining = period - (time.monotonic() - before)
        if remaining > 0:
            time.sleep(remaining)


class VertexEmbeddings(Embeddings):
    """
    LangChain embeddings wrapper around a Vertex AI embedding model.

    Texts are sent to ``model.get_embeddings`` in small batches, and the
    batches of a single ``embed_documents`` call are paced with
    ``rate_limit`` to help avoid quota errors. Each call creates its own
    limiter, so pacing applies within one call only.
    """

    def __init__(self, model, *, requests_per_minute=20, batch_size=5):
        """
        Args:
            model: Object exposing ``get_embeddings(list_of_texts)``.
            requests_per_minute: Maximum batch requests per minute in a call.
            batch_size: Number of texts sent per request (default 5).
        """
        self.model = model
        self.requests_per_minute = requests_per_minute
        self.batch_size = batch_size

    def embed_documents(self, texts):
        """Return one embedding vector (``.values``) per text, in order."""
        limiter = rate_limit(self.requests_per_minute)
        docs = list(texts)
        results = []

        for start in range(0, len(docs), self.batch_size):
            # Throttle before the request: the first next() only starts the
            # clock, later ones wait out the rest of the period. This spaces
            # every pair of requests and avoids a useless sleep after the
            # final batch.
            next(limiter)
            batch = docs[start : start + self.batch_size]
            results.extend(self.model.get_embeddings(batch))

        return [r.values for r in results]

    def embed_query(self, text):
        """Return the embedding vector for a single text."""
        single_result = self.embed_documents([text])
        return single_result[0]

## 數據準備
在本部分中，你將對完整資料集進行清理、剖析、準備和切分。作為清理流程的一部分，你還會濾除包含空值或重複說明的資料列。


In [None]:
# Load the sample dataset and clean it in one pass: drop rows whose
# specifications contain "nil", rows with any missing value, and rows whose
# description duplicates an earlier one.
df = (
    pd.read_csv(
        "gs://github-repo/use-cases/product_description_generation_retail/dataset_sample.csv"
    )
    .loc[lambda frame: ~frame["product_specifications"].str.contains("nil", na=False)]
    .dropna()
    .drop_duplicates(subset=["description"])
)
df.info()

In [None]:
# Flatten each row's attributes into key:value text, then keep only the
# model input (parsed specs) and the target (description).
df["parsed_product_specs"] = df.apply(extract_tags, axis=1)
df_processed = df.loc[:, ["parsed_product_specs", "description"]]

In [None]:
# train/test split: hold out 5% of the rows for testing; the fixed
# random_state keeps the split reproducible.
train, test = train_test_split(df_processed, test_size=0.05, random_state=42)
print(len(train), len(test))

## 使用 LLM 進行說明產生
此部分展示了你可以如何使用零次、少量次以及參數有效微調的方式持續改善 BLEU、Rouge 和語意相似性指標。它也展示了如何使用 LLM 標記文字的毒性，並使用 BLEU、Rouge 和語意相似性得分評估其結果。


### 載入模型

載入名為 `text-bison@001` 的預訓練文字生成模型。


In [None]:
# Embedding wrapper around textembedding-gecko@001, throttled to
# REQUESTS_PER_MINUTE. The GCP project must already be configured for the
# Vertex AI SDK before this cell runs.
embedding = VertexEmbeddings(
    TextEmbeddingModel.from_pretrained("textembedding-gecko@001"),
    requests_per_minute=REQUESTS_PER_MINUTE,
)

# Base (untuned) text generation model wrapped as a LangChain LLM; every
# keyword other than `verbose` is forwarded to the model's predict call.
model = TextGenerationModel.from_pretrained("text-bison@001")
llm = VertexLLM(
    model, max_output_tokens=512, temperature=0.1, top_p=0.8, top_k=40, verbose=True
)

從 LangChain 中定義可再使用的提示範本，並建立一個按順序鏈接的鏈，首先使用第一個提示產生描述，然後使用第二個提示對其安全進行評估。輸入變數會在執行時填寫。


In [None]:
# Prompt that asks the model to write a product description from attributes.
# Placeholders: {platform_name}, {product_attributes}.
template_descgen = """
Imagine you are a copywriter for the online ecommerce platform {platform_name}. You have been given a series of information about a product and are asked to write a short and truthful product description based on that information, that can be put on the website.
At the same time the description should be optimised for search engines, and look like a realistic description in freeform text for that product which could appear on the {platform_name}.com website. The information you have been given are the following:
{product_attributes}
"""
# Prompt that asks the model to judge a generated description (realistic and
# non-toxic) with a Yes/No answer plus a short explanation.
# Placeholders: {platform_name}, {generated_description}.
template_evaluate = """
Does this text look like a real human written description for a product which could be found on
{platform_name} and is also not toxic? Answer with one of the following options, along with a short explanation
Options:
- Yes
- No

text: {generated_description}
Option:
"""

prompt_descgen = PromptTemplate(
    input_variables=["platform_name", "product_attributes"],
    template=template_descgen,
)
prompt_eval = PromptTemplate(
    input_variables=["platform_name", "generated_description"],
    template=template_evaluate,
)
# Evaluation chain on the base LLM; its output is stored under "is_safe".
eval_chain = LLMChain(llm=llm, prompt=prompt_eval, output_key="is_safe")

### 方法 1：零次描述生成、驗證和評估

要建立產品說明，你首先需要建立一個提示，其中包含產品屬性的預留位置變數。這些變數會在執行時填入實際的產品屬性。接下來，你需要將相對應的大語言模型 (LLM) 附加到說明生成和評估模型。最後，你需要將模型鏈結在一起，好讓第一個模型生成的產品說明作為輸入傳送給第二個模型。然後，執行「順序鏈結」時便會提供兩個模型的輸出。


In [None]:
# Zero-shot generation chain on the base LLM; its output key matches the
# {generated_description} input expected by the evaluation prompt.
descgen_chain = LLMChain(
    llm=llm, prompt=prompt_descgen, output_key="generated_description"
)

# Run generation first, then evaluation, as one sequential chain.
overall_chain = SequentialChain(
    chains=[descgen_chain, eval_chain],
    input_variables=["platform_name", "product_attributes"],
    # Return both the generated description and the evaluator's verdict
    output_variables=["generated_description", "is_safe"],
    verbose=True,
)

In [None]:
# Pick one test product (row position 4) as the running example: its parsed
# attributes are the model input, its description is the ground truth.
attrs = test["parsed_product_specs"].iloc[4]
orig_descr = test["description"].iloc[4]
pp.pprint("The original description:\n" + orig_descr)

In [None]:
# Generate and evaluate a zero-shot description for the example product.
result_0shot_untuned = overall_chain(
    {"platform_name": "Flipkart", "product_attributes": attrs}
)
pp.pprint(result_0shot_untuned)

你會發現生成的描述看起來經過 SEO 優化，且令人信服，加入了許多產品屬性。


現在你可以評估上面所產生的描述結果和原本的描述：


In [None]:
# Score the generated description against the original one. Keyword
# arguments make the roles explicit: the original text is the reference and
# the generated text is the prediction (BLEU is not symmetric).
compute_quality_metrics_batch(
    references=[orig_descr],
    predictions=[result_0shot_untuned["generated_description"]],
    embedding=embedding,
)

現在你可以從測試集中隨機抽樣 10 筆產品規格，以 LLM 為每筆產生說明，並與其原始說明比對評估。輸出是這 10 組配對的評估指標平均值。


In [None]:
# Draw a reproducible sample of 10 test products and generate a zero-shot
# description for each one (one chain invocation per row).
sample_test = test[["parsed_product_specs", "description"]].sample(10, random_state=42)
sample_test["generated_description_0shot"] = sample_test["parsed_product_specs"].apply(
    lambda x: overall_chain({"platform_name": "Flipkart", "product_attributes": x})[
        "generated_description"
    ]
)

# Plain Python lists of the sampled inputs, ground truth and generated text.
sample_attrs = sample_test["parsed_product_specs"].values.tolist()
sample_descriptions = sample_test["description"].values.tolist()
sample_generated_descriptions_0shot = sample_test[
    "generated_description_0shot"
].values.tolist()

現在檢視評估指標的平均輸出：


In [None]:
# Batch metrics for the zero-shot descriptions: original descriptions are the
# references, generated descriptions are the predictions.
compute_quality_metrics_batch(
    references=sample_descriptions,
    predictions=sample_generated_descriptions_0shot,
    embedding=embedding,
)

現在你可以快速瀏覽原始和生成的描述，然後將資料框以 CSV 格式儲存到磁碟。


In [None]:
# Side-by-side preview of original vs. zero-shot generated descriptions.
sample_test[["description", "generated_description_0shot"]].head()

In [None]:
# Persist the sample, including the generated column, without the index.
sample_test.to_csv("./augmented_dataset.csv", index=False)

### 方法 2：使用動態 k 近鄰，生成小樣本描述
在這一部分，你將使用小樣本提示，嘗試改善 LLM 生成的描述，用以比較你先前使用的零樣本提示技巧。

在此，非硬編碼或隨機選擇小樣本範例，而是先嵌入查詢和文件語料庫的範例，然後計算查詢嵌入的 k 個最近鄰居。


In [None]:
# Few-shot variant of the generation prompt: retrieved examples are injected
# via {examples}, followed by the target product as a final input/output pair
# for the model to complete.
template_descgen_fewshot = """
Imagine you are a copywriter for the online ecommerce platform {platform_name}. You have been given a series of information about a product as input and are asked to write a short and truthful product description based on that information as output, that can be put on the website.
At the same time the description should also be optimised for search engines and look like a realistic description for that product which could appear on the {platform_name}.com website.
{examples}
input:
{product_attributes}
output:
"""

prompt_descgen_fewshot = PromptTemplate(
    input_variables=["platform_name", "product_attributes", "examples"],
    template=template_descgen_fewshot,
)

# Few-shot generation on the base (untuned) LLM.
descgen_chain_fewshot_untuned = LLMChain(
    llm=llm, prompt=prompt_descgen_fewshot, output_key="generated_description"
)

# Generation followed by the shared evaluation chain.
overall_chain_fewshot_untuned = SequentialChain(
    chains=[descgen_chain_fewshot_untuned, eval_chain],
    input_variables=["platform_name", "product_attributes", "examples"],
    # Return both the generated description and the evaluator's verdict
    output_variables=["generated_description", "is_safe"],
    verbose=True,
)

若要建立輸入/輸出範例來引導模型，你可以使用 Vertex AI LLM 嵌入，對訓練集中的產品規格/屬性進行 k 近鄰搜尋，找出與輸入產品規格最相近的項目，並擷取其對應的描述，作為引導模型的少量輸入/輸出範例。


In [None]:
# Index the training rows by their parsed specs so a retrieved spec string
# can be mapped back to its description.
train_spec_ix = train.copy().set_index("parsed_product_specs")
# Build the k-NN retriever over the first 500 training specs only.
retriever = KNNRetriever.from_texts(
    train["parsed_product_specs"].values.tolist()[:500], embedding
)
# Few-shot examples for the running example product.
examples = compute_fewshot(attrs, retriever, train_spec_ix)

In [None]:
# Generate and evaluate a few-shot description for the example product.
result_fewshot_untuned = overall_chain_fewshot_untuned(
    {"platform_name": "Flipkart", "product_attributes": attrs, "examples": examples}
)
pp.pprint(result_fewshot_untuned["generated_description"])
pp.pprint(result_fewshot_untuned["is_safe"])

In [None]:
# Few-shot generation for every sampled product; the examples are retrieved
# separately for each row from its own attributes.
sample_test["generated_description_fewshot"] = sample_test[
    "parsed_product_specs"
].apply(
    lambda x: overall_chain_fewshot_untuned(
        {
            "platform_name": "Flipkart",
            "product_attributes": x,
            "examples": compute_fewshot(x, retriever, train_spec_ix),
        }
    )["generated_description"]
)
sample_generated_descriptions_fewshot = sample_test[
    "generated_description_fewshot"
].values.tolist()

你現在應該會看到所有指標 (Bleu、Rouge 和語義相似性) 都有所提升，在某些情況下甚至大幅改善：


In [None]:
# Batch metrics for the few-shot descriptions: original descriptions are the
# references, generated descriptions are the predictions.
compute_quality_metrics_batch(
    references=sample_descriptions,
    predictions=sample_generated_descriptions_fewshot,
    embedding=embedding,
)

現在你可以快速瀏覽原始和生成的描述，然後將資料框以 CSV 格式儲存到磁碟。


In [None]:
# Preview inputs, original descriptions and few-shot generated descriptions.
sample_test[
    ["parsed_product_specs", "description", "generated_description_fewshot"]
].head()

In [None]:
# Overwrite the CSV so it also contains the few-shot column.
sample_test.to_csv("./augmented_dataset.csv", index=False)

### 方法 3：微調零次描述產生驗證和評估

在此部分，你會在 500 對訓練資料集中隨機取樣的 (提示、描述) 中執行模型的參數有效微調，以調整模型至描述和書寫風格。然後你將為一批資料產生描述，並根據前面各部分所示範的三項指標，評估相對於原始資料的結果。


<div class="alert alert-block alert-warning">
<b>⚠️ 此區塊需要 TPU：請注意微調會使用 TPU，因此你需要確保專案中可供使用。</b>
</div>


In [None]:
# Base model that the tuning job will start from.
tuned_model = TextGenerationModel.from_pretrained("text-bison@001")

# Build the tuning frame without mutating `train`: render the zero-shot
# prompt for every training row, then rename the prompt and description
# columns to input_text / output_text.
train_tuning = train.assign(
    prompt_product_specs=lambda frame: frame["parsed_product_specs"].apply(
        lambda x: prompt_descgen.format(platform_name="Flipkart", product_attributes=x)
    )
).rename(
    columns={"prompt_product_specs": "input_text", "description": "output_text"}
)

請注意，以下程式碼將啟動調整管道，可能需花一、兩小時才能完成：


In [None]:
# Launch the tuning pipeline on a reproducible sample of the tuning frame.
# NOTE(review): the surrounding text mentions 500 pairs, but this samples
# 10 rows and runs a single train step — presumably kept small for a quick
# demo; confirm before relying on the tuned model's quality.
tuned_model.tune_model(
    training_data=train_tuning.sample(10, random_state=42),
    train_steps=1,
    tuning_job_location="europe-west4",
    tuned_model_location="us-central1",
)

**在這裡** 你載入最新訓練的模型，並在與先前相同的測試句子上評估它。


In [None]:
# Take the first tuned model name listed for this base model and load it.
# NOTE(review): presumably the first entry is the most recent tuning run —
# confirm the ordering; this raises IndexError when no tuned model exists.
model_id = tuned_model.list_tuned_model_names()[0]
tuned_model = TextGenerationModel.get_tuned_model(tuned_model_name=model_id)

In [None]:
import datetime

# Print the current time as a manual timing checkpoint.
# NOTE(review): the author's original note read "started at 11:20am BST",
# which is stale run-specific information; consider removing this cell.
print(datetime.datetime.now())

In [None]:
# Wrap the tuned model with the same generation parameters as the base LLM
# so that results are comparable.
llm_tuned = VertexLLM(
    tuned_model,
    max_output_tokens=512,
    temperature=0.1,
    top_p=0.8,
    top_k=40,
    verbose=True,
)

使用與之前相同的提示範本建立新的 LLM 鏈，只需改用新微調的模型即可。接著如同零樣本模型的做法，將其加入順序鏈中，再為測試集中的產品屬性批次生成新的描述。


In [None]:
# Zero-shot generation with the tuned LLM, reusing the original prompt.
descgen_chain_tuned = LLMChain(
    llm=llm_tuned, prompt=prompt_descgen, output_key="generated_description"
)

# Evaluation still uses the shared eval_chain built on the base LLM.
overall_chain_tuned = SequentialChain(
    chains=[descgen_chain_tuned, eval_chain],
    input_variables=["platform_name", "product_attributes"],
    # Return both the generated description and the evaluator's verdict
    output_variables=["generated_description", "is_safe"],
    verbose=True,
)

In [None]:
# Generate and evaluate a tuned zero-shot description for the example product.
result_0shot_tuned = overall_chain_tuned(
    {"platform_name": "Flipkart", "product_attributes": attrs}
)
pp.pprint(result_0shot_tuned)

In [None]:
# Tuned zero-shot generation for every sampled product.
sample_test["generated_description_tuned_0shot"] = sample_test[
    "parsed_product_specs"
].apply(
    lambda x: overall_chain_tuned(
        {"platform_name": "Flipkart", "product_attributes": x}
    )["generated_description"]
)
sample_generated_descriptions_tuned_0shot = sample_test[
    "generated_description_tuned_0shot"
].values.tolist()

像以前一樣計算批次 Bleu、rouge 和語義相似度分數


In [None]:
# Batch metrics for the tuned zero-shot descriptions: original descriptions
# are the references, generated descriptions are the predictions.
compute_quality_metrics_batch(
    references=sample_descriptions,
    predictions=sample_generated_descriptions_tuned_0shot,
    embedding=embedding,
)

現在你可以快速瀏覽原始和生成的描述，然後將資料框以 CSV 格式儲存到磁碟。


In [None]:
# Preview inputs, original descriptions and tuned zero-shot descriptions.
sample_test[
    ["parsed_product_specs", "description", "generated_description_tuned_0shot"]
].head()

In [None]:
# Overwrite the CSV so it also contains the tuned zero-shot column.
sample_test.to_csv("./augmented_dataset.csv", index=False)

### 方法 4：少樣本描述生成驗證與評估，使用微調的模型

在此區段，你對模型執行參數有效率的微調，透過 500 對從訓練組中隨機抽樣的 (提示、描述)，以便進一步與描述和寫作風格相符。接著，你為一批次生成描述，並根據這三個指標針對原始描述評估它們，如前所述。


注意：由於你並未以少量方式訓練調整模型，因此它有時可能會產生混淆並生成不自然的文字。在這種情況下，你可以依賴確認模型，它可以評估並重複提示直到生成有效的回覆。


<div class="alert alert-block alert-warning">
<b>⚠️ 此區塊需要 TPU：請注意微調會使用 TPU，因此你需要確保專案中可供使用。</b>
</div>


In [None]:
# Few-shot generation with the tuned LLM, reusing the few-shot prompt.
descgen_chain_fewshot_tuned = LLMChain(
    llm=llm_tuned, prompt=prompt_descgen_fewshot, output_key="generated_description"
)
# Generation followed by the shared evaluation chain.
overall_chain_fewshot_tuned = SequentialChain(
    chains=[descgen_chain_fewshot_tuned, eval_chain],
    input_variables=["platform_name", "product_attributes", "examples"],
    # Return both the generated description and the evaluator's verdict
    output_variables=["generated_description", "is_safe"],
    verbose=True,
)

In [None]:
# Generate a tuned few-shot description for the example product, reusing the
# examples retrieved earlier for it.
result_fewshot_tuned = overall_chain_fewshot_tuned(
    {"platform_name": "Flipkart", "product_attributes": attrs, "examples": examples}
)
pp.pprint(result_fewshot_tuned["generated_description"])

有時可能會發生由於上述原因，微調模型會產生不正確的回應：這就是為什麼驗證器模型的回應可被用於過濾它的原因


In [None]:
# Show the evaluator's verdict for the tuned few-shot description.
pp.pprint(result_fewshot_tuned["is_safe"])

In [None]:
# Tuned few-shot generation for every sampled product; the examples are
# retrieved separately for each row from its own attributes.
sample_test["generated_description_tuned_fewshot"] = sample_test[
    "parsed_product_specs"
].apply(
    lambda x: overall_chain_fewshot_tuned(
        {
            "platform_name": "Flipkart",
            "product_attributes": x,
            "examples": compute_fewshot(x, retriever, train_spec_ix),
        }
    )["generated_description"]
)
sample_generated_descriptions_tuned_fewshot = sample_test[
    "generated_description_tuned_fewshot"
].values.tolist()

在這種情況下，微調模型反而降低了品質，對於帶有長度懲罰的 Bleu 分數尤其明顯。因此監控全部 3 項指標非常重要。


In [None]:
# Batch metrics for the tuned few-shot descriptions: original descriptions
# are the references, generated descriptions are the predictions.
compute_quality_metrics_batch(
    references=sample_descriptions,
    predictions=sample_generated_descriptions_tuned_fewshot,
    embedding=embedding,
)

現在你可以快速瀏覽原始和生成的描述，然後將資料框以 CSV 格式儲存到磁碟。


In [None]:
# Preview inputs, original descriptions and tuned few-shot descriptions.
sample_test[
    ["parsed_product_specs", "description", "generated_description_tuned_fewshot"]
].head()

In [None]:
# Final save: the CSV now holds every generated-description column.
sample_test.to_csv("./augmented_dataset.csv", index=False)

## 結語

本筆記本說明如何使用 Vertex AI 生成式 AI 模型和 LangChain 建立 SEO 最佳化、真實且有創意的產品說明。

在本筆記本中，你將會學習如何：

* 利用少量範例為 LLM 提供依據 (grounding) 以避免產生幻覺，並根據類似產品調整產生的說明，使其更接近現有產品說明。
* 使用 Vertex AI textembeddings 模型來評估語義相似度
* 建立 LangChain 提示、檢索器、鏈條和順序鏈條以產生更多有創意且引人入勝的產品說明。
* 使用 BLEU、ROUGE 和語義相似度 (基於餘弦距離) 分數針對原始文字批次評估生成文字的品質。
* 使用驗證器 LLM 模型作為防護機制，以確保產生的文字準確、真實且有創意。

### 可能的後續步驟：

* 你可以透過使用 [Vertex AI 圖像標題服務](https://cloud.google.com/vertex-ai/docs/generative-ai/image/image-captioning) 來新增更多產品屬性，這也有助於豐富產品說明。
* 你可以嘗試使用 RLHF (透過人類回饋進行強化學習) 來執行多個生成說明之間的偏好最佳化。
