In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 使用 Vertex AI Gemini API 的負責任 AI：安全評分與閾值

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/doggy8088/generative-ai/blob/main/gemini/responsible-ai/gemini_safety_ratings.zh.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory 標誌"><br> 在 Colab 中執行
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2Fdoggy8088%2Fgenerative-ai%2Fmain%2Fgemini%2Fresponsible-ai%2Fgemini_safety_ratings.zh.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise 標誌"><br> 在 Colab Enterprise 中執行
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://github.com/doggy8088/generative-ai/blob/main/gemini/responsible-ai/gemini_safety_ratings.zh.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub 標誌"><br> 在 GitHub 上檢視
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/doggy8088/generative-ai/main/gemini/responsible-ai/gemini_safety_ratings.zh.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI 標誌"><br> 
      在 Vertex AI Workbench 中開啟
    </a>
  </td>                                                                                               
</table>


| | |
|-|-|
|作者 | [Hussain Chinoy](https://github.com/ghchinoy) |


## 概述

大型語言模型 (LLM) 可以翻譯語言、摘要文字、產生創意寫作、產生程式碼、支援聊天機器人和虛擬助理，以及輔助搜尋引擎和推薦系統。然而，LLM 驚人的通用性，也正是難以精確預測它們可能產生哪些意外或不可預見輸出的原因。

鑑於這些風險和複雜性，Vertex AI Gemini API 在設計時已將 [Google 的 AI 原則](https://ai.google/responsibility/principles/) 納入考量。不過，開發人員仍必須瞭解並測試其模型，才能安全且負責任地部署。為協助開發人員，Vertex AI Studio 內建了內容過濾與安全評分功能，並可讓開發人員依其使用案例和業務需求定義安全過濾器閾值。

更多資訊，請參閱 [Google Cloud Generative AI 關於負責任 AI 的文件](https://cloud.google.com/vertex-ai/docs/generative-ai/learn/responsible-ai)。


### 目標

在本教學課程中，你將學習如何使用 Python SDK 檢查 Vertex AI Gemini API 傳回的安全評分，以及如何設定安全閾值，以篩選來自 Vertex AI Gemini API 的回應。

執行步驟包括：

- 呼叫 Vertex AI Gemini API，並檢查回應的安全評分
- 定義閾值，根據你的需求篩選安全評分


### 成本

本教學課程使用 Google Cloud 的可計費元件：

- Vertex AI

瞭解 [Vertex AI 定價](https://cloud.google.com/vertex-ai/pricing)，並使用 [定價計算器](https://cloud.google.com/products/calculator/) 根據預計使用量產生成本估計。


## 開始使用


### 安裝 Vertex AI SDK for Python


In [None]:
! pip3 install --upgrade --user google-cloud-aiplatform

### 重新啟動目前的執行階段

要在此 Jupyter 執行階段中使用新安裝的套件，你必須重新啟動執行階段。你可以執行下列 Cell 來完成此操作，它會重新啟動目前的 Kernel。


In [None]:
# Restart the kernel after the install so that the environment can access the
# newly installed packages.
import IPython

# Look up the running IPython application and ask its kernel to shut down with
# restart enabled. The restart discards the kernel's in-memory state, so this
# cell needs no imports beyond IPython itself.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ Kernel 將重新啟動。請等待重新啟動完成後，再繼續執行下一個步驟。⚠️</b>
</div>


### 驗證你的筆記本環境 (僅限 Colab) 

如果你在 Google Colab 上執行此筆記本，請執行下列 Cell 來驗證你的環境。如果你使用 [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench)，則不需要這個步驟。


In [1]:
import sys

# The presence of the `google.colab` module among the loaded modules is used as
# the signal that this notebook is running on Google Colab, where additional
# authentication is required.
running_in_colab = "google.colab" in sys.modules

if running_in_colab:
    from google.colab import auth

    # Sign the current user in to Google Cloud
    auth.authenticate_user()

### 定義 Google Cloud 專案資訊及初始化 Vertex AI

針對你的專案初始化 Python 版的 Vertex AI SDK：


In [2]:
import vertexai

# Project settings: replace the placeholder with your own Google Cloud project
# ID. The `# @param` annotations are kept verbatim.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Initialize the Vertex AI SDK with the project and location chosen above
vertexai.init(location=LOCATION, project=PROJECT_ID)

### 匯入函式庫


In [3]:
from vertexai.generative_models import (
    GenerationConfig,
    GenerativeModel,
    HarmCategory,
    HarmBlockThreshold,
    Image,
    Part,
)

### 載入 Gemini 1.0 Pro 模型


In [4]:
# Generation parameters chosen to reduce variability in responses; this object
# is passed to every generate_content call below.
generation_config = GenerationConfig(
    max_output_tokens=1024,
    temperature=0,
    top_k=1,
    top_p=0.1,
)

# Handle to the Gemini 1.0 Pro model used throughout the notebook
model = GenerativeModel("gemini-1.0-pro")

## 產生文字並顯示安全評分


首先，使用 Gemini 產生一段令人愉快的文字回應。


In [5]:
# A friendly prompt for the first call
nice_prompt = "Say three nice things about me"

# Call the Gemini API in streaming mode so the reply can be iterated over
responses = model.generate_content(
    stream=True,
    generation_config=generation_config,
    contents=[nice_prompt],
)

# Print only the text of each streamed response, without adding a newline
# after each one, so the pieces join into one continuous answer.
for chunk in responses:
    print(chunk.text, end="")

1. You are a thoughtful and compassionate person.
2. You have a great sense of humor and always make me laugh.
3. You are a loyal and supportive friend.

#### 檢查安全評分


請查看串流回應中的 `safety_ratings`。


In [6]:
# Send the same friendly prompt again, still in streaming mode
responses = model.generate_content(
    stream=True,
    generation_config=generation_config,
    contents=[nice_prompt],
)

# Print each streamed response object in full (not just its text) so that the
# candidates and their `safety_ratings` are visible.
for chunk in responses:
    print(chunk)

candidates {
  content {
    role: "model"
    parts {
      text: "1. You are a thoughtful and compassionate person.\n2. You have a great sense of humor and always make me laugh.\n3. You are a"
    }
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
}

candidates {
  content {
    role: "model"
    parts {
      text: " loyal and supportive friend."
    }
  }
  finish_reason: STOP
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety

#### 瞭解安全評分：類別和機率


你可以看到安全評分，其中包含每個 `category` (類別) 類型及其對應的 `probability` (機率) 標籤。

`category` (類別) 類型包含：

* 仇恨言論： `HARM_CATEGORY_HATE_SPEECH`
* 危險內容： `HARM_CATEGORY_DANGEROUS_CONTENT`
* 騷擾： `HARM_CATEGORY_HARASSMENT`
* 煽情露骨內容： `HARM_CATEGORY_SEXUALLY_EXPLICIT`

`probability` (機率) 標籤包含：

* `NEGLIGIBLE` - 內容不安全的機率微乎其微
* `LOW` - 內容不安全的機率低
* `MEDIUM` - 內容不安全的機率中等
* `HIGH` - 內容不安全的機率高


試試一個可能觸發上述其中一個類別的提示：


In [7]:
# A mildly disrespectful prompt, sent without any custom safety settings
impolite_prompt = "Write a list of 5 disrespectful things that I might say to the universe after stubbing my toe in the dark:"

impolite_responses = model.generate_content(
    contents=impolite_prompt,
    stream=True,
    generation_config=generation_config,
)

# Print each streamed response in full to inspect its safety ratings
for chunk in impolite_responses:
    print(chunk)

candidates {
  content {
    role: "model"
    parts {
    }
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: LOW
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
}

candidates {
  content {
    role: "model"
    parts {
      text: " Thanks for nothing.\"\n3. \"I\'m starting to think you have it out for me. What did I ever do to you?\"\n4."
    }
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: LOW
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
}

candi

#### 封鎖回應


如果回應被封鎖，你會看到最後一個候選回應 (candidate) 包含 `blocked: true`，並可看出是哪一項安全評分觸發了封鎖 (例如：`finish_reason: SAFETY`)。


In [8]:
# A ruder variant of the prompt, again sent without custom safety settings
rude_prompt = "Write a list of 5 very rude things that I might say to the universe after stubbing my toe in the dark:"

rude_responses = model.generate_content(
    contents=rude_prompt,
    stream=True,
    generation_config=generation_config,
)

# Print each streamed response in full to inspect its safety ratings
for chunk in rude_responses:
    print(chunk)

candidates {
  content {
    role: "model"
    parts {
    }
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: LOW
  }
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: LOW
  }
}

candidates {
  content {
    role: "model"
    parts {
      text: ", that was just plain mean. I\'m starting to think you have a personal vendetta against me.\"\n3. \"Seriously, universe? You\'re"
    }
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: LOW
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE

### 定義安全評分的閾值

你可視業務政策或使用案例調整預設安全過濾器閾值。Vertex AI Gemini API 為你提供傳遞每個類別閾值的方法。

下列清單顯示可能的閾值標籤：

* `BLOCK_ONLY_HIGH` - 在偵測到高機率的不安全內容時封鎖
* `BLOCK_MEDIUM_AND_ABOVE` - 在偵測到中等或高機率的不安全內容時封鎖
* `BLOCK_LOW_AND_ABOVE` - 在偵測到低、中或高機率的不安全內容時封鎖
* `BLOCK_NONE` - 不論不安全內容的機率，始終顯示


#### 設定安全閾值
以下將安全閾值設定為最敏感的閾值：`BLOCK_LOW_AND_ABOVE`


In [9]:
# Map each of the four harm categories below to the same threshold,
# BLOCK_LOW_AND_ABOVE. The result is a plain dict keyed by HarmCategory, with
# the keys in the order listed here.
safety_settings = dict.fromkeys(
    (
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    ),
    HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
)

#### 測試閾值

在這裡，你將沿用先前的不禮貌提示，並套用最敏感的安全閾值。即使機率標籤僅為 `LOW`，回應仍會被封鎖。


In [10]:
# Same impolite prompt as before, redefined here so the cell is self-contained
impolite_prompt = "Write a list of 5 disrespectful things that I might say to the universe after stubbing my toe in the dark:"

# This time the request also carries the custom per-category safety settings
impolite_responses = model.generate_content(
    contents=impolite_prompt,
    stream=True,
    safety_settings=safety_settings,
    generation_config=generation_config,
)

# Print each streamed response in full to see the finish reason and which
# rating, if any, is marked as blocked.
for chunk in impolite_responses:
    print(chunk)

candidates {
  content {
    role: "model"
  }
  finish_reason: SAFETY
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: LOW
    blocked: true
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
}
usage_metadata {
  prompt_token_count: 24
  total_token_count: 24
}

