In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 使用文字轉語音和 Gemini 講述一個多角色的故事

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/doggy8088/generative-ai/blob/main/speech/use-cases/storytelling/storytelling.zh.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory 標誌"><br>在 Colab 中執行
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2Fdoggy8088%2Fgenerative-ai%2Fmain%2Fspeech%2Fuse-cases%2Fstorytelling%2Fstorytelling.zh.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise 標誌"><br>在 Colab Enterprise 中執行
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/doggy8088/generative-ai/main/speech/use-cases/storytelling/storytelling.zh.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI 標誌"><br>在 Vertex AI Workbench 中開啟
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/doggy8088/generative-ai/blob/main/speech/use-cases/storytelling/storytelling.zh.ipynb">
      <img width="32px" src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg" alt="GitHub 標誌"><br>在 GitHub 上查看
    </a>
  </td>
</table>


---

* 作者：Holt Skinner
* 建立日期：2024 年 1 月

---

## 概觀

此筆記本示範如何使用 [Text-to-Speech API](https://cloud.google.com/text-to-speech) 朗讀一個故事，讓每個角色都有不同的聲音。


### 目標

本教學課程使用下列 Google Cloud AI 服務與資源：

- [Cloud 文字轉語音 API](https://cloud.google.com/text-to-speech/docs)
- Cloud 儲存空間

執行步驟包含：

- 分析劇本格式（每行為「角色: 台詞」，以半形冒號加一個空格分隔）的輸入故事文字。
- 為每個角色指定一種聲音。
- 根據角色聲音合成每一行。
- 將聲音檔案合併成一個 MP3 檔案。

計畫擴充功能：

- 上傳聲音至 Cloud 儲存空間
- 使用 [文件 AI 光學文字辨識](https://cloud.google.com/document-ai) 讀取故事文字
- 使用 Gemini 將故事轉換成劇本格式。
  - 可能會：使用 Gemini 直接從書籍文字產生 [SSML](https://cloud.google.com/text-to-speech/docs/ssml)。
- 使用 LangChain 建立替代實作。
- 在支援更多聲音後，加入 [Journey 語音](https://cloud.google.com/text-to-speech/docs/voice-types#journey_voices)。


### 成本

本教學使用 Google Cloud 的計費元件：

* 文字轉語音
* 雲端儲存

深入了解 [文字轉語音定價](https://cloud.google.com/text-to-speech/pricing)，
還有 [雲端儲存定價](https://cloud.google.com/storage/pricing)，
並使用 [定價計算器](https://cloud.google.com/products/calculator/)
根據你預估的使用量來產生成本估算。


## 開始使用


### 安裝 Vertex AI SDK、其他套件及其相依性

安裝執行此雲端筆記本所需的下列套件。


In [None]:
# Install (or upgrade) the packages this notebook imports: the Vertex AI SDK,
# the Text-to-Speech client library, pydub, pandas and tqdm.
%pip install --user --upgrade -q google-cloud-aiplatform google-cloud-texttospeech pydub pandas tqdm

如果你使用Mac，則需要安裝 [FFmpeg](https://ffmpeg.org/)。


In [None]:
import platform

# platform.system() reports "Darwin" on macOS.
if platform.system() == "Darwin":
    # Install FFmpeg through Homebrew (assumes the `brew` command is available).
    !brew install ffmpeg

### 執行下列單元以重新啟動核心。


In [None]:
# Restart the kernel after the installs above so that the freshly installed
# packages can be imported by the cells that follow.
import IPython

# Fetch the running IPython application and ask its kernel to shut down;
# the True argument requests a restart instead of a plain shutdown.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ 這個 kernel 將會重新啟動。請等到重新啟動程序完成後再進行下一步。⚠️</b>
</div>


設定專案和地區。

* 請留意 Text-to-Speech 的**可用地區**，詳情請參閱[文件](https://cloud.google.com/text-to-speech/docs/endpoints)。


In [None]:
# Google Cloud project used by the gcloud configuration and vertexai.init().
PROJECT_ID = "YOUR_PROJECT_ID"  # @param {type:"string"}

# Prefix of the Text-to-Speech endpoint host ("<TTS_LOCATION>-texttospeech.googleapis.com").
TTS_LOCATION = "us"  # @param {type:"string"}
# Location passed to vertexai.init().
VERTEXAI_LOCATION = "us-central1"  # @param {type:"string"}

### 驗證筆記本環境

* 如果你使用 **Colab** 執行這個筆記本，請執行下列Cell並繼續。
* 如果你使用 **Vertex AI Workbench** ，請查看 [這裡](https://github.com/doggy8088/generative-ai/tree/main/setup-env) 的設定說明。


In [None]:
import sys

# The google.colab module is only loaded when running inside Google Colab,
# where additional authentication is required.
if "google.colab" in sys.modules:
    # Authenticate the notebook user to Google Cloud.
    from google.colab import auth

    auth.authenticate_user()

    # Point the gcloud CLI at the configured project, then obtain
    # application-default credentials (-q suppresses interactive prompts).
    ! gcloud config set project {PROJECT_ID}
    ! gcloud auth application-default login -q

初始化 [Vertex AI Python SDK](https://cloud.google.com/vertex-ai/docs/python-sdk/use-vertex-ai-python-sdk)


In [None]:
import vertexai

# Initialize the Vertex AI SDK with the project and location configured above.
vertexai.init(project=PROJECT_ID, location=VERTEXAI_LOCATION)

### 從 Google Cloud Storage 下載原始文字

這個公開儲存區包含一些由 PaLM 生成的故事。


In [None]:
# Copy every .txt story from the public bucket into the current working directory.
! gsutil cp gs://github-repo/speech/storytelling/*.txt .

### 匯入函式庫


In [None]:
from IPython.display import Audio

import json
import os
from pathlib import Path
from typing import Dict, List, Tuple

from pydub import AudioSegment
from tqdm import tqdm
import pandas as pd

from google.api_core.client_options import ClientOptions
from google.cloud import texttospeech_v1beta1 as texttospeech
from vertexai.preview.generative_models import GenerativeModel, GenerationConfig

### 定義常數


In [None]:
# Language code used when listing voices and as the default for synthesis.
DEFAULT_LANGUAGE = "en"
# Voice used for narration, scene details, etc.; list_voices() excludes it from
# the voices offered to characters.
DEFAULT_VOICE = "en-GB-Neural2-B"

# Text-to-Speech client bound to the endpoint selected by TTS_LOCATION.
tts_client = texttospeech.TextToSpeechClient(
    client_options=ClientOptions(
        api_endpoint=f"{TTS_LOCATION}-texttospeech.googleapis.com"
    )
)
# Gemini model used by create_character_map() to pair characters with voices.
model = GenerativeModel("gemini-pro")

# Length of the pause inserted between audio clips when they are combined.
SILENCE_LENGTH = 200  # In Milliseconds
TXT_EXTENSION = ".txt"

### 輔助函式


In [None]:
def list_voices(
    language_code: str = DEFAULT_LANGUAGE, voice_type: str = "Neural2"
) -> List[Dict]:
    """List the Text-to-Speech voices that may be assigned to characters.

    Args:
        language_code: Language code forwarded to the ``list_voices`` API call.
        voice_type: Substring that a voice name must contain to be kept.

    Returns:
        One dict per kept voice with the keys ``"name"`` and ``"gender"``
        (the lower-cased ``SsmlVoiceGender`` member name). The narration
        voice ``DEFAULT_VOICE`` is never included.
    """
    catalog = tts_client.list_voices(language_code=language_code)

    selected = []
    for candidate in catalog.voices:
        if voice_type not in candidate.name:
            continue
        # The narration voice is reserved and must not be handed to a character.
        if candidate.name == DEFAULT_VOICE:
            continue
        gender = texttospeech.SsmlVoiceGender(candidate.ssml_gender)
        selected.append({"name": candidate.name, "gender": gender.name.lower()})
    return selected


def create_character_map(characters: List[str], voices: List[str]) -> Dict[str, str]:
    """Ask Gemini to assign each character a Text-to-Speech voice.

    Args:
        characters: Character names taken from the script.
        voices: Available voices. The value is interpolated verbatim into the
            prompt (the notebook passes the list returned by ``list_voices``).

    Returns:
        The JSON object produced by the model, decoded into a dict that maps
        character names to voice names.

    Raises:
        json.JSONDecodeError: If the model's reply does not contain valid JSON.
    """
    # The complete reply is needed before it can be parsed, so request it in
    # one piece. Streaming and decoding only the first chunk fails whenever
    # the JSON object is split across chunks.
    response = model.generate_content(
        f"""Your job is to uniquely and appropriately match character names to voices available with Google Cloud Text to Speech.

The following is a list of available voices for Google Cloud Text to Speech in a JSON list.

{voices}

The following is a list of character names in a JSON list:

{characters}

Output a JSON formatted object mapping Character Names to Voice Names:
""",
        generation_config=GenerationConfig(
            max_output_tokens=2048, temperature=0.9, top_p=1
        ),
        safety_settings=[],
    )

    reply = response.text
    # Keep only the outermost {...} span. This drops Markdown code fences and a
    # leading "json" tag without touching names that happen to contain "json".
    start, end = reply.find("{"), reply.rfind("}")
    payload = reply[start : end + 1] if 0 <= start < end else reply
    return json.loads(payload)


def synthesize_text(
    text: str, output: str, voice_name: str, language_code: str = DEFAULT_LANGUAGE
):
    """Synthesize ``text`` with the given voice and save the result as MP3.

    Args:
        text: Plain text to speak.
        output: Path of the file that receives the audio bytes.
        voice_name: Name of the Text-to-Speech voice to use.
        language_code: Language code sent together with the voice name.
    """
    synthesis_input = texttospeech.SynthesisInput(text=text)
    voice_params = texttospeech.VoiceSelectionParams(
        language_code=language_code,
        name=voice_name,
    )
    mp3_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )

    result = tts_client.synthesize_speech(
        input=synthesis_input,
        voice=voice_params,
        audio_config=mp3_config,
    )

    # audio_content holds raw bytes, hence the binary write mode.
    with open(output, "wb") as audio_out:
        audio_out.write(result.audio_content)


def combine_audio_files(audio_files: List[str], filename: str) -> str:
    """Join the per-line MP3 clips into a single MP3 and delete the clips.

    The result starts with a pause of ``SILENCE_LENGTH`` milliseconds and has
    another such pause after every clip.

    Args:
        audio_files: Paths of the MP3 clips, in playback order. Each file is
            removed from disk once it has been appended.
        filename: Name of the source script; only its stem is used to build
            the output name.

    Returns:
        The output file name, ``"<stem>-complete.mp3"``.
    """
    combined = AudioSegment.silent(duration=SILENCE_LENGTH)

    for clip_path in audio_files:
        clip = AudioSegment.from_mp3(clip_path)
        pause = AudioSegment.silent(duration=SILENCE_LENGTH)
        combined = combined + (clip + pause)
        # The clip is now part of the combined audio; drop the temporary file.
        os.remove(clip_path)

    target = f"{Path(filename).stem}-complete.mp3"
    combined.export(target, format="mp3")
    return target


def get_characters(input_file: str) -> List[str]:
    """Read the character names listed under the script's ``Characters:`` header.

    The names are expected two lines below the header line (one line is
    skipped) and the list ends at the first empty line or at end of file.

    Args:
        input_file: Path of the script text file.

    Returns:
        The character names with surrounding whitespace stripped.

    Raises:
        ValueError: If the file has no line that is exactly ``Characters:``.
    """
    with open(input_file, "r") as script:
        script_lines = script.readlines()

    # Skip the header itself and the single line that follows it.
    first_name_index = script_lines.index("Characters:\n") + 2

    names = []
    for raw_line in script_lines[first_name_index:]:
        if raw_line == "\n":
            break
        names.append(raw_line.strip())
    return names


def parse_file(
    input_file: str, character_to_voice: Dict[str, Tuple[str, str]]
) -> List[str]:
    """Synthesize every non-blank line of a script file into its own MP3.

    Each line is split once on ``": "`` into a speaker and the spoken text.
    Lines without that separator are read in full, and lines whose speaker
    part contains ``"Scene"`` are read as ``"<speaker>, <text>"``. Speakers
    that are not in ``character_to_voice`` use ``DEFAULT_VOICE``.

    Args:
        input_file: Path of the script text file.
        character_to_voice: Maps a speaker name to a voice. A value may be
            either a plain voice-name string (what ``create_character_map``
            returns) or a ``(voice_name, language_code)`` pair.

    Returns:
        The generated MP3 file names, ``"<stem>-<n>.mp3"``, in script order.
    """
    with open(input_file, "r") as f:
        lines = f.readlines()

    line_number = 1
    output_files = []
    filename = Path(input_file).stem

    for line in tqdm(lines, "Parsing input file"):
        split_line = line.strip().split(": ", 1)

        character = split_line[0]
        if not character:  # Skip blank lines
            continue

        voice = character_to_voice.get(character, DEFAULT_VOICE)
        if isinstance(voice, str):
            # Indexing a voice-name string would pass its first two characters
            # ("e", "n") as voice name and language. Use the full name and
            # take the language code from its "<language>-<region>" prefix.
            # Assumes voice names follow the pattern DEFAULT_VOICE has.
            voice_name = voice
            language_code = "-".join(voice_name.split("-")[:2])
        else:
            voice_name, language_code = voice[0], voice[1]

        if len(split_line) <= 1:
            dialogue = split_line[0]
        elif "Scene" in split_line[0]:
            dialogue = f"{split_line[0]}, {split_line[1]}"
        else:
            dialogue = split_line[1]

        output_file = f"{filename}-{line_number}.mp3"
        output_files.append(output_file)
        synthesize_text(dialogue, output_file, voice_name, language_code)
        line_number += 1

    return output_files

## 使用劇本內容呼叫文字轉語音 API


### 取得可用的語音


In [None]:
# Fetch the candidate voices (default language and voice type) and print them
# as a table with one row per voice.
all_voices = list_voices()
print(pd.DataFrame(all_voices))

### 列出所有角色


In [None]:
# Script file to narrate; expected to be among the .txt stories copied earlier.
input_file = "Macbeth.txt"  # @param {type:"string"}

# Read the names listed under the script's "Characters:" header.
character_list = get_characters(input_file)

# Warn, without stopping, when there are more characters than candidate voices.
if len(character_list) > len(all_voices):
    print(f"Too many characters {len(character_list)}. Max {len(all_voices)}")

### 將角色對應至聲音


In [None]:
# Ask Gemini to pair every character with one of the candidate voices.
character_to_voice = create_character_map(character_list, all_voices)

# Show the mapping as a two-column table. Passing the dict straight to
# pd.DataFrame raises "If using all scalar values, you must pass an index"
# when the values are plain voice-name strings, so build it from the items.
print(
    pd.DataFrame(
        list(character_to_voice.items()), columns=["character", "voice"]
    )
)

### 將輸入文字解析，每行輸出為聲音


In [None]:
# Synthesize each non-blank script line to its own MP3 and keep the file names.
output_files = parse_file(input_file, character_to_voice)

### 將聲音檔案合併為一個單一檔案


In [None]:
# Merge the per-line clips into one MP3; the individual clip files are deleted.
outfile_name = combine_audio_files(output_files, input_file)
print(f"Audio content written to file {outfile_name}")

### 聆聽聲音


In [None]:
# Display an inline audio player for the combined MP3.
Audio(outfile_name)