In [None]:
# Dependencies: pip install pydub pysrt google-cloud-texttospeech==1.0.1
# (releases newer than 2.0.0 failed with this notebook)
# Supported voices: https://cloud.google.com/text-to-speech/docs/voices

In [None]:
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "../googlecloud.json"
from google.cloud import texttospeech
from pydub import AudioSegment
import io
import pysrt

# Index of the video whose subtitle files ("<lang>_<idx>.srt") get voiced.
# The spelling of the name is kept because other cells reference it.
VEDIO_IDX = 5 # PLEASE MODIFY HERE

# Language keys to process; each key is used both as a subtitle file prefix
# and as a lookup key into VOICES.
LANGUAGES = ['kr', 'vn', 'th', 'en', 'zh', 'jp']

# (language key, Cloud TTS language code, Cloud TTS voice name).
# Every entry uses a Wavenet voice except Thai, which uses a Standard voice.
_VOICE_TABLE = (
    ('kr', 'ko-KR', 'ko-KR-Wavenet-A'),
    ('vn', 'vi-VN', 'vi-VN-Wavenet-A'),
    ('th', 'th-TH', 'th-TH-Standard-A'),
    ('en', 'en-US', 'en-US-Wavenet-F'),
    ('zh', 'cmn-TW', 'cmn-TW-Wavenet-A'),
    ('jp', 'ja-JP', 'ja-JP-Wavenet-B'),
)

# One voice request per language key, all with a neutral SSML gender.
VOICES = {
    lang_key: texttospeech.types.VoiceSelectionParams(
        language_code=locale_code,
        name=voice_name,
        ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL)
    for lang_key, locale_code, voice_name in _VOICE_TABLE
}

# Shared Text-to-Speech client, reused for every synthesis request.
client = texttospeech.TextToSpeechClient()
# Ask the service to return MP3-encoded audio.
audio_config = texttospeech.types.AudioConfig(
    audio_encoding=texttospeech.enums.AudioEncoding.MP3)

In [None]:
def total_ms(sub_time):
    """Return the timestamp `sub_time` expressed as a single millisecond count.

    `sub_time` must expose `hours`, `minutes`, `seconds` and `milliseconds`
    together with the `HOURS_RATIO`, `MINUTES_RATIO` and `SECONDS_RATIO`
    multipliers that convert each unit to milliseconds. In this notebook it
    is called with the `.start` attribute of subtitle items.
    """
    weighted_parts = (
        sub_time.hours * sub_time.HOURS_RATIO,
        sub_time.minutes * sub_time.MINUTES_RATIO,
        sub_time.seconds * sub_time.SECONDS_RATIO,
        sub_time.milliseconds,
    )
    return sum(weighted_parts)

In [None]:
%%time
# Make sure the export directory exists before any TTS request is made, so a
# missing folder cannot discard a whole run of synthesized audio at export.
os.makedirs('../audios', exist_ok=True)

# For each language: read "<lang>_<idx>.srt", synthesize every subtitle with
# that language's voice, pad with silence so each line starts at its subtitle
# start time, and export the result to "../audios/<lang>_<idx>.mp3".
for LANGUAGE in LANGUAGES:
    parser = pysrt.open(f'{LANGUAGE}_{VEDIO_IDX}.srt')
    voice = VOICES[LANGUAGE]

    # An empty subtitle file has no first cue to align to; skip it instead of
    # failing with IndexError on parser[0].
    if len(parser) == 0:
        print(f'{LANGUAGE}_{VEDIO_IDX}.srt contains no subtitles, skipping')
        continue

    all_ = AudioSegment.empty()
    # Leading silence up to the start of the first subtitle.
    d_silent = total_ms(parser[0].start)
    silence = AudioSegment.silent(duration=d_silent)
    all_ = all_ + silence

    last_idx = len(parser) - 1
    for i, sub in enumerate(parser):
        if i%50==0:
            print(f'{i}/{len(parser)}')

        # Set the text input to be synthesized
        synthesis_input = texttospeech.types.SynthesisInput(text=sub.text)
        # Perform the text-to-speech request on the text input with the
        # selected voice parameters and audio file type
        response = client.synthesize_speech(synthesis_input, voice, audio_config)

        # Decode the returned audio bytes and append them to the track.
        audio_sub = io.BytesIO(response.audio_content)
        audio_sub = AudioSegment.from_file(audio_sub)
        all_ = all_ + audio_sub

        # Every subtitle except the last is followed by silence that lasts
        # until the next subtitle is due to start.
        if i < last_idx:
            d_silent = total_ms(parser[i + 1].start) - all_.duration_seconds*1000
            # If the speech ran past the next start time the gap is negative;
            # clamp it so a negative duration is never requested.
            silence = AudioSegment.silent(duration=max(d_silent, 0))
            all_ = all_ + silence

    all_.export(f'../audios/{LANGUAGE}_{VEDIO_IDX}.mp3')
    print(all_.duration_seconds)