In [None]:
#!pip install -q fairseq2==0.1.0 gradio==3.40.1
#!pip install -q git+https://github.com/camenduru/seamless_communication


In [None]:
#git clone -b dev https://github.com/camenduru/seamless_m4t-hf

In [None]:
# %pip install fairseq2==0.1 pydub yt-dlp
# %git clone https://github.com/facebookresearch/seamless_communication.git
# %cd seamless_communication
# %git checkout 01c1042841f9bce66902eb2c7512dbdd71d42112 # We will use a stable version; if you want to use the latest version, comment out this line.
# %pip install .

In [None]:
# git clone https://github.com/facebookresearch/seamless_communication.git
# Copy the seamless_communication folder found inside seamless_communication/src into the /ipy/ directory
#pip install torch==2.1.1 
#pip install torchaudio
#%pip install fairseq2==0.2 pydub yt-dlp
#conda install -c conda-forge libsndfile==1.0.31

In [1]:
from seamless_communication.inference import Translator
from IPython.display import Audio
from pydub import AudioSegment
from pydub.silence import split_on_silence
from pydub import AudioSegment
import torchaudio
import torch
import os

In [2]:
def split_audio_with_max_duration(input_file, output_directory, min_silence_len=2500, silence_thresh=-60, max_chunk_duration=15000):
    """Split a WAV file on silence and export pieces capped at a maximum length.

    The file is first cut with pydub's ``split_on_silence``. Every resulting
    chunk longer than ``max_chunk_duration`` is then divided into the smallest
    number of near-equal, consecutive pieces that respect the limit and that
    together cover the whole chunk. Each piece is written to
    ``{output_directory}/chunk{i}.wav`` with ``i`` counting from 0.

    Args:
        input_file: Path of the WAV file to split.
        output_directory: Directory that receives the exported pieces; it is
            created when it does not exist yet.
        min_silence_len: Forwarded unchanged to ``split_on_silence``.
        silence_thresh: Forwarded unchanged to ``split_on_silence``.
        max_chunk_duration: Largest allowed ``len()`` of an exported piece
            (pydub reports segment lengths in milliseconds).

    Raises:
        ValueError: If ``max_chunk_duration`` is not positive.
    """
    if max_chunk_duration <= 0:
        raise ValueError("max_chunk_duration must be a positive number of milliseconds")

    sound = AudioSegment.from_wav(input_file)
    # Splitting on silence
    audio_chunks = split_on_silence(sound, min_silence_len=min_silence_len, silence_thresh=silence_thresh)

    # Enforce max_chunk_duration on every chunk.
    final_audio_chunks = []
    for chunk in audio_chunks:
        chunk_length = len(chunk)
        if chunk_length <= max_chunk_duration:
            final_audio_chunks.append(chunk)
            continue
        # Ceiling division: the fewest pieces that all fit within the limit.
        num_subchunks = -(-chunk_length // max_chunk_duration)
        for i in range(num_subchunks):
            # Boundaries are computed from the total length so the last piece
            # ends exactly at the end of the chunk and no audio is dropped.
            start_idx = i * chunk_length // num_subchunks
            end_idx = (i + 1) * chunk_length // num_subchunks
            final_audio_chunks.append(chunk[start_idx:end_idx])

    # Export wav
    os.makedirs(output_directory, exist_ok=True)
    for i, chunk in enumerate(final_audio_chunks):
        output_file = f"{output_directory}/chunk{i}.wav"
        print("Exporting file", output_file)
        chunk.export(output_file, format="wav")

In [3]:
def save_and_play_audio(path_save, audio, sample_rate):
    """Write the first waveform of ``audio`` to disk and play it inline.

    Args:
        path_save: Destination path handed to ``torchaudio.save``.
        audio: Indexable container of waveforms; only ``audio[0]`` is saved.
        sample_rate: Sample rate stored in the file and given to the player.
    """
    waveform = audio[0]
    # Half-precision tensors (the model runs in float16 on CUDA) are cast to
    # float32 first: the audio backends behind torchaudio.save do not write
    # 16-bit floating point samples.
    if waveform.dtype in (torch.float16, torch.bfloat16):
        waveform = waveform.to(torch.float32)

    torchaudio.save(
        path_save,
        waveform.cpu(),
        sample_rate=sample_rate,
    )

    audio_play = Audio(path_save, rate=sample_rate, autoplay=True, normalize=True)
    display(audio_play)

In [4]:
# Run on the first CUDA GPU in half precision when one is available;
# otherwise fall back to the CPU in single precision.
device, dtype = (
    (torch.device("cuda:0"), torch.float16)
    if torch.cuda.is_available()
    else (torch.device("cpu"), torch.float32)
)


In [5]:
# Build the translator once; every translation cell below reuses it.
# The model and vocoder are referenced by the names of their (local) cards,
# and the device/dtype pair comes from the hardware check above.
translator = Translator(
    device=device,
    dtype=dtype,
    model_name_or_card="seamlessM4T_v2_large_local",
    vocoder_name_or_card="vocoder_v2_local",
    apply_mintox=True,  # presumably turns on MinTox toxicity mitigation; confirm against the library docs
)

Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large_local. Set `force` to `True` to download again.
Using the cached etox dataset. Set `force` to `True` to download again.
Using the cached tokenizer of mintox. Set `force` to `True` to download again.


In [None]:
# Download the video
video_url = 'www.youtube.com/watch?v=g_9rPvbENUw'
!yt-dlp -f "mp4"  --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o Video.mp4  $video_url

In [None]:
# Convert to wav
!ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav

In [None]:
input_audio_file = "/content/seamless_communication/audio.wav"
output_directory = "/content/seamless_communication/split_segments"

# Start from an empty output directory so segments from a previous run are not
# mixed with the new ones. The directory is addressed through
# `output_directory` itself (not a path relative to the current working
# directory), and re-running the cell does not fail when it already exists.
# Only regular files are removed; sub-directories are left untouched.
os.makedirs(output_directory, exist_ok=True)
for stale_name in os.listdir(output_directory):
    stale_path = os.path.join(output_directory, stale_name)
    if os.path.isfile(stale_path):
        os.remove(stale_path)

split_audio_with_max_duration(input_audio_file, output_directory)

In [None]:
# Play a split
# Listen to one exported segment to check that the split sounds right.
audio_path = '/content/seamless_communication/split_segments/chunk1.wav'
audio = Audio(
    audio_path,
    normalize=True,
    autoplay=True,
    rate=44100,
)
display(audio)

In [None]:
# Example Speech to Speech Translate
# predict() returns two values here (the text cells in this notebook unpack it
# the same way): the translated text and a speech output object that carries
# the generated waveforms and their sample rate.
translated_text, speech_output = translator.predict(
    input='/content/seamless_communication/split_segments/chunk1.wav',
    task_str='s2st',
    tgt_lang='eng', # target language
    src_lang='spa', # source language # If you specify this, it will improve the model's result.
    spkr=-1,
)
# First generated waveform, cast to float32 so it can be written to disk even
# when the model runs in half precision.
wav = speech_output.audio_wavs[0].to(torch.float32)
sr = speech_output.sample_rate

# Save the audio and play
save_and_play_audio(
    '/content/seamless_communication/audiot.wav',
    wav,
    sr,
)

In [None]:
#Now we will translate all the segments and combine them into a new audio file.
def _chunk_sort_key(filename):
    """Order ``chunk<N>.wav`` names by their number N; any other name sorts last, alphabetically."""
    stem = filename[len("chunk"):-len(".wav")]
    if stem.isdecimal():
        return (0, int(stem), filename)
    return (1, 0, filename)


# A plain alphabetical sort would place chunk10.wav before chunk2.wav and
# scramble the order of the combined audio, so the files are sorted numerically.
chunk_filenames = sorted(
    (
        name
        for name in os.listdir(output_directory)
        if name.startswith("chunk") and name.endswith(".wav")
    ),
    key=_chunk_sort_key,
)

segments = []

for filename in chunk_filenames:
    segment_path = os.path.join(output_directory, filename)

    # predict() returns the translated text and a speech output object that
    # carries the generated waveforms and their sample rate.
    translated_text, speech_output = translator.predict(
        input=segment_path,
        task_str='s2st',
        tgt_lang='eng',
        src_lang='spa',
    )
    wav = speech_output.audio_wavs[0]
    sr = speech_output.sample_rate
    print(translated_text, segment_path)

    # Write the translation next to the source chunk instead of overwriting it,
    # so re-running this cell translates the original audio again rather than
    # its own previous output. The "translated_" prefix keeps these files out
    # of the chunk*.wav selection above.
    translated_path = os.path.join(output_directory, f"translated_{filename}")
    torchaudio.save(
        translated_path,
        wav[0].to(torch.float32).cpu(),  # float32: half-precision audio cannot be written to disk
        sample_rate=sr,
    )

    segment = AudioSegment.from_file(translated_path)
    segments.append(segment)

# Combine and export once, after every segment has been translated.
if segments:
    combined_audio = sum(segments)
    combined_audio.export('/content/seamless_communication/audio_eng.mp3', format="mp3")
else:
    print("No chunk*.wav files found in", output_directory)

In [None]:
# Play the combined, translated audio track.
audio_path = '/content/seamless_communication/audio_eng.mp3'
audio = Audio(
    audio_path,
    normalize=True,
    autoplay=True,
    rate=44100,
)
display(audio)

In [None]:
#Text to Speech Translate
text = 'En el bosque encantado'
# predict() returns two values here (the text cells in this notebook unpack it
# the same way): the translated text and a speech output object that carries
# the generated waveforms and their sample rate.
translated_text, speech_output = translator.predict(
    text,
    "t2st",
    tgt_lang='eng',
    src_lang='spa'
)
# First generated waveform, cast to float32 so it can be written to disk even
# when the model runs in half precision.
wav = speech_output.audio_wavs[0].to(torch.float32)
sr = speech_output.sample_rate

save_and_play_audio(
    '/content/seamless_communication/text2speech.wav',
    wav,
    sr,
)

In [7]:
# Text to Text Translate
# Spanish -> English; only the text part of the result is kept.
text = 'En el bosque encantado, un zorro curioso halló un reloj antiguo. Al tocarlo, quedó atrapado en un bucle temporal. Buscó ayuda de un búho sabio, quien reveló que solo resolviendo acertijos podría romper el hechizo. Juntos descifraron enigmas, liberando al zorro y tejiendo una amistad eterna.'
translated_text, _ = translator.predict(
    input=text,
    task_str="t2tt",
    tgt_lang='eng',
    src_lang='spa',
)
translated_text

[CString('In the enchanted forest, a curious fox found an ancient clock. When he touched it, he was trapped in a time loop. He sought help from a wise owl, who revealed that only by solving riddles could he break the spell. Together they deciphered riddles, freeing the fox and weaving an eternal friendship.')]

In [23]:
# Text to Text Translate
# English -> Mandarin Chinese ('cmn'); only the text part of the result is kept.
text = 'SeamlessM4T is our foundational all-in-one Massively Multilingual and Multimodal Machine Translation model delivering high-quality translation for speech and text in nearly 100 languages.'
translated_text, _ = translator.predict(
    input=text,
    task_str="t2tt",
    tgt_lang='cmn',
    src_lang='eng',
)


In [27]:
# Take the first element of the translation result and print it, so it is
# shown as plain text rather than as the repr of the containing list.
a = translated_text[0]
print(a)

SeamlessM4T是我们基础的全方位大规模多语言和多模式机器翻译模型,为近100种语言的语音和文本提供高质量翻译.


In [None]:
#Speech to text translate
# Resample audio
# SeamlessM4T expects 16 kHz speech input, so the recording is converted to
# that rate before it is given to the model.
resample_rate = 16000
waveform, sample_rate = torchaudio.load('/mnt/d/AI-mp4/TheMayor.wav')
resampler = torchaudio.transforms.Resample(sample_rate, resample_rate, dtype=waveform.dtype)
resampled_waveform = resampler(waveform)
torchaudio.save('/content/seamless_communication/split_segments/resample_chunk1.wav', resampled_waveform, resample_rate)


In [31]:
# predict() returns two values (the text cells in this notebook unpack it the
# same way); for the text-only "s2tt" task only the first one is used.
# NOTE(review): the captured run of this cell ended in a CUDA out-of-memory
# error, most likely because the whole recording is processed in one call.
# Consider splitting long recordings first (e.g. with
# split_audio_with_max_duration) and translating the pieces one by one.
# NOTE(review): this path differs from the one loaded in the resampling cell
# ('/mnt/d/AI-mp4/TheMayor.wav'), and the resampled file is not used here;
# confirm which input is intended.
translated_text, _ = translator.predict('/mnt/d/demo/AI-mp4/TheMayor.wav', "s2tt", src_lang='kor', tgt_lang='cmn')
translated_text


OutOfMemoryError: CUDA out of memory. Tried to allocate 142.53 GiB. GPU 0 has a total capacty of 12.00 GiB of which 909.47 MiB is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 8.31 GiB is allocated by PyTorch, and 1.73 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF