<a href="https://colab.research.google.com/github/dungdt-infopstats/TV-command-synthesis/blob/main/src_prototype/Phase2_VCTK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install F5-TTS; it provides the `f5_tts.api.F5TTS` class imported further down.
!pip install -q f5-tts

In [None]:
# Reference speech: fetch the archive of reference audio clips by its file ID
# (presumably a Google Drive ID, as with the gdown call later on — TODO confirm).
!gdown 1G5APBDW_BlWAyXW_76RO24-wK6XqWkNe --quiet
# Reference metadata, including the transcript text of each clip.
!gdown 1eJ7migpI5HXW_ZsAdUnY2S7WL-cwLG6e --quiet

In [None]:
# Unpack both reference archives quietly.
# NOTE(review): later cells read the "audio" and "json" folders, which
# presumably come out of these archives — confirm.
!unzip -q vctk_ref_audio.zip
!unzip -q vctk_ref_json.zip

In [None]:
import os
from pydub import AudioSegment
from tqdm import tqdm

def add_silence_to_wav(input_dir, output_dir, silence_duration_ms=1000):
    """Append trailing silence to every WAV file in a folder.

    Args:
        input_dir (str): Folder scanned (non-recursively) for ``.wav`` files;
            the extension check is case-insensitive.
        output_dir (str): Folder the padded copies are written to, under the
            same file names. Created if it does not exist.
        silence_duration_ms (int): Length of the appended silence, in
            milliseconds.
    """
    os.makedirs(output_dir, exist_ok=True)

    wav_files = [f for f in os.listdir(input_dir) if f.lower().endswith(".wav")]

    for filename in tqdm(wav_files, desc="Processing WAV files"):
        audio = AudioSegment.from_wav(os.path.join(input_dir, filename))

        # Build the silence at the clip's own frame rate. AudioSegment.silent
        # defaults to 11025 Hz and concatenation converts both parts to the
        # higher of the two rates, so a clip recorded below 11025 Hz would
        # otherwise be resampled just by being padded.
        silence = AudioSegment.silent(
            duration=silence_duration_ms,
            frame_rate=audio.frame_rate,
        )

        padded_audio = audio + silence
        padded_audio.export(os.path.join(output_dir, filename), format="wav")

# Example run: pad every WAV in "audio" with 1000 ms of trailing silence
# and write the results to "pad_audio".
add_silence_to_wav("audio", "pad_audio", silence_duration_ms=1000)


In [None]:
import os
import shutil
import librosa

def filter_audio_json(audio_dir, json_dir, output_audio_dir, output_json_dir, max_duration=12.0):
    """
    Copy the audio files that are at most ``max_duration`` seconds long,
    together with the JSON file of the same name when one exists.

    Files whose duration cannot be determined (or whose copy fails) are
    reported on stdout and skipped; they never abort the run.

    Args:
        audio_dir (str): Folder containing the input audio.
        json_dir (str): Folder containing the input JSON files.
        output_audio_dir (str): Folder receiving the kept audio files.
        output_json_dir (str): Folder receiving the matching JSON files.
        max_duration (float): Maximum allowed duration, in seconds (inclusive).
    """
    os.makedirs(output_audio_dir, exist_ok=True)
    os.makedirs(output_json_dir, exist_ok=True)

    audio_suffixes = (".wav", ".mp3", ".flac")

    # Sorted so the processing (and log) order is the same on every run.
    for file in sorted(os.listdir(audio_dir)):
        # Case-insensitive, so e.g. "clip.WAV" is not silently left out.
        if not file.lower().endswith(audio_suffixes):
            continue

        audio_path = os.path.join(audio_dir, file)
        try:
            # sr=None keeps the file's native sampling rate.
            y, sr = librosa.load(audio_path, sr=None)
            duration = librosa.get_duration(y=y, sr=sr)
            if duration > max_duration:
                continue

            shutil.copy(audio_path, os.path.join(output_audio_dir, file))

            # Copy the same-named JSON only if it exists; the audio is kept either way.
            json_file = os.path.splitext(file)[0] + ".json"
            json_path = os.path.join(json_dir, json_file)
            if os.path.exists(json_path):
                shutil.copy(json_path, os.path.join(output_json_dir, json_file))
        except Exception as e:
            print(f"Lỗi khi đọc {file}: {e}")

    print("Hoàn thành lọc audio + json.")


In [None]:
# Keep only the padded clips of at most 12 s, copying each kept clip's
# same-named JSON (when present) into the filtered JSON folder.
filter_audio_json(
    audio_dir="pad_audio",
    json_dir="json",
    output_audio_dir="filtered_audio",
    output_json_dir="filtered_json",
    max_duration=12.0
)


In [None]:
import os
import librosa
import matplotlib.pyplot as plt

# Folder of filtered audio. Kept relative, like the output_audio_dir passed to
# filter_audio_json, so the cell reads the folder that was just written even
# when the notebook is not running from /content.
audio_dir = "filtered_audio"

durations = []

# Collect the duration (in seconds) of every audio file in the folder.
for file in os.listdir(audio_dir):
    if file.lower().endswith((".wav", ".mp3", ".flac")):  # audio formats only, any letter case
        file_path = os.path.join(audio_dir, file)
        try:
            y, sr = librosa.load(file_path, sr=None)  # load at the native sampling rate
            duration = librosa.get_duration(y=y, sr=sr)
            durations.append(duration)
        except Exception as e:
            print(f"Không đọc được file {file}: {e}")

# Plot the distribution of clip durations.
plt.hist(durations, bins=30, edgecolor="black")
plt.xlabel("Thời lượng (giây)")
plt.ylabel("Số lượng file")
plt.title("Phân bố độ dài speech trong thư mục")
plt.show()


In [None]:
import os
import json
import soundfile as sf
import pandas as pd
import random
import ast
import numpy as np
import uuid
from concurrent.futures import ThreadPoolExecutor
import multiprocessing as mp
from pathlib import Path
import time
from typing import Any, List
from importlib.resources import files
from f5_tts.api import F5TTS

class ReferenceCache:
    """Lazily indexes reference audio clips and their JSON transcripts.

    Folder listings and transcript texts are read on first use and then kept
    in memory, so repeated sampling does not list the folders or re-read a
    transcript again.

    Args:
        audio_folder_path: Folder holding the reference audio clips.
        json_folder_path: Folder holding one ``<stem>.json`` per clip; the
            transcript is read from its ``"text"`` field.
    """

    # Audio extensions recognised as reference clips (compared lower-cased).
    AUDIO_SUFFIXES = (".wav", ".flac", ".mp3")

    def __init__(self, audio_folder_path: str, json_folder_path: str):
        self.audio_folder_path = Path(audio_folder_path)
        self.json_folder_path = Path(json_folder_path)
        self._audio_files = None
        self._json_files = None
        self._references = None
        self._ref_texts = {}

    @property
    def audio_files(self):
        """All regular files in the audio folder, sorted, listed once."""
        if self._audio_files is None:
            self._audio_files = sorted(
                f for f in self.audio_folder_path.iterdir() if f.is_file()
            )
        return self._audio_files

    @property
    def json_files(self):
        """All ``.json`` entries in the JSON folder, sorted, listed once."""
        if self._json_files is None:
            self._json_files = sorted(
                f for f in self.json_folder_path.iterdir() if f.suffix == '.json'
            )
        return self._json_files

    @property
    def references(self):
        """``(json_path, audio_path)`` pairs for transcripts that have a clip.

        A transcript is paired with the audio file sharing its stem. JSON
        files without any such audio file are left out, so sampling never
        returns a path that does not exist.
        """
        if self._references is None:
            audio_by_stem = {}
            for audio in self.audio_files:
                suffix = audio.suffix.lower()
                if suffix not in self.AUDIO_SUFFIXES:
                    continue
                # Keep the first clip seen for a stem, but let a .wav win.
                if audio.stem not in audio_by_stem or suffix == ".wav":
                    audio_by_stem[audio.stem] = audio
            self._references = [
                (json_file, audio_by_stem[json_file.stem])
                for json_file in self.json_files
                if json_file.stem in audio_by_stem
            ]
        return self._references

    def get_ref_text(self, json_path: Path) -> str:
        """Return the ``"text"`` field of a transcript JSON ('' if absent), cached per path."""
        cache_key = str(json_path)
        if cache_key not in self._ref_texts:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            self._ref_texts[cache_key] = data.get('text', '')
        return self._ref_texts[cache_key]

    def sample_reference(self):
        """Pick a random reference clip.

        Returns:
            tuple: ``(ref_id, ref_speech_path, ref_text)`` where ``ref_id`` is
            the shared file stem, ``ref_speech_path`` the audio path as a
            string and ``ref_text`` the clip's transcript.

        Raises:
            IndexError: If no JSON file has a matching audio clip. This is the
                exception type ``random.choice`` raises on an empty list, kept
                so existing handlers still apply, but with a message that
                names the folders.
        """
        pairs = self.references
        if not pairs:
            raise IndexError(
                f"No reference JSON in '{self.json_folder_path}' has a matching "
                f"audio clip in '{self.audio_folder_path}'"
            )
        json_file, audio_file = random.choice(pairs)
        return json_file.stem, str(audio_file), self.get_ref_text(json_file)

In [None]:
import os
import json
import soundfile as sf
import pandas as pd
import random
import ast
import numpy as np
import uuid
from concurrent.futures import ThreadPoolExecutor
import multiprocessing as mp
from pathlib import Path
import time
from typing import Any, List
from importlib.resources import files
from f5_tts.api import F5TTS

class SpeechSynthesis:
    """Synthesises spoken commands, and their individual segments, with F5-TTS.

    Each command is spoken in the voice of a randomly sampled reference clip.
    Work is spread over several threads, and every thread loads its own model.

    Args:
        ref_folder_path: Folder of reference audio clips.
        json_folder_path: Folder of reference transcripts (JSON).
        num_workers: Number of worker threads; ``None`` selects
            ``min(cpu_count, 4)`` when the pipeline runs.
        device: Device string handed to ``F5TTS``. Defaults to ``'cuda'``.
    """

    def __init__(self, ref_folder_path, json_folder_path, num_workers, device='cuda'):
        self.ref_folder_path = ref_folder_path
        self.json_folder_path = json_folder_path
        self.num_workers = num_workers
        self.device = device

    def load_model(self):
        """Create a fresh F5TTS model on ``self.device``."""
        return F5TTS(device=self.device)

    def audio_generate(self, command: str, model, ref_file: str = "", ref_text: str = ""):
        """Synthesise ``command`` in the voice of the reference clip.

        Args:
            command: Text to speak.
            model: Object exposing ``infer(ref_file, ref_text, gen_text, seed)``.
            ref_file: Path of the reference audio clip.
            ref_text: Transcript of the reference clip.

        Returns:
            The ``(wav, sr, spec)`` triple returned by ``model.infer``.
        """
        # NOTE(review): seed=None presumably lets F5-TTS draw a random seed —
        # confirm against the installed version.
        wav, sr, spec = model.infer(
            ref_file=ref_file,
            ref_text=ref_text,
            gen_text=command,
            seed=None,
        )
        return wav, sr, spec

    @staticmethod
    def _parse_segments(raw):
        """Turn a row's ``segments`` cell into a list of segment dicts ([] if empty)."""
        if isinstance(raw, (list, tuple)):
            # Already parsed (e.g. a DataFrame built in memory rather than read from CSV).
            return list(raw)
        if pd.isna(raw) or raw in ("", "nan"):
            return []
        # Cells read from CSV hold the Python literal as text.
        return ast.literal_eval(raw)

    def process_single_command(self, row: pd.Series, ref_cache: "ReferenceCache", export_path: str, model: Any):
        """Synthesise one command and its segments, and write audio plus metadata.

        Output goes to ``<export_path>/<type>_<id>/``: ``*_full.wav`` for the
        whole command, ``*_seg_<n>.wav`` per segment, and a ``.json`` file
        holding the returned metadata.

        Args:
            row: One command record with the keys ``id``, ``type``, ``text``
                and ``segments``, read by key (the pipeline passes plain
                dicts). In each segment dict, the value of the first key is
                the text to speak and the value of the second key, if any, is
                stored as the segment type.
            ref_cache: Source of ``(ref_id, ref_speech_path, ref_text)`` triples.
            export_path: Root folder for this batch of commands.
            model: Model passed on to ``audio_generate``.

        Returns:
            dict | None: The metadata dict, or ``None`` if anything failed
            (the error is printed, not raised, so one bad row does not stop
            the worker).
        """
        try:
            ref_id, ref_speech_path, ref_text = ref_cache.sample_reference()
            wav, sr, _ = self.audio_generate(row['text'], model, ref_speech_path, ref_text)

            cur_id = row['id']
            stem = f"{row['type']}_{cur_id}"
            cmd_path = Path(export_path) / stem
            cmd_path.mkdir(parents=True, exist_ok=True)

            # Full command audio.
            sf.write(cmd_path / f"{stem}_full.wav", wav, sr)

            type_segments = {}
            text_segments = {}

            seg_list = self._parse_segments(row['segments'])
            for count, seg in enumerate(seg_list):
                seg_keys = list(seg.keys())
                text_key = seg_keys[0]
                # Every segment reuses the command's reference voice.
                seg_wav, seg_sr, _ = self.audio_generate(seg[text_key], model, ref_speech_path, ref_text)
                sf.write(cmd_path / f"{stem}_seg_{count}.wav", seg_wav, seg_sr)
                type_segments[text_key] = seg[seg_keys[1]] if len(seg_keys) > 1 else ""
                text_segments[text_key] = seg[text_key]

            json_template = {
                "id": cur_id,
                "type": row['type'],
                "command": row['text'],
                "sampling_rate": sr,
                # Count the segments actually synthesised: the two dicts below
                # are keyed by each segment's first key and would undercount
                # if two segments shared that key.
                "num_segments": len(seg_list),
                "type_segments": type_segments,
                "text_segments": text_segments,
                "ref_id": ref_id,
                "ref_file": ref_speech_path,
                "ref_text": ref_text
            }
            with open(cmd_path / f"{stem}.json", 'w', encoding='utf-8') as f:
                json.dump(json_template, f, ensure_ascii=False, indent=2)

            return json_template
        except Exception as e:
            print(f"Failed to process: {row['text']}, error: {e}")
            return None

    def worker_task(self, rows: List[pd.Series], ref_folder: str, json_folder: str, export_path: str):
        """Process a chunk of command records with a model private to this worker.

        Returns:
            list: Metadata dicts of the rows that were synthesised successfully.
        """
        # Each worker loads its own model.
        model = self.load_model()
        ref_cache = ReferenceCache(ref_folder, json_folder)
        results = []
        for row in rows:
            res = self.process_single_command(row, ref_cache, export_path, model)
            if res:
                results.append(res)
        return results

    def command_synthesis_pipeline_multi_model(
        self,
        text_command,
        export_path
    ):
        """Synthesise every command of a DataFrame using several worker threads.

        Args:
            text_command: DataFrame with one command per row.
            export_path: Folder receiving all generated files; created if missing.

        Returns:
            pandas.DataFrame: One row of metadata per successfully processed
            command, in input order.
        """
        start_time = time.time()
        os.makedirs(export_path, exist_ok=True)

        if self.num_workers is None:
            self.num_workers = min(mp.cpu_count(), 4)  # cap the number of models loaded
        num_workers = max(1, int(self.num_workers))

        # Split row positions rather than the DataFrame itself: np.array_split
        # on a DataFrame goes through DataFrame.swapaxes, which pandas has
        # deprecated. The chunk boundaries are the same.
        records = text_command.to_dict('records')
        index_chunks = np.array_split(np.arange(len(records)), num_workers)
        # Drop empty chunks so no worker loads a model just to do nothing.
        chunks = [records[idx[0]:idx[-1] + 1] for idx in index_chunks if len(idx)]

        all_results = []
        if chunks:
            with ThreadPoolExecutor(max_workers=len(chunks)) as executor:
                futures = [
                    executor.submit(self.worker_task, chunk, self.ref_folder_path, self.json_folder_path, export_path)
                    for chunk in chunks
                ]
                # Collect in submission order to preserve the input row order.
                for i, future in enumerate(futures):
                    res_list = future.result()
                    all_results.extend(res_list)
                    print(f"Worker {i+1} done ({len(res_list)} items)")

        print(f"Pipeline finished in {time.time() - start_time:.2f}s")
        return pd.DataFrame(all_results)



In [None]:
import gdown

# Google Drive file ID of the archive to download, and its direct-download URL.
gid = '1UVlc-elJEGK6xvGct4QorySYjn2ISt7W'
url = f"https://drive.google.com/uc?id={gid}"

output = gdown.download(url, quiet=False)  # output will be the name of the downloaded file

In [None]:
# Unpack the archive downloaded above; {output} is filled in from the Python variable.
!unzip -q {output}

In [None]:
import shutil

# Constructor arguments for SpeechSynthesis: the filtered reference folders
# written by filter_audio_json, and the number of parallel workers.
param = {
    'ref_folder_path': "filtered_audio",
    'json_folder_path': "filtered_json",
    'num_workers': 4,
}

# Folder holding the command CSVs: the downloaded archive's name without its
# extension (assumes the archive unpacks into a same-named folder — TODO confirm).
csv_folder = f'{os.path.splitext(output)[0]}'
# Root folder for all synthesis output.
export_folder = 'synthesis_command_pad'

def main(export_folder):
    """Synthesise speech for every command CSV in ``csv_folder``.

    Reads the module-level ``csv_folder`` and ``param``. For each
    ``<name>.csv`` it creates ``<export_folder>/synthesis_command_<name>/``
    with the generated audio and a ``synthesis_command_<name>.csv`` of the
    synthesis metadata, then zips that folder next to it. The metadata of all
    files is finally concatenated into ``synthesis_command.csv`` in the
    working directory.

    Args:
        export_folder (str): Root folder for all generated output.
    """
    os.makedirs(export_folder, exist_ok=True)
    speech_synthesis = SpeechSynthesis(**param)

    list_speech_df = []
    # Sorted so the files are processed in the same order on every run.
    for csv_file in sorted(os.listdir(csv_folder)):
        if not csv_file.endswith('.csv'):
            continue

        df = pd.read_csv(os.path.join(csv_folder, csv_file))

        name = os.path.splitext(csv_file)[0]
        export_path = f'{export_folder}/synthesis_command_{name}'
        os.makedirs(export_path, exist_ok=True)
        df_res = speech_synthesis.command_synthesis_pipeline_multi_model(df, export_path=export_path)
        list_speech_df.append(df_res)

        # Save this file's synthesis metadata (not a copy of the input commands)
        # next to the audio it describes.
        df_path = os.path.join(export_path, f'synthesis_command_{name}.csv')
        df_res.to_csv(df_path, index=False)

        # Archive last, so the zip also contains the CSV written just above.
        shutil.make_archive(export_path, 'zip', export_path)

    if not list_speech_df:
        # pd.concat raises on an empty list; report the real cause instead.
        print(f"No CSV files found in '{csv_folder}'; nothing was synthesised.")
        return

    combined_df = pd.concat(list_speech_df)
    combined_df.to_csv('synthesis_command.csv', index=False)

In [None]:
# Entry point: run the synthesis for every CSV, writing under export_folder.
if __name__ == '__main__':
  main(export_folder = export_folder)

In [None]:
# Zip the whole export folder recursively into synthesis_command.zip.
# NOTE(review): the absolute /content path assumes the notebook runs from
# /content, while export_folder above is relative — confirm.
!zip synthesis_command.zip /content/synthesis_command_pad -r