<a href="https://colab.research.google.com/github/10udCryp7/TV-command-synthesis/blob/main/src_prototype/Phase2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# F5-TTS
!pip install -q f5-tts

In [None]:
# Reference Speech
!gdown 1uTreohCIiYSlrQTa3fuH1_IQjdeW1SaE --quiet
# Reference Meta (including Text)
!gdown 1PBs6r3cqhFxWzy9s5wrGpF6-5ZnoOWxE --quiet

!unzip -q audio.zip -d audio
!unzip -q json.zip -d json

In [None]:
import gdown

# Google Drive id of the command file that is later read with pd.read_csv(output).
gid = '1IQ6Pz4_S-bIcL-N_ZKzo-cO3yAPnySL6'
url = f"https://drive.google.com/uc?id={gid}"

# `output` holds the name of the downloaded file.
output = gdown.download(url, quiet=False)

# Stop here with a clear message instead of failing later inside pd.read_csv(None).
if output is None:
    raise RuntimeError(f"Failed to download the command file from {url}")

In [None]:
import pandas as pd

# Load the command table from the file referenced by `output`.
text_command = pd.read_csv(output)

# Fail fast when a column read by SpeechSynthesis.process_single_command is absent;
# otherwise every row would only report "Failed to process" after the models are loaded.
REQUIRED_COLUMNS = {"id", "type", "text", "segments"}
missing_columns = REQUIRED_COLUMNS - set(text_command.columns)
assert not missing_columns, f"command file is missing columns: {sorted(missing_columns)}"

In [None]:
# Imported in this cell because the `Path` annotation below is evaluated as soon
# as the class body runs, i.e. before any later import cell has executed.
import json
import random
from pathlib import Path


class ReferenceCache:
    """Lazily indexes reference audio clips and their JSON metadata files.

    Directory listings are read on first access and reused afterwards, and the
    text of each JSON file is read from disk at most once.
    """

    def __init__(self, audio_folder_path: str, json_folder_path: str):
        """Remember both folders; nothing is read from disk yet.

        Args:
            audio_folder_path: Folder containing the reference audio files.
            json_folder_path: Folder containing the reference ``.json`` files.
        """
        self.audio_folder_path = Path(audio_folder_path)
        self.json_folder_path = Path(json_folder_path)
        self._audio_files = None
        self._json_files = None
        self._json_by_stem = None
        self._ref_texts = {}

    @property
    def audio_files(self):
        """Regular files directly inside the audio folder (listed once, sorted)."""
        if self._audio_files is None:
            # Sorted because iterdir() order is arbitrary; a stable order keeps
            # seeded random sampling reproducible.
            self._audio_files = sorted(
                f for f in self.audio_folder_path.iterdir() if f.is_file()
            )
        return self._audio_files

    @property
    def json_files(self):
        """Entries with a ``.json`` suffix inside the JSON folder (listed once, sorted)."""
        if self._json_files is None:
            self._json_files = sorted(
                f for f in self.json_folder_path.iterdir() if f.suffix == '.json'
            )
        return self._json_files

    def get_ref_text(self, json_path: Path) -> str:
        """Return the ``'text'`` field of ``json_path`` ('' when the key is absent).

        The result is cached per path, so each file is parsed only once.
        """
        key = str(json_path)
        if key not in self._ref_texts:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            self._ref_texts[key] = data.get('text', '')
        return self._ref_texts[key]

    def sample_reference(self):
        """Pick a random reference clip together with its text.

        The JSON file whose stem equals the chosen audio file's stem is used, so
        the returned text belongs to the returned clip. When no JSON file shares
        the stem, a random JSON file is used instead (the previous behavior).

        Returns:
            Tuple ``(ref_id, ref_speech_path, ref_text)`` where ``ref_id`` is the
            stem of the JSON file and ``ref_speech_path`` is the audio path as ``str``.

        Raises:
            IndexError: If the audio folder (or, in the fallback, the JSON folder)
                contains no candidate file.
        """
        audio_file = random.choice(self.audio_files)

        if self._json_by_stem is None:
            self._json_by_stem = {f.stem: f for f in self.json_files}
        json_file = self._json_by_stem.get(audio_file.stem)
        if json_file is None:
            json_file = random.choice(self.json_files)

        ref_id = json_file.stem
        ref_speech_path = str(audio_file)
        ref_text = self.get_ref_text(json_file)
        return ref_id, ref_speech_path, ref_text

In [None]:
import os
import json
import soundfile as sf
import pandas as pd
import random
import ast
import numpy as np
import uuid
from concurrent.futures import ThreadPoolExecutor
import multiprocessing as mp
from pathlib import Path
import time
from typing import Any, List
from importlib.resources import files
from f5_tts.api import F5TTS

class SpeechSynthesis:
    """Synthesizes commands (and their segments) to audio with F5-TTS.

    The command table is split into contiguous chunks, one per worker thread.
    Each worker loads its own model, samples a reference per command, and writes
    WAV files plus one JSON metadata file per command under ``export_path``.
    """

    def __init__(self, ref_folder_path, json_folder_path, export_path, num_workers):
        """Store the pipeline configuration.

        Args:
            ref_folder_path: Folder with the reference audio files.
            json_folder_path: Folder with the reference JSON files.
            export_path: Root folder for all generated output.
            num_workers: Number of worker threads (one model each). ``None`` is
                replaced by ``min(cpu_count, 4)`` when the pipeline runs.
        """
        self.ref_folder_path = ref_folder_path
        self.json_folder_path = json_folder_path
        self.export_path = export_path
        self.num_workers = num_workers

    def load_model(self):
        """Create and return a new ``F5TTS`` instance on the ``'cuda'`` device."""
        return F5TTS(device='cuda')

    def audio_generate(self, command: str, model, ref_file: str = "", ref_text: str = ""):
        """Run ``model.infer`` for ``command`` and return its ``(wav, sr, spec)`` result.

        Args:
            command: Text to synthesize (passed as ``gen_text``).
            model: Object exposing ``infer(ref_file, ref_text, gen_text, seed)``.
            ref_file: Path of the reference audio.
            ref_text: Text passed alongside the reference audio.
        """
        wav, sr, spec = model.infer(
            ref_file=ref_file,
            ref_text=ref_text,
            gen_text=command,
            seed=None,
        )
        return wav, sr, spec

    def process_single_command(self, row: dict, ref_cache: "ReferenceCache", export_path: str, model: Any):
        """Synthesize one command row and write its audio and metadata.

        Output goes to ``<export_path>/<type>_<id>/``: the full command as
        ``..._full.wav``, one ``..._seg_<n>.wav`` per segment, and a ``.json``
        file with the metadata that is also returned.

        Args:
            row: Mapping with keys ``'id'``, ``'type'``, ``'text'`` and
                ``'segments'`` (a string literal of a list of dicts, or empty/NaN).
            ref_cache: Source of ``(ref_id, ref_speech_path, ref_text)`` samples.
            export_path: Root output folder.
            model: Model handed to ``audio_generate``.

        Returns:
            The metadata dict, or ``None`` when anything failed (the error is printed).
        """
        try:
            ref_id, ref_speech_path, ref_text = ref_cache.sample_reference()
            wav, sr, _ = self.audio_generate(row['text'], model, ref_speech_path, ref_text)

            cur_id = row['id']
            prefix = f"{row['type']}_{cur_id}"
            cmd_path = Path(export_path) / prefix
            cmd_path.mkdir(parents=True, exist_ok=True)

            # Save full audio
            sf.write(cmd_path / f"{prefix}_full.wav", wav, sr)

            type_segments = {}
            text_segments = {}

            raw_segments = row['segments']
            if pd.notna(raw_segments) and raw_segments not in [None, "", "nan"]:
                for count, seg in enumerate(ast.literal_eval(raw_segments)):
                    keys = list(seg.keys())
                    # The first key holds the segment text; the second (if any) its type.
                    text_key = keys[0]
                    seg_wav, seg_sr, _ = self.audio_generate(seg[text_key], model, ref_speech_path, ref_text)
                    sf.write(cmd_path / f"{prefix}_seg_{count}.wav", seg_wav, seg_sr)
                    # NOTE(review): entries are keyed by the segment's first key, so
                    # segments sharing that key overwrite each other and
                    # num_segments undercounts -- confirm against the CSV schema.
                    type_segments[text_key] = seg[keys[1]] if len(keys) > 1 else ""
                    text_segments[text_key] = seg[text_key]

            json_template = {
                "id": cur_id,
                "type": row['type'],
                "command": row['text'],
                "sampling_rate": sr,
                "num_segments": len(type_segments),
                "type_segments": type_segments,
                "text_segments": text_segments,
                "ref_id": ref_id,
                "ref_file": ref_speech_path,
                "ref_text": ref_text
            }
            with open(cmd_path / f"{prefix}.json", 'w', encoding='utf-8') as f:
                json.dump(json_template, f, ensure_ascii=False, indent=2)

            return json_template
        except Exception as e:
            # Look the text up defensively: a KeyError raised here would escape the
            # handler and abort the whole worker.
            failed_text = row['text'] if 'text' in row else row
            print(f"Failed to process: {failed_text}, error: {e}")
            return None

    def worker_task(self, rows: List[dict], ref_folder: str, json_folder: str, export_path: str):
        """Process ``rows`` sequentially with a model owned by this worker.

        Returns:
            List of metadata dicts for the rows that succeeded.
        """
        # Each worker loads its own model
        model = self.load_model()
        ref_cache = ReferenceCache(ref_folder, json_folder)
        results = []
        for row in rows:
            res = self.process_single_command(row, ref_cache, export_path, model)
            if res:
                results.append(res)
        return results

    @staticmethod
    def _split_records(records, num_chunks):
        """Split ``records`` into at most ``num_chunks`` contiguous, non-empty slices.

        Sizes differ by at most one, with the larger slices first (the same
        layout ``np.array_split`` produces, minus the empty chunks).
        """
        num_chunks = min(num_chunks, len(records))
        if num_chunks == 0:
            return []
        base, extra = divmod(len(records), num_chunks)
        chunks = []
        start = 0
        for i in range(num_chunks):
            stop = start + base + (1 if i < extra else 0)
            chunks.append(records[start:stop])
            start = stop
        return chunks

    def command_synthesis_pipeline_multi_model(
        self,
        text_command
    ):
        """Synthesize every row of ``text_command`` using several worker threads.

        Args:
            text_command: DataFrame of commands; each row is passed to
                ``process_single_command`` as a dict.

        Returns:
            DataFrame with one metadata row per successfully processed command,
            in input order.

        Raises:
            ValueError: If ``num_workers`` is smaller than 1.
        """
        start_time = time.time()
        os.makedirs(self.export_path, exist_ok=True)

        if self.num_workers is None:
            self.num_workers = min(mp.cpu_count(), 4)  # Limit the number of loaded models
        if self.num_workers < 1:
            raise ValueError(f"num_workers must be at least 1, got {self.num_workers}")

        # Split the data into up to N parts, one per worker. Slicing the record
        # list avoids np.array_split on a DataFrame (deprecated swapaxes path) and
        # never creates empty chunks, so no worker loads a model for nothing.
        records = text_command.to_dict('records')
        chunks = self._split_records(records, self.num_workers)

        all_results = []
        if chunks:
            with ThreadPoolExecutor(max_workers=len(chunks)) as executor:
                futures = [
                    executor.submit(self.worker_task, chunk, self.ref_folder_path, self.json_folder_path, self.export_path)
                    for chunk in chunks
                ]
                for i, future in enumerate(futures):
                    res_list = future.result()
                    all_results.extend(res_list)
                    print(f"Worker {i+1} done ({len(res_list)} items)")

        print(f"Pipeline finished in {time.time() - start_time:.2f}s")
        return pd.DataFrame(all_results)



In [None]:
# Keyword arguments forwarded to SpeechSynthesis(**param).
param = dict(
    ref_folder_path="audio/dung",
    json_folder_path="json/dung",
    export_path="synthesis_command",
    num_workers=4,
)

def main():
  """Build a SpeechSynthesis from ``param`` and run it over ``text_command``.

  Returns:
    Whatever ``command_synthesis_pipeline_multi_model`` returns, so the caller
    can keep the results instead of losing them.
  """
  speech_synthesis = SpeechSynthesis(**param)
  return speech_synthesis.command_synthesis_pipeline_multi_model(text_command)

In [None]:
speech_synthesis = SpeechSynthesis(**param)
# Bind the result to a name so it survives beyond this cell's output.
synthesis_results = speech_synthesis.command_synthesis_pipeline_multi_model(text_command)
# Show only a preview of the result table.
synthesis_results.head()