<a href="https://colab.research.google.com/github/10udCryp7/TV-command-synthesis/blob/main/src_prototype/Phase3_Concat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import json
import pandas as pd

def load_json_folders(root_dir: str) -> pd.DataFrame:
    """
    Load the JSON files found one level below ``root_dir`` into a DataFrame.

    Every immediate subdirectory of ``root_dir`` is scanned (non-recursively)
    for entries whose name ends with ``.json``. Each file that parses
    successfully contributes one item to the data handed to ``pd.DataFrame``.
    Subdirectories and files are visited in sorted name order so the row order
    is reproducible; ``os.listdir`` alone gives no ordering guarantee.

    Parameters
    ----------
    root_dir : str
        Root directory containing the subdirectories.

    Returns
    -------
    pd.DataFrame
        DataFrame built from the contents of all JSON files that could be
        read. Files that cannot be opened or parsed are reported on stdout
        and skipped.
    """
    data = []
    for subdir in sorted(os.listdir(root_dir)):
        subpath = os.path.join(root_dir, subdir)
        if not os.path.isdir(subpath):
            # Plain files directly under root_dir are ignored.
            continue

        # Look for JSON files inside the subdirectory.
        for fname in sorted(os.listdir(subpath)):
            if not fname.endswith(".json"):
                continue
            fpath = os.path.join(subpath, fname)
            try:
                # open() is inside the try so that an unreadable entry is
                # reported and skipped like a malformed one, instead of
                # aborting the whole load.
                with open(fpath, "r", encoding="utf-8") as f:
                    obj = json.load(f)
            except Exception as e:
                print(f"⚠️ Lỗi đọc {fpath}: {e}")
                continue
            data.append(obj)
    return pd.DataFrame(data)


In [None]:
!gdown 1--F60ZQLlD5EKJ32On6MB5CHXW6V6ncJ

In [None]:
!unzip -q sample-5000-speech-synthesis-zip.zip

In [None]:
import os

# Build one DataFrame per part folder of the extracted archive, keyed by the
# folder name.
df_dict = {}
for path in sorted(os.listdir('sample-5000-speech-synthesis-zip')):
    root_folder = os.path.join('sample-5000-speech-synthesis-zip', path)
    # Skip stray files at the top level of the archive: load_json_folders
    # calls os.listdir on its argument and would raise on a non-directory.
    if not os.path.isdir(root_folder):
        continue
    df = load_json_folders(root_folder)
    df_dict[path] = df

# => df_dict now maps each part folder name to the DataFrame built from its JSON files

In [None]:
!gdown 1ktYfos4wHLUtE1iuC1Pc_wV2cEsSzytm

In [None]:
!unzip -q sample-5000-trimmed-speech.zip

In [None]:
!gdown 1bqztSQKwtHABu4_JHFQcRPoxUzAZmEWh

In [None]:
# Load the downloaded bad-list CSV; its 'id' column is what gets used later.
bad_df = pd.read_csv('sample-5000-bad-list.csv')

In [None]:
# Unique ids from the bad list. AudioConcatenator.process skips a sample when
# its "<type>_<id>" or "<type>_<id>_full" name is a member of this set.
bad_list = set(bad_df['id'].values)

In [None]:
# Display the DataFrame loaded for one part folder
# (assumes the archive contains a folder with this name).
df_dict['synthesis_command_part_1']

In [None]:
import os
import json
from pydub import AudioSegment
import ast


import os
import json
import ast
from pydub import AudioSegment


class AudioConcatenator:
    """Write one WAV file plus one JSON annotation file per metadata row.

    Rows whose ``num_segments`` is non-zero are assembled from the per-segment
    files ``<name>_seg_<i>_trimmed.wav``. Rows with ``num_segments == 0`` are
    re-exported from a single ``<name>_trimmed.wav`` (or
    ``<name>_full_trimmed.wav``) file. ``<name>`` is ``<type>_<id>`` taken from
    the row. Samples whose name is in ``bad_list`` are skipped.
    """

    def __init__(self, bad_list=None):
        """Store the names of samples to skip.

        Parameters
        ----------
        bad_list : iterable of str, optional
            Sample names to exclude. ``None`` means nothing is excluded.
        """
        # Compare against None instead of relying on truthiness: array-like
        # inputs (numpy array, pandas Series) raise ValueError on bool().
        self.bad_list = set(bad_list) if bad_list is not None else set()

    @staticmethod
    def _sample_name(row):
        """Return the ``<type>_<id>`` name used for a row's folder and files."""
        return f"{row['type']}_{row['id']}"

    def _parse_segments(self, type_segments):
        """Return the segment-index -> label mapping with string keys.

        Parameters
        ----------
        type_segments : str or dict
            Either a dict, or the string form of a Python literal dict.

        Returns
        -------
        dict
            A new dict whose keys are ``str(original_key)``.

        Raises
        ------
        ValueError
            If the value (after parsing a string) is not a dict.
        """
        if isinstance(type_segments, str):
            type_segments = ast.literal_eval(type_segments)
        if not isinstance(type_segments, dict):
            raise ValueError("Unknown type_segments format")
        # Labels are looked up with str(i). A literal such as "{0: 'x'}"
        # parses to int keys, so normalise every key to str here; otherwise
        # each lookup would silently fall back to "unknown".
        return {str(key): label for key, label in type_segments.items()}

    def _concat_file(self, row, input_folder, output_folder):
        """Concatenate a row's segment files and save audio plus annotations.

        Segment files that do not exist are reported and left out; the
        remaining segments are joined in index order and each gets an
        annotation with its label and its start/end offset in seconds.

        Returns
        -------
        dict or None
            The ``{"wav": ..., "json": ...}`` paths from ``_save_outputs``,
            or ``None`` when none of the segment files exists.
        """
        file_name = self._sample_name(row)
        type_segments = self._parse_segments(row['type_segments'])

        # Created on the first segment that is found, so a sample with no
        # segment files never produces an empty WAV / empty annotation list.
        combined = None
        annotations, current_time_ms = [], 0
        num_segments = row['num_segments']

        for i in range(num_segments):
            seg_path = os.path.join(input_folder, file_name, f"{file_name}_seg_{i}_trimmed.wav")
            if not os.path.exists(seg_path):
                print(f"Missing: {seg_path}")
                continue

            segment = AudioSegment.from_wav(seg_path)
            # len() of a pydub AudioSegment is its duration in milliseconds.
            duration_ms = len(segment)

            if combined is None:
                combined = AudioSegment.silent(duration=0)

            label = type_segments.get(str(i), "unknown")
            annotations.append({
                "label": label,
                "start": current_time_ms / 1000.0,
                "end": (current_time_ms + duration_ms) / 1000.0
            })

            combined += segment
            current_time_ms += duration_ms

        if combined is None:
            # Same outcome as _get_non_mix when its input file is missing.
            print(f"Skip {file_name}: no segment files found")
            return None

        return self._save_outputs(file_name, combined, annotations, output_folder)

    def _get_non_mix(self, row, input_folder, output_folder):
        """Re-export a single-file sample with one whole-file annotation.

        Looks for ``<name>_trimmed.wav`` first and falls back to
        ``<name>_full_trimmed.wav`` in the same sample folder.

        Returns
        -------
        dict or None
            The paths from ``_save_outputs``, or ``None`` when neither file
            exists.
        """
        file_name = self._sample_name(row)
        file_path = os.path.join(input_folder, file_name, f"{file_name}_trimmed.wav")

        if not os.path.exists(file_path):
            file_name_full = f"{file_name}_full"
            file_path = os.path.join(input_folder, file_name, f"{file_name_full}_trimmed.wav")

        if not os.path.exists(file_path):
            print(f"Missing: {file_path}")
            return None
        segment = AudioSegment.from_wav(file_path)
        duration_ms = len(segment)

        # Every type other than "non_active" is labelled "active".
        label = "non_active" if row["type"] == "non_active" else "active"
        annotations = [{
            "label": label,
            "start": 0,
            "end": duration_ms / 1000.0
        }]

        return self._save_outputs(file_name, segment, annotations, output_folder)

    def _save_outputs(self, file_name, audio, annotations, output_folder):
        """Write ``<name>_concat.wav`` and ``<name>.json`` under ``output_folder/<name>``.

        Returns
        -------
        dict
            ``{"wav": <wav path>, "json": <json path>}``.
        """
        file_output_dir = os.path.join(output_folder, file_name)
        os.makedirs(file_output_dir, exist_ok=True)

        # Save audio
        output_wav = os.path.join(file_output_dir, f"{file_name}_concat.wav")
        audio.export(output_wav, format="wav")

        # Save annotations
        output_json = os.path.join(file_output_dir, f"{file_name}.json")
        with open(output_json, "w", encoding="utf-8") as f:
            json.dump(annotations, f, indent=2, ensure_ascii=False)

        print(f"Saved: {output_wav}, {output_json}")
        return {"wav": output_wav, "json": output_json}

    def process(self, metadata, input_folder, output_folder):
        """Process every metadata row using the given input/output folders.

        Parameters
        ----------
        metadata : DataFrame-like
            Iterated with ``iterrows()``; each row must provide ``type``,
            ``id`` and ``num_segments`` (plus ``type_segments`` when
            ``num_segments`` is non-zero).
        input_folder : str
            Folder containing one sub-folder per sample.
        output_folder : str
            Folder under which one sub-folder per sample is written.

        Returns
        -------
        list of dict
            One ``{"wav": ..., "json": ...}`` entry per sample that was
            written; skipped and missing samples are omitted.
        """
        results = []
        for _, row in metadata.iterrows():
            file_name = self._sample_name(row)
            file_name_full = f"{file_name}_full"
            if file_name in self.bad_list or file_name_full in self.bad_list:
                print(f"Skip {file_name}")
                continue

            if row["num_segments"] != 0:
                result = self._concat_file(row, input_folder, output_folder)
            else:
                result = self._get_non_mix(row, input_folder, output_folder)

            if result:
                results.append(result)

        return results



In [None]:
from tqdm import tqdm
from pathlib import Path
import os

# One shared concatenator; samples named in bad_list are skipped by process().
concat_tool = AudioConcatenator(bad_list=bad_list)
input_root = "trimmed_speech"

# Results per dataset part, keyed by the part's folder name.
results_dict = {}
for folder, df in tqdm(df_dict.items(), unit="folder", desc="Processing folders"):
    name = Path(folder).name
    # Input and output trees mirror each other: <root>/<part name>/...
    input_folder = os.path.join(input_root, name)
    output_folder = os.path.join("concat_speech", name)
    Path(output_folder).mkdir(parents=True, exist_ok=True)
    results = concat_tool.process(
        df,
        input_folder=input_folder,
        output_folder=output_folder,
    )
    results_dict[name] = results




In [None]:
!zip sample-5000-concat.zip concat_speech -r

In [None]:
import os
from mutagen import File

def total_audio_duration_hours(folder_path: str) -> float:
    """
    Sum the duration, in hours, of every file under a folder (recursively).

    Each file is handed to ``mutagen.File`` and its ``info.length`` (seconds)
    is added to the running total. Files for which ``File`` returns ``None``,
    whose ``info`` is ``None``, or that raise while being read are skipped.

    Args:
        folder_path (str): path of the folder to scan

    Returns:
        float: total duration of all readable files, in hours
    """
    seconds = 0.0

    for dirpath, _dirnames, filenames in os.walk(folder_path):
        for filename in filenames:
            try:
                parsed = File(os.path.join(dirpath, filename))
                if parsed is None or parsed.info is None:
                    continue
                seconds += parsed.info.length
            except Exception:
                # Skip files that cannot be read.
                pass

    return seconds / 3600.0


# Example usage: print the total duration (in hours) of everything under
# the "concat_speech" folder.
if __name__ == "__main__":
    folder = "concat_speech"
    hours = total_audio_duration_hours(folder)
    print(hours)


In [None]:
!pip install mutagen