<a href="https://colab.research.google.com/github/dungdt-infopstats/Device-Directed-Speech-Segmentation/blob/main/src_prototype/Phase3_Concat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import json
import pandas as pd

def load_json_folders(root_dir: str) -> pd.DataFrame:
    """
    Load every JSON file found one level below ``root_dir`` into a DataFrame.

    Only the immediate subdirectories of ``root_dir`` are scanned, and only
    files directly inside them whose name ends with ``.json`` are read (there
    is no deeper recursion). Entries are visited in sorted order so that the
    row order of the result is reproducible across runs and file systems.

    Files that cannot be opened, decoded or parsed are reported on stdout and
    skipped; they never abort the whole load.

    Parameters
    ----------
    root_dir : str
        Root folder containing the subfolders to scan.

    Returns
    -------
    pd.DataFrame
        DataFrame built from the list of parsed JSON payloads (one row per
        file when each file holds a JSON object). Empty if nothing was loaded.
    """
    records = []
    for subdir in sorted(os.listdir(root_dir)):
        subpath = os.path.join(root_dir, subdir)
        if not os.path.isdir(subpath):
            continue
        for fname in sorted(os.listdir(subpath)):
            if not fname.endswith(".json"):
                continue
            fpath = os.path.join(subpath, fname)
            # open() lives inside the try so an unreadable entry (permissions,
            # a directory named *.json, ...) is skipped like a malformed file.
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            try:
                with open(fpath, "r", encoding="utf-8") as f:
                    records.append(json.load(f))
            except (OSError, ValueError) as e:
                print(f"⚠️ Lỗi đọc {fpath}: {e}")
    return pd.DataFrame(records)


In [2]:
!gdown 1Tm1UlKNokXDx9cKB3VbAdDMN8kMYcGXX

Downloading...
From (original): https://drive.google.com/uc?id=1Tm1UlKNokXDx9cKB3VbAdDMN8kMYcGXX
From (redirected): https://drive.google.com/uc?id=1Tm1UlKNokXDx9cKB3VbAdDMN8kMYcGXX&confirm=t&uuid=d7083e44-fc58-48b5-9d35-c95761d6f36e
To: /content/all.zip
100% 1.27G/1.27G [00:21<00:00, 58.1MB/s]


In [3]:
!unzip -q all.zip

In [4]:
import os

df_dict = {}
root_path = "all"
# Sorted for a reproducible processing order. Non-directory entries (stray
# files left next to the extracted folders) are skipped because
# load_json_folders lists its argument as a directory and would raise on a
# plain file.
for path in sorted(os.listdir(root_path)):
    folder_path = os.path.join(root_path, path)
    if not os.path.isdir(folder_path):
        continue
    df = load_json_folders(folder_path)
    df_dict[path] = df

# => df_dict maps each folder name under `all/` to the DataFrame built from that folder's JSON files

In [None]:
!gdown 1ktYfos4wHLUtE1iuC1Pc_wV2cEsSzytm

In [None]:
!unzip -q sample-5000-trimmed-speech.zip

In [None]:
!gdown 1bqztSQKwtHABu4_JHFQcRPoxUzAZmEWh

In [None]:
# Load the exclusion list; the following cell reads its `id` column.
bad_df = pd.read_csv('sample-5000-bad-list.csv')

In [None]:
# Collapse the ids into a set so membership checks while skipping samples are O(1).
bad_list = set(bad_df['id'].values)

In [None]:
# Display the metadata DataFrame of one folder (raises KeyError if `all/` had no folder with this name).
df_dict['synthesis_command_part_1']

In [9]:
import os
import json
from pydub import AudioSegment
import ast


import os
import json
import ast
from pydub import AudioSegment


class AudioConcatenator:
    """
    Build one WAV file plus a JSON list of labelled time spans per metadata row.

    For a row, ``<name>`` is ``f"{row['type']}_{row['id']}"``. Rows that have
    segments are assembled from ``<name>_seg_<i><post_fix>.wav`` files; rows
    without segments are taken from ``<name><post_fix>.wav`` or, failing that,
    ``<name>_full<post_fix>.wav``. All inputs are looked up inside
    ``<input_folder>/<name>/``.
    """

    def __init__(self, bad_list=None, post_fix=None):
        """
        Parameters
        ----------
        bad_list : iterable of str, optional
            Names (``<name>`` or ``<name>_full``) of samples to skip.
            ``None`` means nothing is skipped.
        post_fix : str, optional
            Suffix placed right before ``.wav`` in every input file name.
        """
        # Compare against None instead of relying on truthiness: array-likes
        # such as a pandas Series or NumPy array raise on ``if bad_list``.
        self.bad_list = set(bad_list) if bad_list is not None else set()
        self.post_fix = post_fix if post_fix else ""

    def _parse_segments(self, type_segments):
        """
        Return the segment-label mapping as a dict.

        Accepts either a dict or the string form of a Python dict literal.

        Raises
        ------
        ValueError
            If the value is neither a dict nor a string that evaluates to one.
        """
        if isinstance(type_segments, str):
            # literal_eval only accepts Python literals; no code is executed.
            type_segments = ast.literal_eval(type_segments)
        if isinstance(type_segments, dict):
            return type_segments
        raise ValueError("Unknown type_segments format")

    @staticmethod
    def _segment_count(row):
        """
        Return ``row['num_segments']`` as an int, treating missing values as 0.

        pandas stores the column as float (possibly NaN) as soon as one record
        lacks the field, and ``range()`` rejects floats.
        """
        try:
            return int(row["num_segments"])
        except (TypeError, ValueError):
            # None / NaN / NA: handle the row as having no segments.
            return 0

    def _concat_file(self, row, input_folder, output_folder):
        """
        Concatenate the segment files of ``row`` in index order.

        Missing segment files are reported and skipped; the remaining segments
        are joined back to back, so annotation times always describe the audio
        that was actually written.

        Returns
        -------
        dict or None
            Paths of the written files (see ``_save_outputs``), or ``None``
            when no segment file exists, in which case nothing is written.
        """
        file_name = f"{row['type']}_{row['id']}"
        type_segments = self._parse_segments(row['type_segments'])

        # Start from the first real segment rather than a silent seed:
        # pydub's AudioSegment.silent() defaults to 11025 Hz 16-bit mono, and
        # concatenation converts both sides to the larger of each property.
        combined = None
        annotations, current_time_ms = [], 0

        for i in range(self._segment_count(row)):
            seg_path = os.path.join(input_folder, file_name, f"{file_name}_seg_{i}{self.post_fix}.wav")
            if not os.path.exists(seg_path):
                print(f"Missing: {seg_path}")
                continue

            segment = AudioSegment.from_wav(seg_path)
            duration_ms = len(segment)

            # Keys are strings when the mapping comes from JSON, but may be
            # ints when it was written as a Python literal; accept both.
            label = type_segments.get(str(i), type_segments.get(i, "unknown"))
            annotations.append({
                "label": label,
                "start": current_time_ms / 1000.0,
                "end": (current_time_ms + duration_ms) / 1000.0
            })

            combined = segment if combined is None else combined + segment
            current_time_ms += duration_ms

        if combined is None:
            # Every segment was missing (each one already reported above).
            return None

        return self._save_outputs(file_name, combined, annotations, output_folder)

    def _get_non_mix(self, row, input_folder, output_folder):
        """
        Copy the single, unsegmented recording of ``row`` and label it whole.

        The label is ``"non_active"`` when ``row['type']`` is ``"non_active"``
        and ``"active"`` otherwise.

        Returns
        -------
        dict or None
            Paths of the written files, or ``None`` if neither candidate
            input file exists.
        """
        file_name = f"{row['type']}_{row['id']}"
        file_path = os.path.join(input_folder, file_name, f"{file_name}{self.post_fix}.wav")

        if not os.path.exists(file_path):
            # Fall back to the "<name>_full" naming variant.
            file_name_full = f"{row['type']}_{row['id']}_full"
            file_path = os.path.join(input_folder, file_name, f"{file_name_full}{self.post_fix}.wav")

        if not os.path.exists(file_path):
            print(f"Missing: {file_path}")
            return None
        segment = AudioSegment.from_wav(file_path)
        duration_ms = len(segment)

        label = "non_active" if row["type"] == "non_active" else "active"
        annotations = [{
            "label": label,
            # Float, to match the timestamps produced by _concat_file.
            "start": 0.0,
            "end": duration_ms / 1000.0
        }]

        return self._save_outputs(file_name, segment, annotations, output_folder)

    def _save_outputs(self, file_name, audio, annotations, output_folder):
        """
        Write ``audio`` and ``annotations`` under ``<output_folder>/<file_name>/``.

        The audio goes to ``<file_name>_concat.wav`` and the annotations to
        ``<file_name>.json``.

        Returns
        -------
        dict
            ``{"wav": <wav path>, "json": <json path>}``.
        """
        file_output_dir = os.path.join(output_folder, file_name)
        os.makedirs(file_output_dir, exist_ok=True)

        # Save audio
        output_wav = os.path.join(file_output_dir, f"{file_name}_concat.wav")
        audio.export(output_wav, format="wav")

        # Save annotations
        output_json = os.path.join(file_output_dir, f"{file_name}.json")
        with open(output_json, "w", encoding="utf-8") as f:
            json.dump(annotations, f, indent=2, ensure_ascii=False)

        print(f"Saved: {output_wav}, {output_json}")
        return {"wav": output_wav, "json": output_json}

    def process(self, metadata, input_folder, output_folder):
        """
        Process every row of ``metadata`` with the given input/output folders.

        Rows whose ``<name>`` or ``<name>_full`` is in ``bad_list`` are
        skipped. Rows with a positive segment count are concatenated; all
        other rows are handled as a single unsegmented recording.

        Parameters
        ----------
        metadata : DataFrame-like
            Must provide ``iterrows()`` yielding rows with ``type``, ``id``
            and ``num_segments`` (plus ``type_segments`` for segmented rows).
        input_folder : str
            Folder containing one ``<name>/`` subfolder per sample.
        output_folder : str
            Folder under which one ``<name>/`` subfolder is written per sample.

        Returns
        -------
        list of dict
            One ``{"wav": ..., "json": ...}`` entry per sample actually written.
        """
        results = []
        for _, row in metadata.iterrows():
            file_name = f"{row['type']}_{row['id']}"
            file_name_full = f"{row['type']}_{row['id']}_full"
            if file_name in self.bad_list or file_name_full in self.bad_list:
                print(f"Skip {file_name}")
                continue

            if self._segment_count(row) > 0:
                result = self._concat_file(row, input_folder, output_folder)
            else:
                result = self._get_non_mix(row, input_folder, output_folder)

            if result:
                results.append(result)

        return results



In [7]:
# config: None means no sample is skipped (AudioConcatenator falls back to an empty set).
# NOTE(review): this rebinds `bad_list`, discarding any set built earlier from
# sample-5000-bad-list.csv — confirm that is intended.
bad_list = None

In [10]:
from tqdm import tqdm
from pathlib import Path
import os

# post_fix is left at its default, so input file names carry no extra suffix before ".wav".
concat_tool = AudioConcatenator(bad_list=bad_list)
input_root = "all"


# Maps folder name -> list of {"wav": ..., "json": ...} dicts returned by process().
results_dict = {}
for folder, df in tqdm(df_dict.items(), desc="Processing folders", unit="folder"):
    # df_dict keys come from os.listdir, so they are already bare folder names.
    name = Path(folder).name
    # Mirror the input layout: all/<name>/... -> concat_speech/<name>/...
    output_folder = os.path.join("concat_speech", name)
    os.makedirs(output_folder, exist_ok=True)
    input_folder = os.path.join(input_root, name)
    results = concat_tool.process(df, input_folder=input_folder, output_folder=output_folder)
    results_dict[name] = results




Processing folders:   4%|▍         | 2/47 [00:00<00:03, 12.84folder/s]

Saved: concat_speech/synthesis_command_commands_146/single_mix_8694b5db/single_mix_8694b5db_concat.wav, concat_speech/synthesis_command_commands_146/single_mix_8694b5db/single_mix_8694b5db.json
Saved: concat_speech/synthesis_command_commands_146/single_mix_2fbf3053/single_mix_2fbf3053_concat.wav, concat_speech/synthesis_command_commands_146/single_mix_2fbf3053/single_mix_2fbf3053.json
Saved: concat_speech/synthesis_command_commands_146/chain_active_fa4d3fd2/chain_active_fa4d3fd2_concat.wav, concat_speech/synthesis_command_commands_146/chain_active_fa4d3fd2/chain_active_fa4d3fd2.json
Saved: concat_speech/synthesis_command_commands_146/single_active_b22311b8/single_active_b22311b8_concat.wav, concat_speech/synthesis_command_commands_146/single_active_b22311b8/single_active_b22311b8.json
Saved: concat_speech/synthesis_command_commands_146/single_active_1c986873/single_active_1c986873_concat.wav, concat_speech/synthesis_command_commands_146/single_active_1c986873/single_active_1c986873.jso

Processing folders:   9%|▊         | 4/47 [00:00<00:03, 12.48folder/s]

Saved: concat_speech/synthesis_command_commands_147/chain_mix_d06a7908/chain_mix_d06a7908_concat.wav, concat_speech/synthesis_command_commands_147/chain_mix_d06a7908/chain_mix_d06a7908.json
Saved: concat_speech/synthesis_command_commands_147/single_active_418ecaa3/single_active_418ecaa3_concat.wav, concat_speech/synthesis_command_commands_147/single_active_418ecaa3/single_active_418ecaa3.json
Saved: concat_speech/synthesis_command_commands_147/chain_mix_61b1db28/chain_mix_61b1db28_concat.wav, concat_speech/synthesis_command_commands_147/chain_mix_61b1db28/chain_mix_61b1db28.json
Saved: concat_speech/synthesis_command_commands_147/chain_active_c79d3f9f/chain_active_c79d3f9f_concat.wav, concat_speech/synthesis_command_commands_147/chain_active_c79d3f9f/chain_active_c79d3f9f.json
Saved: concat_speech/synthesis_command_commands_147/single_mix_9bd3f40a/single_mix_9bd3f40a_concat.wav, concat_speech/synthesis_command_commands_147/single_mix_9bd3f40a/single_mix_9bd3f40a.json
Saved: concat_spee

Processing folders:  15%|█▍        | 7/47 [00:00<00:05,  7.81folder/s]

Saved: concat_speech/synthesis_command_commands_136/chain_mix_1e4ca64c/chain_mix_1e4ca64c_concat.wav, concat_speech/synthesis_command_commands_136/chain_mix_1e4ca64c/chain_mix_1e4ca64c.json
Saved: concat_speech/synthesis_command_commands_136/non_active_27dc04db/non_active_27dc04db_concat.wav, concat_speech/synthesis_command_commands_136/non_active_27dc04db/non_active_27dc04db.json
Saved: concat_speech/synthesis_command_commands_136/single_mix_39e54313/single_mix_39e54313_concat.wav, concat_speech/synthesis_command_commands_136/single_mix_39e54313/single_mix_39e54313.json
Saved: concat_speech/synthesis_command_commands_136/single_active_561c06ac/single_active_561c06ac_concat.wav, concat_speech/synthesis_command_commands_136/single_active_561c06ac/single_active_561c06ac.json
Saved: concat_speech/synthesis_command_commands_136/single_active_f4888fd4/single_active_f4888fd4_concat.wav, concat_speech/synthesis_command_commands_136/single_active_f4888fd4/single_active_f4888fd4.json
Saved: con

Processing folders:  19%|█▉        | 9/47 [00:01<00:04,  8.61folder/s]

Saved: concat_speech/synthesis_command_commands_100/chain_mix_3c71b9c0/chain_mix_3c71b9c0_concat.wav, concat_speech/synthesis_command_commands_100/chain_mix_3c71b9c0/chain_mix_3c71b9c0.json
Saved: concat_speech/synthesis_command_commands_100/single_mix_3d3e2e71/single_mix_3d3e2e71_concat.wav, concat_speech/synthesis_command_commands_100/single_mix_3d3e2e71/single_mix_3d3e2e71.json
Saved: concat_speech/synthesis_command_commands_100/chain_mix_27468357/chain_mix_27468357_concat.wav, concat_speech/synthesis_command_commands_100/chain_mix_27468357/chain_mix_27468357.json
Saved: concat_speech/synthesis_command_commands_100/chain_active_09b7f228/chain_active_09b7f228_concat.wav, concat_speech/synthesis_command_commands_100/chain_active_09b7f228/chain_active_09b7f228.json
Saved: concat_speech/synthesis_command_commands_100/non_active_ed9416cb/non_active_ed9416cb_concat.wav, concat_speech/synthesis_command_commands_100/non_active_ed9416cb/non_active_ed9416cb.json
Saved: concat_speech/synthesis

Processing folders:  23%|██▎       | 11/47 [00:01<00:03,  9.48folder/s]

Saved: concat_speech/synthesis_command_commands_111/chain_active_852aa7f1/chain_active_852aa7f1_concat.wav, concat_speech/synthesis_command_commands_111/chain_active_852aa7f1/chain_active_852aa7f1.json
Saved: concat_speech/synthesis_command_commands_111/single_mix_d3b71145/single_mix_d3b71145_concat.wav, concat_speech/synthesis_command_commands_111/single_mix_d3b71145/single_mix_d3b71145.json
Saved: concat_speech/synthesis_command_commands_111/single_mix_2cef6c03/single_mix_2cef6c03_concat.wav, concat_speech/synthesis_command_commands_111/single_mix_2cef6c03/single_mix_2cef6c03.json
Saved: concat_speech/synthesis_command_commands_129/non_active_3952b5d3/non_active_3952b5d3_concat.wav, concat_speech/synthesis_command_commands_129/non_active_3952b5d3/non_active_3952b5d3.json
Saved: concat_speech/synthesis_command_commands_129/chain_active_6a82433e/chain_active_6a82433e_concat.wav, concat_speech/synthesis_command_commands_129/chain_active_6a82433e/chain_active_6a82433e.json
Saved: concat_

Processing folders:  28%|██▊       | 13/47 [00:01<00:03,  9.99folder/s]

Saved: concat_speech/synthesis_command_commands_104/chain_mix_79ad920d/chain_mix_79ad920d_concat.wav, concat_speech/synthesis_command_commands_104/chain_mix_79ad920d/chain_mix_79ad920d.json
Saved: concat_speech/synthesis_command_commands_104/non_active_55f4d29a/non_active_55f4d29a_concat.wav, concat_speech/synthesis_command_commands_104/non_active_55f4d29a/non_active_55f4d29a.json
Saved: concat_speech/synthesis_command_commands_104/chain_active_15772d64/chain_active_15772d64_concat.wav, concat_speech/synthesis_command_commands_104/chain_active_15772d64/chain_active_15772d64.json
Saved: concat_speech/synthesis_command_commands_104/chain_active_1b395e13/chain_active_1b395e13_concat.wav, concat_speech/synthesis_command_commands_104/chain_active_1b395e13/chain_active_1b395e13.json
Saved: concat_speech/synthesis_command_commands_104/single_mix_16a8bac7/single_mix_16a8bac7_concat.wav, concat_speech/synthesis_command_commands_104/single_mix_16a8bac7/single_mix_16a8bac7.json
Saved: concat_spee

Processing folders:  32%|███▏      | 15/47 [00:06<00:26,  1.19folder/s]

Saved: concat_speech/synthesis_command_commands_119/single_mix_d910fcc5/single_mix_d910fcc5_concat.wav, concat_speech/synthesis_command_commands_119/single_mix_d910fcc5/single_mix_d910fcc5.json
Saved: concat_speech/synthesis_command_commands_119/chain_active_637bbaf0/chain_active_637bbaf0_concat.wav, concat_speech/synthesis_command_commands_119/chain_active_637bbaf0/chain_active_637bbaf0.json
Saved: concat_speech/synthesis_command_commands_119/non_active_09694fe9/non_active_09694fe9_concat.wav, concat_speech/synthesis_command_commands_119/non_active_09694fe9/non_active_09694fe9.json
Saved: concat_speech/synthesis_command_commands_119/chain_mix_3958c832/chain_mix_3958c832_concat.wav, concat_speech/synthesis_command_commands_119/chain_mix_3958c832/chain_mix_3958c832.json
Saved: concat_speech/synthesis_command_commands_119/single_active_0ba034e2/single_active_0ba034e2_concat.wav, concat_speech/synthesis_command_commands_119/single_active_0ba034e2/single_active_0ba034e2.json
Saved: concat_

Processing folders:  36%|███▌      | 17/47 [00:06<00:18,  1.66folder/s]

Saved: concat_speech/synthesis_command_commands_109/chain_active_68213735/chain_active_68213735_concat.wav, concat_speech/synthesis_command_commands_109/chain_active_68213735/chain_active_68213735.json
Saved: concat_speech/synthesis_command_commands_109/single_mix_e973f4bc/single_mix_e973f4bc_concat.wav, concat_speech/synthesis_command_commands_109/single_mix_e973f4bc/single_mix_e973f4bc.json
Saved: concat_speech/synthesis_command_commands_109/chain_mix_92428e52/chain_mix_92428e52_concat.wav, concat_speech/synthesis_command_commands_109/chain_mix_92428e52/chain_mix_92428e52.json
Saved: concat_speech/synthesis_command_commands_109/non_active_c25a53c6/non_active_c25a53c6_concat.wav, concat_speech/synthesis_command_commands_109/non_active_c25a53c6/non_active_c25a53c6.json
Saved: concat_speech/synthesis_command_commands_109/non_active_537cf3bb/non_active_537cf3bb_concat.wav, concat_speech/synthesis_command_commands_109/non_active_537cf3bb/non_active_537cf3bb.json
Saved: concat_speech/synth

Processing folders:  40%|████      | 19/47 [00:06<00:12,  2.23folder/s]

Saved: concat_speech/synthesis_command_commands_127/single_active_672e66c1/single_active_672e66c1_concat.wav, concat_speech/synthesis_command_commands_127/single_active_672e66c1/single_active_672e66c1.json
Saved: concat_speech/synthesis_command_commands_127/non_active_0be5a4b5/non_active_0be5a4b5_concat.wav, concat_speech/synthesis_command_commands_127/non_active_0be5a4b5/non_active_0be5a4b5.json
Saved: concat_speech/synthesis_command_commands_127/non_active_54bcdc43/non_active_54bcdc43_concat.wav, concat_speech/synthesis_command_commands_127/non_active_54bcdc43/non_active_54bcdc43.json
Saved: concat_speech/synthesis_command_commands_127/single_active_ba98b4ed/single_active_ba98b4ed_concat.wav, concat_speech/synthesis_command_commands_127/single_active_ba98b4ed/single_active_ba98b4ed.json
Saved: concat_speech/synthesis_command_commands_127/chain_mix_5ee6c4e7/chain_mix_5ee6c4e7_concat.wav, concat_speech/synthesis_command_commands_127/chain_mix_5ee6c4e7/chain_mix_5ee6c4e7.json
Saved: con

Processing folders:  45%|████▍     | 21/47 [00:06<00:08,  3.04folder/s]

Saved: concat_speech/synthesis_command_commands_135/single_mix_23f9ce70/single_mix_23f9ce70_concat.wav, concat_speech/synthesis_command_commands_135/single_mix_23f9ce70/single_mix_23f9ce70.json
Saved: concat_speech/synthesis_command_commands_135/single_active_53128e31/single_active_53128e31_concat.wav, concat_speech/synthesis_command_commands_135/single_active_53128e31/single_active_53128e31.json
Saved: concat_speech/synthesis_command_commands_135/non_active_376c65cb/non_active_376c65cb_concat.wav, concat_speech/synthesis_command_commands_135/non_active_376c65cb/non_active_376c65cb.json
Saved: concat_speech/synthesis_command_commands_135/chain_mix_a098c2be/chain_mix_a098c2be_concat.wav, concat_speech/synthesis_command_commands_135/chain_mix_a098c2be/chain_mix_a098c2be.json
Saved: concat_speech/synthesis_command_commands_135/single_mix_f438041e/single_mix_f438041e_concat.wav, concat_speech/synthesis_command_commands_135/single_mix_f438041e/single_mix_f438041e.json
Saved: concat_speech/s

Processing folders:  47%|████▋     | 22/47 [00:06<00:06,  3.59folder/s]

Saved: concat_speech/synthesis_command_commands_131/chain_active_2cbddf0d/chain_active_2cbddf0d_concat.wav, concat_speech/synthesis_command_commands_131/chain_active_2cbddf0d/chain_active_2cbddf0d.json
Saved: concat_speech/synthesis_command_commands_131/single_mix_ffbb39c2/single_mix_ffbb39c2_concat.wav, concat_speech/synthesis_command_commands_131/single_mix_ffbb39c2/single_mix_ffbb39c2.json
Saved: concat_speech/synthesis_command_commands_131/non_active_e9d25022/non_active_e9d25022_concat.wav, concat_speech/synthesis_command_commands_131/non_active_e9d25022/non_active_e9d25022.json
Saved: concat_speech/synthesis_command_commands_131/chain_active_39850e80/chain_active_39850e80_concat.wav, concat_speech/synthesis_command_commands_131/chain_active_39850e80/chain_active_39850e80.json
Saved: concat_speech/synthesis_command_commands_131/chain_active_68557f6a/chain_active_68557f6a_concat.wav, concat_speech/synthesis_command_commands_131/chain_active_68557f6a/chain_active_68557f6a.json
Saved:

Processing folders:  51%|█████     | 24/47 [00:07<00:04,  4.69folder/s]

Saved: concat_speech/synthesis_command_commands_107/chain_mix_e5afec93/chain_mix_e5afec93_concat.wav, concat_speech/synthesis_command_commands_107/chain_mix_e5afec93/chain_mix_e5afec93.json
Saved: concat_speech/synthesis_command_commands_107/single_mix_49890d90/single_mix_49890d90_concat.wav, concat_speech/synthesis_command_commands_107/single_mix_49890d90/single_mix_49890d90.json
Saved: concat_speech/synthesis_command_commands_107/chain_mix_648a2e41/chain_mix_648a2e41_concat.wav, concat_speech/synthesis_command_commands_107/chain_mix_648a2e41/chain_mix_648a2e41.json
Saved: concat_speech/synthesis_command_commands_107/chain_active_9b7d43f5/chain_active_9b7d43f5_concat.wav, concat_speech/synthesis_command_commands_107/chain_active_9b7d43f5/chain_active_9b7d43f5.json
Saved: concat_speech/synthesis_command_commands_107/chain_mix_6520c0f5/chain_mix_6520c0f5_concat.wav, concat_speech/synthesis_command_commands_107/chain_mix_6520c0f5/chain_mix_6520c0f5.json
Saved: concat_speech/synthesis_com

Processing folders:  55%|█████▌    | 26/47 [00:07<00:03,  5.91folder/s]

Saved: concat_speech/synthesis_command_commands_148/single_mix_bcd6397e/single_mix_bcd6397e_concat.wav, concat_speech/synthesis_command_commands_148/single_mix_bcd6397e/single_mix_bcd6397e.json
Saved: concat_speech/synthesis_command_commands_148/single_active_3aec7513/single_active_3aec7513_concat.wav, concat_speech/synthesis_command_commands_148/single_active_3aec7513/single_active_3aec7513.json
Saved: concat_speech/synthesis_command_commands_148/non_active_440c42ec/non_active_440c42ec_concat.wav, concat_speech/synthesis_command_commands_148/non_active_440c42ec/non_active_440c42ec.json
Saved: concat_speech/synthesis_command_commands_148/single_mix_31182ec2/single_mix_31182ec2_concat.wav, concat_speech/synthesis_command_commands_148/single_mix_31182ec2/single_mix_31182ec2.json
Saved: concat_speech/synthesis_command_commands_148/chain_mix_2ecca74a/chain_mix_2ecca74a_concat.wav, concat_speech/synthesis_command_commands_148/chain_mix_2ecca74a/chain_mix_2ecca74a.json
Saved: concat_speech/s

Processing folders:  62%|██████▏   | 29/47 [00:07<00:02,  7.31folder/s]

Saved: concat_speech/synthesis_command_commands_118/single_mix_fdf257d9/single_mix_fdf257d9_concat.wav, concat_speech/synthesis_command_commands_118/single_mix_fdf257d9/single_mix_fdf257d9.json
Saved: concat_speech/synthesis_command_commands_118/single_mix_fe258eb3/single_mix_fe258eb3_concat.wav, concat_speech/synthesis_command_commands_118/single_mix_fe258eb3/single_mix_fe258eb3.json
Saved: concat_speech/synthesis_command_commands_118/single_active_fa6400cf/single_active_fa6400cf_concat.wav, concat_speech/synthesis_command_commands_118/single_active_fa6400cf/single_active_fa6400cf.json
Saved: concat_speech/synthesis_command_commands_118/single_active_90629e7c/single_active_90629e7c_concat.wav, concat_speech/synthesis_command_commands_118/single_active_90629e7c/single_active_90629e7c.json
Saved: concat_speech/synthesis_command_commands_118/non_active_f00a6fe8/non_active_f00a6fe8_concat.wav, concat_speech/synthesis_command_commands_118/non_active_f00a6fe8/non_active_f00a6fe8.json
Saved:

Processing folders:  68%|██████▊   | 32/47 [00:07<00:01,  8.69folder/s]

Saved: concat_speech/synthesis_command_commands_103/chain_active_0848f7d0/chain_active_0848f7d0_concat.wav, concat_speech/synthesis_command_commands_103/chain_active_0848f7d0/chain_active_0848f7d0.json
Saved: concat_speech/synthesis_command_commands_103/chain_mix_3609c358/chain_mix_3609c358_concat.wav, concat_speech/synthesis_command_commands_103/chain_mix_3609c358/chain_mix_3609c358.json
Saved: concat_speech/synthesis_command_commands_103/single_active_4d2a05da/single_active_4d2a05da_concat.wav, concat_speech/synthesis_command_commands_103/single_active_4d2a05da/single_active_4d2a05da.json
Saved: concat_speech/synthesis_command_commands_103/single_active_71bdb6f3/single_active_71bdb6f3_concat.wav, concat_speech/synthesis_command_commands_103/single_active_71bdb6f3/single_active_71bdb6f3.json
Saved: concat_speech/synthesis_command_commands_103/chain_active_e31b0fdf/chain_active_e31b0fdf_concat.wav, concat_speech/synthesis_command_commands_103/chain_active_e31b0fdf/chain_active_e31b0fdf

Processing folders:  72%|███████▏  | 34/47 [00:08<00:01,  8.41folder/s]

Saved: concat_speech/synthesis_command_commands_133/single_mix_2ac04cba/single_mix_2ac04cba_concat.wav, concat_speech/synthesis_command_commands_133/single_mix_2ac04cba/single_mix_2ac04cba.json
Saved: concat_speech/synthesis_command_commands_133/non_active_57f116da/non_active_57f116da_concat.wav, concat_speech/synthesis_command_commands_133/non_active_57f116da/non_active_57f116da.json
Saved: concat_speech/synthesis_command_commands_133/chain_mix_6f3a9ff0/chain_mix_6f3a9ff0_concat.wav, concat_speech/synthesis_command_commands_133/chain_mix_6f3a9ff0/chain_mix_6f3a9ff0.json
Saved: concat_speech/synthesis_command_commands_133/non_active_087161af/non_active_087161af_concat.wav, concat_speech/synthesis_command_commands_133/non_active_087161af/non_active_087161af.json
Saved: concat_speech/synthesis_command_commands_133/non_active_586bb99d/non_active_586bb99d_concat.wav, concat_speech/synthesis_command_commands_133/non_active_586bb99d/non_active_586bb99d.json
Saved: concat_speech/synthesis_com

Processing folders:  79%|███████▊  | 37/47 [00:08<00:01,  8.12folder/s]

Saved: concat_speech/synthesis_command_commands_110/single_mix_4411d9e4/single_mix_4411d9e4_concat.wav, concat_speech/synthesis_command_commands_110/single_mix_4411d9e4/single_mix_4411d9e4.json
Saved: concat_speech/synthesis_command_commands_110/non_active_7d5ead90/non_active_7d5ead90_concat.wav, concat_speech/synthesis_command_commands_110/non_active_7d5ead90/non_active_7d5ead90.json
Saved: concat_speech/synthesis_command_commands_110/chain_mix_346ff0c8/chain_mix_346ff0c8_concat.wav, concat_speech/synthesis_command_commands_110/chain_mix_346ff0c8/chain_mix_346ff0c8.json
Saved: concat_speech/synthesis_command_commands_110/non_active_646bf452/non_active_646bf452_concat.wav, concat_speech/synthesis_command_commands_110/non_active_646bf452/non_active_646bf452.json
Saved: concat_speech/synthesis_command_commands_110/chain_mix_c0197234/chain_mix_c0197234_concat.wav, concat_speech/synthesis_command_commands_110/chain_mix_c0197234/chain_mix_c0197234.json
Saved: concat_speech/synthesis_command

Processing folders:  81%|████████  | 38/47 [00:08<00:01,  7.14folder/s]

Saved: concat_speech/synthesis_command_commands_144/single_mix_4b646c33/single_mix_4b646c33_concat.wav, concat_speech/synthesis_command_commands_144/single_mix_4b646c33/single_mix_4b646c33.json
Saved: concat_speech/synthesis_command_commands_144/chain_mix_8ccb1fed/chain_mix_8ccb1fed_concat.wav, concat_speech/synthesis_command_commands_144/chain_mix_8ccb1fed/chain_mix_8ccb1fed.json
Saved: concat_speech/synthesis_command_commands_144/chain_mix_cd790e5a/chain_mix_cd790e5a_concat.wav, concat_speech/synthesis_command_commands_144/chain_mix_cd790e5a/chain_mix_cd790e5a.json
Saved: concat_speech/synthesis_command_commands_144/chain_mix_61f275fa/chain_mix_61f275fa_concat.wav, concat_speech/synthesis_command_commands_144/chain_mix_61f275fa/chain_mix_61f275fa.json
Saved: concat_speech/synthesis_command_commands_144/single_mix_3e5eea4e/single_mix_3e5eea4e_concat.wav, concat_speech/synthesis_command_commands_144/single_mix_3e5eea4e/single_mix_3e5eea4e.json
Saved: concat_speech/synthesis_command_com

Processing folders:  83%|████████▎ | 39/47 [00:09<00:01,  4.25folder/s]

Saved: concat_speech/synthesis_command_commands_105/single_mix_cd6a0969/single_mix_cd6a0969_concat.wav, concat_speech/synthesis_command_commands_105/single_mix_cd6a0969/single_mix_cd6a0969.json
Saved: concat_speech/synthesis_command_commands_105/chain_mix_ea00fd14/chain_mix_ea00fd14_concat.wav, concat_speech/synthesis_command_commands_105/chain_mix_ea00fd14/chain_mix_ea00fd14.json
Saved: concat_speech/synthesis_command_commands_105/single_mix_849b3e9b/single_mix_849b3e9b_concat.wav, concat_speech/synthesis_command_commands_105/single_mix_849b3e9b/single_mix_849b3e9b.json
Saved: concat_speech/synthesis_command_commands_105/non_active_71fc6c36/non_active_71fc6c36_concat.wav, concat_speech/synthesis_command_commands_105/non_active_71fc6c36/non_active_71fc6c36.json
Saved: concat_speech/synthesis_command_commands_105/single_mix_9e5f6e17/single_mix_9e5f6e17_concat.wav, concat_speech/synthesis_command_commands_105/single_mix_9e5f6e17/single_mix_9e5f6e17.json
Saved: concat_speech/synthesis_com

Processing folders:  85%|████████▌ | 40/47 [00:09<00:01,  4.57folder/s]

Saved: concat_speech/synthesis_command_commands_101/chain_mix_04e5db7b/chain_mix_04e5db7b_concat.wav, concat_speech/synthesis_command_commands_101/chain_mix_04e5db7b/chain_mix_04e5db7b.json
Saved: concat_speech/synthesis_command_commands_101/single_active_4a816301/single_active_4a816301_concat.wav, concat_speech/synthesis_command_commands_101/single_active_4a816301/single_active_4a816301.json
Saved: concat_speech/synthesis_command_commands_101/chain_mix_96ebe6e0/chain_mix_96ebe6e0_concat.wav, concat_speech/synthesis_command_commands_101/chain_mix_96ebe6e0/chain_mix_96ebe6e0.json
Saved: concat_speech/synthesis_command_commands_101/single_active_67067a0c/single_active_67067a0c_concat.wav, concat_speech/synthesis_command_commands_101/single_active_67067a0c/single_active_67067a0c.json
Saved: concat_speech/synthesis_command_commands_141/chain_mix_0d5cfebe/chain_mix_0d5cfebe_concat.wav, concat_speech/synthesis_command_commands_141/chain_mix_0d5cfebe/chain_mix_0d5cfebe.json
Saved: concat_spee

Processing folders:  89%|████████▉ | 42/47 [00:09<00:00,  5.06folder/s]

Saved: concat_speech/synthesis_command_commands_141/single_mix_e20f11c4/single_mix_e20f11c4_concat.wav, concat_speech/synthesis_command_commands_141/single_mix_e20f11c4/single_mix_e20f11c4.json
Saved: concat_speech/synthesis_command_commands_141/non_active_9848b98c/non_active_9848b98c_concat.wav, concat_speech/synthesis_command_commands_141/non_active_9848b98c/non_active_9848b98c.json
Saved: concat_speech/synthesis_command_commands_141/single_mix_74035a03/single_mix_74035a03_concat.wav, concat_speech/synthesis_command_commands_141/single_mix_74035a03/single_mix_74035a03.json
Saved: concat_speech/synthesis_command_commands_141/non_active_8cf7a34c/non_active_8cf7a34c_concat.wav, concat_speech/synthesis_command_commands_141/non_active_8cf7a34c/non_active_8cf7a34c.json
Saved: concat_speech/synthesis_command_commands_141/chain_mix_60259ec6/chain_mix_60259ec6_concat.wav, concat_speech/synthesis_command_commands_141/chain_mix_60259ec6/chain_mix_60259ec6.json
Saved: concat_speech/synthesis_com

Processing folders:  91%|█████████▏| 43/47 [00:09<00:00,  5.73folder/s]

Saved: concat_speech/synthesis_command_commands_137/single_active_9a151530/single_active_9a151530_concat.wav, concat_speech/synthesis_command_commands_137/single_active_9a151530/single_active_9a151530.json
Saved: concat_speech/synthesis_command_commands_137/single_active_7a21e858/single_active_7a21e858_concat.wav, concat_speech/synthesis_command_commands_137/single_active_7a21e858/single_active_7a21e858.json
Saved: concat_speech/synthesis_command_commands_137/single_mix_91d1d4d0/single_mix_91d1d4d0_concat.wav, concat_speech/synthesis_command_commands_137/single_mix_91d1d4d0/single_mix_91d1d4d0.json
Saved: concat_speech/synthesis_command_commands_137/single_active_c1d3af4b/single_active_c1d3af4b_concat.wav, concat_speech/synthesis_command_commands_137/single_active_c1d3af4b/single_active_c1d3af4b.json
Saved: concat_speech/synthesis_command_commands_137/chain_active_b6cf58cb/chain_active_b6cf58cb_concat.wav, concat_speech/synthesis_command_commands_137/chain_active_b6cf58cb/chain_active_

Processing folders:  96%|█████████▌| 45/47 [00:10<00:00,  6.36folder/s]

Saved: concat_speech/synthesis_command_commands_121/chain_mix_13b6019a/chain_mix_13b6019a_concat.wav, concat_speech/synthesis_command_commands_121/chain_mix_13b6019a/chain_mix_13b6019a.json
Saved: concat_speech/synthesis_command_commands_121/single_active_907a3876/single_active_907a3876_concat.wav, concat_speech/synthesis_command_commands_121/single_active_907a3876/single_active_907a3876.json
Saved: concat_speech/synthesis_command_commands_121/chain_mix_75ff64d5/chain_mix_75ff64d5_concat.wav, concat_speech/synthesis_command_commands_121/chain_mix_75ff64d5/chain_mix_75ff64d5.json
Saved: concat_speech/synthesis_command_commands_121/single_mix_8f971dda/single_mix_8f971dda_concat.wav, concat_speech/synthesis_command_commands_121/single_mix_8f971dda/single_mix_8f971dda.json
Saved: concat_speech/synthesis_command_commands_114/non_active_fc587ebe/non_active_fc587ebe_concat.wav, concat_speech/synthesis_command_commands_114/non_active_fc587ebe/non_active_fc587ebe.json
Saved: concat_speech/synth

Processing folders: 100%|██████████| 47/47 [00:10<00:00,  4.56folder/s]

Saved: concat_speech/synthesis_command_commands_106/single_active_8a5d0a9b/single_active_8a5d0a9b_concat.wav, concat_speech/synthesis_command_commands_106/single_active_8a5d0a9b/single_active_8a5d0a9b.json
Saved: concat_speech/synthesis_command_commands_106/non_active_f8cd9125/non_active_f8cd9125_concat.wav, concat_speech/synthesis_command_commands_106/non_active_f8cd9125/non_active_f8cd9125.json
Saved: concat_speech/synthesis_command_commands_106/chain_active_4de702fc/chain_active_4de702fc_concat.wav, concat_speech/synthesis_command_commands_106/chain_active_4de702fc/chain_active_4de702fc.json
Saved: concat_speech/synthesis_command_commands_106/chain_mix_4432520f/chain_mix_4432520f_concat.wav, concat_speech/synthesis_command_commands_106/chain_mix_4432520f/chain_mix_4432520f.json
Saved: concat_speech/synthesis_command_commands_106/non_active_7e6a8490/non_active_7e6a8490_concat.wav, concat_speech/synthesis_command_commands_106/non_active_7e6a8490/non_active_7e6a8490.json
Saved: concat_




In [11]:
!zip sample-5000-concat.zip concat_speech -r

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: concat_speech/synthesis_command_commands_145/single_mix_3f322c8f/single_mix_3f322c8f_concat.wav (deflated 25%)
  adding: concat_speech/synthesis_command_commands_145/single_mix_1cb8bf5c/ (stored 0%)
  adding: concat_speech/synthesis_command_commands_145/single_mix_1cb8bf5c/single_mix_1cb8bf5c.json (deflated 45%)
  adding: concat_speech/synthesis_command_commands_145/single_mix_1cb8bf5c/single_mix_1cb8bf5c_concat.wav (deflated 26%)
  adding: concat_speech/synthesis_command_commands_145/non_active_d7ec438c/ (stored 0%)
  adding: concat_speech/synthesis_command_commands_145/non_active_d7ec438c/non_active_d7ec438c.json (deflated 18%)
  adding: concat_speech/synthesis_command_commands_145/non_active_d7ec438c/non_active_d7ec438c_concat.wav (deflated 28%)
  adding: concat_speech/synthesis_command_commands_145/non_active_44755183/ (stored 0%)
  adding: concat_speech/synthesis_command_commands_145/non_active_44755183/non

In [13]:
!pip install mutagen

Collecting mutagen
  Downloading mutagen-1.47.0-py3-none-any.whl.metadata (1.7 kB)
Downloading mutagen-1.47.0-py3-none-any.whl (194 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/194.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m184.3/194.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.4/194.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mutagen
Successfully installed mutagen-1.47.0


In [14]:
import os
from mutagen import File

def total_audio_duration_hours(folder_path: str) -> float:
    """
    Sum the duration, in hours, of every audio file below ``folder_path``.

    The folder is walked recursively. Durations are taken from the metadata
    mutagen exposes for each file rather than by decoding the audio. Files
    mutagen does not recognise, and files that raise while being inspected,
    contribute nothing to the total.

    Args:
        folder_path (str): path of the folder to scan

    Returns:
        float: total duration of all readable audio files, in hours
    """
    seconds = 0.0

    for dirpath, _dirnames, filenames in os.walk(folder_path):
        for name in filenames:
            candidate = os.path.join(dirpath, name)
            try:
                parsed = File(candidate)
                if parsed is None or parsed.info is None:
                    # not an audio file mutagen can describe
                    continue
                seconds += parsed.info.length
            except Exception:
                # best effort: unreadable files are skipped
                pass

    return seconds / 3600.0


# Example usage: report the total duration (in hours) of everything written to concat_speech/.
if __name__ == "__main__":
    folder = "concat_speech"
    hours = total_audio_duration_hours(folder)
    print(hours)


7.023309259259259
