<a href="https://colab.research.google.com/github/10udCryp7/TV-command-synthesis/blob/main/notebooksDataSynthesisPipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation & Preparation
- install dependencies
- download data

## Installation

In [None]:
# F5-TTS
!pip install -q f5-tts

## Download Data

In [None]:
# Reference Speech
!gdown 1uTreohCIiYSlrQTa3fuH1_IQjdeW1SaE --quiet
# Reference Meta (including Text)
!gdown 1PBs6r3cqhFxWzy9s5wrGpF6-5ZnoOWxE --quiet

!unzip -q /content/audio.zip -d /content/audio
!unzip -q /content/json.zip -d /content/json

# Command Synthesis
Return:
- df_commands
- commands.csv (optional)

## List TV command templates

In [None]:
# Base TV voice-command phrases, grouped by feature area.
# command_sample() draws random subsets of this list for the LLM prompt.
# Square-bracket tokens such as [level] look like placeholder slots; no code in
# this notebook substitutes them (presumably the LLM fills them in - TODO confirm).
tv_command_templates = [
    # Power control
    "turn on the TV",
    "turn off the TV",
    "restart the TV",
    "put the TV in sleep mode",

    # Volume control
    "increase the volume",
    "increase the volume to [level]",
    "decrease the volume",
    "decrease the volume to [level]",
    "set volume to [level]",
    "mute the TV",
    "unmute the TV",
    "set the volume to maximum",
    "set the volume to minimum",

    # Channel control
    "change the channel",
    "next channel",
    "previous channel",
    "go to channel [channel number]",
    "switch to channel [channel name]",
    "show me [channel name]",

    # App control
    "open [app name]",
    "launch [app name]",
    "close [app name]",
    "switch to [app name]",
    "search on [app name] for [query]",

    # Media playback
    "play",
    "pause",
    "resume",
    "stop",
    "rewind",
    "rewind [seconds] seconds",
    "fast forward",
    "fast forward [seconds] seconds",
    "skip intro",
    "skip to next episode",
    "go back to previous episode",

    # Subtitles and language
    "turn on subtitles",
    "turn off subtitles",
    "change subtitle language to [language]",
    "change audio language to [language]",

    # Input source
    "switch to [input source]",
    "change input to [input source]",
    "go to HDMI [number]",
    "switch to AV mode",

    # Picture and audio settings
    "set brightness to [level]",
    "set contrast to [level]",
    "enable night mode",
    "disable night mode",
    "enable game mode",
    "disable game mode",
    "set picture mode to [mode name]",
    "set sound mode to [mode name]",

    # Smart features
    "record this show",
    "show the TV guide",
    "what’s playing now",
    "show me recommendations",
    "add this to my watchlist",
    "rate this show [rating]",
    "enable sleep timer for [minutes] minutes",
    "remind me when [show name] starts",

    # Navigation
    "go back to home screen",
    "open settings",
    "scroll up",
    "scroll down",
    "select this option",
]

## Content list
- song name
- movie name
- app name

In [None]:
import pandas as pd

# song's names from spotify
song_dataset = pd.read_csv("hf://datasets/vishnupriyavr/spotify-million-song-dataset/spotify_millsongdata.csv")
# movie's names
movie_dataset = pd.read_csv("hf://datasets/Pablinho/movies-dataset/9000plus.csv")

# TV app names offered to the LLM as content.
# content_sample() includes every entry, prefixed with "app: ".
app_dataset = [
    # 1. General video
    "YouTube",
    "TikTok TV",
    "Facebook Watch",
    "Twitch",
    "Dailymotion",
    "Vimeo",
    "Rumble",
    "Bilibili",
    "TED",
    "Vevo TV",

    # 2. Movies / TV streaming
    "Netflix",
    "Disney+",
    "Amazon Prime Video",
    "Apple TV+",
    "HBO Max",
    "Hulu",
    "Paramount+",
    "Peacock",
    "Crunchyroll",
    "Plex",
    "Tubi TV",
    "Pluto TV",
    "Rakuten TV",
    "Viki",
    "Popcornflix",

    # 3. Music and radio
    "Spotify",
    "Apple Music",
    "YouTube Music",
    "Amazon Music",
    "Tidal",
    "Deezer",
    "Pandora",
    "SoundCloud",
    "iHeartRadio",
    "TuneIn Radio",

    # 4. Live TV & sports
    "ESPN",
    "DAZN",
    "NBC Sports",
    "CBS Sports",
    "Red Bull TV",
    "BBC iPlayer",
    "ITVX",
    "Sling TV"
]

In [None]:
print(len(app_dataset))
print(len(song_dataset))
print(len(movie_dataset))

## Model and function

In [None]:
# PROMPT

# Prompt sent to the LLM. It is filled with str.format(), so the literal JSON
# braces in the example are doubled ({{ }}). The example ```json fence is now
# closed so the model sees a well-formed Markdown block to imitate.
prompt_template = """
You are given the following:

- A list of TV control commands: {command_list}
- A content list that includes names of songs, TV shows, movies, apps, and artists: {content_list}

Your task is to generate {generated_nums} human-like commands using the command list and the content list. The content list includes the prefix movies or artist-song to shows you their type, you should remove them when generate commands, also leverage them smartly for high human-like commands.
Half of the commands should be **single commands**, and the other half should be **chain commands**. A chain command consists of multiple actions (up to a maximum of {chain_length} sub-commands) combined in a natural way. The final list sub-commands is extracted from full chain commands.

Be creative in your generation. You should use specific content from the content list—such as names of songs, movies, TV shows, or artists—instead of generic phrases like "this song" or "that movie".

Each generated command should be as **realistic and unique** as possible.

Return only the generated human-like commands in the **following JSON format** (use **double quotes** for all strings, and ensure the output is valid JSON):

```json
{{
  "single": [
    "string"
  ],
  "chain": [
    {{
      "full_commands": "string",
      "sub_commands": ["string"]
    }}
  ]
}}
```
"""

GENERATED_NUM = 10

CHAIN_LENGTH = 3


# gpt-4o-mini
from openai import OpenAI
import os
from google.colab import userdata

api_key = userdata.get('OPENAI_API_KEY')
client = OpenAI(api_key = api_key)

MODEL = 'gpt-4o-mini'

TEMPERATURE = 1


# Sampling Args
NUM_SAMPLES_COMMAND = 20

NUM_SAMPLES_CONTENT = 20

In [None]:
import random

def content_sample(num_samples: int):
  '''
  Draw random content names to put into the prompt.

  Songs get num_samples // 2 of the budget, movies get the remainder, and
  every entry of app_dataset is always included.

  Args:
    num_samples (int): combined number of songs and movies to draw

  Returns:
    song_list (list): strings of the form "artist: <artist>- song: <song>"
    movie_list (list): strings of the form "movie: <title>"
    app_list (list): strings of the form "app: <name>"
  '''
  song_count = num_samples // 2
  movie_count = num_samples - song_count

  picked_songs = song_dataset.sample(song_count)
  picked_movies = movie_dataset.sample(movie_count)

  song_list = [
      f"artist: {artist}- song: {title}"
      for artist, title in zip(picked_songs['artist'], picked_songs['song'])
  ]
  movie_list = [f"movie: {title}" for title in picked_movies['Title']]
  app_list = [f"app: {app_name}" for app_name in app_dataset]
  return song_list, movie_list, app_list

def command_sample(num_samples: int):
  '''
  Pick num_samples distinct command templates at random.

  Args:
    num_samples (int): how many templates to draw from tv_command_templates

  Returns:
    list: the drawn templates (random.sample, i.e. without replacement)
  '''
  return random.sample(tv_command_templates, num_samples)

def generate(command_list: list, content_list: list, generated_num: int = GENERATED_NUM, chain_length: int = CHAIN_LENGTH):
  '''
  Fill prompt_template and send it to the model through the OpenAI client.

  Args:
    command_list (list): command templates shown to the model
    content_list (list): content names (songs, movies, apps) shown to the model
    generated_num (int): number of commands requested in the prompt
    chain_length (int): maximum number of sub-commands per chain command

  Returns:
    The object returned by client.responses.create, unmodified.
  '''
  template_fields = {
      "command_list": command_list,
      "content_list": content_list,
      "generated_nums": generated_num,
      "chain_length": chain_length,
  }
  prompt = prompt_template.format(**template_fields)

  return client.responses.create(
      model = MODEL,
      input = prompt,
      temperature = TEMPERATURE,
  )

In [None]:
# Two helper functions for JSON parsing

import json
import re

def clean_json_string(raw_output):
    """Remove a surrounding Markdown code fence from a model reply.

    Handles an opening fence with or without a ``json`` language tag (any
    letter case) and a closing fence. Text without fences is returned
    stripped of leading/trailing whitespace.

    Args:
        raw_output (str): raw text returned by the model.

    Returns:
        str: the text with the opening and closing fence removed.
    """
    # First alternative: opening fence at the very start; second: closing
    # fence at the very end. Both are anchored, so JSON content is untouched.
    return re.sub(
        r"^```(?:json)?\s*|\s*```$",
        "",
        raw_output.strip(),
        flags=re.IGNORECASE,
    )

def parse_generated_json(raw_output):
    """Strip any Markdown fence from `raw_output` and decode the rest as JSON."""
    return json.loads(clean_json_string(raw_output))


In [None]:
import pandas as pd

def create_df(parsed_data, export_name = None):
  '''
  Convert one parsed LLM reply into a DataFrame of commands.

  Every row receives an `id` ("single_XXXXXXXX" / "chain_XXXXXXXX", numbered
  within this call) because the audio and noise stages of this notebook look
  rows up by `row['id']`. Ids are only unique within one call; callers that
  merge several batches must re-number them.

  Args:
    parsed_data (dict): expects a "single" list of strings and a "chain" list
      of dicts with "full_commands" and "sub_commands"; a missing or empty
      key is treated as an empty list.
    export_name (str | None): when given, the table is also written to
      "<export_name>.csv".

  Returns:
    pd.DataFrame: columns id, command, type, sub_command
      (sub_command is None for single commands).
  '''
  rows = []

  # single commands
  for idx, cmd in enumerate(parsed_data.get('single') or []):
      rows.append({
          'id': f"single_{idx:08d}",
          'command': cmd,
          'type': 'single',
          'sub_command': None
      })

  # chain commands: keep the full sentence plus its sub-command list
  for idx, cmd in enumerate(parsed_data.get('chain') or []):
      rows.append({
          'id': f"chain_{idx:08d}",
          'command': cmd['full_commands'],
          'type': 'chain',
          'sub_command': cmd['sub_commands']
      })

  # explicit columns keep the schema stable even when there are no rows
  df = pd.DataFrame(rows, columns=['id', 'command', 'type', 'sub_command'])

  if export_name is not None:
    df.to_csv(f'{export_name}.csv', index=False)
  return df

## Main

In [None]:
from tqdm import tqdm  # progress bar for the generation loop

# Number of LLM requests; each request asks for GENERATED_NUM commands.
NUM_BATCHES = 50

list_commands = []

for batch_idx in tqdm(range(NUM_BATCHES)):
    # sample the command templates and the content shown to the model
    command_list = command_sample(num_samples=NUM_SAMPLES_COMMAND)
    song_list, movie_list, app_list = content_sample(num_samples=NUM_SAMPLES_CONTENT)
    content_list = song_list + movie_list + app_list  # list of all content

    # ask the model for commands
    response = generate(command_list, content_list)

    # output_text aggregates the text of the reply regardless of how the
    # output items are ordered
    raw_output = response.output_text

    # A malformed reply should only cost this batch, not the results (and API
    # spend) of every batch already collected.
    try:
        json_commands = parse_generated_json(raw_output)
        # No per-batch export: it would overwrite commands.csv with a single
        # batch; the merged table is written once after the loop.
        df_commands = create_df(parsed_data=json_commands)
    except (json.JSONDecodeError, KeyError, TypeError) as err:
        print(f"Skipping batch {batch_idx}: could not parse model output ({err})")
        continue

    list_commands.append(df_commands)


In [None]:
list_commands

In [None]:
# Merge every batch into one table.
list_commands_df = pd.concat(list_commands, ignore_index=True)
# Give every row a globally unique id: later stages use row['id'] as a dict key
# and as the exported file name, so ids must not repeat across batches.
list_commands_df['id'] = [
    f"{cmd_type}_{row_idx:08d}"
    for row_idx, cmd_type in enumerate(list_commands_df['type'])
]

In [None]:
list_commands_df.to_csv('commands.csv', index=False)

# Audio Synthesis
Return:
- synthesis_folder: generated command audio (wav) and metadata (json)

## Model F5-TTS Init

In [None]:
from importlib.resources import files
from f5_tts.api import F5TTS

f5tts = F5TTS(device = 'cuda')

## Pipeline

In [None]:
def audio_generate(command: str, model = f5tts, ref_file: str = "", ref_text: str = ""):
  '''
  Synthesize `command` as speech by calling `model.infer`.

  Args:
    command (str): text to speak (passed as gen_text)
    model: object exposing `infer`; defaults to the module-level f5tts
    ref_file (str): forwarded to `infer` as ref_file
    ref_text (str): forwarded to `infer` as ref_text

  Returns:
    tuple: the three values returned by `infer`
      (named wav, sr, spec by the callers in this notebook).
  '''
  infer_kwargs = {
      "ref_file": ref_file,
      "ref_text": ref_text,
      "gen_text": command,
      "seed": None,
  }
  waveform, sample_rate, spectrogram = model.infer(**infer_kwargs)
  return waveform, sample_rate, spectrogram

def ref_sample(audio_folder_path, json_folder_path):
  '''
  Randomly pick one reference utterance: an audio file and its metadata JSON.

  The audio and the JSON are matched by file stem (the part of the file name
  before the first "."), so the returned speech and metadata describe the same
  utterance. NOTE(review): this assumes both folders name their files by the
  same id - confirm against the dataset layout. If no stem is present in both
  folders, the two files are drawn independently as a fallback.

  Args:
    audio_folder_path: folder containing the reference audio files
    json_folder_path: folder containing the reference metadata files

  Returns:
  - ref_id: stem of the chosen JSON file (for tracing)
  - ref_speech_path: path of the reference audio
  - ref_text_path: path of the reference metadata JSON

  Raises:
    IndexError: if a folder that has to be sampled from is empty.
  '''
  # sorted so that a seeded `random` gives reproducible picks
  audio_files = sorted(os.listdir(audio_folder_path))
  json_files = sorted(os.listdir(json_folder_path))

  audio_by_stem = {name.split(".")[0]: name for name in audio_files}
  paired_json = [name for name in json_files if name.split(".")[0] in audio_by_stem]

  if paired_json:
    json_file = random.choice(paired_json)
    audio_file = audio_by_stem[json_file.split(".")[0]]
  else:
    # no common stems: fall back to independent draws
    audio_file = random.choice(audio_files)
    json_file = random.choice(json_files)

  ref_id = json_file.split(".")[0]

  ref_speech_path = os.path.join(audio_folder_path, audio_file)
  ref_text_path = os.path.join(json_folder_path, json_file)

  return ref_id, ref_speech_path, ref_text_path

In [None]:
import random

def audio_pipeline(dataset, ref_speech_folder: str = "/content/audio/dung",
                            ref_meta_folder: str = "/content/json/dung"):
    '''
    Synthesize speech for every command row and collect the results in memory.

    For each row one reference is drawn with ref_sample(). The full command is
    synthesized, and for `chain` rows every entry of `sub_command` is
    synthesized too, using the same reference. Rows whose `type` is neither
    'single' nor 'chain' are skipped.

    Args:
        dataset: DataFrame with columns `id`, `type`, `command` and, for chain
            rows, `sub_command` (an iterable of command strings).
        ref_speech_folder (str): folder of reference audio files.
        ref_meta_folder (str): folder of reference metadata JSON files.

    Returns:
        dict: row id -> dict with keys ref_id, ref_file, ref_text, type,
        command, wav, sr, spec, sub_commands. `sub_commands` is None for single
        rows and, for chain rows, a dict "0", "1", ... -> {command, wav, sr, spec}.
    '''
    results = {}

    for _, sample in dataset.iterrows():
        sample_id = sample['id']
        sample_type = sample['type']

        # sampling reference
        # NOTE(review): ref_sample returns the *path* of the metadata JSON as
        # its third value, and that path string is forwarded below as
        # `ref_text`. Confirm whether the transcript stored inside the JSON
        # should be loaded and passed instead.
        ref_id, ref_file, ref_text = ref_sample(audio_folder_path = ref_speech_folder, json_folder_path =  ref_meta_folder)

        if sample_type not in ('single', 'chain'):
            continue

        # full command synthesis (both single and chain rows)
        command = sample['command']
        wav, sr, spec = audio_generate(command, ref_file=ref_file, ref_text=ref_text)

        dict_sub_commands = None
        if sample_type == 'chain':
            dict_sub_commands = {}
            for sub_idx, sub_cmd in enumerate(sample['sub_command']):
                # Separate names: reusing wav/sr/spec here would overwrite the
                # full-command audio with the last sub-command's audio.
                sub_wav, sub_sr, sub_spec = audio_generate(sub_cmd, ref_file=ref_file, ref_text=ref_text)
                dict_sub_commands[str(sub_idx)] = {
                    'command': sub_cmd,
                    'wav': sub_wav,
                    'sr': sub_sr,
                    'spec': sub_spec
                }

        results[sample_id] = {
            'ref_id': ref_id,
            'ref_file': ref_file,
            'ref_text': ref_text,
            'type': sample_type,
            'command': command,
            'wav': wav,
            'sr': sr,
            'spec': spec,
            'sub_commands': dict_sub_commands
        }

    return results

In [None]:
import os
import json
import soundfile as sf  # pip install soundfile

def export_results(results, export_dir='exported_audio'):
    """Write the synthesized audio and its metadata to disk.

    Creates `<export_dir>/audio` and `<export_dir>/json`. For every entry it
    writes `<id>.wav` and `<id>.json`; chain entries additionally get one
    `<id>_sub<k>.wav` per sub-command, and their JSON lists the sub-command
    texts under "sub_commands".

    Args:
        results (dict): id -> entry with keys wav, sr, ref_id, ref_file,
            ref_text, type, command and (for chains) sub_commands.
        export_dir (str): root output folder.
    """
    audio_dir, json_dir = (os.path.join(export_dir, sub) for sub in ('audio', 'json'))
    for folder in (audio_dir, json_dir):
        os.makedirs(folder, exist_ok=True)

    for item_id, entry in results.items():
        is_chain = entry['type'] == 'chain'

        # main waveform
        sf.write(os.path.join(audio_dir, f"{item_id}.wav"), entry['wav'], entry['sr'])

        # metadata (waveforms and spectrograms are deliberately left out)
        metadata = {
            'id': item_id,
            'ref_id': entry['ref_id'],
            'ref_file': entry['ref_file'],
            'ref_text': entry['ref_text'],
            'type': entry['type'],
            'command': entry['command'],
            'sampling_rate': entry['sr']
        }
        if is_chain:
            metadata['sub_commands'] = {
                key: {'command': sub['command']}
                for key, sub in entry['sub_commands'].items()
            }

        with open(os.path.join(json_dir, f"{item_id}.json"), 'w', encoding='utf-8') as handle:
            json.dump(metadata, handle, ensure_ascii=False, indent=2)

        # one wav per sub-command of a chain
        if is_chain:
            for key, sub in entry['sub_commands'].items():
                sf.write(os.path.join(audio_dir, f"{item_id}_sub{key}.wav"), sub['wav'], sub['sr'])

## Main
- synthesis_folder with generated command: wav and meta (json)

In [None]:
# Synthesize audio for the merged table of all batches. `df_commands` only
# holds the last generated batch, while commands.csv (read by the noise stage)
# lists every batch and expects a wav for each row.
res = audio_pipeline(list_commands_df)

export_results(res, 'synthesis_folder')

# NOISE Synthesis

## List noise data

In [None]:
!wget -q https://www.openslr.org/resources/17/musan.tar.gz && echo "Downloaded."
!tar -xzf musan.tar.gz > /dev/null 2>&1 && echo "Extracted."


## Pipeline

### Utils

In [None]:
import os

def get_list_samples(folder_path: str = "/content/musan"):
  '''
  Collect the paths of all ".wav" files under folder_path (recursively).

  Args:
    folder_path (str): root folder to walk

  Returns:
    list: full paths, in os.walk order; empty if nothing matches
  '''
  return [
      os.path.join(current_dir, file_name)
      for current_dir, _subdirs, file_names in os.walk(folder_path)
      for file_name in file_names
      if file_name.endswith(".wav")
  ]

In [None]:
import random
import soundfile as sf

def noise_sampling(duration: float, list_noise: list, volume: float =0.3):
    '''
    Cut a random, volume-scaled segment of `duration` seconds out of a random noise file.

    Up to 50 files are tried; a file is rejected when it is shorter than the
    requested duration.

    Args:
        duration (float): length of the wanted segment, in seconds
        list_noise (list): candidate noise file paths
        volume (float): factor the segment is multiplied by

    Returns:
        segment (np.ndarray): the scaled noise segment
        sr (int): sample rate of the chosen file
        noise_path (str): path of the chosen file

    Raises:
        ValueError: if none of the 50 tried files is long enough.
    '''
    max_attempts = 50

    for _ in range(max_attempts):
        noise_path = random.choice(list_noise)
        samples, sr = sf.read(noise_path)

        # requested length in frames at this file's own sample rate
        wanted_frames = int(duration * sr)
        spare_frames = len(samples) - wanted_frames
        if spare_frames < 0:
            continue  # file too short, try another one

        offset = random.randint(0, spare_frames)
        scaled = samples[offset:offset + wanted_frames] * volume
        return scaled, sr, noise_path

    raise ValueError("Không tìm thấy file noise có độ dài lớn hơn yêu cầu.")


### Noise methods
- 3 methods: pad, overlap, middle

In [None]:
import random
import soundfile as sf
import numpy as np
import os
import librosa

def resample_audio(audio: np.ndarray, original_sr: int, target_sr: int):
    """Return `audio` at `target_sr`.

    The input array itself is returned when the two rates already match;
    otherwise the conversion is delegated to librosa.resample.
    """
    if original_sr != target_sr:
        return librosa.resample(audio, orig_sr=original_sr, target_sr=target_sr)
    return audio

def process_noise_segment(position: str, current_time: float, min_noise_len: float, max_noise_len: float, fixed_sr: int, list_noise: list):
    """
    Sample one noise segment, resample it to `fixed_sr` and describe it.

    Args:
        position (str): tag stored as "type" in the returned file record
        current_time (float): timeline position (seconds) where the segment starts
        min_noise_len (float): minimum segment length in seconds
        max_noise_len (float): maximum segment length in seconds
        fixed_sr (int): sample rate of the returned segment
        list_noise (list): candidate noise file paths

    Returns:
        seg: the resampled noise segment
        noise_range (dict): {"start", "end"} in seconds on the output timeline
        noise_file (dict): {"file", "type", "sampling_rate"}
        duration (float): length of `seg` in seconds at `fixed_sr`
    """
    # The bounds are float seconds, so draw from a continuous range;
    # random.randint only accepts integer bounds.
    noise_length_sec = random.uniform(min_noise_len, max_noise_len)
    seg, noise_sr, noise_path = noise_sampling(noise_length_sec, list_noise)
    seg = resample_audio(seg, noise_sr, fixed_sr)

    # measured after resampling, so it matches what is actually concatenated
    duration = len(seg) / fixed_sr
    noise_range = {
        "start": current_time,
        "end": current_time + duration
    }
    noise_file = {
        "file": noise_path,
        "type": position,
        "sampling_rate": fixed_sr
    }

    return seg, noise_range, noise_file, duration

def pad_noise(file_path: str, min_noise_len: float = 2, max_noise_len: float = 10, fixed_sr=16000, list_noise=None):
    '''
    Surround an audio file with noise: before it, after it, or on both sides.

    The side(s) are chosen at random. Everything is resampled to `fixed_sr`.

    Args:
        file_path (str): path to the original audio file
        min_noise_len (float): minimum length of each noise segment in seconds
        max_noise_len (float): maximum length of each noise segment in seconds
        fixed_sr (int): sample rate of the output
        list_noise (list): candidate noise file paths

    Returns:
        audio_out (np.ndarray): noise and original audio concatenated
        sr (int): always `fixed_sr`
        meta_template (dict): label_range, noise_range, noise_file, duration

    Raises:
        ValueError: if `list_noise` is None.
    '''
    if list_noise is None:
        raise ValueError("list_noise must be provided for sampling")

    speech, source_sr = sf.read(file_path)
    speech = resample_audio(speech, source_sr, fixed_sr)

    pad_mode = random.choice(["before", "after", "both"])

    pieces = []
    noise_ranges = []
    noise_files = []
    cursor = 0.0  # running position on the output timeline, in seconds

    def add_noise(position):
        # append one noise segment at the cursor and advance the cursor
        nonlocal cursor
        seg, noise_range, noise_file, seg_duration = process_noise_segment(
            position=position,
            current_time=cursor,
            min_noise_len=min_noise_len,
            max_noise_len=max_noise_len,
            fixed_sr=fixed_sr,
            list_noise=list_noise
        )
        pieces.append(seg)
        noise_ranges.append(noise_range)
        noise_files.append(noise_file)
        cursor += seg_duration

    if pad_mode != "after":
        add_noise("before")

    # the original audio; its span on the timeline is the label
    speech_sec = len(speech) / fixed_sr
    pieces.append(speech)
    label_range = {
        "start": cursor,
        "end": cursor + speech_sec,
        "label": 1
    }
    cursor += speech_sec

    if pad_mode != "before":
        add_noise("after")

    audio_out = np.concatenate(pieces)

    meta_template = {
        "label_range": [label_range],
        "noise_range": noise_ranges,
        "noise_file": noise_files,
        "duration": len(audio_out) / fixed_sr,
    }

    return audio_out, fixed_sr, meta_template


In [None]:
import random
import numpy as np
import soundfile as sf

def overlap_noise(file: str,
                  list_noise=None,
                  noise_volume: float = 0.3,
                  min_overlap_ratio: float = 0.5,
                  max_overlap_ratio: float = 1.5,
                  fixed_sr: int = 16000):
    '''
    Mix a random noise segment over the original audio at a random position.

    The noise may start before or end after the original audio; the audio is
    then zero-padded on that side, so the output can be longer than the input.
    All audio is resampled to fixed_sr and the mix is clipped to [-1, 1].

    Args:
        file (str): Path to the original audio file
        list_noise (list): List of paths to noise audio files
        noise_volume (float): Factor applied to the noise
        min_overlap_ratio (float): Minimum ratio of noise length to original audio length
        max_overlap_ratio (float): Maximum ratio of noise length to original audio length
        fixed_sr (int): Target sample rate for both original and noise audio

    Returns:
        audio_out (np.ndarray): Audio with overlapped noise
        sr (int): Sample rate (equal to fixed_sr)
        meta_template (dict): label_range, noise_range, noise_file, duration;
            all times are seconds on the output timeline

    Raises:
        ValueError: if `list_noise` is None.
    '''
    if list_noise is None:
        raise ValueError("list_noise must be provided for sampling")

    # Load and resample original audio
    audio, sr = sf.read(file)
    audio = resample_audio(audio, sr, fixed_sr)
    sr = fixed_sr

    audio_length = len(audio)
    audio_length_sec = audio_length / sr

    # Randomly determine noise length relative to the original audio
    overlap_ratio = random.uniform(min_overlap_ratio, max_overlap_ratio)
    noise_length_samples = int(audio_length * overlap_ratio)
    noise_length_sec = noise_length_samples / sr

    # Sample and resample noise
    noise_seg, noise_sr, noise_path = noise_sampling(noise_length_sec, list_noise, volume=noise_volume)
    noise_seg = resample_audio(noise_seg, noise_sr, fixed_sr)

    # Resampling can be off by a few samples: trim or zero-pad to the exact length
    if len(noise_seg) > noise_length_samples:
        noise_seg = noise_seg[:noise_length_samples]
    elif len(noise_seg) < noise_length_samples:
        noise_seg = np.concatenate([noise_seg, np.zeros(noise_length_samples - len(noise_seg))])

    # Random start such that the noise shares at least one sample with the audio
    min_start = -noise_length_samples + 1
    max_start = audio_length - 1
    start_sample = random.randint(min_start, max_start)
    end_sample = start_sample + noise_length_samples

    # Pad audio if the noise sticks out on either side
    padded_audio = audio
    label_offset_samples = 0  # how far the original audio is shifted by front padding
    if start_sample < 0:
        label_offset_samples = -start_sample
        padded_audio = np.concatenate([np.zeros(label_offset_samples), padded_audio])
        start_sample = 0
        end_sample = start_sample + noise_length_samples

    if end_sample > len(padded_audio):
        pad_after = end_sample - len(padded_audio)
        padded_audio = np.concatenate([padded_audio, np.zeros(pad_after)])

    # Mix noise
    audio_out = padded_audio.copy()
    audio_out[start_sample:end_sample] += noise_seg
    audio_out = np.clip(audio_out, -1.0, 1.0)

    # Metadata: the label must follow the audio when silence was prepended
    label_start_sec = label_offset_samples / sr
    meta_template = {
        "label_range": [{
            "start": label_start_sec,
            "end": label_start_sec + audio_length_sec,
            "label": 1
        }],
        "noise_range": [{
            "start": start_sample / sr,
            "end": end_sample / sr
        }],
        "noise_file": [{
            "file": noise_path,
            "type": "overlap"
        }],
        "duration": len(audio_out) / sr
    }

    return audio_out, sr, meta_template


In [None]:
import random
import soundfile as sf
import numpy as np

import random
import numpy as np
import soundfile as sf

def middle_noise(files,
                 list_noise=None,
                 noise_volume: float = 0.3,
                 fixed_sr: int = 16000):
    '''
    Concatenate audio files with a noise segment between each consecutive pair.

    Each noise segment is 0.5 - 2.0 seconds long; up to half of it is mixed
    over the tail of the preceding audio and the rest is inserted after it.
    The overlap never exceeds the length of the preceding audio, so the
    timeline in the metadata always matches the produced samples.
    All audio is resampled to fixed_sr.

    Args:
        files (list of str): Paths of the audio files, in playback order
        list_noise (list of str): Paths of candidate noise files
        noise_volume (float): Factor applied to the noise
        fixed_sr (int): Target sample rate for all audio

    Returns:
        audio_out (np.ndarray): Final audio with inserted noise
        sr (int): Sample rate (equal to fixed_sr)
        meta_template (dict): label_range, noise_range, noise_file, duration

    Raises:
        ValueError: if `list_noise` or `files` is empty.
    '''
    if not list_noise:
        raise ValueError("list_noise must be provided")
    if len(files) == 0:
        raise ValueError("files must contain at least one audio path")

    total_audio = []
    label_ranges = []
    noise_ranges = []
    noise_files = []

    current_time = 0.0  # in seconds
    last_index = len(files) - 1

    for idx, file_path in enumerate(files):
        # Load and resample original audio
        audio, source_sr = sf.read(file_path)
        audio = resample_audio(audio, source_sr, fixed_sr)
        audio_length_sec = len(audio) / fixed_sr

        # Label range for this segment
        label_ranges.append({
            "start": current_time,
            "end": current_time + audio_length_sec,
            "label": 1
        })

        total_audio.append(audio)
        current_time += audio_length_sec

        # Noise goes only between segments, not after the last one
        if idx == last_index:
            continue

        noise_length_sec = random.uniform(0.5, 2.0)
        noise_seg, noise_sr, noise_path = noise_sampling(noise_length_sec, list_noise, volume=noise_volume)
        noise_seg = resample_audio(noise_seg, noise_sr, fixed_sr)

        # Part of the noise is mixed over the tail of the audio just appended.
        # Clamp to the audio length instead of zero-padding the audio, which
        # would lengthen it without moving the label/noise timestamps.
        overlap_ratio = random.uniform(0.0, 0.5)
        overlap_samples = min(int(overlap_ratio * len(noise_seg)), len(audio))

        if overlap_samples > 0:
            audio[-overlap_samples:] += noise_seg[:overlap_samples]
            total_audio[-1] = np.clip(audio, -1.0, 1.0)

        # The non-overlapping remainder is inserted after the audio
        total_audio.append(noise_seg[overlap_samples:])

        # The noise starts inside the previous audio by the overlap amount
        noise_start_sec = current_time - (overlap_samples / fixed_sr)
        noise_end_sec = noise_start_sec + (len(noise_seg) / fixed_sr)

        noise_ranges.append({
            "start": noise_start_sec,
            "end": noise_end_sec
        })
        noise_files.append({
            "file": noise_path,
            "type": "middle"
        })

        current_time = noise_end_sec

    # Final concatenated output
    audio_out = np.concatenate(total_audio)
    audio_out = np.clip(audio_out, -1.0, 1.0)

    meta_template = {
        "label_range": label_ranges,
        "noise_range": noise_ranges,
        "noise_file": noise_files,
        "duration": len(audio_out) / fixed_sr
    }

    return audio_out, fixed_sr, meta_template

In [None]:
test = pd.read_csv('commands.csv')

### Flow

In [None]:
import os
import json
import random
import pandas as pd
import soundfile as sf
import ast

def data_synthesis(commands_path: str = '/content/commands.csv',
                   export_folder: str = 'content/data_synthesis_with_noise',
                   command_audio_path: str = '/content/synthesis_folder/audio',
                   noise_folder: str = '/content/musan') -> pd.DataFrame:
    """Add noise to every synthesized command and write audio plus metadata.

    Each row of the commands CSV gets one variation chosen at random between
    'pad' and 'overlap', built from `<command_audio_path>/<id>.wav`. Chain rows
    additionally get a 'middle' variation built from their `<id>_sub<i>.wav`
    files. Outputs go to `<export_folder>/audio/<id>_<variation>.wav` and
    `<export_folder>/meta/<id>_<variation>.json`.

    Note that the default `export_folder` has no leading slash, so it is
    resolved relative to the current working directory.

    Args:
        commands_path: CSV with at least the columns id, type, sub_command.
        export_folder: root output folder.
        command_audio_path: folder holding the synthesized command wavs.
        noise_folder: folder searched recursively for noise wavs.

    Returns:
        pd.DataFrame: one row per written variation with columns
        id, type, variation, audio_path, meta_path.
    """
    list_noise = get_list_samples(noise_folder)

    audio_folder = os.path.join(export_folder, 'audio')
    meta_folder = os.path.join(export_folder, 'meta')
    for folder in (audio_folder, meta_folder):
        os.makedirs(folder, exist_ok=True)

    records = []

    def store(row, variation, audio, sr, meta):
        # write one variation (wav + json) and remember where it went
        name = f"{row['id']}_{variation}"
        audio_path = os.path.join(audio_folder, f"{name}.wav")
        meta_path = os.path.join(meta_folder, f"{name}.json")

        sf.write(audio_path, audio, sr)
        with open(meta_path, 'w') as f:
            json.dump(meta, f, indent=2)

        records.append({
            "id": row["id"],
            "type": row["type"],
            "variation": variation,
            "audio_path": audio_path,
            "meta_path": meta_path
        })

    noise_methods = {'pad': pad_noise, 'overlap': overlap_noise}

    for idx, row in pd.read_csv(commands_path).iterrows():
        # every row: pad or overlap on the full command audio
        source_wav = os.path.join(command_audio_path, row['id']) + '.wav'
        selected = random.choice(['pad', 'overlap'])
        audio, sr, meta = noise_methods[selected](source_wav, list_noise = list_noise)
        store(row, selected, audio, sr, meta)

        if row['type'] != 'chain':
            continue

        # chain rows: the CSV stores sub_command as text, so parse it back to
        # count the sub-commands and derive their wav names
        try:
            sub_count = len(ast.literal_eval(row['sub_command']))
        except Exception as e:
            print(f"Error parsing sub_command at row {idx}: {e}")
            continue

        sub_wavs = [
            os.path.join(command_audio_path, f"{row['id']}_sub{i}" + '.wav')
            for i in range(sub_count)
        ]
        audio, sr, meta = middle_noise(sub_wavs, list_noise = list_noise)
        store(row, "middle", audio, sr, meta)

    return pd.DataFrame(records)


## Main

In [None]:
synthesis_df = data_synthesis()

In [None]:
synthesis_df

In [None]:
!zip -r synthesis_data_with_noise.zip /content/content/data_synthesis_with_noise
!zip -r synthesis_folder /content/synthesis_folder

In [None]:
# TODO, add noise as SNR