<a href="https://colab.research.google.com/github/10udCryp7/TV-command-synthesis/blob/main/notebooks/DDSS_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the dataset archive from Google Drive by file ID (needs the `gdown` CLI).
# The recorded cell output shows it being saved as sample-5000-concat.zip.
!gdown 1ZqgxOVKR14VDqwY19GAFgQiuVi0qs-N8

Downloading...
From (original): https://drive.google.com/uc?id=1ZqgxOVKR14VDqwY19GAFgQiuVi0qs-N8
From (redirected): https://drive.google.com/uc?id=1ZqgxOVKR14VDqwY19GAFgQiuVi0qs-N8&confirm=t&uuid=4bfa4766-9679-4822-9bdd-d5a30a206e66
To: /kaggle/working/sample-5000-concat.zip
100%|████████████████████████████████████████| 419M/419M [00:14<00:00, 29.7MB/s]


In [None]:
!unzip sample-5000-concat.zip

In [None]:
import os
import json
import pandas as pd

def collect_json_to_df(root_dir: str, flatten: bool = True) -> pd.DataFrame:
    """
    Collect every ``*.json`` annotation file under ``root_dir`` into one DataFrame.

    Each JSON file is paired with an audio file expected to live in the same
    folder and to be named ``<folder_name>_concat.wav``. The audio path is only
    constructed here; its existence is not checked.

    Args:
        root_dir (str): root directory that is walked recursively.
        flatten (bool): if True, every entry of a JSON file becomes its own row
                        with columns ``id, audio_path, label, start, end``.
                        if False, each JSON file becomes one row with columns
                        ``id, audio_path, labels`` where ``labels`` holds the
                        parsed JSON content unchanged.

    Returns:
        pd.DataFrame: rows are ordered by sorted directory / file name so the
        result is reproducible across runs and machines. The expected columns
        are always present, even when no JSON file is found.
    """
    columns = (
        ["id", "audio_path", "label", "start", "end"]
        if flatten
        else ["id", "audio_path", "labels"]
    )
    records = []

    for dirpath, dirnames, filenames in os.walk(root_dir):
        # os.walk yields entries in arbitrary filesystem order; sorting `dirnames`
        # in place (top-down walk) and the file names makes the row order stable.
        dirnames.sort()

        folder_name = os.path.basename(dirpath)
        # the audio file sits next to the JSON and is named after the folder
        audio_path = os.path.join(dirpath, folder_name + "_concat" + ".wav")

        for file in sorted(filenames):
            if not file.endswith(".json"):
                continue

            json_path = os.path.join(dirpath, file)
            with open(json_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            if flatten:
                # assumes the JSON root is a list of dicts — TODO confirm for all files
                for entry in data:
                    records.append({
                        "id": folder_name,
                        "audio_path": audio_path,
                        "label": entry.get("label"),
                        "start": entry.get("start"),
                        "end": entry.get("end")
                    })
            else:
                records.append({
                    "id": folder_name,
                    "audio_path": audio_path,
                    "labels": data
                })

    # Passing `columns` keeps the schema intact for an empty result, so callers
    # that index df["id"] do not fail with a KeyError on an empty/missing directory.
    return pd.DataFrame(records, columns=columns)


In [None]:
# Build the flattened annotation table (one row per labelled segment; flatten defaults to True).
# NOTE(review): hard-coded Kaggle path — presumably the folder produced by unzipping the
# archive; adjust it when running elsewhere (the notebook badge points at Colab).
df = collect_json_to_df("/kaggle/working/concat_speech")

In [None]:
import torch
from torch.utils.data import Dataset
import torchaudio

def frame_wav(wav, frame_size, hop_size):
    """
    Slice a waveform into overlapping fixed-length frames.

    Args:
        wav: waveform tensor of shape (N,), (1, N) or (C, N). Multi-channel
             input is averaged over the channel axis to mono before framing
             (assumes a floating-point tensor in that case).
        frame_size: number of samples per frame.
        hop_size: number of samples between the starts of consecutive frames.

    Returns:
        Tensor of shape (num_frames, frame_size) with
        num_frames = (N - frame_size) // hop_size + 1. Trailing samples that do
        not fill a whole frame are dropped.
    """
    wav = wav.squeeze(0)  # (1, N) -> (N,)
    if wav.dim() > 1:
        # (C, N) with C > 1: squeeze(0) is a no-op here, and unfolding along dim 0
        # would then run over channels instead of time. Downmix to mono first.
        wav = wav.mean(dim=0)
    return wav.unfold(0, frame_size, hop_size)  # (num_frames, frame_size)

def label_frames(num_frames, frame_size, hop_size, sr, annotations):
    """
    Build a frame-level 0/1 label sequence from time-stamped annotations.

    Frame ``i`` covers samples ``[i * hop_size, i * hop_size + frame_size)``.
    A frame is labelled 1 when it overlaps at least one sample of a segment
    whose ``label`` is ``'active'``; every other frame stays 0.

    Args:
        num_frames: number of frames to label.
        frame_size: samples per frame.
        hop_size: samples between consecutive frame starts.
        sr: sample rate used to convert ``start`` / ``end`` (seconds) to samples.
        annotations: DataFrame with columns ``label``, ``start`` and ``end``.

    Returns:
        torch.LongTensor of shape (num_frames,).
    """
    labels = torch.zeros(num_frames, dtype=torch.long)  # default = 0 (inactive)

    for _, row in annotations.iterrows():
        # Only 'active' segments contribute; skip the others before touching their
        # timestamps so a missing/NaN time on an ignored row cannot raise.
        if row['label'] != 'active':
            continue

        start_sample = int(row['start'] * sr)
        end_sample = int(row['end'] * sr)  # exclusive end of the segment
        if end_sample <= start_sample:
            continue  # empty or inverted segment covers no sample

        # first frame whose window ends after start_sample
        first_frame = max(0, (start_sample - frame_size) // hop_size + 1)
        # last overlapping frame starts at or before the last annotated sample
        # (end_sample - 1); a frame starting exactly at end_sample does not overlap.
        stop_frame = min(num_frames, (end_sample - 1) // hop_size + 1)

        labels[first_frame:stop_frame] = 1

    return labels

class DDSSDataset(Dataset):
    """
    Dataset yielding one audio recording per item together with its frames and
    frame-level labels.

    ``annotations_file`` is a DataFrame with (at least) the columns ``id`` and
    ``audio_path``, plus the ``label`` / ``start`` / ``end`` columns consumed by
    ``label_frames``. One item corresponds to one unique ``id``.

    ``transform`` is stored on the instance but is not applied anywhere in this class.
    """

    def __init__(self, annotations_file, frame_size=400, hop_size=160, target_sr=16000, transform=None):
        self.annotations_file = annotations_file
        # unique recording ids, in order of first appearance
        self.list_id = annotations_file['id'].unique()
        self.frame_size, self.hop_size = frame_size, hop_size
        self.target_sr = target_sr
        self.transform = transform

    def __len__(self):
        """Number of distinct recordings."""
        return len(self.list_id)

    def __getitem__(self, idx):
        """Return ``(wav, frames, labels)`` for the ``idx``-th recording id."""
        recording_id = self.list_id[idx]
        is_current = self.annotations_file['id'] == recording_id
        segments = self.annotations_file[is_current]

        # every row of a recording carries the same audio path; take the first
        wav, sr = torchaudio.load(segments['audio_path'].iloc[0])

        # bring the audio to the target sample rate when it differs
        if sr != self.target_sr:
            wav = torchaudio.transforms.Resample(sr, self.target_sr)(wav)
            sr = self.target_sr

        frames = frame_wav(wav, self.frame_size, self.hop_size)
        labels = label_frames(frames.shape[0], self.frame_size, self.hop_size, sr, segments)

        return wav, frames, labels

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_wav_labels(wav, sr=16000, labels=None, frame_size=400, hop_size=160):
    """
    Plot a waveform with its frame-level labels overlaid on a second y-axis.

    The parameter order is unchanged; the parameters after ``sr`` now carry
    defaults, because a parameter without a default may not follow one that has
    a default (the previous signature was a SyntaxError).

    Args:
        wav: tensor of shape (1, N) or (N,).
        sr: sample rate in Hz.
        labels: tensor of shape (num_frames,) with 0/1 values. Required.
        frame_size: framing parameter; currently not used by the plot and kept
                    for interface compatibility.
        hop_size: samples between consecutive frame starts; used to place each
                  label on the time axis.

    Raises:
        ValueError: if ``labels`` is not provided.
    """
    if labels is None:
        raise ValueError("labels must be provided")

    if wav.ndim > 1:
        wav = wav.squeeze(0)
    wav = wav.numpy()

    # time axis for the waveform (seconds)
    time_axis = np.arange(len(wav)) / sr

    # time axis for the labels: start time of each frame (by frame index)
    frame_times = np.arange(len(labels)) * hop_size / sr

    fig, ax = plt.subplots(figsize=(12, 4))

    # raw waveform
    ax.plot(time_axis, wav, color="blue", alpha=0.6, label="Waveform")
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Amplitude", color="blue")
    ax.tick_params(axis="y", labelcolor="blue")

    # labels as a step plot on a twin y-axis
    ax2 = ax.twinx()
    ax2.step(frame_times, labels.numpy(), where="mid", color="red", label="Labels (0/1)")
    ax2.set_ylabel("Label", color="red")
    ax2.set_ylim(-0.1, 1.1)
    ax2.tick_params(axis="y", labelcolor="red")

    plt.title("Waveform vs Frame-level Labels")
    plt.show()


In [None]:
import torch
import torch.nn.functional as F

def convert_labels(labels, new_num_frames):
    """
    Resample a frame-level label sequence to a different number of frames.

    Uses nearest-neighbour interpolation, so the output contains only values
    that occur in the input (no blending between 0 and 1).

    Args:
        labels: tensor of shape (old_num_frames,) with values 0/1.
        new_num_frames: desired number of frames.

    Returns:
        Long tensor of shape (new_num_frames,).
    """
    # F.interpolate expects (batch, channels, length)
    batched = labels.float().unsqueeze(0).unsqueeze(0)  # (1, 1, old_num_frames)

    resized = F.interpolate(batched, size=new_num_frames, mode="nearest")

    # Drop only the two leading singleton dims. A bare .squeeze() would also
    # collapse the frame axis when new_num_frames == 1 and return a 0-dim tensor.
    return resized[0, 0].long()  # (new_num_frames,)
