In [1]:
import sys
import os
import json
import random

In [2]:
# Name of the training set being assembled and the directory that will hold
# every file generated for it.
trainset_name = "trainset3_for_verification"
train_dir = "data/trainsets/{}".format(trainset_name)

# Source dataset directories grouped by class. Later cells label every
# utterance from the "pos" directories as "positive" and every utterance from
# the "nev" directories as "negative". "note" is free-form text that is stored
# alongside the lists when this mapping is dumped to info.txt.
trainset_list = dict(
    pos=[
        "data/pos/pos3s_train_ftel_org",
        "data/pos/pos3s_train_ftel_clone_cleanenv",
        "data/pos/pos3s_train_ftel_clone_musicV30",
    ],
    nev=[
        "data/sil/sil_from_dev_16K_3s",
        "data/fa/fa_ftel_isolate",
        "data/fa/fa_nami_1",
        "data/fa/fa_ftel_clone_cleanenv",
        "data/fa/fa_ftel_clone_musicV30",
    ],
    note="this data is for wuw verification task",
)

In [3]:
# Create the output directory for this training set. ``exist_ok=True`` already
# makes the call a no-op when the directory is present, so a separate
# ``os.path.exists`` check is redundant (and would be a check-then-act race).
os.makedirs(train_dir, exist_ok=True)

# Record which source datasets make up this training set next to the
# generated files, as pretty-printed JSON.
with open(train_dir + "/info.txt", "wt") as fp:
    json.dump(trainset_list, fp, indent=2)

In [4]:
def load_dataset(pos_set_dir):
    """Collect per-utterance metadata from one dataset directory.

    Three whitespace-separated text files are read from ``pos_set_dir``, in
    this order:

    * ``uttid``     -- one utterance id per line. Lines that do not consist
      of exactly one token are ignored.
    * ``wav.scp``   -- ``<uttid> <wav path>``. Only lines with exactly two
      tokens are used.
    * ``calib.txt`` -- ``<uttid> <value> [...]``. The second token is parsed
      with ``float``; any further tokens are ignored.

    Ids that appear in ``wav.scp`` or ``calib.txt`` but not in ``uttid`` are
    dropped.

    Args:
        pos_set_dir: Path of the dataset directory. The function itself does
            not depend on whether the set is positive or negative.

    Returns:
        dict mapping each utterance id to a dict that may contain ``"wavfn"``
        (str) and/or ``"calib"`` (float). A key is absent when the matching
        file had no usable line for that id.

    Raises:
        OSError: if one of the three files cannot be opened.
        ValueError: if a used ``calib.txt`` value is not a valid float.
    """

    def _read_rows(basename):
        # Tokenise every line of one metadata file inside the dataset dir.
        with open(pos_set_dir + "/" + basename, "rt") as handle:
            return [raw.strip().split() for raw in handle]

    # The id list defines which utterances exist at all.
    entries = {}
    for fields in _read_rows("uttid"):
        if len(fields) == 1:
            entries[fields[0]] = {}

    for fields in _read_rows("wav.scp"):
        if len(fields) != 2:
            continue
        utt_id, wav_path = fields
        if utt_id in entries:
            entries[utt_id]["wavfn"] = wav_path

    for fields in _read_rows("calib.txt"):
        if len(fields) < 2:
            continue
        utt_id = fields[0]
        if utt_id in entries:
            entries[utt_id]["calib"] = float(fields[1])

    return entries

In [5]:
# Write one "<label> <calib> <wav path>" line per usable utterance. All
# positive source directories are written first, then all negative ones; the
# two classes share a single loop instead of two copy-pasted ones.
with open(train_dir + "/label_time_wav.txt", "wt") as out_fp:
    for label, group in (("positive", "pos"), ("negative", "nev")):
        for set_dir in trainset_list[group]:
            for info in load_dataset(set_dir).values():
                # Utterances missing either the wav path or the calib value
                # cannot form a complete line and are left out.
                if "wavfn" in info and "calib" in info:
                    out_fp.write(
                        "{} {:.3f} {}\n".format(label, info["calib"], info["wavfn"])
                    )

In [6]:
def split_train_dev(fn, fn_train, fn_dev, dev_per, seed=None):
    """Split a labelled utterance list into train and dev files, per class.

    ``fn`` is read line by line. A line is used when it has exactly three
    whitespace-separated tokens and the first one is ``positive`` or
    ``negative``. Positives and negatives are shuffled and split
    independently, so both output files keep roughly the input class ratio.

    Args:
        fn: Path of the input list.
        fn_train: Path of the training list to write.
        fn_dev: Path of the dev (validation) list to write.
        dev_per: Percentage (0-100) of each class that goes to the dev file.
            The per-class dev count is ``int(n * dev_per / 100)``, i.e.
            truncated.
        seed: Optional seed. When given, shuffling uses a private
            ``random.Random(seed)`` so the split is reproducible and the
            global random state is untouched. When ``None`` (default) the
            module-level ``random`` generator is used, as before.

    Raises:
        ValueError: if ``dev_per`` is outside ``[0, 100]``. Such values would
            otherwise yield negative or oversized slice bounds and a
            meaningless split.
    """
    if not 0 <= dev_per <= 100:
        raise ValueError(
            "dev_per must be between 0 and 100, got {!r}".format(dev_per)
        )

    data_info = {"pos": [], "nev": []}
    n_malformed = 0
    with open(fn, "rt") as fp:
        for line in fp:
            toks = line.strip().split()
            if len(toks) == 3:
                if toks[0] == "positive":
                    data_info["pos"].append(" ".join(toks))
                elif toks[0] == "negative":
                    data_info["nev"].append(" ".join(toks))
                else:
                    print("ERROR " + line)
            elif toks:
                # Non-blank line with the wrong token count (for instance a
                # wav path containing spaces): excluded from both outputs.
                n_malformed += 1

    if n_malformed:
        # Report instead of dropping samples silently.
        print("Skipped malformed lines: ", n_malformed)

    npos = len(data_info["pos"])
    nnev = len(data_info["nev"])

    # Shuffle each class on its own so the dev slice is a random sample.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(data_info["pos"])
    rng.shuffle(data_info["nev"])

    print("Number of positive samples: ", npos)
    print("Number of negative samples: ", nnev)
    print("=============")
    npos_dev = int(npos * dev_per / 100)
    nnev_dev = int(nnev * dev_per / 100)
    print("Number of positive samples in validate set: ", npos_dev)
    print("Number of negative samples in validate set: ", nnev_dev)
    print("Number of positive samples in train set: ", npos - npos_dev)
    print("Number of negative samples in train set: ", nnev - nnev_dev)

    # The first *_dev shuffled entries of each class form the dev set ...
    with open(fn_dev, "wt") as fp:
        for line in data_info["pos"][0:npos_dev]:
            fp.write(line + "\n")
        for line in data_info["nev"][0:nnev_dev]:
            fp.write(line + "\n")

    # ... and the remainder forms the training set.
    with open(fn_train, "wt") as fp:
        for line in data_info["pos"][npos_dev:]:
            fp.write(line + "\n")
        for line in data_info["nev"][nnev_dev:]:
            fp.write(line + "\n")


# Hold out 20% of each class as the dev list ("_20") and keep the remaining
# 80% as the training list ("_80").
split_train_dev(
    fn=train_dir + "/label_time_wav.txt",
    fn_train=train_dir + "/label_time_wav_80.txt",
    fn_dev=train_dir + "/label_time_wav_20.txt",
    dev_per=20,
)

Number of positive samples:  14664
Number of negative samples:  2866
Number of positive samples in validate set:  2932
Number of negative samples in validate set:  573
Number of positive samples in train set:  11732
Number of negative samples in train set:  2293
