In [1]:
# Directory holding the metadata files read by load_data_info
# (uttid, calib.txt, labels, wav.scp); the balanced list is written here too.
train_dir = "data/train3s_org_fa_sil_80"

In [2]:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import random
import copy

In [3]:
def _attach_field(data_info, path, field, convert=str):
    """Read a "<uttid> <value> ..." file and store the value on known utterances.

    Args:
        data_info: dict keyed by utterance id; updated in place.
        path: file to read, one whitespace-separated record per line.
        field: key under which the second token is stored.
        convert: callable applied to the second token before storing it.
    """
    with open(path, "rt") as fp:
        for line in fp:
            toks = line.strip().split()
            # Lines with fewer than two tokens carry no value; skip them.
            if len(toks) < 2:
                continue
            if toks[0] in data_info:
                data_info[toks[0]][field] = convert(toks[1])
            else:
                # The id is not listed in the uttid file: report it, keep going.
                print("Missing ", toks)


def load_data_info(data_dir):
    """Load per-utterance metadata from the files found in ``data_dir``.

    The utterance ids come from ``uttid`` (one id per non-empty line). For each
    id the second token of the matching line is then read from ``calib.txt``
    (stored as float under "calib_time"), ``labels`` (stored under "label")
    and ``wav.scp`` (stored under "wavfn"). Records whose id is not in
    ``uttid`` are reported with a "Missing" print and ignored.

    Args:
        data_dir: directory containing the four files.

    Returns:
        dict mapping utterance id -> dict of the fields found for that id.
        An id that is absent from one of the files simply lacks that key.
    """
    data_info = {}
    with open(data_dir + "/uttid", "rt") as fp:
        for line in fp:
            uttid = line.strip()
            if len(uttid) > 0:
                data_info[uttid] = {}

    _attach_field(data_info, data_dir + "/calib.txt", "calib_time", float)
    _attach_field(data_info, data_dir + "/labels", "label")
    _attach_field(data_info, data_dir + "/wav.scp", "wavfn")

    return data_info

In [4]:
# Read the training-set metadata into a dict keyed by utterance id.
data_info = load_data_info(train_dir)

In [5]:
def compute_labels_stats(data_info):
    """Count how many utterances carry each label.

    Args:
        data_info: dict mapping utterance id -> dict with a "label" entry.

    Returns:
        dict mapping label -> number of utterances with that label, with keys
        in order of first appearance.

    Raises:
        KeyError: if an utterance has no "label" entry.
    """
    counts = {}
    for entry in data_info.values():
        tag = entry["label"]
        counts[tag] = counts.get(tag, 0) + 1
    return counts

In [6]:
# Per-label utterance counts of the full set; the "positive" count sizes the
# balanced subset built further down.
labels_stats = compute_labels_stats(data_info)
print(labels_stats)

{'negative': 19009, 'positive': 3891}


In [7]:
# Build a class-balanced training list: keep every positive utterance and add
# the same number of negatives, drawn from two sources:
#   * NEG_REAL_PERCENT % are real negative utterances picked at random;
#   * the remainder are positives re-labelled "negative" with their calib_time
#     moved away from its original value.
NEG_REAL_PERCENT = 70  # share of the negatives taken from real negative utterances
SAMPLE_RATE = 16000  # divisor converting the jitter from samples to seconds
SHIFT_LATER = 0.9  # seconds added to calib_time for a "late" synthetic negative
SHIFT_EARLIER = -0.6  # seconds added to calib_time for an "early" synthetic negative
BALANCE_SEED = 0  # fixed seed so that Restart & Run All yields the same subset

# Private generator: reproducible without touching the global `random` state.
rng = random.Random(BALANCE_SEED)

out_data_info = {}
# copy positive samples
for uttid in data_info.keys():
    if data_info[uttid]["label"] == "positive":
        out_data_info[uttid] = data_info[uttid]

# copy real negatives (NEG_REAL_PERCENT % of the positive count), in random order
uttids = list(data_info.keys())
rng.shuffle(uttids)
N = labels_stats["positive"] * NEG_REAL_PERCENT // 100
n = 0
for uttid in uttids:
    # Test the quota before copying so that a quota of 0 copies nothing.
    if n >= N:
        break
    if data_info[uttid]["label"] == "negative":
        out_data_info[uttid] = data_info[uttid]
        n += 1
if n < N:
    # Not enough real negatives: the result will not be balanced.
    print("Warning: only {} of {} real negatives available".format(n, N))

# generate the remaining negatives from positives by shifting calib_time
N = labels_stats["positive"] - N
n = 0
dt_samples = 800  # max jitter in samples: 800 / 16000 = 50 ms
for uttid in uttids:
    if n >= N:
        break
    if data_info[uttid]["label"] == "positive":
        new_uttid = "nev_" + uttid
        # Deep copy so that editing the synthetic entry leaves the positive intact.
        entry = copy.deepcopy(data_info[uttid])
        entry["label"] = "negative"
        offset = rng.randint(-dt_samples, dt_samples) / SAMPLE_RATE
        # Pick the late or the early shift with equal probability.
        shift = SHIFT_LATER if rng.randint(0, 1) == 1 else SHIFT_EARLIER
        # NOTE(review): the shifted calib_time is not clamped; it can become
        # negative for small calib_time values -- confirm the consumer accepts that.
        entry["calib_time"] += offset + shift
        out_data_info[new_uttid] = entry
        n += 1

In [8]:
# Sanity check: per-label utterance counts of the balanced subset.
labels_stats1 = compute_labels_stats(out_data_info)
print(labels_stats1)

{'positive': 3891, 'negative': 3891}


In [9]:
def export_label_time_wav(dbinfo, label_time_wav_fn):
    """Write one "<label> <calib_time> <wavfn>" line per utterance.

    Args:
        dbinfo: dict mapping utterance id -> dict with "label" (str),
            "calib_time" (number) and "wavfn" (str) entries.
        label_time_wav_fn: path of the text file to create or overwrite.

    The calib_time is written with three decimals. Utterance ids themselves
    are not written; lines follow the iteration order of ``dbinfo``.
    """
    with open(label_time_wav_fn, "wt") as out:
        for _, rec in dbinfo.items():
            # Evaluate the fields in the same order as they are written.
            tag = rec["label"]
            stamp = "{:.3f}".format(rec["calib_time"])
            wav = rec["wavfn"]
            out.write(" ".join([tag, stamp, wav]) + "\n")

In [10]:
# Write the balanced subset next to the source metadata files.
export_label_time_wav(out_data_info, train_dir + "/label_time_wav_balanced.txt")