In [16]:
from espnet2.asr.frontend.default import DefaultFrontend
from preprocess import F0DataNormalizer
from espnet2.train.dataset import ESPnetDataset
from model import F0EstimationModelCNN, F0EstimationModelLSTM, F0EstimationModelCBS
from dio import DioProcessor
import yaml
import torch
import tqdm
import math
import pandas as pd

In [17]:
# Training condition to evaluate; it names both the experiment directory and its config file.
condition_name = "lstm_160"
train_exp_dir = "exp/exp_" + condition_name + "/"
# Both artifacts live directly inside the experiment directory.
train_config_file_path = f"{train_exp_dir}config_{condition_name}.yaml"
train_model_file_path = f"{train_exp_dir}model_test.pth"

In [18]:
# Load the training configuration and rebuild the trained model from it.
# The config file is opened in a `with` block so the handle is closed right away.
with open(train_config_file_path) as config_file:
    train_cfg = yaml.safe_load(config_file)
frontend = DefaultFrontend(**train_cfg["frontend"])
f0data_normalizer = F0DataNormalizer(train_cfg["train_f0data_stats_filename"])

input_size = train_cfg["frontend"]["n_mels"]
# Resolve the model class through an explicit table instead of eval(): the
# config value is external text, and eval() would execute whatever it contains.
# Keys are the class-name suffixes that the config's "model_type" selects.
model_classes = {
    "CNN": F0EstimationModelCNN,
    "LSTM": F0EstimationModelLSTM,
    "CBS": F0EstimationModelCBS,
}
model_type = train_cfg["model_type"]
if model_type not in model_classes:
    raise ValueError(
        f"unknown model_type {model_type!r}; expected one of {sorted(model_classes)}"
    )
model_cls = model_classes[model_type]
model = model_cls(input_size=input_size, **train_cfg["model_options"])
# Weights are mapped to the CPU; the call is left as the last expression so the
# notebook still displays the key-matching report.
model.load_state_dict(torch.load(train_model_file_path, map_location=torch.device('cpu')))

<All keys matched successfully>

In [19]:
# Initialize the F0 extractor (DioProcessor) with the "dio" options from the training config
dio_processor = DioProcessor(**train_cfg["dio"])

In [20]:
# Load the evaluation dataset.
# The single entry is a (path, name, type) triple: the eval2 wav.scp is read as
# "sound" data and exposed under the key "wav".
eval_wav_spec = ("./dump/raw/eval2/wav.scp", "wav", "sound")
eval_dataset = ESPnetDataset([eval_wav_spec])

In [21]:
# Number of evaluation items, taken as the size of the dataset's "wav" loader
eval_dataset_len = len(eval_dataset.loader_dict["wav"])

In [22]:
# Take the first utterance as a sample. The item is fetched once and unpacked
# into its name and data dict; indexing the dataset twice would read it twice.
data_name, first_item = eval_dataset[0]
data_wav = first_item["wav"]

In [23]:
def _masked_mse(pred, target, mask):
    """Return the mean squared error of ``pred`` vs. ``target`` over ``mask``.

    Gives ``0.0`` (always a float) when the mask selects no frame, so every MSE
    entry of the result has the same type.
    """
    n_frames = mask.sum()
    if n_frames == 0:
        return 0.0
    return float(((pred[mask] - target[mask]) ** 2).sum() / n_frames)


def evaluate(model, data_name, data_wav):
    """Score the model's F0 / dF0 / voicing predictions for one utterance.

    The reference is produced by the module-level ``dio_processor``. The
    prediction is produced by the module-level ``frontend`` followed by
    ``model`` and mapped back with ``f0data_normalizer.denormalize``.

    Args:
        model: Callable taking ``(feats, feat_lengths)`` and returning
            ``(f0, df0, vuv, out_lengths)``; it must also provide ``eval()``.
        data_name: Utterance ID, copied into the result as ``utt_id``.
        data_wav: Waveform as a NumPy array (it is passed to
            ``torch.from_numpy``) -- presumably 1-D mono samples; confirm.

    Returns:
        dict with ``utt_id``, the frame counts ``TP``/``FP``/``TN``/``FN`` of
        the voiced/unvoiced decision, and the mean squared errors of F0 and dF0
        over the TP frames, the FN frames and their union (``MSE_*`` keys).
        An MSE over an empty frame set is reported as ``0.0``.
    """
    # Build the targets: column 2 of the reference is used as F0 and column 1
    # as dF0 (presumably delta-F0 -- confirm); a frame counts as voiced when
    # its reference F0 is positive.
    f0data = dio_processor(data_wav)
    tgt_vuv = torch.tensor(f0data[:, 2] > 0).int()
    tgt_tensor = torch.tensor(f0data, dtype=torch.float32).unsqueeze(0)
    tgt_f0 = tgt_tensor[0, :, 2]
    tgt_df0 = tgt_tensor[0, :, 1]

    # Run the model on a batch of one utterance.
    wav_tensor = torch.from_numpy(data_wav).unsqueeze(0)
    wav_lengths = torch.tensor([wav_tensor.shape[1]])

    with torch.no_grad():
        frontend.eval()
        model.eval()
        feats, feat_lengths = frontend(wav_tensor, wav_lengths)
        f0, df0, vuv, out_lengths = model(feats, feat_lengths)
        # The normalizer works on 3-column data, so F0 is placed in columns 0
        # and 2 and dF0 in column 1; only columns 1 and 2 are read back.
        pred_tensor = torch.stack([f0, df0, f0], dim=-1)
        pred_tensor = f0data_normalizer.denormalize(pred_tensor)
        f0 = pred_tensor[:, :, 2]
        df0 = pred_tensor[:, :, 1]

    # Compare only the frames present in both target and prediction.
    length = min(len(tgt_f0), int(out_lengths[0]))

    tgt_vuv = tgt_vuv[:length].detach().cpu().numpy()
    tgt_f0 = tgt_f0[:length].detach().cpu().numpy()
    tgt_df0 = tgt_df0[:length].detach().cpu().numpy()

    f0 = f0[0, :length].detach().cpu().numpy()
    df0 = df0[0, :length].detach().cpu().numpy()
    vuv = vuv[0, :length].detach().cpu().numpy()

    # A non-negative voicing output is taken as "voiced"
    # (presumably the model emits a logit -- confirm).
    vuv_bin = (vuv >= 0.0).astype(int)

    # Per-frame confusion masks of the voiced/unvoiced decision.
    pred_voiced = vuv_bin == 1
    ref_voiced = tgt_vuv == 1
    tp = pred_voiced & ref_voiced
    fp = pred_voiced & ~ref_voiced
    tn = ~pred_voiced & ~ref_voiced
    fn = ~pred_voiced & ref_voiced
    # TP and FN together are exactly the frames that are voiced in the reference.
    tp_or_fn = tp | fn

    return {
        'utt_id': data_name,
        'TP': tp.sum().item(),
        'FP': fp.sum().item(),
        'TN': tn.sum().item(),
        'FN': fn.sum().item(),
        'MSE_F0_TP': _masked_mse(f0, tgt_f0, tp),
        'MSE_F0_FN': _masked_mse(f0, tgt_f0, fn),
        'MSE_F0_TP_FN': _masked_mse(f0, tgt_f0, tp_or_fn),
        'MSE_DF0_TP': _masked_mse(df0, tgt_df0, tp),
        'MSE_DF0_FN': _masked_mse(df0, tgt_df0, fn),
        'MSE_DF0_TP_FN': _masked_mse(df0, tgt_df0, tp_or_fn),
    }

In [24]:
# Evaluate every utterance of the evaluation set.
# Each item is fetched once and unpacked into its name and data dict; indexing
# the dataset separately for the name and the waveform would read it twice.
results = []
for i in tqdm.tqdm(range(eval_dataset_len)):
    data_name, data_item = eval_dataset[i]
    data_wav = data_item["wav"]
    results.append(evaluate(model, data_name, data_wav))

100%|██████████| 1413/1413 [00:40<00:00, 35.27it/s]


In [25]:
# One row per entry of `results`; columns are the keys of the collected result dicts
df = pd.DataFrame(results)

In [26]:
# Pool the per-utterance frame counts over the whole evaluation set.
tp_all, fp_all, tn_all, fn_all = (df[col].sum() for col in ("TP", "FP", "TN", "FN"))

# Voiced/unvoiced detection quality, with "voiced" as the positive class.
predicted_voiced = tp_all + fp_all
reference_voiced = tp_all + fn_all
precision = tp_all / predicted_voiced
recall = tp_all / reference_voiced
f1 = 2 * precision * recall / (precision + recall)
print(f"precision: {precision:.3f}, recall: {recall:.3f}, f1: {f1:.3f}")

precision: 0.950, recall: 0.944, f1: 0.947


In [27]:
# Root of the pooled MSE over TP frames: weighting each utterance's MSE by its
# TP count restores the summed squared error, which is then averaged over all
# TP frames of the evaluation set.
mrse_f0_tp = math.sqrt(
    df["MSE_F0_TP"].mul(df["TP"]).sum() / tp_all
)
mrse_df0_tp = math.sqrt(
    df["MSE_DF0_TP"].mul(df["TP"]).sum() / tp_all
)

In [28]:
# Same pooling over FN frames: per-utterance MSE weighted by the FN count,
# averaged over all FN frames, then square-rooted.
mrse_f0_fn = math.sqrt(
    df["MSE_F0_FN"].mul(df["FN"]).sum() / fn_all
)
mrse_df0_fn = math.sqrt(
    df["MSE_DF0_FN"].mul(df["FN"]).sum() / fn_all
)

In [29]:
# Same pooling over the union of TP and FN frames: the weight of each utterance
# is its TP + FN count and the divisor is the total of those counts.
mrse_f0_tpfn = math.sqrt(
    df["MSE_F0_TP_FN"].mul(df["TP"].add(df["FN"])).sum() / (tp_all + fn_all)
)
mrse_df0_tpfn = math.sqrt(
    df["MSE_DF0_TP_FN"].mul(df["TP"].add(df["FN"])).sum() / (tp_all + fn_all)
)

In [30]:
# Report the pooled errors, one line per frame subset (TP, FN, TP+FN).
print(
    f"mrse_f0_tp: {mrse_f0_tp:.3f}, mrse_df0_tp: {mrse_df0_tp:.3f}",
    f"mrse_f0_fn: {mrse_f0_fn:.3f}, mrse_df0_fn: {mrse_df0_fn:.3f}",
    f"mrse_f0_tpfn: {mrse_f0_tpfn:.3f}, mrse_df0_tpfn: {mrse_df0_tpfn:.3f}",
    sep="\n",
)

mrse_f0_tp: 0.020, mrse_df0_tp: 0.015
mrse_f0_fn: 0.107, mrse_df0_fn: 0.056
mrse_f0_tpfn: 0.032, mrse_df0_tpfn: 0.019
