In [34]:
from espnet2.asr.frontend.default import DefaultFrontend
from preprocess import F0DataNormalizer
from espnet2.train.dataset import ESPnetDataset
from dio import DioProcessor
import yaml
import numpy as np
import torch

In [15]:
# Experiment condition to inspect; it selects the experiment directory and
# the config file name below.
condition_name = "cnn_160"
train_exp_dir = f"exp/exp_{condition_name}/"
train_config_file_path = f"{train_exp_dir}config_{condition_name}.yaml"
# Plain literal: the file name has no placeholders, so no f-prefix is needed.
train_model_file_path = train_exp_dir + "model_best.pth"

In [22]:
# Load the model configuration and build the processing components from it.
# The config file is opened in a context manager so its handle is closed as
# soon as parsing finishes instead of lingering until garbage collection.
with open(train_config_file_path) as train_config_file:
    train_cfg = yaml.safe_load(train_config_file)
# The "frontend" section of the config is passed through as keyword arguments.
frontend = DefaultFrontend(**train_cfg["frontend"])
# The normalizer is constructed from the stats filename recorded in the config.
f0data_normalizer = F0DataNormalizer(train_cfg["train_f0data_stats_filename"])
dio_processor = DioProcessor()

In [23]:
# Pull the wav-scp and f0data-scp filenames recorded in the training config.
train_wav_scp_filename, train_f0data_scp_filename = (
    train_cfg["train_wav_scp_filename"],
    train_cfg["train_f0data_scp_filename"],
)

In [24]:
from dataset import WavF0Dataset

In [25]:
# Build the dataset from the training wav-scp / f0data-scp files named in the
# training config.
dataset = WavF0Dataset(
    train_wav_scp_filename,
    train_f0data_scp_filename,
)

In [45]:
# Index of the dataset item to inspect.
i = 0
# Fetch the item once and reuse it: indexing dataset[i] separately for each
# field would run the dataset's item lookup twice, and would only yield a
# consistent (wav, f0data) pair if that lookup is deterministic.
# Element [1] of the item is the mapping that holds the "wav" and "f0data"
# entries (element [0] is not used here — presumably an utterance id; confirm).
sample = dataset[i][1]
wav = sample["wav"]
f0data = sample["f0data"]

In [46]:
# Display the stored f0data side by side (along axis 1) with the output of
# dio_processor on the same waveform, so the two can be compared row by row.
np.concatenate(
    (f0data, dio_processor(wav)),
    axis=1,
)

array([[ 5.44779921e+00, -6.31222725e-02,  5.45553541e+00,
         5.44779921e+00, -6.31222725e-02,  5.45553533e+00],
       [ 5.35586309e+00, -7.28292484e-03,  5.34271765e+00,
         5.35586309e+00, -7.28292465e-03,  5.34271765e+00],
       [ 5.31702328e+00, -8.62770062e-03,  5.31264496e+00,
         5.31702352e+00, -8.62770081e-03,  5.31264496e+00],
       [ 5.25878000e+00, -8.74266587e-03,  5.26087713e+00,
         5.25878000e+00, -8.74266624e-03,  5.26087725e+00],
       [ 5.17946434e+00, -4.22432423e-02,  5.17812538e+00,
         5.17946458e+00, -4.22432423e-02,  5.17812526e+00],
       [ 5.20307350e+00, -3.33991386e-02,  5.13730574e+00,
         5.20307350e+00, -3.33991391e-02,  5.13730565e+00],
       [ 5.33053827e+00,  2.30915546e-02,  5.29120398e+00,
         5.33053827e+00,  2.30915546e-02,  5.29120409e+00],
       [ 5.25208092e+00, -2.21854690e-02,  5.24410057e+00,
         5.25208092e+00, -2.21854687e-02,  5.24410069e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.000

In [57]:
# Convert f0data to a float32 tensor and prepend a leading axis of size 1
# (presumably the batch axis expected by the normalizer — confirm).
f0data_t = torch.unsqueeze(
    torch.tensor(f0data, dtype=torch.float32),
    dim=0,
)

In [58]:
# Normalize the batched f0data tensor with the normalizer built from the
# training-set statistics file named in the config.
f0data_n = f0data_normalizer.normalize(f0data_t)

In [62]:
# Round trip: map the normalized tensor back with denormalize().
f0data_n_d = f0data_normalizer.denormalize(f0data_n)
# Zero every frame whose first feature in the ORIGINAL (un-normalized) tensor
# is exactly 0, in place, across all features of that frame.
# NOTE(review): presumably a zero first feature marks an unvoiced frame and
# denormalize() would otherwise turn it into a non-zero value — confirm.
f0data_n_d[:, f0data_t[0, :, 0].eq(0), :] = 0

In [63]:
# Display the original f0data side by side (along axis 1) with the
# normalize -> denormalize round-trip result, after dropping the leading
# size-1 axis and converting it to a NumPy array.
np.concatenate(
    (f0data, f0data_n_d.squeeze(0).cpu().numpy()),
    axis=1,
)

array([[ 5.44779921e+00, -6.31222725e-02,  5.45553541e+00,
         5.44779921e+00, -6.31222725e-02,  5.45553541e+00],
       [ 5.35586309e+00, -7.28292484e-03,  5.34271765e+00,
         5.35586309e+00, -7.28292484e-03,  5.34271765e+00],
       [ 5.31702328e+00, -8.62770062e-03,  5.31264496e+00,
         5.31702328e+00, -8.62770062e-03,  5.31264496e+00],
       [ 5.25878000e+00, -8.74266587e-03,  5.26087713e+00,
         5.25878000e+00, -8.74266587e-03,  5.26087713e+00],
       [ 5.17946434e+00, -4.22432423e-02,  5.17812538e+00,
         5.17946434e+00, -4.22432423e-02,  5.17812538e+00],
       [ 5.20307350e+00, -3.33991386e-02,  5.13730574e+00,
         5.20307350e+00, -3.33991423e-02,  5.13730574e+00],
       [ 5.33053827e+00,  2.30915546e-02,  5.29120398e+00,
         5.33053827e+00,  2.30915546e-02,  5.29120398e+00],
       [ 5.25208092e+00, -2.21854690e-02,  5.24410057e+00,
         5.25208092e+00, -2.21854690e-02,  5.24410057e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.000