-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4752 from espnet/master
Update UASR branch with latest ESPnet functions
- Loading branch information
Showing
27 changed files
with
1,164 additions
and
248 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== | ||
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...> | ||
# e.g. | ||
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB | ||
# | ||
# Options: | ||
# --time <time>: Limit the maximum time to execute. | ||
# --mem <mem>: Limit the maximum memory usage. | ||
# --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs. | ||
# --num-threads <nthreads>: Specify the number of CPU cores. | ||
# --gpu <ngpu>: Specify the number of GPU devices. | ||
# --config: Change the configuration file from default. | ||
# | ||
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs. | ||
# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name, | ||
# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively. | ||
# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example. | ||
# | ||
# run.pl, queue.pl, slurm.pl, and ssh.pl share a unified interface that does not depend on the backend. | ||
# The options above are mapped to backend-specific options, and | ||
# that mapping is configured in "conf/queue.conf" and "conf/slurm.conf" by default. | ||
# If jobs fail, your configuration might be wrong for your environment. | ||
# | ||
# | ||
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: | ||
# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html | ||
# =========================================================~ | ||
|
||
|
||
# Select the backend used by run.sh from "local", "stdout", "sge", "pbs", "slurm", "ssh", or "jhu" | ||
cmd_backend='local' | ||
|
||
# Local machine, without any Job scheduling system | ||
if [ "${cmd_backend}" = local ]; then | ||
|
||
# The other usage | ||
export train_cmd="run.pl" | ||
# Used for "*_train.py": "--gpu" is appended optionally by run.sh | ||
export cuda_cmd="run.pl" | ||
# Used for "*_recog.py" | ||
export decode_cmd="run.pl" | ||
|
||
# Local machine logging to stdout and log file, without any Job scheduling system | ||
elif [ "${cmd_backend}" = stdout ]; then | ||
|
||
# The other usage | ||
export train_cmd="stdout.pl" | ||
# Used for "*_train.py": "--gpu" is appended optionally by run.sh | ||
export cuda_cmd="stdout.pl" | ||
# Used for "*_recog.py" | ||
export decode_cmd="stdout.pl" | ||
|
||
|
||
# "qsub" (Sun Grid Engine, or derivation of it) | ||
elif [ "${cmd_backend}" = sge ]; then | ||
# The default setting is written in conf/queue.conf. | ||
# You must change "-q g.q" for the "queue" for your environment. | ||
# To know the "queue" names, type "qhost -q" | ||
# Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler. | ||
|
||
export train_cmd="queue.pl" | ||
export cuda_cmd="queue.pl" | ||
export decode_cmd="queue.pl" | ||
|
||
|
||
# "qsub" (Torque/PBS.) | ||
elif [ "${cmd_backend}" = pbs ]; then | ||
# The default setting is written in conf/pbs.conf. | ||
|
||
export train_cmd="pbs.pl" | ||
export cuda_cmd="pbs.pl" | ||
export decode_cmd="pbs.pl" | ||
|
||
|
||
# "sbatch" (Slurm) | ||
elif [ "${cmd_backend}" = slurm ]; then | ||
# The default setting is written in conf/slurm.conf. | ||
# You must change "-p cpu" and "-p gpu" for the "partition" for your environment. | ||
# To know the "partition" names, type "sinfo". | ||
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" | ||
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". | ||
|
||
export train_cmd="slurm.pl" | ||
export cuda_cmd="slurm.pl" | ||
export decode_cmd="slurm.pl" | ||
|
||
elif [ "${cmd_backend}" = ssh ]; then | ||
# You have to create ".queue/machines" to specify the host to execute jobs. | ||
# e.g. .queue/machines | ||
# host1 | ||
# host2 | ||
# host3 | ||
# This assumes you can log in to them without a password, i.e. you have to set up SSH keys. | ||
|
||
export train_cmd="ssh.pl" | ||
export cuda_cmd="ssh.pl" | ||
export decode_cmd="ssh.pl" | ||
|
||
# This is an example of specifying several unique options in the JHU CLSP cluster setup. | ||
# Users can modify/add their own command options according to their cluster environments. | ||
elif [ "${cmd_backend}" = jhu ]; then | ||
|
||
export train_cmd="queue.pl --mem 2G" | ||
export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/queue.conf" | ||
export decode_cmd="queue.pl --mem 4G" | ||
|
||
else | ||
echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2 | ||
return 1 | ||
fi |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
tuning/decode_rnn.yaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
tuning/train_naive_rnn.yaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# This configuration is the decoding setting for FastSpeech or FastSpeech2. | ||
|
||
########################################################## | ||
# DECODING SETTING # | ||
########################################################## | ||
# speed_control_alpha: 1 # alpha to control the speed of generated speech | ||
# 1 < alpha makes slower and 1 > alpha makes faster | ||
use_teacher_forcing: false # whether to use teacher forcing | ||
# if true, we use groundtruth of durations | ||
# (+ pitch & energy for FastSpeech2) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
|
||
|
||
########################################################## | ||
# SVS MODEL SETTING # | ||
########################################################## | ||
svs: naive_rnn # model architecture | ||
svs_conf: # keyword arguments for the selected model | ||
midi_dim: 129 # midi dimension (note number + silence) | ||
embed_dim: 512 # char or phn embedding dimension | ||
eprenet_conv_layers: 0 # prenet (from bytesing) conv layers | ||
eprenet_conv_chans: 256 # prenet (from bytesing) conv channels numbers | ||
eprenet_conv_filts: 3 # prenet (from bytesing) conv filters size | ||
elayers: 3 # number of lstm layers in encoder | ||
eunits: 512 # number of lstm units | ||
ebidirectional: True # if bidirectional in encoder | ||
midi_embed_integration_type: add # how to integrate midi information | ||
dlayers: 5 # number of lstm layers in decoder | ||
dunits: 1024 # number of lstm units in decoder | ||
dbidirectional: True # if bidirectional in decoder | ||
postnet_layers: 5 # number of layers in postnet | ||
postnet_chans: 512 # number of channels in postnet | ||
postnet_filts: 5 # filter size of postnet layer | ||
use_batch_norm: true # whether to use batch normalization in postnet | ||
reduction_factor: 1 # reduction factor | ||
eprenet_dropout_rate: 0.2 # prenet dropout rate | ||
edropout_rate: 0.1 # encoder dropout rate | ||
ddropout_rate: 0.1 # decoder dropout rate | ||
postnet_dropout_rate: 0.5 # postnet dropout_rate | ||
init_type: pytorch # parameter initialization | ||
use_masking: true # whether to apply masking for padded part in loss calculation | ||
loss_type: L1 | ||
|
||
|
||
########################################################## | ||
# OPTIMIZER SETTING # | ||
########################################################## | ||
optim: adam # optimizer type | ||
optim_conf: # keyword arguments for selected optimizer | ||
lr: 1.0e-03 # learning rate | ||
eps: 1.0e-06 # epsilon | ||
weight_decay: 0.0 # weight decay coefficient | ||
|
||
########################################################## | ||
# OTHER TRAINING SETTING # | ||
########################################################## | ||
# num_iters_per_epoch: 200 # number of iterations per epoch | ||
max_epoch: 500 # number of epochs | ||
grad_clip: 1.0 # gradient clipping norm | ||
grad_noise: false # whether to use gradient noise injection | ||
accum_grad: 1 # gradient accumulation | ||
|
||
batch_type: sorted | ||
batch_size: 16 | ||
|
||
sort_in_batch: descending # how to sort data in making batch | ||
sort_batch: descending # how to sort created batches | ||
num_workers: 8 # number of workers of data loader | ||
train_dtype: float32 # dtype in training | ||
log_interval: null # log interval in iterations | ||
keep_nbest_models: 2 # number of models to keep | ||
num_att_plot: 3 # number of attention figures to be saved in every check | ||
seed: 0 # random seed number | ||
best_model_criterion: | ||
- - valid | ||
- loss | ||
- min | ||
- - train | ||
- loss | ||
- min |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
|
||
|
||
########################################################## | ||
# SVS MODEL SETTING # | ||
########################################################## | ||
svs: naive_rnn_dp # model architecture | ||
svs_conf: # keyword arguments for the selected model | ||
midi_dim: 129 # midi dimension (note number + silence) | ||
embed_dim: 512 # char or phn embedding dimension | ||
tempo_dim: 500 | ||
eprenet_conv_layers: 0 # prenet (from bytesing) conv layers | ||
eprenet_conv_chans: 256 # prenet (from bytesing) conv channels numbers | ||
eprenet_conv_filts: 3 # prenet (from bytesing) conv filters size | ||
elayers: 3 # number of lstm layers in encoder | ||
eunits: 256 # number of lstm units | ||
ebidirectional: True # if bidirectional in encoder | ||
midi_embed_integration_type: add # how to integrate midi information | ||
dlayers: 2 # number of lstm layers in decoder | ||
dunits: 256 # number of lstm units in decoder | ||
dbidirectional: True # if bidirectional in decoder | ||
postnet_layers: 5 # number of layers in postnet | ||
postnet_chans: 512 # number of channels in postnet | ||
postnet_filts: 5 # filter size of postnet layer | ||
use_batch_norm: true # whether to use batch normalization in postnet | ||
reduction_factor: 1 # reduction factor | ||
eprenet_dropout_rate: 0.2 # prenet dropout rate | ||
edropout_rate: 0.1 # encoder dropout rate | ||
ddropout_rate: 0.1 # decoder dropout rate | ||
postnet_dropout_rate: 0.5 # postnet dropout_rate | ||
init_type: pytorch # parameter initialization | ||
use_masking: true # whether to apply masking for padded part in loss calculation | ||
|
||
|
||
########################################################## | ||
# OPTIMIZER SETTING # | ||
########################################################## | ||
optim: adam # optimizer type | ||
optim_conf: # keyword arguments for selected optimizer | ||
lr: 1.0e-03 # learning rate | ||
eps: 1.0e-06 # epsilon | ||
weight_decay: 0.0 # weight decay coefficient | ||
|
||
########################################################## | ||
# OTHER TRAINING SETTING # | ||
########################################################## | ||
# num_iters_per_epoch: 200 # number of iterations per epoch | ||
max_epoch: 500 # number of epochs | ||
grad_clip: 1.0 # gradient clipping norm | ||
grad_noise: false # whether to use gradient noise injection | ||
accum_grad: 1 # gradient accumulation | ||
|
||
batch_type: sorted | ||
batch_size: 16 | ||
|
||
sort_in_batch: descending # how to sort data in making batch | ||
sort_batch: descending # how to sort created batches | ||
num_workers: 8 # number of workers of data loader | ||
train_dtype: float32 # dtype in training | ||
log_interval: null # log interval in iterations | ||
keep_nbest_models: 2 # number of models to keep | ||
num_att_plot: 3 # number of attention figures to be saved in every check | ||
seed: 0 # random seed number | ||
best_model_criterion: | ||
- - valid | ||
- loss | ||
- min | ||
- - train | ||
- loss | ||
- min |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../../TEMPLATE/svs1/db.sh |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
#!/usr/bin/env bash | ||
|
||
set -e | ||
set -u | ||
set -o pipefail | ||
|
||
. ./path.sh || exit 1; | ||
. ./cmd.sh || exit 1; | ||
. ./db.sh || exit 1; | ||
|
||
log() { | ||
local fname=${BASH_SOURCE[1]##*/} | ||
echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" | ||
} | ||
|
||
SECONDS=0 | ||
stage=1 | ||
stop_stage=100 | ||
fs=24000 | ||
|
||
log "$0 $*" | ||
|
||
. utils/parse_options.sh || exit 1; | ||
|
||
if [ -z "${OPENCPOP}" ]; then | ||
log "Fill the value of 'OPENCPOP' of db.sh" | ||
exit 1 | ||
fi | ||
|
||
mkdir -p ${OPENCPOP} | ||
|
||
train_set="tr_no_dev" | ||
train_dev="dev" | ||
|
||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then | ||
log "stage 0: Data Download" | ||
# The Opencpop data should be downloaded from https://wenet.org.cn/opencpop/ | ||
fi | ||
|
||
# Stage 1: run local/data_prep.py on the Opencpop corpus, then finalize data/train and data/eval. | ||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then | ||
log "stage 1: Data preparation" | ||
|
||
mkdir -p xml_dump | ||
mkdir -p wav_dump | ||
# we convert the music score to xml format | ||
# The corpus path and sampling rate are quoted so a path containing spaces is passed as one argument. | ||
python local/data_prep.py "${OPENCPOP}" --midi_note_scp local/midi-note.scp \ | ||
--xml_dumpdir xml_dump \ | ||
--wav_dumpdir wav_dump \ | ||
--sr "${fs}" | ||
for src_data in train eval; do | ||
# Derive spk2utt from utt2spk, then let fix_data_dir.sh process the directory together with the extra files. | ||
utils/utt2spk_to_spk2utt.pl < data/${src_data}/utt2spk > data/${src_data}/spk2utt | ||
utils/fix_data_dir.sh --utt_extra_files "label musicxml.scp" data/${src_data} | ||
done | ||
fi | ||
|
||
# Stage 2: split data/train into a training subset (${train_set}) and a held-out validation subset (${train_dev}). | ||
# The gate compares against 2 (it was a copy of the stage-1 gate), so --stage/--stop_stage select this step correctly. | ||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then | ||
log "stage 2: Held out validation set" | ||
|
||
# Start both subsets as full copies of data/train, including the extra per-utterance files. | ||
utils/copy_data_dir.sh data/train data/${train_set} | ||
utils/copy_data_dir.sh data/train data/${train_dev} | ||
for dset in ${train_set} ${train_dev}; do | ||
for extra_file in label musicxml.scp; do | ||
cp data/train/${extra_file} data/${dset} | ||
done | ||
done | ||
# The last 50 entries of wav.scp become the validation set; the training set gets the remaining entries | ||
# (presumably filter_scp.pl --exclude drops the listed utterance ids; verify against the Kaldi utility). | ||
tail -n 50 data/train/wav.scp > data/${train_dev}/wav.scp | ||
utils/filter_scp.pl --exclude data/${train_dev}/wav.scp data/train/wav.scp > data/${train_set}/wav.scp | ||
|
||
# Re-run fix_data_dir.sh on each subset now that its wav.scp has been reduced. | ||
utils/fix_data_dir.sh --utt_extra_files "label musicxml.scp" data/${train_set} | ||
utils/fix_data_dir.sh --utt_extra_files "label musicxml.scp" data/${train_dev} | ||
|
||
fi |
Oops, something went wrong.