Skip to content

Commit

Permalink
Merge pull request #5314 from Jungjee/speaker2
Browse files Browse the repository at this point in the history
ESPnet-Spk part 3 - inference every epoch using EER
  • Loading branch information
mergify[bot] committed Jul 22, 2023
2 parents fcc1c01 + 249bf6e commit 8c264ff
Show file tree
Hide file tree
Showing 26 changed files with 1,074 additions and 144 deletions.
1 change: 1 addition & 0 deletions ci/test_integration_espnet2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,7 @@ cd ./egs2/mini_an4/spk1
gen_dummy_coverage
echo "==== [ESPnet2] SPK ==="
./run.sh --ngpu 0 --stage 0 --stop-stage 4 --feats-type "raw" --spk_args "--max_epoch=1" --python "${python}"
./run.sh --ngpu 0 --stage 4 --stop-stage 4 --feats-type "raw" --spk_args "--max_epoch=1" --python "${python}" --spk_config conf/train_mini_RawNet3_dataaug.yaml
# Remove generated files in order to reduce the disk usage
rm -rf exp dump data
cd "${cwd}"
Expand Down
75 changes: 64 additions & 11 deletions egs2/TEMPLATE/spk1/spk.sh
Original file line number Diff line number Diff line change
Expand Up @@ -181,21 +181,73 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
_dsets="${test_sets}"
fi
else
_dsets="${train_set} ${valid_set} ${test_sets}"
_dsets="${valid_set} ${test_sets}"
fi

if [ "${feats_type}" = raw ]; then
log "Stage 2: Format wav.scp: data/ -> ${data_feats}"
if [ "${skip_train}" = false ]; then
utils/copy_data_dir.sh --validate_opts --non-print data/"${train_set}" "${data_feats}/${train_set}"

# shellcheck disable=SC2086
scripts/audio/format_wav_scp.sh --nj "${nj}" --cmd "${train_cmd}" \
--audio-format "${audio_format}" --fs "${fs}" \
--multi-columns-input "${multi_columns_input_wav_scp}" \
--multi-columns-output "${multi_columns_output_wav_scp}" \
"data/${train_set}/wav.scp" "${data_feats}/${train_set}"

echo "${feats_type}" > "${data_feats}/${train_set}/feats_type"
if "${multi_columns_output_wav_scp}"; then
echo "multi_${audio_format}" > "${data_feats}/${train_set}/audio_format"
else
echo "${audio_format}" > "${data_feats}/${train_set}/audio_format"
fi
fi

# Calculate EER for valid/test since speaker verification is an open set problem
# Train can be either multi-column data or not, but valid/test always require multi-column trial
for dset in ${_dsets}; do
utils/copy_data_dir.sh --validate_opts --non-print data/"${dset}" "${data_feats}/${dset}"
echo "${feats_type}" > "${data_feats}/${dset}/feats_type"
cp data/${dset}/trial_label "${data_feats}/${dset}"

# shellcheck disable=SC2086
scripts/audio/format_wav_scp.sh --nj "${nj}" --cmd "${train_cmd}" \
--audio-format "${audio_format}" --fs "${fs}" \
--multi-columns-input "${multi_columns_input_wav_scp}" \
--multi-columns-output "${multi_columns_output_wav_scp}" \
"data/${dset}/wav.scp" "${data_feats}/${dset}"
--out_filename trial.scp \
"data/${dset}/trial.scp" "${data_feats}/${dset}"
# shellcheck disable=SC2086
scripts/audio/format_wav_scp.sh --nj "${nj}" --cmd "${train_cmd}" \
--audio-format "${audio_format}" --fs "${fs}" \
--multi-columns-input "${multi_columns_input_wav_scp}" \
--multi-columns-output "${multi_columns_output_wav_scp}" \
--out_filename trial2.scp \
"data/${dset}/trial2.scp" "${data_feats}/${dset}"

echo "${feats_type}" > "${data_feats}/${dset}/feats_type"
echo "multi_${audio_format}" > "${data_feats}/${dset}/audio_format"

done
elif [ "${feats_type}" = raw_copy ]; then
if [ "${skip_train}" = false ]; then
utils/copy_data_dir.sh --validate_opts --non-print data/"${train_set}" "${data_feats}/${train_set}"

echo "${feats_type}" > "${data_feats}/${train_set}/feats_type"
if "${multi_columns_output_wav_scp}"; then
echo "multi_${audio_format}" > "${data_feats}/${train_set}/audio_format"
else
echo "${audio_format}" > "${data_feats}/${train_set}/audio_format"
fi
fi

# Calculate EER for valid/test since speaker verification is an open set problem
# Train can be either multi-column data or not, but valid/test always require multi-column trial
for dset in ${_dsets}; do
utils/copy_data_dir.sh --validate_opts --non-print data/"${dset}" "${data_feats}/${dset}"
cp data/${dset}/trial_label "${data_feats}/${dset}"

echo "${feats_type}" > "${data_feats}/${dset}/feats_type"
echo "multi_${audio_format}" > "${data_feats}/${dset}/audio_format"

done
else
Expand All @@ -215,7 +267,6 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
_opts+="--config ${spk_config} "
fi

_scp=wav.scp
if [[ "${audio_format}" == *ark* ]]; then
_type=kaldi_ark
else
Expand All @@ -227,16 +278,16 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
_logdir="${spk_stats_dir}/logdir"
mkdir -p "${_logdir}"

_nj=$(min "${nj}" "$(<${_spk_train_dir}/${_scp} wc -l)" "$(<${_spk_valid_dir}/${_scp} wc -l)")
_nj=$(min "${nj}" "$(<${_spk_train_dir}/wav.scp wc -l)" "$(<${_spk_valid_dir}/trial.scp wc -l)")

key_file="${_spk_train_dir}/${_scp}"
key_file="${_spk_train_dir}/wav.scp"
split_scps=""
for n in $(seq "${_nj}"); do
split_scps+=" ${_logdir}/train.${n}.scp"
done
utils/split_scp.pl "${key_file}" ${split_scps}

key_file="${_spk_valid_dir}/${_scp}"
key_file="${_spk_valid_dir}/trial.scp"
split_scps=""
for n in $(seq "${_nj}"); do
split_scps+=" ${_logdir}/valid.${n}.scp"
Expand All @@ -256,7 +307,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--collect_stats true \
--use_preprocessor false \
--train_data_path_and_name_and_type ${_spk_train_dir}/wav.scp,speech,${_type} \
--valid_data_path_and_name_and_type ${_spk_valid_dir}/wav.scp,speech,${_type} \
--valid_data_path_and_name_and_type ${_spk_valid_dir}/trial.scp,speech,${_type} \
--train_shape_file "${_logdir}/train.JOB.scp" \
--valid_shape_file "${_logdir}/valid.JOB.scp" \
--spk2utt ${_spk_train_dir}/spk2utt \
Expand All @@ -271,6 +322,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# shellcheck disable=SC2086
${python} -m espnet2.bin.aggregate_stats_dirs ${_opts} --skip_sum_stats --output_dir "${spk_stats_dir}"

cp ${spk_stats_dir}/valid/speech_shape ${spk_stats_dir}/valid/speech_shape2
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
Expand Down Expand Up @@ -307,8 +359,9 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--train_data_path_and_name_and_type ${_spk_train_dir}/wav.scp,speech,sound \
--train_data_path_and_name_and_type ${_spk_train_dir}/utt2spk,spk_labels,text \
--train_shape_file ${spk_stats_dir}/train/speech_shape \
--valid_data_path_and_name_and_type ${_spk_valid_dir}/wav.scp,speech,sound \
--valid_data_path_and_name_and_type ${_spk_valid_dir}/utt2spk,spk_labels,text \
--valid_data_path_and_name_and_type ${_spk_valid_dir}/trial.scp,speech,sound \
--valid_data_path_and_name_and_type ${_spk_valid_dir}/trial2.scp,speech2,sound \
--valid_data_path_and_name_and_type ${_spk_valid_dir}/trial_label,spk_labels,text \
--spk2utt ${_spk_train_dir}/spk2utt \
--fold_length ${fold_length} \
--valid_shape_file ${spk_stats_dir}/valid/speech_shape \
Expand Down
27 changes: 16 additions & 11 deletions egs2/mini_an4/spk1/conf/train_mini_RawNet3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,36 @@
frontend: raw

encoder: rawnet3
encoder_conf: # uncomment for customized arguments
encoder_conf:
model_scale: 4
ndim: 64
ndim: 16

pooling: chn_attn_stat
pooling_conf:
input_size: 96
input_size: 24

projector: rawnet3
projector_conf:
input_size: 192
output_size: 16

input_size: 48
output_size: 8

preprocessor: spk
preprocessor_conf:
target_duration: 3.0
sr: 16000
num_eval: 1
target_duration: 3.0 # seconds
sample_rate: 16000
num_eval: 2
rir_apply_prob: 0.0
noise_apply_prob: 0.0

model_conf:
extract_feats_in_collect_stats: false

loss: aamsoftmax
loss_conf:
nOut: 16
nClasses: 108
nout: 8
nclasses: 10
margin: 0.3
scale: 15

optim: adam
num_att_plot: 0
42 changes: 42 additions & 0 deletions egs2/mini_an4/spk1/conf/train_mini_RawNet3_dataaug.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Miniature RawNet3 configuration with data augmentation enabled.
# Same small model as the plain mini config, but RIR and additive-noise
# augmentation are both applied with probability 1.0.
frontend: raw

encoder: rawnet3
encoder_conf:
  model_scale: 4
  ndim: 16

pooling: chn_attn_stat
pooling_conf:
  input_size: 24  # 1.5 * encoder_conf.ndim

projector: rawnet3
projector_conf:
  input_size: 48  # 2 * pooling_conf.input_size
  output_size: 8

preprocessor: spk
preprocessor_conf:
  target_duration: 3.0 # seconds
  sample_rate: 16000
  num_eval: 1
  # presumably the list files written by local/data.sh for the training set
  rir_scp: data/train_nodev/rirs.scp
  rir_apply_prob: 1.0
  # NOTE(review): each entry looks like
  #   [probability, noise scp, [min, max] number of noises, [min, max] SNR]
  # confirm against the spk preprocessor before relying on this reading.
  noise_info:
    - [0.4, "data/train_nodev/noises.scp", [1, 1], [0, 10]]
    - [0.5, "data/train_nodev/noises.scp", [1, 2], [10, 20]]
  noise_apply_prob: 1.0
  short_noise_thres: 0.5

model_conf:
  extract_feats_in_collect_stats: false

loss: aamsoftmax
loss_conf:
  nout: 8  # same value as projector_conf.output_size
  nclasses: 10
  margin: 0.3
  scale: 15

optim: adam
num_att_plot: 0
7 changes: 6 additions & 1 deletion egs2/mini_an4/spk1/local/data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
done

# make a dev set
utils/subset_data_dir.sh --first data/train 1 data/${train_dev}
utils/subset_data_dir.sh --first data/train 2 data/${train_dev}
n=$(($(wc -l < data/train/text) - 1))
utils/subset_data_dir.sh --last data/train ${n} data/${train_set}

Expand All @@ -78,6 +78,11 @@ EOF
awk '{print $1 " 1ch_16k"}' data/${x}/wav.scp > data/${x}/utt2category
done

# for spk task validation
for x in test test_seg ${train_set} ${train_dev}; do
python local/make_trial.py data/${x}/wav.scp data/${x}
done

find downloads/noise/ -iname "*.wav" | awk '{print "noise" NR " " $1}' > data/${train_set}/noises.scp
find downloads/rirs/ -iname "*.wav" | awk '{print "rir" NR " " $1}' > data/${train_set}/rirs.scp
fi
Expand Down
14 changes: 14 additions & 0 deletions egs2/mini_an4/spk1/local/make_trial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""Build a single dummy verification trial from the first two wav.scp entries.

Usage:
    python local/make_trial.py <wav.scp> <output_dir>

Three files are written into <output_dir>, all keyed by "<key1>*<key2>":
    trial.scp    -- key followed by the remainder of the first wav.scp entry
    trial2.scp   -- key followed by the remainder of the second wav.scp entry
    trial_label  -- key followed by the label, which is always written as 0
"""
import os
import sys


def make_trial(wav_scp, out_dir):
    """Write trial.scp, trial2.scp and trial_label for one trial pair.

    Args:
        wav_scp: Path to a wav.scp-style file ("<key> <path or command>").
        out_dir: Existing directory that receives the three output files.

    Raises:
        ValueError: If wav_scp holds fewer than two non-empty entries, since
            a trial needs two utterances.
    """
    with open(wav_scp) as f:
        # Ignore blank lines so a trailing newline cannot become an "entry".
        entries = [line.strip() for line in f if line.strip()]

    if len(entries) < 2:
        raise ValueError(
            f"{wav_scp}: need at least 2 entries to build a trial, "
            f"found {len(entries)}"
        )

    # Split on any whitespace run (space or tab) and keep the rest of the
    # line intact, so piped commands in wav.scp survive unchanged.
    first = entries[0].split(maxsplit=1)
    second = entries[1].split(maxsplit=1)
    joint_key = first[0] + "*" + second[0]

    contents = {
        "trial.scp": first[1] if len(first) > 1 else "",
        "trial2.scp": second[1] if len(second) > 1 else "",
        "trial_label": "0",
    }
    for filename, value in contents.items():
        with open(os.path.join(out_dir, filename), "w") as f:
            f.write(joint_key + " " + value + "\n")


if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} <wav.scp> <output_dir>")
    make_trial(sys.argv[1], sys.argv[2])
45 changes: 0 additions & 45 deletions egs2/voxceleb/spk1/conf/train_RawNet3.yaml

This file was deleted.

1 change: 1 addition & 0 deletions egs2/voxceleb/spk1/conf/train_RawNet3.yaml
45 changes: 45 additions & 0 deletions egs2/voxceleb/spk1/conf/tuning/train_RawNet3.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# RawNet3 reproduce recipe configuration.

frontend: raw

encoder: rawnet3
encoder_conf:
  model_scale: 8
  ndim: 1024
  sinc_stride: 16

pooling: chn_attn_stat
pooling_conf:
  input_size: 1536 # 1.5 * ndim of RawNet3 encoder

projector: rawnet3
projector_conf:
  input_size: 3072 # 2 * input_size of pooling
  output_size: 256

preprocessor: spk
preprocessor_conf:
  target_duration: 3.0 # seconds
  # Key is `sample_rate`, matching the mini_an4 spk1 configs (was `sr`).
  sample_rate: 16000
  num_eval: 5

model_conf:
  extract_feats_in_collect_stats: false

loss: aamsoftmax
loss_conf:
  nout: 256 # same value as projector_conf.output_size
  nclasses: 7205
  margin: 0.2
  scale: 15

# Trainer / iterator options (top-level keys, not nested under any *_conf).
optim: adam
max_epoch: 40
num_att_plot: 0
num_workers: 6
cudnn_deterministic: False
cudnn_benchmark: True
batch_size: 32
iterator_type: sequence
shuffle_within_batch: True
log_interval: 50

0 comments on commit 8c264ff

Please sign in to comment.