ESPnet-Spk part 3 - inference every epoch using EER #5314

Merged: 41 commits from Jungjee:speaker2 into master, Jul 22, 2023.
The file diffs below show changes from 14 of the 41 commits.

Commits:
e884009  add epoch wise inference using EER (Jungjee, Jul 20, 2023)
363b844  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 20, 2023)
f8dc909  Merge branch 'master' into speaker2 (Jungjee, Jul 20, 2023)
e5c4a6c  make dummy test trial for ci (Jungjee, Jul 20, 2023)
c1b9937  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 20, 2023)
91a777f  make test_python work (Jungjee, Jul 20, 2023)
6316eed  Merge branch 'speaker2' of https://github.com/Jungjee/espnet into spe… (Jungjee, Jul 20, 2023)
d4238e4  Merge branch 'master' into speaker2 (sw005320, Jul 20, 2023)
447eb83  make test_integration2 work (Jungjee, Jul 20, 2023)
1833acb  Merge branch 'speaker2' of https://github.com/Jungjee/espnet into spe… (Jungjee, Jul 20, 2023)
04bb321  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 20, 2023)
f0bacaa  Merge branch 'master' into speaker2 (Jungjee, Jul 21, 2023)
e91c0e3  Add data augmentation for speaker taks (Emrys365, Jul 21, 2023)
5586713  Merge branch 'speaker2' of github.com:Jungjee/espnet into speaker2 (Emrys365, Jul 21, 2023)
3708b6f  revoke undoed patch (Jungjee, Jul 21, 2023)
9cbf911  revoke undoed commit, make no da work (Jungjee, Jul 21, 2023)
0767537  undo revoked edits (Jungjee, Jul 21, 2023)
640f02d  Support mixing with variable number of noises (Emrys365, Jul 21, 2023)
0cf57da  Merge branch 'speaker2' of https://github.com/Jungjee/espnet into spe… (Jungjee, Jul 21, 2023)
deebddf  fix minor bug in da (Jungjee, Jul 21, 2023)
323aceb  add arg (Jungjee, Jul 21, 2023)
a714aa4  add making scp files for DA (Jungjee, Jul 21, 2023)
1751ecd  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 21, 2023)
779cf76  reflects comments (Jungjee, Jul 21, 2023)
5672580  make symlink conf (Jungjee, Jul 21, 2023)
c51c773  add comment for speech2 (Jungjee, Jul 21, 2023)
920b51c  Merge branch 'master' into speaker2 (Jungjee, Jul 21, 2023)
dd51257  Add unit tests (Emrys365, Jul 21, 2023)
b1088c1  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 21, 2023)
37f4ab4  Fix a bug in SPK preprocessor (Emrys365, Jul 21, 2023)
19a09e0  Merge branch 'master' into speaker2 (Jungjee, Jul 21, 2023)
ba66a76  fix minor bug, make ci model smaller (Jungjee, Jul 22, 2023)
9470be5  Update espnet2/spk/loss/aamsoftmax.py (Jungjee, Jul 22, 2023)
19d5d81  Update espnet2/train/spk_trainer.py (Jungjee, Jul 22, 2023)
267bfb4  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 22, 2023)
e5297a1  reflect @Emrys365 comments (Jungjee, Jul 22, 2023)
caa2a73  Merge branch 'master' into speaker2 (Jungjee, Jul 22, 2023)
12fea27  reflect local changes (Jungjee, Jul 22, 2023)
9790e6d  Merge branch 'speaker2' of https://github.com/Jungjee/espnet into spe… (Jungjee, Jul 22, 2023)
9aaad7c  Merge branch 'master' into speaker2 (Jungjee, Jul 22, 2023)
249bf6e  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jul 22, 2023)
66 changes: 61 additions & 5 deletions egs2/TEMPLATE/spk1/spk.sh
@@ -181,21 +181,75 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
_dsets="${test_sets}"
fi
else
_dsets="${train_set} ${valid_set} ${test_sets}"
_dsets="${valid_set} ${test_sets}"
fi

if [ "${feats_type}" = raw ]; then
log "Stage 2: Format wav.scp: data/ -> ${data_feats}"
if [ "${skip_train}" = false ]; then
utils/copy_data_dir.sh --validate_opts --non-print data/"${train_set}" "${data_feats}/${train_set}"

# shellcheck disable=SC2086
scripts/audio/format_wav_scp.sh --nj "${nj}" --cmd "${train_cmd}" \
--audio-format "${audio_format}" --fs "${fs}" \
--multi-columns-input "${multi_columns_input_wav_scp}" \
--multi-columns-output "${multi_columns_output_wav_scp}" \
"data/${train_set}/wav.scp" "${data_feats}/${train_set}"

echo "${feats_type}" > "${data_feats}/${train_set}/feats_type"
if "${multi_columns_output_wav_scp}"; then
echo "multi_${audio_format}" > "${data_feats}/${train_set}/audio_format"
else
echo "${audio_format}" > "${data_feats}/${train_set}/audio_format"
fi
fi

# Calculate EER for valid/test since speaker verification is an open set problem
# Train can be either multi-column data or not, but valid/test always require multi-column trial
for dset in ${_dsets}; do
utils/copy_data_dir.sh --validate_opts --non-print data/"${dset}" "${data_feats}/${dset}"
echo "${feats_type}" > "${data_feats}/${dset}/feats_type"
cp data/${dset}/trial.scp "${data_feats}/${dset}"
cp data/${dset}/trial2.scp "${data_feats}/${dset}"
cp data/${dset}/trial_label "${data_feats}/${dset}"

# shellcheck disable=SC2086
scripts/audio/format_wav_scp.sh --nj "${nj}" --cmd "${train_cmd}" \
--audio-format "${audio_format}" --fs "${fs}" \
--multi-columns-input "${multi_columns_input_wav_scp}" \
--multi-columns-output "${multi_columns_output_wav_scp}" \
"data/${dset}/wav.scp" "${data_feats}/${dset}"
--out_filename trial.scp \
"data/${dset}/trial.scp" "${data_feats}/${dset}"
# shellcheck disable=SC2086
scripts/audio/format_wav_scp.sh --nj "${nj}" --cmd "${train_cmd}" \
--audio-format "${audio_format}" --fs "${fs}" \
--multi-columns-input "${multi_columns_input_wav_scp}" \
--multi-columns-output "${multi_columns_output_wav_scp}" \
--out_filename trial2.scp \
"data/${dset}/trial2.scp" "${data_feats}/${dset}"

echo "${feats_type}" > "${data_feats}/${dset}/feats_type"
echo "multi_${audio_format}" > "${data_feats}/${dset}/audio_format"

done
elif [ "${feats_type}" = raw_copy ]; then
if [ "${skip_train}" = false ]; then
utils/copy_data_dir.sh --validate_opts --non-print data/"${train_set}" "${data_feats}/${train_set}"

echo "${feats_type}" > "${data_feats}/${train_set}/feats_type"
if "${multi_columns_output_wav_scp}"; then
echo "multi_${audio_format}" > "${data_feats}/${train_set}/audio_format"
else
echo "${audio_format}" > "${data_feats}/${train_set}/audio_format"
fi
fi

# Calculate EER for valid/test since speaker verification is an open set problem
# Train can be either multi-column data or not, but valid/test always require multi-column trial
for dset in ${_dsets}; do
utils/copy_data_dir.sh --validate_opts --non-print data/"${dset}" "${data_feats}/${dset}"
cp data/${dset}/trial_label "${data_feats}/${dset}"

echo "${feats_type}" > "${data_feats}/${dset}/feats_type"
echo "multi_${audio_format}" > "${data_feats}/${dset}/audio_format"

done
else
@@ -253,10 +307,11 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# shellcheck disable=SC2046,SC2086
${train_cmd} JOB=1:"${_nj}" "${_logdir}"/stats.JOB.log \
${python} -m espnet2.bin.spk_train \
--use_preprocessor false \
--collect_stats true \
--use_preprocessor false \
--train_data_path_and_name_and_type ${_spk_train_dir}/wav.scp,speech,${_type} \
--valid_data_path_and_name_and_type ${_spk_valid_dir}/wav.scp,speech,${_type} \
--valid_data_path_and_name_and_type ${_spk_valid_dir}/trial.scp,speech,${_type} \
--train_shape_file "${_logdir}/train.JOB.scp" \
--valid_shape_file "${_logdir}/valid.JOB.scp" \
--spk2utt ${_spk_train_dir}/spk2utt \
@@ -271,6 +326,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# shellcheck disable=SC2086
${python} -m espnet2.bin.aggregate_stats_dirs ${_opts} --skip_sum_stats --output_dir "${spk_stats_dir}"

cp ${spk_stats_dir}/valid/speech_shape ${spk_stats_dir}/valid/speech_shape2
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
9 changes: 6 additions & 3 deletions egs2/mini_an4/spk1/conf/train_mini_RawNet3.yaml
@@ -15,13 +15,15 @@ projector_conf:
input_size: 192
output_size: 16


preprocessor: spk
preprocessor_conf:
target_duration: 3.0
sr: 16000
target_duration: 3.0 # seconds
sample_rate: 16000
num_eval: 1

model_conf:
extract_feats_in_collect_stats: false

loss: aamsoftmax
loss_conf:
nOut: 16
@@ -30,3 +32,4 @@ loss_conf:
scale: 15

optim: adam
num_att_plot: 0
7 changes: 6 additions & 1 deletion egs2/mini_an4/spk1/local/data.sh
@@ -58,7 +58,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
done

# make a dev set
utils/subset_data_dir.sh --first data/train 1 data/${train_dev}
utils/subset_data_dir.sh --first data/train 2 data/${train_dev}
n=$(($(wc -l < data/train/text) - 1))
utils/subset_data_dir.sh --last data/train ${n} data/${train_set}

@@ -78,6 +78,11 @@ EOF
awk '{print $1 " 1ch_16k"}' data/${x}/wav.scp > data/${x}/utt2category
done

# for spk task validation
for x in test test_seg ${train_set} ${train_dev}; do
python local/make_trial.py data/${x}/wav.scp data/${x}
done

find downloads/noise/ -iname "*.wav" | awk '{print "noise" NR " " $1}' > data/${train_set}/noises.scp
find downloads/rirs/ -iname "*.wav" | awk '{print "rir" NR " " $1}' > data/${train_set}/rirs.scp
fi
14 changes: 14 additions & 0 deletions egs2/mini_an4/spk1/local/make_trial.py
@@ -0,0 +1,14 @@
import os
import sys

if __name__ == "__main__":
with open(sys.argv[1]) as f:
lines = f.readlines()

joint_key = lines[0].strip().split(" ")[0] + "*" + lines[1].strip().split(" ")[0]
with open(os.path.join(sys.argv[2], "trial.scp"), "w") as f:
Review comment (Collaborator): Maybe it's better to rename trial.scp to trial1.scp to make them more consistent.

f.write(joint_key + " " + " ".join(lines[0].strip().split(" ")[1:]) + "\n")
with open(os.path.join(sys.argv[2], "trial2.scp"), "w") as f:
f.write(joint_key + " " + " ".join(lines[1].strip().split(" ")[1:]) + "\n")
with open(os.path.join(sys.argv[2], "trial_label"), "w") as f:
f.write(joint_key + " 0\n")
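The trial files above give each evaluation pair a joint key (utt1*utt2) and a 0/1 label, which is exactly the input an EER computation needs: one score and one target/non-target label per pair. The sketch below is a hedged illustration of how such an epoch-wise EER could be computed from cosine similarities, not the code added in espnet2/train/spk_trainer.py; the stand-in embeddings, the dump path, and the use of scikit-learn's roc_curve are assumptions.

import numpy as np
from sklearn.metrics import roc_curve


def cosine_score(emb1, emb2):
    # Cosine similarity between two speaker embeddings; higher means "more likely same speaker".
    emb1 = emb1 / np.linalg.norm(emb1)
    emb2 = emb2 / np.linalg.norm(emb2)
    return float(np.dot(emb1, emb2))


def compute_eer(labels, scores):
    # labels: 1 for target (same-speaker) trials, 0 for non-target trials.
    fpr, tpr, thresholds = roc_curve(labels, scores)
    fnr = 1.0 - tpr
    idx = int(np.nanargmin(np.abs(fnr - fpr)))  # operating point where FAR and FRR cross
    return (fpr[idx] + fnr[idx]) / 2.0, thresholds[idx]


# Hypothetical usage: trial_label holds lines "utt1*utt2 0|1" as written by make_trial.py.
# Real embeddings would come from the model at the current epoch; random vectors stand in here.
rng = np.random.default_rng(0)
embeddings = {}
scores, labels = [], []
with open("dump/raw/test/trial_label") as f:  # path is illustrative only
    for line in f:
        joint_key, label = line.strip().split(" ")
        utt1, utt2 = joint_key.split("*")
        e1 = embeddings.setdefault(utt1, rng.standard_normal(192))
        e2 = embeddings.setdefault(utt2, rng.standard_normal(192))
        scores.append(cosine_score(e1, e2))
        labels.append(int(label))
eer, threshold = compute_eer(labels, scores)
print(f"EER: {eer * 100:.2f}% at threshold {threshold:.3f}")

Lower EER is better; validating on it every epoch ties model selection to the verification task itself rather than to the training classification loss.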
62 changes: 62 additions & 0 deletions egs2/voxceleb/spk1/conf/tuning/train_RawNet3_sgdr.yaml
@@ -0,0 +1,62 @@
# RawNet3 reproduce recipe configuration.

frontend: raw

encoder: rawnet3
encoder_conf:
model_scale: 8
ndim: 1024
sinc_stride: 16

pooling: chn_attn_stat
pooling_conf:
input_size: 1536 # 1.5 * ndim of RawNet3 encoder

projector: rawnet3
projector_conf:
input_size: 3072 # 2 * input_size of pooling
output_size: 256

preprocessor: spk
preprocessor_conf:
utt2spk: dump/raw_copy/voxceleb12_devs/utt2spk
spk2utt: dump/raw_copy/voxceleb12_devs/spk2utt
target_duration: 3.0 # seconds
sample_rate: 16000
num_eval: 5

model_conf:
extract_feats_in_collect_stats: false

loss: aamsoftmax
loss_conf:
nOut: 256
nClasses: 7205
margin: 0.2
scale: 15

max_epoch: 40
#num_iters_per_epoch: 1
num_att_plot: 0
num_workers: 6
cudnn_deterministic: False
cudnn_benchmark: True
batch_size: 32
#batch_type: unsorted
iterator_type: sequence
shuffle_within_batch: True
log_interval: 50
optim: adamw
optim_conf:
lr: 0.001
weight_decay: 0.00005
amsgrad: False

scheduler: CosineAnnealingWarmupRestarts
scheduler_conf:
first_cycle_steps: 310160
cycle_mult: 1.0
max_lr: 0.001
min_lr: 0.000005
warmup_steps: 5000
gamma: 0.8
60 changes: 60 additions & 0 deletions egs2/voxceleb/spk1/conf/tuning/train_RawNet3_sgdr_bs.yaml
@@ -0,0 +1,60 @@
# RawNet3 reproduce recipe configuration.

frontend: raw

encoder: rawnet3
encoder_conf:
model_scale: 8
ndim: 1024
sinc_stride: 16

pooling: chn_attn_stat
pooling_conf:
input_size: 1536 # 1.5 * ndim of RawNet3 encoder

projector: rawnet3
projector_conf:
input_size: 3072 # 2 * input_size of pooling
output_size: 256

preprocessor: spk
preprocessor_conf:
target_duration: 3.0 # seconds
sample_rate: 16000
num_eval: 5

model_conf:
extract_feats_in_collect_stats: false

loss: aamsoftmax
loss_conf:
nOut: 256
nClasses: 7205
margin: 0.3
scale: 30

max_epoch: 40
#num_iters_per_epoch: 1
num_att_plot: 0
num_workers: 8
cudnn_deterministic: False
cudnn_benchmark: True
batch_size: 128
#batch_type: unsorted
iterator_type: sequence
shuffle_within_batch: True
log_interval: 50
optim: adamw
optim_conf:
lr: 0.001
weight_decay: 0.00005
amsgrad: False

scheduler: CosineAnnealingWarmupRestarts
scheduler_conf:
first_cycle_steps: 77544
cycle_mult: 1.0
max_lr: 0.001
min_lr: 0.000005
warmup_steps: 1000
gamma: 0.75
60 changes: 60 additions & 0 deletions egs2/voxceleb/spk1/conf/tuning/train_RawNet3_sgdr_bs2.yaml
@@ -0,0 +1,60 @@
# RawNet3 reproduce recipe configuration.

frontend: raw

encoder: rawnet3
encoder_conf:
model_scale: 8
ndim: 1024
sinc_stride: 16

pooling: chn_attn_stat
pooling_conf:
input_size: 1536 # 1.5 * ndim of RawNet3 encoder

projector: rawnet3
projector_conf:
input_size: 3072 # 2 * input_size of pooling
output_size: 256

preprocessor: spk
preprocessor_conf:
target_duration: 3.0 # seconds
sample_rate: 16000
num_eval: 5

model_conf:
extract_feats_in_collect_stats: false

loss: aamsoftmax
loss_conf:
nOut: 256
nClasses: 7205
margin: 0.3
scale: 30

max_epoch: 40
#num_iters_per_epoch: 1
num_att_plot: 0
num_workers: 8
cudnn_deterministic: False
cudnn_benchmark: True
iterator_type: sequence
shuffle_within_batch: True
log_interval: 50
batch_size: 200
valid_batch_size: 40
optim: adamw
optim_conf:
lr: 0.0005
weight_decay: 0.00005
amsgrad: False

scheduler: CosineAnnealingWarmupRestarts
scheduler_conf:
first_cycle_steps: 49632
cycle_mult: 1.0
max_lr: 0.0005
min_lr: 0.00001
warmup_steps: 1000
gamma: 0.75
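The three RawNet3 tuning configs above share the CosineAnnealingWarmupRestarts schedule and differ mainly in batch size, first_cycle_steps, peak learning rate, and AAM-softmax margin/scale. The sketch below illustrates the usual formulation of such a warmup-plus-restart cosine schedule; it is an assumption about the schedule's behaviour, not the scheduler implementation ESPnet actually imports.

import math


def lr_at_step(step, first_cycle_steps=310160, cycle_mult=1.0,
               max_lr=0.001, min_lr=0.000005, warmup_steps=5000, gamma=0.8):
    # Illustrative cosine annealing with linear warmup and restarts (assumed formulation).
    cycle_start, cycle_len, cur_max = 0, first_cycle_steps, max_lr
    # Advance to the cycle containing `step`; each restart scales the peak LR by gamma.
    while step - cycle_start >= cycle_len:
        cycle_start += cycle_len
        cycle_len = int(cycle_len * cycle_mult)
        cur_max *= gamma
    t = step - cycle_start
    if t < warmup_steps:
        # Linear warmup from min_lr to the current cycle's peak.
        return min_lr + (cur_max - min_lr) * t / warmup_steps
    # Cosine decay from the peak back to min_lr over the rest of the cycle.
    progress = (t - warmup_steps) / (cycle_len - warmup_steps)
    return min_lr + 0.5 * (cur_max - min_lr) * (1.0 + math.cos(math.pi * progress))

Under this reading, train_RawNet3_sgdr.yaml warms up linearly for 5000 steps to 1e-3, decays along a cosine to 5e-6 over the remaining 305160 steps of the first cycle, and scales the peak by 0.8 at each restart; the _bs and _bs2 variants shorten the cycle to match their larger batch sizes.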
52 changes: 52 additions & 0 deletions egs2/voxceleb/spk1/local/convert_trial.py
@@ -0,0 +1,52 @@
import argparse
import os
import sys


def main(args):
with open(args.trial, "r") as f:
lines_trial_org = f.readlines()
with open(args.scp, "r") as f:
lines_scp = f.readlines()

scp_dict = dict()
for scp in lines_scp:
utt_id, path = scp.strip().split(" ")
scp_dict[utt_id] = path

with open(os.path.join(args.out, "trial.scp"), "w") as f_trial, open(
os.path.join(args.out, "trial2.scp"), "w"
) as f_trial2, open(os.path.join(args.out, "trial_label"), "w") as f_label:
for tr in lines_trial_org:
label, utt1, utt2 = tr.strip().split(" ")
utt1 = utt1[:-4]
utt2 = utt2[:-4]
joint_key = "*".join([utt1, utt2])
f_trial.write(f"{joint_key} {scp_dict[utt1]}\n")
f_trial2.write(f"{joint_key} {scp_dict[utt2]}\n")
f_label.write(f"{joint_key} {label}\n")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Trial mapper")
parser.add_argument(
"--trial",
type=str,
required=True,
help="directory of the original trial file",
)
parser.add_argument(
"--scp",
type=str,
required=True,
help="directory of wav.scp file",
)
parser.add_argument(
"--out",
type=str,
required=True,
help="destinatino directory of processed trial and label files",
)
args = parser.parse_args()

sys.exit(main(args))
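For reference, a hedged example of the mapping this converter performs (the IDs and paths below are made up): given a VoxCeleb-style trial line "1 spkA/videoX/00001.wav spkB/videoY/00002.wav" and wav.scp entries keyed by spkA/videoX/00001 and spkB/videoY/00002, the script strips the .wav suffixes, joins the two IDs with "*", and writes one line per trial to each output file:

trial.scp    spkA/videoX/00001*spkB/videoY/00002 /path/to/spkA_videoX_00001.wav
trial2.scp   spkA/videoX/00001*spkB/videoY/00002 /path/to/spkB_videoY_00002.wav
trial_label  spkA/videoX/00001*spkB/videoY/00002 1

trial.scp and trial2.scp carry the two sides of every pair under the same joint key, which is what spk.sh formats in stage 2 and what trial_label is scored against during the epoch-wise EER evaluation.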