## Augmentation

### Speed perturbation

In [None]:
%%time
%%bash

# Build a tempo-perturbed ("_tp") copy of the clean training set and extract
# low-resolution MFCC + CMVN statistics for it (needed for alignment).
# Requires data/${clean_train_set}/feats.scp and exp/${gmm}/final.mdl.

# Defaults. They are defined BEFORE parse_options.sh is sourced, matching the
# other cells of this notebook.
stage=0
clean_train_set=train_nodev
tempo_aug_set=${clean_train_set}_tp

train_set=
dev_set=
clean_test_sets=""
test_sets=""
gmm=tri4

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

set -euo pipefail

# This has to come after the `dev_set=` default above; in the opposite order
# the default silently reset dev_set to empty again.
if [ -e data/train_dev ]; then
  dev_set=train_dev
fi

gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${clean_train_set}_tp

# Fail early if the prerequisites from earlier stages are missing.
for f in "data/${clean_train_set}/feats.scp" "${gmm_dir}/final.mdl"; do
  if [ ! -f "$f" ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

if [ "$stage" -le 1 ]; then
  # Although the nnet will be trained on high-resolution data, the normal
  # (low-resolution) data still has to be perturbed to get the alignments.
  # "_tp" stands for tempo-perturbed.
  echo "$0: preparing directory for low-resolution tempo-perturbed data (for alignment)"

  local/fph_perturb_data_dir_tempo_4way.sh --always-include-prefix true \
    "data/${clean_train_set}" "data/${tempo_aug_set}"

  steps/make_mfcc.sh --write-utt2num-frames true --cmd "$train_cmd" --nj 24 "data/${tempo_aug_set}" || exit 1
  steps/compute_cmvn_stats.sh "data/${tempo_aug_set}" || exit 1
  utils/fix_data_dir.sh "data/${tempo_aug_set}"
fi


### Reverberation

In [None]:
%%time
%%bash

# Create a reverberated (+ point-source noise) copy of the tempo-perturbed
# training set: data/${tempo_aug_set} -> data/${rvb_tempo_aug_set}.
# Requires data/${clean_train_set}/feats.scp and exp/${gmm}/final.mdl.

# Defaults. They are defined BEFORE parse_options.sh is sourced, matching the
# other cells of this notebook.
stage=0
clean_train_set=train_nodev
tempo_aug_set=${clean_train_set}_tp
rvb_tempo_aug_set=${tempo_aug_set}_rvb
train_set=${clean_train_set}_tp_aug
dev_set=
clean_test_sets=""
test_sets=""
gmm=tri4

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

set -euo pipefail

# This has to come after the `dev_set=` default above; in the opposite order
# the default silently reset dev_set to empty again.
if [ -e data/train_dev ]; then
  dev_set=train_dev
fi

gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${clean_train_set}_tp

# Fail early if the prerequisites from earlier stages are missing.
for f in "data/${clean_train_set}/feats.scp" "${gmm_dir}/final.mdl"; do
  if [ ! -f "$f" ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

if [ "$stage" -le 2 ]; then
  # Only referenced by the disabled reco2dur computation below.
  frame_shift=0.01
  ## version 1 for recordings already segmented by utterance (NO NEED FOR MIXED)
  #awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;} END {}' data/${clean_train_set}_tp/utt2num_frames \
  #  > data/${clean_train_set}_tp/reco2dur

  # Two simulated RIR sets (small and medium rooms), each given the value 0.5,
  # plus the point-source noise list.
  rvb_opts=()
  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
  rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")
  rvb_opts+=(--noise-set-parameters "RIRS_NOISES/pointsource_noises/noise_list")

  steps/data/reverberate_data_dir.py \
    "${rvb_opts[@]}" \
    --prefix "reverb" \
    --foreground-snrs "20:10:15:5:0" \
    --background-snrs "20:10:15:5:0" \
    --speech-rvb-probability 1 \
    --pointsource-noise-addition-probability 1 \
    --isotropic-noise-addition-probability 1 \
    --num-replications 1 \
    --max-noises-per-minute 1 \
    --source-sampling-rate 16000 \
    "data/${tempo_aug_set}" "data/${rvb_tempo_aug_set}"
fi


In [None]:
%%time
%%bash

# Materialize the reverberated audio: every wav.scp entry of
# data/${rvb_tempo_aug_set} is a command pipeline; run each one, store its
# output as data/raw/rvb/<id>.wav and rewrite wav.scp to "cat" that file.
# The original wav.scp is kept as wav.scp_BACKUP.

stage=0
clean_train_set=train_nodev
tempo_aug_set=${clean_train_set}_tp
rvb_tempo_aug_set=${tempo_aug_set}_rvb
train_set=
dev_set=
clean_test_sets="eval1 eval2 eval3"
test_sets="noisy_eval1 noisy_eval2 noisy_eval3"
gmm=tri4

nnet3_affix=

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

set -euo pipefail

if [ -e data/train_dev ]; then
  dev_set=train_dev
fi

gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${clean_train_set}_tp

for f in "data/${clean_train_set}/feats.scp" "${gmm_dir}/final.mdl"; do
  if [ ! -f "$f" ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

if [ "$stage" -le 1 ]; then
  # this step is only for complicated pipelines in wav.scp,
  # we create explicit audio files containing the augments
  cp "data/${rvb_tempo_aug_set}/wav.scp" "data/${rvb_tempo_aug_set}/wav.scp_BACKUP"

  targetdir=data/raw/rvb
  mkdir -p "$targetdir"
  # The new wav.scp is built by appending; drop leftovers of an aborted run.
  rm -f "$targetdir/wav.scp"

  while read -r line; do
    # First field is the recording id; the rest is the command pipeline
    # (with a trailing "|" that is stripped before execution).
    outfile=$(echo "$line" | awk -v outpath="$targetdir" '{print outpath"/"$1".wav"}')
    sem -j 32 bash <(echo "$line" | cut -d' ' -f2- | sed 's/|$//g') > "$outfile"
    echo "$line" | awk -v outfile="$outfile" '{print $1" cat "outfile" |"}' >> "$targetdir/wav.scp"
  done < "data/${rvb_tempo_aug_set}/wav.scp"

  # sem only queues the jobs in the background; block until all of them have
  # finished so the rewritten wav.scp never points at half-written files.
  sem --wait

  mv "$targetdir/wav.scp" "data/${rvb_tempo_aug_set}/wav.scp"
fi

In [None]:
%%time
%%bash

# Extract low-resolution MFCCs and CMVN statistics for the reverberated,
# tempo-perturbed set (data/${rvb_tempo_aug_set}) and clean up the data dir.

# --- defaults ---------------------------------------------------------------
stage=0
gmm=tri4
nnet3_affix=
clean_train_set=train_nodev
tempo_aug_set=${clean_train_set}_tp
rvb_tempo_aug_set=${tempo_aug_set}_rvb
train_set=
dev_set=
clean_test_sets="eval1 eval2 eval3"
test_sets="noisy_eval1 noisy_eval2 noisy_eval3"

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

set -euo pipefail

# Use the dev split when it exists.
[ ! -e data/train_dev ] || dev_set=train_dev

gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${clean_train_set}_tp

# --- prerequisites ----------------------------------------------------------
for required in "data/${clean_train_set}/feats.scp" "${gmm_dir}/final.mdl"; do
  [ -f "$required" ] && continue
  echo "$0: expected file $required to exist"
  exit 1
done

# --- feature extraction -----------------------------------------------------
if [[ ${stage} -le 1 ]]; then
  rvb_data=data/${rvb_tempo_aug_set}
  echo "$0: making MFCC featuresfor low-resolution speed-then-noise-perturbed data"
  steps/make_mfcc.sh --cmd "$train_cmd" --nj 32 "$rvb_data" || exit 1
  steps/compute_cmvn_stats.sh "$rvb_data" || exit 1
  utils/fix_data_dir.sh "$rvb_data"
fi

### MUSAN Preparation

In [None]:
%%time
%%bash

# Prepare the MUSAN corpus (music, speech and noise) as Kaldi data
# directories under data/ and derive a reco2dur file for each part.

stage=0

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

set -euo pipefail

if [[ ${stage} -le 1 ]]; then
  # MUSAN consists of music, speech, and noise suitable for augmentation.
  musan_root=/storage06/share_data/intelligence/Speech/Corpus/Noise/musan
  local/make_musan.sh "$musan_root" data
  echo "Finished preparing musan"
  echo ""

  # The recording durations will be used by augment_data_dir.py; they are
  # computed as utt2dur and then renamed to reco2dur.
  for musan_part in speech noise music; do
    musan_dir=data/musan_${musan_part}
    utils/data/get_utt2dur.sh "$musan_dir"
    mv "$musan_dir/utt2dur" "$musan_dir/reco2dur"
  done
  echo "Finished getting duration of MUSAN recordings"
  echo ""
fi

### Noise models

In [6]:
%%time
%%bash

# Add foreground MUSAN noise to the tempo-perturbed training set:
# data/${tempo_aug_set} -> data/${noise_tempo_aug_set}.

# --- defaults ---------------------------------------------------------------
stage=0
gmm=tri4
nnet3_affix=
clean_train_set=train_nodev
tempo_aug_set=${clean_train_set}_tp
noise_tempo_aug_set=${tempo_aug_set}_noise
train_set=
dev_set=
clean_test_sets="eval1 eval2 eval3"
test_sets="noisy_eval1 noisy_eval2 noisy_eval3"

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

set -euo pipefail

# Use the dev split when it exists.
[ ! -e data/train_dev ] || dev_set=train_dev

gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${clean_train_set}_tp

# --- prerequisites ----------------------------------------------------------
for required in "data/${clean_train_set}/feats.scp" "${gmm_dir}/final.mdl"; do
  [ -f "$required" ] && continue
  echo "$0: expected file $required to exist"
  exit 1
done

# --- augmentation -----------------------------------------------------------
if [[ ${stage} -le 1 ]]; then
  # Augment with musan_noise; utterances get the "noise" prefix and the
  # speaker ids are modified as well.
  steps/data/augment_data_dir.py \
    --utt-prefix "noise" --modify-spk-id true \
    --fg-interval 1 --fg-snrs "15:10:5:0" \
    --fg-noise-dir "data/musan_noise" \
    "data/${tempo_aug_set}" "data/${noise_tempo_aug_set}"
fi

steps/data/augment_data_dir.py --utt-prefix noise --modify-spk-id true --fg-interval 1 --fg-snrs 15:10:5:0 --fg-noise-dir data/musan_noise data/train_nodev_tp data/train_nodev_tp_noise


  import sys, random, argparse, os, imp


CPU times: user 8 ms, sys: 12 ms, total: 20 ms
Wall time: 3min 40s


In [None]:
%%time
%%bash

# Materialize the noise-augmented audio: every wav.scp entry of
# data/${noise_tempo_aug_set} is a command pipeline; run each one, store its
# output as data/raw/noise/<id>.wav and rewrite wav.scp to "cat" that file.
# The original wav.scp is kept as wav.scp_BACKUP.

stage=0
clean_train_set=train_nodev
tempo_aug_set=${clean_train_set}_tp
noise_tempo_aug_set=${tempo_aug_set}_noise
train_set=
dev_set=
clean_test_sets="eval1 eval2 eval3"
test_sets="noisy_eval1 noisy_eval2 noisy_eval3"
gmm=tri4

nnet3_affix=

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

set -euo pipefail

if [ -e data/train_dev ]; then
  dev_set=train_dev
fi

gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${clean_train_set}_tp

for f in "data/${clean_train_set}/feats.scp" "${gmm_dir}/final.mdl"; do
  if [ ! -f "$f" ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

if [ "$stage" -le 1 ]; then
  # Write files
  cp "data/${noise_tempo_aug_set}/wav.scp" "data/${noise_tempo_aug_set}/wav.scp_BACKUP"

  targetdir=data/raw/noise
  # The output directory must exist before anything is redirected into it
  # (the reverb counterpart of this cell already creates its directory).
  mkdir -p "$targetdir"
  # The new wav.scp is built by appending; drop leftovers of an aborted run.
  rm -f "$targetdir/wav.scp"

  while read -r line; do
    # First field is the recording id; the rest is the command pipeline
    # (with a trailing "|" that is stripped before execution).
    outfile=$(echo "$line" | awk -v outpath="$targetdir" '{print outpath"/"$1".wav"}')
    sem -j 28 bash <(echo "$line" | cut -d' ' -f2- | sed 's/|$//g') > "$outfile"
    echo "$line" | awk -v outfile="$outfile" '{print $1" cat "outfile" |"}' >> "$targetdir/wav.scp"
  done < "data/${noise_tempo_aug_set}/wav.scp"

  # sem only queues the jobs in the background; block until all of them have
  # finished so the rewritten wav.scp never points at half-written files.
  sem --wait

  mv "$targetdir/wav.scp" "data/${noise_tempo_aug_set}/wav.scp"
fi

In [None]:
%%time
%%bash

# Extract low-resolution MFCCs and CMVN statistics for the noise-augmented,
# tempo-perturbed set (data/${noise_tempo_aug_set}) and clean up the data dir.

# --- defaults ---------------------------------------------------------------
stage=0
gmm=tri4
nnet3_affix=
clean_train_set=train_nodev
tempo_aug_set=${clean_train_set}_tp
noise_tempo_aug_set=${tempo_aug_set}_noise
train_set=
dev_set=
clean_test_sets="eval1 eval2 eval3"
test_sets="noisy_eval1 noisy_eval2 noisy_eval3"

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

set -euo pipefail

# Use the dev split when it exists.
[ ! -e data/train_dev ] || dev_set=train_dev

gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${clean_train_set}_tp

# --- prerequisites ----------------------------------------------------------
for required in "data/${clean_train_set}/feats.scp" "${gmm_dir}/final.mdl"; do
  [ -f "$required" ] && continue
  echo "$0: expected file $required to exist"
  exit 1
done

# --- feature extraction -----------------------------------------------------
if [[ ${stage} -le 1 ]]; then
  noise_data=data/${noise_tempo_aug_set}
  echo "$0: making MFCC featuresfor low-resolution speed-then-noise-perturbed data"
  steps/make_mfcc.sh --cmd "$train_cmd" --nj 32 "$noise_data" || exit 1
  steps/compute_cmvn_stats.sh "$noise_data" || exit 1
  utils/fix_data_dir.sh "$noise_data"
fi

### Mixing music

In [8]:
%%time
%%bash

# Mix background MUSAN music into the tempo-perturbed training set:
# data/${tempo_aug_set} -> data/${music_tempo_aug_set}.

# --- defaults ---------------------------------------------------------------
stage=0
gmm=tri4
nnet3_affix=
clean_train_set=train_nodev
tempo_aug_set=${clean_train_set}_tp
music_tempo_aug_set=${tempo_aug_set}_music
train_set=
dev_set=
clean_test_sets="eval1 eval2 eval3"
test_sets="noisy_eval1 noisy_eval2 noisy_eval3"

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

set -euo pipefail

# Use the dev split when it exists.
[ ! -e data/train_dev ] || dev_set=train_dev

gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${clean_train_set}_tp

# --- prerequisites ----------------------------------------------------------
for required in "data/${clean_train_set}/feats.scp" "${gmm_dir}/final.mdl"; do
  [ -f "$required" ] && continue
  echo "$0: expected file $required to exist"
  exit 1
done

# --- augmentation -----------------------------------------------------------
if [[ ${stage} -le 1 ]]; then
  # Augment with musan_music; utterances get the "music" prefix and the
  # speaker ids are modified as well.
  steps/data/augment_data_dir.py \
    --utt-prefix "music" --modify-spk-id true \
    --bg-snrs "25:20:15:10:5" --num-bg-noises "1" \
    --bg-noise-dir "data/musan_music" \
    "data/${tempo_aug_set}" "data/${music_tempo_aug_set}"
fi

steps/data/augment_data_dir.py --utt-prefix music --modify-spk-id true --bg-snrs 25:20:15:10:5 --num-bg-noises 1 --bg-noise-dir data/musan_music data/train_nodev_tp data/train_nodev_tp_music


  import sys, random, argparse, os, imp


CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 3min 30s


In [None]:
%%time
%%bash

# Extract low-resolution MFCCs and CMVN statistics for the music-augmented,
# tempo-perturbed set (data/${music_tempo_aug_set}) and clean up the data dir.

# --- defaults ---------------------------------------------------------------
stage=0
gmm=tri4
nnet3_affix=
clean_train_set=train_nodev
tempo_aug_set=${clean_train_set}_tp
music_tempo_aug_set=${tempo_aug_set}_music
train_set=
dev_set=
clean_test_sets="eval1 eval2 eval3"
test_sets="noisy_eval1 noisy_eval2 noisy_eval3"

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

set -euo pipefail

# Use the dev split when it exists.
[ ! -e data/train_dev ] || dev_set=train_dev

gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${clean_train_set}_tp

# --- prerequisites ----------------------------------------------------------
for required in "data/${clean_train_set}/feats.scp" "${gmm_dir}/final.mdl"; do
  [ -f "$required" ] && continue
  echo "$0: expected file $required to exist"
  exit 1
done

# --- feature extraction -----------------------------------------------------
if [[ ${stage} -le 1 ]]; then
  music_data=data/${music_tempo_aug_set}
  echo "$0: making MFCC featuresfor low-resolution speed-then-noise-perturbed data"
  steps/make_mfcc.sh --cmd "$train_cmd" --nj 32 "$music_data" || exit 1
  steps/compute_cmvn_stats.sh "$music_data" || exit 1
  utils/fix_data_dir.sh "$music_data"
fi

### Babble noise

In [9]:
%%time
%%bash

# Mix several background MUSAN speech recordings ("babble") into the
# tempo-perturbed training set:
# data/${tempo_aug_set} -> data/${babble_tempo_aug_set}.

# --- defaults ---------------------------------------------------------------
stage=0
gmm=tri4
nnet3_affix=
clean_train_set=train_nodev
tempo_aug_set=${clean_train_set}_tp
babble_tempo_aug_set=${tempo_aug_set}_babble
train_set=
dev_set=
clean_test_sets="eval1 eval2 eval3"
test_sets="noisy_eval1 noisy_eval2 noisy_eval3"

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

set -euo pipefail

# Use the dev split when it exists.
[ ! -e data/train_dev ] || dev_set=train_dev

gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${clean_train_set}_tp

# --- prerequisites ----------------------------------------------------------
for required in "data/${clean_train_set}/feats.scp" "${gmm_dir}/final.mdl"; do
  [ -f "$required" ] && continue
  echo "$0: expected file $required to exist"
  exit 1
done

# --- augmentation -----------------------------------------------------------
if [[ ${stage} -le 1 ]]; then
  # Start from scratch: remove any previous babble directory first.
  rm -fr "data/${babble_tempo_aug_set}"

  # Augment with musan_speech; utterances get the "babble" prefix and the
  # speaker ids are modified as well.
  steps/data/augment_data_dir.py \
    --utt-prefix "babble" --modify-spk-id true \
    --bg-snrs "30:25:20:17:15" --num-bg-noises "7:8:9:10:11" \
    --bg-noise-dir "data/musan_speech" \
    "data/${tempo_aug_set}" "data/${babble_tempo_aug_set}"
fi

steps/data/augment_data_dir.py --utt-prefix babble --modify-spk-id true --bg-snrs 30:25:20:17:15 --num-bg-noises 7:8:9:10:11 --bg-noise-dir data/musan_speech data/train_nodev_tp data/train_nodev_tp_babble


  import sys, random, argparse, os, imp


CPU times: user 16 ms, sys: 8 ms, total: 24 ms
Wall time: 4min 35s


### Just for reference

In [None]:
%%time
%%bash

# Reference variant: augment the tempo-perturbed set with MUSAN noise, music
# and babble (suffix-based utterance naming) and combine the three results
# into data/${train_set}.

stage=0
clean_train_set=train_nodev
# Destination of the combined augmented data. This used to be empty, which
# made combine_data.sh target the top-level "data/" directory; the name below
# is the one the following reference cells use for train_set.
train_set=${clean_train_set}_tp_aug
dev_set=
clean_test_sets="eval1 eval2 eval3"
test_sets="noisy_eval1 noisy_eval2 noisy_eval3"
gmm=tri4

nnet3_affix=

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

set -euo pipefail

if [ -e data/train_dev ]; then
  dev_set=train_dev
fi

gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${clean_train_set}_tp

for f in "data/${clean_train_set}/feats.scp" "${gmm_dir}/final.mdl"; do
  if [ ! -f "$f" ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

if [ "$stage" -le 1 ]; then
  # Augment with musan_noise
  steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" \
    --fg-noise-dir "data/musan_noise" "data/${clean_train_set}_tp" "data/${clean_train_set}_tp_noise"

  # Augment with musan_music
  steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" \
    --bg-noise-dir "data/musan_music" "data/${clean_train_set}_tp" "data/${clean_train_set}_tp_music"

  # Augment with musan_speech
  steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" \
    --bg-noise-dir "data/musan_speech" "data/${clean_train_set}_tp" "data/${clean_train_set}_tp_babble"

  # Combine noise, music, and babble into one directory (the reverberated
  # data is NOT part of this combination).
  utils/combine_data.sh "data/${train_set}" "data/${clean_train_set}_tp_noise" "data/${clean_train_set}_tp_music" \
    "data/${clean_train_set}_tp_babble"
  echo "Finished combining everything"
  echo ""
fi

In [None]:
%%time
%%bash

# Reference variant: extract low-resolution MFCC + CMVN for
# data/${clean_train_set}_tp_reverb, then combine the tempo-perturbed, the
# augmented (data/${train_set}) and the reverberated data into
# data/train_robust.

# --- defaults ---------------------------------------------------------------
stage=0
gmm=tri4
nnet3_affix=
clean_train_set=train_nodev
train_set=${clean_train_set}_tp_aug
dev_set=
clean_test_sets="eval1 eval2 eval3"
test_sets="noisy_eval1 noisy_eval2 noisy_eval3"

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

set -euo pipefail

# Use the dev split when it exists.
[ ! -e data/train_dev ] || dev_set=train_dev

gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${clean_train_set}_tp

# --- prerequisites ----------------------------------------------------------
for required in "data/${clean_train_set}/feats.scp" "${gmm_dir}/final.mdl"; do
  [ -f "$required" ] && continue
  echo "$0: expected file $required to exist"
  exit 1
done

# --- features + combination -------------------------------------------------
if [[ ${stage} -le 1 ]]; then
  reverb_data=data/${clean_train_set}_tp_reverb
  echo "$0: making MFCC featuresfor low-resolution speed-then-reverb-perturbed data"
  steps/make_mfcc.sh --cmd "$train_cmd" --nj 24 "$reverb_data" || exit 1
  steps/compute_cmvn_stats.sh "$reverb_data" || exit 1
  utils/fix_data_dir.sh "$reverb_data"

  # Combine the clean and augmented subsets. This is now roughly quadruple
  # the size of the original clean list.
  utils/combine_data.sh data/train_robust \
    "data/${clean_train_set}_tp" "data/${train_set}" "$reverb_data"
  # NOTE(review): calcdur is not defined in this notebook; presumably a
  # helper on PATH that reports the total duration -- confirm.
  calcdur data/train_robust/segments
fi


In [None]:
%%time
%%bash

# Reference variant: combine the clean tempo-perturbed data, the augmented
# data (data/${train_set}) and the reverberated data into data/train_robust,
# then fix up the resulting data directory.

# --- defaults ---------------------------------------------------------------
stage=0
gmm=tri4
nnet3_affix=
clean_train_set=train_nodev
train_set=${clean_train_set}_tp_aug
dev_set=
clean_test_sets="eval1 eval2 eval3"
test_sets="noisy_eval1 noisy_eval2 noisy_eval3"

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

# Strict mode is deliberately left disabled in this cell.
#set -euo pipefail

# Use the dev split when it exists.
if [[ -e data/train_dev ]]; then
  dev_set=train_dev
fi

gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${clean_train_set}_tp

# --- prerequisites ----------------------------------------------------------
for required in "data/${clean_train_set}/feats.scp" "${gmm_dir}/final.mdl"; do
  if [[ ! -f $required ]]; then
    echo "$0: expected file $required to exist"
    exit 1
  fi
done

# --- combination ------------------------------------------------------------
export LC_ALL=C
if [[ ${stage} -le 1 ]]; then
  #rm -fr data/train_robust

  # (disabled) fix speakers for the augmentation case, turn noisy speech into
  # their own "speakers":
  #cat data/${train_set}/utt2spk_BACKUP | sed 's/.*\(babble\|music\|noise\).*/&-\1/g' | sort --parallel=8 \
  #  > data/${train_set}/utt2spk
  #sort -k 2 data/${train_set}/utt2spk | utils/utt2spk_to_spk2utt.pl > data/${train_set}/spk2utt || exit 1;
  # because of this, we have to recompute the cmvn stats
  #steps/compute_cmvn_stats.sh data/${train_set} || exit 1;
  #utils/fix_data_dir.sh data/${train_set}

  # Combine the clean and augmented subsets. This is now roughly quadruple
  # the size of the original clean list.
  utils/combine_data.sh data/train_robust \
    "data/${clean_train_set}_tp_clean" \
    "data/${train_set}" \
    "data/${clean_train_set}_tp_reverb"
  utils/fix_data_dir.sh data/train_robust
fi


In [None]:
# Take one third of the hires training utterances as a subset.
# NOTE(review): train_set and temp_data_root are not defined in this cell and
# the cell has no %%bash magic -- it looks like a pasted fragment; confirm.
hires_dir=data/${train_set}_hires
num_utts_total=$(wc -l < "${hires_dir}/utt2spk")
num_utts=$(( num_utts_total / 3 ))
utils/data/subset_data_dir.sh "${hires_dir}" \
  $num_utts "${temp_data_root}/${train_set}_hires_subset"

### Subset decisions

In [None]:
%%time
%%bash

# Build the final training sets from the tempo-perturbed data and its four
# augmented variants:
#   * drop all utterances whose text line contains "sp0.8-" ("_noslow"),
#   * data/train_robust       = all "_noslow" sets combined,
#   * data/train_robust_small = fixed-size subsets ("_div") combined,
#   * finally remove every intermediate data/*noslow* directory.

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

clean_train_set=train_nodev
tempo_aug_set=${clean_train_set}_tp
base=data/${tempo_aug_set}

# Filter out the sp0.8 utterances from the plain set and every variant.
for variant in "" _rvb _noise _music _babble; do
  settype=${base}${variant}
  grep -v 'sp0.8-' "${settype}/text" | cut -d' ' -f1 > "${settype}/uttlist_noslow"
  utils/data/subset_data_dir.sh --utt-list "${settype}/uttlist_noslow" "${settype}" "${settype}_noslow"
done

# Size budget for the small set: 5/2 of one filtered set.
num_utts_per_set=$(wc -l < "${base}_noslow/utt2spk")
num_utts_wanted=$(( num_utts_per_set * 5 / 2 ))

# special treatment for clean and random noise: 3/16 and 1/16 of the budget
utils/data/subset_data_dir.sh "${base}_noslow" $(( num_utts_wanted * 3 / 16 )) \
  "${base}_noslow_div"
utils/data/subset_data_dir.sh "${base}_noise_noslow" $(( num_utts_wanted / 16 )) \
  "${base}_noise_noslow_div"
# reverb, music and babble get a quarter of the budget each
for cortype in rvb music babble; do
  utils/data/subset_data_dir.sh "${base}_${cortype}_noslow" $(( num_utts_wanted / 4 )) \
    "${base}_${cortype}_noslow_div"
done

utils/combine_data.sh data/train_robust \
  "${base}_noslow" "${base}_rvb_noslow" \
  "${base}_noise_noslow" "${base}_music_noslow" "${base}_babble_noslow"


utils/combine_data.sh data/train_robust_small \
  "${base}_noslow_div" "${base}_rvb_noslow_div" "${base}_noise_noslow_div" \
  "${base}_music_noslow_div" "${base}_babble_noslow_div"


# Clean up all intermediate directories (both _noslow and _noslow_div).
rm -r data/*noslow*

In [None]:
%%time
%%bash

# fixed babble, not redoing ivector extractor training
#
# Re-create the "_noslow" subsets (utterances whose text line does not
# contain "sp0.8-") and combine them into data/train_robust again.

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

clean_train_set=train_nodev
tempo_aug_set=${clean_train_set}_tp
base=data/${tempo_aug_set}

for variant in "" _rvb _noise _music _babble; do
  settype=${base}${variant}
  grep -v 'sp0.8-' "${settype}/text" | cut -d' ' -f1 > "${settype}/uttlist_noslow"
  utils/data/subset_data_dir.sh --utt-list "${settype}/uttlist_noslow" "${settype}" "${settype}_noslow"
done

utils/combine_data.sh data/train_robust \
  "${base}_noslow" "${base}_rvb_noslow" \
  "${base}_noise_noslow" "${base}_music_noslow" "${base}_babble_noslow"


In [None]:
%%time
%%bash

# cheat set creation
#
# Builds data/${cheat_set}: the clean "_noslow" data plus four copies of that
# same clean data which merely carry the ids of the reverb / noise / music /
# babble sets (prefixes "reverb1-", "noise-", "music-", "babble-").

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

clean_train_set=train_nodev
tempo_aug_set=${clean_train_set}_tp
fake_name=fake_tp
cheat_set=cheat_set_for_lats

clean_dir=data/${tempo_aug_set}_noslow
fake_rvb_dir=data/${fake_name}_rvb_noslow

# there's a discrepancy between rvb's prefix "reverb1-" so we're isolating
utils/copy_data_dir.sh --spk-prefix "reverb1-" --utt-prefix "reverb1-" \
  "$clean_dir" "$fake_rvb_dir"

# full fakery: prefix the recording ids as well
# reco2dur: prefix the whole line (i.e. the leading recording id)
awk '{print "reverb1-"$0}' "$fake_rvb_dir/reco2dur" > "$fake_rvb_dir/reco2dur_fake"
mv "$fake_rvb_dir/reco2dur_fake" "$fake_rvb_dir/reco2dur"
# segments: keep field 1, prefix the recording id in field 2
awk '{print $1" reverb1-"$2" "$3" "$4}' "$fake_rvb_dir/segments" \
  > "$fake_rvb_dir/segments_fake"
mv "$fake_rvb_dir/segments_fake" "$fake_rvb_dir/segments"
# wav.scp: prefix the whole line (i.e. the leading recording id)
awk '{print "reverb1-"$0}' "$fake_rvb_dir/wav.scp" > "$fake_rvb_dir/wav.scp_fake"
mv "$fake_rvb_dir/wav.scp_fake" "$fake_rvb_dir/wav.scp"

# musan: same trick, derived from the fake reverb files by renaming the prefix
for settype in noise music babble; do
  fake_dir=data/${fake_name}_${settype}_noslow
  utils/copy_data_dir.sh --spk-prefix ${settype}- --utt-prefix ${settype}- \
    "$clean_dir" "$fake_dir"
  for orig in reco2dur segments wav.scp; do
    sed "s/reverb1/$settype/g" "$fake_rvb_dir/$orig" \
      > "$fake_dir/${orig}_fake"
    mv "$fake_dir/${orig}_fake" "$fake_dir/$orig"
  done
done

utils/combine_data.sh "data/${cheat_set}" "$clean_dir" "$fake_rvb_dir" \
  "data/${fake_name}_noise_noslow" "data/${fake_name}_music_noslow" "data/${fake_name}_babble_noslow"

### iVector common starts here ###

In [None]:
%%time
%%bash

# fMLLR-align data/${cheat_set} with the ${gmm} model; the alignments are
# written to exp/${gmm}_ali_${train_set}.

#set -euo pipefail

# --- defaults ---------------------------------------------------------------
stage=0
gmm=tri4
train_set=train_robust
cheat_set=cheat_set_for_lats

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${train_set}

if [[ ${stage} -le 2 ]]; then
  echo "$0: aligning with the perturbed low-resolution data"
  steps/align_fmllr.sh --nj 32 --cmd "$train_cmd" \
    "data/${cheat_set}" data/lang "$gmm_dir" "$ali_dir" || exit 1
fi

In [None]:
%%time
%%bash

# Train the SAT system exp/tri5 on data/${cheat_set}, starting from the
# alignments in exp/${gmm}_ali_${train_set}.

#set -euo pipefail

# --- defaults ---------------------------------------------------------------
stage=0
gmm=tri4
train_set=train_robust
cheat_set=cheat_set_for_lats

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${train_set}

if [[ ${stage} -le 3 ]]; then
  # The two numeric arguments (13000, 300000) are passed to train_sat.sh
  # as-is, ahead of the data, lang, alignment and output directories.
  steps/train_sat.sh --cmd "$train_cmd" \
    13000 300000 "data/${cheat_set}" data/lang "$ali_dir" exp/tri5
fi

In [None]:
%%time
%%bash

# Create the high-resolution copy of the training data
# (data/${train_set}_hires): copy the data dir, apply volume perturbation,
# then extract MFCCs with conf/mfcc_hires.conf and compute CMVN statistics.

#set -euo pipefail

# --- defaults ---------------------------------------------------------------
stage=0
gmm=tri5
train_set=train_robust
dev_set=
test_sets=

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

if [[ ${stage} -le 3 ]]; then
  # Create high-resolution MFCC features (with 40 cepstra instead of 13).
  echo "$0: creating high-resolution MFCC features"
  mfccdir=data/${train_set}_hires/data

  for datadir in ${train_set}; do
    utils/copy_data_dir.sh "data/$datadir" "data/${datadir}_hires"
  done

  # Volume-perturb the training data prior to extracting hires features;
  # this helps make trained nnets more invariant to test data volume.
  utils/data/perturb_data_dir_volume.sh "data/${train_set}_hires" || exit 1

  # generate high-resolution MFCC features
  for datadir in ${train_set}; do
    hires_dir=data/${datadir}_hires
    steps/make_mfcc.sh --nj 32 --mfcc-config conf/mfcc_hires.conf \
      --cmd "$train_cmd" "$hires_dir" || exit 1
    steps/compute_cmvn_stats.sh "$hires_dir" || exit 1
    utils/fix_data_dir.sh "$hires_dir" || exit 1
  done
fi

In [None]:
%%time
%%bash

# Compute the PCA transform and train the diagonal UBM used by the iVector
# extractor, both on data/${train_set}_small_hires.

#set -euo pipefail

stage=0
train_set=train_robust
dev_set=
test_sets="eval1 eval2 eval3"
gmm=tri5

nnet3_affix=

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

if [ "$stage" -le 4 ]; then
  echo "$0: train the diagonal UBM."
  mkdir -p "exp/nnet3${nnet3_affix}/diag_ubm"

  # Disabled alternative: work on a subset created under diag_ubm/ instead of
  # data/${train_set}_small_hires. It is kept OUTSIDE the commands below on
  # purpose: a "#" line inside a backslash-continued command ends that
  # command, so the remaining arguments were run as a separate command.
  #temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm
  #num_utts_total=$(wc -l <data/${train_set}_hires/utt2spk)
  #num_utts=$[$num_utts_total]
  #utils/data/subset_data_dir.sh data/${train_set}_hires \
  #  $num_utts ${temp_data_root}/${train_set}_hires_subset
  # ...and then pass ${temp_data_root}/${train_set}_hires_subset as the data
  # directory of both commands below.

  echo "$0: computing a PCA transform from the hires data."
  steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \
    --splice-opts "--left-context=4 --right-context=4" \
    --max-utts 30000 --subsample 2 \
    "data/${train_set}_small_hires" \
    "exp/nnet3${nnet3_affix}/pca_transform"

  echo "$0: training the diagonal UBM."
  # Use 512 Gaussians in the UBM.
  # NOTE(review): the disabled variant also passed "--num-threads 8"; it was
  # part of the commented-out line and is therefore not passed here -- re-add
  # it before the data directory if it is wanted.
  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 32 \
    --num-frames 500000 \
    "data/${train_set}_small_hires" 512 \
    "exp/nnet3${nnet3_affix}/pca_transform" "exp/nnet3${nnet3_affix}/diag_ubm"
fi

In [None]:
%%time
%%bash

# Train the iVector extractor on data/${train_set}_small_hires using the
# diagonal UBM in exp/nnet3${nnet3_affix}/diag_ubm.

#set -euo pipefail

stage=0
train_set=train_robust
dev_set=
test_sets=
gmm=tri5

nnet3_affix=

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

if [ "$stage" -le 5 ]; then
  # Train the iVector extractor. Use all of the speed-perturbed data since iVector extractors
  # can be sensitive to the amount of data. The script defaults to an iVector dimension of 100
  # even though $nj is just 2 (10 on high end set ups), each job uses multiple processes and threads.
  echo "$0: training the iVector extractor"
  # Disabled alternative data directory (kept outside the command: a "#" line
  # inside a backslash-continued command ends that command, which left the
  # script without its positional arguments):
  #   exp/nnet3${nnet3_affix}/diag_ubm/${train_set}_hires_subset
  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 4 \
    "data/${train_set}_small_hires" "exp/nnet3${nnet3_affix}/diag_ubm" \
    "exp/nnet3${nnet3_affix}/extractor" || exit 1
fi

In [None]:
%%time
%%bash

# Extract online iVectors for the hires training data into
# exp/nnet3${nnet3_affix}/ivectors_${train_set}_hires.

#set -euo pipefail

# --- defaults ---------------------------------------------------------------
stage=0
gmm=tri5
nnet3_affix=
train_set=train_robust
dev_set=
test_sets="eval1 eval2 eval3"

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

if [[ ${stage} -le 6 ]]; then
  # We extract iVectors on the speed-perturbed training data after combining
  # short segments, which will be what we train the system on. With
  # --utts-per-spk-max 2, the script pairs the utterances into twos and
  # treats each of these pairs as one speaker; this gives more diversity in
  # iVectors. Note that these are extracted 'online'.

  # The 'max2' is not encoded in the name of the iVector dir even though
  # that's the data the iVectors are extracted from: it is still valid for
  # the non-'max2' data, since the utterance list is the same.
  ivectordir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_hires

  # Having a larger number of speakers is helpful for generalization, and to
  # handle per-utterance decoding well (iVector starts at zero).
  temp_data_root=${ivectordir}
  max2_dir=${temp_data_root}/${train_set}_hires_max2
  utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \
    "data/${train_set}_hires" "$max2_dir"

  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 24 \
    "$max2_dir" \
    "exp/nnet3${nnet3_affix}/extractor" "$ivectordir"

  # (disabled) Also extract iVectors for the test data, but in this case we
  # don't need the speed perturbation (sp).
  #for datadir in $dev_set $test_sets; do
  #  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 10 \
  #    data/${datadir}_hires exp/nnet3${nnet3_affix}/extractor \
  #    exp/nnet3${nnet3_affix}/ivectors_${datadir}_hires
  #done
fi

### Neural net training starts here

In [None]:
%%time
%%bash

# fMLLR-align data/${cheat_set} with the ${gmm} (tri5) model; the alignments
# are written to exp/${gmm}_ali_${train_set}.

#set -euo pipefail

# --- defaults ---------------------------------------------------------------
stage=0
gmm=tri5
train_set=train_robust
cheat_set=cheat_set_for_lats

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

gmm_dir=exp/${gmm}
ali_dir=exp/${gmm}_ali_${train_set}

if [[ ${stage} -le 2 ]]; then
  echo "$0: aligning with the perturbed low-resolution data"
  steps/align_fmllr.sh --nj 32 --cmd "$train_cmd" \
    "data/${cheat_set}" data/lang "$gmm_dir" "$ali_dir" || exit 1
fi

In [1]:
%%time
%%bash

# Sanity check before chain training: verify Kaldi was compiled with CUDA,
# derive all directory names and list the files the training depends on.

#set -euo pipefail

# Options shared with run_ivector_common.sh (some are also used directly).
gmm=tri5
train_set=train_robust
nnet3_affix=
# Configs specific to this script; most parameters are hardcoded in the
# commands themselves.
affix=7n  # affix for the TDNN directory name
tree_affix=

# End configuration section.
echo "$0 $@"  # Print the command line for logging

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

if ! cuda-compiled; then
  cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi


# Derived locations.
chain_root=exp/chain${nnet3_affix}
gmm_dir=exp/$gmm
ali_dir=exp/${gmm}_ali_${train_set}
tree_dir=${chain_root}/tree${tree_affix:+_$tree_affix}
lang=data/lang_chain
lat_dir=${chain_root}/${gmm}_${train_set}_lats
dir=${chain_root}/tdnn${affix}
train_data_dir=data/${train_set}_hires
lores_train_data_dir=data/${train_set}
train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_hires

# unfortunately, the usual existence check won't work on jupyter's bash magic:
#[ ! -f $fa ] && echo "$0; expected file $fa to exist" && exit 1;
# so every required file is simply listed instead.
for fa in \
  "$gmm_dir/final.mdl" \
  "$train_data_dir/feats.scp" \
  "$train_ivector_dir/ivector_online.scp" \
  "$lores_train_data_dir/feats.scp" \
  "$ali_dir/ali.1.gz"; do
  ls -ltrh "$fa"
done

bash 
lrwxrwxrwx 1 nobody nogroup 6 Jan 11 17:27 exp/tri5/final.mdl -> 35.mdl
-rw-r--r-- 1 nobody nogroup 1.2G Jan 11 08:15 data/train_robust_hires/feats.scp
-rw-r--r-- 1 nobody nogroup 1.1G Jan 11 13:51 exp/nnet3/ivectors_train_robust_hires/ivector_online.scp
-rw-r--r-- 1 nobody nogroup 1.2G Jan 11 02:58 data/train_robust/feats.scp
-rw-r--r-- 1 nobody nogroup 19M Jan 12 04:12 exp/tri5_ali_train_robust/ali.1.gz
CPU times: user 0 ns, sys: 16 ms, total: 16 ms
Wall time: 353 ms


In [None]:
%%time
%%bash

#set -euo pipefail

# Stage 9: align the low-resolution data with the GMM system and keep the
# result as lattices in $lat_dir.

# First the options that are passed through to run_ivector_common.sh
# (some of which are also used in this script directly).
stage=0
gmm=tri5
train_set=train_robust
cheat_set=cheat_set_for_lats
nnet3_affix=
# The rest are configs specific to this script. Most of the parameters
# are just hardcoded at this level, in the commands below.
affix=7n  # affix for the TDNN directory name
tree_affix=

# End configuration section.
echo "$0 $*"  # Print the command line for logging

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

# Abort early when Kaldi was built without CUDA support.
if ! cuda-compiled; then
  cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi


gmm_dir=exp/$gmm
ali_dir=exp/${gmm}_ali_${train_set}
tree_dir=exp/chain${nnet3_affix}/tree${tree_affix:+_$tree_affix}
lang=data/lang_chain
# NOTE(review): the lattice directory is named after $train_set although the
# data that is aligned below comes from $cheat_set -- confirm this is intended.
lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_lats
dir=exp/chain${nnet3_affix}/tdnn${affix}
train_data_dir=data/${train_set}_hires
#lores_train_data_dir=data/${train_set}
lores_train_data_dir=data/${cheat_set}
train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_hires

if [ "$stage" -le 9 ]; then
  # Get the alignments as lattices (gives the LF-MMI training more freedom).
  # use the same num-jobs as the alignments
  # Stop here on failure: with `set -e` disabled the cleanup below would
  # otherwise still run and its exit status would become the cell's status.
  steps/align_fmllr_lats.sh --nj 36 --cmd "$train_cmd" "${lores_train_data_dir}" \
    data/lang "$gmm_dir" "$lat_dir" || exit 1;
  rm "$lat_dir"/fsts.*.gz # save space
fi

In [None]:
%%time
%%bash

#set -euo pipefail

# Stage 10: derive data/lang_chain from data/lang and give it the chain
# topology written by gen_topo.py.

# Options that are passed through to run_ivector_common.sh
# (some of them are also used directly here).
stage=0

# End configuration section.
echo "$0 $@"  # Print the command line for logging

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

# GPU build check: print the explanation and stop if CUDA is not compiled in.
if ! cuda-compiled; then
  cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi

lang=data/lang_chain

if [ "$stage" -le 10 ]; then
  # Create a version of the lang/ directory that has one state per phone in
  # the topo file. [note, it really has two states.. the first one is only
  # repeated once, the second one has zero or more repeats.]
  # Start from a fresh copy of data/lang every time.
  rm -rf "$lang"
  cp -r data/lang "$lang"

  # Phone-id lists read from the copied lang directory.
  sil_csl=$(cat "$lang/phones/silence.csl") || exit 1;
  nonsil_csl=$(cat "$lang/phones/nonsilence.csl") || exit 1;

  # Use our special topology... note that later on may have to tune this
  # topology.
  steps/nnet3/chain/gen_topo.py $nonsil_csl $sil_csl >"$lang/topo"
fi


In [None]:
%%time
%%bash

#set -euo pipefail

# Stage 11: build the decision tree for the chain model.
# The assignments below repeat the configuration header used by the other
# chain cells; this stage only consumes part of it.

# --- options passed through to run_ivector_common.sh
# --- (some of which are also used in this script directly)
stage=0
decode_nj=10
train_set=train_robust
cheat_set=cheat_set_for_lats
dev_set=
test_sets="eval1 eval2 eval3"
gmm=tri5
nnet3_affix=

# --- configs specific to this script; most parameters are simply hardcoded
# --- in the commands below
affix=7n  # affix for the TDNN directory name
tree_affix=
train_stage=-10
get_egs_stage=-10
decode_iter=

# --- training options / training chunk-options
num_epochs=10
initial_effective_lrate=0.001
final_effective_lrate=0.0001
leftmost_questions_truncate=-1
max_param_change=2.0
final_layer_normalize_target=0.5
num_jobs_initial=1
num_jobs_final=3
minibatch_size=128,64
#frames_per_eg=150,140,100
frames_per_eg=75,70,50
remove_egs=false
common_egs_dir=
xent_regularize=0.1

test_online_decoding=false  # if true, it will run the last decoding stage.

# End configuration section.
echo "$0 $@"  # Print the command line for logging

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

# Directories derived from the configuration.
gmm_dir=exp/$gmm
ali_dir=exp/${gmm}_ali_${train_set}
tree_dir=exp/chain${nnet3_affix}/tree${tree_affix:+_$tree_affix}
lang=data/lang_chain
lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_lats
dir=exp/chain${nnet3_affix}/tdnn${affix}
train_data_dir=data/${train_set}_hires
# The low-resolution data comes from the cheat set here rather than from
# data/${train_set}.
#lores_train_data_dir=data/${train_set}
lores_train_data_dir=data/${cheat_set}
train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_hires

if [ "$stage" -le 11 ]; then
  # Build a tree using our new topology. This is the critically different
  # step compared with other recipes.
  steps/nnet3/chain/build_tree.sh \
      --frame-subsampling-factor 3 \
      --leftmost-questions-truncate "$leftmost_questions_truncate" \
      --context-opts "--context-width=2 --central-position=1" \
      --cmd "$train_cmd" \
      11000 "${lores_train_data_dir}" "$lang" "$ali_dir" "$tree_dir"
fi

In [None]:
%%time
%%bash

#set -euo pipefail

# Stage 12: write the network description $dir/configs/network.xconfig and
# run the xconfig parser on it. The output layer dimension is read from the
# tree in $tree_dir.

# First the options that are passed through to run_ivector_common.sh
# (some of which are also used in this script directly).
stage=0
decode_nj=10
train_set=train_robust
dev_set=
test_sets=
gmm=tri5
nnet3_affix=

# The rest are configs specific to this script. Most of the parameters
# are just hardcoded at this level, in the commands below.
affix=7n_robust  # affix for the TDNN directory name
tree_affix=
train_stage=-10
get_egs_stage=-10
decode_iter=

# training options
# training chunk-options
num_epochs=10
initial_effective_lrate=0.001
final_effective_lrate=0.0001
leftmost_questions_truncate=-1
max_param_change=2.0
final_layer_normalize_target=0.5
num_jobs_initial=1
num_jobs_final=3
#minibatch_size=128,64
minibatch_size=256,128
#frames_per_eg=150,140,100
frames_per_eg=75,70,50
remove_egs=false
common_egs_dir=
xent_regularize=0.1

test_online_decoding=false  # if true, it will run the last decoding stage.

# End configuration section.
echo "$0 $*"  # Print the command line for logging

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

gmm_dir=exp/$gmm
ali_dir=exp/${gmm}_ali_${train_set}
tree_dir=exp/chain${nnet3_affix}/tree${tree_affix:+_$tree_affix}
lang=data/lang_chain
lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_lats
dir=exp/chain${nnet3_affix}/tdnn${affix}
train_data_dir=data/${train_set}_hires
lores_train_data_dir=data/${train_set}
train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_hires

if [ "$stage" -le 12 ]; then
  echo "$0: creating neural net configs using the xconfig parser";

  # Second field of the "num-pdfs" line printed by tree-info.
  num_targets=$(tree-info "$tree_dir/tree" | awk '/num-pdfs/ {print $2}')
  if [ -z "$num_targets" ]; then
    # Without this check an empty value would be written as "dim=" below.
    echo "$0: could not read num-pdfs from $tree_dir/tree" >&2
    exit 1
  fi

  # The original piped a Python-2-only print statement into "python2".
  # print(...) with a single argument gives the same text under Python 2
  # and 3, so use whichever interpreter is available.
  python_bin=$(command -v python3 || command -v python || command -v python2) || {
    echo "$0: no python interpreter found on PATH" >&2
    exit 1
  }
  learning_rate_factor=$("$python_bin" -c "print(0.5 / $xent_regularize)") || exit 1
  opts="l2-regularize=0.002"
  linear_opts="orthonormal-constraint=1.0"
  output_opts="l2-regularize=0.0005 bottleneck-dim=256"

  mkdir -p "$dir/configs"

  # The here-document is unquoted on purpose: $dir, $opts, $linear_opts,
  # $output_opts, $num_targets and $learning_rate_factor are expanded.
  cat <<EOF > $dir/configs/network.xconfig
  input dim=100 name=ivector
  input dim=40 name=input

  # please note that it is important to have input layer with the name=input
  # as the layer immediately preceding the fixed-affine-layer to enable
  # the use of short notation for the descriptor
  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat

  # the first splicing is moved before the lda layer, so no splicing here
  relu-batchnorm-layer name=tdnn1 $opts dim=1280
  linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0)
  relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280
  linear-component name=tdnn3l dim=256 $linear_opts
  relu-batchnorm-layer name=tdnn3 $opts dim=1280
  linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0)
  relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280
  linear-component name=tdnn5l dim=256 $linear_opts
  relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l)
  linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0)
  relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280
  linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0)
  relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280
  linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0)
  relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280
  linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0)
  relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280
  linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0)
  relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280
  linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0)
  relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280
  linear-component name=prefinal-l dim=256 $linear_opts

  relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280
  output-layer name=output include-log-softmax=false dim=$num_targets $output_opts

  relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280
  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
EOF
  steps/nnet3/xconfig_to_configs.py --xconfig-file "$dir/configs/network.xconfig" \
    --config-dir "$dir/configs/" || exit 1
fi

In [None]:
%%time
%%bash

#set -euo pipefail

# Stage 13: train the chain model in $dir with steps/nnet3/chain/train.py,
# using the features, i-vectors, tree and lattices derived below.

# First the options that are passed through to run_ivector_common.sh
# (some of which are also used in this script directly).
stage=0
decode_nj=10
train_set=train_robust
dev_set=
test_sets=
gmm=tri5
nnet3_affix=

# The rest are configs specific to this script. Most of the parameters
# are just hardcoded at this level, in the commands below.
# NOTE(review): the xconfig cell of this notebook writes its configs with
# affix=7n_robust, i.e. into a different $dir -- confirm which one is intended.
affix=7n  # affix for the TDNN directory name
tree_affix=
train_stage=-10
get_egs_stage=-10
decode_iter=

# training options
# training chunk-options
num_epochs=24
initial_effective_lrate=0.001
final_effective_lrate=0.0001
leftmost_questions_truncate=-1
max_param_change=2.0
final_layer_normalize_target=0.5
num_jobs_initial=1
num_jobs_final=1
# NOTE: minibatch_size is not consumed below; the train.py call passes the
# literal 128 to --trainer.num-chunk-per-minibatch.
#minibatch_size=128,64
minibatch_size=256,128
#frames_per_eg=150,140,100
frames_per_eg=75,70,50
remove_egs=false
common_egs_dir=
xent_regularize=0.1

test_online_decoding=false  # if true, it will run the last decoding stage.

# End configuration section.
echo "$0 $*"  # Print the command line for logging

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

gmm_dir=exp/$gmm
ali_dir=exp/${gmm}_ali_${train_set}
tree_dir=exp/chain${nnet3_affix}/tree${tree_affix:+_$tree_affix}
lang=data/lang_chain
lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_lats
dir=exp/chain${nnet3_affix}/tdnn${affix}
train_data_dir=data/${train_set}_hires
lores_train_data_dir=data/${train_set}
train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_hires

if [ "$stage" -le 13 ]; then
  # Fail with a clear message when the network description for this affix
  # has not been generated, instead of starting the trainer on a directory
  # without configs.
  if [ ! -f "$dir/configs/network.xconfig" ]; then
    echo "$0: expected $dir/configs/network.xconfig to exist;" \
      "generate the network configs for affix=$affix first" >&2
    exit 1
  fi

  # The learning rates and max-param-change are taken from the configuration
  # variables above (same values as the literals used previously).
  steps/nnet3/chain/train.py --stage "$train_stage" \
    --cmd "$train_cmd" \
    --feat.online-ivector-dir "$train_ivector_dir" \
    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
    --chain.xent-regularize "$xent_regularize" \
    --chain.leaky-hmm-coefficient 0.1 \
    --chain.l2-regularize 0.0 \
    --chain.apply-deriv-weights false \
    --chain.lm-opts="--num-extra-lm-states=2000" \
    --egs.dir "$common_egs_dir" \
    --egs.stage "$get_egs_stage" \
    --egs.opts "--frames-overlap-per-eg 0" \
    --egs.chunk-width "$frames_per_eg" \
    --trainer.num-chunk-per-minibatch 128 \
    --trainer.frames-per-iter 1500000 \
    --trainer.num-epochs "$num_epochs" \
    --trainer.optimization.num-jobs-initial "$num_jobs_initial" \
    --trainer.optimization.num-jobs-final "$num_jobs_final" \
    --trainer.optimization.initial-effective-lrate "$initial_effective_lrate" \
    --trainer.optimization.final-effective-lrate "$final_effective_lrate" \
    --trainer.max-param-change "$max_param_change" \
    --use-gpu=wait \
    --cleanup.remove-egs "$remove_egs" \
    --feat-dir "$train_data_dir" \
    --tree-dir "$tree_dir" \
    --lat-dir "$lat_dir" \
    --dir "$dir"  || exit 1;

fi


### Decoding graph creation

In [None]:
%%time
%%bash

# Assemble the dictionary directory data/exp/dict_bigtext: a cleaned, sorted,
# de-duplicated lexicon plus the phone lists copied from data/local/dict.

# Stop at the first failing step (including a failing pipeline stage) so that
# a missing input cannot leave a half-populated dictionary directory behind.
set -e -o pipefail

srcdict=data/exp/dict_bigtext/lexicon.txt
dir=data/exp/dict_bigtext

export LC_ALL=C

# Drop the <sp>/<unk> entries and blank lines, then sort and de-duplicate.
sed '/<sp>/d; /<unk>/d; /^[ ]*$/d' "$dir/ALL_dict" \
  | sort --parallel=8 -u > "$srcdict"


cat "$srcdict" > "$dir/lexicon1.txt" || exit 1;

#cat $dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \
#  grep -v SIL > $dir/nonsilence_phones.txt  || exit 1;
cp data/local/dict/nonsilence_phones.txt "$dir/nonsilence_phones.txt"

#( echo sil; echo spn; echo nsn; echo lau ) > $dir/silence_phones.txt
#( echo SIL ; echo SPN ; ) > $dir/silence_phones.txt
cp data/local/dict/silence_phones.txt "$dir/silence_phones.txt"

#echo SIL > $dir/optional_silence.txt
cp data/local/dict/optional_silence.txt "$dir/optional_silence.txt"

# The extra questions are copied from data/local/dict rather than written as
# an empty file (the previous approach is kept commented out below).
#echo -n >$dir/extra_questions.txt
cp data/local/dict/extra_questions.txt "$dir/extra_questions.txt"

# Add to the lexicon the silences, noises etc.
( echo '<sp> SIL' ; echo '<unk> SPN'; ) | cat - "$dir/lexicon1.txt"  > "$dir/lexicon2.txt" || exit 1;

# Note: this overwrites $srcdict, which is the same file as $dir/lexicon.txt.
cp "$dir/lexicon2.txt" "$dir/lexicon.txt"

In [None]:
%%time
%%bash

# Rebuild the lang directory for the big-text dictionary from scratch.
. ./cmd.sh
. ./path.sh
set -e # exit on error

dict_dir=data/exp/dict_bigtext
lang_dir=data/exp/lang_bigtext

# Remove any previous result first.
rm -fr "$lang_dir"
utils/prepare_lang.sh --num-sil-states 4 \
  "$dict_dir" "<unk>" "$lang_dir/tmp" "$lang_dir"

In [None]:
%%time
%%bash

# Train an order-4 language model with SRILM's ngram-count on the text corpus,
# with the vocabulary limited to the words of the big-text lexicon.
. ./cmd.sh
. ./path.sh
set -e # exit on error

dir=data/exp/lm
text=data/TEXT/EVERYTHING_UNIQ
lexicon=data/exp/dict_bigtext/lexicon.txt
mkdir -p "$dir"
export LC_ALL=C

# Read the corpus through a redirection rather than `cat ... | gzip`: without
# pipefail a missing $text would only fail the `cat` stage, gzip would still
# succeed and an empty archive would be used for training.
gzip -c < "$text" > "$dir/train.all.gz"
#cut -d' ' -f2- $text | tail -n +$heldout_sent | gzip -c > $dir/train.gz
#cut -d' ' -f2- $text | head -n $heldout_sent > $dir/heldout

# Word list = first space-separated field of every lexicon line.
cut -d' ' -f1 "$lexicon" > "$dir/wordlist"

ngram-count -text "$dir/train.all.gz" -order 4 -limit-vocab -vocab "$dir/wordlist" \
  -unk -map-unk "<unk>" -kndiscount -interpolate -lm "$dir/fph.o4g.kn.gz"
#echo "PPL for CSJ LM:"
#ngram -unk -lm $dir/csj.o3g.kn.gz -ppl $dir/heldout
#ngram -unk -lm $dir/csj.o3g.kn.gz -ppl $dir/heldout -debug 2 >& $dir/ppl2

In [None]:
%%time
%%bash

# Combine the big-text lang directory with the 4-gram LM into
# data/exp/lang_nosilp_fph_4g using utils/format_lm_sri.sh.
. ./cmd.sh
. ./path.sh
set -e # exit on error

LM=data/exp/lm/fph.o4g.kn.gz
src_lang=data/exp/lang_bigtext
src_lexicon=data/exp/dict_bigtext/lexicon.txt
out_lang=data/exp/lang_nosilp_fph_4g

# We don't really need all these options for SRILM, since the LM training
# script does some of the same processing (e.g. -subset -tolower)
srilm_opts="-subset -prune-lowprobs -unk -tolower -order 4"

# Always start from an empty output directory.
rm -fr "$out_lang"
utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
  "$src_lang" "$LM" "$src_lexicon" "$out_lang"

In [1]:
%%time
%%bash

#set -euo pipefail

# Stage 14: compile the decoding graph $dir/graph_fph_4g from the lang
# directory data/exp/lang_nosilp_fph_4g and the model directory $dir.
# The assignments below repeat the configuration header used by the other
# chain cells; this stage only consumes part of it.

# --- options passed through to run_ivector_common.sh
# --- (some of which are also used in this script directly)
stage=0
decode_nj=10
train_set=train_robust
dev_set=
test_sets="eval1 eval2 eval3"
gmm=tri4
nnet3_affix=

# --- configs specific to this script; most parameters are simply hardcoded
# --- in the commands below
affix=7n_robust  # affix for the TDNN directory name
tree_affix=
train_stage=-10
get_egs_stage=-10
decode_iter=

# --- training options / training chunk-options
num_epochs=10
initial_effective_lrate=0.001
final_effective_lrate=0.0001
leftmost_questions_truncate=-1
max_param_change=2.0
final_layer_normalize_target=0.5
num_jobs_initial=1
num_jobs_final=3
minibatch_size=128,64
#frames_per_eg=150,140,100
frames_per_eg=75,70,50
remove_egs=false
common_egs_dir=
xent_regularize=0.1

test_online_decoding=false  # if true, it will run the last decoding stage.

# End configuration section.
echo "$0 $@"  # Print the command line for logging

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

# Directories derived from the configuration.
gmm_dir=exp/$gmm
ali_dir=exp/${gmm}_ali_${train_set}
tree_dir=exp/chain${nnet3_affix}/tree${tree_affix:+_$tree_affix}
lang=data/lang_chain
lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_lats
dir=exp/chain${nnet3_affix}/tdnn${affix}
train_data_dir=data/${train_set}_hires
lores_train_data_dir=data/${train_set}
train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_hires

if [ "$stage" -le 14 ]; then
  # Note: it might appear that the lang directory passed here is mismatched,
  # and it is as far as the 'topo' is concerned, but this script doesn't read
  # the 'topo' from the lang directory.
  utils/mkgraph.sh --self-loop-scale 1.0 \
    data/exp/lang_nosilp_fph_4g "$dir" "$dir/graph_fph_4g"
fi


bash 
0.291139 -0.660707
HCLGa is not stochastic
0.135011 -0.387397
[info]: final HCLG is not stochastic.


tree-info exp/chain/tdnn7n_robust/tree 
tree-info exp/chain/tdnn7n_robust/tree 
make-h-transducer --disambig-syms-out=exp/chain/tdnn7n_robust/graph_fph_4g/disambig_tid.int --transition-scale=1.0 data/exp/lang_nosilp_fph_4g/tmp/ilabels_2_1 exp/chain/tdnn7n_robust/tree exp/chain/tdnn7n_robust/final.mdl 
fstrmepslocal 
fsttablecompose exp/chain/tdnn7n_robust/graph_fph_4g/Ha.fst data/exp/lang_nosilp_fph_4g/tmp/CLG_2_1.fst 
fstminimizeencoded 
fstrmsymbols exp/chain/tdnn7n_robust/graph_fph_4g/disambig_tid.int 
fstdeterminizestar --use-log=true 
fstisstochastic exp/chain/tdnn7n_robust/graph_fph_4g/HCLGa.fst 
add-self-loops --self-loop-scale=1.0 --reorder=true exp/chain/tdnn7n_robust/final.mdl exp/chain/tdnn7n_robust/graph_fph_4g/HCLGa.fst 
fstisstochastic exp/chain/tdnn7n_robust/graph_fph_4g/HCLG.fst 


CPU times: user 24 ms, sys: 4 ms, total: 28 ms
Wall time: 5min 57s
