diff --git a/.gitattributes b/.gitattributes index 5a815654b4c..bede44edf8a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -15,4 +15,6 @@ windows/INSTALL* eol=native windows/NewGuidCmd.exe.config text eol=crlf windows/NewGuidCmd.exe binary +# Prevent git changing CR-LF to LF when archiving (patch requires CR-LF on Windows). +**/*.patch -text diff --git a/egs/swbd/s5c/local/chain/README.txt b/egs/swbd/s5c/local/chain/README.txt index 4df9eb2a2c5..71ab9f0fa45 100644 --- a/egs/swbd/s5c/local/chain/README.txt +++ b/egs/swbd/s5c/local/chain/README.txt @@ -6,5 +6,7 @@ ones to look at right now: 4f is a good jesus-layer system 4q is an improved TDNN with various bells and whistles from Vijay. 4r is a slightly-better jesus-layer system than 4f, with one more layer. + 5e is the best configuration run so far. + diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4v.sh b/egs/swbd/s5c/local/chain/run_tdnn_4v.sh index 6a85bde4653..9cdbfefb5a2 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_4v.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_4v.sh @@ -4,6 +4,15 @@ # from 1.0 to 2.0 because there is a lot of parameter change in the final xent # layer, and this limits the rate of change of the other layers. +#./compare_wer.sh 4r 4v +#System 4r 4v +#WER on train_dev(tg) 16.50 15.95 +#WER on train_dev(fg) 15.45 14.69 +#WER on eval2000(tg) 18.3 17.7 +#WER on eval2000(fg) 16.7 16.0 +#Final train prob -0.103652 -0.106646 -1.60775 +#Final valid prob -0.121105 -0.118631 -1.62832 + # _4r is as _4f, but one more hidden layer, and reducing context of existing # layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly # from 1500 to 1400. diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4w.sh b/egs/swbd/s5c/local/chain/run_tdnn_4w.sh index 62b87cccd06..6dd5c587f7a 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_4w.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_4w.sh @@ -1,6 +1,16 @@ #!/bin/bash -# _4w is as _4v, but doubling --xent-regularize to 0.2 +# _4w is as _4v, but doubling --xent-regularize to 0.2. WER seems consistently a +# bit worse, although final valid prob is very slightly better. + +#./compare_wer.sh 4v 4w +#System 4v 4w +#WER on train_dev(tg) 15.95 16.05 +#WER on train_dev(fg) 14.69 14.92 +#WER on eval2000(tg) 17.7 18.0 +#WER on eval2000(fg) 16.0 16.2 +#Final train prob -0.106646 -0.108816 +#Final valid prob -0.118631 -0.118254 # _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change # from 1.0 to 2.0 because there is a lot of parameter change in the final xent # layer, and this limits the rate of change of the other layers. diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4x.sh b/egs/swbd/s5c/local/chain/run_tdnn_4x.sh index cb04a39be51..0290e0bdbd5 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_4x.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_4x.sh @@ -1,7 +1,17 @@ #!/bin/bash # _4x is as _4u, but with --leaky-hmm-coefficient 0.2. Note: the -# ultimate baseline is 4f. +# ultimate baseline is 4f. It seems a little bit worse than 4u on average: (+0.2, +0.2, 0.0, -0.1). +# So I'm guessing the best value is around --leaky-hmm-coefficient 0.1. +# +# ./compare_wer.sh 4f 4u 4x +# System 4f 4u 4x +# WER on train_dev(tg) 16.83 16.47 16.63 +# WER on train_dev(fg) 15.73 15.23 15.42 +# WER on eval2000(tg) 18.4 18.4 18.4 +# WER on eval2000(fg) 16.6 16.7 16.6 +# Final train prob -0.105832 -0.118911 -0.130674 +# Final valid prob -0.123021 -0.135768 -0.146351 # _4u is as _4t, but with --leaky-hmm-coefficient 0.08. Note: the # ultimate baseline is 4f.
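[Note on the .gitattributes hunk at the top of this diff: "**/*.patch -text" unsets the text attribute for all .patch files, so git will not rewrite their CR-LF line endings at checkout or when the tree is exported with "git archive"; per the comment in that hunk, the Windows patches need their CR-LF endings preserved. A quick way to confirm the attribute is picked up (the path below is only a hypothetical example):
    git check-attr text -- windows/example_fix.patch
    # expected output: windows/example_fix.patch: text: unset
]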
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5a.sh b/egs/swbd/s5c/local/chain/run_tdnn_5a.sh new file mode 100755 index 00000000000..cd1de07a80d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5a.sh @@ -0,0 +1,401 @@ +#!/bin/bash + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. Very helpful (between 0.2% +# and 0.6%). + +#./compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
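+# (Editorial note, not part of the original log: the per-system deltas quoted in
+# these comparisons, e.g. the "0.2% better"/"0.2% worse" annotations for 2o->2y in
+# the table above, are just column-wise WER differences. A throwaway shell check,
+# using the 2o and 2y columns from that table:
+#   paste <(printf '17.24\n15.93\n18.7\n16.9\n') <(printf '16.99\n15.86\n18.9\n17.0\n') | awk '{printf "%+.2f ", $2-$1} END{print ""}'
+# which prints roughly -0.25 -0.07 +0.20 +0.10, i.e. the figures behind those
+# better/worse annotations.)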
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
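+# (Illustration added for the _p note a couple of paragraphs above: with the
+#  frames-overlap of 30 described there, each eg presumably gets 10 zero-weight
+#  frames at each edge, then a ramp up to 1.0 over the next 10 frames. Assuming a
+#  linear ramp, the first 25 per-frame deriv weights would look like:
+#    awk 'BEGIN{for(i=1;i<=25;i++){w=(i<=10)?0:(i<=20)?(i-10)/10:1; printf "%g ", w}; print ""}'
+#  -> 0 0 0 0 0 0 0 0 0 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1 1 1 1 1 1 )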
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5a # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
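+  # (Readability note added here, summarizing the header above: relative to 4w,
+  #  this 5a run keeps --xent-regularize 0.2 and re-uses the 2y egs via --egs-dir;
+  #  the only change is the wider Jesus layer, --jesus-forward-input-dim 400->500
+  #  and --jesus-forward-output-dim 1400->1800, passed in through --jesus-opts.)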
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5b.sh b/egs/swbd/s5c/local/chain/run_tdnn_5b.sh new file mode 100755 index 00000000000..7e44c10920e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5b.sh @@ -0,0 +1,404 @@ +#!/bin/bash + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5b # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
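+  # (Readability note added here, summarizing the header above: 5b is the same
+  #  configuration as 5a (Jesus dims 500/1800, --xent-regularize 0.2, re-used 2y
+  #  egs), with the single addition of --leaky-hmm-coefficient 0.1 below.)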
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5c.sh b/egs/swbd/s5c/local/chain/run_tdnn_5c.sh new file mode 100755 index 00000000000..93ebb59b16d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5c.sh @@ -0,0 +1,409 @@ +#!/bin/bash + +# _5c is as _4w, but changing --xent-regularize to 0.05, since 0.2 seemed to be +# worse than 0.1. +# It seems a little worse on average: WER change is (+0.3, +0.3, -0.2, +0.2). +#System 4w 5c +#WER on train_dev(tg) 16.05 16.35 +#WER on train_dev(fg) 14.92 15.21 +#WER on eval2000(tg) 18.0 17.8 +#WER on eval2000(fg) 16.2 16.4 +#Final train prob -0.108816 -0.107098 +#Final valid prob -0.118254 -0.118209 + +# _4w is as _4v, but doubling --xent-regularize to 0.2. WER seems consistently +# a bit worse (+0.1, +0.2, +0.3, +0.2), although final valid prob is very +# slightly better. + +#./compare_wer.sh 4v 4w +#System 4v 4w +#WER on train_dev(tg) 15.95 16.05 +#WER on train_dev(fg) 14.69 14.92 +#WER on eval2000(tg) 17.7 18.0 +#WER on eval2000(fg) 16.0 16.2 +#Final train prob -0.106646 -0.108816 +#Final valid prob -0.118631 -0.118254 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. 
Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5c # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
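+  # (Readability note added here, summarizing the header above: 5c drops back to
+  #  4w's Jesus dims of 400/1400 and only changes --xent-regularize, from 0.2 in
+  #  4w down to 0.05; --leaky-hmm-coefficient is not used in this run.)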
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.05 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5d.sh b/egs/swbd/s5c/local/chain/run_tdnn_5d.sh new file mode 100755 index 00000000000..8e6e9358003 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5d.sh @@ -0,0 +1,407 @@ +#!/bin/bash + +# _5d is as _5b, but increasing jesus-forward-input-dim from 500 to 600 and +# jesus-forward-output-dim from 1800 to 2000. + +# It's maybe slightly helpful: WER change is (-0.2, -0.2, 0, +0.1). +#./compare_wer.sh 5b 5d +#System 5b 5d +#WER on train_dev(tg) 15.51 15.29 +#WER on train_dev(fg) 14.39 14.17 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.7 +#Final train prob -0.112013 -0.107858 +#Final valid prob -0.130879 -0.128862 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. 
+ +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
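# [Editor's illustration; not part of this patch.] The --splice-indexes strings discussed
# above control how much acoustic context the network sees: the per-layer offsets
# compose, so the model's total left (right) context is roughly the sum of the most
# negative (most positive) offset at each layer. A quick check for the splicing used in
# these 5x runs:
#
splice_indexes="-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0"
left=0; right=0
for layer in $splice_indexes; do
  min=$(printf '%s\n' "$layer" | tr ',' '\n' | sort -n | head -n1)   # most negative offset
  max=$(printf '%s\n' "$layer" | tr ',' '\n' | sort -n | tail -n1)   # most positive offset
  left=$((left - min)); right=$((right + max))
done
echo "total left context: $left frames, total right context: $right frames"   # 17 and 12 here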
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
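# [Editor's illustration; not part of this patch.] The _p note above describes the egs
# edge handling: each chunk gets per-frame derivative weights that are zero for the
# first/last 10 frames and then ramp linearly up to 1.0 over the next 10. A toy sketch
# of that weight schedule for a 150-frame chunk (the real implementation lives in the
# egs-dumping/training code and may differ in detail):
#
awk -v n=150 'BEGIN {
  for (t = 0; t < n; t++) {
    d = (t < n - 1 - t) ? t : n - 1 - t;    # distance from the nearer chunk edge
    if (d < 10)       w = 0.0;              # zero derivative weight right at the edges
    else if (d < 20)  w = (d - 10) / 10.0;  # linear ramp towards 1.0
    else              w = 1.0;
    printf("%3d %.2f\n", t, w);
  }
}'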
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
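# [Editor's sketch; not part of this patch, and the corresponding part of the script is
# not visible above.] The "_sp will get added" note next to the dir= line presumably
# corresponds to something like the following near the top of the script (variable names
# here are my guess, not taken from the patch):
#
#   suffix=
#   if [ "$speed_perturb" == "true" ]; then suffix=_sp; fi
#   dir=${dir}$suffix                 # e.g. exp/chain/tdnn_5d -> exp/chain/tdnn_5d_sp
#   train_set=train_nodup$suffix      # would match the data/${train_set}_hires used below
#
# which would also be consistent with the exp/tri4_lats_nodup$suffix path passed to training.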
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 2000 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5e.sh b/egs/swbd/s5c/local/chain/run_tdnn_5e.sh new file mode 100755 index 00000000000..ed48b0673b8 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5e.sh @@ -0,0 +1,417 @@ +#!/bin/bash + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. 
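# [Editor's note; not part of this patch.] The "Final train prob" / "Final valid prob"
# rows in these comparison tables are the training diagnostics for the final model. If
# the log layout matches other nnet3 setups (an assumption on my part, not something
# stated in this patch), they can be read off with something like:
#
#   grep -H Overall exp/chain/tdnn_5e_sp/log/compute_prob_{train,valid}.final.log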
+ +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5e # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
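# [Editor's illustration; not part of this patch.] Because the script sources
# utils/parse_options.sh, any variable in the configuration section above can be
# overridden from the command line (dashes map to underscores). For example, to skip
# training and go straight to graph building and decoding with an existing model:
#
#   local/chain/run_tdnn_5e.sh --stage 13
#
# or to resume training from a particular iteration (the iteration number here is just
# illustrative):
#
#   local/chain/run_tdnn_5e.sh --stage 12 --train-stage 150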
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5f.sh b/egs/swbd/s5c/local/chain/run_tdnn_5f.sh new file mode 100755 index 00000000000..5fb1f0c445c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5f.sh @@ -0,0 +1,423 @@ +#!/bin/bash + +# _5f is as _5e, but making the 5b->5d change (increasing the +# number of parameters)-- increasing jesus-forward-output-dim from 1800 to 2000, +# and jesus-forward-input-dim from 500 to 600. + +# WER change is (-0.1, -0.2, +0.2, +0.1). So zero on average. +# This means 5e remains the best system so far. + +#./compare_wer.sh 5e 5f +#System 5e 5f +#WER on train_dev(tg) 15.43 15.35 +#WER on train_dev(fg) 14.32 14.15 +#WER on eval2000(tg) 17.3 17.5 +#WER on eval2000(fg) 15.5 15.6 +#Final train prob -0.110056 -0.10574 +#Final valid prob -0.129184 -0.128112 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.05 is better than 0.2 or 0.1). + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5f # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
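# [Editor's note; not part of this patch.] Like the other 5x runs, this script passes
# --egs-dir exp/chain/tdnn_2y_sp/egs below, i.e. it re-uses the examples dumped for the
# 2y run rather than dumping new ones; that is why the history above talks about keeping
# the layer contexts small enough to re-use the egs. If the egs directory carries an
# info/ subdirectory (an assumption; the file names below are not taken from this patch),
# a quick compatibility check would be:
#
#   for f in frames_per_eg left_context right_context; do
#     echo -n "$f: "; cat exp/chain/tdnn_2y_sp/egs/info/$f
#   done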
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 2000 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/wsj/s5/steps/nnet2/get_lda_block.sh b/egs/wsj/s5/steps/nnet2/get_lda_block.sh index c840e014250..7bd4ecf5647 100755 --- a/egs/wsj/s5/steps/nnet2/get_lda_block.sh +++ b/egs/wsj/s5/steps/nnet2/get_lda_block.sh @@ -104,7 +104,7 @@ while [ $[$cur_index+$block_size] -le $feat_dim ]; do echo >> $dir/indexes num_blocks=$[$num_blocks+1] cur_index=$[$cur_index+$block_shift] - if [ $[$cur_index+$block_size-1] -gt $feat_dim ]; then + if [ $[$cur_index+$block_size] -gt $feat_dim ]; then cur_index=$[$feat_dim-$block_size]; fi done diff --git a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh index 93588ffc874..f2af7d0fdcb 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh @@ -101,7 +101,7 @@ right_deriv_truncate= # number of time-steps to avoid using the deriv of, on th # End configuration section. 
-trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM +trap 'for pid in $(jobs -pr); do kill -TERM $pid; done' INT QUIT TERM echo "$0 $@" # Print the command line for logging @@ -497,7 +497,9 @@ while [ $x -lt $num_iters ]; do rm $dir/.error 2>/dev/null - ( # this sub-shell is so that when we "wait" below, + ( + trap 'for pid in $(jobs -pr); do kill -TERM $pid; done' INT QUIT TERM + # this sub-shell is so that when we "wait" below, # we only wait for the training jobs that we just spawned, # not the diagnostic jobs that we spawned above. diff --git a/src/base/kaldi-math.h b/src/base/kaldi-math.h index e28ddcc1a09..ac590a06a25 100644 --- a/src/base/kaldi-math.h +++ b/src/base/kaldi-math.h @@ -41,20 +41,19 @@ #endif #ifndef M_PI -# define M_PI 3.1415926535897932384626433832795 +#define M_PI 3.1415926535897932384626433832795 #endif #ifndef M_SQRT2 -# define M_SQRT2 1.4142135623730950488016887 +#define M_SQRT2 1.4142135623730950488016887 #endif - #ifndef M_2PI -# define M_2PI 6.283185307179586476925286766559005 +#define M_2PI 6.283185307179586476925286766559005 #endif #ifndef M_SQRT1_2 -# define M_SQRT1_2 0.7071067811865475244008443621048490 +#define M_SQRT1_2 0.7071067811865475244008443621048490 #endif #ifndef M_LOG_2PI @@ -65,6 +64,11 @@ #define M_LN2 0.693147180559945309417232121458 #endif +#ifndef M_LN10 +#define M_LN10 2.302585092994045684017991454684 +#endif + + #define KALDI_ISNAN std::isnan #define KALDI_ISINF std::isinf #define KALDI_ISFINITE(x) std::isfinite(x) diff --git a/src/chain/chain-datastruct.h b/src/chain/chain-datastruct.h index d3ffb913d78..52e388a3f2e 100644 --- a/src/chain/chain-datastruct.h +++ b/src/chain/chain-datastruct.h @@ -46,7 +46,7 @@ extern "C" { // Search for this in chain-kernels.cu for an explanation. 
- enum { kOccupationRescalingPowerOfTwo = 20, kThresholdingPowerOfTwo = 14 }; + enum { kThresholdingPowerOfTwo = 14 }; } diff --git a/src/chain/chain-den-graph.cc b/src/chain/chain-den-graph.cc index 7414bb5fd39..ceb61a550f0 100644 --- a/src/chain/chain-den-graph.cc +++ b/src/chain/chain-den-graph.cc @@ -139,87 +139,6 @@ void DenominatorGraph::SetInitialProbs(const fst::StdVectorFst &fst) { Vector avg_prob_float(avg_prob); initial_probs_ = avg_prob_float; - special_hmm_state_ = ComputeSpecialState(fst, avg_prob_float); -} - -int32 NumStatesThatCanReach(const fst::StdVectorFst &fst, - int32 dest_state) { - int32 num_states = fst.NumStates(), - num_states_can_reach = 0; - KALDI_ASSERT(dest_state >= 0 && dest_state < num_states); - std::vector can_reach(num_states, false); - std::vector > reverse_transitions(num_states); - for (int32 s = 0; s < num_states; s++) - for (fst::ArcIterator aiter(fst, s); !aiter.Done(); - aiter.Next()) - reverse_transitions[aiter.Value().nextstate].push_back(s); - std::vector queue; - can_reach[dest_state] = true; - queue.push_back(dest_state); - num_states_can_reach++; - while (!queue.empty()) { - int32 state = queue.back(); - queue.pop_back(); - std::vector::const_iterator iter = reverse_transitions[state].begin(), - end = reverse_transitions[state].end(); - for (; iter != end; ++iter) { - int32 prev_state = *iter; - if (!can_reach[prev_state]) { - can_reach[prev_state] = true; - queue.push_back(prev_state); - num_states_can_reach++; - } - } - } - KALDI_ASSERT(num_states_can_reach >= 1 && - num_states_can_reach <= num_states); - return num_states_can_reach; -} - - -int32 DenominatorGraph::ComputeSpecialState( - const fst::StdVectorFst &fst, - const Vector &initial_probs) { - int32 num_states = initial_probs.Dim(); - std::vector num_transitions_into(num_states, 0); - for (int32 s = 0; s < fst.NumStates(); s++) { - for (fst::ArcIterator aiter(fst, s); !aiter.Done(); - aiter.Next()) - num_transitions_into[aiter.Value().nextstate]++; - } - // this vector 'pairs' is a vector of pairs (-num-transitions-into-state, state). - std::vector > pairs(num_states); - for (int32 i = 0; i < num_states; i++) { - pairs[i].first = -num_transitions_into[i]; - pairs[i].second = i; - } - // the first element of each pair is the negative of the num-transitions, so - // when we sort, the highest num-transitions will be first. - std::sort(pairs.begin(), pairs.end()); - - // this threshold of 0.75 is pretty arbitrary. We reject any - // state if it can't be reached by 75% of all other states. - // In practice we think that states will either be reachable by - // almost-all states, or almost-none (e.g. states that are active - // only at utterance-beginning), so this threshold shouldn't - // be too critical. - int32 min_states_can_reach = 0.75 * num_states; - for (int32 i = 0; i < num_states; i++) { - int32 state = pairs[i].second; - int32 n = NumStatesThatCanReach(fst, state); - if (n < min_states_can_reach) { - KALDI_WARN << "Rejecting state " << state << " as a 'special' HMM state " - << "(for renormalization in fwd-bkwd), because it's only " - << "reachable by " << n << " out of " << num_states - << " states."; - } else { - return state; - } - } - KALDI_ERR << "Found no states that are reachable by at least " - << min_states_can_reach << " out of " << num_states - << " states. This is unexpected. 
Change the threshold"; - return -1; } void DenominatorGraph::GetNormalizationFst(const fst::StdVectorFst &ifst, @@ -271,6 +190,34 @@ void MinimizeAcceptorNoPush(fst::StdVectorFst *fst) { fst::Decode(fst, encoder); } +// This static function, used in CreateDenominatorFst, sorts an +// fst's states in decreasing order of number of transitions (into + out of) +// the state. The aim is to have states that have a lot of transitions +// either into them or out of them, be numbered earlier, so hopefully +// they will be scheduled first and won't delay the computation +static void SortOnTransitionCount(fst::StdVectorFst *fst) { + // negative_num_transitions[i] will contain (before sorting), the pair + // ( -(num-transitions-into(i) + num-transition-out-of(i)), i) + int32 num_states = fst->NumStates(); + std::vector > negative_num_transitions(num_states); + for (int32 i = 0; i < num_states; i++) { + negative_num_transitions[i].first = 0; + negative_num_transitions[i].second = i; + } + for (int32 i = 0; i < num_states; i++) { + for (fst::ArcIterator aiter(*fst, i); !aiter.Done(); + aiter.Next()) { + negative_num_transitions[i].first--; + negative_num_transitions[aiter.Value().nextstate].first--; + } + } + std::sort(negative_num_transitions.begin(), negative_num_transitions.end()); + std::vector order(num_states); + for (int32 i = 0; i < num_states; i++) + order[negative_num_transitions[i].second] = i; + fst::StateSort(fst, order); +} + void DenGraphMinimizeWrapper(fst::StdVectorFst *fst) { for (int32 i = 1; i <= 3; i++) { fst::PushSpecial(fst, fst::kDelta * 0.01); @@ -424,6 +371,8 @@ void CreateDenominatorFst(const ContextDependency &ctx_dep, DenGraphMinimizeWrapper(&transition_id_fst); + SortOnTransitionCount(&transition_id_fst); + *den_fst = transition_id_fst; CheckDenominatorFst(trans_model.NumPdfs(), *den_fst); PrintDenGraphStats(*den_fst); diff --git a/src/chain/chain-den-graph.h b/src/chain/chain-den-graph.h index 8e5ee39e4bd..b2510651f39 100644 --- a/src/chain/chain-den-graph.h +++ b/src/chain/chain-den-graph.h @@ -88,13 +88,6 @@ class DenominatorGraph { // Note: we renormalize each HMM-state to sum to one before doing this. const CuVector &InitialProbs() const; - // returns the index of the HMM-state that has the highest value in - // InitialProbs (and which we believe will always be reachable from most other - // states... later on we may check this more carefully [TODO]). - // It's used in getting the 'arbitrary_scale' value to keep the alphas - // in a good dynamic range. - int32 SpecialHmmState() const { return special_hmm_state_; } - // This function outputs a modifified version of the FST that was used to // build this object, that has an initial-state with epsilon transitions to // each state, with weight determined by initial_probs_; and has each original @@ -116,23 +109,15 @@ class DenominatorGraph { // functions called from the constructor void SetTransitions(const fst::StdVectorFst &fst, int32 num_pfds); - // work out the initial-probs and the 'special state' - // Note, there are no final-probs; we treat all states as final - // with probability one [we have a justification for this.. - // assuming it's roughly a well-normalized HMM, this makes sense; - // note that we train on chunks, so the beginning and end of a chunk - // appear at arbitrary points in the sequence. - // At both beginning and end of the chunk, we limit ourselves to - // only those pdf-ids that were allowed in the numerator sequence. + // work out the initial-probs. 
Note, there are no final-probs; we treat all + // states as final with probability one [we have a justification for this.. + // assuming it's roughly a well-normalized HMM, this makes sense; note that we + // train on chunks, so the beginning and end of a chunk appear at arbitrary + // points in the sequence. At both beginning and end of the chunk, we limit + // ourselves to only those pdf-ids that were allowed in the numerator + // sequence. void SetInitialProbs(const fst::StdVectorFst &fst); - // return a suitable 'special' HMM-state used for normalizing probabilities in - // the forward-backward. It has to have a reasonably high probability and be - // reachable from most of the graph. returns a suitable state-index - // that we can set special_hmm_state_ to. - int32 ComputeSpecialState(const fst::StdVectorFst &fst, - const Vector &initial_probs); - // forward_transitions_ is an array, indexed by hmm-state index, // of start and end indexes into the transition_ array, which // give us the set of transitions out of this state. @@ -152,23 +137,9 @@ class DenominatorGraph { // distribution of the HMM. This isn't too critical. CuVector initial_probs_; - // The index of a somewhat arbitrarily chosen HMM-state that we - // use for adjusting the alpha probabilities. It needs to be - // one that is reachable from all states (i.e. not a special - // state that's only reachable at sentence-start). We choose - // whichever one has the greatest initial-prob. It's set - // in SetInitialProbs(). - int32 special_hmm_state_; - int32 num_pdfs_; }; -// returns the number of states from which there is a path to -// 'dest_state'. Utility function used in selecting 'special' state -// for normalization of probabilities. -int32 NumStatesThatCanReach(const fst::StdVectorFst &fst, - int32 dest_state); - // Function that does acceptor minimization without weight pushing... // this is useful when constructing the denominator graph. diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index 80d51bc661f..258c33cd465 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -86,10 +86,7 @@ void DenominatorComputation::AlphaGeneralFrame(int32 t) { const DenominatorGraphTransition *transitions = den_graph_.Transitions(); int32 num_pdfs = exp_nnet_output_transposed_.NumRows(), num_hmm_states = den_graph_.NumStates(), - num_sequences = num_sequences_, - special_hmm_state = num_hmm_states; - // special_hmm_state now points to the alpha-sum quantity which is located - // in the sam place as the num_hmm_states'th hmm state would be. + num_sequences = num_sequences_; // 'probs' is the matrix of pseudo-likelihoods for frame t - 1. 
CuSubMatrix probs(exp_nnet_output_transposed_, 0, num_pdfs, @@ -103,8 +100,8 @@ void DenominatorComputation::AlphaGeneralFrame(int32 t) { dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); cuda_chain_hmm_forward(dimGrid, dimBlock, backward_transitions, transitions, - num_sequences, special_hmm_state, prob_data, - probs.Stride(), prev_alpha_dash, this_alpha); + num_sequences, prob_data, probs.Stride(), + prev_alpha_dash, this_alpha); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -126,15 +123,16 @@ void DenominatorComputation::AlphaGeneralFrame(int32 t) { this_prev_alpha = prev_alpha_dash[prev_hmm_state * num_sequences + s]; this_tot_alpha += this_prev_alpha * transition_prob * prob; } - // Let arbitrary_scale be the inverse of the alpha value for the - // hmm-state indexed special_hmm_state_ on the previous frame (for this - // sequence); we multiply this into all the transition-probabilities - // from the previous frame to this frame, in both the forward and - // backward passes, in order to keep the alphas in a good numeric range. - // This won't affect the posteriors, but when computing the total - // likelihood we'll need to compensate for it later on. + // Let arbitrary_scale be the inverse of the alpha-sum value that we + // store in the same place we'd store the alpha for the state numbered + // 'num_hmm_states'. We multiply this into all the + // transition-probabilities from the previous frame to this frame, in + // both the forward and backward passes, in order to keep the alphas in + // a good numeric range. This won't affect the posteriors, but when + // computing the total likelihood we'll need to compensate for it later + // on. BaseFloat arbitrary_scale = - 1.0 / prev_alpha_dash[special_hmm_state * num_sequences + s]; + 1.0 / prev_alpha_dash[num_hmm_states * num_sequences + s]; KALDI_ASSERT(this_tot_alpha - this_tot_alpha == 0); this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; } @@ -259,10 +257,7 @@ bool DenominatorComputation::Backward( *nnet_output_deriv, t * num_sequences_, chunk_frames * num_sequences_, 0, num_pdfs); - const BaseFloat occupation_arbitrary_factor_inv = - (1 << kOccupationRescalingPowerOfTwo); - output_deriv_part.AddMat(deriv_weight * occupation_arbitrary_factor_inv, - transposed_deriv_part, kTrans); + output_deriv_part.AddMat(deriv_weight, transposed_deriv_part, kTrans); if (t != 0) transposed_deriv_part.SetZero(); } @@ -310,8 +305,7 @@ void DenominatorComputation::BetaDashGeneralFrame(int32 t) { t_wrapped * num_sequences_, num_sequences_); int32 num_hmm_states = den_graph_.NumStates(), - num_sequences = num_sequences_, - special_hmm_state = num_hmm_states; + num_sequences = num_sequences_; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { @@ -319,10 +313,9 @@ void DenominatorComputation::BetaDashGeneralFrame(int32 t) { dim3 dimBlock(std::min(CU1DBLOCK, num_sequences), 1, 1); dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); cuda_chain_hmm_backward(dimGrid, dimBlock, forward_transitions, transitions, - num_sequences, special_hmm_state, - probs.Data(), probs.Stride(), this_alpha_dash, - next_beta, this_beta_dash, log_prob_deriv.Data(), - log_prob_deriv.Stride()); + num_sequences, probs.Data(), probs.Stride(), + this_alpha_dash, next_beta, this_beta_dash, + log_prob_deriv.Data(), log_prob_deriv.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -336,14 +329,9 @@ void 
DenominatorComputation::BetaDashGeneralFrame(int32 t) { for (int32 s = 0; s < num_sequences; s++) { BaseFloat this_alpha_dash_prob = this_alpha_dash[h * num_sequences + s], inv_arbitrary_scale = - this_alpha_dash[special_hmm_state * num_sequences + s]; + this_alpha_dash[num_hmm_states * num_sequences + s]; double tot_variable_factor = 0.0; - // search for 'occupation_arbitrary_factor' in chain-kernels.cu for - // an explanation. - const BaseFloat occupation_arbitrary_factor = - (1.0 / (1 << kOccupationRescalingPowerOfTwo)); - BaseFloat occupation_factor = - (occupation_arbitrary_factor * this_alpha_dash_prob) / + BaseFloat occupation_factor = this_alpha_dash_prob / inv_arbitrary_scale; const DenominatorGraphTransition *trans_iter = transitions + forward_transitions[h].first, @@ -376,12 +364,9 @@ void DenominatorComputation::BetaGeneralFrameDebug(int32 t) { CuSubMatrix this_log_prob_deriv( nnet_output_deriv_transposed_, 0, num_pdfs, t_wrapped * num_sequences_, num_sequences_); - const BaseFloat occupation_inv_arbitrary_factor = - 1 << kOccupationRescalingPowerOfTwo; BaseFloat alpha_beta_product = VecVec(this_alpha_dash, this_beta_dash), - this_log_prob_deriv_sum = this_log_prob_deriv.Sum() * - occupation_inv_arbitrary_factor; + this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); if (!ApproxEqual(alpha_beta_product, num_sequences_)) { KALDI_WARN << "On time " << t << ", alpha-beta product " << alpha_beta_product << " != " << num_sequences_ diff --git a/src/chain/chain-denominator.h b/src/chain/chain-denominator.h index de3e64cc693..b0f616673d6 100644 --- a/src/chain/chain-denominator.h +++ b/src/chain/chain-denominator.h @@ -41,6 +41,153 @@ namespace kaldi { namespace chain { +/* + This extended comment describes how we implement forward-backward without log + and without overflow, and also the leaky-HMM idea. + + We'll start by establishing the notation for conventional forward-backward, + then add the 'arbitrary-scale' concept that prevents overflow, and then + add the 'leaky-hmm' concept. + + All this is done in parallel over multiple sequences, but the computations + are independent over the separate sequences, so we won't introduce any notation + or index for the sequence; we'll just explain it for one sequences. + + Suppose we have I hmm-states, numbered i = 0 ... I-1 (we'll use i and j for + hmm-state indexes). Let foll(i) give a list of arcs leaving state i, and + pred(i) give a list of arcs entering state i, and we'll use notation like: + for (j, p, n) in foll(i): + for iterating over those arcs, where in this case j is the destination-state, + p is the transition-probability of the arc and n is the pdf-id index. + We can then look up the emission probability as x(t, n) for some frame + 0 <= t < T. + + ** Version 1 of the computation (naive version) ** + + * Forward computation (version 1) + + In the forward computation we're computing alpha(i, t) for 0 <= t <= T): + - For the first frame, set alpha(0, i) = init(i), where init(i) is the + initial-probabilitiy from state i. # in our framework these are obtained + # by running the HMM for a while and getting an averaged occupation + # probability, and using this as an initial-prob, since the boundaries of + # chunks don't really correspond to utterance boundaries in general.] + - For t = 1 ... T: + for i = 0 ... I-1: + alpha(t, i) = 0 + for (j, p, n) in pred(i): # note: j is preceding-state. + alpha(t, i) += x(t-1, n) * alpha(t-1, j) * p. + + - total-prob = \sum_i alpha(T, i). 
# note, we take the final-probs of all states + # to be 1.0. + + * Backward computation (version 1) + + And now for the backward computation. Contrary to tradition, we include the + inverse of the total-prob as a factor in the betas. This is both more + convenient (it simplifies the way we obtain posteriors), and makes the + algorithm more generalizable as all the beta quantities can be interpreted as + the partial derivative of the logprob with respect to their corresponding + alpha. + + In forward backward notation, gamma is normally used for state-level + occupation probabilities, but what we care about here is pdf-id-level + occupation probabilities (i.e. the partial derivative of the log-likelihood + w.r.t. the logs of the x(t, n) quantities), so we use gamma for that. + + - for the final frame: + for each i, beta(T, i) = 1 / total-prob. + - for t = T-1 ... 0: + for i = 0 ... I-1: + beta(t, i) = 0 + for (j, p, n) in foll(i): # note: j is following-state. + beta(t, i) += x(t, n) * beta(t+1, j) * p. + gamma(t, n) += alpha(t, i) * x(t, n) * beta(t+1, j) * p. + + ** Version 2 of the computation (renormalized version) ** + + Version 1 of the algorithm is susceptible to numeric underflow and overflow, + due to the limited range of IEEE floating-point exponents. + Define tot-alpha(t) = \sum_i alpha(t, i). Then the renormalized version of + the computation is as above, except whenever the quantity x(t, n) appears, + we replace it with x(t, n) / alpha(t). In the algorithm we refer to + 1.0 / tot-alpha(t) as 'arbitrary_scale', because mathematically we can use any + value here as long as we are consistent and the value only varies with t + and not with n; we'll always get the same posteriors (gamma). + + When the algorithm outputs log(total-prob) as the total log-probability + of the HMM, we have to instead return the expression: + log(total-prob) + \sum_{t=0}^{T-1} tot-alpha(t). + to correct for the scaling of the x values. + + The algorithm is still vulnerable to overflow in the beta computation because + it's possible that the dominant path could have a very tiny alpha. However, + once we introduce the leaky-HMM idea (below), this problem will disappear. + + ** Version 3 of the computation (leaky-HMM version) ** + + The leaky-HMM idea is intended to improve generalization by allowing paths + other than those explicitly allowed by the FST we compiled. Another way to + look at it is as a way of hedging our bets about where we split the utterance, + so it's as we're marginalizing over different splits of the utterance. You + could also think of it as a modification of the FST so that there is an + epsilon transition from each state to a newly added state, with probability + one, and then an epsilon transition from the newly added state to each state + with probability leaky-hmm-prob * init(i) [except we need a mechanism so that + no more than two epsilon transitions can be taken per frame- this would involve + creating two copies of the states] + + Recall that we mentioned that init(i) is the initial-probability of + HMM-state i, but these are obtained in such a way that they can be treated + as priors, or average occupation-probabilities. + + Anyway, the way we formulate leaky-hmm is as follows: + + * Forward computation (version 3) + + Let leaky-hmm-prob be a constant defined by the user, with 0.1 being a typical + value. It defines how much probability we give to the 'leaky' transitions. + + - For frame 0, set alpha(0, i) = init(i). 
+ - For 0 <= t <= T, define tot-alpha(t) = \sum_i alpha(t, i). + - For 0 <= t <= T, define alpha'(t, i) = alpha(t, i) + tot-alpha(t) * leaky-hmm-prob * init(i). + + - For 1 <= t <= T, the computation of alpha(t, i) is as before except we use + the previous frame's alpha' instead of alpha. That is: + alpha(t, i) = 0 + for (j, p, n) in pred(i): # note: j is preceding-state. + alpha(t, i) += alpha'(t-1, j) * p * x(t-1, n) / tot-alpha(t-1) + + - total-prob = \sum_i alpha'(T, i) + + The corrected log-prob that we return from the algorithm will be + (total-prob + \sum_{t=0}^{T-1} tot-alpha(t)). + + * Backward computation (version 3) + + The backward computation is as follows. It is fairly straightforward to + derive if you think of it as an instance of backprop where beta, tot-beta and + beta' are the partial derivatives of the output log-prob w.r.t. the + corresponding alpha, tot-alpha and alpha' quantities. Note, tot-beta is not + really the sum of the betas as its name might suggest, it's just the + derivative w.r.t. tot-alpha. + + - beta'(T, i) = 1 / total-prob. + - for 0 <= t <= T, define tot-beta(t) = leaky-hmm-prob * \sum_i init(i) * beta'(t, i) + - for 0 <= t <= T, define beta(t, i) = beta'(t, i) + tot-beta(t). + - for 0 <= t < T, we compute beta'(t, i) and update gamma(t, n) as follows: + for 0 <= i < I: + beta'(t, i) = 0 + for (j, p, n) in foll(i): # note: j is following-state. + beta'(t, i) += beta(t+1, j) * p * x(t, n) / tot-alpha(t) + gamma(t, n) += alpha'(t, i) * beta(t+1, j) * p * x(t, n) / tot-alpha(t) + + Note: in the code, the tot-alpha and tot-beta quantities go in the same + memory location that the corresponding alpha and beta for state I would go. + + */ + + // This does forward-backward in parallel on a number of sequences, using a // single HMM. class DenominatorComputation { @@ -128,7 +275,8 @@ class DenominatorComputation { // the (temporarily) alpha and (more permanently) alpha-dash probabilities; // dimension is (frames_per_sequence + 1) by (num-hmm-states * num-sequences + - // num_sequences). Note, they are not logs. The last 'num_sequences' columns + // num_sequences). Note, they are not logs. The last 'num_sequences' + // columns, where the alpha for the state indexed 'num_hmm_states' would live, // are for the alpha-sums, which relates to leaky HMM. CuMatrix alpha_; @@ -150,10 +298,10 @@ class DenominatorComputation { CuVector tot_log_prob_; // the log of the total correction term for each sequence, which is the - // product of the alpha_[special hmm state] over all the frames. The - // 'correction terms' are terms that we divide the alphas and betas by in - // order to keep them in a good dynamic range. The product of them - // must be included in the total likelihood. + // product of the alpha-sums [used in the leaky-hmm computation] over all the + // frames. The 'correction terms' are terms that we divide the alphas and + // betas by in order to keep them in a good dynamic range. The product of + // them must be included in the total likelihood. 
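To make the scaled forward recursion and the leaky-HMM modification concrete, here is a minimal toy sketch of the "version 3" forward pass described in the comment above. It is an illustration only, not the Kaldi classes: the state count, transition matrix, pseudo-likelihoods and leaky-hmm-prob are made-up values, and each arc's pdf-id is simplified to be its source state. The correction term is accumulated as the sum over frames of log(tot-alpha(t)), i.e. the log of the product of alpha-sums that log_correction_term_ below stores.

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int I = 2, T = 3;                          // toy sizes
  const double leaky_hmm_prob = 0.1;               // 0.1 is a typical value
  double init[2] = {0.6, 0.4};                     // average occupation probs
  double trans[2][2] = {{0.7, 0.3}, {0.2, 0.8}};   // trans[j][i]: prob of arc j -> i
  double x[3][2] = {{0.9, 0.1}, {0.5, 0.5}, {0.2, 0.8}};  // pseudo-likelihoods x(t, n)

  std::vector<double> alpha(init, init + I), alpha_dash(I), next(I);
  double log_correction = 0.0, tot_alpha = 0.0;
  for (int t = 1; t <= T; t++) {
    tot_alpha = alpha[0] + alpha[1];               // alpha-sum for frame t-1
    for (int i = 0; i < I; i++)                    // alpha'(t-1, i), the leaky version
      alpha_dash[i] = alpha[i] + tot_alpha * leaky_hmm_prob * init[i];
    log_correction += std::log(tot_alpha);         // compensates for arbitrary_scale
    for (int i = 0; i < I; i++) {
      next[i] = 0.0;                               // arbitrary_scale = 1 / tot_alpha
      for (int j = 0; j < I; j++)
        next[i] += alpha_dash[j] * trans[j][i] * x[t - 1][j] / tot_alpha;
    }
    alpha = next;
  }
  tot_alpha = alpha[0] + alpha[1];                 // form alpha'(T, i) for the last frame
  for (int i = 0; i < I; i++)
    alpha_dash[i] = alpha[i] + tot_alpha * leaky_hmm_prob * init[i];
  double total_prob = alpha_dash[0] + alpha_dash[1];  // all final-probs taken as 1.0
  std::printf("corrected log-prob = %g\n", std::log(total_prob) + log_correction);
  return 0;
}

The backward pass in the code below follows the same pattern in reverse, dividing by the stored alpha-sums rather than by the alpha of the 'special' HMM state that this patch removes.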
CuVector log_correction_term_; bool ok_; diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index af7a1a6b176..8ec1dcf322c 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -29,7 +29,6 @@ extern "C" { const Int32Pair *forward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *this_alpha, @@ -42,7 +41,6 @@ extern "C" { const Int32Pair *backward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *prev_alpha, diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index 05127ed4c51..ea10b6680f0 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -40,15 +40,9 @@ __device__ inline void atomic_add_thresholded(Real* address, Real value) { // threshold itself with probability (value / threshold). This preserves // expectations. Note: we assume that value >= 0. - // kThresholdingPowerOfTwo is defined in chain-datastruct.h; think of this as - // defining the real threshold. (larger power -> more exact, smaller power -> - // faster). The occupation factors that we add ('value' in this code) will - // previously have been scaled by by 2^{-kOccupationRescalingPowerOfTwo}, so - // we need to adjust the threshold to compensate for this. - // In the next line we compute 'threshold' in what an odd way to avoid - // overflow; it should be computed as a constant in the compiler. - const Real threshold = (1.0 / (1 << kThresholdingPowerOfTwo)) / - (1 << kOccupationRescalingPowerOfTwo); + // kThresholdingPowerOfTwo is defined in chain-datastruct.h; it defines + // the threshold for randomized posterior pruning. + const Real threshold = 1.0 / (1 << kThresholdingPowerOfTwo); if (value >= threshold) { atomic_add(address, value); } else { @@ -87,7 +81,6 @@ __global__ static void _cuda_chain_hmm_forward(const Int32Pair *backward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *prev_alpha, @@ -142,15 +135,18 @@ static void _cuda_chain_hmm_forward(const Int32Pair *backward_transitions, this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0; } - // Let arbitrary_scale be the inverse of the alpha value for the - // hmm-state indexed special_hmm_state_ on the previous frame (for this - // sequence); we multiply this into all the transition-probabilities - // from the previous frame to this frame, in both the forward and - // backward passes, in order to keep the alphas in a good numeric range. - // This won't affect the posteriors, but when computing the total - // likelihood we'll need to compensate for it later on. + int32_cuda num_hmm_states = gridDim.y; + // Let arbitrary_scale be the inverse of the sum of all alpha values on-- the + // previous frame this sum of all the alpha values is stored in the place that + // we'd store the previous alpha for state-index equal to num_hmm_states + // (i.e. one past the end). We multiply this into all the + // transition-probabilities from the previous frame to this frame, in both the + // forward and backward passes, in order to keep the alphas in a good numeric + // range. 
This won't affect the posteriors, as it's just a constant factor + // for each frame, but when computing the total likelihood we'll need to + // compensate for it later on. BaseFloat arbitrary_scale = - 1.0 / prev_alpha[special_hmm_state * num_sequences + s]; + 1.0 / prev_alpha[num_hmm_states * num_sequences + s]; this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; } @@ -159,7 +155,6 @@ __global__ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *this_alpha, const BaseFloat *next_beta, BaseFloat *this_beta, BaseFloat *log_prob_deriv, @@ -184,32 +179,15 @@ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, if (s >= num_sequences) return; + // below, you can read 'gridDim.y' as 'num_hmm_states'. See where + // arbitrary_scale is defined in the forward computation above, for more + // explanation. BaseFloat this_alpha_prob = this_alpha[h * num_sequences + s], inv_arbitrary_scale = - this_alpha[special_hmm_state * num_sequences + s]; + this_alpha[gridDim.y * num_sequences + s]; double tot_variable_factor = 0.0; - // this should be compiled as a constant. This factor 'occupation_factor' - // here is arbitrarily chosen and will be canceled out by its inverse factor - // in chain-denomnator.cc. It is to avoid infinities appearing in the - // derivatives when the 'special' HMM state gets very unlikely and - // 'this_alpha_prob' gets close to the maximum representable floating point - // value. A check in chain-training.cc that tot_objf is finite would detect - // the case where the alphas are actually infinite and discard the - // derivatives, so we can assume that all the alphas are finite. However, if - // one of the alphas is close to the maximum representable floating point - // value and if inv_arbitrary_scale is less than one, we could (if not for - // this factor of 10^-6) easily get overflow in the next line and produce an - // inf, which would not be detected as the alphas remain finite; this would - // produce an inf in the nnet-output derivatives and propagate back to the - // training. Because 'inv_arbitrary_scale' is in the same range as the exp of - // the nnet outputs, and for a non-diverging chain model these will always be - // fairly close to 1, this small factor (around 10^-6 currently) should be - // sufficient to prevent an inf appearing here. - const BaseFloat occupation_arbitrary_factor = - (1.0 / (1 << kOccupationRescalingPowerOfTwo)); - BaseFloat occupation_factor = (occupation_arbitrary_factor * this_alpha_prob) / - inv_arbitrary_scale; + BaseFloat occupation_factor = this_alpha_prob / inv_arbitrary_scale; const DenominatorGraphTransition *trans_iter = transitions + forward_transitions[h].first, *trans_end = transitions + forward_transitions[h].second; @@ -250,12 +228,7 @@ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, occupation_prob0); } BaseFloat beta = tot_variable_factor / inv_arbitrary_scale; - // If an overflow was generated while computing the beta (which should be - // extremely rare), substitute zero. This will likely lead to denominator - // occupancies which are less than one for this sequence, as the resulting - // betas will be less than they should be. but it's better than generating an - // inf and ruining the whole backprop. - this_beta[h * num_sequences + s] = (beta - beta == 0 ? 
beta : 0.0); + this_beta[h * num_sequences + s] = beta; } @@ -263,28 +236,26 @@ void cuda_chain_hmm_forward(dim3 Gr, dim3 Bl, const Int32Pair *backward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *prev_alpha, BaseFloat *this_alpha) { _cuda_chain_hmm_forward<<>>(backward_transitions, transitions, - num_sequences, special_hmm_state, - probs, prob_stride, prev_alpha, this_alpha); + num_sequences, probs, prob_stride, + prev_alpha, this_alpha); } void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl, const Int32Pair *forward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *this_alpha, const BaseFloat *next_beta, BaseFloat *this_beta, BaseFloat *log_prob_deriv, int32_cuda log_prob_deriv_stride) { _cuda_chain_hmm_backward<<>>(forward_transitions, transitions, - num_sequences, special_hmm_state, - probs, prob_stride, this_alpha, next_beta, + num_sequences, probs, prob_stride, + this_alpha, next_beta, this_beta, log_prob_deriv, log_prob_deriv_stride); } diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 1bf0201fbfa..9c8f3424390 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -29,6 +29,7 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const Supervision &supervision, const CuMatrixBase &nnet_output, + const CuMatrixBase *xent_output, BaseFloat *objf, BaseFloat *l2_term, BaseFloat *weight, @@ -103,13 +104,65 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, *l2_term = 0.0; } else { // compute the l2 penalty term and its derivative - BaseFloat scale = supervision.weight * opts.l2_regularize; - *l2_term = -0.5 * scale * TraceMatMat(nnet_output, nnet_output, kTrans); - if (nnet_output_deriv) - nnet_output_deriv->AddMat(-1.0 * scale, nnet_output); + BaseFloat scale_coeff = supervision.weight * opts.l2_regularize; + // If xent_output provided, l2 penalty is trying to regress the chain output + // to be a linear function of cross-entropy output. + // It minimizes -0.5 * l2_regularize * l2_norm(diag(scale) * x + offset - y)^2, + // where x is cross-entropy output and y is chain output. + if (xent_output) { + //compute offset and scale + // The objecitve is to minimize L w.r.t scale_i, offset_i, + // L = -0.5 * l2_regularize * + // \sum_{j=1}^{m_size}(\sum_i (nnet_output_ji - target_ji)^2), + // where the target_ji = scale_i * xent_output_ji + offset_i. + // + // scale_i = [\sum_j (nnet_output_ji * xent_output_ji) - + // 1/m_size * \sum_j(nnet_output_ji) * \sum_j(xent_output_ji)] / + // [\sum_j(xent_output_ji^2) - 1/m_size * (\sum_j(xent_output_ji))^2] + // offset_i = 1 ./ m_size * \sum_j (nnet_output_ji - scale_i * xent_output_ji) + // where m_size is minibatch_size. 
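The closed-form fit in the comment above is ordinary per-dimension least squares of the chain output on the cross-entropy output. For clarity, here is the same fit written with plain loops for a single column; it is a hypothetical helper for illustration, while the code below does all columns at once with CuVector operations.

#include <cstdio>
#include <vector>

// y = one column of the chain output, x = the matching column of the
// cross-entropy output, over the m_size rows of the minibatch.
void FitScaleOffset(const std::vector<double> &x, const std::vector<double> &y,
                    double *scale, double *offset) {
  int m = x.size();
  double sx = 0.0, sy = 0.0, sxy = 0.0, sxx = 0.0;
  for (int j = 0; j < m; j++) {
    sx += x[j]; sy += y[j]; sxy += x[j] * y[j]; sxx += x[j] * x[j];
  }
  *scale = (sxy - sx * sy / m) / (sxx - sx * sx / m);  // regression slope
  *offset = (sy - *scale * sx) / m;                    // regression intercept
}

int main() {
  std::vector<double> x, y;
  x.push_back(0.0); x.push_back(1.0); x.push_back(2.0);
  y.push_back(1.0); y.push_back(3.0); y.push_back(5.0);
  double scale, offset;
  FitScaleOffset(x, y, &scale, &offset);
  std::printf("scale = %g offset = %g\n", scale, offset);  // prints 2 and 1
  return 0;
}

Treating scale and offset as constants during backprop, the derivative of -0.5 * scale_coeff * ||target - y||^2 with respect to the chain output y is scale_coeff * (target - y); this is why the code below adds scale_coeff * output_diff to nnet_output_deriv and -scale_coeff * diag(scale) * output_diff to xent_output_deriv.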
+ CuVector scale(xent_output->NumCols()), + offset(xent_output->NumCols()), + nnet_col_sum(nnet_output.NumCols()), + xent_col_sum(xent_output->NumCols()), + scale_denom(nnet_output.NumCols()); + + nnet_col_sum.AddRowSumMat(1.0, nnet_output, 0.0); + xent_col_sum.AddRowSumMat(1.0, *xent_output, 0.0); + scale.AddDiagMatMat(1.0, *xent_output, kTrans, nnet_output, kNoTrans, 0.0); + scale.AddVecVec(-1.0 / nnet_output.NumRows(), nnet_col_sum, xent_col_sum, 1.0); + scale_denom.AddDiagMat2(1.0, *xent_output, kTrans, 0.0); + scale_denom.AddVecVec(-1.0 / nnet_output.NumRows(), xent_col_sum, xent_col_sum, 1.0); + scale.DivElements(scale_denom); + + offset.AddVec(1.0 / xent_output->NumRows(), nnet_col_sum); + offset.AddVecVec(-1.0 / xent_output->NumRows(), scale, xent_col_sum, 1.0); + + if (rand() % 10 == 1) + KALDI_LOG << "l1_norm(scale) = " << scale.Norm(1.0) + << " l1_norm(offset) = " << offset.Norm(1.0); + + //output_diff = (xent_output * diag(scale) + offset) - nnet_output; + CuMatrix output_diff(xent_output->NumRows(), xent_output->NumCols()); + output_diff.AddMatDiagVec(1.0, *xent_output, kNoTrans, scale, 0.0); + output_diff.AddVecToRows(1.0, offset); + output_diff.AddMat(-1.0, nnet_output); + *l2_term = -0.5 * scale_coeff * TraceMatMat(output_diff, output_diff, kTrans); + + //update the nnet_output and xent_output derivative w.r.t. regularizer term. + if (nnet_output_deriv) + nnet_output_deriv->AddMat(scale_coeff, output_diff); + + if (xent_output_deriv) + xent_output_deriv->AddMatDiagVec(-1.0 * scale_coeff, output_diff, kNoTrans, scale, 1.0); + + } else { + *l2_term = -0.5 * scale_coeff * TraceMatMat(nnet_output, nnet_output, kTrans); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(-1.0 * scale_coeff, nnet_output); + } } } - } // namespace chain } // namespace kaldi diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index e6143d10846..1e2cfe8cf88 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -116,6 +116,7 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const Supervision &supervision, const CuMatrixBase &nnet_output, + const CuMatrixBase *xent_output, BaseFloat *objf, BaseFloat *l2_term, BaseFloat *weight, diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 804bea1a217..b1cab67362f 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -103,6 +103,7 @@ void cudaF_set_bias_params(int Gr, int Bl, float* v, const float* a, float param void cudaF_copy_from_vec_df(int Gr, int Bl, double* v_out, const float* v_in, int dim); void cudaF_copy_from_vec_fd(int Gr, int Bl, float* v_out, const float* v_in, int dim); void cudaF_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim); +void cudaF_vec_div_elements(int Gr, int Bl, float* v, const float* a, int dim); void cudaF_vec_soft_max(int Gr, int Bl, float* v, int dim); void cudaF_vec_min(const float* v, float* value, int dim); void cudaF_vec_max(const float* v, float* value, int dim); @@ -243,6 +244,7 @@ void cudaD_set_bias_params(int Gr, int Bl, double* v, const double* a, double pa void cudaD_copy_from_vec_df(int Gr, int Bl, double* v_out, const double* v_in, int dim); void cudaD_copy_from_vec_fd(int Gr, int Bl, float* v_out, const double* v_in, int dim); void cudaD_vec_mul_elements(int Gr, int Bl, double* v, const double* a, int dim); +void cudaD_vec_div_elements(int Gr, int Bl, double* v, const double* a, int dim); void cudaD_vec_soft_max(int Gr, int Bl, double* v, 
int dim); void cudaD_vec_min(const double* v, double* value, int dim); void cudaD_vec_max(const double* v, double* value, int dim); diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 4e1b69f0cce..b8958616b2b 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -444,6 +444,14 @@ static void _vec_mul_elements(Real* v, const Real* a, int dim) { v[i] = v[i] * a[i]; } +template +__global__ +static void _vec_div_elements(Real* v, const Real* a, int dim) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < dim) + v[i] = v[i] / a[i]; +} + template __global__ @@ -2337,6 +2345,10 @@ void cudaF_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim) { _vec_mul_elements<<>>(v, a, dim); } +void cudaF_vec_div_elements(int Gr, int Bl, float* v, const float* a, int dim) { + _vec_div_elements<<>>(v, a, dim); +} + void cudaF_vec_min(const float* v, float* value, int dim) { _vec_min<<<1,CU1DBLOCK>>>(v, value, dim); } @@ -2797,6 +2809,10 @@ void cudaD_vec_mul_elements(int Gr, int Bl, double* v, const double* a, int dim) _vec_mul_elements<<>>(v, a, dim); } +void cudaD_vec_div_elements(int Gr, int Bl, double* v, const double* a, int dim) { + _vec_div_elements<<>>(v, a, dim); +} + void cudaD_vec_min(const double* v, double* value, int dim) { _vec_min<<<1,CU1DBLOCK>>>(v, value, dim); } diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index fc1fbae54da..dec0797f015 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -189,6 +189,7 @@ inline void cuda_set_bias_params(int Gr, int Bl, float* v, const float* a, float inline void cuda_copy_from_vec_df(int Gr, int Bl, double* v_out, const float* v_in, int dim) { cudaF_copy_from_vec_df(Gr,Bl,v_out,v_in,dim); } inline void cuda_copy_from_vec_fd(int Gr, int Bl, float* v_out, const float* v_in, int dim) { cudaF_copy_from_vec_fd(Gr,Bl,v_out,v_in,dim); } inline void cuda_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim) { cudaF_vec_mul_elements(Gr,Bl,v,a,dim); } +inline void cuda_vec_div_elements(int Gr, int Bl, float* v, const float* a, int dim) { cudaF_vec_div_elements(Gr,Bl,v,a,dim); } inline void cuda_vec_soft_max(int Gr, int Bl, float* v, int dim) { cudaF_vec_soft_max(Gr,Bl,v,dim); } inline void cuda_vec_min(const float* v, float* value, int dim) { cudaF_vec_min(v,value,dim); } inline void cuda_vec_max(const float* v, float* value, int dim) { cudaF_vec_max(v,value,dim); } @@ -373,6 +374,7 @@ inline void cuda_set_bias_params(int Gr, int Bl, double* v, const double* a, dou inline void cuda_copy_from_vec_df(int Gr, int Bl, double* v_out, const double* v_in, int dim) { cudaD_copy_from_vec_df(Gr,Bl,v_out,v_in,dim); } inline void cuda_copy_from_vec_fd(int Gr, int Bl, float* v_out, const double* v_in, int dim) { cudaD_copy_from_vec_fd(Gr,Bl,v_out,v_in,dim); } inline void cuda_vec_mul_elements(int Gr, int Bl, double* v, const double* a, int dim) { cudaD_vec_mul_elements(Gr,Bl,v,a,dim); } +inline void cuda_vec_div_elements(int Gr, int Bl, double* v, const double* a, int dim) { cudaD_vec_div_elements(Gr,Bl,v,a,dim); } inline void cuda_vec_soft_max(int Gr, int Bl, double* v, int dim) { cudaD_vec_soft_max(Gr,Bl,v,dim); } inline void cuda_vec_min(const double* v, double* value, int dim) { cudaD_vec_min(v,value,dim); } inline void cuda_vec_max(const double* v, double* value, int dim) { cudaD_vec_max(v,value,dim); } diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index eb5a268d543..2ea5457fefd 100644 --- 
a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -1166,7 +1166,7 @@ template void CuMatrixBase::AddMatDiagVec( const Real alpha, const CuMatrixBase &M, MatrixTransposeType transM, - CuVectorBase &v, + const CuVectorBase &v, Real beta) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index fd4c642ab7f..13f41e26dec 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -420,7 +420,7 @@ class CuMatrixBase { // The same as adding M but scaling each column M_j by v(j). void AddMatDiagVec(const Real alpha, const CuMatrixBase &M, MatrixTransposeType transM, - CuVectorBase &v, + const CuVectorBase &v, Real beta = 1.0); /// *this = beta * *this + alpha * A .* B (.* element by element multiplication) diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 6deb3809d85..98d22892515 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -708,6 +708,24 @@ void CuVectorBase::MulElements(const CuVectorBase &v) { Vec().MulElements(v.Vec()); } } +template +void CuVectorBase::DivElements(const CuVectorBase &v) { + KALDI_ASSERT(dim_ == v.dim_); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + if (dim_ == 0) return; + Timer tim; + int dimBlock(CU1DBLOCK); + int dimGrid(n_blocks(dim_, CU1DBLOCK)); + cuda_vec_div_elements(dimGrid, dimBlock, data_, v.Data(), dim_); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile("CuVectorBase::DivElements", tim.Elapsed()); + } else +#endif + { + Vec().DivElements(v.Vec()); + } +} template<> template<> diff --git a/src/cudamatrix/cu-vector.h b/src/cudamatrix/cu-vector.h index 54c1ac0ad4f..f8a213e148e 100644 --- a/src/cudamatrix/cu-vector.h +++ b/src/cudamatrix/cu-vector.h @@ -193,6 +193,8 @@ class CuVectorBase { void ReplaceValue(Real orig, Real changed); void MulElements(const CuVectorBase &v); + + void DivElements(const CuVectorBase &v); protected: // The following two functions should only be called if we did not compile diff --git a/src/doc/hmm.dox b/src/doc/hmm.dox index 9935fa52711..938321fd7b2 100644 --- a/src/doc/hmm.dox +++ b/src/doc/hmm.dox @@ -447,9 +447,10 @@ We now explain what these three scales do: when we add the self-loop, let the probability mass given to the self-loop be p and the mass given to the rest be (1-p). We add a self-loop with log-probability self_loop_scale * log(p), and add (self_loop_scale * log(1-p)) to all the other - log transition probabilities - out of that state. In typical topologies, the self-loop scale is the only scale - that matters. + log transition probabilities out of that state. (Note: in the initial stage of + graph creation we create a graph without self-loops, and with the non-self-loop + transition probabilities renormalized to sum to one). In typical topologies, the + self-loop scale is the only scale that matters. 
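As a worked example of the self-loop arithmetic just described, take hypothetical numbers: self_loop_scale = 0.1, a state whose self-loop had probability p = 0.75, and two non-self-loop transitions renormalized to 0.6 and 0.4 in the graph built without self-loops. This is purely illustrative, not code from Kaldi:

#include <cmath>
#include <cstdio>

int main() {
  const double self_loop_scale = 0.1, p = 0.75;
  double other_logprob[2] = {std::log(0.6), std::log(0.4)};  // renormalized, no self-loop
  double self_loop_logprob = self_loop_scale * std::log(p);
  for (int i = 0; i < 2; i++)        // add self_loop_scale * log(1 - p) to the rest
    other_logprob[i] += self_loop_scale * std::log(1.0 - p);
  std::printf("self-loop %.4f, others %.4f %.4f\n",
              self_loop_logprob, other_logprob[0], other_logprob[1]);
  return 0;
}

With these numbers the self-loop gets log-probability roughly -0.029, and each of the other transitions has roughly -0.139 added to its log-probability.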
The reason we feel it might make sense to apply a different probability scale to the self-loops versus the normal transition scale is we think they could be diff --git a/src/lm/Makefile b/src/lm/Makefile index ddda9576557..acf327d994f 100644 --- a/src/lm/Makefile +++ b/src/lm/Makefile @@ -10,10 +10,10 @@ MATHLIB = NONE include ../kaldi.mk -TESTFILES = lm-lib-test +TESTFILES = arpa-file-parser-test lm-lib-test -OBJFILES = const-arpa-lm.o kaldi-lmtable.o kaldi-lm.o kaldi-rnnlm.o \ - mikolov-rnnlm-lib.o +OBJFILES = arpa-file-parser.o const-arpa-lm.o kaldi-lmtable.o kaldi-lm.o \ + kaldi-rnnlm.o mikolov-rnnlm-lib.o TESTOUTPUTS = composed.fst output.fst output1.fst output2.fst diff --git a/src/lm/arpa-file-parser-test.cc b/src/lm/arpa-file-parser-test.cc new file mode 100644 index 00000000000..e37a916d263 --- /dev/null +++ b/src/lm/arpa-file-parser-test.cc @@ -0,0 +1,365 @@ +// lm/arpa-file-parser-test.cc + +// Copyright 2016 Smart Action Company LLC (kkm) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +/** + * @file lm-lib-test.cc + * @brief Unit tests for language model code. + */ + +#include +#include +#include +#include +#include +#include "lm/kaldi-lm.h" + +#include "lm/arpa-file-parser.h" + +namespace kaldi { +namespace { + +const int kMaxOrder = 3; + +struct NGramTestData { + int32 line_number; + float logprob; + int32 words[kMaxOrder]; + float backoff; +}; + +std::ostream& operator<<(std::ostream& os, const NGramTestData& data) { + std::ios::fmtflags saved_state(os.flags()); + os << std::fixed << std::setprecision(6); + + os << data.logprob << ' '; + for (int i = 0; i < kMaxOrder; ++i) os << data.words[i] << ' '; + os << data.backoff << " // Line " << data.line_number; + + os.flags(saved_state); + return os; +} + +// This does not own the array pointer, and uset to simplify passing expected +// result to TestableArpaFileParser::Verify. +template +struct CountedArray { + template + CountedArray(T(&array)[N]) : array(array), count(N) { } + const T* array; + const size_t count; +}; + +template +inline CountedArray MakeCountedArray(T(&array)[N]) { + return CountedArray(array); +} + +class TestableArpaFileParser : public ArpaFileParser { + public: + TestableArpaFileParser(ArpaParseOptions options, fst::SymbolTable* symbols) + : ArpaFileParser(options, symbols), + header_available_(false), + read_complete_(false), + last_order_(0) { } + void Validate(CountedArray counts, CountedArray ngrams); + + private: + // ArpaFileParser overrides. 
+ virtual void HeaderAvailable(); + virtual void ConsumeNGram(const NGram& ngram); + virtual void ReadComplete(); + + bool header_available_; + bool read_complete_; + int32 last_order_; + std::vector ngrams_; +}; + +void TestableArpaFileParser::HeaderAvailable() { + KALDI_ASSERT(!header_available_); + KALDI_ASSERT(!read_complete_); + header_available_ = true; + KALDI_ASSERT(NgramCounts().size() <= kMaxOrder); +} + +void TestableArpaFileParser::ConsumeNGram(const NGram& ngram) { + KALDI_ASSERT(header_available_); + KALDI_ASSERT(!read_complete_); + KALDI_ASSERT(ngram.words.size() <= NgramCounts().size()); + KALDI_ASSERT(ngram.words.size() >= last_order_); + last_order_ = ngram.words.size(); + + NGramTestData entry = { 0 }; + entry.line_number = LineNumber(); + entry.logprob = ngram.logprob; + entry.backoff = ngram.backoff; + std::copy(ngram.words.begin(), ngram.words.end(), entry.words); + ngrams_.push_back(entry); +} + +void TestableArpaFileParser::ReadComplete() { + KALDI_ASSERT(header_available_); + KALDI_ASSERT(!read_complete_); + read_complete_ = true; +} + +// +bool CompareNgrams(const NGramTestData& actual, + const NGramTestData& expected) { + if (actual.line_number != expected.line_number + || !std::equal(actual.words, actual.words + kMaxOrder, + expected.words) + || !ApproxEqual(actual.logprob, expected.logprob) + || !ApproxEqual(actual.backoff, expected.backoff)) { + KALDI_WARN << "Actual n-gram [" << actual + << "] differs from expected [" << expected << "]"; + return false; + } + return true; +} + +void TestableArpaFileParser::Validate( + CountedArray expect_counts, + CountedArray expect_ngrams) { + // This needs better disagnostics probably. + KALDI_ASSERT(NgramCounts().size() == expect_counts.count); + KALDI_ASSERT(std::equal(NgramCounts().begin(), NgramCounts().end(), + expect_counts.array)); + + KALDI_ASSERT(ngrams_.size() == expect_ngrams.count); + // auto mpos = std::mismatch(ngrams_.begin(), ngrams_.end(), + // expect_ngrams.array, CompareNgrams); + // if (mpos.first != ngrams_.end()) + // KALDI_ERR << "Maismatch at index " << mpos.first - ngrams_.begin(); + //TODO:auto above requres C++11, and I cannot spell out the type!!! + KALDI_ASSERT(std::equal(ngrams_.begin(), ngrams_.end(), + expect_ngrams.array, CompareNgrams)); +} + +// Read integer LM (no symbols) with log base conversion. +void ReadIntegerLmLogconvExpectSuccess() { + KALDI_LOG << "ReadIntegerLmLogconvExpectSuccess()"; + + static std::string integer_lm = "\ +\\data\\\n\ +ngram 1=4\n\ +ngram 2=2\n\ +ngram 3=2\n\ +\n\ +\\1-grams:\n\ +-5.234679 4 -3.3\n\ +-3.456783 5\n\ +0.0000000 1 -2.5\n\ +-4.333333 2\n\ +\n\ +\\2-grams:\n\ +-1.45678 4 5 -3.23\n\ +-1.30490 1 4 -4.2\n\ +\n\ +\\3-grams:\n\ +-0.34958 1 4 5\n\ +-0.23940 4 5 2\n\ +\n\ +\\end\\"; + + int32 expect_counts[] = { 4, 2, 2 }; + NGramTestData expect_ngrams[] = { + { 7, -12.05329, { 4, 0, 0 }, -7.598531 }, + { 8, -7.959537, { 5, 0, 0 }, 0.0 }, + { 9, 0.0, { 1, 0, 0 }, -5.756463 }, + { 10, -9.977868, { 2, 0, 0 }, 0.0 }, + + { 13, -3.354360, { 4, 5, 0 }, -7.437350 }, + { 14, -3.004643, { 1, 4, 0 }, -9.670857 }, + + { 17, -0.804938, { 1, 4, 5 }, 0.0 }, + { 18, -0.551239, { 4, 5, 2 }, 0.0 } }; + + ArpaParseOptions options; + options.bos_symbol = 1; + options.eos_symbol = 2; + + TestableArpaFileParser parser(options, NULL); + std::istringstream stm(integer_lm, std::ios_base::in); + parser.Read(stm, false); + parser.Validate(MakeCountedArray(expect_counts), + MakeCountedArray(expect_ngrams)); +} + +// \xCE\xB2 = UTF-8 for Greek beta, to churn some UTF-8 cranks. 
+static std::string symbolic_lm = "\ +\\data\\\n\ +ngram 1=4\n\ +ngram 2=2\n\ +ngram 3=2\n\ +\n\ +\\1-grams:\n\ +-5.2 a -3.3\n\ +-3.4 \xCE\xB2\n\ +0.0 -2.5\n\ +-4.3 \n\ +\n\ +\\2-grams:\n\ +-1.5 a \xCE\xB2 -3.2\n\ +-1.3 a -4.2\n\ +\n\ +\\3-grams:\n\ +-0.3 a \xCE\xB2\n\ +-0.2 a \n\ +\n\ +\\end\\"; + +// Symbol table that is created with predefined test symbols, "a" but no "b". +class TestSymbolTable : public fst::SymbolTable { + public: + TestSymbolTable() { + AddSymbol("", 0); + AddSymbol("", 1); + AddSymbol("", 2); + AddSymbol("", 3); + AddSymbol("a", 4); + } +}; + +// Full expected result shared between ReadSymbolicLmNoOovImpl and +// ReadSymbolicLmWithOovAddToSymbols(). +NGramTestData expect_symbolic_full[] = { + { 7, -5.2, { 4, 0, 0 }, -3.3 }, + { 8, -3.4, { 5, 0, 0 }, 0.0 }, + { 9, 0.0, { 1, 0, 0 }, -2.5 }, + { 10, -4.3, { 2, 0, 0 }, 0.0 }, + + { 13, -1.5, { 4, 5, 0 }, -3.2 }, + { 14, -1.3, { 1, 4, 0 }, -4.2 }, + + { 17, -0.3, { 1, 4, 5 }, 0.0 }, + { 18, -0.2, { 1, 4, 2 }, 0.0 } }; + +// This is run with all possible oov setting and yields same result. +void ReadSymbolicLmNoOovImpl(ArpaParseOptions::OovHandling oov) { + int32 expect_counts[] = { 4, 2, 2 }; + TestSymbolTable symbols; + symbols.AddSymbol("\xCE\xB2", 5); + + ArpaParseOptions options; + options.bos_symbol = 1; + options.eos_symbol = 2; + options.unk_symbol = 3; + options.use_log10 = true; + options.oov_handling = oov; + TestableArpaFileParser parser(options, &symbols); + std::istringstream stm(symbolic_lm, std::ios_base::in); + parser.Read(stm, false); + parser.Validate(MakeCountedArray(expect_counts), + MakeCountedArray(expect_symbolic_full)); + KALDI_ASSERT(symbols.NumSymbols() == 6); +} + +void ReadSymbolicLmNoOovTests() { + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kRaiseError)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kRaiseError); + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kAddToSymbols)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kAddToSymbols); + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kReplaceWithUnk)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kReplaceWithUnk); + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kSkipNGram)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kSkipNGram); +} + +// This is run with all possible oov setting and yields same result. 
+void ReadSymbolicLmWithOovImpl( + ArpaParseOptions::OovHandling oov, + CountedArray expect_ngrams, + fst::SymbolTable* symbols) { + int32 expect_counts[] = { 4, 2, 2 }; + ArpaParseOptions options; + options.bos_symbol = 1; + options.eos_symbol = 2; + options.unk_symbol = 3; + options.use_log10 = true; + options.oov_handling = oov; + TestableArpaFileParser parser(options, symbols); + std::istringstream stm(symbolic_lm, std::ios_base::in); + parser.Read(stm, false); + parser.Validate(MakeCountedArray(expect_counts), expect_ngrams); +} + +void ReadSymbolicLmWithOovAddToSymbols() { + TestSymbolTable symbols; + ReadSymbolicLmWithOovImpl(ArpaParseOptions::kAddToSymbols, + MakeCountedArray(expect_symbolic_full), + &symbols); + KALDI_ASSERT(symbols.NumSymbols() == 6); + KALDI_ASSERT(symbols.Find("\xCE\xB2") == 5); +} + +void ReadSymbolicLmWithOovReplaceWithUnk() { + NGramTestData expect_symbolic_unk_b[] = { + { 7, -5.2, { 4, 0, 0 }, -3.3 }, + { 8, -3.4, { 3, 0, 0 }, 0.0 }, + { 9, 0.0, { 1, 0, 0 }, -2.5 }, + { 10, -4.3, { 2, 0, 0 }, 0.0 }, + + { 13, -1.5, { 4, 3, 0 }, -3.2 }, + { 14, -1.3, { 1, 4, 0 }, -4.2 }, + + { 17, -0.3, { 1, 4, 3 }, 0.0 }, + { 18, -0.2, { 1, 4, 2 }, 0.0 } }; + + TestSymbolTable symbols; + ReadSymbolicLmWithOovImpl(ArpaParseOptions::kReplaceWithUnk, + MakeCountedArray(expect_symbolic_unk_b), + &symbols); + KALDI_ASSERT(symbols.NumSymbols() == 5); +} + +void ReadSymbolicLmWithOovSkipNGram() { + NGramTestData expect_symbolic_no_b[] = { + { 7, -5.2, { 4, 0, 0 }, -3.3 }, + { 9, 0.0, { 1, 0, 0 }, -2.5 }, + { 10, -4.3, { 2, 0, 0 }, 0.0 }, + + { 14, -1.3, { 1, 4, 0 }, -4.2 }, + + { 18, -0.2, { 1, 4, 2 }, 0.0 } }; + + TestSymbolTable symbols; + ReadSymbolicLmWithOovImpl(ArpaParseOptions::kSkipNGram, + MakeCountedArray(expect_symbolic_no_b), + &symbols); + KALDI_ASSERT(symbols.NumSymbols() == 5); +} + +void ReadSymbolicLmWithOovTests() { + KALDI_LOG << "ReadSymbolicLmWithOovAddToSymbols()"; + ReadSymbolicLmWithOovAddToSymbols(); + KALDI_LOG << "ReadSymbolicLmWithOovReplaceWithUnk()"; + ReadSymbolicLmWithOovReplaceWithUnk(); + KALDI_LOG << "ReadSymbolicLmWithOovSkipNGram()"; + ReadSymbolicLmWithOovSkipNGram(); +} + +} // namespace +} // namespace kaldi + +int main(int argc, char *argv[]) { + kaldi::ReadIntegerLmLogconvExpectSuccess(); + kaldi::ReadSymbolicLmNoOovTests(); + kaldi::ReadSymbolicLmWithOovTests(); +} diff --git a/src/lm/arpa-file-parser.cc b/src/lm/arpa-file-parser.cc new file mode 100644 index 00000000000..2d8f9f18638 --- /dev/null +++ b/src/lm/arpa-file-parser.cc @@ -0,0 +1,236 @@ +// lm/arpa-file-parser.cc + +// Copyright 2014 Guoguo Chen +// Copyright 2016 Smart Action Company LLC (kkm) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
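Before the parser implementation, a brief note on how the new interface is meant to be used: client code derives from ArpaFileParser, overrides ConsumeNGram() (and optionally the header and completion hooks), and then calls Read(). The sketch below is a hypothetical minimal subclass for illustration only; the real usage examples in this patch are TestableArpaFileParser in the test above and ConstArpaLmBuilder further down.

#include <fstream>

#include "lm/arpa-file-parser.h"

// Hypothetical subclass that just counts the n-grams it is fed.
class NGramCounter : public kaldi::ArpaFileParser {
 public:
  explicit NGramCounter(kaldi::ArpaParseOptions options)
      : ArpaFileParser(options, NULL), num_ngrams_(0) { }
  kaldi::int32 NumNGrams() const { return num_ngrams_; }
 protected:
  virtual void ConsumeNGram(const kaldi::NGram&) { num_ngrams_++; }
 private:
  kaldi::int32 num_ngrams_;
};

int main() {
  kaldi::ArpaParseOptions options;
  options.bos_symbol = 1;            // integer-symbol LM, so no symbol table needed
  options.eos_symbol = 2;
  NGramCounter counter(options);
  std::ifstream is("lm.arpa");       // hypothetical input file
  counter.Read(is, false);
  return counter.NumNGrams() > 0 ? 0 : 1;
}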
+ +#include + +#include + +#include "base/kaldi-error.h" +#include "base/kaldi-math.h" +#include "lm/arpa-file-parser.h" +#include "util/text-utils.h" + +namespace kaldi { + +ArpaFileParser::ArpaFileParser(ArpaParseOptions options, fst::SymbolTable* symbols) + : options_(options), symbols_(symbols), line_number_(0) { +} + +ArpaFileParser::~ArpaFileParser() { +} + +void ArpaFileParser::Read(std::istream &is, bool binary) { + if (binary) { + KALDI_ERR << "binary-mode reading is not implemented for ArpaFileParser"; + } + + // Argument sanity checks. + if (options_.bos_symbol <= 0 || options_.eos_symbol <= 0 || + options_.bos_symbol == options_.eos_symbol) + KALDI_ERR << "BOS and EOS symbols are required, must not be epsilons, and " + << "differ from each other. Given:" + << " BOS=" << options_.bos_symbol + << " EOS=" << options_.eos_symbol; + if (symbols_ != NULL && + options_.oov_handling == ArpaParseOptions::kReplaceWithUnk && + (options_.unk_symbol <= 0 || + options_.unk_symbol == options_.bos_symbol || + options_.unk_symbol == options_.eos_symbol)) + KALDI_ERR << "When symbol table is given and OOV mode is kReplaceWithUnk, " + << "UNK symbol is required, must not be epsilon, and " + << "differ from both BOS and EOS symbols. Given:" + << " UNK=" << options_.unk_symbol + << " BOS=" << options_.bos_symbol + << " EOS=" << options_.eos_symbol; + if (symbols_ != NULL && symbols_->Find(options_.bos_symbol).empty()) + KALDI_ERR << "BOS symbol must exist in symbol table"; + if (symbols_ != NULL && symbols_->Find(options_.eos_symbol).empty()) + KALDI_ERR << "EOS symbol must exist in symbol table"; + if (symbols_ != NULL && options_.unk_symbol > 0 && + symbols_->Find(options_.unk_symbol).empty()) + KALDI_ERR << "UNK symbol must exist in symbol table"; + + ngram_counts_.clear(); + line_number_ = 0; + +#define PARSE_ERR (KALDI_ERR << "in line " << line_number_ << ": ") + + // Give derived class an opportunity to prepare its state. + ReadStarted(); + + std::string line; + + // Processes "\data\" section. + bool keyword_found = false; + while (++line_number_, getline(is, line) && !is.eof()) { + if (line.empty()) continue; + + // The section keywords starts with backslash. We terminate the while loop + // if a new section is found. + if (line[0] == '\\') { + if (!keyword_found && line == "\\data\\") { + KALDI_LOG << "Reading \\data\\ section."; + keyword_found = true; + continue; + } + break; + } + + if (!keyword_found) continue; + + // Enters "\data\" section, and looks for patterns like "ngram 1=1000", + // which means there are 1000 unigrams. + std::size_t equal_symbol_pos = line.find("="); + if (equal_symbol_pos != std::string::npos) + line.replace(equal_symbol_pos, 1, " = "); // Inserts spaces around "=" + std::vector col; + SplitStringToVector(line, " \t", true, &col); + if (col.size() == 4 && col[0] == "ngram" && col[2] == "=") { + int32 order, ngram_count = 0; + if (!ConvertStringToInteger(col[1], &order) || + !ConvertStringToInteger(col[3], &ngram_count)) { + PARSE_ERR << "Cannot parse ngram count '" << line << "'."; + } + if (ngram_counts_.size() <= order) { + ngram_counts_.resize(order); + } + ngram_counts_[order - 1] = ngram_count; + } else { + KALDI_WARN << "Uninterpretable line in \\data\\ section: " << line; + } + } + + if (ngram_counts_.size() == 0) + PARSE_ERR << "\\data\\ section missing or empty."; + + // Signal that grammar order and n-gram counts are known. + HeaderAvailable(); + + NGram ngram; + ngram.words.reserve(ngram_counts_.size()); + + // Processes "\N-grams:" section. 
+ for (int32 cur_order = 1; cur_order <= ngram_counts_.size(); ++cur_order) { + // Skips n-grams with zero count. + if (ngram_counts_[cur_order - 1] == 0) { + KALDI_WARN << "Zero ngram count in ngram order " << cur_order + << "(look for 'ngram " << cur_order << "=0' in the \\data\\ " + << " section). There is possibly a problem with the file."; + continue; + } + + // Must be looking at a \k-grams: directive at this point. + std::ostringstream keyword; + keyword << "\\" << cur_order << "-grams:"; + if (line != keyword.str()) { + PARSE_ERR << "Invalid directive '" << line << "', " + << "expecting '" << keyword.str() << "'."; + } + KALDI_LOG << "Reading " << line << " section."; + + int32 ngram_count = 0; + while (++line_number_, getline(is, line) && !is.eof()) { + if (line.empty()) continue; + if (line[0] == '\\') break; + + std::vector col; + SplitStringToVector(line, " \t", true, &col); + + if (col.size() < 1 + cur_order || + col.size() > 2 + cur_order || + (cur_order == ngram_counts_.size() && col.size() != 1 + cur_order)) { + PARSE_ERR << "Invalid n-gram line '" << line << "'"; + } + ++ngram_count; + + // Parse out n-gram logprob and, if present, backoff weight. + if (!ConvertStringToReal(col[0], &ngram.logprob)) { + PARSE_ERR << "Invalid n-gram logprob '" << col[0] << "'."; + } + ngram.backoff = 0.0; + if (col.size() > cur_order + 1) { + if (!ConvertStringToReal(col[cur_order + 1], &ngram.backoff)) + PARSE_ERR << "Invalid backoff weight '" << col[cur_order + 1] << "'."; + } + // Convert to natural log unless the option is set not to. + if (!options_.use_log10) { + ngram.logprob *= M_LN10; + ngram.backoff *= M_LN10; + } + + ngram.words.resize(cur_order); + bool skip_ngram = false; + for (int32 index = 0; !skip_ngram && index < cur_order; ++index) { + int32 word; + if (symbols_) { + // Symbol table provided, so symbol labels are expected. + if (options_.oov_handling == ArpaParseOptions::kAddToSymbols) { + word = symbols_->AddSymbol(col[1 + index]); + } else { + word = symbols_->Find(col[1 + index]); + if (word == fst::SymbolTable::kNoSymbol) { + switch(options_.oov_handling) { + case ArpaParseOptions::kReplaceWithUnk: + word = options_.unk_symbol; + break; + case ArpaParseOptions::kSkipNGram: + skip_ngram = true; + break; + default: + PARSE_ERR << "Word '" << col[1 + index] + << "' not in symbol table."; + } + } + } + } else { + // Symbols not provided, LM file should contain integers. + if (!ConvertStringToInteger(col[1 + index], &word) || word < 0) { + PARSE_ERR << "invalid symbol '" << col[1 + index] << "'"; + } + } + // Whichever way we got it, an epsilon is invalid. 
+        if (word == 0) {
+          PARSE_ERR << "Epsilon symbol '" << col[1 + index]
+                    << "' is illegal in ARPA LM.";
+        }
+        ngram.words[index] = word;
+      }
+      if (!skip_ngram) {
+        ConsumeNGram(ngram);
+      }
+    }
+    if (ngram_count > ngram_counts_[cur_order - 1]) {
+      PARSE_ERR << "Header said there would be " << ngram_counts_[cur_order - 1]
+                << " n-grams of order " << cur_order << ", but we saw "
+                << ngram_count;
+    }
+  }
+
+  if (line != "\\end\\") {
+    PARSE_ERR << "Invalid or unexpected directive line '" << line << "', "
+              << "expected \\end\\.";
+  }
+
+  ReadComplete();
+
+#undef PARSE_ERR
+}
+
+}  // namespace kaldi
diff --git a/src/lm/arpa-file-parser.h b/src/lm/arpa-file-parser.h
new file mode 100644
index 00000000000..0011fb4ee21
--- /dev/null
+++ b/src/lm/arpa-file-parser.h
@@ -0,0 +1,125 @@
+// lm/arpa-file-parser.h
+
+// Copyright 2014  Guoguo Chen
+// Copyright 2016  Smart Action Company LLC (kkm)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_LM_ARPA_FILE_PARSER_H_
+#define KALDI_LM_ARPA_FILE_PARSER_H_
+
+#include
+#include
+
+#include
+
+#include "base/kaldi-types.h"
+
+namespace kaldi {
+
+/**
+  Options that control ArpaFileParser.
+*/
+struct ArpaParseOptions {
+  enum OovHandling {
+    kRaiseError,      ///< Abort on OOV words.
+    kAddToSymbols,    ///< Add novel words to the symbol table.
+    kReplaceWithUnk,  ///< Replace OOV words with <unk>.
+    kSkipNGram        ///< Skip n-gram with OOV word and continue.
+  };
+
+  ArpaParseOptions()
+      : bos_symbol(-1), eos_symbol(-1), unk_symbol(-1),
+        oov_handling(kRaiseError), use_log10(false) { }
+
+  int32 bos_symbol;  ///< Symbol for <s>. Required, non-epsilon.
+  int32 eos_symbol;  ///< Symbol for </s>. Required, non-epsilon.
+  int32 unk_symbol;  ///< Symbol for <unk>. Required for kReplaceWithUnk.
+  OovHandling oov_handling;  ///< How to handle OOV words in the file.
+  bool use_log10;  ///< Use log10 for prob and backoff weight, not ln.
+};
+
+/**
+  A parsed n-gram from ARPA LM file.
+*/
+struct NGram {
+  NGram() : logprob(0.0), backoff(0.0) { }
+  std::vector<int32> words;  ///< Symbols in left-to-right order.
+  float logprob;             ///< Log-prob of the n-gram.
+  float backoff;             ///< Log-backoff weight of the n-gram.
+};
+
+/**
+   ArpaFileParser is an abstract base class for ARPA LM file conversion.
+
+   See ConstArpaLmBuilder for a usage example.
+*/
+class ArpaFileParser {
+ public:
+  /// Constructs the parser with the given options and optional symbol table.
+  /// If symbol table is provided, then the file should contain text n-grams,
+  /// and the words are mapped to symbols through it.  bos_symbol and
+  /// eos_symbol in the options structure must be valid symbols in the table,
+  /// and so must be unk_symbol if provided.  The table is not owned by the
+  /// parser, but may be augmented, if oov_handling is set to kAddToSymbols.
+  /// If symbol table is a null pointer, the file should contain integer
+  /// symbol values, and oov_handling has no effect.
bos_symbol and eos_symbol + /// must be valid symbols still. + ArpaFileParser(ArpaParseOptions options, fst::SymbolTable* symbols); + virtual ~ArpaFileParser(); + + /// Read ARPA LM file through Kaldi I/O functions. Only text mode is + /// supported. + void Read(std::istream &is, bool binary); + + const ArpaParseOptions& Options() const { return options_; } + + protected: + /// Override called before reading starts. This is the point to prepare + /// any state in the derived class. + virtual void ReadStarted() { } + + /// Override function called to signal that ARPA header with the expected + /// number of n-grams has been read, and ngram_counts() is now valid. + virtual void HeaderAvailable() { } + + /// Pure override that must be implemented to process current n-gram. The + /// n-grams are sent in the file order, which guarantees that all + /// (k-1)-grams are processed before the first k-gram is. + virtual void ConsumeNGram(const NGram&) = 0; + + /// Override function called after the last n-gram has been consumed. + virtual void ReadComplete() { } + + /// Read-only access to symbol table. + const fst::SymbolTable* Symbols() const { return symbols_; } + + /// Inside ConsumeNGram(), provides the current line number. + int32 LineNumber() const { return line_number_; } + + /// N-gram counts. Valid in and after a call to HeaderAvailable(). + const std::vector& NgramCounts() const { return ngram_counts_; } + + private: + ArpaParseOptions options_; + fst::SymbolTable* symbols_; // Not owned. + int32 line_number_; + std::vector ngram_counts_; +}; + +} // namespace kaldi + +#endif // KALDI_LM_ARPA_FILE_PARSER_H_ diff --git a/src/lm/const-arpa-lm.cc b/src/lm/const-arpa-lm.cc index 7f63dce886e..5043933d7f0 100644 --- a/src/lm/const-arpa-lm.cc +++ b/src/lm/const-arpa-lm.cc @@ -22,13 +22,14 @@ #include #include +#include "base/kaldi-math.h" +#include "lm/arpa-file-parser.h" #include "lm/const-arpa-lm.h" #include "util/stl-utils.h" #include "util/text-utils.h" -#include "base/kaldi-math.h" -namespace kaldi { +namespace kaldi { // Auxiliary struct for converting ConstArpaLm format langugae model to Arpa // format. @@ -173,13 +174,10 @@ class LmState { // Class to build ConstArpaLm from Arpa format language model. It relies on the // auxiliary class LmState above. -class ConstArpaLmBuilder { +class ConstArpaLmBuilder : public ArpaFileParser { public: - ConstArpaLmBuilder( - const bool natural_base, const int32 bos_symbol, - const int32 eos_symbol, const int32 unk_symbol) : - natural_base_(natural_base), bos_symbol_(bos_symbol), - eos_symbol_(eos_symbol), unk_symbol_(unk_symbol) { + ConstArpaLmBuilder(ArpaParseOptions options) + : ArpaFileParser(options, NULL) { ngram_order_ = 0; num_words_ = 0; overflow_buffer_size_ = 0; @@ -204,21 +202,21 @@ class ConstArpaLmBuilder { } } - // Reads in the Arpa format language model, parses it and creates LmStates. - void Read(std::istream &is, bool binary); - // Writes ConstArpaLm. void Write(std::ostream &os, bool binary) const; - // Builds ConstArpaLm. - void Build(); - void SetMaxAddressOffset(const int32 max_address_offset) { KALDI_WARN << "You are changing ; the default should " << "not be changed unless you are in testing mode."; max_address_offset_ = max_address_offset; } + protected: + // ArpaFileParser overrides. 
+ virtual void HeaderAvailable(); + virtual void ConsumeNGram(const NGram& ngram); + virtual void ReadComplete(); + private: struct WordsAndLmStatePairLessThan { bool operator()( @@ -229,10 +227,6 @@ class ConstArpaLmBuilder { }; private: - // If true, use natural base e for log-prob, otherwise use base 10. The - // default base in Arpa format language model is base 10. - bool natural_base_; - // Indicating if ConstArpaLm has been built or not. bool is_built_; @@ -240,16 +234,6 @@ class ConstArpaLmBuilder { // The default value is 30-bits and should not be changed except for testing. int32 max_address_offset_; - // Integer corresponds to . - int32 bos_symbol_; - - // Integer corresponds to . - int32 eos_symbol_; - - // Integer corresponds to unknown-word. -1 if no unknown-word symbol is - // provided. - int32 unk_symbol_; - // N-gram order of language model. This can be figured out from "/data/" // section in Arpa format language model. int32 ngram_order_; @@ -280,201 +264,58 @@ class ConstArpaLmBuilder { LmState*, VectorHasher > seq_to_state_; }; -// Reads in the Arpa format language model, parses it and puts the word sequence -// into the corresponding LmState in . -void ConstArpaLmBuilder::Read(std::istream &is, bool binary) { - if (binary) { - KALDI_ERR << "binary-mode reading is not implemented for " - << "ConstArpaLmBuilder."; - } - - std::string line; - - // Number of n-grams from "\data\" section. Those numbers should match the - // actual number of n-grams from "\N-grams:" sections. - // Note that when we convert the words in the Arpa format language model into - // integers, we remove lines with OOV words. We also modify the n-gram counts - // in "\data\" correspondingly. - std::vector num_ngrams; - - // Processes "\data\" section. - bool keyword_found = false; - while (getline(is, line) && !is.eof()) { - // The section keywords starts with backslash. We terminate the while loop - // if a new section is found. - if (!line.empty() && line[0] == '\\') { - if (line.find("-grams:") != std::string::npos) break; - if (line.find("\\end\\") != std::string::npos) break; - } - - std::size_t equal_symbol_pos = line.find("="); - if (equal_symbol_pos != std::string::npos) - line.replace(equal_symbol_pos, 1, " = "); // Inserts spaces around "=" - std::vector col; - SplitStringToVector(line, " \t", true, &col); - - // Looks for keyword "\data\". - if (!keyword_found && col.size() == 1 && col[0] == "\\data\\") { - KALDI_LOG << "Reading \"\\data\\\" section."; - keyword_found = true; - continue; - } +void ConstArpaLmBuilder::HeaderAvailable() { + ngram_order_ = NgramCounts().size(); +} - // Enters "\data\" section, and looks for patterns like"ngram 1=1000", which - // means there are 1000 unigrams. 
- if (keyword_found && col.size() == 4 && col[0] == "ngram") { - if (col[2] == "=") { - int32 order, ngram_count; - if (!ConvertStringToInteger(col[1], &order)) { - KALDI_ERR << "bad line: " << line << "; fail to convert " - << col[1] << " to integer."; - } - if (!ConvertStringToInteger(col[3], &ngram_count)) { - KALDI_ERR << "bad line: " << line << "; fail to convert " - << col[3] << " to integer."; - } - if (num_ngrams.size() <= order) { - num_ngrams.resize(order + 1); - } - num_ngrams[order] = ngram_count; - } else { - KALDI_WARN << "Uninterpretable line \"\\data\\\" section: " << line; - } - } else if (keyword_found) { - KALDI_WARN << "Uninterpretable line \"\\data\\\" section: " << line; - } +void ConstArpaLmBuilder::ConsumeNGram(const NGram& ngram) { + int32 cur_order = ngram.words.size(); + // If is larger than 1, then we do not create LmState for + // the final order entry. We only keep the log probability for it. + LmState *lm_state = NULL; + if (cur_order != ngram_order_ || ngram_order_ == 1) { + lm_state = new LmState(cur_order == 1, + cur_order == ngram_order_ - 1, + ngram.logprob, ngram.backoff); + + KALDI_ASSERT(seq_to_state_.find(ngram.words) == seq_to_state_.end()); + seq_to_state_[ngram.words] = lm_state; } - if (num_ngrams.size() == 0) - KALDI_ERR << "Fail to read \"\\data\\\" section."; - ngram_order_ = num_ngrams.size() - 1; - - // Processes "\N-grams:" section. - int32 max_word_id = 0; - for (int32 cur_order = 1; cur_order < num_ngrams.size(); ++cur_order) { - // Skips n-grams with zero count. - if (num_ngrams[cur_order] == 0) continue; - - keyword_found = false; - int32 ngram_count = 0; - std::ostringstream keyword; - keyword << "\\" << cur_order << "-grams:"; - // We use "do ... while" loop since one line has already been read. - do { - // The section keywords starts with backslash. We terminate the while loop - // if a new section is found. - if (!line.empty() && line[0] == '\\') { - if (line.find("-grams:") != std::string::npos && keyword_found) break; - if (line.find("\\end\\") != std::string::npos) break; - } - std::vector col; - SplitStringToVector(line, " \t", true, &col); - - // Looks for keyword "\N-gram:" if the keyword has not been located. - if (!keyword_found && col.size() == 1 && col[0] == keyword.str()) { - KALDI_LOG << "Reading \"" << keyword.str() << "\" section."; - ngram_count = 0; - keyword_found = true; - continue; - } - - // Enters "\N-grams:" section if the keyword has been located. - if (keyword_found && col.size() > 0) { - KALDI_ASSERT(col.size() >= 1 + cur_order); - KALDI_ASSERT(col.size() <= 2 + cur_order); // backoff_logprob can be 0. - if (cur_order == ngram_order_ && col.size() == 2 + cur_order) { - KALDI_ERR << "Backoff probability detected for final-order entry \"" - << line << "\"."; - } - ngram_count++; - - // If backoff_logprob is 0, it will not appear in Arpa format language - // model. We put it back so the processing afterwards will be easier. - if (col.size() == 1 + cur_order) { - col.push_back("0"); - } - - // Creates LmState for the current word sequence. - bool is_unigram = (cur_order == 1) ? true : false; - float logprob; - float backoff_logprob; - KALDI_ASSERT(ConvertStringToReal(col[0], &logprob)); - KALDI_ASSERT(ConvertStringToReal(col[1 + cur_order], &backoff_logprob)); - if (natural_base_) { - logprob *= Log(10.0f); - backoff_logprob *= Log(10.0f); - } - - // If is larger than 1, then we do not create LmState for - // the final order entry. We only keep the log probability for it. 
- LmState *lm_state = NULL; - if (cur_order != ngram_order_ || ngram_order_ == 1) { - lm_state = new LmState(is_unigram, - (cur_order == ngram_order_ - 1), - logprob, backoff_logprob); - } - - // Figures out the sequence of words. - std::vector seq(cur_order, 0); - for (int32 index = 0; index < cur_order; ++index) { - int32 word; - if (!ConvertStringToInteger(col[1 + index], &word)) { - KALDI_ERR << "bad line: " << line << "; fail to convert " - << col[1 + index] << " to integer."; - } - seq[index] = word; - } - - // If is larger than 1, then we do not insert LmState to - // . - if (cur_order != ngram_order_ || ngram_order_ == 1) { - KALDI_ASSERT(lm_state != NULL); - KALDI_ASSERT(seq_to_state_.find(seq) == seq_to_state_.end()); - seq_to_state_[seq] = lm_state; - } - - // If n-gram order is larger than 1, we have to add possible child to - // existing LmStates. We have the following two assumptions: - // 1. N-grams are processed from small order to larger ones, i.e., from - // 1, 2, ... to the highest order. - // 2. If a n-gram exists in the Arpa format language model, then the - // "history" n-gram also exists. For example, if "A B C" is a valid - // n-gram, then "A B" is also a valid n-gram. - if (cur_order > 1) { - std::vector hist(seq.begin(), seq.begin() + cur_order - 1); - int32 word = seq[seq.size() - 1]; - unordered_map, - LmState*, VectorHasher >::iterator hist_iter; - hist_iter = seq_to_state_.find(hist); - KALDI_ASSERT(hist_iter != seq_to_state_.end()); - if (cur_order != ngram_order_ || ngram_order_ == 1) { - KALDI_ASSERT(lm_state != NULL); - KALDI_ASSERT(!hist_iter->second->IsChildFinalOrder()); - hist_iter->second->AddChild(word, lm_state); - } else { - KALDI_ASSERT(lm_state == NULL); - KALDI_ASSERT(hist_iter->second->IsChildFinalOrder()); - hist_iter->second->AddChild(word, logprob); - } - } else { - // Figures out . - KALDI_ASSERT(seq.size() == 1); - if (seq[0] > max_word_id) { - max_word_id = seq[0]; - } - } - } - } while (getline(is, line) && !is.eof()); - if (ngram_count > num_ngrams[cur_order] || - (ngram_count == 0 && num_ngrams[cur_order] != 0)) { - KALDI_ERR << "Header said there would be " << num_ngrams[cur_order] - << " n-grams of order " << cur_order << ", but we saw " - << ngram_count; + // If n-gram order is larger than 1, we have to add possible child to + // existing LmStates. We have the following two assumptions: + // 1. N-grams are processed from small order to larger ones, i.e., from + // 1, 2, ... to the highest order. + // 2. If a n-gram exists in the Arpa format language model, then the + // "history" n-gram also exists. For example, if "A B C" is a valid + // n-gram, then "A B" is also a valid n-gram. + int32 last_word = ngram.words[cur_order - 1]; + if (cur_order > 1) { + std::vector hist(ngram.words.begin(), ngram.words.end() - 1); + unordered_map, + LmState*, VectorHasher >::iterator hist_iter; + hist_iter = seq_to_state_.find(hist); + if (hist_iter == seq_to_state_.end()) { + std::ostringstream ss; + for (int i = 0; i < cur_order; ++i) + ss << (i == 0 ? 
'[' : ' ') << ngram.words[i]; + KALDI_ERR << "In line " << LineNumber() << ": " + << cur_order << "-gram " << ss.str() << "] does not have " + << "a parent model " << cur_order << "-gram."; + } + if (cur_order != ngram_order_ || ngram_order_ == 1) { + KALDI_ASSERT(lm_state != NULL); + KALDI_ASSERT(!hist_iter->second->IsChildFinalOrder()); + hist_iter->second->AddChild(last_word, lm_state); + } else { + KALDI_ASSERT(lm_state == NULL); + KALDI_ASSERT(hist_iter->second->IsChildFinalOrder()); + hist_iter->second->AddChild(last_word, ngram.logprob); } + } else { + // Figures out . + num_words_ = std::max(num_words_, last_word + 1); } - - // is plus 1. - num_words_ = max_word_id + 1; } // ConstArpaLm can be built in the following steps, assuming we have already @@ -503,7 +344,7 @@ void ConstArpaLmBuilder::Read(std::istream &is, bool binary) { // At the same time, we will also create two special buffers: // // -void ConstArpaLmBuilder::Build() { +void ConstArpaLmBuilder::ReadComplete() { // STEP 1: sorting LmStates lexicographically. // Vector for holding the sorted LmStates. std::vector*, LmState*> > sorted_vec; @@ -637,9 +478,10 @@ void ConstArpaLmBuilder::Write(std::ostream &os, bool binary) const { KALDI_ASSERT(is_built_); // Creates ConstArpaLm. - ConstArpaLm const_arpa_lm(bos_symbol_, eos_symbol_, unk_symbol_, ngram_order_, - num_words_, overflow_buffer_size_, lm_states_size_, - unigram_states_, overflow_buffer_, lm_states_); + ConstArpaLm const_arpa_lm( + Options().bos_symbol, Options().eos_symbol, Options().unk_symbol, + ngram_order_, num_words_, overflow_buffer_size_, lm_states_size_, + unigram_states_, overflow_buffer_, lm_states_); const_arpa_lm.Write(os, binary); } @@ -1224,10 +1066,15 @@ bool BuildConstArpaLm(const bool natural_base, const int32 bos_symbol, const int32 eos_symbol, const int32 unk_symbol, const std::string& arpa_rxfilename, const std::string& const_arpa_wxfilename) { - ConstArpaLmBuilder lm_builder(natural_base, bos_symbol, - eos_symbol, unk_symbol); + ArpaParseOptions options; + options.bos_symbol = bos_symbol; + options.eos_symbol = eos_symbol; + options.unk_symbol = unk_symbol; + options.use_log10 = !natural_base; + + ConstArpaLmBuilder lm_builder(options); + KALDI_LOG << "Reading " << arpa_rxfilename; ReadKaldiObject(arpa_rxfilename, &lm_builder); - lm_builder.Build(); WriteKaldiObject(lm_builder, const_arpa_wxfilename, true); return true; } diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc index 76b83ea7114..b9c85e9ae6e 100644 --- a/src/matrix/kaldi-matrix.cc +++ b/src/matrix/kaldi-matrix.cc @@ -448,7 +448,7 @@ template void MatrixBase::AddMatDiagVec( const Real alpha, const MatrixBase &M, MatrixTransposeType transM, - VectorBase &v, + const VectorBase &v, Real beta) { if (beta != 1.0) this->Scale(beta); diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h index c16ffb22135..add6fab93b3 100644 --- a/src/matrix/kaldi-matrix.h +++ b/src/matrix/kaldi-matrix.h @@ -554,7 +554,7 @@ class MatrixBase { /// The same as adding M but scaling each column M_j by v(j). 
void AddMatDiagVec(const Real alpha, const MatrixBase &M, MatrixTransposeType transM, - VectorBase &v, + const VectorBase &v, Real beta = 1.0); /// *this = beta * *this + alpha * A .* B (.* element by element multiplication) diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index 46e2b0c01dc..bcb6c9e581a 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -108,14 +108,19 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, if (nnet_config_.compute_deriv) nnet_output_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); - if (use_xent) + + const CuMatrixBase *xent_output = NULL; + if (use_xent) { xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); - + + // this block computes the cross-entropy objective. + xent_output = &(computer->GetOutput(xent_name)); + } + BaseFloat tot_like, tot_l2_term, tot_weight; - ComputeChainObjfAndDeriv(chain_config_, den_graph_, - sup.supervision, nnet_output, + sup.supervision, nnet_output, xent_output, &tot_like, &tot_l2_term, &tot_weight, (nnet_config_.compute_deriv ? &nnet_output_deriv : NULL), (use_xent ? &xent_deriv : NULL)); @@ -138,13 +143,10 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, if (use_xent) { ChainObjectiveInfo &xent_totals = objf_info_[xent_name]; - // this block computes the cross-entropy objective. - const CuMatrixBase &xent_output = computer->GetOutput( - xent_name); // at this point, xent_deriv is posteriors derived from the numerator // computation. note, xent_deriv has a factor of '.supervision.weight', // but so does tot_weight. - BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans); + BaseFloat xent_objf = TraceMatMat(*xent_output, xent_deriv, kTrans); xent_totals.tot_weight += tot_weight; xent_totals.tot_like += xent_objf; } diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index dee0eee2a33..1dbade49469 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -113,25 +113,24 @@ void NnetChainTrainer::ProcessOutputs(const NnetChainExample &eg, bool use_xent = (opts_.chain_config.xent_regularize != 0.0); std::string xent_name = sup.name + "-xent"; // typically "output-xent". CuMatrix xent_deriv; - if (use_xent) + const CuMatrixBase *xent_output = NULL; + if (use_xent) { xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); - + // this block computes the cross-entropy objective. + xent_output = &(computer->GetOutput(xent_name)); + } BaseFloat tot_objf, tot_l2_term, tot_weight; - ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_, - sup.supervision, nnet_output, + sup.supervision, nnet_output, xent_output, &tot_objf, &tot_l2_term, &tot_weight, &nnet_output_deriv, (use_xent ? &xent_deriv : NULL)); if (use_xent) { - // this block computes the cross-entropy objective. - const CuMatrixBase &xent_output = computer->GetOutput( - xent_name); // at this point, xent_deriv is posteriors derived from the numerator // computation. 
note, xent_objf has a factor of '.supervision.weight' - BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans); + BaseFloat xent_objf = TraceMatMat((*xent_output), xent_deriv, kTrans); objf_info_[xent_name].UpdateStats(xent_name, opts_.nnet_config.print_interval, num_minibatches_processed_, tot_weight, xent_objf); diff --git a/tools/extras/openfstwin-1.3.4.patch b/tools/extras/openfstwin-1.3.4.patch index e142341f5ba..858a61160fa 100644 --- a/tools/extras/openfstwin-1.3.4.patch +++ b/tools/extras/openfstwin-1.3.4.patch @@ -1,425 +1,425 @@ -diff --git a/src/include/fst/fst.h b/src/include/fst/fst.h -index 5ad3b52..d9c0ca6 100644 ---- a/src/include/fst/fst.h -+++ b/src/include/fst/fst.h -@@ -45,6 +45,12 @@ DECLARE_bool(fst_align); - - namespace fst { - -+ typedef ::int64 int64; -+ typedef ::uint64 uint64; -+ typedef ::int32 int32; -+ typedef ::uint32 uint32; -+ -+ - bool OPENFSTDLL IsFstHeader(istream &, const string &); //ChangedPD - - class FstHeader; -diff --git a/src/include/fst/interval-set.h b/src/include/fst/interval-set.h -index c4362f2..58cad44 100644 ---- a/src/include/fst/interval-set.h -+++ b/src/include/fst/interval-set.h -@@ -37,38 +37,38 @@ template - class IntervalSet { - public: - struct Interval { -- T begin; -- T end; -+ T begin_; -+ T end_; - -- Interval() : begin(-1), end(-1) {} -+ Interval() : begin_(-1), end_(-1) {} - -- Interval(T b, T e) : begin(b), end(e) {} -+ Interval(T b, T e) : begin_(b), end_(e) {} - - bool operator<(const Interval &i) const { -- return begin < i.begin || (begin == i.begin && end > i.end); -+ return begin_ < i.begin_ || (begin_ == i.begin_ && end_ > i.end_); - } - - bool operator==(const Interval &i) const { -- return begin == i.begin && end == i.end; -+ return begin_ == i.begin_ && end_ == i.end_; - } - - bool operator!=(const Interval &i) const { -- return begin != i.begin || end != i.end; -+ return begin_ != i.begin_ || end_ != i.end_; - } - - istream &Read(istream &strm) { - T n; - ReadType(strm, &n); -- begin = n; -+ begin_ = n; - ReadType(strm, &n); -- end = n; -+ end_ = n; - return strm; - } - - ostream &Write(ostream &strm) const { -- T n = begin; -+ T n = begin_; - WriteType(strm, n); -- n = end; -+ n = end_; - WriteType(strm, n); - return strm; - } -@@ -108,7 +108,7 @@ class IntervalSet { - lower_bound(intervals_.begin(), intervals_.end(), interval); - if (lb == intervals_.begin()) - return false; -- return (--lb)->end > value; -+ return (--lb)->end_ > value; - } - - // Requires intervals be normalized. 
-@@ -123,7 +123,7 @@ class IntervalSet { - - bool Singleton() const { - return intervals_.size() == 1 && -- intervals_[0].begin + 1 == intervals_[0].end; -+ intervals_[0].begin_ + 1 == intervals_[0].end_; - } - - -@@ -178,17 +178,17 @@ void IntervalSet::Normalize() { - T size = 0; - for (T i = 0; i < intervals_.size(); ++i) { - Interval &inti = intervals_[i]; -- if (inti.begin == inti.end) -+ if (inti.begin_ == inti.end_) - continue; - for (T j = i + 1; j < intervals_.size(); ++j) { - Interval &intj = intervals_[j]; -- if (intj.begin > inti.end) -+ if (intj.begin_ > inti.end_) - break; -- if (intj.end > inti.end) -- inti.end = intj.end; -+ if (intj.end_ > inti.end_) -+ inti.end_ = intj.end_; - ++i; - } -- count_ += inti.end - inti.begin; -+ count_ += inti.end_ - inti.begin_; - intervals_[size++] = inti; - } - intervals_.resize(size); -@@ -208,17 +208,17 @@ void IntervalSet::Intersect(const IntervalSet &iset, - oset->count_ = 0; - - while (it1 != intervals_.end() && it2 != iintervals->end()) { -- if (it1->end <= it2->begin) { -+ if (it1->end_ <= it2->begin_) { - ++it1; -- } else if (it2->end <= it1->begin) { -+ } else if (it2->end_ <= it1->begin_) { - ++it2; - } else { - Interval interval; -- interval.begin = max(it1->begin, it2->begin); -- interval.end = min(it1->end, it2->end); -+ interval.begin_ = max(it1->begin_, it2->begin_); -+ interval.end_ = min(it1->end_, it2->end_); - ointervals->push_back(interval); -- oset->count_ += interval.end - interval.begin; -- if (it1->end < it2->end) -+ oset->count_ += interval.end_ - interval.begin_; -+ if (it1->end_ < it2->end_) - ++it1; - else - ++it2; -@@ -235,21 +235,21 @@ void IntervalSet::Complement(T maxval, IntervalSet *oset) const { - oset->count_ = 0; - - Interval interval; -- interval.begin = 0; -+ interval.begin_ = 0; - for (typename vector::const_iterator it = intervals_.begin(); - it != intervals_.end(); - ++it) { -- interval.end = min(it->begin, maxval); -- if (interval.begin < interval.end) { -+ interval.end_ = min(it->begin_, maxval); -+ if (interval.begin_ < interval.end_) { - ointervals->push_back(interval); -- oset->count_ += interval.end - interval.begin; -+ oset->count_ += interval.end_ - interval.begin_; - } -- interval.begin = it->end; -+ interval.begin_ = it->end_; - } -- interval.end = maxval; -- if (interval.begin < interval.end) { -+ interval.end_ = maxval; -+ if (interval.begin_ < interval.end_) { - ointervals->push_back(interval); -- oset->count_ += interval.end - interval.begin; -+ oset->count_ += interval.end_ - interval.begin_; - } - } - -@@ -263,7 +263,7 @@ void IntervalSet::Difference(const IntervalSet &iset, - oset->count_ = 0; - } else { - IntervalSet cset; -- iset.Complement(intervals_.back().end, &cset); -+ iset.Complement(intervals_.back().end_, &cset); - Intersect(cset, oset); - } - } -@@ -277,9 +277,9 @@ bool IntervalSet::Overlaps(const IntervalSet &iset) const { - typename vector::const_iterator it2 = intervals->begin(); - - while (it1 != intervals_.end() && it2 != intervals->end()) { -- if (it1->end <= it2->begin) { -+ if (it1->end_ <= it2->begin_) { - ++it1; -- } else if (it2->end <= it1->begin) { -+ } else if (it2->end_ <= it1->begin_) { - ++it2; - } else { - return true; -@@ -300,21 +300,21 @@ bool IntervalSet::StrictlyOverlaps(const IntervalSet &iset) const { - bool overlap = false; // point in both intervals_ and intervals - - while (it1 != intervals_.end() && it2 != intervals->end()) { -- if (it1->end <= it2->begin) { // no overlap - it1 first -+ if (it1->end_ <= it2->begin_) { // no overlap - it1 first 
- only1 = true; - ++it1; -- } else if (it2->end <= it1->begin) { // no overlap - it2 first -+ } else if (it2->end_ <= it1->begin_) { // no overlap - it2 first - only2 = true; - ++it2; -- } else if (it2->begin == it1->begin && it2->end == it1->end) { // equals -+ } else if (it2->begin_ == it1->begin_ && it2->end_ == it1->end_) { // equals - overlap = true; - ++it1; - ++it2; -- } else if (it2->begin <= it1->begin && it2->end >= it1->end) { // 1 c 2 -+ } else if (it2->begin_ <= it1->begin_ && it2->end_ >= it1->end_) { // 1 c 2 - only2 = true; - overlap = true; - ++it1; -- } else if (it1->begin <= it2->begin && it1->end >= it2->end) { // 2 c 1 -+ } else if (it1->begin_ <= it2->begin_ && it1->end_ >= it2->end_) { // 2 c 1 - only1 = true; - overlap = true; - ++it2; -@@ -346,11 +346,11 @@ bool IntervalSet::Contains(const IntervalSet &iset) const { - typename vector::const_iterator it2 = intervals->begin(); - - while (it1 != intervals_.end() && it2 != intervals->end()) { -- if (it1->end <= it2->begin) { // no overlap - it1 first -+ if (it1->end_ <= it2->begin_) { // no overlap - it1 first - ++it1; -- } else if (it2->begin < it1->begin || it2->end > it1->end) { // no C -+ } else if (it2->begin_ < it1->begin_ || it2->end_ > it1->end_) { // no C - return false; -- } else if (it2->end == it1->end) { -+ } else if (it2->end_ == it1->end_) { - ++it1; - ++it2; - } else { -@@ -370,7 +370,7 @@ ostream &operator<<(ostream &strm, const IntervalSet &s) { - ++it) { - if (it != intervals->begin()) - strm << ","; -- strm << "[" << it->begin << "," << it->end << ")"; -+ strm << "[" << it->begin_ << "," << it->end_ << ")"; - } - strm << "}"; - return strm; -diff --git a/src/include/fst/label-reachable.h b/src/include/fst/label-reachable.h -index a7c3360..491ef7d 100644 ---- a/src/include/fst/label-reachable.h -+++ b/src/include/fst/label-reachable.h -@@ -359,9 +359,9 @@ class LabelReachable { - iiter = intervals->begin(); - iiter != intervals->end(); ++iiter) { - begin_low = LowerBound(aiter, end_low, aiter_end, -- aiter_input, iiter->begin); -+ aiter_input, iiter->begin_); - end_low = LowerBound(aiter, begin_low, aiter_end, -- aiter_input, iiter->end); -+ aiter_input, iiter->end_); - if (end_low - begin_low > 0) { - if (reach_begin_ < 0) - reach_begin_ = begin_low; -diff --git a/src/include/fst/minimize.h b/src/include/fst/minimize.h -index 3fbe3ba..6e9dd3d 100644 ---- a/src/include/fst/minimize.h -+++ b/src/include/fst/minimize.h -@@ -134,7 +134,14 @@ class CyclicMinimizer { - typedef typename A::Weight Weight; - typedef ReverseArc RevA; - -- CyclicMinimizer(const ExpandedFst& fst) { -+ CyclicMinimizer(const ExpandedFst& fst): -+ // tell the Partition data-member to expect multiple repeated -+ // calls to SplitOn with the same element if we are non-deterministic. -+ P_(fst.Properties(kIDeterministic, true) == 0) { -+ if(fst.Properties(kIDeterministic, true) == 0) -+ CHECK(Weight::Properties() & kIdempotent); // this minimization -+ // algorithm for non-deterministic FSTs can only work with idempotent -+ // semirings. - Initialize(fst); - Compute(fst); - } -@@ -315,7 +322,13 @@ class AcyclicMinimizer { - typedef typename A::StateId ClassId; - typedef typename A::Weight Weight; - -- AcyclicMinimizer(const ExpandedFst& fst) { -+ AcyclicMinimizer(const ExpandedFst& fst): -+ // tell the Partition data-member to expect multiple repeated -+ // calls to SplitOn with the same element if we are non-deterministic. 
-+ partition_(fst.Properties(kIDeterministic, true) == 0) { -+ if(fst.Properties(kIDeterministic, true) == 0) -+ CHECK(Weight::Properties() & kIdempotent); // minimization for -+ // non-deterministic FSTs can only work with idempotent semirings. - Initialize(fst); - Refine(fst); - } -@@ -531,13 +544,7 @@ template - void Minimize(MutableFst* fst, - MutableFst* sfst = 0, - float delta = kDelta) { -- uint64 props = fst->Properties(kAcceptor | kIDeterministic| -- kWeighted | kUnweighted, true); -- if (!(props & kIDeterministic)) { -- FSTERROR() << "FST is not deterministic"; -- fst->SetProperties(kError, kError); -- return; -- } -+ uint64 props = fst->Properties(kAcceptor | kWeighted | kUnweighted, true); - - if (!(props & kAcceptor)) { // weighted transducer - VectorFst< GallicArc > gfst; -diff --git a/src/include/fst/partition.h b/src/include/fst/partition.h -index dcee67b..40b849a 100644 ---- a/src/include/fst/partition.h -+++ b/src/include/fst/partition.h -@@ -43,8 +43,8 @@ class Partition { - friend class PartitionIterator; - - struct Element { -- Element() : value(0), next(0), prev(0) {} -- Element(T v) : value(v), next(0), prev(0) {} -+ Element() : value(0), next(0), prev(0) {} -+ Element(T v) : value(v), next(0), prev(0) {} - - T value; - Element* next; -@@ -52,9 +52,11 @@ class Partition { - }; - - public: -- Partition() {} -+ Partition(bool allow_repeated_split): -+ allow_repeated_split_(allow_repeated_split) {} - -- Partition(T num_states) { -+ Partition(bool allow_repeated_split, T num_states): -+ allow_repeated_split_(allow_repeated_split) { - Initialize(num_states); - } - -@@ -137,16 +139,16 @@ class Partition { - if (class_size_[class_id] == 1) return; - - // first time class is split -- if (split_size_[class_id] == 0) -+ if (split_size_[class_id] == 0) { - visited_classes_.push_back(class_id); -- -+ class_split_[class_id] = classes_[class_id]; -+ } - // increment size of split (set of element at head of chain) - split_size_[class_id]++; -- -+ - // update split point -- if (class_split_[class_id] == 0) -- class_split_[class_id] = classes_[class_id]; -- if (class_split_[class_id] == elements_[element_id]) -+ if (class_split_[class_id] != 0 -+ && class_split_[class_id] == elements_[element_id]) - class_split_[class_id] = elements_[element_id]->next; - - // move to head of chain in same class -@@ -157,24 +159,31 @@ class Partition { - // class indices of the newly created class. Returns the new_class id - // or -1 if no new class was created. - T SplitRefine(T class_id) { -+ -+ Element* split_el = class_split_[class_id]; - // only split if necessary -- if (class_size_[class_id] == split_size_[class_id]) { -- class_split_[class_id] = 0; -+ //if (class_size_[class_id] == split_size_[class_id]) { -+ if(split_el == NULL) { // we split on everything... - split_size_[class_id] = 0; - return -1; - } else { -- - T new_class = AddClass(); -+ -+ if(allow_repeated_split_) { // split_size_ is possibly -+ // inaccurate, so work it out exactly. 
-+ size_t split_count; Element *e; -+ for(split_count=0,e=classes_[class_id]; -+ e != split_el; split_count++, e=e->next); -+ split_size_[class_id] = split_count; -+ } - size_t remainder = class_size_[class_id] - split_size_[class_id]; - if (remainder < split_size_[class_id]) { // add smaller -- Element* split_el = class_split_[class_id]; - classes_[new_class] = split_el; -- class_size_[class_id] = split_size_[class_id]; -- class_size_[new_class] = remainder; - split_el->prev->next = 0; - split_el->prev = 0; -+ class_size_[class_id] = split_size_[class_id]; -+ class_size_[new_class] = remainder; - } else { -- Element* split_el = class_split_[class_id]; - classes_[new_class] = classes_[class_id]; - class_size_[class_id] = remainder; - class_size_[new_class] = split_size_[class_id]; -@@ -245,10 +254,16 @@ class Partition { - vector class_size_; - - // size of split for each class -+ // in the nondeterministic case, split_size_ is actually an upper -+ // bound on the size of split for each class. - vector split_size_; - - // set of visited classes to be used in split refine - vector visited_classes_; -+ -+ // true if input fst was deterministic: we can make -+ // certain assumptions in this case that speed up the algorithm. -+ bool allow_repeated_split_; - }; - - -diff --git a/src/include/fst/state-reachable.h b/src/include/fst/state-reachable.h -index 6d0c971..1da922e 100644 ---- a/src/include/fst/state-reachable.h -+++ b/src/include/fst/state-reachable.h -@@ -112,7 +112,7 @@ class IntervalReachVisitor { - void FinishState(StateId s, StateId p, const A *arc) { - if (index_ >= 0 && fst_.Final(s) != Weight::Zero()) { - vector *intervals = (*isets_)[s].Intervals(); -- (*intervals)[0].end = index_; // Update tree interval end -+ (*intervals)[0].end_ = index_; // Update tree interval end - } - (*isets_)[s].Normalize(); - if (p != kNoStateId) +diff --git a/src/include/fst/fst.h b/src/include/fst/fst.h +index 5ad3b52..d9c0ca6 100644 +--- a/src/include/fst/fst.h ++++ b/src/include/fst/fst.h +@@ -45,6 +45,12 @@ DECLARE_bool(fst_align); + + namespace fst { + ++ typedef ::int64 int64; ++ typedef ::uint64 uint64; ++ typedef ::int32 int32; ++ typedef ::uint32 uint32; ++ ++ + bool OPENFSTDLL IsFstHeader(istream &, const string &); //ChangedPD + + class FstHeader; +diff --git a/src/include/fst/interval-set.h b/src/include/fst/interval-set.h +index c4362f2..58cad44 100644 +--- a/src/include/fst/interval-set.h ++++ b/src/include/fst/interval-set.h +@@ -37,38 +37,38 @@ template + class IntervalSet { + public: + struct Interval { +- T begin; +- T end; ++ T begin_; ++ T end_; + +- Interval() : begin(-1), end(-1) {} ++ Interval() : begin_(-1), end_(-1) {} + +- Interval(T b, T e) : begin(b), end(e) {} ++ Interval(T b, T e) : begin_(b), end_(e) {} + + bool operator<(const Interval &i) const { +- return begin < i.begin || (begin == i.begin && end > i.end); ++ return begin_ < i.begin_ || (begin_ == i.begin_ && end_ > i.end_); + } + + bool operator==(const Interval &i) const { +- return begin == i.begin && end == i.end; ++ return begin_ == i.begin_ && end_ == i.end_; + } + + bool operator!=(const Interval &i) const { +- return begin != i.begin || end != i.end; ++ return begin_ != i.begin_ || end_ != i.end_; + } + + istream &Read(istream &strm) { + T n; + ReadType(strm, &n); +- begin = n; ++ begin_ = n; + ReadType(strm, &n); +- end = n; ++ end_ = n; + return strm; + } + + ostream &Write(ostream &strm) const { +- T n = begin; ++ T n = begin_; + WriteType(strm, n); +- n = end; ++ n = end_; + WriteType(strm, n); + return 
strm; + } +@@ -108,7 +108,7 @@ class IntervalSet { + lower_bound(intervals_.begin(), intervals_.end(), interval); + if (lb == intervals_.begin()) + return false; +- return (--lb)->end > value; ++ return (--lb)->end_ > value; + } + + // Requires intervals be normalized. +@@ -123,7 +123,7 @@ class IntervalSet { + + bool Singleton() const { + return intervals_.size() == 1 && +- intervals_[0].begin + 1 == intervals_[0].end; ++ intervals_[0].begin_ + 1 == intervals_[0].end_; + } + + +@@ -178,17 +178,17 @@ void IntervalSet::Normalize() { + T size = 0; + for (T i = 0; i < intervals_.size(); ++i) { + Interval &inti = intervals_[i]; +- if (inti.begin == inti.end) ++ if (inti.begin_ == inti.end_) + continue; + for (T j = i + 1; j < intervals_.size(); ++j) { + Interval &intj = intervals_[j]; +- if (intj.begin > inti.end) ++ if (intj.begin_ > inti.end_) + break; +- if (intj.end > inti.end) +- inti.end = intj.end; ++ if (intj.end_ > inti.end_) ++ inti.end_ = intj.end_; + ++i; + } +- count_ += inti.end - inti.begin; ++ count_ += inti.end_ - inti.begin_; + intervals_[size++] = inti; + } + intervals_.resize(size); +@@ -208,17 +208,17 @@ void IntervalSet::Intersect(const IntervalSet &iset, + oset->count_ = 0; + + while (it1 != intervals_.end() && it2 != iintervals->end()) { +- if (it1->end <= it2->begin) { ++ if (it1->end_ <= it2->begin_) { + ++it1; +- } else if (it2->end <= it1->begin) { ++ } else if (it2->end_ <= it1->begin_) { + ++it2; + } else { + Interval interval; +- interval.begin = max(it1->begin, it2->begin); +- interval.end = min(it1->end, it2->end); ++ interval.begin_ = max(it1->begin_, it2->begin_); ++ interval.end_ = min(it1->end_, it2->end_); + ointervals->push_back(interval); +- oset->count_ += interval.end - interval.begin; +- if (it1->end < it2->end) ++ oset->count_ += interval.end_ - interval.begin_; ++ if (it1->end_ < it2->end_) + ++it1; + else + ++it2; +@@ -235,21 +235,21 @@ void IntervalSet::Complement(T maxval, IntervalSet *oset) const { + oset->count_ = 0; + + Interval interval; +- interval.begin = 0; ++ interval.begin_ = 0; + for (typename vector::const_iterator it = intervals_.begin(); + it != intervals_.end(); + ++it) { +- interval.end = min(it->begin, maxval); +- if (interval.begin < interval.end) { ++ interval.end_ = min(it->begin_, maxval); ++ if (interval.begin_ < interval.end_) { + ointervals->push_back(interval); +- oset->count_ += interval.end - interval.begin; ++ oset->count_ += interval.end_ - interval.begin_; + } +- interval.begin = it->end; ++ interval.begin_ = it->end_; + } +- interval.end = maxval; +- if (interval.begin < interval.end) { ++ interval.end_ = maxval; ++ if (interval.begin_ < interval.end_) { + ointervals->push_back(interval); +- oset->count_ += interval.end - interval.begin; ++ oset->count_ += interval.end_ - interval.begin_; + } + } + +@@ -263,7 +263,7 @@ void IntervalSet::Difference(const IntervalSet &iset, + oset->count_ = 0; + } else { + IntervalSet cset; +- iset.Complement(intervals_.back().end, &cset); ++ iset.Complement(intervals_.back().end_, &cset); + Intersect(cset, oset); + } + } +@@ -277,9 +277,9 @@ bool IntervalSet::Overlaps(const IntervalSet &iset) const { + typename vector::const_iterator it2 = intervals->begin(); + + while (it1 != intervals_.end() && it2 != intervals->end()) { +- if (it1->end <= it2->begin) { ++ if (it1->end_ <= it2->begin_) { + ++it1; +- } else if (it2->end <= it1->begin) { ++ } else if (it2->end_ <= it1->begin_) { + ++it2; + } else { + return true; +@@ -300,21 +300,21 @@ bool IntervalSet::StrictlyOverlaps(const 
IntervalSet &iset) const { + bool overlap = false; // point in both intervals_ and intervals + + while (it1 != intervals_.end() && it2 != intervals->end()) { +- if (it1->end <= it2->begin) { // no overlap - it1 first ++ if (it1->end_ <= it2->begin_) { // no overlap - it1 first + only1 = true; + ++it1; +- } else if (it2->end <= it1->begin) { // no overlap - it2 first ++ } else if (it2->end_ <= it1->begin_) { // no overlap - it2 first + only2 = true; + ++it2; +- } else if (it2->begin == it1->begin && it2->end == it1->end) { // equals ++ } else if (it2->begin_ == it1->begin_ && it2->end_ == it1->end_) { // equals + overlap = true; + ++it1; + ++it2; +- } else if (it2->begin <= it1->begin && it2->end >= it1->end) { // 1 c 2 ++ } else if (it2->begin_ <= it1->begin_ && it2->end_ >= it1->end_) { // 1 c 2 + only2 = true; + overlap = true; + ++it1; +- } else if (it1->begin <= it2->begin && it1->end >= it2->end) { // 2 c 1 ++ } else if (it1->begin_ <= it2->begin_ && it1->end_ >= it2->end_) { // 2 c 1 + only1 = true; + overlap = true; + ++it2; +@@ -346,11 +346,11 @@ bool IntervalSet::Contains(const IntervalSet &iset) const { + typename vector::const_iterator it2 = intervals->begin(); + + while (it1 != intervals_.end() && it2 != intervals->end()) { +- if (it1->end <= it2->begin) { // no overlap - it1 first ++ if (it1->end_ <= it2->begin_) { // no overlap - it1 first + ++it1; +- } else if (it2->begin < it1->begin || it2->end > it1->end) { // no C ++ } else if (it2->begin_ < it1->begin_ || it2->end_ > it1->end_) { // no C + return false; +- } else if (it2->end == it1->end) { ++ } else if (it2->end_ == it1->end_) { + ++it1; + ++it2; + } else { +@@ -370,7 +370,7 @@ ostream &operator<<(ostream &strm, const IntervalSet &s) { + ++it) { + if (it != intervals->begin()) + strm << ","; +- strm << "[" << it->begin << "," << it->end << ")"; ++ strm << "[" << it->begin_ << "," << it->end_ << ")"; + } + strm << "}"; + return strm; +diff --git a/src/include/fst/label-reachable.h b/src/include/fst/label-reachable.h +index a7c3360..491ef7d 100644 +--- a/src/include/fst/label-reachable.h ++++ b/src/include/fst/label-reachable.h +@@ -359,9 +359,9 @@ class LabelReachable { + iiter = intervals->begin(); + iiter != intervals->end(); ++iiter) { + begin_low = LowerBound(aiter, end_low, aiter_end, +- aiter_input, iiter->begin); ++ aiter_input, iiter->begin_); + end_low = LowerBound(aiter, begin_low, aiter_end, +- aiter_input, iiter->end); ++ aiter_input, iiter->end_); + if (end_low - begin_low > 0) { + if (reach_begin_ < 0) + reach_begin_ = begin_low; +diff --git a/src/include/fst/minimize.h b/src/include/fst/minimize.h +index 3fbe3ba..6e9dd3d 100644 +--- a/src/include/fst/minimize.h ++++ b/src/include/fst/minimize.h +@@ -134,7 +134,14 @@ class CyclicMinimizer { + typedef typename A::Weight Weight; + typedef ReverseArc RevA; + +- CyclicMinimizer(const ExpandedFst& fst) { ++ CyclicMinimizer(const ExpandedFst& fst): ++ // tell the Partition data-member to expect multiple repeated ++ // calls to SplitOn with the same element if we are non-deterministic. ++ P_(fst.Properties(kIDeterministic, true) == 0) { ++ if(fst.Properties(kIDeterministic, true) == 0) ++ CHECK(Weight::Properties() & kIdempotent); // this minimization ++ // algorithm for non-deterministic FSTs can only work with idempotent ++ // semirings. 
+ Initialize(fst); + Compute(fst); + } +@@ -315,7 +322,13 @@ class AcyclicMinimizer { + typedef typename A::StateId ClassId; + typedef typename A::Weight Weight; + +- AcyclicMinimizer(const ExpandedFst& fst) { ++ AcyclicMinimizer(const ExpandedFst& fst): ++ // tell the Partition data-member to expect multiple repeated ++ // calls to SplitOn with the same element if we are non-deterministic. ++ partition_(fst.Properties(kIDeterministic, true) == 0) { ++ if(fst.Properties(kIDeterministic, true) == 0) ++ CHECK(Weight::Properties() & kIdempotent); // minimization for ++ // non-deterministic FSTs can only work with idempotent semirings. + Initialize(fst); + Refine(fst); + } +@@ -531,13 +544,7 @@ template + void Minimize(MutableFst* fst, + MutableFst* sfst = 0, + float delta = kDelta) { +- uint64 props = fst->Properties(kAcceptor | kIDeterministic| +- kWeighted | kUnweighted, true); +- if (!(props & kIDeterministic)) { +- FSTERROR() << "FST is not deterministic"; +- fst->SetProperties(kError, kError); +- return; +- } ++ uint64 props = fst->Properties(kAcceptor | kWeighted | kUnweighted, true); + + if (!(props & kAcceptor)) { // weighted transducer + VectorFst< GallicArc > gfst; +diff --git a/src/include/fst/partition.h b/src/include/fst/partition.h +index dcee67b..40b849a 100644 +--- a/src/include/fst/partition.h ++++ b/src/include/fst/partition.h +@@ -43,8 +43,8 @@ class Partition { + friend class PartitionIterator; + + struct Element { +- Element() : value(0), next(0), prev(0) {} +- Element(T v) : value(v), next(0), prev(0) {} ++ Element() : value(0), next(0), prev(0) {} ++ Element(T v) : value(v), next(0), prev(0) {} + + T value; + Element* next; +@@ -52,9 +52,11 @@ class Partition { + }; + + public: +- Partition() {} ++ Partition(bool allow_repeated_split): ++ allow_repeated_split_(allow_repeated_split) {} + +- Partition(T num_states) { ++ Partition(bool allow_repeated_split, T num_states): ++ allow_repeated_split_(allow_repeated_split) { + Initialize(num_states); + } + +@@ -137,16 +139,16 @@ class Partition { + if (class_size_[class_id] == 1) return; + + // first time class is split +- if (split_size_[class_id] == 0) ++ if (split_size_[class_id] == 0) { + visited_classes_.push_back(class_id); +- ++ class_split_[class_id] = classes_[class_id]; ++ } + // increment size of split (set of element at head of chain) + split_size_[class_id]++; +- ++ + // update split point +- if (class_split_[class_id] == 0) +- class_split_[class_id] = classes_[class_id]; +- if (class_split_[class_id] == elements_[element_id]) ++ if (class_split_[class_id] != 0 ++ && class_split_[class_id] == elements_[element_id]) + class_split_[class_id] = elements_[element_id]->next; + + // move to head of chain in same class +@@ -157,24 +159,31 @@ class Partition { + // class indices of the newly created class. Returns the new_class id + // or -1 if no new class was created. + T SplitRefine(T class_id) { ++ ++ Element* split_el = class_split_[class_id]; + // only split if necessary +- if (class_size_[class_id] == split_size_[class_id]) { +- class_split_[class_id] = 0; ++ //if (class_size_[class_id] == split_size_[class_id]) { ++ if(split_el == NULL) { // we split on everything... + split_size_[class_id] = 0; + return -1; + } else { +- + T new_class = AddClass(); ++ ++ if(allow_repeated_split_) { // split_size_ is possibly ++ // inaccurate, so work it out exactly. 
++ size_t split_count; Element *e; ++ for(split_count=0,e=classes_[class_id]; ++ e != split_el; split_count++, e=e->next); ++ split_size_[class_id] = split_count; ++ } + size_t remainder = class_size_[class_id] - split_size_[class_id]; + if (remainder < split_size_[class_id]) { // add smaller +- Element* split_el = class_split_[class_id]; + classes_[new_class] = split_el; +- class_size_[class_id] = split_size_[class_id]; +- class_size_[new_class] = remainder; + split_el->prev->next = 0; + split_el->prev = 0; ++ class_size_[class_id] = split_size_[class_id]; ++ class_size_[new_class] = remainder; + } else { +- Element* split_el = class_split_[class_id]; + classes_[new_class] = classes_[class_id]; + class_size_[class_id] = remainder; + class_size_[new_class] = split_size_[class_id]; +@@ -245,10 +254,16 @@ class Partition { + vector class_size_; + + // size of split for each class ++ // in the nondeterministic case, split_size_ is actually an upper ++ // bound on the size of split for each class. + vector split_size_; + + // set of visited classes to be used in split refine + vector visited_classes_; ++ ++ // true if input fst was deterministic: we can make ++ // certain assumptions in this case that speed up the algorithm. ++ bool allow_repeated_split_; + }; + + +diff --git a/src/include/fst/state-reachable.h b/src/include/fst/state-reachable.h +index 6d0c971..1da922e 100644 +--- a/src/include/fst/state-reachable.h ++++ b/src/include/fst/state-reachable.h +@@ -112,7 +112,7 @@ class IntervalReachVisitor { + void FinishState(StateId s, StateId p, const A *arc) { + if (index_ >= 0 && fst_.Final(s) != Weight::Zero()) { + vector *intervals = (*isets_)[s].Intervals(); +- (*intervals)[0].end = index_; // Update tree interval end ++ (*intervals)[0].end_ = index_; // Update tree interval end + } + (*isets_)[s].Normalize(); + if (p != kNoStateId) diff --git a/windows/INSTALL b/windows/INSTALL deleted file mode 100644 index d743129498b..00000000000 --- a/windows/INSTALL +++ /dev/null @@ -1,146 +0,0 @@ - -# Installation instructions for native Windows with Visual -# studio (for cygwin installation, see the instructions -# in ../INSTALL). - -#NOTE: These instructions are valid June 2015, MKL and OpenBLAS are supported -#NOTE: ATLAS is not supported and I personally have no intention to work on supporting -# it, as it requires whole cygwin environment -#NOTE: We now (20150613) support CUDA on Windows as well. The build was -# tested on CUDA 7.0. It is possible that the compilation fails -# for significantly older CUDA SDK (less than, say, 5.0) -# Please not that CUDA support for windows is not really that usefull, -# because, the speed benefit during decoding is not large. And for training -# one would have to re-implement the while training pipeline (as the -# bash script wouldn't most probably work) -#NOTE: While the 32bit project files will still be generated, we don't really -# care if they work or not. They will be removed in the near future. -#NOTE: The build process were validated using MSVS2013 and partially (MKL only) using MSVS2015-rc -#NOTE: We support only openfst-1.3.x for now. -#NOTE: I suggest to have git installed -- not only because we will -# use it to download the source codes (you could download archives -# instead of it), but also because the windows version comes -# with a bunch of useful utilities. -#NOTE: The examples will assume you have installed the git for windows -# and during the installation you chose the GIT Shell to install as well. 
-# Moreover, all the commands are issued from the same session - -1) Checkout Kaldi trunk, either using the svn from the url - https://svn.code.sf.net/p/kaldi/code/trunk - or using git from - https://github.com/kaldi-asr/kaldi.git - Example: - $ git clone https://github.com/kaldi-asr/kaldi.git kaldi - -2) enter the (kaldi)/tools directory in the freshly - checked-out kaldi repo. All following actions should - be taken in the tools dir - Example: - $ cd (kaldi)/tools - (kaldi)/tools$ pwd - - -2a) Use git to clone the OpenFST(win) from - https://github.com/jtrmal/openfstwin-1.3.4.git - Example: - (kaldi)/tools$ git clone https://github.com/jtrmal/openfstwin-1.3.4.git openfst - - -2b) Download pthread-win32 (or wget or curl) - https://sourceforge.net/projects/pthreads4w/ - (kaldi)/tools$ wget http://downloads.sourceforge.net/project/pthreads4w/pthreads-w32-2-9-1-release.zip - (kaldi)/tools$ mkdir pthreads; cd pthreads - (kaldi)/tools/pthreads$ unzip ../pthreads-w32-2-9-1-release.zip - -2c) Use patch (or you can use git patch) to patch the OpenFST(win) - patch location tools/extras/openfstwin-1.3.4.patch, - Example: - (kaldi)/tools$ cd openfst - (kaldi)/tools/openfst$ patch -p1 <../extras/openfstwin-1.3.4.patch - -2d-1) Download the OpenBLAS binary packages - https://sourceforge.net/projects/openblas - (kaldi)/tools$ wget http://sourceforge.net/projects/openblas/files/v0.2.14/OpenBLAS-v0.2.14-Win64-int32.zip - (kaldi)/tools$ wget http://sourceforge.net/projects/openblas/files/v0.2.14/mingw64_dll.zip - (kaldi)/tools$ unzip OpenBLAS-v0.2.14-Win64-int32.zip - (kaldi)/tools$ unzip mingw64_dll.zip - - NOTE: Be carefull to download "Win64-int32" and not "Win64-int64"! - -2d-2) Install MKL -2e) If you want enabled CUDA support, download and install NVidia CUDA SDK. - Be careful and strive for as standard install as possible. The installer - set certain environment variables on which the MSVC Build rules rely. - If you call "set" in the command line, you should see: - - (kaldi)/tools $ set | grep CUDA - CUDA_PATH='C:\Users\Yenda\Downloads\cuda' - CUDA_PATH_V7_0='C:\Users\Yenda\Downloads\cuda' - NVCUDASAMPLES7_0_ROOT='C:\Users\Yenda\Downloads\cuda' - NVCUDASAMPLES_ROOT='C:\Users\Yenda\Downloads\cuda' - - The first one (CUDA_PATH) is particulary important. - -3) Open the OpenFST solution in VS -- - for VS 2013, the correct solution is in VS2012 directory - for VS 2014, the correct solution is in VS2014 directory - !!!switch the configuration to debug|x64 and build the solution - !!!The same for configuration release|x64 - If either of the two won't build, you should stop here and start figuring what's different! - -4) Enter the (kaldi)/windows directory - Example: - (kaldi)/tools/openfst$ cd ../../windows - (kaldi)/windows $ pwd - -4a) modify the file variables.props to reflect - the correct paths, using your favorite text editor. - Don't worry, it's a text file, even though you have to be - careful to keep the structure itself intact - (kaldi)/windows $ vim variables.props - - If you plan to use MKL, you can ignore the OPENBLASDIR path - If you plan to use OpenBLAS, you can ignore the MKLDIR path - No matter what you plan to use, set both the OPENFST* and PTHREADW - variables correctly - -4b-1) For OpenBLAS support, copy the file "kaldiwin_openblas.props" to "kaldiwin.props" -4b-2) For MKL support, you don't have to do anything, it should work out of the box. 
-     When you need to switch from OpenBLAS to MKL, copy the "kaldiwin_mkl.props"
-     to "kaldiwin.props"
-
-
-4c) call the script that generates the MSVC solution
-    i.e.
-    generate_solution.pl --vsver
-    i.e. for example
-    generate_solution.pl --vsver vs2013
-
-    For CUDA support, add switch --enable-cuda to the command line,
-    i.e. for example
-    generate_solution.pl --vsver vs2013 --enable-cuda
-
-5) Open the generated solution in the visual studio and switch to Debug|x64 (or Release|x64) and build
-   Expect 10 projects to fail, majority of them will fail because of missing include "portaudio.h"
-
-------
-NOTE: I'm leaving the information about ATLAS here, for reference (also do not forget to consult the README.ATLAS)
-(B) either
-   (i) compile ATLAS under cygwin [see INSTALL.atlas] and copy
-       kaldiwin_atlas.props to kaldiwin.props
-
-(D)
-If you had installed ATLAS, you next have to do this:
-[assuming you are one level above this directory]
-cd kaldiwin_vs10_auto/
-
-# type the following (these commands were done from cygwin): note that these
-# commands are a bit wasteful of disk; you could alternatively ensure that
-# [root]/tools/ATLAS/cygwin_build/install/lib/ is always on your path when you
-# run the binaries.
-
-mkdir -p Debug Release
-cp ../tools/ATLAS/cygwin_build/install/lib/lib_atlas.dll Debug
-cp ../tools/ATLAS/cygwin_build/install/lib/lib_atlas.dll Release
-
-Then build the project with Visual Studio.
diff --git a/windows/INSTALL.md b/windows/INSTALL.md
new file mode 100644
index 00000000000..6a57d3d2ee2
--- /dev/null
+++ b/windows/INSTALL.md
@@ -0,0 +1,176 @@
+
+# Installation instructions for native Windows with Visual Studio
+
+For cygwin installation, see the instructions in `../INSTALL`.
+
+## Notes
+
+* These instructions are valid as of June 2015; MKL and OpenBLAS are supported.
+* ATLAS is not supported and I personally have no intention to work on supporting
+  it, as it requires the whole cygwin environment.
+* We now (20150613) support CUDA on Windows as well. The build was
+  tested on CUDA 7.0. It is possible that the compilation fails
+  for a significantly older CUDA SDK (less than, say, 5.0).
+  Please note that CUDA support for Windows is not really that useful,
+  because the speed benefit during decoding is not large, and for training
+  one would have to re-implement the whole training pipeline (as the
+  bash scripts most probably would not work).
+* While the 32bit project files will still be generated, we don't really
+  care if they work or not. They will be removed in the near future.
+* The build process was validated using MSVS2013 and partially (MKL only) using MSVS2015-rc.
+* We support only openfst-1.3.x for now.
+* I suggest having git installed -- not only because we will
+  use it to download the source code (you could download archives
+  instead), but also because the Windows version comes
+  with a bunch of useful utilities.
+* The examples will assume you have installed Git for Windows
+  and, during the installation, chose to install the Git Shell as well.
+  Moreover, all the commands are issued from the same session.
+
+## Steps
+
+1. Check out the Kaldi trunk, either using svn from the URL https://svn.code.sf.net/p/kaldi/code/trunk
+   or using git from https://github.com/kaldi-asr/kaldi.git
+
+   Example:
+
+       $ git clone https://github.com/kaldi-asr/kaldi.git kaldi
+
+2. Enter the `(kaldi)/tools` directory in the freshly
+   checked-out kaldi repo. All following actions should
+   be taken in the tools dir.
+There are two options to use for BLAS (linear algebra): MKL and OpenBLAS. MKL is made by Intel and is optimised
+for their processors. Unfortunately it isn't free. OpenBLAS is a free alternative with similar performance.
+
+6. If using MKL, install it.
+
+7. If using OpenBLAS, download the binary packages.
+
+   https://sourceforge.net/projects/openblas
+
+       (kaldi)/tools$ wget http://sourceforge.net/projects/openblas/files/v0.2.14/OpenBLAS-v0.2.14-Win64-int32.zip
+       (kaldi)/tools$ wget http://sourceforge.net/projects/openblas/files/v0.2.14/mingw64_dll.zip
+       (kaldi)/tools$ unzip OpenBLAS-v0.2.14-Win64-int32.zip
+       (kaldi)/tools$ unzip mingw64_dll.zip
+
+   **Be careful to download "Win64-int32" and not "Win64-int64"!**
+
+8. If you want CUDA support enabled, download and install the NVidia CUDA SDK.
+   Be careful and strive for as standard an install as possible. The installer
+   sets certain environment variables on which the MSVC Build rules rely.
+   If you call "set" in the command line, you should see:
+
+       (kaldi)/tools $ set | grep CUDA
+       CUDA_PATH='C:\Users\Yenda\Downloads\cuda'
+       CUDA_PATH_V7_0='C:\Users\Yenda\Downloads\cuda'
+       NVCUDASAMPLES7_0_ROOT='C:\Users\Yenda\Downloads\cuda'
+       NVCUDASAMPLES_ROOT='C:\Users\Yenda\Downloads\cuda'
+
+   The first one (`CUDA_PATH`) is particularly important.
+
+9. Open the OpenFST solution in VS
+
+   * for VS 2013, the correct solution is in the VS2012 directory
+   * for VS 2015, the correct solution is in the VS2014 directory
+
+   **Switch the configuration to `debug|x64` and build the solution.**
+
+   **Do the same for configuration `release|x64`.**
+
+   If either of the two won't build, you should stop here and start figuring out what's different!
+
+10. Enter the `(kaldi)/windows` directory
+
+    Example:
+
+        (kaldi)/tools/openfst$ cd ../../windows
+        (kaldi)/windows $ pwd
+
+11. Modify the file `variables.props` to reflect
+    the correct paths, using your favorite text editor.
+    Don't worry, it's a text file, even though you have to be
+    careful to keep the structure itself intact.
+
+        (kaldi)/windows $ vim variables.props
+
+    If you plan to use MKL, you can ignore the `OPENBLASDIR` path.
+    If you plan to use OpenBLAS, you can ignore the `MKLDIR` path.
+    No matter what you plan to use, set both the `OPENFST*` and `PTHREADW`
+    variables correctly.
+
+12. For OpenBLAS support, copy the file `kaldiwin_openblas.props` to `kaldiwin.props`.
+13. For MKL support, you don't have to do anything, it should work out of the box.
+    When you need to switch from OpenBLAS to MKL, copy the `kaldiwin_mkl.props`
+    to `kaldiwin.props`.
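+    For example (assuming you are still in the `(kaldi)/windows` directory and using the
+    `cp` that ships with the Git Shell), switching between the two is just a file copy:
+
+        (kaldi)/windows $ cp kaldiwin_openblas.props kaldiwin.props   # build against OpenBLAS
+        (kaldi)/windows $ cp kaldiwin_mkl.props kaldiwin.props        # switch (back) to MKL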
+14. Call the script that generates the MSVC solution
+
+        generate_solution.pl --vsver <vsver> [--enable-cuda] [--enable-openblas] [--enable-mkl]
+
+    `--enable-mkl` is the default, so you shouldn't need to pass it. Passing `--enable-openblas` disables MKL support.
+    CUDA is disabled by default. The default Visual Studio version is 11.0 (Visual Studio 2012).
+
+    For example, for a build supporting CUDA, using OpenBLAS and VS 2015, you would run:
+
+        (kaldi)/windows $ generate_solution.pl --vsver vs2015 --enable-cuda --enable-openblas
+
+15. Open the generated solution in Visual Studio, switch to Debug|x64 (or Release|x64), and build.
+    Expect 10 projects to fail; the majority of them will fail because of the missing include `portaudio.h`.
+
+------
+NOTE: I'm leaving the information about ATLAS here, for reference (also do not forget to consult the README.ATLAS)
+
+(B) either
+    (i) compile ATLAS under cygwin [see INSTALL.atlas] and copy
+        kaldiwin_atlas.props to kaldiwin.props
+
+(D)
+If you had installed ATLAS, you next have to do this:
+[assuming you are one level above this directory]
+
+    cd kaldiwin_vs10_auto/
+
+Type the following (these commands were done from cygwin): note that these
+commands are a bit wasteful of disk; you could alternatively ensure that
+[root]/tools/ATLAS/cygwin_build/install/lib/ is always on your path when you
+run the binaries.
+
+    mkdir -p Debug Release
+    cp ../tools/ATLAS/cygwin_build/install/lib/lib_atlas.dll Debug
+    cp ../tools/ATLAS/cygwin_build/install/lib/lib_atlas.dll Release
+
+Then build the project with Visual Studio.