diff --git a/.gitattributes b/.gitattributes index 5a815654b4c..bede44edf8a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -15,4 +15,6 @@ windows/INSTALL* eol=native windows/NewGuidCmd.exe.config text eol=crlf windows/NewGuidCmd.exe binary +# Prevent git changing CR-LF to LF when archiving (patch requires CR-LF on Windows). +**/*.patch -text diff --git a/egs/swbd/s5c/local/chain/README.txt b/egs/swbd/s5c/local/chain/README.txt index 4df9eb2a2c5..71ab9f0fa45 100644 --- a/egs/swbd/s5c/local/chain/README.txt +++ b/egs/swbd/s5c/local/chain/README.txt @@ -6,5 +6,7 @@ ones to look at right now: 4f is a good jesus-layer system 4q is an improved TDNN with various bells and whistles from Vijay. 4r is a slightly-better jesus-layer system than 4f, with one more layer. + 5e is the best configuration run so far. + diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4v.sh b/egs/swbd/s5c/local/chain/run_tdnn_4v.sh index 6a85bde4653..9cdbfefb5a2 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_4v.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_4v.sh @@ -4,6 +4,15 @@ # from 1.0 to 2.0 because there is a lot of parameter change in the final xent # layer, and this limits the rate of change of the other layers. +#./compare_wer.sh 4r 4v +#System 4r 4v +#WER on train_dev(tg) 16.50 15.95 +#WER on train_dev(fg) 15.45 14.69 +#WER on eval2000(tg) 18.3 17.7 +#WER on eval2000(fg) 16.7 16.0 +#Final train prob -0.103652 -0.106646 -1.60775 +#Final valid prob -0.121105 -0.118631 -1.62832 + # _4r is as _4f, but one more hidden layer, and reducing context of existing # layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly # from 1500 to 1400. diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4w.sh b/egs/swbd/s5c/local/chain/run_tdnn_4w.sh index 62b87cccd06..6dd5c587f7a 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_4w.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_4w.sh @@ -1,6 +1,16 @@ #!/bin/bash -# _4w is as _4v, but doubling --xent-regularize to 0.2 +# _4w is as _4v, but doubling --xent-regularize to 0.2. WER seems consistently a +# bit worse, although final valid prob is very slightly better. + +#./compare_wer.sh 4v 4w +#System 4v 4w +#WER on train_dev(tg) 15.95 16.05 +#WER on train_dev(fg) 14.69 14.92 +#WER on eval2000(tg) 17.7 18.0 +#WER on eval2000(fg) 16.0 16.2 +#Final train prob -0.106646 -0.108816 +#Final valid prob -0.118631 -0.118254 # _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change # from 1.0 to 2.0 because there is a lot of parameter change in the final xent # layer, and this limits the rate of change of the other layers. diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4x.sh b/egs/swbd/s5c/local/chain/run_tdnn_4x.sh index cb04a39be51..0290e0bdbd5 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_4x.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_4x.sh @@ -1,7 +1,17 @@ #!/bin/bash # _4x is as _4u, but with --leaky-hmm-coefficient 0.2. Note: the -# ultimate baseline is 4f. +# ultimate baseline is 4f. It seems a little bit worse than 4u on average: (+0.2, +0.2, 0.0, -0.1). +# So I'm guessing the best value is around --leaky-hmm-coefficient 0.1. +# +# ./compare_wer.sh 4f 4u 4x +# System 4f 4u 4x +# WER on train_dev(tg) 16.83 16.47 16.63 +# WER on train_dev(fg) 15.73 15.23 15.42 +# WER on eval2000(tg) 18.4 18.4 18.4 +# WER on eval2000(fg) 16.6 16.7 16.6 +# Final train prob -0.105832 -0.118911 -0.130674 +# Final valid prob -0.123021 -0.135768 -0.146351 # _4u is as _4t, but with --leaky-hmm-coefficient 0.08. Note: the # ultimate baseline is 4f.
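[Note on the .gitattributes hunk at the top of this diff: "**/*.patch -text" unsets the text attribute for all .patch files, so git will not rewrite their CR-LF line endings at checkout or when the tree is exported with "git archive"; per the comment in that hunk, the Windows patches need their CR-LF endings preserved. A quick way to confirm the attribute is picked up (the path below is only a hypothetical example):
    git check-attr text -- windows/example_fix.patch
    # expected output: windows/example_fix.patch: text: unset
]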
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5a.sh b/egs/swbd/s5c/local/chain/run_tdnn_5a.sh new file mode 100755 index 00000000000..cd1de07a80d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5a.sh @@ -0,0 +1,401 @@ +#!/bin/bash + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. Very helpful (between 0.2% +# and 0.6%). + +#./compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
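+# (Editorial note, not part of the original log: the per-system deltas quoted in
+# these comparisons, e.g. the "0.2% better"/"0.2% worse" annotations for 2o->2y in
+# the table above, are just column-wise WER differences. A throwaway shell check,
+# using the 2o and 2y columns from that table:
+#   paste <(printf '17.24\n15.93\n18.7\n16.9\n') <(printf '16.99\n15.86\n18.9\n17.0\n') | awk '{printf "%+.2f ", $2-$1} END{print ""}'
+# which prints roughly -0.25 -0.07 +0.20 +0.10, i.e. the figures behind those
+# better/worse annotations.)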
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
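+# (Illustration added for the _p note a couple of paragraphs above: with the
+#  frames-overlap of 30 described there, each eg presumably gets 10 zero-weight
+#  frames at each edge, then a ramp up to 1.0 over the next 10 frames. Assuming a
+#  linear ramp, the first 25 per-frame deriv weights would look like:
+#    awk 'BEGIN{for(i=1;i<=25;i++){w=(i<=10)?0:(i<=20)?(i-10)/10:1; printf "%g ", w}; print ""}'
+#  -> 0 0 0 0 0 0 0 0 0 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1 1 1 1 1 1 )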
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5a # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
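+  # (Readability note added here, summarizing the header above: relative to 4w,
+  #  this 5a run keeps --xent-regularize 0.2 and re-uses the 2y egs via --egs-dir;
+  #  the only change is the wider Jesus layer, --jesus-forward-input-dim 400->500
+  #  and --jesus-forward-output-dim 1400->1800, passed in through --jesus-opts.)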
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5b.sh b/egs/swbd/s5c/local/chain/run_tdnn_5b.sh new file mode 100755 index 00000000000..7e44c10920e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5b.sh @@ -0,0 +1,404 @@ +#!/bin/bash + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5b # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
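+  # (Readability note added here, summarizing the header above: 5b is the same
+  #  configuration as 5a (Jesus dims 500/1800, --xent-regularize 0.2, re-used 2y
+  #  egs), with the single addition of --leaky-hmm-coefficient 0.1 below.)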
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5c.sh b/egs/swbd/s5c/local/chain/run_tdnn_5c.sh new file mode 100755 index 00000000000..93ebb59b16d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5c.sh @@ -0,0 +1,409 @@ +#!/bin/bash + +# _5c is as _4w, but changing --xent-regularize to 0.05, since 0.2 seemed to be +# worse than 0.1. +# It seems a little worse on average: WER change is (+0.3, +0.3, -0.2, +0.2). +#System 4w 5c +#WER on train_dev(tg) 16.05 16.35 +#WER on train_dev(fg) 14.92 15.21 +#WER on eval2000(tg) 18.0 17.8 +#WER on eval2000(fg) 16.2 16.4 +#Final train prob -0.108816 -0.107098 +#Final valid prob -0.118254 -0.118209 + +# _4w is as _4v, but doubling --xent-regularize to 0.2. WER seems consistently +# a bit worse (+0.1, +0.2, +0.3, +0.2), although final valid prob is very +# slightly better. + +#./compare_wer.sh 4v 4w +#System 4v 4w +#WER on train_dev(tg) 15.95 16.05 +#WER on train_dev(fg) 14.69 14.92 +#WER on eval2000(tg) 17.7 18.0 +#WER on eval2000(fg) 16.0 16.2 +#Final train prob -0.106646 -0.108816 +#Final valid prob -0.118631 -0.118254 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. 
Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5c # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
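+  # (Readability note added here, summarizing the header above: 5c drops back to
+  #  4w's Jesus dims of 400/1400 and only changes --xent-regularize, from 0.2 in
+  #  4w down to 0.05; --leaky-hmm-coefficient is not used in this run.)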
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.05 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5d.sh b/egs/swbd/s5c/local/chain/run_tdnn_5d.sh new file mode 100755 index 00000000000..8e6e9358003 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5d.sh @@ -0,0 +1,407 @@ +#!/bin/bash + +# _5d is as _5b, but increasing jesus-forward-input-dim from 500 to 600 and +# jesus-forward-output-dim from 1800 to 2000. + +# It's maybe slightly helpful: WER change is (-0.2, -0.2, 0, +0.1). +#./compare_wer.sh 5b 5d +#System 5b 5d +#WER on train_dev(tg) 15.51 15.29 +#WER on train_dev(fg) 14.39 14.17 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.7 +#Final train prob -0.112013 -0.107858 +#Final valid prob -0.130879 -0.128862 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. 
+ +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
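# [Editor's illustration; not part of this patch.] The --splice-indexes strings discussed
# above control how much acoustic context the network sees: the per-layer offsets
# compose, so the model's total left (right) context is roughly the sum of the most
# negative (most positive) offset at each layer. A quick check for the splicing used in
# these 5x runs:
#
splice_indexes="-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0"
left=0; right=0
for layer in $splice_indexes; do
  min=$(printf '%s\n' "$layer" | tr ',' '\n' | sort -n | head -n1)   # most negative offset
  max=$(printf '%s\n' "$layer" | tr ',' '\n' | sort -n | tail -n1)   # most positive offset
  left=$((left - min)); right=$((right + max))
done
echo "total left context: $left frames, total right context: $right frames"   # 17 and 12 here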
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
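# [Editor's illustration; not part of this patch.] The _p note above describes the egs
# edge handling: each chunk gets per-frame derivative weights that are zero for the
# first/last 10 frames and then ramp linearly up to 1.0 over the next 10. A toy sketch
# of that weight schedule for a 150-frame chunk (the real implementation lives in the
# egs-dumping/training code and may differ in detail):
#
awk -v n=150 'BEGIN {
  for (t = 0; t < n; t++) {
    d = (t < n - 1 - t) ? t : n - 1 - t;    # distance from the nearer chunk edge
    if (d < 10)       w = 0.0;              # zero derivative weight right at the edges
    else if (d < 20)  w = (d - 10) / 10.0;  # linear ramp towards 1.0
    else              w = 1.0;
    printf("%3d %.2f\n", t, w);
  }
}'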
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
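# [Editor's sketch; not part of this patch, and the corresponding part of the script is
# not visible above.] The "_sp will get added" note next to the dir= line presumably
# corresponds to something like the following near the top of the script (variable names
# here are my guess, not taken from the patch):
#
#   suffix=
#   if [ "$speed_perturb" == "true" ]; then suffix=_sp; fi
#   dir=${dir}$suffix                 # e.g. exp/chain/tdnn_5d -> exp/chain/tdnn_5d_sp
#   train_set=train_nodup$suffix      # would match the data/${train_set}_hires used below
#
# which would also be consistent with the exp/tri4_lats_nodup$suffix path passed to training.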
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 2000 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5e.sh b/egs/swbd/s5c/local/chain/run_tdnn_5e.sh new file mode 100755 index 00000000000..ed48b0673b8 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5e.sh @@ -0,0 +1,417 @@ +#!/bin/bash + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. 
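# [Editor's note; not part of this patch.] The "Final train prob" / "Final valid prob"
# rows in these comparison tables are the training diagnostics for the final model. If
# the log layout matches other nnet3 setups (an assumption on my part, not something
# stated in this patch), they can be read off with something like:
#
#   grep -H Overall exp/chain/tdnn_5e_sp/log/compute_prob_{train,valid}.final.log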
+ +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5e # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
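# [Editor's illustration; not part of this patch.] Because the script sources
# utils/parse_options.sh, any variable in the configuration section above can be
# overridden from the command line (dashes map to underscores). For example, to skip
# training and go straight to graph building and decoding with an existing model:
#
#   local/chain/run_tdnn_5e.sh --stage 13
#
# or to resume training from a particular iteration (the iteration number here is just
# illustrative):
#
#   local/chain/run_tdnn_5e.sh --stage 12 --train-stage 150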
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5f.sh b/egs/swbd/s5c/local/chain/run_tdnn_5f.sh new file mode 100755 index 00000000000..5fb1f0c445c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5f.sh @@ -0,0 +1,423 @@ +#!/bin/bash + +# _5f is as _5e, but making the 5b->5d change (increasing the +# number of parameters)-- increasing jesus-forward-output-dim from 1800 to 2000, +# and jesus-forward-input-dim from 500 to 600. + +# WER change is (-0.1, -0.2, +0.2, +0.1). So zero on average. +# This means 5e remains the best system so far. + +#./compare_wer.sh 5e 5f +#System 5e 5f +#WER on train_dev(tg) 15.43 15.35 +#WER on train_dev(fg) 14.32 14.15 +#WER on eval2000(tg) 17.3 17.5 +#WER on eval2000(fg) 15.5 15.6 +#Final train prob -0.110056 -0.10574 +#Final valid prob -0.129184 -0.128112 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.05 is better than 0.2 or 0.1). + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5f # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
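# [Editor's note; not part of this patch.] Like the other 5x runs, this script passes
# --egs-dir exp/chain/tdnn_2y_sp/egs below, i.e. it re-uses the examples dumped for the
# 2y run rather than dumping new ones; that is why the history above talks about keeping
# the layer contexts small enough to re-use the egs. If the egs directory carries an
# info/ subdirectory (an assumption; the file names below are not taken from this patch),
# a quick compatibility check would be:
#
#   for f in frames_per_eg left_context right_context; do
#     echo -n "$f: "; cat exp/chain/tdnn_2y_sp/egs/info/$f
#   done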
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 2000 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/wsj/s5/steps/nnet2/get_lda_block.sh b/egs/wsj/s5/steps/nnet2/get_lda_block.sh index c840e014250..7bd4ecf5647 100755 --- a/egs/wsj/s5/steps/nnet2/get_lda_block.sh +++ b/egs/wsj/s5/steps/nnet2/get_lda_block.sh @@ -104,7 +104,7 @@ while [ $[$cur_index+$block_size] -le $feat_dim ]; do echo >> $dir/indexes num_blocks=$[$num_blocks+1] cur_index=$[$cur_index+$block_shift] - if [ $[$cur_index+$block_size-1] -gt $feat_dim ]; then + if [ $[$cur_index+$block_size] -gt $feat_dim ]; then cur_index=$[$feat_dim-$block_size]; fi done diff --git a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh index 93588ffc874..f2af7d0fdcb 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh @@ -101,7 +101,7 @@ right_deriv_truncate= # number of time-steps to avoid using the deriv of, on th # End configuration section. 
-trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM +trap 'for pid in $(jobs -pr); do kill -TERM $pid; done' INT QUIT TERM echo "$0 $@" # Print the command line for logging @@ -497,7 +497,9 @@ while [ $x -lt $num_iters ]; do rm $dir/.error 2>/dev/null - ( # this sub-shell is so that when we "wait" below, + ( + trap 'for pid in $(jobs -pr); do kill -TERM $pid; done' INT QUIT TERM + # this sub-shell is so that when we "wait" below, # we only wait for the training jobs that we just spawned, # not the diagnostic jobs that we spawned above. diff --git a/src/base/kaldi-math.h b/src/base/kaldi-math.h index e28ddcc1a09..ac590a06a25 100644 --- a/src/base/kaldi-math.h +++ b/src/base/kaldi-math.h @@ -41,20 +41,19 @@ #endif #ifndef M_PI -# define M_PI 3.1415926535897932384626433832795 +#define M_PI 3.1415926535897932384626433832795 #endif #ifndef M_SQRT2 -# define M_SQRT2 1.4142135623730950488016887 +#define M_SQRT2 1.4142135623730950488016887 #endif - #ifndef M_2PI -# define M_2PI 6.283185307179586476925286766559005 +#define M_2PI 6.283185307179586476925286766559005 #endif #ifndef M_SQRT1_2 -# define M_SQRT1_2 0.7071067811865475244008443621048490 +#define M_SQRT1_2 0.7071067811865475244008443621048490 #endif #ifndef M_LOG_2PI @@ -65,6 +64,11 @@ #define M_LN2 0.693147180559945309417232121458 #endif +#ifndef M_LN10 +#define M_LN10 2.302585092994045684017991454684 +#endif + + #define KALDI_ISNAN std::isnan #define KALDI_ISINF std::isinf #define KALDI_ISFINITE(x) std::isfinite(x) diff --git a/src/chain/chain-datastruct.h b/src/chain/chain-datastruct.h index d3ffb913d78..52e388a3f2e 100644 --- a/src/chain/chain-datastruct.h +++ b/src/chain/chain-datastruct.h @@ -46,7 +46,7 @@ extern "C" { // Search for this in chain-kernels.cu for an explanation. 
- enum { kOccupationRescalingPowerOfTwo = 20, kThresholdingPowerOfTwo = 14 }; + enum { kThresholdingPowerOfTwo = 14 }; } diff --git a/src/chain/chain-den-graph.cc b/src/chain/chain-den-graph.cc index 7414bb5fd39..ceb61a550f0 100644 --- a/src/chain/chain-den-graph.cc +++ b/src/chain/chain-den-graph.cc @@ -139,87 +139,6 @@ void DenominatorGraph::SetInitialProbs(const fst::StdVectorFst &fst) { Vector avg_prob_float(avg_prob); initial_probs_ = avg_prob_float; - special_hmm_state_ = ComputeSpecialState(fst, avg_prob_float); -} - -int32 NumStatesThatCanReach(const fst::StdVectorFst &fst, - int32 dest_state) { - int32 num_states = fst.NumStates(), - num_states_can_reach = 0; - KALDI_ASSERT(dest_state >= 0 && dest_state < num_states); - std::vector can_reach(num_states, false); - std::vector > reverse_transitions(num_states); - for (int32 s = 0; s < num_states; s++) - for (fst::ArcIterator aiter(fst, s); !aiter.Done(); - aiter.Next()) - reverse_transitions[aiter.Value().nextstate].push_back(s); - std::vector queue; - can_reach[dest_state] = true; - queue.push_back(dest_state); - num_states_can_reach++; - while (!queue.empty()) { - int32 state = queue.back(); - queue.pop_back(); - std::vector::const_iterator iter = reverse_transitions[state].begin(), - end = reverse_transitions[state].end(); - for (; iter != end; ++iter) { - int32 prev_state = *iter; - if (!can_reach[prev_state]) { - can_reach[prev_state] = true; - queue.push_back(prev_state); - num_states_can_reach++; - } - } - } - KALDI_ASSERT(num_states_can_reach >= 1 && - num_states_can_reach <= num_states); - return num_states_can_reach; -} - - -int32 DenominatorGraph::ComputeSpecialState( - const fst::StdVectorFst &fst, - const Vector &initial_probs) { - int32 num_states = initial_probs.Dim(); - std::vector num_transitions_into(num_states, 0); - for (int32 s = 0; s < fst.NumStates(); s++) { - for (fst::ArcIterator aiter(fst, s); !aiter.Done(); - aiter.Next()) - num_transitions_into[aiter.Value().nextstate]++; - } - // this vector 'pairs' is a vector of pairs (-num-transitions-into-state, state). - std::vector > pairs(num_states); - for (int32 i = 0; i < num_states; i++) { - pairs[i].first = -num_transitions_into[i]; - pairs[i].second = i; - } - // the first element of each pair is the negative of the num-transitions, so - // when we sort, the highest num-transitions will be first. - std::sort(pairs.begin(), pairs.end()); - - // this threshold of 0.75 is pretty arbitrary. We reject any - // state if it can't be reached by 75% of all other states. - // In practice we think that states will either be reachable by - // almost-all states, or almost-none (e.g. states that are active - // only at utterance-beginning), so this threshold shouldn't - // be too critical. - int32 min_states_can_reach = 0.75 * num_states; - for (int32 i = 0; i < num_states; i++) { - int32 state = pairs[i].second; - int32 n = NumStatesThatCanReach(fst, state); - if (n < min_states_can_reach) { - KALDI_WARN << "Rejecting state " << state << " as a 'special' HMM state " - << "(for renormalization in fwd-bkwd), because it's only " - << "reachable by " << n << " out of " << num_states - << " states."; - } else { - return state; - } - } - KALDI_ERR << "Found no states that are reachable by at least " - << min_states_can_reach << " out of " << num_states - << " states. This is unexpected. 
Change the threshold"; - return -1; } void DenominatorGraph::GetNormalizationFst(const fst::StdVectorFst &ifst, @@ -271,6 +190,34 @@ void MinimizeAcceptorNoPush(fst::StdVectorFst *fst) { fst::Decode(fst, encoder); } +// This static function, used in CreateDenominatorFst, sorts an +// fst's states in decreasing order of number of transitions (into + out of) +// the state. The aim is to have states that have a lot of transitions +// either into them or out of them, be numbered earlier, so hopefully +// they will be scheduled first and won't delay the computation +static void SortOnTransitionCount(fst::StdVectorFst *fst) { + // negative_num_transitions[i] will contain (before sorting), the pair + // ( -(num-transitions-into(i) + num-transition-out-of(i)), i) + int32 num_states = fst->NumStates(); + std::vector > negative_num_transitions(num_states); + for (int32 i = 0; i < num_states; i++) { + negative_num_transitions[i].first = 0; + negative_num_transitions[i].second = i; + } + for (int32 i = 0; i < num_states; i++) { + for (fst::ArcIterator aiter(*fst, i); !aiter.Done(); + aiter.Next()) { + negative_num_transitions[i].first--; + negative_num_transitions[aiter.Value().nextstate].first--; + } + } + std::sort(negative_num_transitions.begin(), negative_num_transitions.end()); + std::vector order(num_states); + for (int32 i = 0; i < num_states; i++) + order[negative_num_transitions[i].second] = i; + fst::StateSort(fst, order); +} + void DenGraphMinimizeWrapper(fst::StdVectorFst *fst) { for (int32 i = 1; i <= 3; i++) { fst::PushSpecial(fst, fst::kDelta * 0.01); @@ -424,6 +371,8 @@ void CreateDenominatorFst(const ContextDependency &ctx_dep, DenGraphMinimizeWrapper(&transition_id_fst); + SortOnTransitionCount(&transition_id_fst); + *den_fst = transition_id_fst; CheckDenominatorFst(trans_model.NumPdfs(), *den_fst); PrintDenGraphStats(*den_fst); diff --git a/src/chain/chain-den-graph.h b/src/chain/chain-den-graph.h index 8e5ee39e4bd..b2510651f39 100644 --- a/src/chain/chain-den-graph.h +++ b/src/chain/chain-den-graph.h @@ -88,13 +88,6 @@ class DenominatorGraph { // Note: we renormalize each HMM-state to sum to one before doing this. const CuVector &InitialProbs() const; - // returns the index of the HMM-state that has the highest value in - // InitialProbs (and which we believe will always be reachable from most other - // states... later on we may check this more carefully [TODO]). - // It's used in getting the 'arbitrary_scale' value to keep the alphas - // in a good dynamic range. - int32 SpecialHmmState() const { return special_hmm_state_; } - // This function outputs a modifified version of the FST that was used to // build this object, that has an initial-state with epsilon transitions to // each state, with weight determined by initial_probs_; and has each original @@ -116,23 +109,15 @@ class DenominatorGraph { // functions called from the constructor void SetTransitions(const fst::StdVectorFst &fst, int32 num_pfds); - // work out the initial-probs and the 'special state' - // Note, there are no final-probs; we treat all states as final - // with probability one [we have a justification for this.. - // assuming it's roughly a well-normalized HMM, this makes sense; - // note that we train on chunks, so the beginning and end of a chunk - // appear at arbitrary points in the sequence. - // At both beginning and end of the chunk, we limit ourselves to - // only those pdf-ids that were allowed in the numerator sequence. + // work out the initial-probs. 
Note, there are no final-probs; we treat all + // states as final with probability one [we have a justification for this.. + // assuming it's roughly a well-normalized HMM, this makes sense; note that we + // train on chunks, so the beginning and end of a chunk appear at arbitrary + // points in the sequence. At both beginning and end of the chunk, we limit + // ourselves to only those pdf-ids that were allowed in the numerator + // sequence. void SetInitialProbs(const fst::StdVectorFst &fst); - // return a suitable 'special' HMM-state used for normalizing probabilities in - // the forward-backward. It has to have a reasonably high probability and be - // reachable from most of the graph. returns a suitable state-index - // that we can set special_hmm_state_ to. - int32 ComputeSpecialState(const fst::StdVectorFst &fst, - const Vector &initial_probs); - // forward_transitions_ is an array, indexed by hmm-state index, // of start and end indexes into the transition_ array, which // give us the set of transitions out of this state. @@ -152,23 +137,9 @@ class DenominatorGraph { // distribution of the HMM. This isn't too critical. CuVector initial_probs_; - // The index of a somewhat arbitrarily chosen HMM-state that we - // use for adjusting the alpha probabilities. It needs to be - // one that is reachable from all states (i.e. not a special - // state that's only reachable at sentence-start). We choose - // whichever one has the greatest initial-prob. It's set - // in SetInitialProbs(). - int32 special_hmm_state_; - int32 num_pdfs_; }; -// returns the number of states from which there is a path to -// 'dest_state'. Utility function used in selecting 'special' state -// for normalization of probabilities. -int32 NumStatesThatCanReach(const fst::StdVectorFst &fst, - int32 dest_state); - // Function that does acceptor minimization without weight pushing... // this is useful when constructing the denominator graph. diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index 80d51bc661f..258c33cd465 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -86,10 +86,7 @@ void DenominatorComputation::AlphaGeneralFrame(int32 t) { const DenominatorGraphTransition *transitions = den_graph_.Transitions(); int32 num_pdfs = exp_nnet_output_transposed_.NumRows(), num_hmm_states = den_graph_.NumStates(), - num_sequences = num_sequences_, - special_hmm_state = num_hmm_states; - // special_hmm_state now points to the alpha-sum quantity which is located - // in the sam place as the num_hmm_states'th hmm state would be. + num_sequences = num_sequences_; // 'probs' is the matrix of pseudo-likelihoods for frame t - 1. 
CuSubMatrix probs(exp_nnet_output_transposed_, 0, num_pdfs, @@ -103,8 +100,8 @@ void DenominatorComputation::AlphaGeneralFrame(int32 t) { dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); cuda_chain_hmm_forward(dimGrid, dimBlock, backward_transitions, transitions, - num_sequences, special_hmm_state, prob_data, - probs.Stride(), prev_alpha_dash, this_alpha); + num_sequences, prob_data, probs.Stride(), + prev_alpha_dash, this_alpha); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -126,15 +123,16 @@ void DenominatorComputation::AlphaGeneralFrame(int32 t) { this_prev_alpha = prev_alpha_dash[prev_hmm_state * num_sequences + s]; this_tot_alpha += this_prev_alpha * transition_prob * prob; } - // Let arbitrary_scale be the inverse of the alpha value for the - // hmm-state indexed special_hmm_state_ on the previous frame (for this - // sequence); we multiply this into all the transition-probabilities - // from the previous frame to this frame, in both the forward and - // backward passes, in order to keep the alphas in a good numeric range. - // This won't affect the posteriors, but when computing the total - // likelihood we'll need to compensate for it later on. + // Let arbitrary_scale be the inverse of the alpha-sum value that we + // store in the same place we'd store the alpha for the state numbered + // 'num_hmm_states'. We multiply this into all the + // transition-probabilities from the previous frame to this frame, in + // both the forward and backward passes, in order to keep the alphas in + // a good numeric range. This won't affect the posteriors, but when + // computing the total likelihood we'll need to compensate for it later + // on. BaseFloat arbitrary_scale = - 1.0 / prev_alpha_dash[special_hmm_state * num_sequences + s]; + 1.0 / prev_alpha_dash[num_hmm_states * num_sequences + s]; KALDI_ASSERT(this_tot_alpha - this_tot_alpha == 0); this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; } @@ -259,10 +257,7 @@ bool DenominatorComputation::Backward( *nnet_output_deriv, t * num_sequences_, chunk_frames * num_sequences_, 0, num_pdfs); - const BaseFloat occupation_arbitrary_factor_inv = - (1 << kOccupationRescalingPowerOfTwo); - output_deriv_part.AddMat(deriv_weight * occupation_arbitrary_factor_inv, - transposed_deriv_part, kTrans); + output_deriv_part.AddMat(deriv_weight, transposed_deriv_part, kTrans); if (t != 0) transposed_deriv_part.SetZero(); } @@ -310,8 +305,7 @@ void DenominatorComputation::BetaDashGeneralFrame(int32 t) { t_wrapped * num_sequences_, num_sequences_); int32 num_hmm_states = den_graph_.NumStates(), - num_sequences = num_sequences_, - special_hmm_state = num_hmm_states; + num_sequences = num_sequences_; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { @@ -319,10 +313,9 @@ void DenominatorComputation::BetaDashGeneralFrame(int32 t) { dim3 dimBlock(std::min(CU1DBLOCK, num_sequences), 1, 1); dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); cuda_chain_hmm_backward(dimGrid, dimBlock, forward_transitions, transitions, - num_sequences, special_hmm_state, - probs.Data(), probs.Stride(), this_alpha_dash, - next_beta, this_beta_dash, log_prob_deriv.Data(), - log_prob_deriv.Stride()); + num_sequences, probs.Data(), probs.Stride(), + this_alpha_dash, next_beta, this_beta_dash, + log_prob_deriv.Data(), log_prob_deriv.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -336,14 +329,9 @@ void 
DenominatorComputation::BetaDashGeneralFrame(int32 t) { for (int32 s = 0; s < num_sequences; s++) { BaseFloat this_alpha_dash_prob = this_alpha_dash[h * num_sequences + s], inv_arbitrary_scale = - this_alpha_dash[special_hmm_state * num_sequences + s]; + this_alpha_dash[num_hmm_states * num_sequences + s]; double tot_variable_factor = 0.0; - // search for 'occupation_arbitrary_factor' in chain-kernels.cu for - // an explanation. - const BaseFloat occupation_arbitrary_factor = - (1.0 / (1 << kOccupationRescalingPowerOfTwo)); - BaseFloat occupation_factor = - (occupation_arbitrary_factor * this_alpha_dash_prob) / + BaseFloat occupation_factor = this_alpha_dash_prob / inv_arbitrary_scale; const DenominatorGraphTransition *trans_iter = transitions + forward_transitions[h].first, @@ -376,12 +364,9 @@ void DenominatorComputation::BetaGeneralFrameDebug(int32 t) { CuSubMatrix this_log_prob_deriv( nnet_output_deriv_transposed_, 0, num_pdfs, t_wrapped * num_sequences_, num_sequences_); - const BaseFloat occupation_inv_arbitrary_factor = - 1 << kOccupationRescalingPowerOfTwo; BaseFloat alpha_beta_product = VecVec(this_alpha_dash, this_beta_dash), - this_log_prob_deriv_sum = this_log_prob_deriv.Sum() * - occupation_inv_arbitrary_factor; + this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); if (!ApproxEqual(alpha_beta_product, num_sequences_)) { KALDI_WARN << "On time " << t << ", alpha-beta product " << alpha_beta_product << " != " << num_sequences_ diff --git a/src/chain/chain-denominator.h b/src/chain/chain-denominator.h index de3e64cc693..b0f616673d6 100644 --- a/src/chain/chain-denominator.h +++ b/src/chain/chain-denominator.h @@ -41,6 +41,153 @@ namespace kaldi { namespace chain { +/* + This extended comment describes how we implement forward-backward without log + and without overflow, and also the leaky-HMM idea. + + We'll start by establishing the notation for conventional forward-backward, + then add the 'arbitrary-scale' concept that prevents overflow, and then + add the 'leaky-hmm' concept. + + All this is done in parallel over multiple sequences, but the computations + are independent over the separate sequences, so we won't introduce any notation + or index for the sequence; we'll just explain it for one sequences. + + Suppose we have I hmm-states, numbered i = 0 ... I-1 (we'll use i and j for + hmm-state indexes). Let foll(i) give a list of arcs leaving state i, and + pred(i) give a list of arcs entering state i, and we'll use notation like: + for (j, p, n) in foll(i): + for iterating over those arcs, where in this case j is the destination-state, + p is the transition-probability of the arc and n is the pdf-id index. + We can then look up the emission probability as x(t, n) for some frame + 0 <= t < T. + + ** Version 1 of the computation (naive version) ** + + * Forward computation (version 1) + + In the forward computation we're computing alpha(i, t) for 0 <= t <= T): + - For the first frame, set alpha(0, i) = init(i), where init(i) is the + initial-probabilitiy from state i. # in our framework these are obtained + # by running the HMM for a while and getting an averaged occupation + # probability, and using this as an initial-prob, since the boundaries of + # chunks don't really correspond to utterance boundaries in general.] + - For t = 1 ... T: + for i = 0 ... I-1: + alpha(t, i) = 0 + for (j, p, n) in pred(i): # note: j is preceding-state. + alpha(t, i) += x(t-1, n) * alpha(t-1, j) * p. + + - total-prob = \sum_i alpha(T, i). 
# note, we take the final-probs of all states + # to be 1.0. + + * Backward computation (version 1) + + And now for the backward computation. Contrary to tradition, we include the + inverse of the total-prob as a factor in the betas. This is both more + convenient (it simplifies the way we obtain posteriors), and makes the + algorithm more generalizable as all the beta quantities can be interpreted as + the partial derivative of the logprob with respect to their corresponding + alpha. + + In forward backward notation, gamma is normally used for state-level + occupation probabilities, but what we care about here is pdf-id-level + occupation probabilities (i.e. the partial derivative of the log-likelihood + w.r.t. the logs of the x(t, n) quantities), so we use gamma for that. + + - for the final frame: + for each i, beta(T, i) = 1 / total-prob. + - for t = T-1 ... 0: + for i = 0 ... I-1: + beta(t, i) = 0 + for (j, p, n) in foll(i): # note: j is following-state. + beta(t, i) += x(t, n) * beta(t+1, j) * p. + gamma(t, n) += alpha(t, i) * x(t, n) * beta(t+1, j) * p. + + ** Version 2 of the computation (renormalized version) ** + + Version 1 of the algorithm is susceptible to numeric underflow and overflow, + due to the limited range of IEEE floating-point exponents. + Define tot-alpha(t) = \sum_i alpha(t, i). Then the renormalized version of + the computation is as above, except whenever the quantity x(t, n) appears, + we replace it with x(t, n) / alpha(t). In the algorithm we refer to + 1.0 / tot-alpha(t) as 'arbitrary_scale', because mathematically we can use any + value here as long as we are consistent and the value only varies with t + and not with n; we'll always get the same posteriors (gamma). + + When the algorithm outputs log(total-prob) as the total log-probability + of the HMM, we have to instead return the expression: + log(total-prob) + \sum_{t=0}^{T-1} tot-alpha(t). + to correct for the scaling of the x values. + + The algorithm is still vulnerable to overflow in the beta computation because + it's possible that the dominant path could have a very tiny alpha. However, + once we introduce the leaky-HMM idea (below), this problem will disappear. + + ** Version 3 of the computation (leaky-HMM version) ** + + The leaky-HMM idea is intended to improve generalization by allowing paths + other than those explicitly allowed by the FST we compiled. Another way to + look at it is as a way of hedging our bets about where we split the utterance, + so it's as we're marginalizing over different splits of the utterance. You + could also think of it as a modification of the FST so that there is an + epsilon transition from each state to a newly added state, with probability + one, and then an epsilon transition from the newly added state to each state + with probability leaky-hmm-prob * init(i) [except we need a mechanism so that + no more than two epsilon transitions can be taken per frame- this would involve + creating two copies of the states] + + Recall that we mentioned that init(i) is the initial-probability of + HMM-state i, but these are obtained in such a way that they can be treated + as priors, or average occupation-probabilities. + + Anyway, the way we formulate leaky-hmm is as follows: + + * Forward computation (version 3) + + Let leaky-hmm-prob be a constant defined by the user, with 0.1 being a typical + value. It defines how much probability we give to the 'leaky' transitions. + + - For frame 0, set alpha(0, i) = init(i). 
+ - For 0 <= t <= T, define tot-alpha(t) = \sum_i alpha(t, i). + - For 0 <= t <= T, define alpha'(t, i) = alpha(t, i) + tot-alpha(t) * leaky-hmm-prob * init(i). + + - For 1 <= t <= T, the computation of alpha(t, i) is as before except we use + the previous frame's alpha' instead of alpha. That is: + alpha(t, i) = 0 + for (j, p, n) in pred(i): # note: j is preceding-state. + alpha(t, i) += alpha'(t-1, j) * p * x(t-1, n) / tot-alpha(t-1) + + - total-prob = \sum_i alpha'(T, i) + + The corrected log-prob that we return from the algorithm will be + (total-prob + \sum_{t=0}^{T-1} tot-alpha(t)). + + * Backward computation (version 3) + + The backward computation is as follows. It is fairly straightforward to + derive if you think of it as an instance of backprop where beta, tot-beta and + beta' are the partial derivatives of the output log-prob w.r.t. the + corresponding alpha, tot-alpha and alpha' quantities. Note, tot-beta is not + really the sum of the betas as its name might suggest, it's just the + derivative w.r.t. tot-alpha. + + - beta'(T, i) = 1 / total-prob. + - for 0 <= t <= T, define tot-beta(t) = leaky-hmm-prob * \sum_i init(i) * beta'(t, i) + - for 0 <= t <= T, define beta(t, i) = beta'(t, i) + tot-beta(t). + - for 0 <= t < T, we compute beta'(t, i) and update gamma(t, n) as follows: + for 0 <= i < I: + beta'(t, i) = 0 + for (j, p, n) in foll(i): # note: j is following-state. + beta'(t, i) += beta(t+1, j) * p * x(t, n) / tot-alpha(t) + gamma(t, n) += alpha'(t, i) * beta(t+1, j) * p * x(t, n) / tot-alpha(t) + + Note: in the code, the tot-alpha and tot-beta quantities go in the same + memory location that the corresponding alpha and beta for state I would go. + + */ + + // This does forward-backward in parallel on a number of sequences, using a // single HMM. class DenominatorComputation { @@ -128,7 +275,8 @@ class DenominatorComputation { // the (temporarily) alpha and (more permanently) alpha-dash probabilities; // dimension is (frames_per_sequence + 1) by (num-hmm-states * num-sequences + - // num_sequences). Note, they are not logs. The last 'num_sequences' columns + // num_sequences). Note, they are not logs. The last 'num_sequences' + // columns, where the alpha for the state indexed 'num_hmm_states' would live, // are for the alpha-sums, which relates to leaky HMM. CuMatrix alpha_; @@ -150,10 +298,10 @@ class DenominatorComputation { CuVector tot_log_prob_; // the log of the total correction term for each sequence, which is the - // product of the alpha_[special hmm state] over all the frames. The - // 'correction terms' are terms that we divide the alphas and betas by in - // order to keep them in a good dynamic range. The product of them - // must be included in the total likelihood. + // product of the alpha-sums [used in the leaky-hmm computation] over all the + // frames. The 'correction terms' are terms that we divide the alphas and + // betas by in order to keep them in a good dynamic range. The product of + // them must be included in the total likelihood. 
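To make the scaled forward recursion and the leaky-HMM modification concrete, here is a minimal toy sketch of the "version 3" forward pass described in the comment above. It is an illustration only, not the Kaldi classes: the state count, transition matrix, pseudo-likelihoods and leaky-hmm-prob are made-up values, and each arc's pdf-id is simplified to be its source state. The correction term is accumulated as the sum over frames of log(tot-alpha(t)), i.e. the log of the product of alpha-sums that log_correction_term_ below stores.

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int I = 2, T = 3;                          // toy sizes
  const double leaky_hmm_prob = 0.1;               // 0.1 is a typical value
  double init[2] = {0.6, 0.4};                     // average occupation probs
  double trans[2][2] = {{0.7, 0.3}, {0.2, 0.8}};   // trans[j][i]: prob of arc j -> i
  double x[3][2] = {{0.9, 0.1}, {0.5, 0.5}, {0.2, 0.8}};  // pseudo-likelihoods x(t, n)

  std::vector<double> alpha(init, init + I), alpha_dash(I), next(I);
  double log_correction = 0.0, tot_alpha = 0.0;
  for (int t = 1; t <= T; t++) {
    tot_alpha = alpha[0] + alpha[1];               // alpha-sum for frame t-1
    for (int i = 0; i < I; i++)                    // alpha'(t-1, i), the leaky version
      alpha_dash[i] = alpha[i] + tot_alpha * leaky_hmm_prob * init[i];
    log_correction += std::log(tot_alpha);         // compensates for arbitrary_scale
    for (int i = 0; i < I; i++) {
      next[i] = 0.0;                               // arbitrary_scale = 1 / tot_alpha
      for (int j = 0; j < I; j++)
        next[i] += alpha_dash[j] * trans[j][i] * x[t - 1][j] / tot_alpha;
    }
    alpha = next;
  }
  tot_alpha = alpha[0] + alpha[1];                 // form alpha'(T, i) for the last frame
  for (int i = 0; i < I; i++)
    alpha_dash[i] = alpha[i] + tot_alpha * leaky_hmm_prob * init[i];
  double total_prob = alpha_dash[0] + alpha_dash[1];  // all final-probs taken as 1.0
  std::printf("corrected log-prob = %g\n", std::log(total_prob) + log_correction);
  return 0;
}

The backward pass in the code below follows the same pattern in reverse, dividing by the stored alpha-sums rather than by the alpha of the 'special' HMM state that this patch removes.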
CuVector log_correction_term_; bool ok_; diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index af7a1a6b176..8ec1dcf322c 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -29,7 +29,6 @@ extern "C" { const Int32Pair *forward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *this_alpha, @@ -42,7 +41,6 @@ extern "C" { const Int32Pair *backward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *prev_alpha, diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index 05127ed4c51..ea10b6680f0 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -40,15 +40,9 @@ __device__ inline void atomic_add_thresholded(Real* address, Real value) { // threshold itself with probability (value / threshold). This preserves // expectations. Note: we assume that value >= 0. - // kThresholdingPowerOfTwo is defined in chain-datastruct.h; think of this as - // defining the real threshold. (larger power -> more exact, smaller power -> - // faster). The occupation factors that we add ('value' in this code) will - // previously have been scaled by by 2^{-kOccupationRescalingPowerOfTwo}, so - // we need to adjust the threshold to compensate for this. - // In the next line we compute 'threshold' in what an odd way to avoid - // overflow; it should be computed as a constant in the compiler. - const Real threshold = (1.0 / (1 << kThresholdingPowerOfTwo)) / - (1 << kOccupationRescalingPowerOfTwo); + // kThresholdingPowerOfTwo is defined in chain-datastruct.h; it defines + // the threshold for randomized posterior pruning. + const Real threshold = 1.0 / (1 << kThresholdingPowerOfTwo); if (value >= threshold) { atomic_add(address, value); } else { @@ -87,7 +81,6 @@ __global__ static void _cuda_chain_hmm_forward(const Int32Pair *backward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *prev_alpha, @@ -142,15 +135,18 @@ static void _cuda_chain_hmm_forward(const Int32Pair *backward_transitions, this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0; } - // Let arbitrary_scale be the inverse of the alpha value for the - // hmm-state indexed special_hmm_state_ on the previous frame (for this - // sequence); we multiply this into all the transition-probabilities - // from the previous frame to this frame, in both the forward and - // backward passes, in order to keep the alphas in a good numeric range. - // This won't affect the posteriors, but when computing the total - // likelihood we'll need to compensate for it later on. + int32_cuda num_hmm_states = gridDim.y; + // Let arbitrary_scale be the inverse of the sum of all alpha values on-- the + // previous frame this sum of all the alpha values is stored in the place that + // we'd store the previous alpha for state-index equal to num_hmm_states + // (i.e. one past the end). We multiply this into all the + // transition-probabilities from the previous frame to this frame, in both the + // forward and backward passes, in order to keep the alphas in a good numeric + // range. 
This won't affect the posteriors, as it's just a constant factor + // for each frame, but when computing the total likelihood we'll need to + // compensate for it later on. BaseFloat arbitrary_scale = - 1.0 / prev_alpha[special_hmm_state * num_sequences + s]; + 1.0 / prev_alpha[num_hmm_states * num_sequences + s]; this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; } @@ -159,7 +155,6 @@ __global__ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *this_alpha, const BaseFloat *next_beta, BaseFloat *this_beta, BaseFloat *log_prob_deriv, @@ -184,32 +179,15 @@ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, if (s >= num_sequences) return; + // below, you can read 'gridDim.y' as 'num_hmm_states'. See where + // arbitrary_scale is defined in the forward computation above, for more + // explanation. BaseFloat this_alpha_prob = this_alpha[h * num_sequences + s], inv_arbitrary_scale = - this_alpha[special_hmm_state * num_sequences + s]; + this_alpha[gridDim.y * num_sequences + s]; double tot_variable_factor = 0.0; - // this should be compiled as a constant. This factor 'occupation_factor' - // here is arbitrarily chosen and will be canceled out by its inverse factor - // in chain-denomnator.cc. It is to avoid infinities appearing in the - // derivatives when the 'special' HMM state gets very unlikely and - // 'this_alpha_prob' gets close to the maximum representable floating point - // value. A check in chain-training.cc that tot_objf is finite would detect - // the case where the alphas are actually infinite and discard the - // derivatives, so we can assume that all the alphas are finite. However, if - // one of the alphas is close to the maximum representable floating point - // value and if inv_arbitrary_scale is less than one, we could (if not for - // this factor of 10^-6) easily get overflow in the next line and produce an - // inf, which would not be detected as the alphas remain finite; this would - // produce an inf in the nnet-output derivatives and propagate back to the - // training. Because 'inv_arbitrary_scale' is in the same range as the exp of - // the nnet outputs, and for a non-diverging chain model these will always be - // fairly close to 1, this small factor (around 10^-6 currently) should be - // sufficient to prevent an inf appearing here. - const BaseFloat occupation_arbitrary_factor = - (1.0 / (1 << kOccupationRescalingPowerOfTwo)); - BaseFloat occupation_factor = (occupation_arbitrary_factor * this_alpha_prob) / - inv_arbitrary_scale; + BaseFloat occupation_factor = this_alpha_prob / inv_arbitrary_scale; const DenominatorGraphTransition *trans_iter = transitions + forward_transitions[h].first, *trans_end = transitions + forward_transitions[h].second; @@ -250,12 +228,7 @@ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, occupation_prob0); } BaseFloat beta = tot_variable_factor / inv_arbitrary_scale; - // If an overflow was generated while computing the beta (which should be - // extremely rare), substitute zero. This will likely lead to denominator - // occupancies which are less than one for this sequence, as the resulting - // betas will be less than they should be. but it's better than generating an - // inf and ruining the whole backprop. - this_beta[h * num_sequences + s] = (beta - beta == 0 ? 
beta : 0.0); + this_beta[h * num_sequences + s] = beta; } @@ -263,28 +236,26 @@ void cuda_chain_hmm_forward(dim3 Gr, dim3 Bl, const Int32Pair *backward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *prev_alpha, BaseFloat *this_alpha) { _cuda_chain_hmm_forward<<>>(backward_transitions, transitions, - num_sequences, special_hmm_state, - probs, prob_stride, prev_alpha, this_alpha); + num_sequences, probs, prob_stride, + prev_alpha, this_alpha); } void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl, const Int32Pair *forward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *this_alpha, const BaseFloat *next_beta, BaseFloat *this_beta, BaseFloat *log_prob_deriv, int32_cuda log_prob_deriv_stride) { _cuda_chain_hmm_backward<<>>(forward_transitions, transitions, - num_sequences, special_hmm_state, - probs, prob_stride, this_alpha, next_beta, + num_sequences, probs, prob_stride, + this_alpha, next_beta, this_beta, log_prob_deriv, log_prob_deriv_stride); } diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 1bf0201fbfa..9c8f3424390 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -29,6 +29,7 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const Supervision &supervision, const CuMatrixBase &nnet_output, + const CuMatrixBase *xent_output, BaseFloat *objf, BaseFloat *l2_term, BaseFloat *weight, @@ -103,13 +104,65 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, *l2_term = 0.0; } else { // compute the l2 penalty term and its derivative - BaseFloat scale = supervision.weight * opts.l2_regularize; - *l2_term = -0.5 * scale * TraceMatMat(nnet_output, nnet_output, kTrans); - if (nnet_output_deriv) - nnet_output_deriv->AddMat(-1.0 * scale, nnet_output); + BaseFloat scale_coeff = supervision.weight * opts.l2_regularize; + // If xent_output provided, l2 penalty is trying to regress the chain output + // to be a linear function of cross-entropy output. + // It minimizes -0.5 * l2_regularize * l2_norm(diag(scale) * x + offset - y)^2, + // where x is cross-entropy output and y is chain output. + if (xent_output) { + //compute offset and scale + // The objecitve is to minimize L w.r.t scale_i, offset_i, + // L = -0.5 * l2_regularize * + // \sum_{j=1}^{m_size}(\sum_i (nnet_output_ji - target_ji)^2), + // where the target_ji = scale_i * xent_output_ji + offset_i. + // + // scale_i = [\sum_j (nnet_output_ji * xent_output_ji) - + // 1/m_size * \sum_j(nnet_output_ji) * \sum_j(xent_output_ji)] / + // [\sum_j(xent_output_ji^2) - 1/m_size * (\sum_j(xent_output_ji))^2] + // offset_i = 1 ./ m_size * \sum_j (nnet_output_ji - scale_i * xent_output_ji) + // where m_size is minibatch_size. 
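The closed-form fit in the comment above is ordinary per-dimension least squares of the chain output on the cross-entropy output. For clarity, here is the same fit written with plain loops for a single column; it is a hypothetical helper for illustration, while the code below does all columns at once with CuVector operations.

#include <cstdio>
#include <vector>

// y = one column of the chain output, x = the matching column of the
// cross-entropy output, over the m_size rows of the minibatch.
void FitScaleOffset(const std::vector<double> &x, const std::vector<double> &y,
                    double *scale, double *offset) {
  int m = x.size();
  double sx = 0.0, sy = 0.0, sxy = 0.0, sxx = 0.0;
  for (int j = 0; j < m; j++) {
    sx += x[j]; sy += y[j]; sxy += x[j] * y[j]; sxx += x[j] * x[j];
  }
  *scale = (sxy - sx * sy / m) / (sxx - sx * sx / m);  // regression slope
  *offset = (sy - *scale * sx) / m;                    // regression intercept
}

int main() {
  std::vector<double> x, y;
  x.push_back(0.0); x.push_back(1.0); x.push_back(2.0);
  y.push_back(1.0); y.push_back(3.0); y.push_back(5.0);
  double scale, offset;
  FitScaleOffset(x, y, &scale, &offset);
  std::printf("scale = %g offset = %g\n", scale, offset);  // prints 2 and 1
  return 0;
}

Treating scale and offset as constants during backprop, the derivative of -0.5 * scale_coeff * ||target - y||^2 with respect to the chain output y is scale_coeff * (target - y); this is why the code below adds scale_coeff * output_diff to nnet_output_deriv and -scale_coeff * diag(scale) * output_diff to xent_output_deriv.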
+ CuVector scale(xent_output->NumCols()), + offset(xent_output->NumCols()), + nnet_col_sum(nnet_output.NumCols()), + xent_col_sum(xent_output->NumCols()), + scale_denom(nnet_output.NumCols()); + + nnet_col_sum.AddRowSumMat(1.0, nnet_output, 0.0); + xent_col_sum.AddRowSumMat(1.0, *xent_output, 0.0); + scale.AddDiagMatMat(1.0, *xent_output, kTrans, nnet_output, kNoTrans, 0.0); + scale.AddVecVec(-1.0 / nnet_output.NumRows(), nnet_col_sum, xent_col_sum, 1.0); + scale_denom.AddDiagMat2(1.0, *xent_output, kTrans, 0.0); + scale_denom.AddVecVec(-1.0 / nnet_output.NumRows(), xent_col_sum, xent_col_sum, 1.0); + scale.DivElements(scale_denom); + + offset.AddVec(1.0 / xent_output->NumRows(), nnet_col_sum); + offset.AddVecVec(-1.0 / xent_output->NumRows(), scale, xent_col_sum, 1.0); + + if (rand() % 10 == 1) + KALDI_LOG << "l1_norm(scale) = " << scale.Norm(1.0) + << " l1_norm(offset) = " << offset.Norm(1.0); + + //output_diff = (xent_output * diag(scale) + offset) - nnet_output; + CuMatrix output_diff(xent_output->NumRows(), xent_output->NumCols()); + output_diff.AddMatDiagVec(1.0, *xent_output, kNoTrans, scale, 0.0); + output_diff.AddVecToRows(1.0, offset); + output_diff.AddMat(-1.0, nnet_output); + *l2_term = -0.5 * scale_coeff * TraceMatMat(output_diff, output_diff, kTrans); + + //update the nnet_output and xent_output derivative w.r.t. regularizer term. + if (nnet_output_deriv) + nnet_output_deriv->AddMat(scale_coeff, output_diff); + + if (xent_output_deriv) + xent_output_deriv->AddMatDiagVec(-1.0 * scale_coeff, output_diff, kNoTrans, scale, 1.0); + + } else { + *l2_term = -0.5 * scale_coeff * TraceMatMat(nnet_output, nnet_output, kTrans); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(-1.0 * scale_coeff, nnet_output); + } } } - } // namespace chain } // namespace kaldi diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index e6143d10846..1e2cfe8cf88 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -116,6 +116,7 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const Supervision &supervision, const CuMatrixBase &nnet_output, + const CuMatrixBase *xent_output, BaseFloat *objf, BaseFloat *l2_term, BaseFloat *weight, diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 804bea1a217..b1cab67362f 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -103,6 +103,7 @@ void cudaF_set_bias_params(int Gr, int Bl, float* v, const float* a, float param void cudaF_copy_from_vec_df(int Gr, int Bl, double* v_out, const float* v_in, int dim); void cudaF_copy_from_vec_fd(int Gr, int Bl, float* v_out, const float* v_in, int dim); void cudaF_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim); +void cudaF_vec_div_elements(int Gr, int Bl, float* v, const float* a, int dim); void cudaF_vec_soft_max(int Gr, int Bl, float* v, int dim); void cudaF_vec_min(const float* v, float* value, int dim); void cudaF_vec_max(const float* v, float* value, int dim); @@ -243,6 +244,7 @@ void cudaD_set_bias_params(int Gr, int Bl, double* v, const double* a, double pa void cudaD_copy_from_vec_df(int Gr, int Bl, double* v_out, const double* v_in, int dim); void cudaD_copy_from_vec_fd(int Gr, int Bl, float* v_out, const double* v_in, int dim); void cudaD_vec_mul_elements(int Gr, int Bl, double* v, const double* a, int dim); +void cudaD_vec_div_elements(int Gr, int Bl, double* v, const double* a, int dim); void cudaD_vec_soft_max(int Gr, int Bl, double* v, 
int dim); void cudaD_vec_min(const double* v, double* value, int dim); void cudaD_vec_max(const double* v, double* value, int dim); diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 4e1b69f0cce..b8958616b2b 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -444,6 +444,14 @@ static void _vec_mul_elements(Real* v, const Real* a, int dim) { v[i] = v[i] * a[i]; } +template +__global__ +static void _vec_div_elements(Real* v, const Real* a, int dim) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < dim) + v[i] = v[i] / a[i]; +} + template __global__ @@ -2337,6 +2345,10 @@ void cudaF_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim) { _vec_mul_elements<<>>(v, a, dim); } +void cudaF_vec_div_elements(int Gr, int Bl, float* v, const float* a, int dim) { + _vec_div_elements<<>>(v, a, dim); +} + void cudaF_vec_min(const float* v, float* value, int dim) { _vec_min<<<1,CU1DBLOCK>>>(v, value, dim); } @@ -2797,6 +2809,10 @@ void cudaD_vec_mul_elements(int Gr, int Bl, double* v, const double* a, int dim) _vec_mul_elements<<>>(v, a, dim); } +void cudaD_vec_div_elements(int Gr, int Bl, double* v, const double* a, int dim) { + _vec_div_elements<<>>(v, a, dim); +} + void cudaD_vec_min(const double* v, double* value, int dim) { _vec_min<<<1,CU1DBLOCK>>>(v, value, dim); } diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index fc1fbae54da..dec0797f015 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -189,6 +189,7 @@ inline void cuda_set_bias_params(int Gr, int Bl, float* v, const float* a, float inline void cuda_copy_from_vec_df(int Gr, int Bl, double* v_out, const float* v_in, int dim) { cudaF_copy_from_vec_df(Gr,Bl,v_out,v_in,dim); } inline void cuda_copy_from_vec_fd(int Gr, int Bl, float* v_out, const float* v_in, int dim) { cudaF_copy_from_vec_fd(Gr,Bl,v_out,v_in,dim); } inline void cuda_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim) { cudaF_vec_mul_elements(Gr,Bl,v,a,dim); } +inline void cuda_vec_div_elements(int Gr, int Bl, float* v, const float* a, int dim) { cudaF_vec_div_elements(Gr,Bl,v,a,dim); } inline void cuda_vec_soft_max(int Gr, int Bl, float* v, int dim) { cudaF_vec_soft_max(Gr,Bl,v,dim); } inline void cuda_vec_min(const float* v, float* value, int dim) { cudaF_vec_min(v,value,dim); } inline void cuda_vec_max(const float* v, float* value, int dim) { cudaF_vec_max(v,value,dim); } @@ -373,6 +374,7 @@ inline void cuda_set_bias_params(int Gr, int Bl, double* v, const double* a, dou inline void cuda_copy_from_vec_df(int Gr, int Bl, double* v_out, const double* v_in, int dim) { cudaD_copy_from_vec_df(Gr,Bl,v_out,v_in,dim); } inline void cuda_copy_from_vec_fd(int Gr, int Bl, float* v_out, const double* v_in, int dim) { cudaD_copy_from_vec_fd(Gr,Bl,v_out,v_in,dim); } inline void cuda_vec_mul_elements(int Gr, int Bl, double* v, const double* a, int dim) { cudaD_vec_mul_elements(Gr,Bl,v,a,dim); } +inline void cuda_vec_div_elements(int Gr, int Bl, double* v, const double* a, int dim) { cudaD_vec_div_elements(Gr,Bl,v,a,dim); } inline void cuda_vec_soft_max(int Gr, int Bl, double* v, int dim) { cudaD_vec_soft_max(Gr,Bl,v,dim); } inline void cuda_vec_min(const double* v, double* value, int dim) { cudaD_vec_min(v,value,dim); } inline void cuda_vec_max(const double* v, double* value, int dim) { cudaD_vec_max(v,value,dim); } diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index eb5a268d543..2ea5457fefd 100644 --- 
a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -1166,7 +1166,7 @@ template void CuMatrixBase::AddMatDiagVec( const Real alpha, const CuMatrixBase &M, MatrixTransposeType transM, - CuVectorBase &v, + const CuVectorBase &v, Real beta) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index fd4c642ab7f..13f41e26dec 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -420,7 +420,7 @@ class CuMatrixBase { // The same as adding M but scaling each column M_j by v(j). void AddMatDiagVec(const Real alpha, const CuMatrixBase &M, MatrixTransposeType transM, - CuVectorBase &v, + const CuVectorBase &v, Real beta = 1.0); /// *this = beta * *this + alpha * A .* B (.* element by element multiplication) diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 6deb3809d85..98d22892515 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -708,6 +708,24 @@ void CuVectorBase::MulElements(const CuVectorBase &v) { Vec().MulElements(v.Vec()); } } +template +void CuVectorBase::DivElements(const CuVectorBase &v) { + KALDI_ASSERT(dim_ == v.dim_); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + if (dim_ == 0) return; + Timer tim; + int dimBlock(CU1DBLOCK); + int dimGrid(n_blocks(dim_, CU1DBLOCK)); + cuda_vec_div_elements(dimGrid, dimBlock, data_, v.Data(), dim_); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile("CuVectorBase::DivElements", tim.Elapsed()); + } else +#endif + { + Vec().DivElements(v.Vec()); + } +} template<> template<> diff --git a/src/cudamatrix/cu-vector.h b/src/cudamatrix/cu-vector.h index 54c1ac0ad4f..f8a213e148e 100644 --- a/src/cudamatrix/cu-vector.h +++ b/src/cudamatrix/cu-vector.h @@ -193,6 +193,8 @@ class CuVectorBase { void ReplaceValue(Real orig, Real changed); void MulElements(const CuVectorBase &v); + + void DivElements(const CuVectorBase &v); protected: // The following two functions should only be called if we did not compile diff --git a/src/doc/hmm.dox b/src/doc/hmm.dox index 9935fa52711..938321fd7b2 100644 --- a/src/doc/hmm.dox +++ b/src/doc/hmm.dox @@ -447,9 +447,10 @@ We now explain what these three scales do: when we add the self-loop, let the probability mass given to the self-loop be p and the mass given to the rest be (1-p). We add a self-loop with log-probability self_loop_scale * log(p), and add (self_loop_scale * log(1-p)) to all the other - log transition probabilities - out of that state. In typical topologies, the self-loop scale is the only scale - that matters. + log transition probabilities out of that state. (Note: in the initial stage of + graph creation we create a graph without self-loops, and with the non-self-loop + transition probabilities renormalized to sum to one). In typical topologies, the + self-loop scale is the only scale that matters. 
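As a worked example of the self-loop arithmetic just described, take hypothetical numbers: self_loop_scale = 0.1, a state whose self-loop had probability p = 0.75, and two non-self-loop transitions renormalized to 0.6 and 0.4 in the graph built without self-loops. This is purely illustrative, not code from Kaldi:

#include <cmath>
#include <cstdio>

int main() {
  const double self_loop_scale = 0.1, p = 0.75;
  double other_logprob[2] = {std::log(0.6), std::log(0.4)};  // renormalized, no self-loop
  double self_loop_logprob = self_loop_scale * std::log(p);
  for (int i = 0; i < 2; i++)        // add self_loop_scale * log(1 - p) to the rest
    other_logprob[i] += self_loop_scale * std::log(1.0 - p);
  std::printf("self-loop %.4f, others %.4f %.4f\n",
              self_loop_logprob, other_logprob[0], other_logprob[1]);
  return 0;
}

With these numbers the self-loop gets log-probability roughly -0.029, and each of the other transitions has roughly -0.139 added to its log-probability.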
The reason we feel it might make sense to apply a different probability scale to the self-loops versus the normal transition scale is we think they could be diff --git a/src/lm/Makefile b/src/lm/Makefile index ddda9576557..acf327d994f 100644 --- a/src/lm/Makefile +++ b/src/lm/Makefile @@ -10,10 +10,10 @@ MATHLIB = NONE include ../kaldi.mk -TESTFILES = lm-lib-test +TESTFILES = arpa-file-parser-test lm-lib-test -OBJFILES = const-arpa-lm.o kaldi-lmtable.o kaldi-lm.o kaldi-rnnlm.o \ - mikolov-rnnlm-lib.o +OBJFILES = arpa-file-parser.o const-arpa-lm.o kaldi-lmtable.o kaldi-lm.o \ + kaldi-rnnlm.o mikolov-rnnlm-lib.o TESTOUTPUTS = composed.fst output.fst output1.fst output2.fst diff --git a/src/lm/arpa-file-parser-test.cc b/src/lm/arpa-file-parser-test.cc new file mode 100644 index 00000000000..e37a916d263 --- /dev/null +++ b/src/lm/arpa-file-parser-test.cc @@ -0,0 +1,365 @@ +// lm/arpa-file-parser-test.cc + +// Copyright 2016 Smart Action Company LLC (kkm) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +/** + * @file lm-lib-test.cc + * @brief Unit tests for language model code. + */ + +#include +#include +#include +#include +#include +#include "lm/kaldi-lm.h" + +#include "lm/arpa-file-parser.h" + +namespace kaldi { +namespace { + +const int kMaxOrder = 3; + +struct NGramTestData { + int32 line_number; + float logprob; + int32 words[kMaxOrder]; + float backoff; +}; + +std::ostream& operator<<(std::ostream& os, const NGramTestData& data) { + std::ios::fmtflags saved_state(os.flags()); + os << std::fixed << std::setprecision(6); + + os << data.logprob << ' '; + for (int i = 0; i < kMaxOrder; ++i) os << data.words[i] << ' '; + os << data.backoff << " // Line " << data.line_number; + + os.flags(saved_state); + return os; +} + +// This does not own the array pointer, and uset to simplify passing expected +// result to TestableArpaFileParser::Verify. +template +struct CountedArray { + template + CountedArray(T(&array)[N]) : array(array), count(N) { } + const T* array; + const size_t count; +}; + +template +inline CountedArray MakeCountedArray(T(&array)[N]) { + return CountedArray(array); +} + +class TestableArpaFileParser : public ArpaFileParser { + public: + TestableArpaFileParser(ArpaParseOptions options, fst::SymbolTable* symbols) + : ArpaFileParser(options, symbols), + header_available_(false), + read_complete_(false), + last_order_(0) { } + void Validate(CountedArray counts, CountedArray ngrams); + + private: + // ArpaFileParser overrides. 
+ virtual void HeaderAvailable(); + virtual void ConsumeNGram(const NGram& ngram); + virtual void ReadComplete(); + + bool header_available_; + bool read_complete_; + int32 last_order_; + std::vector ngrams_; +}; + +void TestableArpaFileParser::HeaderAvailable() { + KALDI_ASSERT(!header_available_); + KALDI_ASSERT(!read_complete_); + header_available_ = true; + KALDI_ASSERT(NgramCounts().size() <= kMaxOrder); +} + +void TestableArpaFileParser::ConsumeNGram(const NGram& ngram) { + KALDI_ASSERT(header_available_); + KALDI_ASSERT(!read_complete_); + KALDI_ASSERT(ngram.words.size() <= NgramCounts().size()); + KALDI_ASSERT(ngram.words.size() >= last_order_); + last_order_ = ngram.words.size(); + + NGramTestData entry = { 0 }; + entry.line_number = LineNumber(); + entry.logprob = ngram.logprob; + entry.backoff = ngram.backoff; + std::copy(ngram.words.begin(), ngram.words.end(), entry.words); + ngrams_.push_back(entry); +} + +void TestableArpaFileParser::ReadComplete() { + KALDI_ASSERT(header_available_); + KALDI_ASSERT(!read_complete_); + read_complete_ = true; +} + +// +bool CompareNgrams(const NGramTestData& actual, + const NGramTestData& expected) { + if (actual.line_number != expected.line_number + || !std::equal(actual.words, actual.words + kMaxOrder, + expected.words) + || !ApproxEqual(actual.logprob, expected.logprob) + || !ApproxEqual(actual.backoff, expected.backoff)) { + KALDI_WARN << "Actual n-gram [" << actual + << "] differs from expected [" << expected << "]"; + return false; + } + return true; +} + +void TestableArpaFileParser::Validate( + CountedArray expect_counts, + CountedArray expect_ngrams) { + // This needs better disagnostics probably. + KALDI_ASSERT(NgramCounts().size() == expect_counts.count); + KALDI_ASSERT(std::equal(NgramCounts().begin(), NgramCounts().end(), + expect_counts.array)); + + KALDI_ASSERT(ngrams_.size() == expect_ngrams.count); + // auto mpos = std::mismatch(ngrams_.begin(), ngrams_.end(), + // expect_ngrams.array, CompareNgrams); + // if (mpos.first != ngrams_.end()) + // KALDI_ERR << "Maismatch at index " << mpos.first - ngrams_.begin(); + //TODO:auto above requres C++11, and I cannot spell out the type!!! + KALDI_ASSERT(std::equal(ngrams_.begin(), ngrams_.end(), + expect_ngrams.array, CompareNgrams)); +} + +// Read integer LM (no symbols) with log base conversion. +void ReadIntegerLmLogconvExpectSuccess() { + KALDI_LOG << "ReadIntegerLmLogconvExpectSuccess()"; + + static std::string integer_lm = "\ +\\data\\\n\ +ngram 1=4\n\ +ngram 2=2\n\ +ngram 3=2\n\ +\n\ +\\1-grams:\n\ +-5.234679 4 -3.3\n\ +-3.456783 5\n\ +0.0000000 1 -2.5\n\ +-4.333333 2\n\ +\n\ +\\2-grams:\n\ +-1.45678 4 5 -3.23\n\ +-1.30490 1 4 -4.2\n\ +\n\ +\\3-grams:\n\ +-0.34958 1 4 5\n\ +-0.23940 4 5 2\n\ +\n\ +\\end\\"; + + int32 expect_counts[] = { 4, 2, 2 }; + NGramTestData expect_ngrams[] = { + { 7, -12.05329, { 4, 0, 0 }, -7.598531 }, + { 8, -7.959537, { 5, 0, 0 }, 0.0 }, + { 9, 0.0, { 1, 0, 0 }, -5.756463 }, + { 10, -9.977868, { 2, 0, 0 }, 0.0 }, + + { 13, -3.354360, { 4, 5, 0 }, -7.437350 }, + { 14, -3.004643, { 1, 4, 0 }, -9.670857 }, + + { 17, -0.804938, { 1, 4, 5 }, 0.0 }, + { 18, -0.551239, { 4, 5, 2 }, 0.0 } }; + + ArpaParseOptions options; + options.bos_symbol = 1; + options.eos_symbol = 2; + + TestableArpaFileParser parser(options, NULL); + std::istringstream stm(integer_lm, std::ios_base::in); + parser.Read(stm, false); + parser.Validate(MakeCountedArray(expect_counts), + MakeCountedArray(expect_ngrams)); +} + +// \xCE\xB2 = UTF-8 for Greek beta, to churn some UTF-8 cranks. 
+static std::string symbolic_lm = "\ +\\data\\\n\ +ngram 1=4\n\ +ngram 2=2\n\ +ngram 3=2\n\ +\n\ +\\1-grams:\n\ +-5.2 a -3.3\n\ +-3.4 \xCE\xB2\n\ +0.0 -2.5\n\ +-4.3 \n\ +\n\ +\\2-grams:\n\ +-1.5 a \xCE\xB2 -3.2\n\ +-1.3 a -4.2\n\ +\n\ +\\3-grams:\n\ +-0.3 a \xCE\xB2\n\ +-0.2 a \n\ +\n\ +\\end\\"; + +// Symbol table that is created with predefined test symbols, "a" but no "b". +class TestSymbolTable : public fst::SymbolTable { + public: + TestSymbolTable() { + AddSymbol("", 0); + AddSymbol("", 1); + AddSymbol("", 2); + AddSymbol("", 3); + AddSymbol("a", 4); + } +}; + +// Full expected result shared between ReadSymbolicLmNoOovImpl and +// ReadSymbolicLmWithOovAddToSymbols(). +NGramTestData expect_symbolic_full[] = { + { 7, -5.2, { 4, 0, 0 }, -3.3 }, + { 8, -3.4, { 5, 0, 0 }, 0.0 }, + { 9, 0.0, { 1, 0, 0 }, -2.5 }, + { 10, -4.3, { 2, 0, 0 }, 0.0 }, + + { 13, -1.5, { 4, 5, 0 }, -3.2 }, + { 14, -1.3, { 1, 4, 0 }, -4.2 }, + + { 17, -0.3, { 1, 4, 5 }, 0.0 }, + { 18, -0.2, { 1, 4, 2 }, 0.0 } }; + +// This is run with all possible oov setting and yields same result. +void ReadSymbolicLmNoOovImpl(ArpaParseOptions::OovHandling oov) { + int32 expect_counts[] = { 4, 2, 2 }; + TestSymbolTable symbols; + symbols.AddSymbol("\xCE\xB2", 5); + + ArpaParseOptions options; + options.bos_symbol = 1; + options.eos_symbol = 2; + options.unk_symbol = 3; + options.use_log10 = true; + options.oov_handling = oov; + TestableArpaFileParser parser(options, &symbols); + std::istringstream stm(symbolic_lm, std::ios_base::in); + parser.Read(stm, false); + parser.Validate(MakeCountedArray(expect_counts), + MakeCountedArray(expect_symbolic_full)); + KALDI_ASSERT(symbols.NumSymbols() == 6); +} + +void ReadSymbolicLmNoOovTests() { + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kRaiseError)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kRaiseError); + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kAddToSymbols)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kAddToSymbols); + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kReplaceWithUnk)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kReplaceWithUnk); + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kSkipNGram)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kSkipNGram); +} + +// This is run with all possible oov setting and yields same result. 
+void ReadSymbolicLmWithOovImpl( + ArpaParseOptions::OovHandling oov, + CountedArray expect_ngrams, + fst::SymbolTable* symbols) { + int32 expect_counts[] = { 4, 2, 2 }; + ArpaParseOptions options; + options.bos_symbol = 1; + options.eos_symbol = 2; + options.unk_symbol = 3; + options.use_log10 = true; + options.oov_handling = oov; + TestableArpaFileParser parser(options, symbols); + std::istringstream stm(symbolic_lm, std::ios_base::in); + parser.Read(stm, false); + parser.Validate(MakeCountedArray(expect_counts), expect_ngrams); +} + +void ReadSymbolicLmWithOovAddToSymbols() { + TestSymbolTable symbols; + ReadSymbolicLmWithOovImpl(ArpaParseOptions::kAddToSymbols, + MakeCountedArray(expect_symbolic_full), + &symbols); + KALDI_ASSERT(symbols.NumSymbols() == 6); + KALDI_ASSERT(symbols.Find("\xCE\xB2") == 5); +} + +void ReadSymbolicLmWithOovReplaceWithUnk() { + NGramTestData expect_symbolic_unk_b[] = { + { 7, -5.2, { 4, 0, 0 }, -3.3 }, + { 8, -3.4, { 3, 0, 0 }, 0.0 }, + { 9, 0.0, { 1, 0, 0 }, -2.5 }, + { 10, -4.3, { 2, 0, 0 }, 0.0 }, + + { 13, -1.5, { 4, 3, 0 }, -3.2 }, + { 14, -1.3, { 1, 4, 0 }, -4.2 }, + + { 17, -0.3, { 1, 4, 3 }, 0.0 }, + { 18, -0.2, { 1, 4, 2 }, 0.0 } }; + + TestSymbolTable symbols; + ReadSymbolicLmWithOovImpl(ArpaParseOptions::kReplaceWithUnk, + MakeCountedArray(expect_symbolic_unk_b), + &symbols); + KALDI_ASSERT(symbols.NumSymbols() == 5); +} + +void ReadSymbolicLmWithOovSkipNGram() { + NGramTestData expect_symbolic_no_b[] = { + { 7, -5.2, { 4, 0, 0 }, -3.3 }, + { 9, 0.0, { 1, 0, 0 }, -2.5 }, + { 10, -4.3, { 2, 0, 0 }, 0.0 }, + + { 14, -1.3, { 1, 4, 0 }, -4.2 }, + + { 18, -0.2, { 1, 4, 2 }, 0.0 } }; + + TestSymbolTable symbols; + ReadSymbolicLmWithOovImpl(ArpaParseOptions::kSkipNGram, + MakeCountedArray(expect_symbolic_no_b), + &symbols); + KALDI_ASSERT(symbols.NumSymbols() == 5); +} + +void ReadSymbolicLmWithOovTests() { + KALDI_LOG << "ReadSymbolicLmWithOovAddToSymbols()"; + ReadSymbolicLmWithOovAddToSymbols(); + KALDI_LOG << "ReadSymbolicLmWithOovReplaceWithUnk()"; + ReadSymbolicLmWithOovReplaceWithUnk(); + KALDI_LOG << "ReadSymbolicLmWithOovSkipNGram()"; + ReadSymbolicLmWithOovSkipNGram(); +} + +} // namespace +} // namespace kaldi + +int main(int argc, char *argv[]) { + kaldi::ReadIntegerLmLogconvExpectSuccess(); + kaldi::ReadSymbolicLmNoOovTests(); + kaldi::ReadSymbolicLmWithOovTests(); +} diff --git a/src/lm/arpa-file-parser.cc b/src/lm/arpa-file-parser.cc new file mode 100644 index 00000000000..2d8f9f18638 --- /dev/null +++ b/src/lm/arpa-file-parser.cc @@ -0,0 +1,236 @@ +// lm/arpa-file-parser.cc + +// Copyright 2014 Guoguo Chen +// Copyright 2016 Smart Action Company LLC (kkm) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
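Before the parser implementation, a brief note on how the new interface is meant to be used: client code derives from ArpaFileParser, overrides ConsumeNGram() (and optionally the header and completion hooks), and then calls Read(). The sketch below is a hypothetical minimal subclass for illustration only; the real usage examples in this patch are TestableArpaFileParser in the test above and ConstArpaLmBuilder further down.

#include <fstream>

#include "lm/arpa-file-parser.h"

// Hypothetical subclass that just counts the n-grams it is fed.
class NGramCounter : public kaldi::ArpaFileParser {
 public:
  explicit NGramCounter(kaldi::ArpaParseOptions options)
      : ArpaFileParser(options, NULL), num_ngrams_(0) { }
  kaldi::int32 NumNGrams() const { return num_ngrams_; }
 protected:
  virtual void ConsumeNGram(const kaldi::NGram&) { num_ngrams_++; }
 private:
  kaldi::int32 num_ngrams_;
};

int main() {
  kaldi::ArpaParseOptions options;
  options.bos_symbol = 1;            // integer-symbol LM, so no symbol table needed
  options.eos_symbol = 2;
  NGramCounter counter(options);
  std::ifstream is("lm.arpa");       // hypothetical input file
  counter.Read(is, false);
  return counter.NumNGrams() > 0 ? 0 : 1;
}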
+ +#include + +#include + +#include "base/kaldi-error.h" +#include "base/kaldi-math.h" +#include "lm/arpa-file-parser.h" +#include "util/text-utils.h" + +namespace kaldi { + +ArpaFileParser::ArpaFileParser(ArpaParseOptions options, fst::SymbolTable* symbols) + : options_(options), symbols_(symbols), line_number_(0) { +} + +ArpaFileParser::~ArpaFileParser() { +} + +void ArpaFileParser::Read(std::istream &is, bool binary) { + if (binary) { + KALDI_ERR << "binary-mode reading is not implemented for ArpaFileParser"; + } + + // Argument sanity checks. + if (options_.bos_symbol <= 0 || options_.eos_symbol <= 0 || + options_.bos_symbol == options_.eos_symbol) + KALDI_ERR << "BOS and EOS symbols are required, must not be epsilons, and " + << "differ from each other. Given:" + << " BOS=" << options_.bos_symbol + << " EOS=" << options_.eos_symbol; + if (symbols_ != NULL && + options_.oov_handling == ArpaParseOptions::kReplaceWithUnk && + (options_.unk_symbol <= 0 || + options_.unk_symbol == options_.bos_symbol || + options_.unk_symbol == options_.eos_symbol)) + KALDI_ERR << "When symbol table is given and OOV mode is kReplaceWithUnk, " + << "UNK symbol is required, must not be epsilon, and " + << "differ from both BOS and EOS symbols. Given:" + << " UNK=" << options_.unk_symbol + << " BOS=" << options_.bos_symbol + << " EOS=" << options_.eos_symbol; + if (symbols_ != NULL && symbols_->Find(options_.bos_symbol).empty()) + KALDI_ERR << "BOS symbol must exist in symbol table"; + if (symbols_ != NULL && symbols_->Find(options_.eos_symbol).empty()) + KALDI_ERR << "EOS symbol must exist in symbol table"; + if (symbols_ != NULL && options_.unk_symbol > 0 && + symbols_->Find(options_.unk_symbol).empty()) + KALDI_ERR << "UNK symbol must exist in symbol table"; + + ngram_counts_.clear(); + line_number_ = 0; + +#define PARSE_ERR (KALDI_ERR << "in line " << line_number_ << ": ") + + // Give derived class an opportunity to prepare its state. + ReadStarted(); + + std::string line; + + // Processes "\data\" section. + bool keyword_found = false; + while (++line_number_, getline(is, line) && !is.eof()) { + if (line.empty()) continue; + + // The section keywords starts with backslash. We terminate the while loop + // if a new section is found. + if (line[0] == '\\') { + if (!keyword_found && line == "\\data\\") { + KALDI_LOG << "Reading \\data\\ section."; + keyword_found = true; + continue; + } + break; + } + + if (!keyword_found) continue; + + // Enters "\data\" section, and looks for patterns like "ngram 1=1000", + // which means there are 1000 unigrams. + std::size_t equal_symbol_pos = line.find("="); + if (equal_symbol_pos != std::string::npos) + line.replace(equal_symbol_pos, 1, " = "); // Inserts spaces around "=" + std::vector col; + SplitStringToVector(line, " \t", true, &col); + if (col.size() == 4 && col[0] == "ngram" && col[2] == "=") { + int32 order, ngram_count = 0; + if (!ConvertStringToInteger(col[1], &order) || + !ConvertStringToInteger(col[3], &ngram_count)) { + PARSE_ERR << "Cannot parse ngram count '" << line << "'."; + } + if (ngram_counts_.size() <= order) { + ngram_counts_.resize(order); + } + ngram_counts_[order - 1] = ngram_count; + } else { + KALDI_WARN << "Uninterpretable line in \\data\\ section: " << line; + } + } + + if (ngram_counts_.size() == 0) + PARSE_ERR << "\\data\\ section missing or empty."; + + // Signal that grammar order and n-gram counts are known. + HeaderAvailable(); + + NGram ngram; + ngram.words.reserve(ngram_counts_.size()); + + // Processes "\N-grams:" section. 
+ for (int32 cur_order = 1; cur_order <= ngram_counts_.size(); ++cur_order) { + // Skips n-grams with zero count. + if (ngram_counts_[cur_order - 1] == 0) { + KALDI_WARN << "Zero ngram count in ngram order " << cur_order + << "(look for 'ngram " << cur_order << "=0' in the \\data\\ " + << " section). There is possibly a problem with the file."; + continue; + } + + // Must be looking at a \k-grams: directive at this point. + std::ostringstream keyword; + keyword << "\\" << cur_order << "-grams:"; + if (line != keyword.str()) { + PARSE_ERR << "Invalid directive '" << line << "', " + << "expecting '" << keyword.str() << "'."; + } + KALDI_LOG << "Reading " << line << " section."; + + int32 ngram_count = 0; + while (++line_number_, getline(is, line) && !is.eof()) { + if (line.empty()) continue; + if (line[0] == '\\') break; + + std::vector col; + SplitStringToVector(line, " \t", true, &col); + + if (col.size() < 1 + cur_order || + col.size() > 2 + cur_order || + (cur_order == ngram_counts_.size() && col.size() != 1 + cur_order)) { + PARSE_ERR << "Invalid n-gram line '" << line << "'"; + } + ++ngram_count; + + // Parse out n-gram logprob and, if present, backoff weight. + if (!ConvertStringToReal(col[0], &ngram.logprob)) { + PARSE_ERR << "Invalid n-gram logprob '" << col[0] << "'."; + } + ngram.backoff = 0.0; + if (col.size() > cur_order + 1) { + if (!ConvertStringToReal(col[cur_order + 1], &ngram.backoff)) + PARSE_ERR << "Invalid backoff weight '" << col[cur_order + 1] << "'."; + } + // Convert to natural log unless the option is set not to. + if (!options_.use_log10) { + ngram.logprob *= M_LN10; + ngram.backoff *= M_LN10; + } + + ngram.words.resize(cur_order); + bool skip_ngram = false; + for (int32 index = 0; !skip_ngram && index < cur_order; ++index) { + int32 word; + if (symbols_) { + // Symbol table provided, so symbol labels are expected. + if (options_.oov_handling == ArpaParseOptions::kAddToSymbols) { + word = symbols_->AddSymbol(col[1 + index]); + } else { + word = symbols_->Find(col[1 + index]); + if (word == fst::SymbolTable::kNoSymbol) { + switch(options_.oov_handling) { + case ArpaParseOptions::kReplaceWithUnk: + word = options_.unk_symbol; + break; + case ArpaParseOptions::kSkipNGram: + skip_ngram = true; + break; + default: + PARSE_ERR << "Word '" << col[1 + index] + << "' not in symbol table."; + } + } + } + } else { + // Symbols not provided, LM file should contain integers. + if (!ConvertStringToInteger(col[1 + index], &word) || word < 0) { + PARSE_ERR << "invalid symbol '" << col[1 + index] << "'"; + } + } + // Whichever way we got it, an epsilon is invalid. 
+        if (word == 0) {
+          PARSE_ERR << "Epsilon symbol '" << col[1 + index]
+                    << "' is illegal in ARPA LM.";
+        }
+        ngram.words[index] = word;
+      }
+      if (!skip_ngram) {
+        ConsumeNGram(ngram);
+      }
+    }
+    if (ngram_count > ngram_counts_[cur_order - 1]) {
+      PARSE_ERR << "Header said there would be " << ngram_counts_[cur_order - 1]
+                << " n-grams of order " << cur_order << ", but we saw "
+                << ngram_count;
+    }
+  }
+
+  if (line != "\\end\\") {
+    PARSE_ERR << "Invalid or unexpected directive line '" << line << "', "
+              << "expected \\end\\.";
+  }
+
+  ReadComplete();
+
+#undef PARSE_ERR
+}
+
+}  // namespace kaldi
diff --git a/src/lm/arpa-file-parser.h b/src/lm/arpa-file-parser.h
new file mode 100644
index 00000000000..0011fb4ee21
--- /dev/null
+++ b/src/lm/arpa-file-parser.h
@@ -0,0 +1,125 @@
+// lm/arpa-file-parser.h
+
+// Copyright 2014  Guoguo Chen
+// Copyright 2016  Smart Action Company LLC (kkm)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_LM_ARPA_FILE_PARSER_H_
+#define KALDI_LM_ARPA_FILE_PARSER_H_
+
+#include
+#include
+
+#include
+
+#include "base/kaldi-types.h"
+
+namespace kaldi {
+
+/**
+  Options that control ArpaFileParser.
+*/
+struct ArpaParseOptions {
+  enum OovHandling {
+    kRaiseError,      ///< Abort on OOV words.
+    kAddToSymbols,    ///< Add novel words to the symbol table.
+    kReplaceWithUnk,  ///< Replace OOV words with <unk>.
+    kSkipNGram        ///< Skip n-gram with OOV word and continue.
+  };
+
+  ArpaParseOptions()
+      : bos_symbol(-1), eos_symbol(-1), unk_symbol(-1),
+        oov_handling(kRaiseError), use_log10(false) { }
+
+  int32 bos_symbol;  ///< Symbol for <s>. Required, non-epsilon.
+  int32 eos_symbol;  ///< Symbol for </s>. Required, non-epsilon.
+  int32 unk_symbol;  ///< Symbol for <unk>. Required for kReplaceWithUnk.
+  OovHandling oov_handling;  ///< How to handle OOV words in the file.
+  bool use_log10;  ///< Use log10 for prob and backoff weight, not ln.
+};
+
+/**
+  A parsed n-gram from ARPA LM file.
+*/
+struct NGram {
+  NGram() : logprob(0.0), backoff(0.0) { }
+  std::vector<int32> words;  ///< Symbols in left-to-right order.
+  float logprob;             ///< Log-prob of the n-gram.
+  float backoff;             ///< Log-backoff weight of the n-gram.
+};
+
+/**
+   ArpaFileParser is an abstract base class for ARPA LM file conversion.
+
+   See ConstArpaLmBuilder for a usage example.
+*/
+class ArpaFileParser {
+ public:
+  /// Constructs the parser with the given options and optional symbol table.
+  /// If symbol table is provided, then the file should contain text n-grams,
+  /// and the words are mapped to symbols through it.  bos_symbol and
+  /// eos_symbol in the options structure must be valid symbols in the table,
+  /// and so must be unk_symbol if provided.  The table is not owned by the
+  /// parser, but may be augmented, if oov_handling is set to kAddToSymbols.
+  /// If symbol table is a null pointer, the file should contain integer
+  /// symbol values, and oov_handling has no effect.
bos_symbol and eos_symbol + /// must be valid symbols still. + ArpaFileParser(ArpaParseOptions options, fst::SymbolTable* symbols); + virtual ~ArpaFileParser(); + + /// Read ARPA LM file through Kaldi I/O functions. Only text mode is + /// supported. + void Read(std::istream &is, bool binary); + + const ArpaParseOptions& Options() const { return options_; } + + protected: + /// Override called before reading starts. This is the point to prepare + /// any state in the derived class. + virtual void ReadStarted() { } + + /// Override function called to signal that ARPA header with the expected + /// number of n-grams has been read, and ngram_counts() is now valid. + virtual void HeaderAvailable() { } + + /// Pure override that must be implemented to process current n-gram. The + /// n-grams are sent in the file order, which guarantees that all + /// (k-1)-grams are processed before the first k-gram is. + virtual void ConsumeNGram(const NGram&) = 0; + + /// Override function called after the last n-gram has been consumed. + virtual void ReadComplete() { } + + /// Read-only access to symbol table. + const fst::SymbolTable* Symbols() const { return symbols_; } + + /// Inside ConsumeNGram(), provides the current line number. + int32 LineNumber() const { return line_number_; } + + /// N-gram counts. Valid in and after a call to HeaderAvailable(). + const std::vector& NgramCounts() const { return ngram_counts_; } + + private: + ArpaParseOptions options_; + fst::SymbolTable* symbols_; // Not owned. + int32 line_number_; + std::vector ngram_counts_; +}; + +} // namespace kaldi + +#endif // KALDI_LM_ARPA_FILE_PARSER_H_ diff --git a/src/lm/const-arpa-lm.cc b/src/lm/const-arpa-lm.cc index 7f63dce886e..5043933d7f0 100644 --- a/src/lm/const-arpa-lm.cc +++ b/src/lm/const-arpa-lm.cc @@ -22,13 +22,14 @@ #include #include +#include "base/kaldi-math.h" +#include "lm/arpa-file-parser.h" #include "lm/const-arpa-lm.h" #include "util/stl-utils.h" #include "util/text-utils.h" -#include "base/kaldi-math.h" -namespace kaldi { +namespace kaldi { // Auxiliary struct for converting ConstArpaLm format langugae model to Arpa // format. @@ -173,13 +174,10 @@ class LmState { // Class to build ConstArpaLm from Arpa format language model. It relies on the // auxiliary class LmState above. -class ConstArpaLmBuilder { +class ConstArpaLmBuilder : public ArpaFileParser { public: - ConstArpaLmBuilder( - const bool natural_base, const int32 bos_symbol, - const int32 eos_symbol, const int32 unk_symbol) : - natural_base_(natural_base), bos_symbol_(bos_symbol), - eos_symbol_(eos_symbol), unk_symbol_(unk_symbol) { + ConstArpaLmBuilder(ArpaParseOptions options) + : ArpaFileParser(options, NULL) { ngram_order_ = 0; num_words_ = 0; overflow_buffer_size_ = 0; @@ -204,21 +202,21 @@ class ConstArpaLmBuilder { } } - // Reads in the Arpa format language model, parses it and creates LmStates. - void Read(std::istream &is, bool binary); - // Writes ConstArpaLm. void Write(std::ostream &os, bool binary) const; - // Builds ConstArpaLm. - void Build(); - void SetMaxAddressOffset(const int32 max_address_offset) { KALDI_WARN << "You are changing ; the default should " << "not be changed unless you are in testing mode."; max_address_offset_ = max_address_offset; } + protected: + // ArpaFileParser overrides. 
+ virtual void HeaderAvailable(); + virtual void ConsumeNGram(const NGram& ngram); + virtual void ReadComplete(); + private: struct WordsAndLmStatePairLessThan { bool operator()( @@ -229,10 +227,6 @@ class ConstArpaLmBuilder { }; private: - // If true, use natural base e for log-prob, otherwise use base 10. The - // default base in Arpa format language model is base 10. - bool natural_base_; - // Indicating if ConstArpaLm has been built or not. bool is_built_; @@ -240,16 +234,6 @@ class ConstArpaLmBuilder { // The default value is 30-bits and should not be changed except for testing. int32 max_address_offset_; - // Integer corresponds to . - int32 bos_symbol_; - - // Integer corresponds to . - int32 eos_symbol_; - - // Integer corresponds to unknown-word. -1 if no unknown-word symbol is - // provided. - int32 unk_symbol_; - // N-gram order of language model. This can be figured out from "/data/" // section in Arpa format language model. int32 ngram_order_; @@ -280,201 +264,58 @@ class ConstArpaLmBuilder { LmState*, VectorHasher > seq_to_state_; }; -// Reads in the Arpa format language model, parses it and puts the word sequence -// into the corresponding LmState in . -void ConstArpaLmBuilder::Read(std::istream &is, bool binary) { - if (binary) { - KALDI_ERR << "binary-mode reading is not implemented for " - << "ConstArpaLmBuilder."; - } - - std::string line; - - // Number of n-grams from "\data\" section. Those numbers should match the - // actual number of n-grams from "\N-grams:" sections. - // Note that when we convert the words in the Arpa format language model into - // integers, we remove lines with OOV words. We also modify the n-gram counts - // in "\data\" correspondingly. - std::vector num_ngrams; - - // Processes "\data\" section. - bool keyword_found = false; - while (getline(is, line) && !is.eof()) { - // The section keywords starts with backslash. We terminate the while loop - // if a new section is found. - if (!line.empty() && line[0] == '\\') { - if (line.find("-grams:") != std::string::npos) break; - if (line.find("\\end\\") != std::string::npos) break; - } - - std::size_t equal_symbol_pos = line.find("="); - if (equal_symbol_pos != std::string::npos) - line.replace(equal_symbol_pos, 1, " = "); // Inserts spaces around "=" - std::vector col; - SplitStringToVector(line, " \t", true, &col); - - // Looks for keyword "\data\". - if (!keyword_found && col.size() == 1 && col[0] == "\\data\\") { - KALDI_LOG << "Reading \"\\data\\\" section."; - keyword_found = true; - continue; - } +void ConstArpaLmBuilder::HeaderAvailable() { + ngram_order_ = NgramCounts().size(); +} - // Enters "\data\" section, and looks for patterns like"ngram 1=1000", which - // means there are 1000 unigrams. 
- if (keyword_found && col.size() == 4 && col[0] == "ngram") { - if (col[2] == "=") { - int32 order, ngram_count; - if (!ConvertStringToInteger(col[1], &order)) { - KALDI_ERR << "bad line: " << line << "; fail to convert " - << col[1] << " to integer."; - } - if (!ConvertStringToInteger(col[3], &ngram_count)) { - KALDI_ERR << "bad line: " << line << "; fail to convert " - << col[3] << " to integer."; - } - if (num_ngrams.size() <= order) { - num_ngrams.resize(order + 1); - } - num_ngrams[order] = ngram_count; - } else { - KALDI_WARN << "Uninterpretable line \"\\data\\\" section: " << line; - } - } else if (keyword_found) { - KALDI_WARN << "Uninterpretable line \"\\data\\\" section: " << line; - } +void ConstArpaLmBuilder::ConsumeNGram(const NGram& ngram) { + int32 cur_order = ngram.words.size(); + // If is larger than 1, then we do not create LmState for + // the final order entry. We only keep the log probability for it. + LmState *lm_state = NULL; + if (cur_order != ngram_order_ || ngram_order_ == 1) { + lm_state = new LmState(cur_order == 1, + cur_order == ngram_order_ - 1, + ngram.logprob, ngram.backoff); + + KALDI_ASSERT(seq_to_state_.find(ngram.words) == seq_to_state_.end()); + seq_to_state_[ngram.words] = lm_state; } - if (num_ngrams.size() == 0) - KALDI_ERR << "Fail to read \"\\data\\\" section."; - ngram_order_ = num_ngrams.size() - 1; - - // Processes "\N-grams:" section. - int32 max_word_id = 0; - for (int32 cur_order = 1; cur_order < num_ngrams.size(); ++cur_order) { - // Skips n-grams with zero count. - if (num_ngrams[cur_order] == 0) continue; - - keyword_found = false; - int32 ngram_count = 0; - std::ostringstream keyword; - keyword << "\\" << cur_order << "-grams:"; - // We use "do ... while" loop since one line has already been read. - do { - // The section keywords starts with backslash. We terminate the while loop - // if a new section is found. - if (!line.empty() && line[0] == '\\') { - if (line.find("-grams:") != std::string::npos && keyword_found) break; - if (line.find("\\end\\") != std::string::npos) break; - } - std::vector col; - SplitStringToVector(line, " \t", true, &col); - - // Looks for keyword "\N-gram:" if the keyword has not been located. - if (!keyword_found && col.size() == 1 && col[0] == keyword.str()) { - KALDI_LOG << "Reading \"" << keyword.str() << "\" section."; - ngram_count = 0; - keyword_found = true; - continue; - } - - // Enters "\N-grams:" section if the keyword has been located. - if (keyword_found && col.size() > 0) { - KALDI_ASSERT(col.size() >= 1 + cur_order); - KALDI_ASSERT(col.size() <= 2 + cur_order); // backoff_logprob can be 0. - if (cur_order == ngram_order_ && col.size() == 2 + cur_order) { - KALDI_ERR << "Backoff probability detected for final-order entry \"" - << line << "\"."; - } - ngram_count++; - - // If backoff_logprob is 0, it will not appear in Arpa format language - // model. We put it back so the processing afterwards will be easier. - if (col.size() == 1 + cur_order) { - col.push_back("0"); - } - - // Creates LmState for the current word sequence. - bool is_unigram = (cur_order == 1) ? true : false; - float logprob; - float backoff_logprob; - KALDI_ASSERT(ConvertStringToReal(col[0], &logprob)); - KALDI_ASSERT(ConvertStringToReal(col[1 + cur_order], &backoff_logprob)); - if (natural_base_) { - logprob *= Log(10.0f); - backoff_logprob *= Log(10.0f); - } - - // If is larger than 1, then we do not create LmState for - // the final order entry. We only keep the log probability for it. 
- LmState *lm_state = NULL; - if (cur_order != ngram_order_ || ngram_order_ == 1) { - lm_state = new LmState(is_unigram, - (cur_order == ngram_order_ - 1), - logprob, backoff_logprob); - } - - // Figures out the sequence of words. - std::vector seq(cur_order, 0); - for (int32 index = 0; index < cur_order; ++index) { - int32 word; - if (!ConvertStringToInteger(col[1 + index], &word)) { - KALDI_ERR << "bad line: " << line << "; fail to convert " - << col[1 + index] << " to integer."; - } - seq[index] = word; - } - - // If is larger than 1, then we do not insert LmState to - // . - if (cur_order != ngram_order_ || ngram_order_ == 1) { - KALDI_ASSERT(lm_state != NULL); - KALDI_ASSERT(seq_to_state_.find(seq) == seq_to_state_.end()); - seq_to_state_[seq] = lm_state; - } - - // If n-gram order is larger than 1, we have to add possible child to - // existing LmStates. We have the following two assumptions: - // 1. N-grams are processed from small order to larger ones, i.e., from - // 1, 2, ... to the highest order. - // 2. If a n-gram exists in the Arpa format language model, then the - // "history" n-gram also exists. For example, if "A B C" is a valid - // n-gram, then "A B" is also a valid n-gram. - if (cur_order > 1) { - std::vector hist(seq.begin(), seq.begin() + cur_order - 1); - int32 word = seq[seq.size() - 1]; - unordered_map, - LmState*, VectorHasher >::iterator hist_iter; - hist_iter = seq_to_state_.find(hist); - KALDI_ASSERT(hist_iter != seq_to_state_.end()); - if (cur_order != ngram_order_ || ngram_order_ == 1) { - KALDI_ASSERT(lm_state != NULL); - KALDI_ASSERT(!hist_iter->second->IsChildFinalOrder()); - hist_iter->second->AddChild(word, lm_state); - } else { - KALDI_ASSERT(lm_state == NULL); - KALDI_ASSERT(hist_iter->second->IsChildFinalOrder()); - hist_iter->second->AddChild(word, logprob); - } - } else { - // Figures out . - KALDI_ASSERT(seq.size() == 1); - if (seq[0] > max_word_id) { - max_word_id = seq[0]; - } - } - } - } while (getline(is, line) && !is.eof()); - if (ngram_count > num_ngrams[cur_order] || - (ngram_count == 0 && num_ngrams[cur_order] != 0)) { - KALDI_ERR << "Header said there would be " << num_ngrams[cur_order] - << " n-grams of order " << cur_order << ", but we saw " - << ngram_count; + // If n-gram order is larger than 1, we have to add possible child to + // existing LmStates. We have the following two assumptions: + // 1. N-grams are processed from small order to larger ones, i.e., from + // 1, 2, ... to the highest order. + // 2. If a n-gram exists in the Arpa format language model, then the + // "history" n-gram also exists. For example, if "A B C" is a valid + // n-gram, then "A B" is also a valid n-gram. + int32 last_word = ngram.words[cur_order - 1]; + if (cur_order > 1) { + std::vector hist(ngram.words.begin(), ngram.words.end() - 1); + unordered_map, + LmState*, VectorHasher >::iterator hist_iter; + hist_iter = seq_to_state_.find(hist); + if (hist_iter == seq_to_state_.end()) { + std::ostringstream ss; + for (int i = 0; i < cur_order; ++i) + ss << (i == 0 ? 
'[' : ' ') << ngram.words[i]; + KALDI_ERR << "In line " << LineNumber() << ": " + << cur_order << "-gram " << ss.str() << "] does not have " + << "a parent model " << cur_order << "-gram."; + } + if (cur_order != ngram_order_ || ngram_order_ == 1) { + KALDI_ASSERT(lm_state != NULL); + KALDI_ASSERT(!hist_iter->second->IsChildFinalOrder()); + hist_iter->second->AddChild(last_word, lm_state); + } else { + KALDI_ASSERT(lm_state == NULL); + KALDI_ASSERT(hist_iter->second->IsChildFinalOrder()); + hist_iter->second->AddChild(last_word, ngram.logprob); } + } else { + // Figures out . + num_words_ = std::max(num_words_, last_word + 1); } - - // is plus 1. - num_words_ = max_word_id + 1; } // ConstArpaLm can be built in the following steps, assuming we have already @@ -503,7 +344,7 @@ void ConstArpaLmBuilder::Read(std::istream &is, bool binary) { // At the same time, we will also create two special buffers: // // -void ConstArpaLmBuilder::Build() { +void ConstArpaLmBuilder::ReadComplete() { // STEP 1: sorting LmStates lexicographically. // Vector for holding the sorted LmStates. std::vector*, LmState*> > sorted_vec; @@ -637,9 +478,10 @@ void ConstArpaLmBuilder::Write(std::ostream &os, bool binary) const { KALDI_ASSERT(is_built_); // Creates ConstArpaLm. - ConstArpaLm const_arpa_lm(bos_symbol_, eos_symbol_, unk_symbol_, ngram_order_, - num_words_, overflow_buffer_size_, lm_states_size_, - unigram_states_, overflow_buffer_, lm_states_); + ConstArpaLm const_arpa_lm( + Options().bos_symbol, Options().eos_symbol, Options().unk_symbol, + ngram_order_, num_words_, overflow_buffer_size_, lm_states_size_, + unigram_states_, overflow_buffer_, lm_states_); const_arpa_lm.Write(os, binary); } @@ -1224,10 +1066,15 @@ bool BuildConstArpaLm(const bool natural_base, const int32 bos_symbol, const int32 eos_symbol, const int32 unk_symbol, const std::string& arpa_rxfilename, const std::string& const_arpa_wxfilename) { - ConstArpaLmBuilder lm_builder(natural_base, bos_symbol, - eos_symbol, unk_symbol); + ArpaParseOptions options; + options.bos_symbol = bos_symbol; + options.eos_symbol = eos_symbol; + options.unk_symbol = unk_symbol; + options.use_log10 = !natural_base; + + ConstArpaLmBuilder lm_builder(options); + KALDI_LOG << "Reading " << arpa_rxfilename; ReadKaldiObject(arpa_rxfilename, &lm_builder); - lm_builder.Build(); WriteKaldiObject(lm_builder, const_arpa_wxfilename, true); return true; } diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc index 76b83ea7114..b9c85e9ae6e 100644 --- a/src/matrix/kaldi-matrix.cc +++ b/src/matrix/kaldi-matrix.cc @@ -448,7 +448,7 @@ template void MatrixBase::AddMatDiagVec( const Real alpha, const MatrixBase &M, MatrixTransposeType transM, - VectorBase &v, + const VectorBase &v, Real beta) { if (beta != 1.0) this->Scale(beta); diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h index c16ffb22135..add6fab93b3 100644 --- a/src/matrix/kaldi-matrix.h +++ b/src/matrix/kaldi-matrix.h @@ -554,7 +554,7 @@ class MatrixBase { /// The same as adding M but scaling each column M_j by v(j). 
void AddMatDiagVec(const Real alpha, const MatrixBase &M, MatrixTransposeType transM, - VectorBase &v, + const VectorBase &v, Real beta = 1.0); /// *this = beta * *this + alpha * A .* B (.* element by element multiplication) diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index 46e2b0c01dc..bcb6c9e581a 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -108,14 +108,19 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, if (nnet_config_.compute_deriv) nnet_output_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); - if (use_xent) + + const CuMatrixBase *xent_output = NULL; + if (use_xent) { xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); - + + // this block computes the cross-entropy objective. + xent_output = &(computer->GetOutput(xent_name)); + } + BaseFloat tot_like, tot_l2_term, tot_weight; - ComputeChainObjfAndDeriv(chain_config_, den_graph_, - sup.supervision, nnet_output, + sup.supervision, nnet_output, xent_output, &tot_like, &tot_l2_term, &tot_weight, (nnet_config_.compute_deriv ? &nnet_output_deriv : NULL), (use_xent ? &xent_deriv : NULL)); @@ -138,13 +143,10 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg, if (use_xent) { ChainObjectiveInfo &xent_totals = objf_info_[xent_name]; - // this block computes the cross-entropy objective. - const CuMatrixBase &xent_output = computer->GetOutput( - xent_name); // at this point, xent_deriv is posteriors derived from the numerator // computation. note, xent_deriv has a factor of '.supervision.weight', // but so does tot_weight. - BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans); + BaseFloat xent_objf = TraceMatMat(*xent_output, xent_deriv, kTrans); xent_totals.tot_weight += tot_weight; xent_totals.tot_like += xent_objf; } diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index dee0eee2a33..1dbade49469 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -113,25 +113,24 @@ void NnetChainTrainer::ProcessOutputs(const NnetChainExample &eg, bool use_xent = (opts_.chain_config.xent_regularize != 0.0); std::string xent_name = sup.name + "-xent"; // typically "output-xent". CuMatrix xent_deriv; - if (use_xent) + const CuMatrixBase *xent_output = NULL; + if (use_xent) { xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); - + // this block computes the cross-entropy objective. + xent_output = &(computer->GetOutput(xent_name)); + } BaseFloat tot_objf, tot_l2_term, tot_weight; - ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_, - sup.supervision, nnet_output, + sup.supervision, nnet_output, xent_output, &tot_objf, &tot_l2_term, &tot_weight, &nnet_output_deriv, (use_xent ? &xent_deriv : NULL)); if (use_xent) { - // this block computes the cross-entropy objective. - const CuMatrixBase &xent_output = computer->GetOutput( - xent_name); // at this point, xent_deriv is posteriors derived from the numerator // computation. 
note, xent_objf has a factor of '.supervision.weight' - BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans); + BaseFloat xent_objf = TraceMatMat((*xent_output), xent_deriv, kTrans); objf_info_[xent_name].UpdateStats(xent_name, opts_.nnet_config.print_interval, num_minibatches_processed_, tot_weight, xent_objf); diff --git a/tools/extras/openfstwin-1.3.4.patch b/tools/extras/openfstwin-1.3.4.patch index e142341f5ba..858a61160fa 100644 --- a/tools/extras/openfstwin-1.3.4.patch +++ b/tools/extras/openfstwin-1.3.4.patch @@ -1,425 +1,425 @@ -diff --git a/src/include/fst/fst.h b/src/include/fst/fst.h -index 5ad3b52..d9c0ca6 100644 ---- a/src/include/fst/fst.h -+++ b/src/include/fst/fst.h -@@ -45,6 +45,12 @@ DECLARE_bool(fst_align); - - namespace fst { - -+ typedef ::int64 int64; -+ typedef ::uint64 uint64; -+ typedef ::int32 int32; -+ typedef ::uint32 uint32; -+ -+ - bool OPENFSTDLL IsFstHeader(istream &, const string &); //ChangedPD - - class FstHeader; -diff --git a/src/include/fst/interval-set.h b/src/include/fst/interval-set.h -index c4362f2..58cad44 100644 ---- a/src/include/fst/interval-set.h -+++ b/src/include/fst/interval-set.h -@@ -37,38 +37,38 @@ template - class IntervalSet { - public: - struct Interval { -- T begin; -- T end; -+ T begin_; -+ T end_; - -- Interval() : begin(-1), end(-1) {} -+ Interval() : begin_(-1), end_(-1) {} - -- Interval(T b, T e) : begin(b), end(e) {} -+ Interval(T b, T e) : begin_(b), end_(e) {} - - bool operator<(const Interval &i) const { -- return begin < i.begin || (begin == i.begin && end > i.end); -+ return begin_ < i.begin_ || (begin_ == i.begin_ && end_ > i.end_); - } - - bool operator==(const Interval &i) const { -- return begin == i.begin && end == i.end; -+ return begin_ == i.begin_ && end_ == i.end_; - } - - bool operator!=(const Interval &i) const { -- return begin != i.begin || end != i.end; -+ return begin_ != i.begin_ || end_ != i.end_; - } - - istream &Read(istream &strm) { - T n; - ReadType(strm, &n); -- begin = n; -+ begin_ = n; - ReadType(strm, &n); -- end = n; -+ end_ = n; - return strm; - } - - ostream &Write(ostream &strm) const { -- T n = begin; -+ T n = begin_; - WriteType(strm, n); -- n = end; -+ n = end_; - WriteType(strm, n); - return strm; - } -@@ -108,7 +108,7 @@ class IntervalSet { - lower_bound(intervals_.begin(), intervals_.end(), interval); - if (lb == intervals_.begin()) - return false; -- return (--lb)->end > value; -+ return (--lb)->end_ > value; - } - - // Requires intervals be normalized. 
-@@ -123,7 +123,7 @@ class IntervalSet { - - bool Singleton() const { - return intervals_.size() == 1 && -- intervals_[0].begin + 1 == intervals_[0].end; -+ intervals_[0].begin_ + 1 == intervals_[0].end_; - } - - -@@ -178,17 +178,17 @@ void IntervalSet::Normalize() { - T size = 0; - for (T i = 0; i < intervals_.size(); ++i) { - Interval &inti = intervals_[i]; -- if (inti.begin == inti.end) -+ if (inti.begin_ == inti.end_) - continue; - for (T j = i + 1; j < intervals_.size(); ++j) { - Interval &intj = intervals_[j]; -- if (intj.begin > inti.end) -+ if (intj.begin_ > inti.end_) - break; -- if (intj.end > inti.end) -- inti.end = intj.end; -+ if (intj.end_ > inti.end_) -+ inti.end_ = intj.end_; - ++i; - } -- count_ += inti.end - inti.begin; -+ count_ += inti.end_ - inti.begin_; - intervals_[size++] = inti; - } - intervals_.resize(size); -@@ -208,17 +208,17 @@ void IntervalSet::Intersect(const IntervalSet &iset, - oset->count_ = 0; - - while (it1 != intervals_.end() && it2 != iintervals->end()) { -- if (it1->end <= it2->begin) { -+ if (it1->end_ <= it2->begin_) { - ++it1; -- } else if (it2->end <= it1->begin) { -+ } else if (it2->end_ <= it1->begin_) { - ++it2; - } else { - Interval interval; -- interval.begin = max(it1->begin, it2->begin); -- interval.end = min(it1->end, it2->end); -+ interval.begin_ = max(it1->begin_, it2->begin_); -+ interval.end_ = min(it1->end_, it2->end_); - ointervals->push_back(interval); -- oset->count_ += interval.end - interval.begin; -- if (it1->end < it2->end) -+ oset->count_ += interval.end_ - interval.begin_; -+ if (it1->end_ < it2->end_) - ++it1; - else - ++it2; -@@ -235,21 +235,21 @@ void IntervalSet::Complement(T maxval, IntervalSet *oset) const { - oset->count_ = 0; - - Interval interval; -- interval.begin = 0; -+ interval.begin_ = 0; - for (typename vector::const_iterator it = intervals_.begin(); - it != intervals_.end(); - ++it) { -- interval.end = min(it->begin, maxval); -- if (interval.begin < interval.end) { -+ interval.end_ = min(it->begin_, maxval); -+ if (interval.begin_ < interval.end_) { - ointervals->push_back(interval); -- oset->count_ += interval.end - interval.begin; -+ oset->count_ += interval.end_ - interval.begin_; - } -- interval.begin = it->end; -+ interval.begin_ = it->end_; - } -- interval.end = maxval; -- if (interval.begin < interval.end) { -+ interval.end_ = maxval; -+ if (interval.begin_ < interval.end_) { - ointervals->push_back(interval); -- oset->count_ += interval.end - interval.begin; -+ oset->count_ += interval.end_ - interval.begin_; - } - } - -@@ -263,7 +263,7 @@ void IntervalSet::Difference(const IntervalSet &iset, - oset->count_ = 0; - } else { - IntervalSet cset; -- iset.Complement(intervals_.back().end, &cset); -+ iset.Complement(intervals_.back().end_, &cset); - Intersect(cset, oset); - } - } -@@ -277,9 +277,9 @@ bool IntervalSet::Overlaps(const IntervalSet &iset) const { - typename vector::const_iterator it2 = intervals->begin(); - - while (it1 != intervals_.end() && it2 != intervals->end()) { -- if (it1->end <= it2->begin) { -+ if (it1->end_ <= it2->begin_) { - ++it1; -- } else if (it2->end <= it1->begin) { -+ } else if (it2->end_ <= it1->begin_) { - ++it2; - } else { - return true; -@@ -300,21 +300,21 @@ bool IntervalSet::StrictlyOverlaps(const IntervalSet &iset) const { - bool overlap = false; // point in both intervals_ and intervals - - while (it1 != intervals_.end() && it2 != intervals->end()) { -- if (it1->end <= it2->begin) { // no overlap - it1 first -+ if (it1->end_ <= it2->begin_) { // no overlap - it1 first 
- only1 = true; - ++it1; -- } else if (it2->end <= it1->begin) { // no overlap - it2 first -+ } else if (it2->end_ <= it1->begin_) { // no overlap - it2 first - only2 = true; - ++it2; -- } else if (it2->begin == it1->begin && it2->end == it1->end) { // equals -+ } else if (it2->begin_ == it1->begin_ && it2->end_ == it1->end_) { // equals - overlap = true; - ++it1; - ++it2; -- } else if (it2->begin <= it1->begin && it2->end >= it1->end) { // 1 c 2 -+ } else if (it2->begin_ <= it1->begin_ && it2->end_ >= it1->end_) { // 1 c 2 - only2 = true; - overlap = true; - ++it1; -- } else if (it1->begin <= it2->begin && it1->end >= it2->end) { // 2 c 1 -+ } else if (it1->begin_ <= it2->begin_ && it1->end_ >= it2->end_) { // 2 c 1 - only1 = true; - overlap = true; - ++it2; -@@ -346,11 +346,11 @@ bool IntervalSet::Contains(const IntervalSet &iset) const { - typename vector::const_iterator it2 = intervals->begin(); - - while (it1 != intervals_.end() && it2 != intervals->end()) { -- if (it1->end <= it2->begin) { // no overlap - it1 first -+ if (it1->end_ <= it2->begin_) { // no overlap - it1 first - ++it1; -- } else if (it2->begin < it1->begin || it2->end > it1->end) { // no C -+ } else if (it2->begin_ < it1->begin_ || it2->end_ > it1->end_) { // no C - return false; -- } else if (it2->end == it1->end) { -+ } else if (it2->end_ == it1->end_) { - ++it1; - ++it2; - } else { -@@ -370,7 +370,7 @@ ostream &operator<<(ostream &strm, const IntervalSet &s) { - ++it) { - if (it != intervals->begin()) - strm << ","; -- strm << "[" << it->begin << "," << it->end << ")"; -+ strm << "[" << it->begin_ << "," << it->end_ << ")"; - } - strm << "}"; - return strm; -diff --git a/src/include/fst/label-reachable.h b/src/include/fst/label-reachable.h -index a7c3360..491ef7d 100644 ---- a/src/include/fst/label-reachable.h -+++ b/src/include/fst/label-reachable.h -@@ -359,9 +359,9 @@ class LabelReachable { - iiter = intervals->begin(); - iiter != intervals->end(); ++iiter) { - begin_low = LowerBound(aiter, end_low, aiter_end, -- aiter_input, iiter->begin); -+ aiter_input, iiter->begin_); - end_low = LowerBound(aiter, begin_low, aiter_end, -- aiter_input, iiter->end); -+ aiter_input, iiter->end_); - if (end_low - begin_low > 0) { - if (reach_begin_ < 0) - reach_begin_ = begin_low; -diff --git a/src/include/fst/minimize.h b/src/include/fst/minimize.h -index 3fbe3ba..6e9dd3d 100644 ---- a/src/include/fst/minimize.h -+++ b/src/include/fst/minimize.h -@@ -134,7 +134,14 @@ class CyclicMinimizer { - typedef typename A::Weight Weight; - typedef ReverseArc RevA; - -- CyclicMinimizer(const ExpandedFst& fst) { -+ CyclicMinimizer(const ExpandedFst& fst): -+ // tell the Partition data-member to expect multiple repeated -+ // calls to SplitOn with the same element if we are non-deterministic. -+ P_(fst.Properties(kIDeterministic, true) == 0) { -+ if(fst.Properties(kIDeterministic, true) == 0) -+ CHECK(Weight::Properties() & kIdempotent); // this minimization -+ // algorithm for non-deterministic FSTs can only work with idempotent -+ // semirings. - Initialize(fst); - Compute(fst); - } -@@ -315,7 +322,13 @@ class AcyclicMinimizer { - typedef typename A::StateId ClassId; - typedef typename A::Weight Weight; - -- AcyclicMinimizer(const ExpandedFst& fst) { -+ AcyclicMinimizer(const ExpandedFst& fst): -+ // tell the Partition data-member to expect multiple repeated -+ // calls to SplitOn with the same element if we are non-deterministic. 
-+ partition_(fst.Properties(kIDeterministic, true) == 0) { -+ if(fst.Properties(kIDeterministic, true) == 0) -+ CHECK(Weight::Properties() & kIdempotent); // minimization for -+ // non-deterministic FSTs can only work with idempotent semirings. - Initialize(fst); - Refine(fst); - } -@@ -531,13 +544,7 @@ template - void Minimize(MutableFst* fst, - MutableFst* sfst = 0, - float delta = kDelta) { -- uint64 props = fst->Properties(kAcceptor | kIDeterministic| -- kWeighted | kUnweighted, true); -- if (!(props & kIDeterministic)) { -- FSTERROR() << "FST is not deterministic"; -- fst->SetProperties(kError, kError); -- return; -- } -+ uint64 props = fst->Properties(kAcceptor | kWeighted | kUnweighted, true); - - if (!(props & kAcceptor)) { // weighted transducer - VectorFst< GallicArc > gfst; -diff --git a/src/include/fst/partition.h b/src/include/fst/partition.h -index dcee67b..40b849a 100644 ---- a/src/include/fst/partition.h -+++ b/src/include/fst/partition.h -@@ -43,8 +43,8 @@ class Partition { - friend class PartitionIterator; - - struct Element { -- Element() : value(0), next(0), prev(0) {} -- Element(T v) : value(v), next(0), prev(0) {} -+ Element() : value(0), next(0), prev(0) {} -+ Element(T v) : value(v), next(0), prev(0) {} - - T value; - Element* next; -@@ -52,9 +52,11 @@ class Partition { - }; - - public: -- Partition() {} -+ Partition(bool allow_repeated_split): -+ allow_repeated_split_(allow_repeated_split) {} - -- Partition(T num_states) { -+ Partition(bool allow_repeated_split, T num_states): -+ allow_repeated_split_(allow_repeated_split) { - Initialize(num_states); - } - -@@ -137,16 +139,16 @@ class Partition { - if (class_size_[class_id] == 1) return; - - // first time class is split -- if (split_size_[class_id] == 0) -+ if (split_size_[class_id] == 0) { - visited_classes_.push_back(class_id); -- -+ class_split_[class_id] = classes_[class_id]; -+ } - // increment size of split (set of element at head of chain) - split_size_[class_id]++; -- -+ - // update split point -- if (class_split_[class_id] == 0) -- class_split_[class_id] = classes_[class_id]; -- if (class_split_[class_id] == elements_[element_id]) -+ if (class_split_[class_id] != 0 -+ && class_split_[class_id] == elements_[element_id]) - class_split_[class_id] = elements_[element_id]->next; - - // move to head of chain in same class -@@ -157,24 +159,31 @@ class Partition { - // class indices of the newly created class. Returns the new_class id - // or -1 if no new class was created. - T SplitRefine(T class_id) { -+ -+ Element* split_el = class_split_[class_id]; - // only split if necessary -- if (class_size_[class_id] == split_size_[class_id]) { -- class_split_[class_id] = 0; -+ //if (class_size_[class_id] == split_size_[class_id]) { -+ if(split_el == NULL) { // we split on everything... - split_size_[class_id] = 0; - return -1; - } else { -- - T new_class = AddClass(); -+ -+ if(allow_repeated_split_) { // split_size_ is possibly -+ // inaccurate, so work it out exactly. 
-+ size_t split_count; Element *e; -+ for(split_count=0,e=classes_[class_id]; -+ e != split_el; split_count++, e=e->next); -+ split_size_[class_id] = split_count; -+ } - size_t remainder = class_size_[class_id] - split_size_[class_id]; - if (remainder < split_size_[class_id]) { // add smaller -- Element* split_el = class_split_[class_id]; - classes_[new_class] = split_el; -- class_size_[class_id] = split_size_[class_id]; -- class_size_[new_class] = remainder; - split_el->prev->next = 0; - split_el->prev = 0; -+ class_size_[class_id] = split_size_[class_id]; -+ class_size_[new_class] = remainder; - } else { -- Element* split_el = class_split_[class_id]; - classes_[new_class] = classes_[class_id]; - class_size_[class_id] = remainder; - class_size_[new_class] = split_size_[class_id]; -@@ -245,10 +254,16 @@ class Partition { - vector class_size_; - - // size of split for each class -+ // in the nondeterministic case, split_size_ is actually an upper -+ // bound on the size of split for each class. - vector split_size_; - - // set of visited classes to be used in split refine - vector visited_classes_; -+ -+ // true if input fst was deterministic: we can make -+ // certain assumptions in this case that speed up the algorithm. -+ bool allow_repeated_split_; - }; - - -diff --git a/src/include/fst/state-reachable.h b/src/include/fst/state-reachable.h -index 6d0c971..1da922e 100644 ---- a/src/include/fst/state-reachable.h -+++ b/src/include/fst/state-reachable.h -@@ -112,7 +112,7 @@ class IntervalReachVisitor { - void FinishState(StateId s, StateId p, const A *arc) { - if (index_ >= 0 && fst_.Final(s) != Weight::Zero()) { - vector *intervals = (*isets_)[s].Intervals(); -- (*intervals)[0].end = index_; // Update tree interval end -+ (*intervals)[0].end_ = index_; // Update tree interval end - } - (*isets_)[s].Normalize(); - if (p != kNoStateId) +diff --git a/src/include/fst/fst.h b/src/include/fst/fst.h +index 5ad3b52..d9c0ca6 100644 +--- a/src/include/fst/fst.h ++++ b/src/include/fst/fst.h +@@ -45,6 +45,12 @@ DECLARE_bool(fst_align); + + namespace fst { + ++ typedef ::int64 int64; ++ typedef ::uint64 uint64; ++ typedef ::int32 int32; ++ typedef ::uint32 uint32; ++ ++ + bool OPENFSTDLL IsFstHeader(istream &, const string &); //ChangedPD + + class FstHeader; +diff --git a/src/include/fst/interval-set.h b/src/include/fst/interval-set.h +index c4362f2..58cad44 100644 +--- a/src/include/fst/interval-set.h ++++ b/src/include/fst/interval-set.h +@@ -37,38 +37,38 @@ template + class IntervalSet { + public: + struct Interval { +- T begin; +- T end; ++ T begin_; ++ T end_; + +- Interval() : begin(-1), end(-1) {} ++ Interval() : begin_(-1), end_(-1) {} + +- Interval(T b, T e) : begin(b), end(e) {} ++ Interval(T b, T e) : begin_(b), end_(e) {} + + bool operator<(const Interval &i) const { +- return begin < i.begin || (begin == i.begin && end > i.end); ++ return begin_ < i.begin_ || (begin_ == i.begin_ && end_ > i.end_); + } + + bool operator==(const Interval &i) const { +- return begin == i.begin && end == i.end; ++ return begin_ == i.begin_ && end_ == i.end_; + } + + bool operator!=(const Interval &i) const { +- return begin != i.begin || end != i.end; ++ return begin_ != i.begin_ || end_ != i.end_; + } + + istream &Read(istream &strm) { + T n; + ReadType(strm, &n); +- begin = n; ++ begin_ = n; + ReadType(strm, &n); +- end = n; ++ end_ = n; + return strm; + } + + ostream &Write(ostream &strm) const { +- T n = begin; ++ T n = begin_; + WriteType(strm, n); +- n = end; ++ n = end_; + WriteType(strm, n); + return 
strm; + } +@@ -108,7 +108,7 @@ class IntervalSet { + lower_bound(intervals_.begin(), intervals_.end(), interval); + if (lb == intervals_.begin()) + return false; +- return (--lb)->end > value; ++ return (--lb)->end_ > value; + } + + // Requires intervals be normalized. +@@ -123,7 +123,7 @@ class IntervalSet { + + bool Singleton() const { + return intervals_.size() == 1 && +- intervals_[0].begin + 1 == intervals_[0].end; ++ intervals_[0].begin_ + 1 == intervals_[0].end_; + } + + +@@ -178,17 +178,17 @@ void IntervalSet::Normalize() { + T size = 0; + for (T i = 0; i < intervals_.size(); ++i) { + Interval &inti = intervals_[i]; +- if (inti.begin == inti.end) ++ if (inti.begin_ == inti.end_) + continue; + for (T j = i + 1; j < intervals_.size(); ++j) { + Interval &intj = intervals_[j]; +- if (intj.begin > inti.end) ++ if (intj.begin_ > inti.end_) + break; +- if (intj.end > inti.end) +- inti.end = intj.end; ++ if (intj.end_ > inti.end_) ++ inti.end_ = intj.end_; + ++i; + } +- count_ += inti.end - inti.begin; ++ count_ += inti.end_ - inti.begin_; + intervals_[size++] = inti; + } + intervals_.resize(size); +@@ -208,17 +208,17 @@ void IntervalSet::Intersect(const IntervalSet &iset, + oset->count_ = 0; + + while (it1 != intervals_.end() && it2 != iintervals->end()) { +- if (it1->end <= it2->begin) { ++ if (it1->end_ <= it2->begin_) { + ++it1; +- } else if (it2->end <= it1->begin) { ++ } else if (it2->end_ <= it1->begin_) { + ++it2; + } else { + Interval interval; +- interval.begin = max(it1->begin, it2->begin); +- interval.end = min(it1->end, it2->end); ++ interval.begin_ = max(it1->begin_, it2->begin_); ++ interval.end_ = min(it1->end_, it2->end_); + ointervals->push_back(interval); +- oset->count_ += interval.end - interval.begin; +- if (it1->end < it2->end) ++ oset->count_ += interval.end_ - interval.begin_; ++ if (it1->end_ < it2->end_) + ++it1; + else + ++it2; +@@ -235,21 +235,21 @@ void IntervalSet::Complement(T maxval, IntervalSet *oset) const { + oset->count_ = 0; + + Interval interval; +- interval.begin = 0; ++ interval.begin_ = 0; + for (typename vector::const_iterator it = intervals_.begin(); + it != intervals_.end(); + ++it) { +- interval.end = min(it->begin, maxval); +- if (interval.begin < interval.end) { ++ interval.end_ = min(it->begin_, maxval); ++ if (interval.begin_ < interval.end_) { + ointervals->push_back(interval); +- oset->count_ += interval.end - interval.begin; ++ oset->count_ += interval.end_ - interval.begin_; + } +- interval.begin = it->end; ++ interval.begin_ = it->end_; + } +- interval.end = maxval; +- if (interval.begin < interval.end) { ++ interval.end_ = maxval; ++ if (interval.begin_ < interval.end_) { + ointervals->push_back(interval); +- oset->count_ += interval.end - interval.begin; ++ oset->count_ += interval.end_ - interval.begin_; + } + } + +@@ -263,7 +263,7 @@ void IntervalSet::Difference(const IntervalSet &iset, + oset->count_ = 0; + } else { + IntervalSet cset; +- iset.Complement(intervals_.back().end, &cset); ++ iset.Complement(intervals_.back().end_, &cset); + Intersect(cset, oset); + } + } +@@ -277,9 +277,9 @@ bool IntervalSet::Overlaps(const IntervalSet &iset) const { + typename vector::const_iterator it2 = intervals->begin(); + + while (it1 != intervals_.end() && it2 != intervals->end()) { +- if (it1->end <= it2->begin) { ++ if (it1->end_ <= it2->begin_) { + ++it1; +- } else if (it2->end <= it1->begin) { ++ } else if (it2->end_ <= it1->begin_) { + ++it2; + } else { + return true; +@@ -300,21 +300,21 @@ bool IntervalSet::StrictlyOverlaps(const 
IntervalSet &iset) const { + bool overlap = false; // point in both intervals_ and intervals + + while (it1 != intervals_.end() && it2 != intervals->end()) { +- if (it1->end <= it2->begin) { // no overlap - it1 first ++ if (it1->end_ <= it2->begin_) { // no overlap - it1 first + only1 = true; + ++it1; +- } else if (it2->end <= it1->begin) { // no overlap - it2 first ++ } else if (it2->end_ <= it1->begin_) { // no overlap - it2 first + only2 = true; + ++it2; +- } else if (it2->begin == it1->begin && it2->end == it1->end) { // equals ++ } else if (it2->begin_ == it1->begin_ && it2->end_ == it1->end_) { // equals + overlap = true; + ++it1; + ++it2; +- } else if (it2->begin <= it1->begin && it2->end >= it1->end) { // 1 c 2 ++ } else if (it2->begin_ <= it1->begin_ && it2->end_ >= it1->end_) { // 1 c 2 + only2 = true; + overlap = true; + ++it1; +- } else if (it1->begin <= it2->begin && it1->end >= it2->end) { // 2 c 1 ++ } else if (it1->begin_ <= it2->begin_ && it1->end_ >= it2->end_) { // 2 c 1 + only1 = true; + overlap = true; + ++it2; +@@ -346,11 +346,11 @@ bool IntervalSet::Contains(const IntervalSet &iset) const { + typename vector::const_iterator it2 = intervals->begin(); + + while (it1 != intervals_.end() && it2 != intervals->end()) { +- if (it1->end <= it2->begin) { // no overlap - it1 first ++ if (it1->end_ <= it2->begin_) { // no overlap - it1 first + ++it1; +- } else if (it2->begin < it1->begin || it2->end > it1->end) { // no C ++ } else if (it2->begin_ < it1->begin_ || it2->end_ > it1->end_) { // no C + return false; +- } else if (it2->end == it1->end) { ++ } else if (it2->end_ == it1->end_) { + ++it1; + ++it2; + } else { +@@ -370,7 +370,7 @@ ostream &operator<<(ostream &strm, const IntervalSet &s) { + ++it) { + if (it != intervals->begin()) + strm << ","; +- strm << "[" << it->begin << "," << it->end << ")"; ++ strm << "[" << it->begin_ << "," << it->end_ << ")"; + } + strm << "}"; + return strm; +diff --git a/src/include/fst/label-reachable.h b/src/include/fst/label-reachable.h +index a7c3360..491ef7d 100644 +--- a/src/include/fst/label-reachable.h ++++ b/src/include/fst/label-reachable.h +@@ -359,9 +359,9 @@ class LabelReachable { + iiter = intervals->begin(); + iiter != intervals->end(); ++iiter) { + begin_low = LowerBound(aiter, end_low, aiter_end, +- aiter_input, iiter->begin); ++ aiter_input, iiter->begin_); + end_low = LowerBound(aiter, begin_low, aiter_end, +- aiter_input, iiter->end); ++ aiter_input, iiter->end_); + if (end_low - begin_low > 0) { + if (reach_begin_ < 0) + reach_begin_ = begin_low; +diff --git a/src/include/fst/minimize.h b/src/include/fst/minimize.h +index 3fbe3ba..6e9dd3d 100644 +--- a/src/include/fst/minimize.h ++++ b/src/include/fst/minimize.h +@@ -134,7 +134,14 @@ class CyclicMinimizer { + typedef typename A::Weight Weight; + typedef ReverseArc RevA; + +- CyclicMinimizer(const ExpandedFst& fst) { ++ CyclicMinimizer(const ExpandedFst& fst): ++ // tell the Partition data-member to expect multiple repeated ++ // calls to SplitOn with the same element if we are non-deterministic. ++ P_(fst.Properties(kIDeterministic, true) == 0) { ++ if(fst.Properties(kIDeterministic, true) == 0) ++ CHECK(Weight::Properties() & kIdempotent); // this minimization ++ // algorithm for non-deterministic FSTs can only work with idempotent ++ // semirings. 
+ Initialize(fst); + Compute(fst); + } +@@ -315,7 +322,13 @@ class AcyclicMinimizer { + typedef typename A::StateId ClassId; + typedef typename A::Weight Weight; + +- AcyclicMinimizer(const ExpandedFst& fst) { ++ AcyclicMinimizer(const ExpandedFst& fst): ++ // tell the Partition data-member to expect multiple repeated ++ // calls to SplitOn with the same element if we are non-deterministic. ++ partition_(fst.Properties(kIDeterministic, true) == 0) { ++ if(fst.Properties(kIDeterministic, true) == 0) ++ CHECK(Weight::Properties() & kIdempotent); // minimization for ++ // non-deterministic FSTs can only work with idempotent semirings. + Initialize(fst); + Refine(fst); + } +@@ -531,13 +544,7 @@ template + void Minimize(MutableFst* fst, + MutableFst* sfst = 0, + float delta = kDelta) { +- uint64 props = fst->Properties(kAcceptor | kIDeterministic| +- kWeighted | kUnweighted, true); +- if (!(props & kIDeterministic)) { +- FSTERROR() << "FST is not deterministic"; +- fst->SetProperties(kError, kError); +- return; +- } ++ uint64 props = fst->Properties(kAcceptor | kWeighted | kUnweighted, true); + + if (!(props & kAcceptor)) { // weighted transducer + VectorFst< GallicArc > gfst; +diff --git a/src/include/fst/partition.h b/src/include/fst/partition.h +index dcee67b..40b849a 100644 +--- a/src/include/fst/partition.h ++++ b/src/include/fst/partition.h +@@ -43,8 +43,8 @@ class Partition { + friend class PartitionIterator; + + struct Element { +- Element() : value(0), next(0), prev(0) {} +- Element(T v) : value(v), next(0), prev(0) {} ++ Element() : value(0), next(0), prev(0) {} ++ Element(T v) : value(v), next(0), prev(0) {} + + T value; + Element* next; +@@ -52,9 +52,11 @@ class Partition { + }; + + public: +- Partition() {} ++ Partition(bool allow_repeated_split): ++ allow_repeated_split_(allow_repeated_split) {} + +- Partition(T num_states) { ++ Partition(bool allow_repeated_split, T num_states): ++ allow_repeated_split_(allow_repeated_split) { + Initialize(num_states); + } + +@@ -137,16 +139,16 @@ class Partition { + if (class_size_[class_id] == 1) return; + + // first time class is split +- if (split_size_[class_id] == 0) ++ if (split_size_[class_id] == 0) { + visited_classes_.push_back(class_id); +- ++ class_split_[class_id] = classes_[class_id]; ++ } + // increment size of split (set of element at head of chain) + split_size_[class_id]++; +- ++ + // update split point +- if (class_split_[class_id] == 0) +- class_split_[class_id] = classes_[class_id]; +- if (class_split_[class_id] == elements_[element_id]) ++ if (class_split_[class_id] != 0 ++ && class_split_[class_id] == elements_[element_id]) + class_split_[class_id] = elements_[element_id]->next; + + // move to head of chain in same class +@@ -157,24 +159,31 @@ class Partition { + // class indices of the newly created class. Returns the new_class id + // or -1 if no new class was created. + T SplitRefine(T class_id) { ++ ++ Element* split_el = class_split_[class_id]; + // only split if necessary +- if (class_size_[class_id] == split_size_[class_id]) { +- class_split_[class_id] = 0; ++ //if (class_size_[class_id] == split_size_[class_id]) { ++ if(split_el == NULL) { // we split on everything... + split_size_[class_id] = 0; + return -1; + } else { +- + T new_class = AddClass(); ++ ++ if(allow_repeated_split_) { // split_size_ is possibly ++ // inaccurate, so work it out exactly. 
++ size_t split_count; Element *e; ++ for(split_count=0,e=classes_[class_id]; ++ e != split_el; split_count++, e=e->next); ++ split_size_[class_id] = split_count; ++ } + size_t remainder = class_size_[class_id] - split_size_[class_id]; + if (remainder < split_size_[class_id]) { // add smaller +- Element* split_el = class_split_[class_id]; + classes_[new_class] = split_el; +- class_size_[class_id] = split_size_[class_id]; +- class_size_[new_class] = remainder; + split_el->prev->next = 0; + split_el->prev = 0; ++ class_size_[class_id] = split_size_[class_id]; ++ class_size_[new_class] = remainder; + } else { +- Element* split_el = class_split_[class_id]; + classes_[new_class] = classes_[class_id]; + class_size_[class_id] = remainder; + class_size_[new_class] = split_size_[class_id]; +@@ -245,10 +254,16 @@ class Partition { + vector class_size_; + + // size of split for each class ++ // in the nondeterministic case, split_size_ is actually an upper ++ // bound on the size of split for each class. + vector split_size_; + + // set of visited classes to be used in split refine + vector visited_classes_; ++ ++ // true if input fst was deterministic: we can make ++ // certain assumptions in this case that speed up the algorithm. ++ bool allow_repeated_split_; + }; + + +diff --git a/src/include/fst/state-reachable.h b/src/include/fst/state-reachable.h +index 6d0c971..1da922e 100644 +--- a/src/include/fst/state-reachable.h ++++ b/src/include/fst/state-reachable.h +@@ -112,7 +112,7 @@ class IntervalReachVisitor { + void FinishState(StateId s, StateId p, const A *arc) { + if (index_ >= 0 && fst_.Final(s) != Weight::Zero()) { + vector *intervals = (*isets_)[s].Intervals(); +- (*intervals)[0].end = index_; // Update tree interval end ++ (*intervals)[0].end_ = index_; // Update tree interval end + } + (*isets_)[s].Normalize(); + if (p != kNoStateId) diff --git a/windows/INSTALL b/windows/INSTALL deleted file mode 100644 index d743129498b..00000000000 --- a/windows/INSTALL +++ /dev/null @@ -1,146 +0,0 @@ - -# Installation instructions for native Windows with Visual -# studio (for cygwin installation, see the instructions -# in ../INSTALL). - -#NOTE: These instructions are valid June 2015, MKL and OpenBLAS are supported -#NOTE: ATLAS is not supported and I personally have no intention to work on supporting -# it, as it requires whole cygwin environment -#NOTE: We now (20150613) support CUDA on Windows as well. The build was -# tested on CUDA 7.0. It is possible that the compilation fails -# for significantly older CUDA SDK (less than, say, 5.0) -# Please not that CUDA support for windows is not really that usefull, -# because, the speed benefit during decoding is not large. And for training -# one would have to re-implement the while training pipeline (as the -# bash script wouldn't most probably work) -#NOTE: While the 32bit project files will still be generated, we don't really -# care if they work or not. They will be removed in the near future. -#NOTE: The build process were validated using MSVS2013 and partially (MKL only) using MSVS2015-rc -#NOTE: We support only openfst-1.3.x for now. -#NOTE: I suggest to have git installed -- not only because we will -# use it to download the source codes (you could download archives -# instead of it), but also because the windows version comes -# with a bunch of useful utilities. -#NOTE: The examples will assume you have installed the git for windows -# and during the installation you chose the GIT Shell to install as well. 
-# Moreover, all the commands are issued from the same session - -1) Checkout Kaldi trunk, either using the svn from the url - https://svn.code.sf.net/p/kaldi/code/trunk - or using git from - https://github.com/kaldi-asr/kaldi.git - Example: - $ git clone https://github.com/kaldi-asr/kaldi.git kaldi - -2) enter the (kaldi)/tools directory in the freshly - checked-out kaldi repo. All following actions should - be taken in the tools dir - Example: - $ cd (kaldi)/tools - (kaldi)/tools$ pwd - - -2a) Use git to clone the OpenFST(win) from - https://github.com/jtrmal/openfstwin-1.3.4.git - Example: - (kaldi)/tools$ git clone https://github.com/jtrmal/openfstwin-1.3.4.git openfst - - -2b) Download pthread-win32 (or wget or curl) - https://sourceforge.net/projects/pthreads4w/ - (kaldi)/tools$ wget http://downloads.sourceforge.net/project/pthreads4w/pthreads-w32-2-9-1-release.zip - (kaldi)/tools$ mkdir pthreads; cd pthreads - (kaldi)/tools/pthreads$ unzip ../pthreads-w32-2-9-1-release.zip - -2c) Use patch (or you can use git patch) to patch the OpenFST(win) - patch location tools/extras/openfstwin-1.3.4.patch, - Example: - (kaldi)/tools$ cd openfst - (kaldi)/tools/openfst$ patch -p1 <../extras/openfstwin-1.3.4.patch - -2d-1) Download the OpenBLAS binary packages - https://sourceforge.net/projects/openblas - (kaldi)/tools$ wget http://sourceforge.net/projects/openblas/files/v0.2.14/OpenBLAS-v0.2.14-Win64-int32.zip - (kaldi)/tools$ wget http://sourceforge.net/projects/openblas/files/v0.2.14/mingw64_dll.zip - (kaldi)/tools$ unzip OpenBLAS-v0.2.14-Win64-int32.zip - (kaldi)/tools$ unzip mingw64_dll.zip - - NOTE: Be carefull to download "Win64-int32" and not "Win64-int64"! - -2d-2) Install MKL -2e) If you want enabled CUDA support, download and install NVidia CUDA SDK. - Be careful and strive for as standard install as possible. The installer - set certain environment variables on which the MSVC Build rules rely. - If you call "set" in the command line, you should see: - - (kaldi)/tools $ set | grep CUDA - CUDA_PATH='C:\Users\Yenda\Downloads\cuda' - CUDA_PATH_V7_0='C:\Users\Yenda\Downloads\cuda' - NVCUDASAMPLES7_0_ROOT='C:\Users\Yenda\Downloads\cuda' - NVCUDASAMPLES_ROOT='C:\Users\Yenda\Downloads\cuda' - - The first one (CUDA_PATH) is particulary important. - -3) Open the OpenFST solution in VS -- - for VS 2013, the correct solution is in VS2012 directory - for VS 2014, the correct solution is in VS2014 directory - !!!switch the configuration to debug|x64 and build the solution - !!!The same for configuration release|x64 - If either of the two won't build, you should stop here and start figuring what's different! - -4) Enter the (kaldi)/windows directory - Example: - (kaldi)/tools/openfst$ cd ../../windows - (kaldi)/windows $ pwd - -4a) modify the file variables.props to reflect - the correct paths, using your favorite text editor. - Don't worry, it's a text file, even though you have to be - careful to keep the structure itself intact - (kaldi)/windows $ vim variables.props - - If you plan to use MKL, you can ignore the OPENBLASDIR path - If you plan to use OpenBLAS, you can ignore the MKLDIR path - No matter what you plan to use, set both the OPENFST* and PTHREADW - variables correctly - -4b-1) For OpenBLAS support, copy the file "kaldiwin_openblas.props" to "kaldiwin.props" -4b-2) For MKL support, you don't have to do anything, it should work out of the box. 
-     When you need to switch from OpenBLAS to MKL, copy the "kaldiwin_mkl.props"
-     to "kaldiwin.props"
-
-
-4c) call the script that generates the MSVC solution
-    i.e.
-    generate_solution.pl --vsver
-    i.e. for example
-    generate_solution.pl --vsver vs2013
-
-    For CUDA support, add switch --enable-cuda to the command line,
-    i.e. for example
-    generate_solution.pl --vsver vs2013 --enable-cuda
-
-5) Open the generated solution in the visual studio and switch to Debug|x64 (or Release|x64) and build
-   Expect 10 projects to fail, majority of them will fail because of missing include "portaudio.h"
-
-------
-NOTE: I'm leaving the information about ATLAS here, for reference (also do not forget to consult the README.ATLAS)
-(B) either
-   (i) compile ATLAS under cygwin [see INSTALL.atlas] and copy
-       kaldiwin_atlas.props to kaldiwin.props
-
-(D)
-If you had installed ATLAS, you next have to do this:
-[assuming you are one level above this directory]
-cd kaldiwin_vs10_auto/
-
-# type the following (these commands were done from cygwin): note that these
-# commands are a bit wasteful of disk; you could alternatively ensure that
-# [root]/tools/ATLAS/cygwin_build/install/lib/ is always on your path when you
-# run the binaries.
-
-mkdir -p Debug Release
-cp ../tools/ATLAS/cygwin_build/install/lib/lib_atlas.dll Debug
-cp ../tools/ATLAS/cygwin_build/install/lib/lib_atlas.dll Release
-
-Then build the project with Visual Studio.
diff --git a/windows/INSTALL.md b/windows/INSTALL.md
new file mode 100644
index 00000000000..6a57d3d2ee2
--- /dev/null
+++ b/windows/INSTALL.md
@@ -0,0 +1,176 @@
+
+# Installation instructions for native Windows with Visual Studio
+
+For cygwin installation, see the instructions in `../INSTALL`.
+
+## Notes
+
+* These instructions are valid as of June 2015; MKL and OpenBLAS are supported.
+* ATLAS is not supported and I personally have no intention to work on supporting
+  it, as it requires the whole cygwin environment.
+* We now (20150613) support CUDA on Windows as well. The build was
+  tested on CUDA 7.0. It is possible that the compilation fails
+  for a significantly older CUDA SDK (less than, say, 5.0).
+  Please note that CUDA support for Windows is not really that useful,
+  because the speed benefit during decoding is not large, and for training
+  one would have to re-implement the whole training pipeline (as the
+  bash scripts most probably would not work).
+* While the 32bit project files will still be generated, we don't really
+  care if they work or not. They will be removed in the near future.
+* The build process was validated using MSVS2013 and partially (MKL only) using MSVS2015-rc.
+* We support only openfst-1.3.x for now.
+* I suggest having git installed -- not only because we will
+  use it to download the source code (you could download archives
+  instead), but also because the Windows version comes
+  with a bunch of useful utilities.
+* The examples will assume you have installed Git for Windows
+  and, during the installation, chose to install the Git Shell as well.
+  Moreover, all the commands are issued from the same session.
+
+## Steps
+
+1. Check out the Kaldi trunk, either using svn from the URL https://svn.code.sf.net/p/kaldi/code/trunk
+   or using git from https://github.com/kaldi-asr/kaldi.git
+
+   Example:
+
+       $ git clone https://github.com/kaldi-asr/kaldi.git kaldi
+
+2. Enter the `(kaldi)/tools` directory in the freshly
+   checked-out kaldi repo. All following actions should
+   be taken in the tools dir.
+There are two options to use for BLAS (linear algebra): MKL and OpenBLAS. MKL is made by Intel and is optimised
+for their processors. Unfortunately it isn't free. OpenBLAS is a free alternative with similar performance.
+
+6. If using MKL, install it.
+
+7. If using OpenBLAS, download the binary packages.
+
+   https://sourceforge.net/projects/openblas
+
+       (kaldi)/tools$ wget http://sourceforge.net/projects/openblas/files/v0.2.14/OpenBLAS-v0.2.14-Win64-int32.zip
+       (kaldi)/tools$ wget http://sourceforge.net/projects/openblas/files/v0.2.14/mingw64_dll.zip
+       (kaldi)/tools$ unzip OpenBLAS-v0.2.14-Win64-int32.zip
+       (kaldi)/tools$ unzip mingw64_dll.zip
+
+   **Be careful to download "Win64-int32" and not "Win64-int64"!**
+
+8. If you want CUDA support enabled, download and install the NVidia CUDA SDK.
+   Be careful and strive for as standard an install as possible. The installer
+   sets certain environment variables on which the MSVC Build rules rely.
+   If you call "set" in the command line, you should see:
+
+       (kaldi)/tools $ set | grep CUDA
+       CUDA_PATH='C:\Users\Yenda\Downloads\cuda'
+       CUDA_PATH_V7_0='C:\Users\Yenda\Downloads\cuda'
+       NVCUDASAMPLES7_0_ROOT='C:\Users\Yenda\Downloads\cuda'
+       NVCUDASAMPLES_ROOT='C:\Users\Yenda\Downloads\cuda'
+
+   The first one (`CUDA_PATH`) is particularly important.
+
+9. Open the OpenFST solution in VS
+
+   * for VS 2013, the correct solution is in the VS2012 directory
+   * for VS 2015, the correct solution is in the VS2014 directory
+
+   **Switch the configuration to `debug|x64` and build the solution.**
+
+   **Do the same for configuration `release|x64`.**
+
+   If either of the two won't build, you should stop here and start figuring out what's different!
+
+10. Enter the `(kaldi)/windows` directory
+
+    Example:
+
+        (kaldi)/tools/openfst$ cd ../../windows
+        (kaldi)/windows $ pwd
+
+11. Modify the file `variables.props` to reflect
+    the correct paths, using your favorite text editor.
+    Don't worry, it's a text file, even though you have to be
+    careful to keep the structure itself intact.
+
+        (kaldi)/windows $ vim variables.props
+
+    If you plan to use MKL, you can ignore the `OPENBLASDIR` path.
+    If you plan to use OpenBLAS, you can ignore the `MKLDIR` path.
+    No matter what you plan to use, set both the `OPENFST*` and `PTHREADW`
+    variables correctly.
+
+12. For OpenBLAS support, copy the file `kaldiwin_openblas.props` to `kaldiwin.props`.
+13. For MKL support, you don't have to do anything, it should work out of the box.
+    When you need to switch from OpenBLAS to MKL, copy the `kaldiwin_mkl.props`
+    to `kaldiwin.props`.
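+    For example (assuming you are still in the `(kaldi)/windows` directory and using the
+    `cp` that ships with the Git Shell), switching between the two is just a file copy:
+
+        (kaldi)/windows $ cp kaldiwin_openblas.props kaldiwin.props   # build against OpenBLAS
+        (kaldi)/windows $ cp kaldiwin_mkl.props kaldiwin.props        # switch (back) to MKL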
+14. Call the script that generates the MSVC solution
+
+        generate_solution.pl --vsver <vsver> [--enable-cuda] [--enable-openblas] [--enable-mkl]
+
+    `--enable-mkl` is the default, so you shouldn't need to pass it. Passing `--enable-openblas` disables MKL support.
+    CUDA is disabled by default. The default Visual Studio version is 11.0 (Visual Studio 2012).
+
+    For example, for a build supporting CUDA, using OpenBLAS and VS 2015, you would run:
+
+        (kaldi)/windows $ generate_solution.pl --vsver vs2015 --enable-cuda --enable-openblas
+
+15. Open the generated solution in Visual Studio, switch to Debug|x64 (or Release|x64), and build.
+    Expect 10 projects to fail; the majority of them will fail because of the missing include `portaudio.h`.
+
+------
+NOTE: I'm leaving the information about ATLAS here, for reference (also do not forget to consult the README.ATLAS)
+
+(B) either
+    (i) compile ATLAS under cygwin [see INSTALL.atlas] and copy
+        kaldiwin_atlas.props to kaldiwin.props
+
+(D)
+If you had installed ATLAS, you next have to do this:
+[assuming you are one level above this directory]
+
+    cd kaldiwin_vs10_auto/
+
+Type the following (these commands were done from cygwin): note that these
+commands are a bit wasteful of disk; you could alternatively ensure that
+[root]/tools/ATLAS/cygwin_build/install/lib/ is always on your path when you
+run the binaries.
+
+    mkdir -p Debug Release
+    cp ../tools/ATLAS/cygwin_build/install/lib/lib_atlas.dll Debug
+    cp ../tools/ATLAS/cygwin_build/install/lib/lib_atlas.dll Release
+
+Then build the project with Visual Studio.