-
Notifications
You must be signed in to change notification settings - Fork 15
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Hainan's RNNLM setup #37
base: master
Are you sure you want to change the base?
Changes from all commits
d0a06e3
03f6bd8
1bae64a
912770f
451e2a4
1d4e7c8
55c41b1
f7181e4
3805d0f
1ba7357
5212f54
1b7b23b
2f22d48
52e35c6
37dc6c5
9ed83d6
d00217c
52652ec
0f85a2d
bd11e28
b12b1d0
2755d54
49737e9
e582e09
e1b33cd
7dd21ad
3ea9ef3
5df77b4
75b323d
a4d40c9
e61de3c
7cfe475
7e20e28
7acd96a
cd2ea74
f75e5c9
169fc0f
b23955d
42a3ac9
86abb85
e7c5d8a
03eabbd
147cf87
5c4da99
ecb4e7f
063b140
9299526
2b345ef
0d1eba4
2352a5b
3a16b1f
a4d32f8
d20661b
0e8e5b5
3b44e8b
78eb7c1
d7c5a73
02f0498
e18eb37
0438594
14d0bd6
2c12ba5
69510ca
131c8d9
07e0a9b
ff1ab0e
b3f0c08
d9677a3
0670dc5
22a84c6
1e555ee
167dfd9
c1c44d6
47b9fa0
0ff3342
53d2b63
2d2dfe2
b38eb6a
3580ed9
ce2af52
cd110a0
e245a3d
34e78a1
ed64185
95a6f96
8194443
0540dd7
d0bffb9
bc21318
a7fb871
a3d19ca
3e574ae
33a004c
ebab9f7
3a664b9
f9e81b8
1795a5c
ae9b986
6a43ff7
1a8536c
3b70acf
2af0e93
1142e60
c79e1b1
09b35fd
62c48b2
7ae1bff
dd8aea9
d6ae8e7
799f355
0f0870b
b49997a
a6adf89
e8df0c5
0aa1bc5
7c23ceb
bac0a08
8fe4b95
c2e65e0
6b074f2
e3da23c
2c2312b
511e136
6ea1135
049c437
69e328b
6c5da3f
5967298
bcba8fb
b5bdf3c
2268dad
868cf8c
c8baae7
dd777c0
efcf0bb
1fdc779
2fcb4f8
04996e8
1190b3d
62e5f9b
03ec444
60608b4
12d387a
eb37f98
f9db3f3
f92dd96
e6d9d19
77c7306
045398c
d61ff54
26a2a11
4fdde14
8403b67
d1202d3
b2d4bac
38ef328
12f119e
2103485
449c15b
af2a4c2
7de1e33
9506f72
af29d26
9fc7168
f23806c
4c4a6c9
e84e403
7fa005c
ef6a357
9bddfc5
595bd10
265b9fa
799111f
7de184e
c1d5f0c
9f88f56
e1925eb
ffc8a11
fad5050
d95a352
d6b1e4c
ba5b446
7e179be
fb4eb99
c41bbc9
388fcf5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
#!/bin/bash

# Train an nnet3 LSTM-based RNN language model on AMI (sdm1) text and
# evaluate dev-set perplexity after every training iteration.
# Requires Kaldi binaries (rnnlm-get-egs, nnet3-*) plus cmd.sh / path.sh.

train_text=data/sdm1/train/text   # training transcripts: utt-id followed by words
dev_text=data/sdm1/dev/text       # held-out transcripts for perplexity evaluation

type=lstm

num_words_in=10000    # size of the input vocabulary (most frequent words)
num_words_out=10000   # size of the output vocabulary

stage=-100
sos="<s>"    # sentence-start symbol
             # NOTE(review): usually called "bos", not "sos" — consider renaming
eos="</s>"   # sentence-end symbol
oos="<oos>"  # out-of-set symbol — presumably marks OOV words; TODO add clarifying doc

max_param_change=20
num_iters=30

shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
minibatch_size=64

initial_learning_rate=0.01
final_learning_rate=0.0005
learning_rate_decline_factor=1.1

num_lstm_layers=1
cell_dim=64
hidden_dim=256
recurrent_projection_dim=0
non_recurrent_projection_dim=64
norm_based_clipping=true
clipping_threshold=30
label_delay=0 # 5
splice_indexes=0

. cmd.sh
. path.sh
. parse_options.sh || exit 1;

outdir=data/sdm1/lstm-$initial_learning_rate-$final_learning_rate-$learning_rate_decline_factor-$minibatch_size-$type
srcdir=data/local/dict

set -e

mkdir -p $outdir

if [ $stage -le -4 ]; then
  # Build the word list from the lexicon (dropping silence), map words outside
  # it to <unk>, and shuffle sentences deterministically (seeded by the file).
  cat $srcdir/lexicon.txt | awk '{print $1}' | grep -v -w '!SIL' > $outdir/wordlist.all

  # NOTE(review): 'shuf' is not universally installed; utils/shuffle_list.pl is
  # the usual Kaldi-internal replacement — consider switching.
  cat $train_text | awk -v w=$outdir/wordlist.all \
      'BEGIN{while((getline<w)>0) v[$1]=1;}
      {for (i=2;i<=NF;i++) if ($i in v) printf $i" ";else printf "<unk> ";print ""}'|sed 's/ $//g' \
      | shuf --random-source=$train_text > $outdir/train.txt.0

  cat $dev_text | awk -v w=$outdir/wordlist.all \
      'BEGIN{while((getline<w)>0) v[$1]=1;}
      {for (i=2;i<=NF;i++) if ($i in v) printf $i" ";else printf "<unk> ";print ""}'|sed 's/ $//g' \
      | shuf --random-source=$dev_text > $outdir/dev.txt.0

  # Unigram counts, most frequent first (wordlist.all is appended so every
  # lexicon word gets a count of at least 1).
  cat $outdir/train.txt.0 $outdir/wordlist.all | sed "s= =\n=g" | grep . | sort | uniq -c | sort -k1 -n -r | awk '{print $2,$1}' > $outdir/unigramcounts.txt

  # Input vocabulary: <s> gets id 0, <oos> id 1, then the top num_words_in words.
  echo $sos 0 > $outdir/wordlist.in
  echo $oos 1 >> $outdir/wordlist.in
  cat $outdir/unigramcounts.txt | head -n $num_words_in | awk '{print $1,1+NR}' >> $outdir/wordlist.in

  # Output vocabulary: </s> gets id 0, <oos> id 1, then the top num_words_out words.
  echo $eos 0 > $outdir/wordlist.out
  echo $oos 1 >> $outdir/wordlist.out

  cat $outdir/unigramcounts.txt | head -n $num_words_out | awk '{print $1,1+NR}' >> $outdir/wordlist.out

  # Wrap every sentence in start/end symbols.
  cat $outdir/train.txt.0 | awk -v sos="$sos" -v eos="$eos" '{print sos,$0,eos}' > $outdir/train.txt
  cat $outdir/dev.txt.0 | awk -v sos="$sos" -v eos="$eos" '{print sos,$0,eos}' > $outdir/dev.txt
fi

# Re-derive the actual vocabulary sizes from the generated word lists.
num_words_in=$(wc -l $outdir/wordlist.in | awk '{print $1}')
num_words_out=$(wc -l $outdir/wordlist.out | awk '{print $1}')

if [ $stage -le -3 ]; then
  rnnlm-get-egs $outdir/train.txt $outdir/wordlist.in $outdir/wordlist.out ark,t:$outdir/egs
fi

if [ $stage -le -2 ]; then
  # NOTE(review): an xconfig-based mechanism is preferred over this
  # config-generation script going forward.
  steps/rnnlm/make_lstm_configs.py \
    --splice-indexes "$splice_indexes " \
    --num-lstm-layers $num_lstm_layers \
    --feat-dim $num_words_in \
    --cell-dim $cell_dim \
    --hidden-dim $hidden_dim \
    --recurrent-projection-dim $recurrent_projection_dim \
    --non-recurrent-projection-dim $non_recurrent_projection_dim \
    --norm-based-clipping $norm_based_clipping \
    --clipping-threshold $clipping_threshold \
    --num-targets $num_words_out \
    --label-delay $label_delay \
    $outdir/configs || exit 1;
fi

if [ $stage -le 0 ]; then
  nnet3-init --binary=false $outdir/configs/layer1.config $outdir/0.mdl
fi

# Combined word list used later by the rescoring script.
cat data/local/dict/lexicon.txt | awk '{print $1}' > $outdir/wordlist.all.1
cat $outdir/wordlist.in $outdir/wordlist.out | awk '{print $1}' > $outdir/wordlist.all.2
cat $outdir/wordlist.all.[12] | sort -u > $outdir/wordlist.all
#rm $outdir/wordlist.all.[12]
cp $outdir/wordlist.all $outdir/wordlist.rnn
touch $outdir/unk.probs

mkdir -p $outdir/log/
if [ $stage -le $num_iters ]; then
  start=1
#  if [ $stage -gt 1 ]; then
#    start=$stage
#  fi
  learning_rate=$initial_learning_rate

  for n in $(seq $start $num_iters); do
    echo for iter $n, learning rate is $learning_rate
    # Train one iteration: copy previous model with the current learning rate,
    # shuffle and merge egs into minibatches, and run nnet3-train.
    [ $n -ge $stage ] && (
    $cuda_cmd $outdir/log/train.rnnlm.$n.log nnet3-train \
       --max-param-change=$max_param_change "nnet3-copy --learning-rate=$learning_rate $outdir/$((n-1)).mdl -|" \
       "ark:nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$n ark:$outdir/egs ark:- | nnet3-merge-egs --minibatch-size=$minibatch_size ark:- ark:- |" $outdir/$n.mdl
    )

    # Geometric learning-rate decay, floored at final_learning_rate.
    learning_rate=$(echo $learning_rate | awk -v d=$learning_rate_decline_factor '{printf("%f", $1/d)}')
    if (( $(echo "$final_learning_rate > $learning_rate" |bc -l) )); then
      learning_rate=$final_learning_rate
    fi

    # Evaluate dev perplexity for this iteration in the background.
    [ $n -ge $stage ] && (
      # Fixed: the original assigned nw twice back-to-back; the first value
      # ($1 - 3, excluding <s>, </s>, <oos>) was dead code clobbered by the
      # second ($1). Only the effective assignment is kept.
      nw=$(wc -l $outdir/wordlist.all | awk '{print $1}') # <s>, </s>, <oos>
#      nw=`wc -l data/sdm1/cued_rnn_ce_1/unigram.counts | awk '{print $1}'`
#      $decode_cmd $outdir/dev.ppl.$n.log rnnlm-eval $outdir/$n.mdl $outdir/wordlist.in $outdir/wordlist.out $outdir/dev.txt $outdir/dev-probs-iter-$n.txt
      echo $decode_cmd $outdir/dev.ppl.$n.log rnnlm-eval --num-words=$nw $outdir/$n.mdl $outdir/wordlist.in $outdir/wordlist.out $outdir/dev.txt $outdir/dev-probs-iter-$n.txt
      $decode_cmd $outdir/dev.ppl.$n.log rnnlm-eval --num-words=$nw $outdir/$n.mdl $outdir/wordlist.in $outdir/wordlist.out $outdir/dev.txt $outdir/dev-probs-iter-$n.txt
      # PPL = exp(total negative log-prob / number of predicted words).
      nw=$(cat $outdir/dev.txt | awk '{a+=NF-1}END{print a}' )
      to_cost=$(cat $outdir/dev-probs-iter-$n.txt | awk '{a+=$1}END{print -a}')
      ppl=$(echo $to_cost $nw | awk '{print exp($1/$2)}')
      echo DEV PPL on model $n.mdl is $ppl | tee $outdir/log/dev.ppl.$n.txt
    ) &
  done
  cp $outdir/$num_iters.mdl $outdir/rnnlm
fi

./local/rnnlm/run-rescoring.sh --rnndir $outdir/ --type $type
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#!/bin/bash

# Top-level driver: trains the nnet3 RNNLM, then rescores the dev and eval
# decoding lattices with it (in parallel) using lattice-based RNNLM rescoring.

# Defaults; all of these are overridable on the command line via parse_options.
mic=sdm1
n=50
ngram_order=4
rnndir=data/nnet3_rnnlm_200_256_0
id=rnn

. ./utils/parse_options.sh
. ./cmd.sh
. ./path.sh

set -e

# Run (or resume) RNNLM training first.
local/nnet3-rnnlm/run-rnnlm-train.sh --use-gpu yes --stage -20 --num-iters 160

# The trained model must exist before we attempt any rescoring.
if [ ! -f $rnndir/rnnlm ]; then
  echo "Can't find RNNLM model"
  exit 1;
fi

final_lm=ami_fsh.o3g.kn
LM=$final_lm.pr1-7

# Rescore both test sets concurrently; each subshell handles one set.
for decode_set in dev eval; do
  (
    dir=exp/$mic/nnet3/tdnn_sp/
    decode_dir=${dir}/decode_${decode_set}

    steps/lmrescore_rnnlm_lat.sh \
      --cmd "$decode_cmd --mem 16G" \
      --rnnlm-ver nnet3rnnlm --weight 0.5 --max-ngram-order $ngram_order \
      data/lang_$LM $rnndir \
      data/$mic/${decode_set}_hires ${decode_dir} \
      ${decode_dir}.rnnlm.lat.${ngram_order}gram
  ) &
done

# Barrier: wait for both background rescoring jobs to finish.
wait
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is typically called bos, not sos.