/
awd_lstm_lm_1150_wikitext-2.log
3005 lines (3005 loc) · 185 KB
/
awd_lstm_lm_1150_wikitext-2.log
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
Namespace(alpha=2, batch_size=80, beta=1, bptt=70, clip=0.25, dropout=0.4, dropout_e=0.1, dropout_h=0.2, dropout_i=0.65, emsize=400, epochs=750, eval_only=False, gpu='0', log_interval=200, lr=30, lr_update_factor=0.1, lr_update_interval=30, model='lstm', nhid=1150, nlayers=3, ntasgd=True, optimizer='sgd', save='awd_lstm_lm_1150_wikitext-2', test_mode=False, tied=True, wd=1.2e-06, weight_dropout=0.5)
Use AWDRNN
AWDRNN(
(embedding): HybridSequential(
(0): Embedding(33278 -> 400, float32)
(1): Dropout(p = 0.65, axes=(0,))
)
(encoder): HybridSequential(
(0): LSTM(400 -> 1150, TNC)
(1): LSTM(1150 -> 1150, TNC)
(2): LSTM(1150 -> 400, TNC)
)
(decoder): HybridSequential(
(0): Dense(400 -> 33278, linear)
)
)
[Epoch 0 Batch 200/372] current loss 8.00, ppl 2979.04, throughput 426.35 samples/s, lr 29.57
[Epoch 0] throughput 28348.05 samples/s
[Epoch 0] time cost 80.82s, valid loss 6.55, valid ppl 697.88, lr 30.00
[Epoch 0] test loss 6.48, test ppl 651.82
[Epoch 1 Batch 200/372] current loss 6.74, ppl 848.61, throughput 409.79 samples/s, lr 15.00
[Epoch 1] throughput 28006.58 samples/s
[Epoch 1] time cost 81.71s, valid loss 6.05, valid ppl 424.94, lr 30.00
[Epoch 1] test loss 5.98, test ppl 395.48
[Epoch 2 Batch 200/372] current loss 6.44, ppl 626.88, throughput 405.12 samples/s, lr 31.29
[Epoch 2] throughput 27820.44 samples/s
[Epoch 2] time cost 82.32s, valid loss 5.82, valid ppl 335.67, lr 30.00
[Epoch 2] test loss 5.74, test ppl 312.08
[Epoch 3 Batch 200/372] current loss 6.25, ppl 519.27, throughput 408.22 samples/s, lr 30.43
[Epoch 3] throughput 28002.06 samples/s
[Epoch 3] time cost 81.73s, valid loss 5.65, valid ppl 283.11, lr 30.00
[Epoch 3] test loss 5.57, test ppl 261.82
[Epoch 4 Batch 200/372] current loss 6.08, ppl 438.95, throughput 409.26 samples/s, lr 29.14
[Epoch 4] throughput 27671.86 samples/s
[Epoch 4] time cost 82.62s, valid loss 5.49, valid ppl 243.34, lr 30.00
[Epoch 4] test loss 5.42, test ppl 225.83
[Epoch 5 Batch 200/372] current loss 5.95, ppl 385.63, throughput 411.56 samples/s, lr 34.29
[Epoch 5] throughput 28001.11 samples/s
[Epoch 5] time cost 81.75s, valid loss 5.35, valid ppl 210.92, lr 30.00
[Epoch 5] test loss 5.27, test ppl 194.70
[Epoch 6 Batch 200/372] current loss 5.86, ppl 351.01, throughput 417.25 samples/s, lr 34.29
[Epoch 6] throughput 27880.04 samples/s
[Epoch 6] time cost 82.08s, valid loss 5.27, valid ppl 193.96, lr 30.00
[Epoch 6] test loss 5.19, test ppl 179.33
[Epoch 7 Batch 200/372] current loss 5.78, ppl 323.67, throughput 421.35 samples/s, lr 31.29
[Epoch 7] throughput 28099.37 samples/s
[Epoch 7] time cost 81.41s, valid loss 5.19, valid ppl 179.00, lr 30.00
[Epoch 7] test loss 5.11, test ppl 165.79
[Epoch 8 Batch 200/372] current loss 5.71, ppl 302.12, throughput 409.17 samples/s, lr 32.14
[Epoch 8] throughput 27626.67 samples/s
[Epoch 8] time cost 82.74s, valid loss 5.12, valid ppl 166.58, lr 30.00
[Epoch 8] test loss 5.04, test ppl 155.06
[Epoch 9 Batch 200/372] current loss 5.64, ppl 280.22, throughput 414.53 samples/s, lr 29.14
[Epoch 9] throughput 28063.92 samples/s
[Epoch 9] time cost 81.49s, valid loss 5.07, valid ppl 158.50, lr 30.00
[Epoch 9] test loss 5.00, test ppl 147.82
[Epoch 10 Batch 200/372] current loss 5.59, ppl 267.59, throughput 414.20 samples/s, lr 13.71
[Epoch 10] throughput 28004.12 samples/s
[Epoch 10] time cost 81.69s, valid loss 5.00, valid ppl 148.36, lr 30.00
[Epoch 10] test loss 4.93, test ppl 138.34
[Epoch 11 Batch 200/372] current loss 5.54, ppl 255.90, throughput 415.11 samples/s, lr 31.29
[Epoch 11] throughput 28263.72 samples/s
[Epoch 11] time cost 81.04s, valid loss 4.97, valid ppl 144.68, lr 30.00
[Epoch 11] test loss 4.90, test ppl 134.28
[Epoch 12 Batch 200/372] current loss 5.49, ppl 242.31, throughput 405.97 samples/s, lr 30.00
[Epoch 12] throughput 27716.04 samples/s
[Epoch 12] time cost 82.50s, valid loss 4.93, valid ppl 138.61, lr 30.00
[Epoch 12] test loss 4.86, test ppl 129.09
[Epoch 13 Batch 200/372] current loss 5.45, ppl 233.35, throughput 403.78 samples/s, lr 31.29
[Epoch 13] throughput 27762.58 samples/s
[Epoch 13] time cost 82.37s, valid loss 4.88, valid ppl 131.93, lr 30.00
[Epoch 13] test loss 4.81, test ppl 122.80
[Epoch 14 Batch 200/372] current loss 5.41, ppl 223.35, throughput 414.83 samples/s, lr 29.14
[Epoch 14] throughput 28045.79 samples/s
[Epoch 14] time cost 81.61s, valid loss 4.86, valid ppl 128.60, lr 30.00
[Epoch 14] test loss 4.78, test ppl 119.46
[Epoch 15 Batch 200/372] current loss 5.37, ppl 214.46, throughput 407.60 samples/s, lr 34.29
[Epoch 15] throughput 27815.06 samples/s
[Epoch 15] time cost 82.21s, valid loss 4.81, valid ppl 122.27, lr 30.00
[Epoch 15] test loss 4.73, test ppl 113.74
[Epoch 16 Batch 200/372] current loss 5.34, ppl 207.90, throughput 402.53 samples/s, lr 28.71
[Epoch 16] throughput 27600.81 samples/s
[Epoch 16] time cost 82.84s, valid loss 4.81, valid ppl 122.78, lr 30.00
[Epoch 17 Batch 200/372] current loss 5.32, ppl 204.69, throughput 409.20 samples/s, lr 31.29
[Epoch 17] throughput 27865.11 samples/s
[Epoch 17] time cost 82.14s, valid loss 4.78, valid ppl 119.34, lr 30.00
[Epoch 17] test loss 4.71, test ppl 111.34
[Epoch 18 Batch 200/372] current loss 5.27, ppl 194.69, throughput 411.09 samples/s, lr 14.14
[Epoch 18] throughput 27746.28 samples/s
[Epoch 18] time cost 82.42s, valid loss 4.79, valid ppl 120.86, lr 30.00
[Epoch 19 Batch 200/372] current loss 5.24, ppl 188.98, throughput 410.82 samples/s, lr 27.43
[Epoch 19] throughput 27800.39 samples/s
[Epoch 19] time cost 82.28s, valid loss 4.75, valid ppl 115.53, lr 30.00
[Epoch 19] test loss 4.68, test ppl 107.99
[Epoch 20 Batch 200/372] current loss 5.22, ppl 184.78, throughput 405.95 samples/s, lr 30.00
[Epoch 20] throughput 28009.05 samples/s
[Epoch 20] time cost 81.71s, valid loss 4.74, valid ppl 114.36, lr 30.00
[Epoch 20] test loss 4.67, test ppl 106.52
[Epoch 21 Batch 200/372] current loss 5.21, ppl 182.43, throughput 415.35 samples/s, lr 27.86
[Epoch 21] throughput 27801.15 samples/s
[Epoch 21] time cost 82.27s, valid loss 4.72, valid ppl 112.64, lr 30.00
[Epoch 21] test loss 4.65, test ppl 105.06
[Epoch 22 Batch 200/372] current loss 5.17, ppl 176.09, throughput 412.67 samples/s, lr 27.00
[Epoch 22] throughput 28282.55 samples/s
[Epoch 22] time cost 81.02s, valid loss 4.68, valid ppl 107.81, lr 30.00
[Epoch 22] test loss 4.61, test ppl 100.64
[Epoch 23 Batch 200/372] current loss 5.15, ppl 171.73, throughput 407.31 samples/s, lr 27.86
[Epoch 23] throughput 27675.34 samples/s
[Epoch 23] time cost 82.64s, valid loss 4.67, valid ppl 106.77, lr 30.00
[Epoch 23] test loss 4.61, test ppl 100.15
[Epoch 24 Batch 200/372] current loss 5.13, ppl 169.65, throughput 408.22 samples/s, lr 28.71
[Epoch 24] throughput 27834.56 samples/s
[Epoch 24] time cost 82.22s, valid loss 4.67, valid ppl 106.86, lr 30.00
[Epoch 25 Batch 200/372] current loss 5.12, ppl 167.75, throughput 408.43 samples/s, lr 32.14
[Epoch 25] throughput 27859.50 samples/s
[Epoch 25] time cost 82.13s, valid loss 4.64, valid ppl 103.50, lr 30.00
[Epoch 25] test loss 4.57, test ppl 96.80
[Epoch 26 Batch 200/372] current loss 5.11, ppl 165.33, throughput 405.79 samples/s, lr 30.43
[Epoch 26] throughput 27850.15 samples/s
[Epoch 26] time cost 82.22s, valid loss 4.64, valid ppl 103.57, lr 30.00
[Epoch 27 Batch 200/372] current loss 5.07, ppl 158.83, throughput 412.57 samples/s, lr 15.43
[Epoch 27] throughput 28061.55 samples/s
[Epoch 27] time cost 81.62s, valid loss 4.61, valid ppl 100.56, lr 30.00
[Epoch 27] test loss 4.54, test ppl 94.03
[Epoch 28 Batch 200/372] current loss 5.07, ppl 159.67, throughput 399.07 samples/s, lr 29.57
[Epoch 28] throughput 27652.75 samples/s
[Epoch 28] time cost 82.78s, valid loss 4.63, valid ppl 102.34, lr 30.00
[Epoch 29 Batch 200/372] current loss 5.06, ppl 158.03, throughput 411.58 samples/s, lr 30.00
[Epoch 29] throughput 27717.50 samples/s
[Epoch 29] time cost 82.55s, valid loss 4.63, valid ppl 102.83, lr 30.00
[Epoch 30 Batch 200/372] current loss 5.04, ppl 154.99, throughput 402.88 samples/s, lr 32.57
[Epoch 30] throughput 27573.07 samples/s
[Epoch 30] time cost 82.88s, valid loss 4.62, valid ppl 101.19, lr 30.00
[Epoch 31 Batch 200/372] current loss 5.02, ppl 151.91, throughput 406.84 samples/s, lr 34.71
[Epoch 31] throughput 27433.16 samples/s
[Epoch 31] time cost 83.28s, valid loss 4.58, valid ppl 97.57, lr 30.00
[Epoch 31] test loss 4.51, test ppl 91.32
[Epoch 32 Batch 200/372] current loss 5.00, ppl 148.69, throughput 401.37 samples/s, lr 30.00
[Epoch 32] throughput 27946.19 samples/s
[Epoch 32] time cost 81.84s, valid loss 4.58, valid ppl 97.53, lr 30.00
[Epoch 32] test loss 4.52, test ppl 91.43
[Epoch 33 Batch 200/372] current loss 5.00, ppl 148.20, throughput 403.44 samples/s, lr 29.57
[Epoch 33] throughput 27648.94 samples/s
[Epoch 33] time cost 82.88s, valid loss 4.58, valid ppl 97.03, lr 30.00
[Epoch 33] test loss 4.51, test ppl 90.68
[Epoch 34 Batch 200/372] current loss 4.97, ppl 144.55, throughput 407.29 samples/s, lr 31.71
[Epoch 34] throughput 27697.43 samples/s
[Epoch 34] time cost 82.68s, valid loss 4.56, valid ppl 95.38, lr 30.00
[Epoch 34] test loss 4.49, test ppl 89.45
[Epoch 35 Batch 200/372] current loss 4.96, ppl 142.80, throughput 401.33 samples/s, lr 33.00
[Epoch 35] throughput 27460.54 samples/s
[Epoch 35] time cost 83.31s, valid loss 4.55, valid ppl 94.30, lr 30.00
[Epoch 35] test loss 4.48, test ppl 88.26
[Epoch 36 Batch 200/372] current loss 4.96, ppl 142.62, throughput 404.48 samples/s, lr 30.43
[Epoch 36] throughput 27724.17 samples/s
[Epoch 36] time cost 82.63s, valid loss 4.56, valid ppl 95.63, lr 30.00
[Epoch 37 Batch 200/372] current loss 4.95, ppl 141.29, throughput 404.53 samples/s, lr 30.86
[Epoch 37] throughput 27537.29 samples/s
[Epoch 37] time cost 83.12s, valid loss 4.54, valid ppl 93.58, lr 30.00
[Epoch 37] test loss 4.48, test ppl 87.81
[Epoch 38 Batch 200/372] current loss 4.94, ppl 139.32, throughput 400.69 samples/s, lr 32.14
[Epoch 38] throughput 27555.84 samples/s
[Epoch 38] time cost 83.07s, valid loss 4.52, valid ppl 91.42, lr 30.00
[Epoch 38] test loss 4.45, test ppl 85.74
[Epoch 39 Batch 200/372] current loss 4.91, ppl 136.24, throughput 418.92 samples/s, lr 29.14
[Epoch 39] throughput 27565.96 samples/s
[Epoch 39] time cost 83.06s, valid loss 4.52, valid ppl 92.03, lr 30.00
[Epoch 40 Batch 200/372] current loss 4.92, ppl 137.08, throughput 405.13 samples/s, lr 31.29
[Epoch 40] throughput 27403.46 samples/s
[Epoch 40] time cost 83.45s, valid loss 4.53, valid ppl 92.42, lr 30.00
[Epoch 41 Batch 200/372] current loss 4.90, ppl 133.89, throughput 410.93 samples/s, lr 30.43
[Epoch 41] throughput 27860.01 samples/s
[Epoch 41] time cost 82.06s, valid loss 4.54, valid ppl 93.49, lr 30.00
[Epoch 42 Batch 200/372] current loss 4.89, ppl 132.36, throughput 401.88 samples/s, lr 27.86
[Epoch 42] throughput 27602.02 samples/s
[Epoch 42] time cost 82.77s, valid loss 4.51, valid ppl 91.24, lr 30.00
[Epoch 42] test loss 4.45, test ppl 85.86
[Epoch 43 Batch 200/372] current loss 4.89, ppl 133.05, throughput 403.03 samples/s, lr 27.43
[Epoch 43] throughput 27968.79 samples/s
[Epoch 43] time cost 81.78s, valid loss 4.51, valid ppl 90.68, lr 30.00
[Epoch 43] test loss 4.45, test ppl 85.25
[Epoch 44 Batch 200/372] current loss 4.88, ppl 131.75, throughput 405.89 samples/s, lr 11.57
[Epoch 44] throughput 27718.77 samples/s
[Epoch 44] time cost 82.54s, valid loss 4.50, valid ppl 90.01, lr 30.00
[Epoch 44] test loss 4.44, test ppl 84.44
[Epoch 45 Batch 200/372] current loss 4.87, ppl 129.90, throughput 405.38 samples/s, lr 29.57
[Epoch 45] throughput 27802.73 samples/s
[Epoch 45] time cost 82.33s, valid loss 4.50, valid ppl 89.57, lr 30.00
[Epoch 45] test loss 4.43, test ppl 84.18
[Epoch 46 Batch 200/372] current loss 4.85, ppl 127.96, throughput 408.08 samples/s, lr 27.86
[Epoch 46] throughput 28198.95 samples/s
[Epoch 46] time cost 81.19s, valid loss 4.48, valid ppl 87.88, lr 30.00
[Epoch 46] test loss 4.41, test ppl 82.52
[Epoch 47 Batch 200/372] current loss 4.84, ppl 126.63, throughput 410.60 samples/s, lr 28.71
[Epoch 47] throughput 27773.77 samples/s
[Epoch 47] time cost 82.33s, valid loss 4.48, valid ppl 88.56, lr 30.00
[Epoch 48 Batch 200/372] current loss 4.85, ppl 127.49, throughput 403.64 samples/s, lr 26.57
[Epoch 48] throughput 27904.37 samples/s
[Epoch 48] time cost 82.03s, valid loss 4.47, valid ppl 87.73, lr 30.00
[Epoch 48] test loss 4.41, test ppl 82.23
[Epoch 49 Batch 200/372] current loss 4.82, ppl 123.81, throughput 406.82 samples/s, lr 30.43
[Epoch 49] throughput 27568.50 samples/s
[Epoch 49] time cost 82.89s, valid loss 4.47, valid ppl 87.51, lr 30.00
[Epoch 49] test loss 4.41, test ppl 82.08
[Epoch 50 Batch 200/372] current loss 4.82, ppl 124.44, throughput 419.76 samples/s, lr 33.43
[Epoch 50] throughput 27901.84 samples/s
[Epoch 50] time cost 81.97s, valid loss 4.48, valid ppl 88.29, lr 30.00
[Epoch 51 Batch 200/372] current loss 4.81, ppl 122.78, throughput 400.94 samples/s, lr 29.14
[Epoch 51] throughput 27575.68 samples/s
[Epoch 51] time cost 82.85s, valid loss 4.46, valid ppl 86.23, lr 30.00
[Epoch 51] test loss 4.40, test ppl 81.06
[Epoch 52 Batch 200/372] current loss 4.80, ppl 121.63, throughput 406.48 samples/s, lr 28.71
[Epoch 52] throughput 28020.22 samples/s
[Epoch 52] time cost 81.67s, valid loss 4.47, valid ppl 87.04, lr 30.00
[Epoch 53 Batch 200/372] current loss 4.80, ppl 121.35, throughput 406.37 samples/s, lr 30.00
[Epoch 53] throughput 27687.38 samples/s
[Epoch 53] time cost 82.53s, valid loss 4.47, valid ppl 86.94, lr 30.00
[Epoch 54 Batch 200/372] current loss 4.79, ppl 120.32, throughput 407.42 samples/s, lr 27.43
[Epoch 54] throughput 27958.58 samples/s
[Epoch 54] time cost 81.86s, valid loss 4.49, valid ppl 89.10, lr 30.00
Switching to NTASGD and avg_trigger is : 20460
[Epoch 55 Batch 200/372] current loss 4.80, ppl 121.02, throughput 391.92 samples/s, lr 26.57
[Epoch 55] throughput 27067.59 samples/s
[Epoch 55] time cost 84.16s, valid loss 4.42, valid ppl 82.93, lr 30.00
[Epoch 55] test loss 4.36, test ppl 78.31
[Epoch 56 Batch 200/372] current loss 4.77, ppl 117.86, throughput 397.58 samples/s, lr 31.71
[Epoch 56] throughput 27239.37 samples/s
[Epoch 56] time cost 83.63s, valid loss 4.42, valid ppl 82.70, lr 30.00
[Epoch 56] test loss 4.36, test ppl 78.03
[Epoch 57 Batch 200/372] current loss 4.77, ppl 117.37, throughput 394.49 samples/s, lr 27.00
[Epoch 57] throughput 27088.26 samples/s
[Epoch 57] time cost 84.04s, valid loss 4.41, valid ppl 82.47, lr 30.00
[Epoch 57] test loss 4.35, test ppl 77.81
[Epoch 58 Batch 200/372] current loss 4.76, ppl 117.00, throughput 395.33 samples/s, lr 26.14
[Epoch 58] throughput 26796.85 samples/s
[Epoch 58] time cost 84.97s, valid loss 4.41, valid ppl 82.25, lr 30.00
[Epoch 58] test loss 4.35, test ppl 77.60
[Epoch 59 Batch 200/372] current loss 4.77, ppl 118.02, throughput 402.01 samples/s, lr 30.43
[Epoch 59] throughput 27330.83 samples/s
[Epoch 59] time cost 83.42s, valid loss 4.41, valid ppl 82.07, lr 30.00
[Epoch 59] test loss 4.35, test ppl 77.44
[Epoch 60 Batch 200/372] current loss 4.76, ppl 116.20, throughput 400.68 samples/s, lr 30.43
[Epoch 60] throughput 27088.56 samples/s
[Epoch 60] time cost 84.15s, valid loss 4.41, valid ppl 81.89, lr 30.00
[Epoch 60] test loss 4.35, test ppl 77.28
[Epoch 61 Batch 200/372] current loss 4.76, ppl 116.34, throughput 396.79 samples/s, lr 32.57
[Epoch 61] throughput 27113.74 samples/s
[Epoch 61] time cost 84.05s, valid loss 4.40, valid ppl 81.72, lr 30.00
[Epoch 61] test loss 4.35, test ppl 77.13
[Epoch 62 Batch 200/372] current loss 4.76, ppl 116.39, throughput 402.02 samples/s, lr 32.57
[Epoch 62] throughput 27479.31 samples/s
[Epoch 62] time cost 82.98s, valid loss 4.40, valid ppl 81.55, lr 30.00
[Epoch 62] test loss 4.34, test ppl 76.99
[Epoch 63 Batch 200/372] current loss 4.74, ppl 114.11, throughput 400.35 samples/s, lr 26.14
[Epoch 63] throughput 27004.14 samples/s
[Epoch 63] time cost 84.32s, valid loss 4.40, valid ppl 81.40, lr 30.00
[Epoch 63] test loss 4.34, test ppl 76.86
[Epoch 64 Batch 200/372] current loss 4.74, ppl 113.89, throughput 400.92 samples/s, lr 28.71
[Epoch 64] throughput 27069.68 samples/s
[Epoch 64] time cost 84.19s, valid loss 4.40, valid ppl 81.26, lr 30.00
[Epoch 64] test loss 4.34, test ppl 76.74
[Epoch 65 Batch 200/372] current loss 4.73, ppl 113.16, throughput 395.09 samples/s, lr 30.43
[Epoch 65] throughput 27300.37 samples/s
[Epoch 65] time cost 83.50s, valid loss 4.40, valid ppl 81.11, lr 30.00
[Epoch 65] test loss 4.34, test ppl 76.61
[Epoch 66 Batch 200/372] current loss 4.72, ppl 111.79, throughput 397.53 samples/s, lr 33.86
[Epoch 66] throughput 27353.17 samples/s
[Epoch 66] time cost 83.40s, valid loss 4.39, valid ppl 80.97, lr 30.00
[Epoch 66] test loss 4.34, test ppl 76.50
[Epoch 67 Batch 200/372] current loss 4.73, ppl 112.97, throughput 401.89 samples/s, lr 28.71
[Epoch 67] throughput 27274.72 samples/s
[Epoch 67] time cost 83.58s, valid loss 4.39, valid ppl 80.84, lr 30.00
[Epoch 67] test loss 4.34, test ppl 76.39
[Epoch 68 Batch 200/372] current loss 4.72, ppl 112.29, throughput 405.11 samples/s, lr 31.29
[Epoch 68] throughput 27482.87 samples/s
[Epoch 68] time cost 83.00s, valid loss 4.39, valid ppl 80.71, lr 30.00
[Epoch 68] test loss 4.33, test ppl 76.29
[Epoch 69 Batch 200/372] current loss 4.71, ppl 111.29, throughput 407.29 samples/s, lr 27.43
[Epoch 69] throughput 27169.21 samples/s
[Epoch 69] time cost 83.88s, valid loss 4.39, valid ppl 80.59, lr 30.00
[Epoch 69] test loss 4.33, test ppl 76.18
[Epoch 70 Batch 200/372] current loss 4.70, ppl 110.32, throughput 392.61 samples/s, lr 25.71
[Epoch 70] throughput 27035.18 samples/s
[Epoch 70] time cost 84.28s, valid loss 4.39, valid ppl 80.48, lr 30.00
[Epoch 70] test loss 4.33, test ppl 76.08
[Epoch 71 Batch 200/372] current loss 4.70, ppl 110.39, throughput 387.76 samples/s, lr 29.57
[Epoch 71] throughput 26835.05 samples/s
[Epoch 71] time cost 84.81s, valid loss 4.39, valid ppl 80.37, lr 30.00
[Epoch 71] test loss 4.33, test ppl 75.98
[Epoch 72 Batch 200/372] current loss 4.69, ppl 109.08, throughput 392.37 samples/s, lr 13.71
[Epoch 72] throughput 27432.08 samples/s
[Epoch 72] time cost 83.13s, valid loss 4.39, valid ppl 80.26, lr 30.00
[Epoch 72] test loss 4.33, test ppl 75.89
[Epoch 73 Batch 200/372] current loss 4.70, ppl 110.25, throughput 393.87 samples/s, lr 28.29
[Epoch 73] throughput 26991.64 samples/s
[Epoch 73] time cost 84.38s, valid loss 4.38, valid ppl 80.16, lr 30.00
[Epoch 73] test loss 4.33, test ppl 75.79
[Epoch 74 Batch 200/372] current loss 4.69, ppl 108.62, throughput 402.37 samples/s, lr 30.86
[Epoch 74] throughput 27543.94 samples/s
[Epoch 74] time cost 82.86s, valid loss 4.38, valid ppl 80.06, lr 30.00
[Epoch 74] test loss 4.33, test ppl 75.70
[Epoch 75 Batch 200/372] current loss 4.69, ppl 108.49, throughput 398.95 samples/s, lr 28.71
[Epoch 75] throughput 27475.44 samples/s
[Epoch 75] time cost 82.99s, valid loss 4.38, valid ppl 79.96, lr 30.00
[Epoch 75] test loss 4.33, test ppl 75.61
[Epoch 76 Batch 200/372] current loss 4.68, ppl 107.66, throughput 410.75 samples/s, lr 15.86
[Epoch 76] throughput 27088.72 samples/s
[Epoch 76] time cost 84.10s, valid loss 4.38, valid ppl 79.86, lr 30.00
[Epoch 76] test loss 4.32, test ppl 75.52
[Epoch 77 Batch 200/372] current loss 4.68, ppl 107.91, throughput 394.09 samples/s, lr 29.14
[Epoch 77] throughput 27013.38 samples/s
[Epoch 77] time cost 84.32s, valid loss 4.38, valid ppl 79.77, lr 30.00
[Epoch 77] test loss 4.32, test ppl 75.44
[Epoch 78 Batch 200/372] current loss 4.66, ppl 106.07, throughput 397.17 samples/s, lr 29.57
[Epoch 78] throughput 26817.53 samples/s
[Epoch 78] time cost 84.90s, valid loss 4.38, valid ppl 79.67, lr 30.00
[Epoch 78] test loss 4.32, test ppl 75.36
[Epoch 79 Batch 200/372] current loss 4.68, ppl 107.46, throughput 401.24 samples/s, lr 27.00
[Epoch 79] throughput 27316.01 samples/s
[Epoch 79] time cost 83.49s, valid loss 4.38, valid ppl 79.58, lr 30.00
[Epoch 79] test loss 4.32, test ppl 75.28
[Epoch 80 Batch 200/372] current loss 4.68, ppl 107.34, throughput 387.84 samples/s, lr 29.57
[Epoch 80] throughput 26941.33 samples/s
[Epoch 80] time cost 84.50s, valid loss 4.38, valid ppl 79.49, lr 30.00
[Epoch 80] test loss 4.32, test ppl 75.20
[Epoch 81 Batch 200/372] current loss 4.66, ppl 105.86, throughput 397.60 samples/s, lr 32.57
[Epoch 81] throughput 27062.98 samples/s
[Epoch 81] time cost 84.13s, valid loss 4.37, valid ppl 79.41, lr 30.00
[Epoch 81] test loss 4.32, test ppl 75.12
[Epoch 82 Batch 200/372] current loss 4.65, ppl 104.90, throughput 397.62 samples/s, lr 28.29
[Epoch 82] throughput 26968.82 samples/s
[Epoch 82] time cost 84.47s, valid loss 4.37, valid ppl 79.32, lr 30.00
[Epoch 82] test loss 4.32, test ppl 75.05
[Epoch 83 Batch 200/372] current loss 4.66, ppl 105.64, throughput 394.51 samples/s, lr 27.43
[Epoch 83] throughput 26878.41 samples/s
[Epoch 83] time cost 84.70s, valid loss 4.37, valid ppl 79.24, lr 30.00
[Epoch 83] test loss 4.32, test ppl 74.97
[Epoch 84 Batch 200/372] current loss 4.66, ppl 105.46, throughput 399.21 samples/s, lr 26.57
[Epoch 84] throughput 27162.49 samples/s
[Epoch 84] time cost 83.92s, valid loss 4.37, valid ppl 79.15, lr 30.00
[Epoch 84] test loss 4.32, test ppl 74.90
[Epoch 85 Batch 200/372] current loss 4.64, ppl 103.52, throughput 402.40 samples/s, lr 32.57
[Epoch 85] throughput 27179.81 samples/s
[Epoch 85] time cost 83.89s, valid loss 4.37, valid ppl 79.07, lr 30.00
[Epoch 85] test loss 4.32, test ppl 74.83
[Epoch 86 Batch 200/372] current loss 4.64, ppl 103.53, throughput 399.44 samples/s, lr 30.86
[Epoch 86] throughput 27226.38 samples/s
[Epoch 86] time cost 83.73s, valid loss 4.37, valid ppl 78.99, lr 30.00
[Epoch 86] test loss 4.31, test ppl 74.76
[Epoch 87 Batch 200/372] current loss 4.64, ppl 103.72, throughput 406.15 samples/s, lr 29.14
[Epoch 87] throughput 27314.29 samples/s
[Epoch 87] time cost 83.45s, valid loss 4.37, valid ppl 78.91, lr 30.00
[Epoch 87] test loss 4.31, test ppl 74.70
[Epoch 88 Batch 200/372] current loss 4.64, ppl 103.69, throughput 398.82 samples/s, lr 33.00
[Epoch 88] throughput 26733.46 samples/s
[Epoch 88] time cost 85.10s, valid loss 4.37, valid ppl 78.83, lr 30.00
[Epoch 88] test loss 4.31, test ppl 74.63
[Epoch 89 Batch 200/372] current loss 4.63, ppl 102.66, throughput 392.36 samples/s, lr 27.86
[Epoch 89] throughput 26973.50 samples/s
[Epoch 89] time cost 84.44s, valid loss 4.37, valid ppl 78.76, lr 30.00
[Epoch 89] test loss 4.31, test ppl 74.56
[Epoch 90 Batch 200/372] current loss 4.63, ppl 102.68, throughput 392.96 samples/s, lr 28.71
[Epoch 90] throughput 27019.20 samples/s
[Epoch 90] time cost 84.31s, valid loss 4.37, valid ppl 78.69, lr 30.00
[Epoch 90] test loss 4.31, test ppl 74.50
[Epoch 91 Batch 200/372] current loss 4.65, ppl 104.33, throughput 395.29 samples/s, lr 15.43
[Epoch 91] throughput 26903.06 samples/s
[Epoch 91] time cost 84.65s, valid loss 4.36, valid ppl 78.61, lr 30.00
[Epoch 91] test loss 4.31, test ppl 74.43
[Epoch 92 Batch 200/372] current loss 4.61, ppl 100.56, throughput 399.54 samples/s, lr 28.71
[Epoch 92] throughput 27286.66 samples/s
[Epoch 92] time cost 83.51s, valid loss 4.36, valid ppl 78.54, lr 30.00
[Epoch 92] test loss 4.31, test ppl 74.37
[Epoch 93 Batch 200/372] current loss 4.61, ppl 100.98, throughput 404.78 samples/s, lr 13.29
[Epoch 93] throughput 27485.51 samples/s
[Epoch 93] time cost 82.95s, valid loss 4.36, valid ppl 78.47, lr 30.00
[Epoch 93] test loss 4.31, test ppl 74.31
[Epoch 94 Batch 200/372] current loss 4.62, ppl 101.73, throughput 401.17 samples/s, lr 33.43
[Epoch 94] throughput 27412.93 samples/s
[Epoch 94] time cost 83.18s, valid loss 4.36, valid ppl 78.40, lr 30.00
[Epoch 94] test loss 4.31, test ppl 74.24
[Epoch 95 Batch 200/372] current loss 4.62, ppl 101.06, throughput 398.86 samples/s, lr 28.29
[Epoch 95] throughput 27312.14 samples/s
[Epoch 95] time cost 83.53s, valid loss 4.36, valid ppl 78.33, lr 30.00
[Epoch 95] test loss 4.31, test ppl 74.18
[Epoch 96 Batch 200/372] current loss 4.61, ppl 100.45, throughput 401.32 samples/s, lr 31.71
[Epoch 96] throughput 27035.86 samples/s
[Epoch 96] time cost 84.22s, valid loss 4.36, valid ppl 78.27, lr 30.00
[Epoch 96] test loss 4.31, test ppl 74.12
[Epoch 97 Batch 200/372] current loss 4.61, ppl 100.59, throughput 403.59 samples/s, lr 34.29
[Epoch 97] throughput 27502.89 samples/s
[Epoch 97] time cost 82.95s, valid loss 4.36, valid ppl 78.21, lr 30.00
[Epoch 97] test loss 4.30, test ppl 74.07
[Epoch 98 Batch 200/372] current loss 4.61, ppl 100.44, throughput 400.75 samples/s, lr 15.43
[Epoch 98] throughput 27484.68 samples/s
[Epoch 98] time cost 83.01s, valid loss 4.36, valid ppl 78.15, lr 30.00
[Epoch 98] test loss 4.30, test ppl 74.01
[Epoch 99 Batch 200/372] current loss 4.61, ppl 100.20, throughput 406.94 samples/s, lr 28.71
[Epoch 99] throughput 27146.41 samples/s
[Epoch 99] time cost 83.90s, valid loss 4.36, valid ppl 78.09, lr 30.00
[Epoch 99] test loss 4.30, test ppl 73.95
[Epoch 100 Batch 200/372] current loss 4.60, ppl 99.91, throughput 392.21 samples/s, lr 27.00
[Epoch 100] throughput 26897.14 samples/s
[Epoch 100] time cost 84.67s, valid loss 4.36, valid ppl 78.03, lr 30.00
[Epoch 100] test loss 4.30, test ppl 73.90
[Epoch 101 Batch 200/372] current loss 4.61, ppl 100.62, throughput 404.61 samples/s, lr 17.14
[Epoch 101] throughput 27493.11 samples/s
[Epoch 101] time cost 82.99s, valid loss 4.36, valid ppl 77.97, lr 30.00
[Epoch 101] test loss 4.30, test ppl 73.84
[Epoch 102 Batch 200/372] current loss 4.60, ppl 99.02, throughput 393.88 samples/s, lr 27.43
[Epoch 102] throughput 27312.00 samples/s
[Epoch 102] time cost 83.43s, valid loss 4.36, valid ppl 77.91, lr 30.00
[Epoch 102] test loss 4.30, test ppl 73.79
[Epoch 103 Batch 200/372] current loss 4.60, ppl 99.11, throughput 398.08 samples/s, lr 28.71
[Epoch 103] throughput 27413.87 samples/s
[Epoch 103] time cost 83.21s, valid loss 4.35, valid ppl 77.85, lr 30.00
[Epoch 103] test loss 4.30, test ppl 73.74
[Epoch 104 Batch 200/372] current loss 4.59, ppl 98.09, throughput 402.52 samples/s, lr 31.29
[Epoch 104] throughput 27102.38 samples/s
[Epoch 104] time cost 84.11s, valid loss 4.35, valid ppl 77.80, lr 30.00
[Epoch 104] test loss 4.30, test ppl 73.69
[Epoch 105 Batch 200/372] current loss 4.60, ppl 99.39, throughput 399.22 samples/s, lr 27.86
[Epoch 105] throughput 26912.19 samples/s
[Epoch 105] time cost 84.62s, valid loss 4.35, valid ppl 77.74, lr 30.00
[Epoch 105] test loss 4.30, test ppl 73.64
[Epoch 106 Batch 200/372] current loss 4.58, ppl 97.25, throughput 408.70 samples/s, lr 27.86
[Epoch 106] throughput 27478.71 samples/s
[Epoch 106] time cost 83.05s, valid loss 4.35, valid ppl 77.69, lr 30.00
[Epoch 106] test loss 4.30, test ppl 73.59
[Epoch 107 Batch 200/372] current loss 4.58, ppl 97.50, throughput 395.18 samples/s, lr 24.00
[Epoch 107] throughput 27063.50 samples/s
[Epoch 107] time cost 84.20s, valid loss 4.35, valid ppl 77.63, lr 30.00
[Epoch 107] test loss 4.30, test ppl 73.54
[Epoch 108 Batch 200/372] current loss 4.57, ppl 96.70, throughput 398.60 samples/s, lr 31.29
[Epoch 108] throughput 27035.14 samples/s
[Epoch 108] time cost 84.28s, valid loss 4.35, valid ppl 77.58, lr 30.00
[Epoch 108] test loss 4.30, test ppl 73.49
[Epoch 109 Batch 200/372] current loss 4.59, ppl 98.26, throughput 394.87 samples/s, lr 30.00
[Epoch 109] throughput 26794.34 samples/s
[Epoch 109] time cost 84.92s, valid loss 4.35, valid ppl 77.53, lr 30.00
[Epoch 109] test loss 4.30, test ppl 73.45
[Epoch 110 Batch 200/372] current loss 4.60, ppl 99.19, throughput 393.30 samples/s, lr 34.71
[Epoch 110] throughput 26998.67 samples/s
[Epoch 110] time cost 84.37s, valid loss 4.35, valid ppl 77.48, lr 30.00
[Epoch 110] test loss 4.30, test ppl 73.40
[Epoch 111 Batch 200/372] current loss 4.58, ppl 97.27, throughput 406.77 samples/s, lr 32.57
[Epoch 111] throughput 27548.92 samples/s
[Epoch 111] time cost 82.77s, valid loss 4.35, valid ppl 77.43, lr 30.00
[Epoch 111] test loss 4.30, test ppl 73.36
[Epoch 112 Batch 200/372] current loss 4.57, ppl 96.79, throughput 395.06 samples/s, lr 28.71
[Epoch 112] throughput 26993.76 samples/s
[Epoch 112] time cost 84.42s, valid loss 4.35, valid ppl 77.38, lr 30.00
[Epoch 112] test loss 4.29, test ppl 73.31
[Epoch 113 Batch 200/372] current loss 4.58, ppl 97.14, throughput 406.90 samples/s, lr 27.86
[Epoch 113] throughput 27084.94 samples/s
[Epoch 113] time cost 84.11s, valid loss 4.35, valid ppl 77.33, lr 30.00
[Epoch 113] test loss 4.29, test ppl 73.27
[Epoch 114 Batch 200/372] current loss 4.55, ppl 94.89, throughput 402.36 samples/s, lr 32.57
[Epoch 114] throughput 27124.18 samples/s
[Epoch 114] time cost 84.06s, valid loss 4.35, valid ppl 77.28, lr 30.00
[Epoch 114] test loss 4.29, test ppl 73.22
[Epoch 115 Batch 200/372] current loss 4.56, ppl 96.00, throughput 392.72 samples/s, lr 31.71
[Epoch 115] throughput 27140.40 samples/s
[Epoch 115] time cost 83.98s, valid loss 4.35, valid ppl 77.23, lr 30.00
[Epoch 115] test loss 4.29, test ppl 73.18
[Epoch 116 Batch 200/372] current loss 4.56, ppl 95.62, throughput 396.01 samples/s, lr 31.29
[Epoch 116] throughput 27086.29 samples/s
[Epoch 116] time cost 84.12s, valid loss 4.35, valid ppl 77.18, lr 30.00
[Epoch 116] test loss 4.29, test ppl 73.14
[Epoch 117 Batch 200/372] current loss 4.57, ppl 96.51, throughput 396.61 samples/s, lr 28.71
[Epoch 117] throughput 26950.84 samples/s
[Epoch 117] time cost 84.62s, valid loss 4.35, valid ppl 77.14, lr 30.00
[Epoch 117] test loss 4.29, test ppl 73.10
[Epoch 118 Batch 200/372] current loss 4.56, ppl 95.22, throughput 401.45 samples/s, lr 31.29
[Epoch 118] throughput 27197.82 samples/s
[Epoch 118] time cost 83.83s, valid loss 4.34, valid ppl 77.09, lr 30.00
[Epoch 118] test loss 4.29, test ppl 73.06
[Epoch 119 Batch 200/372] current loss 4.57, ppl 96.60, throughput 394.95 samples/s, lr 29.57
[Epoch 119] throughput 27341.52 samples/s
[Epoch 119] time cost 83.36s, valid loss 4.34, valid ppl 77.05, lr 30.00
[Epoch 119] test loss 4.29, test ppl 73.02
[Epoch 120 Batch 200/372] current loss 4.56, ppl 96.05, throughput 399.93 samples/s, lr 33.86
[Epoch 120] throughput 27213.68 samples/s
[Epoch 120] time cost 83.84s, valid loss 4.34, valid ppl 77.00, lr 30.00
[Epoch 120] test loss 4.29, test ppl 72.98
[Epoch 121 Batch 200/372] current loss 4.56, ppl 95.68, throughput 396.71 samples/s, lr 29.57
[Epoch 121] throughput 26785.62 samples/s
[Epoch 121] time cost 84.98s, valid loss 4.34, valid ppl 76.96, lr 30.00
[Epoch 121] test loss 4.29, test ppl 72.94
[Epoch 122 Batch 200/372] current loss 4.54, ppl 93.71, throughput 393.10 samples/s, lr 30.00
[Epoch 122] throughput 27042.26 samples/s
[Epoch 122] time cost 84.26s, valid loss 4.34, valid ppl 76.92, lr 30.00
[Epoch 122] test loss 4.29, test ppl 72.90
[Epoch 123 Batch 200/372] current loss 4.53, ppl 93.02, throughput 404.27 samples/s, lr 30.86
[Epoch 123] throughput 27424.41 samples/s
[Epoch 123] time cost 83.15s, valid loss 4.34, valid ppl 76.87, lr 30.00
[Epoch 123] test loss 4.29, test ppl 72.86
[Epoch 124 Batch 200/372] current loss 4.55, ppl 94.45, throughput 395.10 samples/s, lr 29.57
[Epoch 124] throughput 27095.50 samples/s
[Epoch 124] time cost 84.07s, valid loss 4.34, valid ppl 76.83, lr 30.00
[Epoch 124] test loss 4.29, test ppl 72.82
[Epoch 125 Batch 200/372] current loss 4.54, ppl 94.06, throughput 389.77 samples/s, lr 27.86
[Epoch 125] throughput 27229.98 samples/s
[Epoch 125] time cost 83.76s, valid loss 4.34, valid ppl 76.79, lr 30.00
[Epoch 125] test loss 4.29, test ppl 72.79
[Epoch 126 Batch 200/372] current loss 4.55, ppl 94.56, throughput 393.07 samples/s, lr 32.14
[Epoch 126] throughput 27074.09 samples/s
[Epoch 126] time cost 84.12s, valid loss 4.34, valid ppl 76.75, lr 30.00
[Epoch 126] test loss 4.29, test ppl 72.75
[Epoch 127 Batch 200/372] current loss 4.55, ppl 94.57, throughput 395.03 samples/s, lr 31.71
[Epoch 127] throughput 26754.78 samples/s
[Epoch 127] time cost 85.01s, valid loss 4.34, valid ppl 76.71, lr 30.00
[Epoch 127] test loss 4.29, test ppl 72.72
[Epoch 128 Batch 200/372] current loss 4.54, ppl 93.28, throughput 402.22 samples/s, lr 30.00
[Epoch 128] throughput 27264.26 samples/s
[Epoch 128] time cost 83.61s, valid loss 4.34, valid ppl 76.67, lr 30.00
[Epoch 128] test loss 4.29, test ppl 72.68
[Epoch 129 Batch 200/372] current loss 4.55, ppl 94.77, throughput 405.41 samples/s, lr 32.57
[Epoch 129] throughput 27380.88 samples/s
[Epoch 129] time cost 83.23s, valid loss 4.34, valid ppl 76.63, lr 30.00
[Epoch 129] test loss 4.29, test ppl 72.64
[Epoch 130 Batch 200/372] current loss 4.54, ppl 93.75, throughput 396.84 samples/s, lr 31.29
[Epoch 130] throughput 26905.30 samples/s
[Epoch 130] time cost 84.62s, valid loss 4.34, valid ppl 76.59, lr 30.00
[Epoch 130] test loss 4.29, test ppl 72.61
[Epoch 131 Batch 200/372] current loss 4.53, ppl 93.11, throughput 402.14 samples/s, lr 14.57
[Epoch 131] throughput 27349.78 samples/s
[Epoch 131] time cost 83.35s, valid loss 4.34, valid ppl 76.55, lr 30.00
[Epoch 131] test loss 4.28, test ppl 72.58
[Epoch 132 Batch 200/372] current loss 4.54, ppl 93.46, throughput 397.77 samples/s, lr 27.00
[Epoch 132] throughput 26988.67 samples/s
[Epoch 132] time cost 84.39s, valid loss 4.34, valid ppl 76.51, lr 30.00
[Epoch 132] test loss 4.28, test ppl 72.54
[Epoch 133 Batch 200/372] current loss 4.53, ppl 92.66, throughput 402.55 samples/s, lr 29.57
[Epoch 133] throughput 27303.13 samples/s
[Epoch 133] time cost 83.61s, valid loss 4.34, valid ppl 76.47, lr 30.00
[Epoch 133] test loss 4.28, test ppl 72.51
[Epoch 134 Batch 200/372] current loss 4.54, ppl 93.44, throughput 400.05 samples/s, lr 30.43
[Epoch 134] throughput 27141.20 samples/s
[Epoch 134] time cost 83.96s, valid loss 4.34, valid ppl 76.43, lr 30.00
[Epoch 134] test loss 4.28, test ppl 72.47
[Epoch 135 Batch 200/372] current loss 4.53, ppl 93.04, throughput 392.42 samples/s, lr 28.71
[Epoch 135] throughput 27229.15 samples/s
[Epoch 135] time cost 83.73s, valid loss 4.34, valid ppl 76.40, lr 30.00
[Epoch 135] test loss 4.28, test ppl 72.44
[Epoch 136 Batch 200/372] current loss 4.53, ppl 92.41, throughput 405.28 samples/s, lr 27.00
[Epoch 136] throughput 27650.09 samples/s
[Epoch 136] time cost 82.49s, valid loss 4.34, valid ppl 76.36, lr 30.00
[Epoch 136] test loss 4.28, test ppl 72.41
[Epoch 137 Batch 200/372] current loss 4.54, ppl 93.42, throughput 410.59 samples/s, lr 29.57
[Epoch 137] throughput 27709.35 samples/s
[Epoch 137] time cost 82.41s, valid loss 4.34, valid ppl 76.33, lr 30.00
[Epoch 137] test loss 4.28, test ppl 72.37
[Epoch 138 Batch 200/372] current loss 4.52, ppl 91.69, throughput 402.96 samples/s, lr 33.00
[Epoch 138] throughput 27732.98 samples/s
[Epoch 138] time cost 82.31s, valid loss 4.33, valid ppl 76.29, lr 30.00
[Epoch 138] test loss 4.28, test ppl 72.34
[Epoch 139 Batch 200/372] current loss 4.52, ppl 91.59, throughput 405.31 samples/s, lr 29.57
[Epoch 139] throughput 27639.71 samples/s
[Epoch 139] time cost 82.55s, valid loss 4.33, valid ppl 76.26, lr 30.00
[Epoch 139] test loss 4.28, test ppl 72.31
[Epoch 140 Batch 200/372] current loss 4.53, ppl 92.87, throughput 406.54 samples/s, lr 13.71
[Epoch 140] throughput 27980.04 samples/s
[Epoch 140] time cost 81.68s, valid loss 4.33, valid ppl 76.22, lr 30.00
[Epoch 140] test loss 4.28, test ppl 72.28
[Epoch 141 Batch 200/372] current loss 4.52, ppl 91.66, throughput 412.68 samples/s, lr 30.43
[Epoch 141] throughput 27677.44 samples/s
[Epoch 141] time cost 82.44s, valid loss 4.33, valid ppl 76.19, lr 30.00
[Epoch 141] test loss 4.28, test ppl 72.25
[Epoch 142 Batch 200/372] current loss 4.51, ppl 90.96, throughput 394.58 samples/s, lr 27.86
[Epoch 142] throughput 27110.74 samples/s
[Epoch 142] time cost 84.02s, valid loss 4.33, valid ppl 76.16, lr 30.00
[Epoch 142] test loss 4.28, test ppl 72.22
[Epoch 143 Batch 200/372] current loss 4.51, ppl 91.23, throughput 403.30 samples/s, lr 26.14
[Epoch 143] throughput 27209.73 samples/s
[Epoch 143] time cost 83.81s, valid loss 4.33, valid ppl 76.12, lr 30.00
[Epoch 143] test loss 4.28, test ppl 72.19
[Epoch 144 Batch 200/372] current loss 4.51, ppl 91.23, throughput 397.78 samples/s, lr 27.00
[Epoch 144] throughput 27113.64 samples/s
[Epoch 144] time cost 84.06s, valid loss 4.33, valid ppl 76.09, lr 30.00
[Epoch 144] test loss 4.28, test ppl 72.16
[Epoch 145 Batch 200/372] current loss 4.52, ppl 91.73, throughput 402.59 samples/s, lr 32.14
[Epoch 145] throughput 27173.94 samples/s
[Epoch 145] time cost 83.85s, valid loss 4.33, valid ppl 76.06, lr 30.00
[Epoch 145] test loss 4.28, test ppl 72.14
[Epoch 146 Batch 200/372] current loss 4.51, ppl 90.94, throughput 410.96 samples/s, lr 31.29
[Epoch 146] throughput 27490.06 samples/s
[Epoch 146] time cost 82.94s, valid loss 4.33, valid ppl 76.03, lr 30.00
[Epoch 146] test loss 4.28, test ppl 72.11
[Epoch 147 Batch 200/372] current loss 4.52, ppl 91.62, throughput 405.38 samples/s, lr 31.29
[Epoch 147] throughput 27492.38 samples/s
[Epoch 147] time cost 82.99s, valid loss 4.33, valid ppl 75.99, lr 30.00
[Epoch 147] test loss 4.28, test ppl 72.08
[Epoch 148 Batch 200/372] current loss 4.51, ppl 90.50, throughput 395.22 samples/s, lr 29.57
[Epoch 148] throughput 27123.97 samples/s
[Epoch 148] time cost 84.00s, valid loss 4.33, valid ppl 75.96, lr 30.00
[Epoch 148] test loss 4.28, test ppl 72.06
[Epoch 149 Batch 200/372] current loss 4.51, ppl 90.58, throughput 404.02 samples/s, lr 29.14
[Epoch 149] throughput 27207.76 samples/s
[Epoch 149] time cost 83.92s, valid loss 4.33, valid ppl 75.93, lr 30.00
[Epoch 149] test loss 4.28, test ppl 72.03
[Epoch 150 Batch 200/372] current loss 4.53, ppl 92.46, throughput 394.02 samples/s, lr 31.71
[Epoch 150] throughput 27378.43 samples/s
[Epoch 150] time cost 83.32s, valid loss 4.33, valid ppl 75.90, lr 30.00
[Epoch 150] test loss 4.28, test ppl 72.00
[Epoch 151 Batch 200/372] current loss 4.50, ppl 90.41, throughput 415.06 samples/s, lr 31.29
[Epoch 151] throughput 27405.17 samples/s
[Epoch 151] time cost 83.22s, valid loss 4.33, valid ppl 75.87, lr 30.00
[Epoch 151] test loss 4.28, test ppl 71.98
[Epoch 152 Batch 200/372] current loss 4.50, ppl 90.15, throughput 403.33 samples/s, lr 30.43
[Epoch 152] throughput 27559.63 samples/s
[Epoch 152] time cost 82.80s, valid loss 4.33, valid ppl 75.85, lr 30.00
[Epoch 152] test loss 4.28, test ppl 71.95
[Epoch 153 Batch 200/372] current loss 4.50, ppl 89.95, throughput 405.04 samples/s, lr 30.86
[Epoch 153] throughput 27564.47 samples/s
[Epoch 153] time cost 82.80s, valid loss 4.33, valid ppl 75.82, lr 30.00
[Epoch 153] test loss 4.28, test ppl 71.93
[Epoch 154 Batch 200/372] current loss 4.50, ppl 90.07, throughput 398.45 samples/s, lr 33.00
[Epoch 154] throughput 27395.90 samples/s
[Epoch 154] time cost 83.23s, valid loss 4.33, valid ppl 75.79, lr 30.00
[Epoch 154] test loss 4.28, test ppl 71.91
[Epoch 155 Batch 200/372] current loss 4.50, ppl 89.86, throughput 409.15 samples/s, lr 30.86
[Epoch 155] throughput 27385.21 samples/s
[Epoch 155] time cost 83.21s, valid loss 4.33, valid ppl 75.76, lr 30.00
[Epoch 155] test loss 4.28, test ppl 71.88
[Epoch 156 Batch 200/372] current loss 4.50, ppl 90.30, throughput 395.13 samples/s, lr 30.43
[Epoch 156] throughput 27205.60 samples/s
[Epoch 156] time cost 83.76s, valid loss 4.33, valid ppl 75.73, lr 30.00
[Epoch 156] test loss 4.27, test ppl 71.86
[Epoch 157 Batch 200/372] current loss 4.50, ppl 90.20, throughput 400.03 samples/s, lr 27.43
[Epoch 157] throughput 27171.44 samples/s
[Epoch 157] time cost 83.85s, valid loss 4.33, valid ppl 75.70, lr 30.00
[Epoch 157] test loss 4.27, test ppl 71.83
[Epoch 158 Batch 200/372] current loss 4.50, ppl 89.64, throughput 404.46 samples/s, lr 33.43
[Epoch 158] throughput 27265.27 samples/s
[Epoch 158] time cost 83.63s, valid loss 4.33, valid ppl 75.68, lr 30.00
[Epoch 158] test loss 4.27, test ppl 71.81
[Epoch 159 Batch 200/372] current loss 4.49, ppl 89.11, throughput 397.99 samples/s, lr 27.43
[Epoch 159] throughput 27303.98 samples/s
[Epoch 159] time cost 83.51s, valid loss 4.33, valid ppl 75.65, lr 30.00
[Epoch 159] test loss 4.27, test ppl 71.78
[Epoch 160 Batch 200/372] current loss 4.48, ppl 88.17, throughput 403.80 samples/s, lr 27.86
[Epoch 160] throughput 27397.95 samples/s
[Epoch 160] time cost 83.23s, valid loss 4.33, valid ppl 75.62, lr 30.00
[Epoch 160] test loss 4.27, test ppl 71.76
[Epoch 161 Batch 200/372] current loss 4.48, ppl 88.11, throughput 398.93 samples/s, lr 30.43
[Epoch 161] throughput 27237.45 samples/s
[Epoch 161] time cost 83.65s, valid loss 4.33, valid ppl 75.59, lr 30.00
[Epoch 161] test loss 4.27, test ppl 71.73
[Epoch 162 Batch 200/372] current loss 4.49, ppl 88.78, throughput 394.51 samples/s, lr 29.14
[Epoch 162] throughput 27008.46 samples/s
[Epoch 162] time cost 84.42s, valid loss 4.33, valid ppl 75.57, lr 30.00
[Epoch 162] test loss 4.27, test ppl 71.71
[Epoch 163 Batch 200/372] current loss 4.49, ppl 88.80, throughput 402.74 samples/s, lr 28.71
[Epoch 163] throughput 27448.41 samples/s
[Epoch 163] time cost 83.01s, valid loss 4.32, valid ppl 75.54, lr 30.00
[Epoch 163] test loss 4.27, test ppl 71.69
[Epoch 164 Batch 200/372] current loss 4.50, ppl 89.71, throughput 398.47 samples/s, lr 30.43
[Epoch 164] throughput 26965.67 samples/s
[Epoch 164] time cost 84.47s, valid loss 4.32, valid ppl 75.51, lr 30.00
[Epoch 164] test loss 4.27, test ppl 71.67
[Epoch 165 Batch 200/372] current loss 4.49, ppl 88.89, throughput 395.28 samples/s, lr 29.14
[Epoch 165] throughput 27126.82 samples/s
[Epoch 165] time cost 83.91s, valid loss 4.32, valid ppl 75.49, lr 30.00
[Epoch 165] test loss 4.27, test ppl 71.64
[Epoch 166 Batch 200/372] current loss 4.49, ppl 88.78, throughput 395.60 samples/s, lr 29.57
[Epoch 166] throughput 27565.97 samples/s
[Epoch 166] time cost 82.73s, valid loss 4.32, valid ppl 75.46, lr 30.00
[Epoch 166] test loss 4.27, test ppl 71.62
[Epoch 167 Batch 200/372] current loss 4.48, ppl 88.28, throughput 393.92 samples/s, lr 34.71
[Epoch 167] throughput 27118.19 samples/s
[Epoch 167] time cost 83.97s, valid loss 4.32, valid ppl 75.44, lr 30.00
[Epoch 167] test loss 4.27, test ppl 71.60
[Epoch 168 Batch 200/372] current loss 4.48, ppl 88.53, throughput 399.19 samples/s, lr 29.57
[Epoch 168] throughput 27068.53 samples/s
[Epoch 168] time cost 84.09s, valid loss 4.32, valid ppl 75.41, lr 30.00
[Epoch 168] test loss 4.27, test ppl 71.58
[Epoch 169 Batch 200/372] current loss 4.49, ppl 89.02, throughput 398.23 samples/s, lr 28.71
[Epoch 169] throughput 27492.33 samples/s
[Epoch 169] time cost 82.94s, valid loss 4.32, valid ppl 75.39, lr 30.00
[Epoch 169] test loss 4.27, test ppl 71.56
[Epoch 170 Batch 200/372] current loss 4.48, ppl 88.04, throughput 395.03 samples/s, lr 30.00
[Epoch 170] throughput 27139.57 samples/s
[Epoch 170] time cost 83.93s, valid loss 4.32, valid ppl 75.36, lr 30.00
[Epoch 170] test loss 4.27, test ppl 71.54
[Epoch 171 Batch 200/372] current loss 4.48, ppl 88.52, throughput 396.14 samples/s, lr 32.14
[Epoch 171] throughput 27214.95 samples/s
[Epoch 171] time cost 83.70s, valid loss 4.32, valid ppl 75.34, lr 30.00
[Epoch 171] test loss 4.27, test ppl 71.52
[Epoch 172 Batch 200/372] current loss 4.46, ppl 86.55, throughput 395.02 samples/s, lr 29.14
[Epoch 172] throughput 26935.67 samples/s
[Epoch 172] time cost 84.57s, valid loss 4.32, valid ppl 75.31, lr 30.00
[Epoch 172] test loss 4.27, test ppl 71.50
[Epoch 173 Batch 200/372] current loss 4.48, ppl 87.84, throughput 390.93 samples/s, lr 31.29
[Epoch 173] throughput 27165.39 samples/s
[Epoch 173] time cost 83.85s, valid loss 4.32, valid ppl 75.29, lr 30.00
[Epoch 173] test loss 4.27, test ppl 71.48
[Epoch 174 Batch 200/372] current loss 4.48, ppl 88.38, throughput 403.58 samples/s, lr 29.14
[Epoch 174] throughput 27546.52 samples/s
[Epoch 174] time cost 82.88s, valid loss 4.32, valid ppl 75.26, lr 30.00
[Epoch 174] test loss 4.27, test ppl 71.46
[Epoch 175 Batch 200/372] current loss 4.48, ppl 88.61, throughput 400.21 samples/s, lr 33.00
[Epoch 175] throughput 27229.15 samples/s
[Epoch 175] time cost 83.65s, valid loss 4.32, valid ppl 75.24, lr 30.00
[Epoch 175] test loss 4.27, test ppl 71.44
[Epoch 176 Batch 200/372] current loss 4.46, ppl 86.47, throughput 397.55 samples/s, lr 35.14
[Epoch 176] throughput 27078.63 samples/s
[Epoch 176] time cost 84.12s, valid loss 4.32, valid ppl 75.21, lr 30.00
[Epoch 176] test loss 4.27, test ppl 71.42
[Epoch 177 Batch 200/372] current loss 4.46, ppl 86.84, throughput 403.92 samples/s, lr 28.29
[Epoch 177] throughput 27459.02 samples/s
[Epoch 177] time cost 82.96s, valid loss 4.32, valid ppl 75.19, lr 30.00
[Epoch 177] test loss 4.27, test ppl 71.40
[Epoch 178 Batch 200/372] current loss 4.46, ppl 86.41, throughput 394.64 samples/s, lr 28.29
[Epoch 178] throughput 27016.64 samples/s
[Epoch 178] time cost 84.23s, valid loss 4.32, valid ppl 75.17, lr 30.00
[Epoch 178] test loss 4.27, test ppl 71.38
[Epoch 179 Batch 200/372] current loss 4.47, ppl 87.11, throughput 399.25 samples/s, lr 30.00
[Epoch 179] throughput 27027.39 samples/s
[Epoch 179] time cost 84.24s, valid loss 4.32, valid ppl 75.14, lr 30.00
[Epoch 179] test loss 4.27, test ppl 71.35
[Epoch 180 Batch 200/372] current loss 4.48, ppl 87.99, throughput 399.02 samples/s, lr 29.57
[Epoch 180] throughput 27195.42 samples/s
[Epoch 180] time cost 83.74s, valid loss 4.32, valid ppl 75.12, lr 30.00
[Epoch 180] test loss 4.27, test ppl 71.33
[Epoch 181 Batch 200/372] current loss 4.47, ppl 87.36, throughput 410.59 samples/s, lr 30.43
[Epoch 181] throughput 27314.89 samples/s
[Epoch 181] time cost 83.48s, valid loss 4.32, valid ppl 75.10, lr 30.00
[Epoch 181] test loss 4.27, test ppl 71.31
[Epoch 182 Batch 200/372] current loss 4.47, ppl 87.01, throughput 403.44 samples/s, lr 28.71
[Epoch 182] throughput 27645.99 samples/s
[Epoch 182] time cost 82.48s, valid loss 4.32, valid ppl 75.07, lr 30.00
[Epoch 182] test loss 4.27, test ppl 71.30
[Epoch 183 Batch 200/372] current loss 4.48, ppl 88.02, throughput 402.73 samples/s, lr 27.86
[Epoch 183] throughput 27634.59 samples/s
[Epoch 183] time cost 82.53s, valid loss 4.32, valid ppl 75.05, lr 30.00
[Epoch 183] test loss 4.27, test ppl 71.28
[Epoch 184 Batch 200/372] current loss 4.47, ppl 87.58, throughput 409.66 samples/s, lr 30.43
[Epoch 184] throughput 27760.38 samples/s
[Epoch 184] time cost 82.25s, valid loss 4.32, valid ppl 75.03, lr 30.00
[Epoch 184] test loss 4.27, test ppl 71.26
[Epoch 185 Batch 200/372] current loss 4.47, ppl 86.93, throughput 404.68 samples/s, lr 33.00
[Epoch 185] throughput 27517.99 samples/s
[Epoch 185] time cost 82.84s, valid loss 4.32, valid ppl 75.00, lr 30.00
[Epoch 185] test loss 4.27, test ppl 71.24
[Epoch 186 Batch 200/372] current loss 4.47, ppl 87.17, throughput 409.79 samples/s, lr 30.86
[Epoch 186] throughput 27749.25 samples/s
[Epoch 186] time cost 82.17s, valid loss 4.32, valid ppl 74.98, lr 30.00
[Epoch 186] test loss 4.27, test ppl 71.22
[Epoch 187 Batch 200/372] current loss 4.47, ppl 87.55, throughput 403.36 samples/s, lr 15.43
[Epoch 187] throughput 27177.84 samples/s
[Epoch 187] time cost 83.79s, valid loss 4.32, valid ppl 74.96, lr 30.00
[Epoch 187] test loss 4.27, test ppl 71.20
[Epoch 188 Batch 200/372] current loss 4.44, ppl 84.77, throughput 409.04 samples/s, lr 28.29
[Epoch 188] throughput 27656.31 samples/s
[Epoch 188] time cost 82.47s, valid loss 4.32, valid ppl 74.94, lr 30.00
[Epoch 188] test loss 4.27, test ppl 71.18
[Epoch 189 Batch 200/372] current loss 4.47, ppl 87.10, throughput 391.62 samples/s, lr 27.43
[Epoch 189] throughput 27204.49 samples/s
[Epoch 189] time cost 83.71s, valid loss 4.32, valid ppl 74.92, lr 30.00
[Epoch 189] test loss 4.26, test ppl 71.16
[Epoch 190 Batch 200/372] current loss 4.45, ppl 85.22, throughput 408.51 samples/s, lr 28.29
[Epoch 190] throughput 27446.09 samples/s
[Epoch 190] time cost 83.25s, valid loss 4.32, valid ppl 74.90, lr 30.00
[Epoch 190] test loss 4.26, test ppl 71.14
[Epoch 191 Batch 200/372] current loss 4.45, ppl 85.63, throughput 407.86 samples/s, lr 28.29
[Epoch 191] throughput 27496.12 samples/s
[Epoch 191] time cost 82.90s, valid loss 4.32, valid ppl 74.87, lr 30.00
[Epoch 191] test loss 4.26, test ppl 71.13
[Epoch 192 Batch 200/372] current loss 4.47, ppl 87.08, throughput 414.06 samples/s, lr 30.86
[Epoch 192] throughput 27669.08 samples/s
[Epoch 192] time cost 82.43s, valid loss 4.32, valid ppl 74.85, lr 30.00
[Epoch 192] test loss 4.26, test ppl 71.11
[Epoch 193 Batch 200/372] current loss 4.45, ppl 86.03, throughput 407.90 samples/s, lr 31.29
[Epoch 193] throughput 27436.41 samples/s
[Epoch 193] time cost 83.09s, valid loss 4.32, valid ppl 74.83, lr 30.00
[Epoch 193] test loss 4.26, test ppl 71.09
[Epoch 194 Batch 200/372] current loss 4.44, ppl 85.03, throughput 408.76 samples/s, lr 34.71
[Epoch 194] throughput 27788.74 samples/s
[Epoch 194] time cost 82.03s, valid loss 4.31, valid ppl 74.81, lr 30.00
[Epoch 194] test loss 4.26, test ppl 71.07
[Epoch 195 Batch 200/372] current loss 4.45, ppl 85.94, throughput 407.11 samples/s, lr 28.71
[Epoch 195] throughput 27663.72 samples/s
[Epoch 195] time cost 82.49s, valid loss 4.31, valid ppl 74.79, lr 30.00
[Epoch 195] test loss 4.26, test ppl 71.06
[Epoch 196 Batch 200/372] current loss 4.46, ppl 86.45, throughput 406.84 samples/s, lr 30.00
[Epoch 196] throughput 27546.40 samples/s
[Epoch 196] time cost 82.80s, valid loss 4.31, valid ppl 74.77, lr 30.00
[Epoch 196] test loss 4.26, test ppl 71.04
[Epoch 197 Batch 200/372] current loss 4.45, ppl 86.03, throughput 403.65 samples/s, lr 15.43
[Epoch 197] throughput 27571.09 samples/s
[Epoch 197] time cost 82.72s, valid loss 4.31, valid ppl 74.75, lr 30.00
[Epoch 197] test loss 4.26, test ppl 71.02
[Epoch 198 Batch 200/372] current loss 4.47, ppl 87.10, throughput 409.34 samples/s, lr 30.43
[Epoch 198] throughput 27764.88 samples/s
[Epoch 198] time cost 82.30s, valid loss 4.31, valid ppl 74.73, lr 30.00
[Epoch 198] test loss 4.26, test ppl 71.00
[Epoch 199 Batch 200/372] current loss 4.45, ppl 85.56, throughput 406.29 samples/s, lr 30.00
[Epoch 199] throughput 27616.48 samples/s
[Epoch 199] time cost 82.55s, valid loss 4.31, valid ppl 74.71, lr 30.00
[Epoch 199] test loss 4.26, test ppl 70.99
[Epoch 200 Batch 200/372] current loss 4.45, ppl 85.65, throughput 409.13 samples/s, lr 30.86
[Epoch 200] throughput 27609.83 samples/s
[Epoch 200] time cost 82.59s, valid loss 4.31, valid ppl 74.69, lr 30.00
[Epoch 200] test loss 4.26, test ppl 70.97
[Epoch 201 Batch 200/372] current loss 4.44, ppl 84.91, throughput 406.30 samples/s, lr 30.00
[Epoch 201] throughput 27573.07 samples/s
[Epoch 201] time cost 82.68s, valid loss 4.31, valid ppl 74.67, lr 30.00
[Epoch 201] test loss 4.26, test ppl 70.95
[Epoch 202 Batch 200/372] current loss 4.46, ppl 86.18, throughput 407.30 samples/s, lr 30.00
[Epoch 202] throughput 27643.03 samples/s
[Epoch 202] time cost 82.60s, valid loss 4.31, valid ppl 74.65, lr 30.00
[Epoch 202] test loss 4.26, test ppl 70.94
[Epoch 203 Batch 200/372] current loss 4.45, ppl 85.39, throughput 405.13 samples/s, lr 28.29
[Epoch 203] throughput 27560.48 samples/s
[Epoch 203] time cost 82.62s, valid loss 4.31, valid ppl 74.63, lr 30.00
[Epoch 203] test loss 4.26, test ppl 70.92
[Epoch 204 Batch 200/372] current loss 4.45, ppl 85.76, throughput 402.55 samples/s, lr 28.29
[Epoch 204] throughput 27477.28 samples/s
[Epoch 204] time cost 82.92s, valid loss 4.31, valid ppl 74.61, lr 30.00
[Epoch 204] test loss 4.26, test ppl 70.91
[Epoch 205 Batch 200/372] current loss 4.45, ppl 85.92, throughput 402.70 samples/s, lr 32.14
[Epoch 205] throughput 27517.07 samples/s
[Epoch 205] time cost 82.83s, valid loss 4.31, valid ppl 74.59, lr 30.00
[Epoch 205] test loss 4.26, test ppl 70.89
[Epoch 206 Batch 200/372] current loss 4.46, ppl 86.10, throughput 401.54 samples/s, lr 28.29
[Epoch 206] throughput 27406.47 samples/s
[Epoch 206] time cost 83.10s, valid loss 4.31, valid ppl 74.58, lr 30.00
[Epoch 206] test loss 4.26, test ppl 70.88
[Epoch 207 Batch 200/372] current loss 4.45, ppl 85.51, throughput 399.48 samples/s, lr 30.43
[Epoch 207] throughput 27545.56 samples/s
[Epoch 207] time cost 82.72s, valid loss 4.31, valid ppl 74.56, lr 30.00
[Epoch 207] test loss 4.26, test ppl 70.86
[Epoch 208 Batch 200/372] current loss 4.45, ppl 85.94, throughput 404.77 samples/s, lr 33.43
[Epoch 208] throughput 27281.13 samples/s
[Epoch 208] time cost 83.51s, valid loss 4.31, valid ppl 74.54, lr 30.00
[Epoch 208] test loss 4.26, test ppl 70.85
[Epoch 209 Batch 200/372] current loss 4.44, ppl 84.43, throughput 404.96 samples/s, lr 34.71
[Epoch 209] throughput 27470.99 samples/s
[Epoch 209] time cost 82.93s, valid loss 4.31, valid ppl 74.52, lr 30.00
[Epoch 209] test loss 4.26, test ppl 70.83
[Epoch 210 Batch 200/372] current loss 4.45, ppl 85.87, throughput 402.33 samples/s, lr 31.29
[Epoch 210] throughput 27457.94 samples/s
[Epoch 210] time cost 82.94s, valid loss 4.31, valid ppl 74.50, lr 30.00
[Epoch 210] test loss 4.26, test ppl 70.82
[Epoch 211 Batch 200/372] current loss 4.45, ppl 85.81, throughput 410.75 samples/s, lr 28.29
[Epoch 211] throughput 27630.89 samples/s
[Epoch 211] time cost 82.49s, valid loss 4.31, valid ppl 74.48, lr 30.00
[Epoch 211] test loss 4.26, test ppl 70.80
[Epoch 212 Batch 200/372] current loss 4.45, ppl 85.49, throughput 404.62 samples/s, lr 29.57
[Epoch 212] throughput 27646.39 samples/s
[Epoch 212] time cost 82.50s, valid loss 4.31, valid ppl 74.47, lr 30.00
[Epoch 212] test loss 4.26, test ppl 70.79
[Epoch 213 Batch 200/372] current loss 4.44, ppl 84.45, throughput 404.74 samples/s, lr 13.29
[Epoch 213] throughput 27544.15 samples/s
[Epoch 213] time cost 82.78s, valid loss 4.31, valid ppl 74.45, lr 30.00
[Epoch 213] test loss 4.26, test ppl 70.77
[Epoch 214 Batch 200/372] current loss 4.45, ppl 85.85, throughput 397.14 samples/s, lr 31.29
[Epoch 214] throughput 27351.94 samples/s
[Epoch 214] time cost 83.30s, valid loss 4.31, valid ppl 74.43, lr 30.00
[Epoch 214] test loss 4.26, test ppl 70.75
[Epoch 215 Batch 200/372] current loss 4.44, ppl 84.49, throughput 396.59 samples/s, lr 29.14
[Epoch 215] throughput 27324.09 samples/s
[Epoch 215] time cost 83.31s, valid loss 4.31, valid ppl 74.41, lr 30.00
[Epoch 215] test loss 4.26, test ppl 70.74
[Epoch 216 Batch 200/372] current loss 4.45, ppl 85.21, throughput 410.10 samples/s, lr 32.14
[Epoch 216] throughput 27784.50 samples/s
[Epoch 216] time cost 82.23s, valid loss 4.31, valid ppl 74.40, lr 30.00
[Epoch 216] test loss 4.26, test ppl 70.72
[Epoch 217 Batch 200/372] current loss 4.43, ppl 83.85, throughput 404.94 samples/s, lr 28.71
[Epoch 217] throughput 27815.28 samples/s
[Epoch 217] time cost 82.02s, valid loss 4.31, valid ppl 74.38, lr 30.00
[Epoch 217] test loss 4.26, test ppl 70.71
[Epoch 218 Batch 200/372] current loss 4.42, ppl 83.16, throughput 407.01 samples/s, lr 30.86
[Epoch 218] throughput 27634.95 samples/s
[Epoch 218] time cost 82.46s, valid loss 4.31, valid ppl 74.36, lr 30.00
[Epoch 218] test loss 4.26, test ppl 70.69
[Epoch 219 Batch 200/372] current loss 4.44, ppl 84.59, throughput 410.84 samples/s, lr 29.14
[Epoch 219] throughput 27487.45 samples/s
[Epoch 219] time cost 82.89s, valid loss 4.31, valid ppl 74.34, lr 30.00
[Epoch 219] test loss 4.26, test ppl 70.68
[Epoch 220 Batch 200/372] current loss 4.44, ppl 84.44, throughput 397.58 samples/s, lr 29.57
[Epoch 220] throughput 27467.03 samples/s
[Epoch 220] time cost 82.95s, valid loss 4.31, valid ppl 74.32, lr 30.00
[Epoch 220] test loss 4.26, test ppl 70.66
[Epoch 221 Batch 200/372] current loss 4.43, ppl 84.03, throughput 404.66 samples/s, lr 28.71
[Epoch 221] throughput 27450.52 samples/s
[Epoch 221] time cost 83.02s, valid loss 4.31, valid ppl 74.31, lr 30.00
[Epoch 221] test loss 4.26, test ppl 70.65
[Epoch 222 Batch 200/372] current loss 4.44, ppl 84.56, throughput 403.98 samples/s, lr 29.57
[Epoch 222] throughput 27764.46 samples/s
[Epoch 222] time cost 82.11s, valid loss 4.31, valid ppl 74.29, lr 30.00
[Epoch 222] test loss 4.26, test ppl 70.63
[Epoch 223 Batch 200/372] current loss 4.43, ppl 84.12, throughput 400.06 samples/s, lr 30.00
[Epoch 223] throughput 27678.05 samples/s
[Epoch 223] time cost 82.35s, valid loss 4.31, valid ppl 74.27, lr 30.00
[Epoch 223] test loss 4.26, test ppl 70.62
[Epoch 224 Batch 200/372] current loss 4.43, ppl 83.70, throughput 407.91 samples/s, lr 28.71
[Epoch 224] throughput 27563.47 samples/s
[Epoch 224] time cost 82.72s, valid loss 4.31, valid ppl 74.26, lr 30.00
[Epoch 224] test loss 4.26, test ppl 70.61
[Epoch 225 Batch 200/372] current loss 4.43, ppl 83.84, throughput 405.59 samples/s, lr 26.57
[Epoch 225] throughput 27700.91 samples/s
[Epoch 225] time cost 82.33s, valid loss 4.31, valid ppl 74.24, lr 30.00
[Epoch 225] test loss 4.26, test ppl 70.59
[Epoch 226 Batch 200/372] current loss 4.44, ppl 84.41, throughput 407.69 samples/s, lr 28.71
[Epoch 226] throughput 27701.66 samples/s
[Epoch 226] time cost 82.30s, valid loss 4.31, valid ppl 74.22, lr 30.00
[Epoch 226] test loss 4.26, test ppl 70.58
[Epoch 227 Batch 200/372] current loss 4.43, ppl 83.63, throughput 401.13 samples/s, lr 28.29
[Epoch 227] throughput 27473.19 samples/s
[Epoch 227] time cost 82.97s, valid loss 4.31, valid ppl 74.21, lr 30.00
[Epoch 227] test loss 4.26, test ppl 70.56
[Epoch 228 Batch 200/372] current loss 4.43, ppl 84.00, throughput 403.45 samples/s, lr 29.14
[Epoch 228] throughput 27254.16 samples/s
[Epoch 228] time cost 83.58s, valid loss 4.31, valid ppl 74.19, lr 30.00
[Epoch 228] test loss 4.26, test ppl 70.55
[Epoch 229 Batch 200/372] current loss 4.41, ppl 82.30, throughput 409.42 samples/s, lr 28.71
[Epoch 229] throughput 27496.05 samples/s
[Epoch 229] time cost 82.87s, valid loss 4.31, valid ppl 74.18, lr 30.00
[Epoch 229] test loss 4.26, test ppl 70.54
[Epoch 230 Batch 200/372] current loss 4.43, ppl 83.84, throughput 395.11 samples/s, lr 29.14
[Epoch 230] throughput 27338.18 samples/s
[Epoch 230] time cost 83.33s, valid loss 4.31, valid ppl 74.16, lr 30.00
[Epoch 230] test loss 4.26, test ppl 70.52
[Epoch 231 Batch 200/372] current loss 4.44, ppl 84.58, throughput 394.25 samples/s, lr 27.43
[Epoch 231] throughput 27146.94 samples/s
[Epoch 231] time cost 83.79s, valid loss 4.31, valid ppl 74.14, lr 30.00
[Epoch 231] test loss 4.26, test ppl 70.51
[Epoch 232 Batch 200/372] current loss 4.42, ppl 83.47, throughput 405.11 samples/s, lr 34.71
[Epoch 232] throughput 27429.10 samples/s
[Epoch 232] time cost 83.04s, valid loss 4.31, valid ppl 74.13, lr 30.00
[Epoch 232] test loss 4.26, test ppl 70.50
[Epoch 233 Batch 200/372] current loss 4.43, ppl 84.24, throughput 400.09 samples/s, lr 30.86
[Epoch 233] throughput 27110.02 samples/s
[Epoch 233] time cost 83.94s, valid loss 4.31, valid ppl 74.11, lr 30.00
[Epoch 233] test loss 4.26, test ppl 70.49
[Epoch 234 Batch 200/372] current loss 4.42, ppl 83.09, throughput 406.05 samples/s, lr 30.00
[Epoch 234] throughput 27323.97 samples/s
[Epoch 234] time cost 83.30s, valid loss 4.31, valid ppl 74.10, lr 30.00
[Epoch 234] test loss 4.26, test ppl 70.47
[Epoch 235 Batch 200/372] current loss 4.44, ppl 84.62, throughput 397.32 samples/s, lr 27.86
[Epoch 235] throughput 27210.66 samples/s
[Epoch 235] time cost 83.78s, valid loss 4.31, valid ppl 74.08, lr 30.00
[Epoch 235] test loss 4.26, test ppl 70.46
[Epoch 236 Batch 200/372] current loss 4.40, ppl 81.27, throughput 397.99 samples/s, lr 28.29
[Epoch 236] throughput 27083.09 samples/s
[Epoch 236] time cost 84.01s, valid loss 4.30, valid ppl 74.06, lr 30.00
[Epoch 236] test loss 4.25, test ppl 70.45
[Epoch 237 Batch 200/372] current loss 4.42, ppl 82.99, throughput 398.61 samples/s, lr 25.29
[Epoch 237] throughput 27149.89 samples/s
[Epoch 237] time cost 83.79s, valid loss 4.30, valid ppl 74.05, lr 30.00
[Epoch 237] test loss 4.25, test ppl 70.43
[Epoch 238 Batch 200/372] current loss 4.43, ppl 83.67, throughput 396.44 samples/s, lr 27.43
[Epoch 238] throughput 27288.32 samples/s
[Epoch 238] time cost 83.45s, valid loss 4.30, valid ppl 74.03, lr 30.00
[Epoch 238] test loss 4.25, test ppl 70.42
[Epoch 239 Batch 200/372] current loss 4.42, ppl 82.91, throughput 399.28 samples/s, lr 29.57
[Epoch 239] throughput 27172.74 samples/s
[Epoch 239] time cost 83.80s, valid loss 4.30, valid ppl 74.02, lr 30.00
[Epoch 239] test loss 4.25, test ppl 70.41
[Epoch 240 Batch 200/372] current loss 4.41, ppl 82.64, throughput 395.49 samples/s, lr 33.00
[Epoch 240] throughput 27170.79 samples/s
[Epoch 240] time cost 83.76s, valid loss 4.30, valid ppl 74.00, lr 30.00
[Epoch 240] test loss 4.25, test ppl 70.39
[Epoch 241 Batch 200/372] current loss 4.41, ppl 81.91, throughput 399.64 samples/s, lr 28.29
[Epoch 241] throughput 27092.95 samples/s
[Epoch 241] time cost 84.02s, valid loss 4.30, valid ppl 73.99, lr 30.00
[Epoch 241] test loss 4.25, test ppl 70.38
[Epoch 242 Batch 200/372] current loss 4.41, ppl 82.18, throughput 404.78 samples/s, lr 30.00
[Epoch 242] throughput 27617.68 samples/s
[Epoch 242] time cost 82.52s, valid loss 4.30, valid ppl 73.98, lr 30.00
[Epoch 242] test loss 4.25, test ppl 70.37
[Epoch 243 Batch 200/372] current loss 4.42, ppl 83.04, throughput 397.11 samples/s, lr 27.43
[Epoch 243] throughput 27193.15 samples/s
[Epoch 243] time cost 83.72s, valid loss 4.30, valid ppl 73.96, lr 30.00
[Epoch 243] test loss 4.25, test ppl 70.36
[Epoch 244 Batch 200/372] current loss 4.42, ppl 83.46, throughput 403.71 samples/s, lr 31.29
[Epoch 244] throughput 27283.62 samples/s
[Epoch 244] time cost 83.47s, valid loss 4.30, valid ppl 73.95, lr 30.00
[Epoch 244] test loss 4.25, test ppl 70.35
[Epoch 245 Batch 200/372] current loss 4.42, ppl 82.78, throughput 397.51 samples/s, lr 30.86
[Epoch 245] throughput 27109.35 samples/s
[Epoch 245] time cost 83.94s, valid loss 4.30, valid ppl 73.93, lr 30.00
[Epoch 245] test loss 4.25, test ppl 70.33
[Epoch 246 Batch 200/372] current loss 4.41, ppl 82.63, throughput 395.48 samples/s, lr 31.71
[Epoch 246] throughput 27163.51 samples/s
[Epoch 246] time cost 83.84s, valid loss 4.30, valid ppl 73.92, lr 30.00
[Epoch 246] test loss 4.25, test ppl 70.32
[Epoch 247 Batch 200/372] current loss 4.40, ppl 81.75, throughput 409.69 samples/s, lr 30.86
[Epoch 247] throughput 27486.74 samples/s
[Epoch 247] time cost 82.89s, valid loss 4.30, valid ppl 73.91, lr 30.00
[Epoch 247] test loss 4.25, test ppl 70.31
[Epoch 248 Batch 200/372] current loss 4.41, ppl 82.19, throughput 404.04 samples/s, lr 29.57
[Epoch 248] throughput 26861.76 samples/s
[Epoch 248] time cost 84.68s, valid loss 4.30, valid ppl 73.89, lr 30.00
[Epoch 248] test loss 4.25, test ppl 70.30
[Epoch 249 Batch 200/372] current loss 4.42, ppl 83.35, throughput 406.14 samples/s, lr 29.57
[Epoch 249] throughput 27259.46 samples/s
[Epoch 249] time cost 83.51s, valid loss 4.30, valid ppl 73.88, lr 30.00