[Model] Add MASTER (open-mmlab#807)

* fix open-mmlab#794: add MASTER * fix conflict add MASTER * fix conflict add MASTER * fix conflict add MASTER * fix conflict add MASTER * fix conflict add MASTER * fix conflict add MASTER * fix conflict add MASTER * Fix linting * after git rebase main * after git rebase main * fix conflict add MASTER * fix conflict add MASTER * after git rebase main * fix conflict add MASTER * fix conflict add MASTER * fix conflict add MASTER * after git rebase main * add GCAModule to plugins * coexist master and master_old * fix merge mmocr 0.5.0 conflict * fix lint error * update * [fix] remove remains in __init__ * [update] update code in review * update readme for master * Add docstr to MasterDecoder, refined MasterDecoder, remove MASTERLoss * Unify the output length of MasterDecoder in train and test mode; add test for it, remove MasterLoss * update readme * update * update metafile,README,demo/README,config,ocr.py * Update mmocr/utils/ocr.py * update Co-authored-by: gaotongxiao <gaotongxiao@gmail.com> Co-authored-by: Mountchicken <mountchicken@outlook.com>
gaotongxiao · May 5, 2022 · e8e040c · e8e040c
1 parent 9fffd12
commit e8e040c
Show file tree

Hide file tree

Showing 22 changed files with 768 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -81,6 +81,7 @@ Supported algorithms:
 
 - [x] [ABINet](configs/textrecog/abinet/README.md) (CVPR'2021)
 - [x] [CRNN](configs/textrecog/crnn/README.md) (TPAMI'2016)
+- [x] [MASTER](configs/textrecog/master/README.md) (PR'2021)
 - [x] [NRTR](configs/textrecog/nrtr/README.md) (ICDAR'2019)
 - [x] [RobustScanner](configs/textrecog/robust_scanner/README.md) (ECCV'2020)
 - [x] [SAR](configs/textrecog/sar/README.md) (AAAI'2019)

diff --git a/README_zh-CN.md b/README_zh-CN.md
@@ -81,6 +81,7 @@ MMOCR 是基于 PyTorch 和 mmdetection 的开源工具箱，专注于文本检
 
 - [x] [ABINet](configs/textrecog/abinet/README.md) (CVPR'2021)
 - [x] [CRNN](configs/textrecog/crnn/README.md) (TPAMI'2016)
+- [x] [MASTER](configs/textrecog/master/README.md) (PR'2021)
 - [x] [NRTR](configs/textrecog/nrtr/README.md) (ICDAR'2019)
 - [x] [RobustScanner](configs/textrecog/robust_scanner/README.md) (ECCV'2020)
 - [x] [SAR](configs/textrecog/sar/README.md) (AAAI'2019)

diff --git a/configs/_base_/recog_datasets/ST_SA_MJ_train.py b/configs/_base_/recog_datasets/ST_SA_MJ_train.py
@@ -0,0 +1,41 @@
+# Text Recognition Training set, including:
+# Synthetic Datasets: Syn90k, SynthText, SynthText_Add
+
+train_root = 'data/mixture'
+
+train_img_prefix1 = f'{train_root}/Syn90k/mnt/ramdisk/max/90kDICT32px'
+train_ann_file1 = f'{train_root}/Syn90k/label.lmdb'
+
+train1 = dict(  # Syn90k entry; also the template copied for train2/train3
+    type='OCRDataset',
+    img_prefix=train_img_prefix1,
+    ann_file=train_ann_file1,
+    loader=dict(  # nested dict: shallow copies of train1 share this object
+        type='AnnFileLoader',
+        repeat=1,
+        file_format='lmdb',  # matches the .lmdb annotation file above
+        parser=dict(
+            type='LineStrParser',
+            keys=['filename', 'text'],
+            keys_idx=[0, 1],
+            separator=' ')),
+    pipeline=None,  # no per-dataset pipeline is set in this file
+    test_mode=False)
+
+train_img_prefix2 = f'{train_root}/SynthText/' + \
+    'synthtext/SynthText_patch_horizontal'
+train_ann_file2 = f'{train_root}/SynthText/label.lmdb'
+
+train_img_prefix3 = f'{train_root}/SynthText_Add'
+train_ann_file3 = f'{train_root}/SynthText_Add/label.txt'
+
+train2 = {key: value for key, value in train1.items()}
+train2['img_prefix'] = train_img_prefix2
+train2['ann_file'] = train_ann_file2
+
+train3 = {key: value for key, value in train1.items()}
+train3['img_prefix'] = train_img_prefix3
+train3['ann_file'] = train_ann_file3
+train3['loader']['file_format'] = 'txt'
+
+train_list = [train1, train2, train3]
diff --git a/configs/_base_/recog_models/master.py b/configs/_base_/recog_models/master.py
@@ -0,0 +1,61 @@
+label_convertor = dict(  # shared label-convertor settings, reused by `model`
+    type='AttnConvertor', dict_type='DICT90', with_unknown=True)
+
+model = dict(
+    type='MASTER',
+    backbone=dict(
+        type='ResNet',
+        in_channels=3,
+        stem_channels=[64, 128],
+        block_cfgs=dict(
+            type='BasicBlock',
+            plugins=dict(  # GCAModule plugin, positioned 'after_conv2'
+                cfg=dict(
+                    type='GCAModule',
+                    ratio=0.0625,
+                    headers=1,
+                    pooling_type='att',
+                    is_att_scale=False,
+                    fusion_type='channel_add'),
+                position='after_conv2')),
+        arch_layers=[1, 2, 5, 3],  # four entries; presumably blocks per stage
+        arch_channels=[256, 256, 512, 512],
+        strides=[1, 1, 1, 1],  # all 1; the Maxpool2d plugins below use stride 2
+        plugins=[
+            dict(
+                cfg=dict(type='Maxpool2d', kernel_size=2, stride=(2, 2)),
+                stages=(True, True, False, False),  # first two flags only
+                position='before_stage'),
+            dict(
+                cfg=dict(type='Maxpool2d', kernel_size=(2, 1), stride=(2, 1)),
+                stages=(False, False, True, False),  # third flag only
+                position='before_stage'),
+            dict(
+                cfg=dict(
+                    type='ConvModule',
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    norm_cfg=dict(type='BN'),
+                    act_cfg=dict(type='ReLU')),
+                stages=(True, True, True, True),  # all four flags set
+                position='after_stage')
+        ],
+        init_cfg=[
+            dict(type='Kaiming', layer='Conv2d'),
+            dict(type='Constant', val=1, layer='BatchNorm2d'),
+        ]),
+    encoder=None,  # no encoder is configured
+    decoder=dict(
+        type='MasterDecoder',
+        d_model=512,
+        n_head=8,
+        attn_drop=0.,
+        ffn_drop=0.,
+        d_inner=2048,
+        n_layers=3,
+        feat_pe_drop=0.2,
+        feat_size=6 * 40),  # TODO confirm: looks like H*W of backbone output
+    loss=dict(type='TFLoss', reduction='mean'),
+    label_convertor=label_convertor,  # the module-level dict defined above
+    max_seq_len=30)
diff --git a/configs/_base_/recog_pipelines/master_pipeline.py b/configs/_base_/recog_pipelines/master_pipeline.py
@@ -0,0 +1,42 @@
+img_norm_cfg = dict(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])  # used by both pipelines
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='ResizeOCR',
+        height=48,
+        min_width=48,
+        max_width=160,
+        keep_aspect_ratio=True),
+    dict(type='ToTensorOCR'),
+    dict(type='NormalizeOCR', **img_norm_cfg),
+    dict(
+        type='Collect',
+        keys=['img'],
+        meta_keys=[  # includes 'text', which the test pipeline does not collect
+            'filename', 'ori_shape', 'img_shape', 'text', 'valid_ratio',
+            'resize_shape'
+        ]),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiRotateAugOCR',
+        rotate_degrees=[0, 90, 270],
+        transforms=[  # same resize/tensor/normalize settings as train_pipeline
+            dict(
+                type='ResizeOCR',
+                height=48,
+                min_width=48,
+                max_width=160,
+                keep_aspect_ratio=True),
+            dict(type='ToTensorOCR'),
+            dict(type='NormalizeOCR', **img_norm_cfg),
+            dict(
+                type='Collect',
+                keys=['img'],
+                meta_keys=[
+                    'filename', 'ori_shape', 'img_shape', 'valid_ratio',
+                    'img_norm_cfg', 'ori_filename', 'resize_shape'
+                ]),
+        ])
+]
diff --git a/configs/_base_/schedules/schedule_adam_step_12e.py b/configs/_base_/schedules/schedule_adam_step_12e.py
@@ -0,0 +1,12 @@
+# optimizer: Adam with a base learning rate of 4e-4
+optimizer = dict(type='Adam', lr=4e-4)
+optimizer_config = dict(grad_clip=None)  # presumably: no gradient clipping
+# learning policy: step schedule with a linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=100,
+    warmup_ratio=1.0 / 3,
+    step=[11])  # single step point; presumably an epoch index -- confirm
+runner = dict(type='EpochBasedRunner', max_epochs=12)
+checkpoint_config = dict(interval=1)
diff --git a/configs/textrecog/master/README.md b/configs/textrecog/master/README.md
@@ -0,0 +1,52 @@
+# MASTER
+
+>[MASTER: Multi-aspect non-local network for scene text recognition](https://arxiv.org/abs/1910.02562)
+
+<!-- [ALGORITHM] -->
+
+## Abstract
+
+Attention-based scene text recognizers have gained huge success, which leverages a more compact intermediate representation to learn 1d- or 2d- attention by a RNN-based encoder-decoder architecture. However, such methods suffer from attention-drift problem because high similarity among encoded features leads to attention confusion under the RNN-based local attention mechanism. Moreover, RNN-based methods have low efficiency due to poor parallelization. To overcome these problems, we propose the MASTER, a self-attention based scene text recognizer that (1) not only encodes the input-output attention but also learns self-attention which encodes feature-feature and target-target relationships inside the encoder and decoder and (2) learns a more powerful and robust intermediate representation to spatial distortion, and (3) owns a great training efficiency because of high training parallelization and a high-speed inference because of an efficient memory-cache mechanism. Extensive experiments on various benchmarks demonstrate the superior performance of our MASTER on both regular and irregular scene text.
+
+<div align=center>
+<img src="https://user-images.githubusercontent.com/65173622/164642001-037f81b7-37dd-4808-a6a9-09ff6f6a17ea.JPG">
+</div>
+
+## Dataset
+
+### Train Dataset
+
+| trainset  | instance_num | repeat_num | source |
+| :-------: | :----------: | :--------: | :----: |
+| SynthText |   7266686    |     1      | synth  |
+| SynthAdd  |   1216889    |     1      | synth  |
+|  Syn90k   |   8919273    |     1      | synth  |
+
+### Test Dataset
+
+| testset | instance_num |   type    |
+| :-----: | :----------: | :-------: |
+| IIIT5K  |     3000     |  regular  |
+|   SVT   |     647      |  regular  |
+|  IC13   |     1015     |  regular  |
+|  IC15   |     2077     | irregular |
+|  SVTP   |     645      | irregular |
+|  CT80   |     288      | irregular |
+
+## Results and Models
+
+|                        Methods                         |   Backbone    |        | Regular Text |       |       |       | Irregular Text |       |                                                                                                   download                                                                                                    |
+| :----------------------------------------------------: | :-----------: | :----: | :----------: | :---: | :---: | :---: | :------------: | :---: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|                                                        |               | IIIT5K |     SVT      | IC13  |       | IC15  |      SVTP      | CT80  |
+| [MASTER](/configs/textrecog/master/master_r31_12e_ST_MJ_SA.py) | R31-GCAModule | 95.27  |     89.8     | 95.17 |       | 77.03 |     82.95      | 89.93 | [model](https://download.openmmlab.com/mmocr/textrecog/master/master_r31_12e_ST_MJ_SA-787edd36.pth) \| [log](https://download.openmmlab.com/mmocr/textrecog/master/master_r31_12e_ST_MJ_SA-787edd36.log.json) |
+
+## Citation
+
+```bibtex
+@article{Lu2021MASTER,
+  title={{MASTER}: Multi-Aspect Non-local Network for Scene Text Recognition},
+  author={Ning Lu and Wenwen Yu and Xianbiao Qi and Yihao Chen and Ping Gong and Rong Xiao and Xiang Bai},
+  journal={Pattern Recognition},
+  year={2021}
+}
+```
diff --git a/configs/textrecog/master/master_r31_12e_ST_MJ_SA.py b/configs/textrecog/master/master_r31_12e_ST_MJ_SA.py
@@ -0,0 +1,33 @@
+_base_ = [
+    '../../_base_/default_runtime.py', '../../_base_/recog_models/master.py',
+    '../../_base_/schedules/schedule_adam_step_12e.py',
+    '../../_base_/recog_pipelines/master_pipeline.py',
+    '../../_base_/recog_datasets/ST_SA_MJ_train.py',
+    '../../_base_/recog_datasets/academic_test.py'
+]
+
+train_list = {{_base_.train_list}}  # defined in ST_SA_MJ_train.py
+test_list = {{_base_.test_list}}  # presumably from academic_test.py
+
+train_pipeline = {{_base_.train_pipeline}}  # defined in master_pipeline.py
+test_pipeline = {{_base_.test_pipeline}}  # defined in master_pipeline.py
+
+data = dict(
+    samples_per_gpu=512,  # NOTE(review): metafile says "Batch Size: 512"; confirm per-GPU vs total
+    workers_per_gpu=4,
+    val_dataloader=dict(samples_per_gpu=128),
+    test_dataloader=dict(samples_per_gpu=128),
+    train=dict(
+        type='UniformConcatDataset',
+        datasets=train_list,
+        pipeline=train_pipeline),
+    val=dict(  # validation uses the same datasets and pipeline as test
+        type='UniformConcatDataset',
+        datasets=test_list,
+        pipeline=test_pipeline),
+    test=dict(
+        type='UniformConcatDataset',
+        datasets=test_list,
+        pipeline=test_pipeline))
+
+evaluation = dict(interval=1, metric='acc')
diff --git a/configs/textrecog/master/master_toy_dataset.py b/configs/textrecog/master/master_toy_dataset.py
@@ -0,0 +1,30 @@
+_base_ = [
+    '../../_base_/default_runtime.py', '../../_base_/recog_models/master.py',
+    '../../_base_/schedules/schedule_adam_step_12e.py',
+    '../../_base_/recog_pipelines/master_pipeline.py',
+    '../../_base_/recog_datasets/toy_data.py'
+]
+
+train_list = {{_base_.train_list}}  # presumably from toy_data.py
+test_list = {{_base_.test_list}}  # presumably from toy_data.py
+
+train_pipeline = {{_base_.train_pipeline}}  # defined in master_pipeline.py
+test_pipeline = {{_base_.test_pipeline}}  # defined in master_pipeline.py
+
+data = dict(
+    workers_per_gpu=2,
+    samples_per_gpu=8,
+    train=dict(
+        type='UniformConcatDataset',
+        datasets=train_list,
+        pipeline=train_pipeline),
+    val=dict(  # validation uses the same datasets and pipeline as test
+        type='UniformConcatDataset',
+        datasets=test_list,
+        pipeline=test_pipeline),
+    test=dict(
+        type='UniformConcatDataset',
+        datasets=test_list,
+        pipeline=test_pipeline))
+
+evaluation = dict(interval=1, metric='acc')
diff --git a/configs/textrecog/master/metafile.yml b/configs/textrecog/master/metafile.yml
@@ -0,0 +1,52 @@
+Collections:
+  - Name: MASTER
+    Metadata:
+      Training Data: OCRDataset
+      Training Techniques:
+        - Adam
+      Epochs: 12
+      Batch Size: 512
+      Training Resources: 4x Tesla A100
+      Architecture:
+        - ResNet31-GCAModule
+        - MASTERDecoder
+    Paper:
+      URL: https://arxiv.org/abs/1910.02562
+      Title: "MASTER: Multi-Aspect Non-local Network for Scene Text Recognition"
+    README: configs/textrecog/master/README.md
+
+Models:
+  - Name: master_academic
+    In Collection: MASTER
+    Config: configs/textrecog/master/master_r31_12e_ST_MJ_SA.py
+    Metadata:
+      Training Data:
+        - SynthText
+        - SynthAdd
+        - Syn90k
+    Results:
+      - Task: Text Recognition
+        Dataset: IIIT5K
+        Metrics:
+          word_acc: 95.27
+      - Task: Text Recognition
+        Dataset: SVT
+        Metrics:
+          word_acc: 89.8
+      - Task: Text Recognition
+        Dataset: ICDAR2013
+        Metrics:
+          word_acc: 95.17
+      - Task: Text Recognition
+        Dataset: ICDAR2015
+        Metrics:
+          word_acc: 77.03
+      - Task: Text Recognition
+        Dataset: SVTP
+        Metrics:
+          word_acc: 82.95
+      - Task: Text Recognition
+        Dataset: CT80
+        Metrics:
+          word_acc: 89.93
+    Weights: https://download.openmmlab.com/mmocr/textrecog/master/master_r31_12e_ST_MJ_SA-787edd36.pth
diff --git a/demo/README.md b/demo/README.md
@@ -220,6 +220,7 @@ means that `batch_mode` and `print_result` are set to `True`)
 | ABINet        |           [link](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#read-like-humans-autonomous-bidirectional-and-iterative-language-modeling-for-scene-text-recognition)            |       :heavy_check_mark:       |
 | CRNN          | [link](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#an-end-to-end-trainable-neural-network-for-image-based-sequence-recognition-and-its-application-to-scene-text-recognition) |              :x:               |
 | CRNN_TPS      |                                                  [link](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#crnn-with-tps-based-stn)                                                  |       :heavy_check_mark:       |
+| MASTER        |                                                          [link](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#master)                                                           |       :heavy_check_mark:       |
 | NRTR_1/16-1/8 |                                                           [link](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#nrtr)                                                            |       :heavy_check_mark:       |
 | NRTR_1/8-1/4  |                                                           [link](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#nrtr)                                                            |       :heavy_check_mark:       |
 | RobustScanner |                     [link](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#robustscanner-dynamically-enhancing-positional-clues-for-robust-text-recognition)                      |       :heavy_check_mark:       |

diff --git a/demo/README_zh-CN.md b/demo/README_zh-CN.md
@@ -217,6 +217,7 @@ mmocr 为了方便使用提供了预置的模型配置和对应的预训练权
 | ABINet        |           [链接](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#read-like-humans-autonomous-bidirectional-and-iterative-language-modeling-for-scene-text-recognition)            |  :heavy_check_mark:   |
 | CRNN          | [链接](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#an-end-to-end-trainable-neural-network-for-image-based-sequence-recognition-and-its-application-to-scene-text-recognition) |          :x:          |
 | CRNN_TPS      |                                                  [链接](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#crnn-with-tps-based-stn)                                                  |  :heavy_check_mark:   |
+| MASTER        |                                                          [链接](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#master)                                                           |  :heavy_check_mark:   |
 | NRTR_1/16-1/8 |                                                           [链接](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#nrtr)                                                            |  :heavy_check_mark:   |
 | NRTR_1/8-1/4  |                                                           [链接](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#nrtr)                                                            |  :heavy_check_mark:   |
 | RobustScanner |                     [链接](https://mmocr.readthedocs.io/en/latest/textrecog_models.html#robustscanner-dynamically-enhancing-positional-clues-for-robust-text-recognition)                      |  :heavy_check_mark:   |

diff --git a/docs/en/model_summary.md b/docs/en/model_summary.md
@@ -110,6 +110,15 @@ Fuser fuses the feature output from encoder and decoder before generating the fi
 - Loss: [CTCLoss](https://mmocr.readthedocs.io/en/latest/api.html#mmocr.models.textrecog.losses.CTCLoss)
 - Converter: [CTCConvertor](https://mmocr.readthedocs.io/en/latest/api.html#mmocr.models.textrecog.convertors.CTCConvertor)
 
+### MASTER
+
+- Preprocessor: None
+- Backbone: [ResNet](https://mmocr.readthedocs.io/en/latest/api.html#mmocr.models.textrecog.backbones.ResNet)
+- Encoder: None
+- Decoder: [MasterDecoder](https://mmocr.readthedocs.io/en/latest/api.html#mmocr.models.textrecog.decoders.MasterDecoder)
+- Loss: [TFLoss](https://mmocr.readthedocs.io/en/latest/api.html#mmocr.models.textrecog.losses.TFLoss)
+- Converter: [AttnConvertor](https://mmocr.readthedocs.io/en/latest/api.html#mmocr.models.textrecog.convertors.AttnConvertor)
+
 ### NRTR
 
 - Preprocessor: None

diff --git a/mmocr/models/textrecog/decoders/__init__.py b/mmocr/models/textrecog/decoders/__init__.py
@@ -3,6 +3,7 @@
 from .abinet_vision_decoder import ABIVisionDecoder
 from .base_decoder import BaseDecoder
 from .crnn_decoder import CRNNDecoder
+from .master_decoder import MasterDecoder
 from .nrtr_decoder import NRTRDecoder
 from .position_attention_decoder import PositionAttentionDecoder
 from .robust_scanner_decoder import RobustScannerDecoder
@@ -14,5 +15,6 @@
     'CRNNDecoder', 'ParallelSARDecoder', 'SequentialSARDecoder',
     'ParallelSARDecoderWithBS', 'NRTRDecoder', 'BaseDecoder',
     'SequenceAttentionDecoder', 'PositionAttentionDecoder',
-    'RobustScannerDecoder', 'ABILanguageDecoder', 'ABIVisionDecoder'
+    'RobustScannerDecoder', 'ABILanguageDecoder', 'ABIVisionDecoder',
+    'MasterDecoder'
 ]