Add list_from_file and list_to_file (open-mmlab#226)
* Add list_from_file and list_to_file

Signed-off-by: lizz <lizz@sensetime.com>

* Add tests for list_to_file and list_from_file

* more

* Fix tests
innerlee committed May 24, 2021
1 parent cdab0dc commit 0b360a5
Showing 15 changed files with 261 additions and 200 deletions.
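The change follows one pattern across the touched files: ad-hoc reading via open(...).readlines() or mmcv.list_from_file is swapped for the new mmocr.utils.list_from_file, and hand-written write loops for list_to_file. A minimal before/after sketch (the annotation file path is illustrative):

# before: manual file handling; readlines() keeps the trailing line endings
with open('ann_file.txt') as f:
    lines = f.readlines()

# after: one call, trailing '\r'/'\n' stripped, encoding configurable
from mmocr.utils import list_from_file
lines = list_from_file('ann_file.txt', encoding='utf-8')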
7 changes: 3 additions & 4 deletions mmocr/datasets/kie_dataset.py
@@ -1,15 +1,14 @@
import copy
from os import path as osp

-import mmcv
import numpy as np
import torch

-import mmocr.utils as utils
from mmdet.datasets.builder import DATASETS
from mmocr.core import compute_f1_score
from mmocr.datasets.base_dataset import BaseDataset
from mmocr.datasets.pipelines import sort_vertex8
+from mmocr.utils import is_type_list, list_from_file


@DATASETS.register_module()
@@ -52,7 +51,7 @@ def __init__(self,
'': 0,
**{
line.rstrip('\r\n'): ind
-for ind, line in enumerate(mmcv.list_from_file(dict_file), 1)
+for ind, line in enumerate(list_from_file(dict_file), 1)
}
}
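For illustration, assuming a hypothetical dict_file with the two lines "key" and "value", the mapping built above resolves as follows (the variable name is made up):

from mmocr.utils import list_from_file

# hypothetical dict.txt containing the two lines 'key' and 'value'
node_labels = {
    '': 0,
    **{line.rstrip('\r\n'): ind
       for ind, line in enumerate(list_from_file('dict.txt'), 1)}
}
# -> {'': 0, 'key': 1, 'value': 2}
# the empty string is reserved for index 0; file lines are enumerated from 1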

@@ -79,7 +78,7 @@ def _parse_anno_info(self, annotations):
box_num * (box_num + 1).
"""

-assert utils.is_type_list(annotations, dict)
+assert is_type_list(annotations, dict)
assert len(annotations) > 0, 'Please remove data with empty annotation'
assert 'box' in annotations[0]
assert 'text' in annotations[0]
5 changes: 2 additions & 3 deletions mmocr/datasets/utils/loader.py
@@ -1,8 +1,7 @@
import os.path as osp

-import mmcv

from mmocr.datasets.builder import LOADERS, build_parser
+from mmocr.utils import list_from_file


@LOADERS.register_module()
@@ -60,7 +59,7 @@ class HardDiskLoader(Loader):
"""

def _load(self, ann_file):
-return mmcv.list_from_file(ann_file)
+return list_from_file(ann_file)


@LOADERS.register_module()
9 changes: 4 additions & 5 deletions mmocr/models/kie/extractors/sdmgr.py
@@ -8,6 +8,7 @@
from mmdet.models.builder import DETECTORS, build_roi_extractor
from mmdet.models.detectors import SingleStageDetector
from mmocr.core import imshow_edge_node
+from mmocr.utils import list_from_file


@DETECTORS.register_module()
@@ -126,11 +127,9 @@ def show_result(self,

idx_to_cls = {}
if self.class_list is not None:
-with open(self.class_list, 'r') as fr:
-for line in fr:
-line = line.strip().split()
-class_idx, class_label = line
-idx_to_cls[class_idx] = class_label
+for line in list_from_file(self.class_list):
+class_idx, class_label = line.strip().split()
+idx_to_cls[class_idx] = class_label

# if out_file specified, do not show image in window
if out_file is not None:
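A sketch of the input the new loop expects; the one-pair-per-line format is inferred from the unpacking above, and the file contents are purely illustrative:

from mmocr.utils import list_from_file

# hypothetical class_list.txt:
#   0 ignore
#   1 key
#   2 value
idx_to_cls = {}
for line in list_from_file('class_list.txt'):
    class_idx, class_label = line.strip().split()
    idx_to_cls[class_idx] = class_label
# -> {'0': 'ignore', '1': 'key', '2': 'value'}; the indices stay as strings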
10 changes: 5 additions & 5 deletions mmocr/models/textrecog/convertors/base.py
@@ -1,4 +1,5 @@
from mmocr.models.builder import CONVERTORS
+from mmocr.utils import list_from_file


@CONVERTORS.register_module()
@@ -27,11 +28,10 @@ def __init__(self, dict_type='DICT90', dict_file=None, dict_list=None):
assert dict_list is None or isinstance(dict_list, list)
self.idx2char = []
if dict_file is not None:
-with open(dict_file, encoding='utf-8') as fr:
-for line in fr:
-line = line.strip()
-if line != '':
-self.idx2char.append(line)
+for line in list_from_file(dict_file):
+line = line.strip()
+if line != '':
+self.idx2char.append(line)
elif dict_list is not None:
self.idx2char = dict_list
else:
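Behaviour is preserved here: the old open() call passed encoding='utf-8' explicitly, which is also list_from_file's default, and the strip()/empty-check still drops blank lines. A small sketch with a hypothetical character dict file:

from mmocr.utils import list_from_file

idx2char = []
# hypothetical charset.txt containing the lines 'a', '' (blank) and 'b'
for line in list_from_file('charset.txt'):
    line = line.strip()
    if line != '':
        idx2char.append(line)
# -> ['a', 'b']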
4 changes: 3 additions & 1 deletion mmocr/utils/__init__.py
@@ -4,6 +4,7 @@
is_none_or_type, is_type_list, valid_boundary)
from .collect_env import collect_env
from .data_convert_util import convert_annotations
+from .fileio import list_from_file, list_to_file
from .img_util import drop_orientation, is_not_png
from .lmdb_util import lmdb_converter
from .logger import get_root_logger
@@ -12,5 +13,6 @@
'Registry', 'build_from_cfg', 'get_root_logger', 'collect_env',
'is_3dlist', 'is_ndarray_list', 'is_type_list', 'is_none_or_type',
'equal_len', 'is_2dlist', 'valid_boundary', 'lmdb_converter',
-'drop_orientation', 'convert_annotations', 'is_not_png'
+'drop_orientation', 'convert_annotations', 'is_not_png', 'list_to_file',
+'list_from_file'
]
31 changes: 31 additions & 0 deletions mmocr/utils/fileio.py
@@ -0,0 +1,31 @@
def list_to_file(filename, lines):
    """Write a list of strings to a text file.

    Args:
        filename (str): The output filename. It will be created/overwritten.
        lines (list(str)): Data to be written.
    """
    with open(filename, 'w', encoding='utf-8') as fw:
        for line in lines:
            fw.write(f'{line}\n')


def list_from_file(filename, encoding='utf-8'):
    """Load a text file and parse the content as a list of strings. The
    trailing "\\r" and "\\n" of each line will be removed.

    Note:
        This will be replaced by mmcv's version after it supports encoding.

    Args:
        filename (str): Filename.
        encoding (str): Encoding used to open the file. Default utf-8.

    Returns:
        list[str]: A list of strings.
    """
    item_list = []
    with open(filename, 'r', encoding=encoding) as f:
        for line in f:
            item_list.append(line.rstrip('\n\r'))
    return item_list
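A round-trip sketch of the two new helpers (the temporary path is illustrative):

from mmocr.utils import list_from_file, list_to_file

list_to_file('/tmp/demo.txt', ['hello', '世界'])
assert list_from_file('/tmp/demo.txt') == ['hello', '世界']

# non-string items are accepted on write because every line goes through an
# f-string; they come back as strings
list_to_file('/tmp/demo.txt', [1, 2.5])
assert list_from_file('/tmp/demo.txt') == ['1', '2.5']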
9 changes: 5 additions & 4 deletions mmocr/utils/lmdb_util.py
@@ -5,11 +5,12 @@

import lmdb

+from mmocr.utils import list_from_file

-def lmdb_converter(img_list, output, batch_size=1000, coding='utf-8'):
-# read img_list
-with open(img_list) as f:
-lines = f.readlines()

+def lmdb_converter(img_list_file, output, batch_size=1000, coding='utf-8'):
+# read img_list_file
+lines = list_from_file(img_list_file)

# create lmdb database
if Path(output).is_dir():
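With the signature shown above, a minimal call only needs the two paths (both hypothetical here); the list file is now read through list_from_file rather than raw readlines():

from mmocr.utils import lmdb_converter

lmdb_converter('train_img_list.txt', 'train_lmdb/', batch_size=1000)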
46 changes: 46 additions & 0 deletions tests/test_utils/test_textio.py
@@ -0,0 +1,46 @@
import tempfile

from mmocr.utils import list_from_file, list_to_file

lists = [
    [],
    [' '],
    ['\t'],
    ['a'],
    [1],
    [1.],
    ['a', 'b'],
    ['a', 1, 1.],
    [1, 1., 'a'],
    ['啊', '啊啊'],
    ['選択', 'noël', 'Информацией', 'ÄÆä'],
]


def test_list_to_file():
    with tempfile.TemporaryDirectory() as tmpdirname:
        for i, lines in enumerate(lists):
            filename = f'{tmpdirname}/{i}.txt'
            list_to_file(filename, lines)
            lines2 = [
                line.rstrip('\r\n')
                for line in open(filename, 'r', encoding='utf-8').readlines()
            ]
            lines = list(map(str, lines))
            assert len(lines) == len(lines2)
            assert all(line1 == line2 for line1, line2 in zip(lines, lines2))


def test_list_from_file():
    with tempfile.TemporaryDirectory() as tmpdirname:
        for encoding in ['utf-8', 'utf-8-sig']:
            for lineend in ['\n', '\r\n']:
                for i, lines in enumerate(lists):
                    filename = f'{tmpdirname}/{i}.txt'
                    with open(filename, 'w', encoding=encoding) as f:
                        f.writelines(f'{line}{lineend}' for line in lines)
                    lines2 = list_from_file(filename, encoding=encoding)
                    lines = list(map(str, lines))
                    assert len(lines) == len(lines2)
                    assert all(line1 == line2
                               for line1, line2 in zip(lines, lines2))
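Besides the plain round trip, the second test pins down two behaviours: the trailing '\n' or '\r\n' is stripped on read, and a UTF-8 BOM is absorbed when the matching encoding is passed. A minimal sketch (path illustrative):

from mmocr.utils import list_from_file

with open('/tmp/bom.txt', 'w', encoding='utf-8-sig') as f:
    f.write('abc\r\n')  # utf-8-sig prepends a BOM on write
assert list_from_file('/tmp/bom.txt', encoding='utf-8-sig') == ['abc']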
32 changes: 14 additions & 18 deletions tools/data/textdet/coco_to_line_dict.py
@@ -1,16 +1,13 @@
import argparse
-import codecs
import json

+import mmcv

-def read_json(fpath):
-with codecs.open(fpath, 'r', 'utf-8') as f:
-obj = json.load(f)
-return obj
+from mmocr.utils import list_to_file


def parse_coco_json(in_path):
-json_obj = read_json(in_path)
+json_obj = mmcv.load(in_path)
image_infos = json_obj['images']
annotations = json_obj['annotations']
imgid2imgname = {}
@@ -35,18 +32,17 @@ def parse_coco_json(in_path):


def gen_line_dict_file(out_path, imgid2imgname, imgid2anno):
-# import pdb; pdb.set_trace()
-with codecs.open(out_path, 'w', 'utf-8') as fw:
-for key, value in imgid2imgname.items():
-if key in imgid2anno:
-anno = imgid2anno[key]
-line_dict = {}
-line_dict['file_name'] = value['file_name']
-line_dict['height'] = value['height']
-line_dict['width'] = value['width']
-line_dict['annotations'] = anno
-line_dict_str = json.dumps(line_dict)
-fw.write(line_dict_str + '\n')
+lines = []
+for key, value in imgid2imgname.items():
+if key in imgid2anno:
+anno = imgid2anno[key]
+line_dict = {}
+line_dict['file_name'] = value['file_name']
+line_dict['height'] = value['height']
+line_dict['width'] = value['width']
+line_dict['annotations'] = anno
+lines.append(json.dumps(line_dict))
+list_to_file(out_path, lines)
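The output format is unchanged: one JSON object per line, now assembled in memory and written once with list_to_file. An illustrative record (field values are made up):

import json
from mmocr.utils import list_to_file

line_dict = {
    'file_name': 'img_1.jpg',
    'height': 720,
    'width': 1280,
    'annotations': [],  # per-image annotation dicts gathered from the COCO json
}
list_to_file('train_label.txt', [json.dumps(line_dict)])
# the file now holds: {"file_name": "img_1.jpg", "height": 720, "width": 1280, "annotations": []}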


def parse_args():
8 changes: 3 additions & 5 deletions tools/data/textdet/ctw1500_converter.py
@@ -8,7 +8,8 @@
import numpy as np
from shapely.geometry import Polygon

-from mmocr.utils import convert_annotations, drop_orientation, is_not_png
+from mmocr.utils import (convert_annotations, drop_orientation, is_not_png,
+list_from_file)


def collect_files(img_dir, gt_dir, split):
@@ -84,11 +85,8 @@ def collect_annotations(files, split, nproc=1):


def load_txt_info(gt_file, img_info):
-with open(gt_file) as f:
-gt_list = f.readlines()

anno_info = []
-for line in gt_list:
+for line in list_from_file(gt_file):
# each line has one polygon (n vertices) and one text.
# e.g., 695,885,866,888,867,1146,696,1143,####Latin 9
line = line.strip()
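Going by the example quoted in the comment, each ground-truth record is a comma-separated polygon followed by '####' and a text field. A hedged parsing sketch of that single line (the converter's own parsing continues below the excerpt shown here):

line = '695,885,866,888,867,1146,696,1143,####Latin 9'
coords_str, text = line.split('####')
vertices = [int(v) for v in coords_str.strip(',').split(',')]
# vertices -> [695, 885, 866, 888, 867, 1146, 696, 1143]; text -> 'Latin 9'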
9 changes: 4 additions & 5 deletions tools/data/textdet/icdar_converter.py
@@ -7,7 +7,8 @@
import numpy as np
from shapely.geometry import Polygon

-from mmocr.utils import convert_annotations, drop_orientation, is_not_png
+from mmocr.utils import (convert_annotations, drop_orientation, is_not_png,
+list_from_file)


def collect_files(img_dir, gt_dir):
@@ -96,11 +97,9 @@ def load_img_info(files, dataset):
assert img.shape[0:2] == img_color.shape[0:2]

if dataset == 'icdar2017':
-with open(gt_file) as f:
-gt_list = f.readlines()
+gt_list = list_from_file(gt_file)
elif dataset == 'icdar2015':
-with open(gt_file, mode='r', encoding='utf-8-sig') as f:
-gt_list = f.readlines()
+gt_list = list_from_file(gt_file, encoding='utf-8-sig')
else:
raise NotImplementedError(f'Not support {dataset}')

