Merge pull request #223 from CatalystCode/enhancement/data-import-coverage

Enhancement/Ensure data pagination is covered in tests
doccano · Jun 11, 2019 · 427f59b · 427f59b
2 parents d5514ab + e1ae68c
commit 427f59b
Show file tree

Hide file tree

Showing 3 changed files with 8 additions and 13 deletions.
diff --git a/app/app/settings.py b/app/app/settings.py
@@ -257,7 +257,7 @@
 
 # Size of the batch for creating documents
 # on the import phase
-IMPORT_BATCH_SIZE = 500
+IMPORT_BATCH_SIZE = env.int('IMPORT_BATCH_SIZE', 500)
 
 GOOGLE_TRACKING_ID = env('GOOGLE_TRACKING_ID', 'UA-125643874-2')
 

diff --git a/app/server/tests/test_api.py b/app/server/tests/test_api.py
@@ -904,6 +904,7 @@ def test_no_cloud_upload(self):
         self.assertFalse(response.json().get('cloud_upload'))
 
 
+@override_settings(IMPORT_BATCH_SIZE=2)
 class TestParser(APITestCase):
 
     def parser_helper(self, filename, parser, include_label=True):

diff --git a/app/server/utils.py b/app/server/utils.py
@@ -7,10 +7,10 @@
 from random import Random
 
 from django.db import transaction
+from django.conf import settings
 from rest_framework.renderers import JSONRenderer
 from seqeval.metrics.sequence_labeling import get_entities
 
-from app.settings import IMPORT_BATCH_SIZE
 from .exceptions import FileParseException
 from .models import Label
 from .serializers import DocumentSerializer, LabelSerializer
@@ -242,19 +242,13 @@ class CoNLLParser(FileParser):
     ```
     """
     def parse(self, file):
-        """Store json for seq2seq.
-
-        Return format:
-        {"text": "Python is awesome!", "labels": [[0, 6, "Product"],]}
-        ...
-        """
         words, tags = [], []
         data = []
+        file = io.TextIOWrapper(file, encoding='utf-8')
         for i, line in enumerate(file, start=1):
-            if len(data) >= IMPORT_BATCH_SIZE:
+            if len(data) >= settings.IMPORT_BATCH_SIZE:
                 yield data
                 data = []
-            line = line.decode('utf-8')
             line = line.strip()
             if line:
                 try:
@@ -301,7 +295,7 @@ class PlainTextParser(FileParser):
     def parse(self, file):
         file = io.TextIOWrapper(file, encoding='utf-8')
         while True:
-            batch = list(itertools.islice(file, IMPORT_BATCH_SIZE))
+            batch = list(itertools.islice(file, settings.IMPORT_BATCH_SIZE))
             if not batch:
                 break
             yield [{'text': line.strip()} for line in batch]
@@ -327,7 +321,7 @@ def parse(self, file):
         columns = next(reader)
         data = []
         for i, row in enumerate(reader, start=2):
-            if len(data) >= IMPORT_BATCH_SIZE:
+            if len(data) >= settings.IMPORT_BATCH_SIZE:
                 yield data
                 data = []
             if len(row) == len(columns) and len(row) >= 2:
@@ -347,7 +341,7 @@ def parse(self, file):
         file = io.TextIOWrapper(file, encoding='utf-8')
         data = []
         for i, line in enumerate(file, start=1):
-            if len(data) >= IMPORT_BATCH_SIZE:
+            if len(data) >= settings.IMPORT_BATCH_SIZE:
                 yield data
                 data = []
             try: