doccano · Hironsan · Oct 15, 2019 · Oct 14, 2019
diff --git a/app/api/tests/data/example.utf16.csv b/app/api/tests/data/example.utf16.csv
diff --git a/app/api/tests/test_api.py b/app/api/tests/test_api.py
@@ -841,6 +841,12 @@ def test_can_upload_classification_csv(self):
                                 file_format='csv',
                                 expected_status=status.HTTP_201_CREATED)
 
+    def test_can_upload_csv_with_non_utf8_encoding(self):
+        """Uploading the ``example.utf16.csv`` fixture as CSV returns 201 Created."""
+        self.upload_test_helper(
+            project_id=self.classification_project.id, filename='example.utf16.csv',
+            file_format='csv', expected_status=status.HTTP_201_CREATED)
+
     def test_can_upload_seq2seq_csv(self):
         self.upload_test_helper(project_id=self.seq2seq_project.id,
                                 filename='example.csv',

diff --git a/app/api/utils.py b/app/api/utils.py
@@ -7,6 +7,7 @@
 from random import Random
 
 import conllu
+from chardet import UniversalDetector
 from django.db import transaction
 from django.conf import settings
 import pyexcel
@@ -245,7 +246,8 @@ class CoNLLParser(FileParser):
     """
     def parse(self, file):
         data = []
-        file = io.TextIOWrapper(file, encoding='utf-8')
+        file = EncodedIO(file)
+        file = io.TextIOWrapper(file, encoding=file.encoding)
 
         # Add check exception
 
@@ -300,7 +302,8 @@ class PlainTextParser(FileParser):
     ```
     """
     def parse(self, file):
-        file = io.TextIOWrapper(file, encoding='utf-8')
+        file = EncodedIO(file)
+        file = io.TextIOWrapper(file, encoding=file.encoding)
         while True:
             batch = list(itertools.islice(file, settings.IMPORT_BATCH_SIZE))
             if not batch:
@@ -323,7 +326,8 @@ class CSVParser(FileParser):
     ```
     """
     def parse(self, file):
-        file = io.TextIOWrapper(file, encoding='utf-8')
+        file = EncodedIO(file)
+        file = io.TextIOWrapper(file, encoding=file.encoding)
         reader = csv.reader(file)
         yield from ExcelParser.parse_excel_csv_reader(reader)
 
@@ -364,7 +368,8 @@ def parse_excel_csv_reader(reader):
 class JSONParser(FileParser):
 
     def parse(self, file):
-        file = io.TextIOWrapper(file, encoding='utf-8')
+        file = EncodedIO(file)
+        file = io.TextIOWrapper(file, encoding=file.encoding)
         data = []
         for i, line in enumerate(file, start=1):
             if len(data) >= settings.IMPORT_BATCH_SIZE:
@@ -506,3 +511,64 @@ def readinto(self, b):
                 return 0    # indicate EOF
 
     return io.BufferedReader(IterStream(), buffer_size=buffer_size)
+
+
+class EncodedIO(io.RawIOBase):
+    """Raw byte stream that sniffs the text encoding of the stream it wraps.
+
+    On construction the wrapped stream is read in ``buffer_size`` chunks and
+    fed to chardet's ``UniversalDetector`` until the detector reports it is
+    done or a short read is seen (treated as end of input).  The bytes
+    consumed while sniffing are retained and handed out again by
+    ``readinto``, so a consumer such as ``io.TextIOWrapper`` still sees the
+    stream from its first byte.
+
+    Args:
+        fobj: Binary stream to wrap; must provide ``read(size)`` and
+            ``readable()``.
+        buffer_size: Chunk size, in bytes, used while sniffing.
+        default_encoding: Encoding reported when the detector does not
+            finish on its own or names no encoding.
+
+    Attributes:
+        encoding: Detected encoding name, or ``default_encoding``.
+    """
+
+    def __init__(self, fobj, buffer_size=io.DEFAULT_BUFFER_SIZE, default_encoding='utf-8'):
+        super().__init__()
+        detector = UniversalDetector()
+        # Collect chunks in a list and join once: repeated ``bytes +=``
+        # copies the whole prefix on every pass, which turns quadratic for
+        # inputs the detector never settles on (those are read to the end).
+        chunks = []
+
+        while True:
+            read = fobj.read(buffer_size)
+            detector.feed(read)
+            chunks.append(read)
+            if detector.done or len(read) < buffer_size:
+                break
+
+        # Trust the detector only when it stopped on its own; otherwise
+        # (or if it names no encoding) use the caller-supplied default.
+        detected = detector.result['encoding'] if detector.done else None
+        self.encoding = detected or default_encoding
+
+        self._fobj = fobj
+        self._buffer = b''.join(chunks)
+
+    def readable(self):
+        """Return whether the wrapped stream is readable."""
+        return self._fobj.readable()
+
+    def readinto(self, b):
+        """Fill ``b`` from the sniffed bytes first, then the wrapped stream.
+
+        Returns:
+            The number of bytes written into ``b`` (0 at end of input).
+        """
+        size = len(b)
+        chunk = self._buffer or self._fobj.read(size)
+        output, self._buffer = chunk[:size], chunk[size:]
+        b[:len(output)] = output
+        return len(output)
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,6 @@
 apache-libcloud==2.4.0
 applicationinsights==0.11.7
+chardet==3.0.4
 coverage==4.5.3
 dj-database-url==0.5.0
 Django==2.1.7