Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix/Fix data import from non UTF-8 files #399

Merged
merged 1 commit into from Oct 15, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Binary file added app/api/tests/data/example.utf16.csv
Binary file not shown.
6 changes: 6 additions & 0 deletions app/api/tests/test_api.py
Expand Up @@ -841,6 +841,12 @@ def test_can_upload_classification_csv(self):
file_format='csv',
expected_status=status.HTTP_201_CREATED)

def test_can_upload_csv_with_non_utf8_encoding(self):
    """Upload the ``example.utf16.csv`` fixture and expect HTTP 201 Created."""
    # Gather the helper arguments first so the call itself stays short.
    upload_options = dict(
        project_id=self.classification_project.id,
        filename='example.utf16.csv',
        file_format='csv',
        expected_status=status.HTTP_201_CREATED,
    )
    self.upload_test_helper(**upload_options)

def test_can_upload_seq2seq_csv(self):
self.upload_test_helper(project_id=self.seq2seq_project.id,
filename='example.csv',
Expand Down
44 changes: 40 additions & 4 deletions app/api/utils.py
Expand Up @@ -7,6 +7,7 @@
from random import Random

import conllu
from chardet import UniversalDetector
from django.db import transaction
from django.conf import settings
import pyexcel
Expand Down Expand Up @@ -245,7 +246,8 @@ class CoNLLParser(FileParser):
"""
def parse(self, file):
data = []
file = io.TextIOWrapper(file, encoding='utf-8')
file = EncodedIO(file)
file = io.TextIOWrapper(file, encoding=file.encoding)

# Add check exception

Expand Down Expand Up @@ -300,7 +302,8 @@ class PlainTextParser(FileParser):
```
"""
def parse(self, file):
file = io.TextIOWrapper(file, encoding='utf-8')
file = EncodedIO(file)
file = io.TextIOWrapper(file, encoding=file.encoding)
while True:
batch = list(itertools.islice(file, settings.IMPORT_BATCH_SIZE))
if not batch:
Expand All @@ -323,7 +326,8 @@ class CSVParser(FileParser):
```
"""
def parse(self, file):
file = io.TextIOWrapper(file, encoding='utf-8')
file = EncodedIO(file)
file = io.TextIOWrapper(file, encoding=file.encoding)
reader = csv.reader(file)
yield from ExcelParser.parse_excel_csv_reader(reader)

Expand Down Expand Up @@ -364,7 +368,8 @@ def parse_excel_csv_reader(reader):
class JSONParser(FileParser):

def parse(self, file):
file = io.TextIOWrapper(file, encoding='utf-8')
file = EncodedIO(file)
file = io.TextIOWrapper(file, encoding=file.encoding)
data = []
for i, line in enumerate(file, start=1):
if len(data) >= settings.IMPORT_BATCH_SIZE:
Expand Down Expand Up @@ -506,3 +511,34 @@ def readinto(self, b):
return 0 # indicate EOF

return io.BufferedReader(IterStream(), buffer_size=buffer_size)


class EncodedIO(io.RawIOBase):
    """Raw byte stream that sniffs the text encoding of a binary file object.

    On construction the wrapped stream is read in ``buffer_size`` chunks and
    each chunk is fed to a ``UniversalDetector``.  The bytes consumed while
    sniffing are kept and replayed by ``readinto`` before any further data is
    pulled from the wrapped stream, so no input is lost.  The resulting
    encoding name is exposed as the ``encoding`` attribute.

    Args:
        fobj: Binary file-like object providing ``read(size)`` and
            ``readable()``.
        buffer_size: Number of bytes requested per read while sniffing.
        default_encoding: Encoding reported when the detector did not signal
            ``done`` or produced no encoding name.

    NOTE(review): sniffing only stops when the detector reports ``done`` or a
    read comes back shorter than ``buffer_size``; otherwise the whole stream
    ends up buffered in memory.
    """

    def __init__(self, fobj, buffer_size=io.DEFAULT_BUFFER_SIZE, default_encoding='utf-8'):
        super().__init__()
        # Collect chunks in a list and join once: repeated ``bytes +=`` copies
        # the whole accumulated buffer on every iteration.
        chunks = []
        detector = UniversalDetector()

        while True:
            read = fobj.read(buffer_size)
            if not read:
                # b'' means EOF; None means a non-blocking stream has no data.
                # Neither can be fed to the detector.
                break
            detector.feed(read)
            chunks.append(read)
            if detector.done or len(read) < buffer_size:
                break

        # Only trust the detector when it signalled ``done``; also guard
        # against a missing encoding name so callers never receive ``None``.
        detected = detector.result['encoding'] if detector.done else None
        self.encoding = detected or default_encoding

        self._fobj = fobj
        self._buffer = b''.join(chunks)

    def readable(self):
        """Return whether the wrapped file object is readable."""
        return self._fobj.readable()

    def readinto(self, b):
        """Fill ``b`` with sniffed bytes first, then with fresh stream data.

        Returns:
            The number of bytes written into ``b`` (0 at EOF), or ``None``
            when the wrapped stream returned ``None`` from ``read``.
        """
        size = len(b)
        chunk = self._buffer or self._fobj.read(size)
        if chunk is None:
            # Propagate "no data available right now" instead of crashing on
            # slicing ``None``.
            return None
        # Anything that does not fit into ``b`` stays buffered for next time.
        output, self._buffer = chunk[:size], chunk[size:]
        b[:len(output)] = output
        return len(output)
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,5 +1,6 @@
apache-libcloud==2.4.0
applicationinsights==0.11.7
chardet==3.0.4
coverage==4.5.3
dj-database-url==0.5.0
Django==2.1.7
Expand Down