Skip to content

Commit

Permalink
ftp: Use UTF8 w/ surrogateescape instead of Latin1
Browse files Browse the repository at this point in the history
Closes #264
  • Loading branch information
chfoo committed Apr 19, 2015
1 parent d82c6a0 commit bbb6b8d
Show file tree
Hide file tree
Showing 8 changed files with 68 additions and 30 deletions.
2 changes: 2 additions & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ Unreleased
* Fixed: Connecting to sites with IPv4 & IPv6 support resulted in errors when IPv6 was not supported by the local network. Connections now use Happy Eyeballs Algorithm for IPv4 & IPv6 dual-stack support.
* Fixed: SQLAlchemy error with PyPy and SQLAlchemy 1.0.
* Fixed: Input URLs are not fetched in order. Regression since 1.1.
* Fixed: UnicodeEncodeError when fetching FTP files with non-ASCII filenames.
* Changed: FTP communication uses UTF-8 instead of Latin-1.
* Changed: ``--prefer-family=none`` is now default.
* Added: ``none`` as a choice to ``--prefer-family``.

Expand Down
6 changes: 5 additions & 1 deletion doc/warc.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,11 @@ The resource is formatted as followed:
* Requests are indented with an ASCII greater-than and space.
* Responses are indented with an ASCII less-than and space.

The document encoding is Latin-1.
The document encoding is UTF-8.

.. versionchanged:: 1.2a1

The document encoding previously used Latin-1.


Response data
Expand Down
23 changes: 20 additions & 3 deletions wpull/app_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1643,7 +1643,7 @@ def test_args(self):
self.assertTrue(os.path.exists('readme.txt'))
self.assertFalse(os.path.islink('readme.txt'))
self.assertTrue(os.path.exists('example1/.listing'))
self.assertTrue(os.path.exists('example2/.listing'))
self.assertTrue(os.path.exists('example2💎/.listing'))
self.assertTrue(os.path.exists('mywarc.warc.gz'))

with gzip.GzipFile('mywarc.warc.gz') as in_file:
Expand Down Expand Up @@ -1682,7 +1682,7 @@ def test_retr_symlinks_off(self):
def test_file_vs_directory(self):
arg_parser = AppArgumentParser()
args = arg_parser.parse_args([
self.get_url('/example2'),
self.get_url('/example2💎'),
'--no-host-directories',
'--no-remove-listing',
'-r',
Expand All @@ -1696,7 +1696,24 @@ def test_file_vs_directory(self):
print(list(os.walk('.')))

self.assertEqual(0, exit_code)
self.assertTrue(os.path.exists('example2/.listing'))
self.assertTrue(os.path.exists('example2💎/.listing'))

@wpull.testing.async.async_test(timeout=DEFAULT_TIMEOUT)
def test_invalid_char_dir_list(self):
arg_parser = AppArgumentParser()
args = arg_parser.parse_args([
self.get_url('/hidden/invalid_chars/'),
'--no-host-directories',
'--no-remove-listing',
])
builder = Builder(args, unit_test=True)

app = builder.build()
exit_code = yield From(app.run())
print(list(os.walk('.')))

self.assertEqual(0, exit_code)
self.assertTrue(os.path.exists('.listing'))


@trollius.coroutine
Expand Down
6 changes: 4 additions & 2 deletions wpull/ftp/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,8 @@ def read_listing_content(self, file, duration_timeout=None):
self._response.body.seek(0)

machine_listings = wpull.ftp.util.parse_machine_listing(
self._response.body.read().decode('latin-1'),
self._response.body.read().decode('utf-8',
errors='surrogateescape'),
convert=True, strict=False
)
listings = list(
Expand All @@ -300,7 +301,8 @@ def read_listing_content(self, file, duration_timeout=None):
else:
self._response.body.seek(0)

file = io.TextIOWrapper(self._response.body, encoding='latin-1')
file = io.TextIOWrapper(self._response.body, encoding='utf-8',
errors='surrogateescape')

listing_parser = ListingParser(file=file)
heuristics_result = listing_parser.run_heuristics()
Expand Down
2 changes: 1 addition & 1 deletion wpull/ftp/client_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def test_fetch_listing(self):
self.assertEqual(5, len(response.files))
self.assertEqual('junk', response.files[0].name)
self.assertEqual('example1', response.files[1].name)
self.assertEqual('example2', response.files[2].name)
self.assertEqual('example2💎', response.files[2].name)
self.assertEqual('example (copy).txt', response.files[3].name)
self.assertEqual('readme.txt', response.files[4].name)

Expand Down
19 changes: 11 additions & 8 deletions wpull/ftp/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
class Command(SerializableMixin, DictableMixin):
'''FTP request command.
Encoding is Latin-1.
Encoding is UTF-8.
Attributes:
name (str): The command. Usually 4 characters or less.
Expand Down Expand Up @@ -42,11 +42,12 @@ def parse(self, data):
if not match:
raise ProtocolError('Failed to parse command.')

self.name = match.group(1).decode('latin-1')
self.argument = match.group(2).decode('latin-1')
self.name = match.group(1).decode('utf-8', errors='surrogateescape')
self.argument = match.group(2).decode('utf-8', errors='surrogateescape')

def to_bytes(self):
return '{0} {1}\r\n'.format(self.name, self.argument).encode('latin-1')
return '{0} {1}\r\n'.format(self.name, self.argument).encode(
'utf-8', errors='surrogateescape')

def to_dict(self):
return {
Expand All @@ -58,7 +59,7 @@ def to_dict(self):
class Reply(SerializableMixin, DictableMixin):
'''FTP reply.
Encoding is always Latin-1.
Encoding is always UTF-8.
Attributes:
code (int): Reply code.
Expand All @@ -80,9 +81,11 @@ def parse(self, data):
self.code = int(match.group(1))

if self.text is None:
self.text = match.group(3).decode('latin-1')
self.text = match.group(3).decode('utf-8',
errors='surrogateescape')
else:
self.text += '\r\n{0}'.format(match.group(3).decode('latin-1'))
self.text += '\r\n{0}'.format(match.group(3).decode(
'utf-8', errors='surrogateescape'))

def to_bytes(self):
assert self.code is not None
Expand All @@ -99,7 +102,7 @@ def to_bytes(self):
else:
lines.append('{0}-{1}\r\n'.format(self.code, line))

return ''.join(lines).encode('latin-1')
return ''.join(lines).encode('utf-8', errors='surrogateescape')

def to_dict(self):
return {
Expand Down
18 changes: 13 additions & 5 deletions wpull/recorder/warc.py
Original file line number Diff line number Diff line change
Expand Up @@ -669,25 +669,33 @@ def end_control(self, response, connection_closed=False):

def request_control_data(self, data):
text = textwrap_indent(
data.decode('latin-1'), '> ', predicate=lambda line: True
data.decode('utf-8', errors='surrogateescape'),
'> ', predicate=lambda line: True
)
self._control_record.block_file.write(
text.encode('utf-8', errors='surrogateescape')
)
self._control_record.block_file.write(text.encode('latin-1'))

if not data.endswith(b'\n'):
self._control_record.block_file.write(b'\n')

def response_control_data(self, data):
text = textwrap_indent(
data.decode('latin-1'), '< ', predicate=lambda line: True
data.decode('utf-8', errors='surrogateescape'),
'< ', predicate=lambda line: True
)
self._control_record.block_file.write(
text.encode('utf-8', errors='surrogateescape')
)
self._control_record.block_file.write(text.encode('latin-1'))

if not data.endswith(b'\n'):
self._control_record.block_file.write(b'\n')

def _write_control_event(self, text):
text = textwrap_indent(text, '* ', predicate=lambda line: True)
self._control_record.block_file.write(text.encode('latin-1'))
self._control_record.block_file.write(
text.encode('utf-8', errors='surrogateescape')
)

if not text.endswith('\n'):
self._control_record.block_file.write(b'\n')
Expand Down
22 changes: 12 additions & 10 deletions wpull/testing/ftp.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@ def __init__(self, reader, writer):
self.routes = {
'/':
('dir',
b'junk\nexample1\nexample2\nexample (copy).txt\nreadme.txt\n',
'junk\nexample1\nexample2💎\nexample (copy).txt\nreadme.txt\n'.encode('utf8'),
('drw-r--r-- 1 smaug smaug 0 Apr 01 00:00 junk\r\n'
'drw-r--r-- 1 smaug smaug 0 Apr 01 00:00 example1\r\n'
'drw-r--r-- 1 smaug smaug 0 Apr 01 00:00 example2\r\n'
'drw-r--r-- 1 smaug smaug 0 Apr 01 00:00 example2💎\r\n'
'-rw-r--r-- 1 smaug smaug 42 Apr 01 00:00 example (copy).txt\r\n'
'lrwxrwxrwx 1 smaug smaug 4 Apr 01 00:00 readme.txt -> example (copy).txt\r\n'
).encode('utf-8')),
Expand All @@ -61,16 +61,18 @@ def __init__(self, reader, writer):
('dir', b'loopy', b'loopy'),
'/example1/loopy':
('symlink', '/'),
'/example2':
'/example2💎':
('dir', b'trash.txt',
('02-09-2010 03:00PM 13 trash.txt\r\n'
).encode('utf-8'),
b'type=file; trash.txt\n'
),
'/example2/trash.txt':
'/example2💎/trash.txt':
('file', b'hello dragon!'),
'/hidden/sleep.txt':
('file', b'asdf'),
'/hidden/invalid_chars':
('dir', b'wow\xf0\xff', b'wow\xf0\xff'),
}
self.command = None
self.arg = None
Expand All @@ -93,9 +95,9 @@ def process(self):
return

try:
command, arg = line.decode('latin-1').split(' ', 1)
command, arg = line.decode('utf-8', errors='replace').split(' ', 1)
except ValueError:
command = line.decode('latin-1').strip()
command = line.decode('utf-8', errors='replace').strip()
arg = ''

self.command = command.upper()
Expand Down Expand Up @@ -179,7 +181,7 @@ def data_server_cb(data_reader, data_writer):
else:
self.writer.write('227 Now passive mode (127,0,0,1,{},{})\r\n'
.format(big_port_num, small_port_num)
.encode('latin-1'))
.encode('utf-8'))

@trollius.coroutine
def _wait_data_writer(self):
Expand Down Expand Up @@ -281,7 +283,7 @@ def _cmd_retr(self):
self.writer.write(b'226 End data\r\n')
self.data_server.close()

if self.path == '/example2/trash.txt':
if self.path == '/example2💎/trash.txt':
self.writer.close()
else:
self.writer.write(b'550 File error\r\n')
Expand All @@ -294,7 +296,7 @@ def _cmd_size(self):
self.writer.write(b'213 ')
self.writer.write(str(len(info[1])).encode('ascii'))
self.writer.write(b'\r\n')
elif info and info[0] == 'file' and self.path == '/example2/trash.txt':
elif info and info[0] == 'file' and self.path == '/example2💎/trash.txt':
self.writer.write(b'213 3.14\r\n')
else:
self.writer.write(b'550 Unknown command\r\n')
Expand All @@ -315,7 +317,7 @@ def _cmd_rest(self):

@trollius.coroutine
def _cmd_cwd(self):
if self.arg in ('example1', 'example2', '/'):
if self.arg in ('example1', 'example2💎', '/'):
self.writer.write(b'250 Changed directory\r\n')
else:
self.writer.write(b'550 Change directory error\r\n')
Expand Down

0 comments on commit bbb6b8d

Please sign in to comment.