Skip to content
This repository has been archived by the owner on Feb 10, 2021. It is now read-only.

Commit

Permalink
Be more strict about unicode/bytes conversions (#147)
Browse files Browse the repository at this point in the history
* Be more strict about unicode/bytes conversions

Previously this library relied upon duck-typed behavior, instead of
following precise schemas for the conversions between ctypes structs and
dictionaries of python objects. This led to unicode decode errors, as
bytestrings not representing human readable content were being decoded.

This fixes this issue in the following ways:

- Add `to_dict` methods to each relevant struct class that knows exactly
which fields are bytes and which are strings, and handles all
conversions appropriately.
- Make `ensure_bytes` and `ensure_string` more strict in terms of input
and output.

* fileds -> fields
  • Loading branch information
jcrist committed Jan 22, 2018
1 parent 264de72 commit 199a7ff
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 40 deletions.
25 changes: 4 additions & 21 deletions hdfs3/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,9 +292,9 @@ def info(self, path):
if not self.exists(path):
raise FileNotFoundError(path)
fi = _lib.hdfsGetPathInfo(self._handle, ensure_bytes(path)).contents
out = info_to_dict(fi)
out = fi.to_dict()
_lib.hdfsFreeFileInfo(ctypes.byref(fi), 1)
return ensure_string(out)
return out

def isdir(self, path):
"""Return True if path refers to an existing directory."""
Expand Down Expand Up @@ -381,7 +381,7 @@ def ls(self, path, detail=False):
num = ctypes.c_int(0)
fi = _lib.hdfsListDirectory(self._handle, ensure_bytes(path),
ctypes.byref(num))
out = [ensure_string(info_to_dict(fi[i])) for i in range(num.value)]
out = [fi[i].to_dict() for i in range(num.value)]
_lib.hdfsFreeFileInfo(fi, num.value)
if detail:
return out
Expand Down Expand Up @@ -637,7 +637,7 @@ def list_encryption_zones(self):
msg = ensure_string(_lib.hdfsGetLastError()).split('\n')[0]
raise IOError("EZ listing failed: %s" % msg)

res = [struct_to_dict(out[i]) for i in range(x.value)]
res = [out[i].to_dict() for i in range(x.value)]
if res:
_lib.hdfsFreeEncryptionZoneInfo(out, x)
return res
Expand All @@ -658,23 +658,6 @@ def get_lib():
_lib = l


def struct_to_dict(s):
""" Return dictionary views of a simple ctypes record-like structure """
return dict((ensure_string(name), getattr(s, name))
for (name, p) in s._fields_)


def info_to_dict(s):
""" Process data returned by hdfsInfo """
d = struct_to_dict(s)
d['kind'] = {68: 'directory', 70: 'file'}[d['kind']]
if d['encryption_info']:
d['encryption_info'] = struct_to_dict(d['encryption_info'].contents)
else:
d['encryption_info'] = None
return d


mode_numbers = {'w': 1, 'r': 0, 'a': 1025,
'wb': 1, 'rb': 0, 'ab': 1025}

Expand Down
39 changes: 38 additions & 1 deletion hdfs3/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
"""
Low-level interface to libhdfs3
"""
from __future__ import absolute_import

import sys
import ctypes as ct
from .utils import ensure_string


PY3 = sys.version_info.major > 2
Expand Down Expand Up @@ -47,14 +49,29 @@ class EncryptionFileInfo(ct.Structure):
('mIv', ct.c_char_p),
('zone_key_version_name', ct.c_char_p)]

def to_dict(self):
return {'suite': self.suite,
'protocol_version': self.protocol_version,
'key': self.key,
'key_name': ensure_string(self.key_name),
'mIv': self.mIv,
'zone_key_version_name': self.zone_key_version_name}


class EncryptionZoneInfo(ct.Structure):
_fileds_ = [('suite', ct.c_int),
_fields_ = [('suite', ct.c_int),
('protocol_version', ct.c_int),
('mId', ct.c_int64),
('path', ct.c_char_p),
('key_name', ct.c_char_p)]

def to_dict(self):
return {'suite': self.suite,
'protocol_version': self.protocol_version,
'mId': self.mId,
'path': self.path,
'key_name': ensure_string(self.key_name)}


class FileInfo(ct.Structure):
_fields_ = [('kind', ct.c_int8),
Expand All @@ -69,6 +86,26 @@ class FileInfo(ct.Structure):
('last_access', ct.c_int64), # time_t, could be 32bit
('encryption_info', ct.POINTER(EncryptionFileInfo))]

def to_dict(self):
if self.encryption_info:
e_info = self.encryption_info.contents.to_dict()
else:
e_info = None

kind = {68: 'directory', 70: 'file'}.get(self.kind)

return {'kind': kind,
'name': ensure_string(self.name),
'last_mod': self.last_mod,
'size': self.size,
'replication': self.replication,
'block_size': self.block_size,
'owner': ensure_string(self.owner),
'group': ensure_string(self.group),
'permissions': self.permissions,
'last_access': self.last_access,
'encryption_info': e_info}


hdfsGetPathInfo = _lib.hdfsGetPathInfo
hdfsGetPathInfo.argtypes = [ct.c_char_p]
Expand Down
2 changes: 0 additions & 2 deletions hdfs3/tests/test_hdfs3.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,8 +627,6 @@ def test_ensure():
assert isinstance(ensure_bytes(b''), bytes)
assert isinstance(ensure_string(''), unicode)
assert isinstance(ensure_string(b''), unicode)
assert ensure_string({'x': b'', 'y': ''}) == {'x': '', 'y': ''}
assert ensure_bytes({'x': b'', 'y': ''}) == {'x': b'', 'y': b''}


def test_touch_exists(hdfs):
Expand Down
23 changes: 7 additions & 16 deletions hdfs3/utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from __future__ import absolute_import

from contextlib import contextmanager
import os
import shutil
import tempfile

from .compatibility import PY3
from .compatibility import PY3, bytes, unicode


def seek_delimiter(file, delimiter, blocksize, allow_zero=True):
Expand Down Expand Up @@ -94,9 +95,7 @@ def read_block(f, offset, length, delimiter=None):

def ensure_bytes(s):
""" Give strings that ctypes is guaranteed to handle """
if PY3 and isinstance(s, bytes):
return s
if not PY3 and isinstance(s, str):
if isinstance(s, bytes):
return s
if hasattr(s, 'encode'):
return s.encode()
Expand All @@ -106,11 +105,8 @@ def ensure_bytes(s):
return bytes(s)
if not PY3 and hasattr(s, 'tostring'):
return s.tostring()
if isinstance(s, dict):
return {k: ensure_bytes(v) for k, v in s.items()}
else:
# Perhaps it works anyway - could raise here
return s
# Perhaps it works anyway - could raise here
return s


def ensure_string(s):
Expand All @@ -120,15 +116,10 @@ def ensure_string(s):
'123'
>>> ensure_string('123')
'123'
>>> ensure_string({'x': b'123'})
{'x': '123'}
"""
if isinstance(s, dict):
return {k: ensure_string(v) for k, v in s.items()}
if hasattr(s, 'decode'):
if not isinstance(s, unicode):
return s.decode()
else:
return s
return s


def ensure_trailing_slash(s, ensure=True):
Expand Down

0 comments on commit 199a7ff

Please sign in to comment.