forked from jcrobak/parquet-python
-
-
Notifications
You must be signed in to change notification settings - Fork 172
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #202 from martindurant/cleanup
Cleanup
- Loading branch information
Showing
9 changed files
with
150 additions
and
141 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
import os | ||
import pickle | ||
from fastparquet import ParquetFile, parquet_thrift | ||
from fastparquet.test.util import TEST_DATA | ||
from fastparquet.schema import schema_tree | ||
|
||
|
||
def test_serialize():
    """Check that file metadata and a row group survive a pickle round trip."""
    path = os.path.join(TEST_DATA, "nation.impala.parquet")
    parquet_file = ParquetFile(path)

    restored_fmd = pickle.loads(pickle.dumps(parquet_file.fmd))
    # because we added fake fields when loading pf
    schema_tree(restored_fmd.schema)
    assert restored_fmd == parquet_file.fmd

    original_rg = parquet_file.row_groups[0]
    restored_rg = pickle.loads(pickle.dumps(original_rg))
    assert original_rg == restored_rg
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,130 @@ | ||
import os | ||
import io | ||
from thrift.protocol.TCompactProtocol import TCompactProtocolAccelerated as TCompactProtocol | ||
from thrift.protocol.TProtocol import TProtocolException | ||
|
||
# import thriftpy | ||
from .parquet_thrift.parquet import ttypes as parquet_thrift | ||
|
||
|
||
# THRIFT_FILE = os.path.join(os.path.dirname(__file__), "parquet.thrift") | ||
# parquet_thrift = thriftpy.load(THRIFT_FILE, module_name="parquet_thrift") # pylint: disable=invalid-name | ||
def read_thrift(file_obj, ttype):
    """Read a thrift structure from the given file object.

    Parameters
    ----------
    file_obj: open file-like object (binary mode), positioned at the start
        of the serialised structure
    ttype: thrift class; it is instantiated with no arguments and populated
        through its ``read`` method

    Returns
    -------
    The populated ``ttype`` instance. Before returning, ``file_obj`` is
    seeked back to compensate for the read-ahead done by the buffered
    transport.
    """
    from thrift.transport.TTransport import TFileObjectTransport, TBufferedTransport

    starting_pos = file_obj.tell()

    # set up the protocol chain
    bufsize = 2 ** 16
    ft = TFileObjectTransport(file_obj)
    # for accelerated reading ensure that we wrap this so that the CReadable
    # transport can be used.
    bt = TBufferedTransport(ft, bufsize)
    pin = TCompactProtocol(bt)

    # read out type
    obj = ttype()
    obj.read(pin)

    # The read will actually overshoot due to the buffering that thrift does.
    # Seek backwards to the correct spot.
    buffer_pos = bt.cstringio_buf.tell()
    ending_pos = file_obj.tell()
    # Count of whole buffers before the last one; clamped at zero.
    blocks = max(((ending_pos - starting_pos) // bufsize) - 1, 0)
    file_obj.seek(starting_pos + blocks * bufsize + buffer_pos)
    return obj
|
||
|
||
def write_thrift(fobj, thrift):
    """Write binary compact representation of a thrift structured object

    Parameters
    ----------
    fobj: open file-like object (binary mode)
    thrift: thrift object to write

    Returns
    -------
    Number of bytes written

    Raises
    ------
    ParquetException
        When ``thrift.write`` raises ``TProtocolException``. The message
        includes the original error and, when a ``write_struct`` frame is
        found in the traceback, the object and field name taken from that
        frame's locals (otherwise ``None``).
    """
    # Imported locally: ``sys`` is not in the module's visible import block,
    # so the error path below would otherwise fail with a NameError.
    import sys

    t0 = fobj.tell()
    pout = TCompactProtocol(fobj)
    failure = None
    try:
        thrift.write(pout)
    except TProtocolException:
        _, val, tb = sys.exc_info()
        # Walk the traceback to the first frame whose code object mentions
        # ``write_struct``; its locals name the offending object and field.
        obj = name = None
        while tb is not None:
            if 'write_struct' in str(tb.tb_frame.f_code):
                frame_locals = tb.tb_frame.f_locals
                obj = frame_locals.get('obj')
                name = frame_locals.get('fname')
                break
            tb = tb.tb_next
        del tb  # do not keep the traceback alive longer than needed
        reason = val.args[0] if val.args else val
        failure = ('Thrift parameter validation failure %s'
                   ' when writing: %s-> Field: %s' % (reason, obj, name))
    # Raised outside the ``except`` block, as the original did.
    # NOTE(review): ParquetException is not imported in the visible part of
    # this module - confirm it is in scope.
    if failure is not None:
        raise ParquetException(failure)
    return fobj.tell() - t0
|
||
|
||
def is_thrift_item(item):
    """Return True when ``item`` has both a ``thrift_spec`` and a ``read`` attribute."""
    return all(hasattr(item, attr) for attr in ('thrift_spec', 'read'))
|
||
|
||
def thrift_copy(structure):
    """
    Recursively copy a thrift structure

    A new instance of the same class is created and every public attribute
    (other than the thrift plumbing names) is set on it. Nested thrift items,
    including those inside lists, are copied recursively; all other values
    are assigned as-is.
    """
    skipped = ['thrift_spec', 'read', 'write', 'default_spec', 'validate']
    duplicate = structure.__class__()
    for attr in dir(structure):
        if attr.startswith('_') or attr in skipped:
            continue
        value = getattr(structure, attr)
        if isinstance(value, list):
            copied = []
            for element in value:
                if is_thrift_item(element):
                    copied.append(thrift_copy(element))
                else:
                    copied.append(element)
        elif is_thrift_item(value):
            copied = thrift_copy(value)
        else:
            copied = value
        setattr(duplicate, attr, copied)
    return duplicate
|
||
|
||
def thrift_print(structure, offset=0):
    """
    Handy recursive text output for thrift structures

    Non-thrift values are rendered with ``str``. A thrift item is rendered
    as its class on the first line, followed by one ``key: value`` line per
    public attribute, each prefixed by ``offset`` spaces; nested items are
    rendered with the offset increased by two.
    """
    if not is_thrift_item(structure):
        return str(structure)
    hidden = ['thrift_spec', 'read', 'write', 'default_spec', 'validate']
    pad = ' ' * offset
    pieces = [str(structure.__class__), '\n']
    for attr in dir(structure):
        if attr.startswith('_') or attr in hidden:
            continue
        rendered = thrift_print(getattr(structure, attr), offset + 2)
        pieces.extend((pad, attr, ': ', rendered, '\n'))
    return ''.join(pieces)
|
||
|
||
# Use thrift_print as both str() and repr() for every attribute of the
# generated thrift module whose name starts with an upper-case letter.
for cls in dir(parquet_thrift):
    if not cls[0].isupper():
        continue
    c = getattr(parquet_thrift, cls)
    c.__str__ = c.__repr__ = thrift_print
|
||
|
||
def __getstate__(ob):
    """Return the bytes that write_thrift produces for ``ob``, for use as pickle state."""
    buffer = io.BytesIO()
    write_thrift(buffer, ob)
    return buffer.getvalue()
|
||
# Install the pickle hooks on the two thrift classes listed here.
for t in (parquet_thrift.FileMetaData, parquet_thrift.RowGroup):
    def __setstate__(ob, d, t=t):
        """Restore ``ob`` from bytes ``d`` by reading them back as a ``t``.

        ``t`` is bound as a default argument so each class keeps its own
        type rather than the loop's final value.
        """
        ob.__dict__ = read_thrift(io.BytesIO(d), t).__dict__
    t.__setstate__ = __setstate__
    t.__getstate__ = __getstate__
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters