Skip to content

Commit

Permalink
Merge pull request #36 from martindurant/speedups
Browse files (browse the repository at this point in the history)
speedup
  • Loading branch information
martindurant committed Dec 1, 2016
2 parents 18a9c38 + 4f906fc commit ff1d5d9
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 20 deletions.
5 changes: 3 additions & 2 deletions fastparquet/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,10 @@ def grab_cats(self, columns, row_group_index=0):
-------
{column: [list, of, values]}
"""
if len(columns) == 0:
return {}
rg = self.row_groups[row_group_index]
ofname = self.sep.join([os.path.dirname(self.fn),
rg.columns[0].file_path])
ofname = self.row_group_filename(rg)
out = {}

with self.open(ofname, 'rb') as f:
Expand Down
4 changes: 2 additions & 2 deletions fastparquet/benchmarks/columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def time_column():
out = pf.to_pandas()

with measure('write random times, no nulls but has_null=True', result):
write(fn, df)
write(fn, df, has_nulls=True)

pf = ParquetFile(fn)
out = pf.to_pandas() # warm-up
Expand All @@ -45,7 +45,7 @@ def time_column():

df.loc[n//2, 'x'] = pd.to_datetime('NaT')
with measure('write random times, with null', result):
write(fn, df)
write(fn, df, has_nulls=True)

pf = ParquetFile(fn)
out = pf.to_pandas() # warm-up
Expand Down
2 changes: 1 addition & 1 deletion fastparquet/converted_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def convert(data, se):
data: pandas series of primitive type
se: a schema element.
"""
data = np.asarray(data, dtype=simple[se.type])
#data = np.asarray(data, dtype=simple[se.type])
ctype = se.converted_type
if ctype is None:
return data
Expand Down
39 changes: 24 additions & 15 deletions fastparquet/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ def read_col(column, schema_helper, infile, use_cat=False,
any_def = any(_[0] is not None for _ in out)
do_convert = True
if all_dict:
final = np.empty(cmd.num_values, np.int64)
dtype = np.int64
my_nan = -1
do_convert = False
else:
Expand All @@ -232,23 +232,32 @@ def read_col(column, schema_helper, infile, use_cat=False,
my_nan = -9223372036854775808 # int64 version of NaT
else:
my_nan = None
final = np.empty(cmd.num_values, dtype)
start = 0
for defi, rep, val, d in out:
if len(out) == 1 and not any_def:
defi, rep, val, d = out[0]
if d and not all_dict:
cval = dic[val]
final = dic[val]
elif do_convert:
cval = convert(val, se)
else:
cval = val
if defi is not None:
part = final[start:start+len(defi)]
part[defi != 1] = my_nan
part[defi == 1] = cval
start += len(defi)
final = convert(val, se)
else:
final[start:start+len(val)] = cval
start += len(val)
final = val
else:
final = np.empty(cmd.num_values, dtype)
start = 0
for defi, rep, val, d in out:
if d and not all_dict:
cval = dic[val]
elif do_convert:
cval = convert(val, se)
else:
cval = val
if defi is not None:
part = final[start:start+len(defi)]
part[defi != 1] = my_nan
part[defi == 1] = cval
start += len(defi)
else:
final[start:start+len(val)] = cval
start += len(val)
if all_dict:
final = pd.Categorical.from_codes(final, categories=dic)
return final
Expand Down

0 comments on commit ff1d5d9

Please sign in to comment.