Skip to content

Commit

Permalink
Merge pull request #174 from martindurant/in_numbers
Browse files Browse the repository at this point in the history
Fix "in" filter to cope with strings that could be numbers
  • Loading branch information
martindurant committed Jun 26, 2017
2 parents 50e7c50 + 46295eb commit 5d598ac
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 8 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ cover
build
dist
parquet.egg-info
.tox
.idea
.cached
13 changes: 6 additions & 7 deletions fastparquet/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,19 @@

from collections import OrderedDict
import json
import operator
import os
import re
import six
import struct

import numpy as np
import pandas as pd
import thriftpy
import warnings

from .core import read_thrift
from .thrift_structures import parquet_thrift
from . import core, schema, converted_types, encoding, dataframe
from .util import (default_open, ParquetException, sep_from_open, val_to_num,
from .util import (default_open, ParquetException, val_to_num,
ensure_bytes, check_column_names, metadata_from_many,
ex_from_sep, created_by)
ex_from_sep)


class ParquetFile(object):
Expand Down Expand Up @@ -685,7 +682,9 @@ def filter_out_cats(rg, filters, sep='/'):

app_filters = [f[1:] for f in filters if f[0] == cat]
for op, val in app_filters:
if isinstance(val, str):
tstr = six.string_types + (six.text_type, )
if isinstance(val, tstr) or (isinstance(val, (tuple, list)) and
all(isinstance(x, tstr) for x in val)):
v0 = v
else:
v0 = val_to_num(v)
Expand Down
12 changes: 12 additions & 0 deletions fastparquet/test/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,18 @@ def test_in_filter(tempdir):
assert set(out.symbols) == {'a', 'c'}


def test_in_filter_numbers(tempdir):
    """Check the 'in' filter against a hive-partitioned integer column.

    The frame is written partitioned on the integer 'values' column. The
    same set of symbols must come back whether the filter values are
    given as numeric-looking strings or as integers.
    """
    frame = pd.DataFrame(data={
        'symbols': ['a', 'a', 'b', 'c', 'c', 'd'],
        'values': [1, 2, 3, 4, 5, 6],
    })
    write(tempdir, frame, file_scheme='hive', partition_on=['values'])
    pf = ParquetFile(tempdir)

    # String form first, then integer form; both target partitions 1 and 4.
    for wanted in (['1', '4'], [1, 4]):
        result = pf.to_pandas(filters=[('values', 'in', wanted)])
        assert set(result.symbols) == {'a', 'c'}


def test_filter_stats(tempdir):
df = pd.DataFrame({
'x': [1, 2, 3, 4, 5, 6, 7],
Expand Down

0 comments on commit 5d598ac

Please sign in to comment.