Skip to content
This repository has been archived by the owner on Jul 11, 2023. It is now read-only.

Added support for regex patterns in skip_rows #290

Merged
merged 6 commits into from
Jan 30, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
35 changes: 23 additions & 12 deletions tabulator/stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from __future__ import absolute_import
from __future__ import unicode_literals

import re
import six
import gzip
import zipfile
Expand Down Expand Up @@ -84,9 +85,10 @@ class Stream(object):
to False.

skip_rows (List[Union[int, str]], optional):
List of row numbers and
strings to skip. If a string, it'll skip rows that begin with it
(e.g. '#' and '//').
List of row numbers, strings and regex patterns to skip.
If a string, it'll skip rows that begin with it e.g. '#' and '//'.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

'rows that begin with' is a bit ambiguous - perhaps 'rows whose first cells begin with the string or match the regex'

To provide a regex pattern start it with `^` e.g. `^skip_me.*`
For example: `skip_rows=[1, '# comment', '^# (regex|comment)']`

post_parse (List[function], optional):
List of generator functions that
Expand Down Expand Up @@ -152,10 +154,13 @@ def __init__(self,

# Set skip rows
self.__skip_rows_by_numbers = []
self.__skip_rows_by_patterns = []
self.__skip_rows_by_comments = []
for directive in copy(skip_rows):
if isinstance(directive, int):
self.__skip_rows_by_numbers.append(directive)
elif directive.startswith('^'):
self.__skip_rows_by_patterns.append(re.compile(directive))
else:
self.__skip_rows_by_comments.append(str(directive))

Expand Down Expand Up @@ -716,15 +721,21 @@ def __check_if_row_for_skipping(self, row_number, headers, row):
if row_number in self.__skip_rows_by_numbers:
return True

# Get first cell
cell = row[0] if row else None

# Handle empty cell
if cell is None:
return '' in self.__skip_rows_by_comments

# Skip by pattern
for pattern in self.__skip_rows_by_patterns:
if bool(pattern.match(cell)):
return True

# Skip by comment
if not row:
return False
match = lambda comment: (
(isinstance(row[0], six.string_types) and
row[0].startswith(comment)) if len(comment) > 0
else row[0] in ('', None)
)
if any(map(match, self.__skip_rows_by_comments)):
return True
for comment in filter(None, self.__skip_rows_by_comments):
if cell.startswith(comment):
return True

return False
6 changes: 3 additions & 3 deletions tests/schemes/test_aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
# Stream

# https://github.com/frictionlessdata/tabulator-py/issues/271
@pytest.mark.skipif(os.environ.get('TRAVIS') == 'true', reason='See issue #271')
@pytest.mark.skip
def test_stream_s3(s3_client, bucket):

# Upload a file
Expand All @@ -40,7 +40,7 @@ def test_stream_s3(s3_client, bucket):


# https://github.com/frictionlessdata/tabulator-py/issues/271
@pytest.mark.skipif(os.environ.get('TRAVIS') == 'true', reason='See issue #271')
@pytest.mark.skip
def test_stream_s3_endpoint_url(s3_client, bucket):

# Upload a file
Expand All @@ -57,7 +57,7 @@ def test_stream_s3_endpoint_url(s3_client, bucket):


# https://github.com/frictionlessdata/tabulator-py/issues/271
# Skipped until issue #271 is resolved.
@pytest.mark.skip
def test_stream_s3_non_existent_file(s3_client, bucket):
    # Opening a key that was never uploaded must raise tabulator's IOError.
    source = 's3://%s/table.csv' % bucket
    with pytest.raises(exceptions.IOError):
        Stream(source).open()
Expand Down
7 changes: 7 additions & 0 deletions tests/test_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,13 @@ def test_stream_skip_rows_with_headers_example_from_readme():
assert stream.read() == [['John', 1], ['Alex', 2]]


def test_stream_skip_rows_regex():
    # A plain string directive skips by prefix; a directive starting with
    # '^' is treated as a regex matched against the first cell.
    sample = [
        ['# comment'],
        ['name', 'order'],
        ['# cat'],
        ['# dog'],
        ['John', 1],
        ['Alex', 2],
    ]
    with Stream(sample, headers=1, skip_rows=['# comment', r'^# (cat|dog)']) as stream:
        assert stream.headers == ['name', 'order']
        assert stream.read() == [['John', 1], ['Alex', 2]]


# Post parse

def test_stream_post_parse_headers():
Expand Down