diff --git a/README.md b/README.md index 7ef75f24..4b9ed034 100644 --- a/README.md +++ b/README.md @@ -332,6 +332,28 @@ with Stream(source, format='csv', headers=1, ignore_blank_headers=True) as strea stream.read(keyed=True) # {'header1': 'value1', 'header3': 'value3'} ``` +#### Ignore listed/not-listed headers + +These options are similar to `ignore_blank_headers`. They remove arbitrary columns from the data based on the corresponding column names: + +```python +# Ignore listed headers (omit columns) +source = 'text://header1,header2,header3\nvalue1,value2,value3' +with Stream(source, format='csv', headers=1, ignore_listed_headers=['header2']) as stream: + assert stream.headers == ['header1', 'header3'] + assert stream.read(keyed=True) == [ + {'header1': 'value1', 'header3': 'value3'}, + ] + +# Ignore NOT listed headers (pick columns) +source = 'text://header1,header2,header3\nvalue1,value2,value3' +with Stream(source, format='csv', headers=1, ignore_not_listed_headers=['header2']) as stream: + assert stream.headers == ['header2'] + assert stream.read(keyed=True) == [ + {'header2': 'value2'}, + ] +``` + #### Force strings When `True`, all rows' values will be converted to strings (defaults to @@ -781,7 +803,7 @@ Options: ### `Stream` ```python -Stream(self, source, headers=None, scheme=None, format=None, encoding=None, compression=None, allow_html=False, sample_size=100, bytes_sample_size=10000, ignore_blank_headers=False, force_strings=False, force_parse=False, skip_rows=[], post_parse=[], custom_loaders={}, custom_parsers={}, custom_writers={}, **options) +Stream(self, source, headers=None, scheme=None, format=None, encoding=None, compression=None, allow_html=False, sample_size=100, bytes_sample_size=10000, ignore_blank_headers=False, ignore_listed_headers=None, ignore_not_listed_headers=None, force_strings=False, force_parse=False, skip_rows=[], post_parse=[], custom_loaders={}, custom_parsers={}, custom_writers={}, **options) ``` Stream of tabular data. 
@@ -826,6 +848,12 @@ __Arguments__ - __ignore_blank_headers (bool, optional)__: When True, ignores all columns that have blank headers. Defaults to False. +- __ignore_listed_headers (List[str], optional)__: + When passed, ignores all columns with headers + that the given list includes +- __ignore_not_listed_headers (List[str], optional)__: + When passed, ignores all columns with headers + that the given list DOES NOT include - __force_strings (bool, optional)__: When True, casts all data to strings. Defaults to False. @@ -1257,6 +1285,10 @@ $ make test Here described only breaking and the most important changes. The full changelog and documentation for all released versions could be found in nicely formatted [commit history](https://github.com/frictionlessdata/tabulator-py/commits/master). +#### v1.32 + +- Added ability to skip columns (#293) + #### v1.31 - Added `xlsx` writer diff --git a/tabulator/VERSION b/tabulator/VERSION index a59db5df..7b4009ac 100644 --- a/tabulator/VERSION +++ b/tabulator/VERSION @@ -1,2 +1,2 @@ -1.31.2 +1.32.0 diff --git a/tabulator/stream.py b/tabulator/stream.py index af506e79..0b386832 100644 --- a/tabulator/stream.py +++ b/tabulator/stream.py @@ -75,6 +75,14 @@ class Stream(object): When True, ignores all columns that have blank headers. Defaults to False. + ignore_listed_headers (List[str], optional): + When passed, ignores all columns with headers + that the given list includes + + ignore_not_listed_headers (List[str], optional): + When passed, ignores all columns with headers + that the given list DOES NOT include + force_strings (bool, optional): When True, casts all data to strings. Defaults to False. 
@@ -127,6 +135,8 @@ def __init__(self, sample_size=config.DEFAULT_SAMPLE_SIZE, bytes_sample_size=config.DEFAULT_BYTES_SAMPLE_SIZE, ignore_blank_headers=False, + ignore_listed_headers=None, + ignore_not_listed_headers=None, force_strings=False, force_parse=False, skip_rows=[], @@ -178,7 +188,9 @@ def __init__(self, self.__sample_size = sample_size self.__bytes_sample_size = bytes_sample_size self.__ignore_blank_headers = ignore_blank_headers - self.__blank_header_indexes = [] + self.__ignore_listed_headers = ignore_listed_headers + self.__ignore_not_listed_headers = ignore_not_listed_headers + self.__ignored_headers_indexes = [] self.__force_strings = force_strings self.__force_parse = force_parse self.__post_parse = copy(post_parse) @@ -614,16 +626,31 @@ def __extract_headers(self): if row_number == self.__headers_row_last: break - # Ignore blank headers - if self.__ignore_blank_headers: - self.__blank_header_indexes = [] + # Ignore headers + if (self.__ignore_blank_headers or + self.__ignore_listed_headers is not None or + self.__ignore_not_listed_headers is not None): + self.__ignored_headers_indexes = [] raw_headers, self.__headers = self.__headers, [] for index, header in list(enumerate(raw_headers)): + ignore = False + # Ignore blank headers if header in ['', None]: - self.__blank_header_indexes.append(index) + ignore = True + # Ignore listed headers + if self.__ignore_listed_headers is not None: + if header in self.__ignore_listed_headers: + ignore = True + # Ignore not-listed headers + if self.__ignore_not_listed_headers is not None: + if header not in self.__ignore_not_listed_headers: + ignore = True + # Add to the list and skip + if ignore: + self.__ignored_headers_indexes.append(index) continue self.__headers.append(header) - self.__blank_header_indexes = sorted(self.__blank_header_indexes, reverse=True) + self.__ignored_headers_indexes = sorted(self.__ignored_headers_indexes, reverse=True) # Remove headers from data if not keyed_source: @@ -661,9 +688,9 
@@ def builtin_processor(extended_rows): if self.__check_if_row_for_skipping(row_number, headers, row): continue - # Ignore blank headers - if self.__blank_header_indexes: - for index in self.__blank_header_indexes: + # Ignore headers + if self.__ignored_headers_indexes: + for index in self.__ignored_headers_indexes: if index < len(row): row = row[:index] + row[index+1:] diff --git a/tests/test_stream.py b/tests/test_stream.py index 4f603989..4d6c2e9f 100644 --- a/tests/test_stream.py +++ b/tests/test_stream.py @@ -287,6 +287,25 @@ def test_stream_ignore_blank_headers_true(): assert stream.read(keyed=True) == data +# Ignore listed/not_listed headers + +def test_stream_ignore_listed_headers(): + source = 'text://header1,header2,header3\nvalue1,value2,value3' + with Stream(source, format='csv', headers=1, ignore_listed_headers=['header2']) as stream: + assert stream.headers == ['header1', 'header3'] + assert stream.read(keyed=True) == [ + {'header1': 'value1', 'header3': 'value3'}, + ] + +def test_stream_ignore_not_listed_headers(): + source = 'text://header1,header2,header3\nvalue1,value2,value3' + with Stream(source, format='csv', headers=1, ignore_not_listed_headers=['header2']) as stream: + assert stream.headers == ['header2'] + assert stream.read(keyed=True) == [ + {'header2': 'value2'}, + ] + + # Force strings def test_stream_force_strings():