Skip to content
This repository has been archived by the owner on Jul 11, 2023. It is now read-only.

Commit

Permalink
Merge branch 'master' into skip_rows_regex
Browse files Browse the repository at this point in the history
  • Loading branch information
roll committed Jan 30, 2020
2 parents fc1feb3 + c1604ff commit b232497
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 11 deletions.
34 changes: 33 additions & 1 deletion README.md
Expand Up @@ -332,6 +332,28 @@ with Stream(source, format='csv', headers=1, ignore_blank_headers=True) as strea
stream.read(keyed=True) # {'header1': 'value1', 'header3': 'value3'}
```

#### Ignore listed/not-listed headers

These options are similar to `ignore_blank_headers`. They remove arbitrary columns from the data based on the corresponding column names:

```python
# Ignore listed headers (omit columns)
source = 'text://header1,header2,header3\nvalue1,value2,value3'
with Stream(source, format='csv', headers=1, ignore_listed_headers=['header2']) as stream:
assert stream.headers == ['header1', 'header3']
assert stream.read(keyed=True) == [
{'header1': 'value1', 'header3': 'value3'},
]

# Ignore NOT listed headers (pick columns)
source = 'text://header1,header2,header3\nvalue1,value2,value3'
with Stream(source, format='csv', headers=1, ignore_not_listed_headers=['header2']) as stream:
assert stream.headers == ['header2']
assert stream.read(keyed=True) == [
{'header2': 'value2'},
]
```

#### Force strings

When `True`, all rows' values will be converted to strings (defaults to
Expand Down Expand Up @@ -781,7 +803,7 @@ Options:

### `Stream`
```python
Stream(self, source, headers=None, scheme=None, format=None, encoding=None, compression=None, allow_html=False, sample_size=100, bytes_sample_size=10000, ignore_blank_headers=False, force_strings=False, force_parse=False, skip_rows=[], post_parse=[], custom_loaders={}, custom_parsers={}, custom_writers={}, **options)
Stream(self, source, headers=None, scheme=None, format=None, encoding=None, compression=None, allow_html=False, sample_size=100, bytes_sample_size=10000, ignore_blank_headers=False, ignore_listed_headers=None, ignore_not_listed_headers=None, force_strings=False, force_parse=False, skip_rows=[], post_parse=[], custom_loaders={}, custom_parsers={}, custom_writers={}, **options)
```
Stream of tabular data.

Expand Down Expand Up @@ -826,6 +848,12 @@ __Arguments__
- __ignore_blank_headers (bool, optional)__:
When True, ignores all columns
that have blank headers. Defaults to False.
- __ignore_listed_headers (List[str], optional)__:
When passed, ignores all columns with headers
that the given list includes
- __ignore_not_listed_headers (List[str], optional)__:
When passed, ignores all columns with headers
that the given list DOES NOT include
- __force_strings (bool, optional)__:
When True, casts all data to strings.
Defaults to False.
Expand Down Expand Up @@ -1257,6 +1285,10 @@ $ make test

Only breaking and the most important changes are described here. The full changelog and documentation for all released versions can be found in the nicely formatted [commit history](https://github.com/frictionlessdata/tabulator-py/commits/master).

#### v1.32

- Added ability to skip columns (#293)

#### v1.31

- Added `xlsx` writer
Expand Down
2 changes: 1 addition & 1 deletion tabulator/VERSION
@@ -1,2 +1,2 @@
1.31.2
1.32.0

45 changes: 36 additions & 9 deletions tabulator/stream.py
Expand Up @@ -75,6 +75,14 @@ class Stream(object):
When True, ignores all columns
that have blank headers. Defaults to False.
ignore_listed_headers (List[str], optional):
When passed, ignores all columns with headers
that the given list includes
ignore_not_listed_headers (List[str], optional):
When passed, ignores all columns with headers
that the given list DOES NOT include
force_strings (bool, optional):
When True, casts all data to strings.
Defaults to False.
Expand Down Expand Up @@ -127,6 +135,8 @@ def __init__(self,
sample_size=config.DEFAULT_SAMPLE_SIZE,
bytes_sample_size=config.DEFAULT_BYTES_SAMPLE_SIZE,
ignore_blank_headers=False,
ignore_listed_headers=None,
ignore_not_listed_headers=None,
force_strings=False,
force_parse=False,
skip_rows=[],
Expand Down Expand Up @@ -178,7 +188,9 @@ def __init__(self,
self.__sample_size = sample_size
self.__bytes_sample_size = bytes_sample_size
self.__ignore_blank_headers = ignore_blank_headers
self.__blank_header_indexes = []
self.__ignore_listed_headers = ignore_listed_headers
self.__ignore_not_listed_headers = ignore_not_listed_headers
self.__ignored_headers_indexes = []
self.__force_strings = force_strings
self.__force_parse = force_parse
self.__post_parse = copy(post_parse)
Expand Down Expand Up @@ -614,16 +626,31 @@ def __extract_headers(self):
if row_number == self.__headers_row_last:
break

# Ignore blank headers
if self.__ignore_blank_headers:
self.__blank_header_indexes = []
# Ignore headers
if (self.__ignore_blank_headers or
self.__ignore_listed_headers is not None or
self.__ignore_not_listed_headers is not None):
self.__ignored_headers_indexes = []
raw_headers, self.__headers = self.__headers, []
for index, header in list(enumerate(raw_headers)):
ignore = False
# Ignore blank headers
if header in ['', None]:
self.__blank_header_indexes.append(index)
ignore = True
# Ignore listed headers
if self.__ignore_listed_headers is not None:
if header in self.__ignore_listed_headers:
ignore = True
# Ignore not-listed headers
if self.__ignore_not_listed_headers is not None:
if header not in self.__ignore_not_listed_headers:
ignore = True
# Add to the list and skip
if ignore:
self.__ignored_headers_indexes.append(index)
continue
self.__headers.append(header)
self.__blank_header_indexes = sorted(self.__blank_header_indexes, reverse=True)
self.__ignored_headers_indexes = sorted(self.__ignored_headers_indexes, reverse=True)

# Remove headers from data
if not keyed_source:
Expand Down Expand Up @@ -661,9 +688,9 @@ def builtin_processor(extended_rows):
if self.__check_if_row_for_skipping(row_number, headers, row):
continue

# Ignore blank headers
if self.__blank_header_indexes:
for index in self.__blank_header_indexes:
# Ignore headers
if self.__ignored_headers_indexes:
for index in self.__ignored_headers_indexes:
if index < len(row):
row = row[:index] + row[index+1:]

Expand Down
19 changes: 19 additions & 0 deletions tests/test_stream.py
Expand Up @@ -287,6 +287,25 @@ def test_stream_ignore_blank_headers_true():
assert stream.read(keyed=True) == data


# Ignore listed/not_listed headers

def test_stream_ignore_listed_headers():
    """Columns named in `ignore_listed_headers` are dropped from headers and rows."""
    data = 'text://header1,header2,header3\nvalue1,value2,value3'
    expected_headers = ['header1', 'header3']
    expected_rows = [{'header1': 'value1', 'header3': 'value3'}]
    options = {'format': 'csv', 'headers': 1, 'ignore_listed_headers': ['header2']}
    with Stream(data, **options) as stream:
        # Headers are checked first, then the keyed rows read from the same stream.
        assert stream.headers == expected_headers
        assert stream.read(keyed=True) == expected_rows

def test_stream_ignore_not_listed_headers():
    """Only columns named in `ignore_not_listed_headers` survive in headers and rows."""
    data = 'text://header1,header2,header3\nvalue1,value2,value3'
    expected_headers = ['header2']
    expected_rows = [{'header2': 'value2'}]
    options = {'format': 'csv', 'headers': 1, 'ignore_not_listed_headers': ['header2']}
    with Stream(data, **options) as stream:
        # Headers are checked first, then the keyed rows read from the same stream.
        assert stream.headers == expected_headers
        assert stream.read(keyed=True) == expected_rows


# Force strings

def test_stream_force_strings():
Expand Down

0 comments on commit b232497

Please sign in to comment.