Merge branch 'master' into skip_rows_regex

frictionlessdata · Jan 30, 2020 · b232497 · b232497
2 parents fc1feb3 + c1604ff
commit b232497
Show file tree

Hide file tree

Showing 4 changed files with 89 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -332,6 +332,28 @@ with Stream(source, format='csv', headers=1, ignore_blank_headers=True) as strea
     stream.read(keyed=True) # {'header1': 'value1', 'header3': 'value3'}
 ```
 
+#### Ignore listed/not-listed headers
+
+These options are similar to `ignore_blank_headers`. They remove arbitrary columns from the data based on the corresponding header names:
+
+```python
+# Ignore listed headers (omit columns)
+source = 'text://header1,header2,header3\nvalue1,value2,value3'
+with Stream(source, format='csv', headers=1, ignore_listed_headers=['header2']) as stream:
+    assert stream.headers == ['header1', 'header3']
+    assert stream.read(keyed=True) == [
+        {'header1': 'value1', 'header3': 'value3'},
+    ]
+
+# Ignore NOT listed headers (pick columns)
+source = 'text://header1,header2,header3\nvalue1,value2,value3'
+with Stream(source, format='csv', headers=1, ignore_not_listed_headers=['header2']) as stream:
+    assert stream.headers == ['header2']
+    assert stream.read(keyed=True) == [
+        {'header2': 'value2'},
+    ]
+```
+
 #### Force strings
 
 When `True`, all rows' values will be converted to strings (defaults to
@@ -781,7 +803,7 @@ Options:
 
 ### `Stream`
 ```python
-Stream(self, source, headers=None, scheme=None, format=None, encoding=None, compression=None, allow_html=False, sample_size=100, bytes_sample_size=10000, ignore_blank_headers=False, force_strings=False, force_parse=False, skip_rows=[], post_parse=[], custom_loaders={}, custom_parsers={}, custom_writers={}, **options)
+Stream(self, source, headers=None, scheme=None, format=None, encoding=None, compression=None, allow_html=False, sample_size=100, bytes_sample_size=10000, ignore_blank_headers=False, ignore_listed_headers=None, ignore_not_listed_headers=None, force_strings=False, force_parse=False, skip_rows=[], post_parse=[], custom_loaders={}, custom_parsers={}, custom_writers={}, **options)
 ```
 Stream of tabular data.
 
@@ -826,6 +848,12 @@ __Arguments__
 - __ignore_blank_headers (bool, optional)__:
         When True, ignores all columns
         that have blank headers. Defaults to False.
+- __ignore_listed_headers (List[str], optional)__:
+        When passed, ignores all columns with headers
+        that the given list includes
+- __ignore_not_listed_headers (List[str], optional)__:
+        When passed, ignores all columns with headers
+        that the given list DOES NOT include
 - __force_strings (bool, optional)__:
         When True, casts all data to strings.
         Defaults to False.
@@ -1257,6 +1285,10 @@ $ make test
 
 Here described only breaking and the most important changes. The full changelog and documentation for all released versions could be found in nicely formatted [commit history](https://github.com/frictionlessdata/tabulator-py/commits/master).
 
+#### v1.32
+
+- Added ability to skip columns (#293)
+
 #### v1.31
 
 - Added `xlsx` writer

diff --git a/tabulator/VERSION b/tabulator/VERSION
@@ -1,2 +1,2 @@
-1.31.2
+1.32.0
 
diff --git a/tabulator/stream.py b/tabulator/stream.py
@@ -75,6 +75,14 @@ class Stream(object):
             When True, ignores all columns
             that have blank headers. Defaults to False.
 
+        ignore_listed_headers (List[str], optional):
+            When passed, ignores all columns with headers
+            that the given list includes
+
+        ignore_not_listed_headers (List[str], optional):
+            When passed, ignores all columns with headers
+            that the given list DOES NOT include
+
         force_strings (bool, optional):
             When True, casts all data to strings.
             Defaults to False.
@@ -127,6 +135,8 @@ def __init__(self,
                  sample_size=config.DEFAULT_SAMPLE_SIZE,
                  bytes_sample_size=config.DEFAULT_BYTES_SAMPLE_SIZE,
                  ignore_blank_headers=False,
+                 ignore_listed_headers=None,
+                 ignore_not_listed_headers=None,
                  force_strings=False,
                  force_parse=False,
                  skip_rows=[],
@@ -178,7 +188,9 @@ def __init__(self,
         self.__sample_size = sample_size
         self.__bytes_sample_size = bytes_sample_size
         self.__ignore_blank_headers = ignore_blank_headers
-        self.__blank_header_indexes = []
+        self.__ignore_listed_headers = ignore_listed_headers
+        self.__ignore_not_listed_headers = ignore_not_listed_headers
+        self.__ignored_headers_indexes = []
         self.__force_strings = force_strings
         self.__force_parse = force_parse
         self.__post_parse = copy(post_parse)
@@ -614,16 +626,31 @@ def __extract_headers(self):
             if row_number == self.__headers_row_last:
                 break
 
-        # Ignore blank headers
-        if self.__ignore_blank_headers:
-            self.__blank_header_indexes = []
+        # Ignore headers
+        if (self.__ignore_blank_headers or
+                self.__ignore_listed_headers is not None or
+                self.__ignore_not_listed_headers is not None):
+            self.__ignored_headers_indexes = []
             raw_headers, self.__headers = self.__headers, []
             for index, header in list(enumerate(raw_headers)):
+                ignore = False
+                # Ignore blank headers
                 if header in ['', None]:
-                    self.__blank_header_indexes.append(index)
+                    ignore = True
+                # Ignore listed headers
+                if self.__ignore_listed_headers is not None:
+                    if header in self.__ignore_listed_headers:
+                        ignore = True
+                # Ignore not-listed headers
+                if self.__ignore_not_listed_headers is not None:
+                    if header not in self.__ignore_not_listed_headers:
+                        ignore = True
+                # Add to the list and skip
+                if ignore:
+                    self.__ignored_headers_indexes.append(index)
                     continue
                 self.__headers.append(header)
-            self.__blank_header_indexes = sorted(self.__blank_header_indexes, reverse=True)
+            self.__ignored_headers_indexes = sorted(self.__ignored_headers_indexes, reverse=True)
 
         # Remove headers from data
         if not keyed_source:
@@ -661,9 +688,9 @@ def builtin_processor(extended_rows):
                 if self.__check_if_row_for_skipping(row_number, headers, row):
                     continue
 
-                # Ignore blank headers
-                if self.__blank_header_indexes:
-                    for index in self.__blank_header_indexes:
+                # Ignore headers
+                if self.__ignored_headers_indexes:
+                    for index in self.__ignored_headers_indexes:
                         if index < len(row):
                             row = row[:index] + row[index+1:]
 

diff --git a/tests/test_stream.py b/tests/test_stream.py
@@ -287,6 +287,25 @@ def test_stream_ignore_blank_headers_true():
         assert stream.read(keyed=True) == data
 
 
+# Ignore listed/not_listed headers
+
+def test_stream_ignore_listed_headers():
+    source = 'text://header1,header2,header3\nvalue1,value2,value3'
+    with Stream(source, format='csv', headers=1, ignore_listed_headers=['header2']) as stream:
+        assert stream.headers == ['header1', 'header3']
+        assert stream.read(keyed=True) == [
+            {'header1': 'value1', 'header3': 'value3'},
+        ]
+
+def test_stream_ignore_not_listed_headers():
+    source = 'text://header1,header2,header3\nvalue1,value2,value3'
+    with Stream(source, format='csv', headers=1, ignore_not_listed_headers=['header2']) as stream:
+        assert stream.headers == ['header2']
+        assert stream.read(keyed=True) == [
+            {'header2': 'value2'},
+        ]
+
+
 # Force strings
 
 def test_stream_force_strings():