Merge 0b03ab6 into e0ea9be

frictionlessdata · Nov 26, 2019 · b4d62bd · b4d62bd
2 parents e0ea9be + 0b03ab6
commit b4d62bd
Show file tree

Hide file tree

Showing 11 changed files with 317 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -48,7 +48,7 @@ A library for reading and writing tabular data (csv/xls/json/etc).
       - [text](#text)
     - [Supported file formats](#supported-file-formats)
       - [csv (read & write)](#csv-read--write)
-      - [xls/xlsx (read only)](#xlsxlsx-read-only)
+      - [xls/xlsx (read & write)](#xlsxlsx-read--write)
       - [ods (read only)](#ods-read-only)
       - [gsheet (read only)](#gsheet-read-only)
       - [sql (read & write)](#sql-read--write)
@@ -57,6 +57,7 @@ A library for reading and writing tabular data (csv/xls/json/etc).
       - [json (read only)](#json-read-only)
       - [ndjson (read only)](#ndjson-read-only)
       - [tsv (read only)](#tsv-read-only)
+      - [html (read only)](#html-read-only)
     - [Adding support for new file sources, formats, and writers](#adding-support-for-new-file-sources-formats-and-writers)
       - [Custom loaders](#custom-loaders)
       - [Custom parsers](#custom-parsers)
@@ -540,7 +541,7 @@ stream = Stream('data.csv', delimiter=',')
 It supports all options from the Python CSV library. Check [their
 documentation][pydoc-csv] for more information.
 
-#### xls/xlsx (read only)
+#### xls/xlsx (read & write)
 
 > Tabulator is unable to stream `xls` files, so the entire file is loaded in
 > memory. Streaming is supported for `xlsx` files.
@@ -551,7 +552,7 @@ stream = Stream('data.xls', sheet=1)
 
 ###### Options
 
-- **sheet**: Sheet name or number (starting from 1)
+- **sheet**: Sheet name or number (starting from 1).
 - **fill_merged_cells**: if `True` it will unmerge and fill all merged cells by
   a visible value. With this option enabled the parser can't stream data and
   load the whole document into memory.
@@ -644,6 +645,25 @@ stream = Stream('data.ndjson')
 stream = Stream('data.tsv')
 ```
 
+#### html (read only)
+
+
+> This format is not included in the package by default. To use it, please install `tabulator` with the `html` extra: `$ pip install tabulator[html]`
+
+An HTML table element residing inside an HTML document.
+
+Supports simple tables (no merged cells) with any legal combination of the td, th, tbody & thead elements.
+
+Usually `format='html'` would need to be specified explicitly as web URLs don't always use the `.html` extension.
+
+```python
+stream = Stream('http://example.com/some/page.aspx', format='html', selector='.content .data table#id1')
+```
+
+###### Options
+
+- **selector**: CSS selector for specifying which `table` element to extract. By default it's `table`, which takes the first `table` element in the document.
+
 ### Adding support for new file sources, formats, and writers
 
 Tabulator is written with extensibility in mind, allowing you to add support for

diff --git a/data/table1.html b/data/table1.html
@@ -0,0 +1,22 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+</head>
+<body>
+    <table>
+        <tr>
+            <td>id</td>
+            <td>name</td>
+        </tr>
+        <tr>
+            <td>1</td>
+            <td>english</td>
+        </tr>
+        <tr>
+            <td>2</td>
+            <td>中国人</td>
+        </tr>
+    </table>
+</body>
+</html>
diff --git a/data/table2.html b/data/table2.html
@@ -0,0 +1,29 @@
+<!DOCTYPE html>
+<html>
+
+<head>
+    <meta charset="UTF-8">
+</head>
+
+<body>
+    <table>
+        <thead>
+            <tr>
+                <th>id</th>
+                <th>name</th>
+            </tr>
+        </thead>
+        <tbody>
+            <tr>
+                <td>1</td>
+                <td>english</td>
+            </tr>
+            <tr>
+                <td>2</td>
+                <td>中国人</td>
+            </tr>
+        </tbody>
+    </table>
+</body>
+
+</html>
diff --git a/data/table3.html b/data/table3.html
@@ -0,0 +1,50 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+</head>
+<body>
+    <table class='notme'>
+        <tr>
+            <td>id</td>
+            <td>name</td>
+        </tr>
+        <tr>
+            <td>3</td>
+            <td>french</td>
+        </tr>
+        <tr>
+            <td>4</td>
+            <td>עברית</td>
+        </tr>
+    </table>
+    <table class='mememe'>
+        <tr>
+            <td>id</td>
+            <td>name</td>
+        </tr>
+        <tr>
+            <td>1</td>
+            <td>english</td>
+        </tr>
+        <tr>
+            <td>2</td>
+            <td>中国人</td>
+        </tr>
+    </table>
+    <table class='notme'>
+        <tr>
+            <td>id</td>
+            <td>name</td>
+        </tr>
+        <tr>
+            <td>3</td>
+            <td>french</td>
+        </tr>
+        <tr>
+            <td>4</td>
+            <td>עברית</td>
+        </tr>
+    </table>
+</body>
+</html>
diff --git a/setup.py b/setup.py
@@ -48,6 +48,9 @@ def read(*paths):
     'ezodf>=0.3',
     'lxml>=3.0',
 ]
+INSTALL_PARSER_HTML_REQUIRES = [
+    'pyquery<2',
+]
 TESTS_REQUIRE = [
     'mock',
     'pylama',
@@ -73,6 +76,7 @@ def read(*paths):
         'datapackage': INSTALL_FORMAT_DATAPACKAGE_REQUIRES,
         'develop': TESTS_REQUIRE,
         'ods': INSTALL_FORMAT_ODS_REQUIRES,
+        'html': INSTALL_PARSER_HTML_REQUIRES,
     },
     entry_points={
         'console_scripts': [

diff --git a/tabulator/config.py b/tabulator/config.py
@@ -42,6 +42,7 @@
     'csv': 'tabulator.parsers.csv.CSVParser',
     'datapackage': 'tabulator.parsers.datapackage.DataPackageParser',
     'gsheet': 'tabulator.parsers.gsheet.GsheetParser',
+    'html': 'tabulator.parsers.html.HTMLTableParser',
     'inline': 'tabulator.parsers.inline.InlineParser',
     'json': 'tabulator.parsers.json.JSONParser',
     'jsonl': 'tabulator.parsers.ndjson.NDJSONParser',
@@ -57,5 +58,6 @@
 
 WRITERS = {
     'csv': 'tabulator.writers.csv.CSVWriter',
+    'xlsx': 'tabulator.writers.xlsx.XLSXWriter',
     'sql': 'tabulator.writers.sql.SQLWriter',
 }
diff --git a/tabulator/parsers/html.py b/tabulator/parsers/html.py
@@ -0,0 +1,87 @@
+# -*- coding: utf-8 -*-
+from __future__ import division
+from __future__ import print_function
+from __future__ import absolute_import
+from __future__ import unicode_literals
+
+from pyquery import PyQuery as pq
+from ..parser import Parser
+from .. import helpers
+
+
+# Module API
+
+class HTMLTableParser(Parser):
+    """Parser to extract data out of HTML tables.
+
+    The first row of the selected table supplies the headers; every
+    following row is emitted as a data row together with those headers.
+
+    # Arguments
+        loader: object whose `load(source, mode, encoding)` returns the
+            byte stream to parse
+        force_parse (bool): stored, but not read anywhere in this class
+        selector (str): CSS selector of the table to extract; the first
+            matching element is used
+    """
+
+    # Public
+
+    options = [
+        'selector',
+    ]
+
+    def __init__(self, loader, force_parse=False, selector='table'):
+        self.__loader = loader
+        self.__selector = selector
+        self.__force_parse = force_parse
+        self.__extended_rows = None
+        self.__encoding = None
+        self.__bytes = None
+
+    @property
+    def closed(self):
+        """True if no byte stream is loaded or the loaded one is closed.
+        """
+        return self.__bytes is None or self.__bytes.closed
+
+    def open(self, source, encoding=None):
+        """Load `source` as bytes and prepare the rows iterator.
+        """
+        self.close()
+        self.__encoding = encoding
+        self.__bytes = self.__loader.load(source, mode='b', encoding=encoding)
+        if self.__encoding:
+            # `lower()` returns a new string, so the result must be kept
+            self.__encoding = self.__encoding.lower()
+        self.reset()
+
+    def close(self):
+        """Close the underlying byte stream if it is open.
+        """
+        if not self.closed:
+            self.__bytes.close()
+
+    def reset(self):
+        """Rewind the byte stream and recreate the rows iterator.
+        """
+        helpers.reset_stream(self.__bytes)
+        self.__extended_rows = self.__iter_extended_rows()
+
+    @property
+    def encoding(self):
+        """Encoding passed to `open` (lower-cased), or None.
+        """
+        return self.__encoding
+
+    @property
+    def extended_rows(self):
+        """Iterator of `(row_number, headers, row)` tuples.
+        """
+        return self.__extended_rows
+
+    # Private
+
+    def __iter_extended_rows(self):
+
+        # Get Page content
+        page = pq(self.__bytes.read())
+
+        # Find required table (first match; the indexing fails if the
+        # selector matches nothing)
+        table = pq(page.find(self.__selector)[0])
+
+        # Collect rows: header section first, then direct and body rows
+        rows = (
+            table.children('thead').children('tr') +
+            table.children('tr') +
+            table.children('tbody').children('tr')
+        )
+        rows = [pq(r) for r in rows]
+
+        # A table without any rows has neither headers nor data
+        if not rows:
+            return
+
+        # Extract headers
+        headers = [pq(th).text() for th in rows[0].find('th,td')]
+
+        # Extract and yield rows
+        for row_number, tr in enumerate(rows[1:], start=1):
+            row = [pq(td).text() for td in tr.find('td')]
+            yield (row_number, headers, row)
diff --git a/tabulator/writers/xlsx.py b/tabulator/writers/xlsx.py
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+from __future__ import division
+from __future__ import print_function
+from __future__ import absolute_import
+from __future__ import unicode_literals
+
+import six
+import openpyxl
+from ..writer import Writer
+from .. import helpers
+
+
+# Module API
+
+class XLSXWriter(Writer):
+    """XLSX writer.
+
+    Writes the headers and rows into one sheet of a new write-only
+    openpyxl workbook and saves it to the target.
+    """
+
+    # Public
+
+    options = [
+        'sheet',
+    ]
+
+    def __init__(self, **options):
+
+        # On Python 2 turn textual option values into native `str`
+        if six.PY2:
+            options = {
+                name: str(item)
+                if isinstance(item, six.string_types) else item
+                for name, item in options.items()
+            }
+
+        # Keep options for `write`
+        self.__options = options
+
+    def write(self, source, target, headers, encoding=None):
+        """Save `headers` followed by the rows of `source` to `target`.
+
+        `encoding` is accepted but not used by this writer.
+        """
+        helpers.ensure_dir(target)
+        sheet_title = self.__options.get('sheet')
+        workbook = openpyxl.Workbook(write_only=True)
+        worksheet = workbook.create_sheet(title=sheet_title)
+        worksheet.append(headers)
+        for record in source:
+            worksheet.append(record)
+        workbook.save(target)
diff --git a/tests/formats/test_html.py b/tests/formats/test_html.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+from __future__ import division
+from __future__ import print_function
+from __future__ import absolute_import
+from __future__ import unicode_literals
+
+import io
+import pytest
+from mock import Mock
+from six import StringIO
+from tabulator import exceptions, Stream
+
+
+# Stream
+
+@pytest.mark.parametrize('source, selector', [
+    ('data/table1.html', 'table'),
+    ('data/table2.html', 'table'),
+    ('data/table3.html', '.mememe'),
+])
+def test_stream_html(source, selector):
+    """Each fixture yields the same headers and the same two keyed rows.
+    """
+    expected_headers = ['id', 'name']
+    expected_rows = [
+        {'id': '1', 'name': 'english'},
+        {'id': '2', 'name': '中国人'}]
+    with Stream(source, selector=selector, headers=1) as stream:
+        assert stream.headers == expected_headers
+        assert stream.read(keyed=True) == expected_rows
+