Skip to content
This repository has been archived by the owner on Jul 11, 2023. It is now read-only.

Commit

Permalink
Merge 0b03ab6 into e0ea9be
Browse files Browse the repository at this point in the history
  • Loading branch information
akariv committed Nov 26, 2019
2 parents e0ea9be + 0b03ab6 commit b4d62bd
Show file tree
Hide file tree
Showing 11 changed files with 317 additions and 3 deletions.
26 changes: 23 additions & 3 deletions README.md
Expand Up @@ -48,7 +48,7 @@ A library for reading and writing tabular data (csv/xls/json/etc).
- [text](#text)
- [Supported file formats](#supported-file-formats)
- [csv (read & write)](#csv-read--write)
- [xls/xlsx (read only)](#xlsxlsx-read-only)
- [xls/xlsx (read & write)](#xlsxlsx-read--write)
- [ods (read only)](#ods-read-only)
- [gsheet (read only)](#gsheet-read-only)
- [sql (read & write)](#sql-read--write)
Expand All @@ -57,6 +57,7 @@ A library for reading and writing tabular data (csv/xls/json/etc).
- [json (read only)](#json-read-only)
- [ndjson (read only)](#ndjson-read-only)
- [tsv (read only)](#tsv-read-only)
- [html (read only)](#html-read-only)
- [Adding support for new file sources, formats, and writers](#adding-support-for-new-file-sources-formats-and-writers)
- [Custom loaders](#custom-loaders)
- [Custom parsers](#custom-parsers)
Expand Down Expand Up @@ -540,7 +541,7 @@ stream = Stream('data.csv', delimiter=',')
It supports all options from the Python CSV library. Check [their
documentation][pydoc-csv] for more information.

#### xls/xlsx (read only)
#### xls/xlsx (read & write)

> Tabulator is unable to stream `xls` files, so the entire file is loaded in
> memory. Streaming is supported for `xlsx` files.
Expand All @@ -551,7 +552,7 @@ stream = Stream('data.xls', sheet=1)

###### Options

- **sheet**: Sheet name or number (starting from 1)
- **sheet**: Sheet name or number (starting from 1).
- **fill_merged_cells**: if `True` it will unmerge and fill all merged cells by
a visible value. With this option enabled the parser can't stream data and
load the whole document into memory.
Expand Down Expand Up @@ -644,6 +645,25 @@ stream = Stream('data.ndjson')
stream = Stream('data.tsv')
```

#### html (read only)


> This format is not included in the package by default. To use it, please install `tabulator` with the `html` extra: `$ pip install tabulator[html]`

An HTML table element residing inside an HTML document.

Supports simple tables (no merged cells) with any legal combination of the td, th, tbody & thead elements.

Usually `format='html'` needs to be specified explicitly, as web URLs don't always use the `.html` extension.

```python
stream = Stream('http://example.com/some/page.aspx', format='html', selector='.content .data table#id1')
```

###### Options

- **selector**: CSS selector for specifying which `table` element to extract. By default it's `table`, which takes the first `table` element in the document.

### Adding support for new file sources, formats, and writers

Tabulator is written with extensibility in mind, allowing you to add support for
Expand Down
22 changes: 22 additions & 0 deletions data/table1.html
@@ -0,0 +1,22 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
</head>
<body>
<table>
<tr>
<td>id</td>
<td>name</td>
</tr>
<tr>
<td>1</td>
<td>english</td>
</tr>
<tr>
<td>2</td>
<td>中国人</td>
</tr>
</table>
</body>
</html>
29 changes: 29 additions & 0 deletions data/table2.html
@@ -0,0 +1,29 @@
<!DOCTYPE html>
<html>

<head>
<meta charset="UTF-8">
</head>

<body>
<table>
<thead>
<tr>
<th>id</th>
<th>name</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>english</td>
</tr>
<tr>
<td>2</td>
<td>中国人</td>
</tr>
</tbody>
</table>
</body>

</html>
50 changes: 50 additions & 0 deletions data/table3.html
@@ -0,0 +1,50 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
</head>
<body>
<table class='notme'>
<tr>
<td>id</td>
<td>name</td>
</tr>
<tr>
<td>3</td>
<td>french</td>
</tr>
<tr>
<td>4</td>
<td>עברית</td>
</tr>
</table>
<table class='mememe'>
<tr>
<td>id</td>
<td>name</td>
</tr>
<tr>
<td>1</td>
<td>english</td>
</tr>
<tr>
<td>2</td>
<td>中国人</td>
</tr>
</table>
<table class='notme'>
<tr>
<td>id</td>
<td>name</td>
</tr>
<tr>
<td>3</td>
<td>french</td>
</tr>
<tr>
<td>4</td>
<td>עברית</td>
</tr>
</table>
</body>
</html>
4 changes: 4 additions & 0 deletions setup.py
Expand Up @@ -48,6 +48,9 @@ def read(*paths):
'ezodf>=0.3',
'lxml>=3.0',
]
# Requirements of the optional HTML table parser; exposed to users through
# the `html` extra (`pip install tabulator[html]`).
INSTALL_PARSER_HTML_REQUIRES = [
'pyquery<2',
]
TESTS_REQUIRE = [
'mock',
'pylama',
Expand All @@ -73,6 +76,7 @@ def read(*paths):
'datapackage': INSTALL_FORMAT_DATAPACKAGE_REQUIRES,
'develop': TESTS_REQUIRE,
'ods': INSTALL_FORMAT_ODS_REQUIRES,
'html': INSTALL_PARSER_HTML_REQUIRES,
},
entry_points={
'console_scripts': [
Expand Down
2 changes: 2 additions & 0 deletions tabulator/config.py
Expand Up @@ -42,6 +42,7 @@
'csv': 'tabulator.parsers.csv.CSVParser',
'datapackage': 'tabulator.parsers.datapackage.DataPackageParser',
'gsheet': 'tabulator.parsers.gsheet.GsheetParser',
'html': 'tabulator.parsers.html.HTMLTableParser',
'inline': 'tabulator.parsers.inline.InlineParser',
'json': 'tabulator.parsers.json.JSONParser',
'jsonl': 'tabulator.parsers.ndjson.NDJSONParser',
Expand All @@ -57,5 +58,6 @@

# Writer registry: target format name -> dotted path of the writer class
# (presumably imported lazily from this path; verify against the caller).
WRITERS = {
'csv': 'tabulator.writers.csv.CSVWriter',
'xlsx': 'tabulator.writers.xlsx.XLSXWriter',
'sql': 'tabulator.writers.sql.SQLWriter',
}
87 changes: 87 additions & 0 deletions tabulator/parsers/html.py
@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals

from pyquery import PyQuery as pq
from ..parser import Parser
from .. import helpers


# Module API

class HTMLTableParser(Parser):
    """Parser to extract data out of HTML tables.

    The whole document is read into memory and parsed with PyQuery. The first
    element matching ``selector`` is used as the table: its first row provides
    the headers and every following row is emitted as a data row.
    """

    # Public

    options = [
        'selector',
    ]

    def __init__(self, loader, force_parse=False, selector='table'):
        """Store the parser configuration; nothing is loaded until `open`.

        Args:
            loader: object whose `load(source, mode, encoding)` returns the
                byte stream to parse.
            force_parse: stored on the instance; not consulted by this parser.
            selector: CSS selector of the table element to extract; the first
                match in the document is used.
        """
        self.__loader = loader
        self.__selector = selector
        self.__force_parse = force_parse
        self.__extended_rows = None
        self.__encoding = None
        self.__bytes = None

    @property
    def closed(self):
        """True when no byte stream is open."""
        return self.__bytes is None or self.__bytes.closed

    def open(self, source, encoding=None):
        """Load `source` through the loader and prepare the row iterator."""
        self.close()
        self.__encoding = encoding
        self.__bytes = self.__loader.load(source, mode='b', encoding=encoding)
        if self.__encoding:
            # str.lower() returns a new string, so the result must be kept
            # for the normalization to have any effect.
            self.__encoding = self.__encoding.lower()
        self.reset()

    def close(self):
        """Close the underlying byte stream if it is open."""
        if not self.closed:
            self.__bytes.close()

    def reset(self):
        """Rewind the byte stream and start a fresh row iterator."""
        helpers.reset_stream(self.__bytes)
        self.__extended_rows = self.__iter_extended_rows()

    @property
    def encoding(self):
        """Encoding passed to `open`, lower-cased (or None)."""
        return self.__encoding

    @property
    def extended_rows(self):
        """Iterator of `(row_number, headers, row)` tuples."""
        return self.__extended_rows

    # Private

    def __iter_extended_rows(self):
        """Yield `(row_number, headers, row)` for every data row of the table.

        Raises:
            tabulator.exceptions.SourceError: if no element in the document
                matches the configured selector.
        """
        # Local import: keeps the module's top-level imports unchanged.
        from .. import exceptions

        # Get page content
        page = pq(self.__bytes.read())

        # Find required table
        matches = page.find(self.__selector)
        if len(matches) == 0:
            raise exceptions.SourceError(
                'No HTML element matches the selector "%s"' % self.__selector)
        table = pq(matches[0])

        # Collect rows: header section first, then bare rows, then the body
        rows = (
            table.children('thead').children('tr') +
            table.children('tr') +
            table.children('tbody').children('tr')
        )
        rows = [pq(tr) for tr in rows]

        # A table without any rows has neither headers nor data
        if not rows:
            return

        # Extract headers
        first_row = rows.pop(0)
        headers = self.__cell_texts(first_row)

        # Yield rows
        for row_number, tr in enumerate(rows, start=1):
            yield (row_number, headers, self.__cell_texts(tr))

    @staticmethod
    def __cell_texts(tr):
        """Return the text of each cell that is a direct child of row `tr`.

        Both `th` and `td` cells are taken so body rows that start with a
        `th` keep all their columns, and only direct children are used so
        cells of a nested table do not leak into the outer row.
        """
        return [pq(cell).text() for cell in tr.children('th,td')]
43 changes: 43 additions & 0 deletions tabulator/writers/xlsx.py
@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals

import six
import openpyxl
from ..writer import Writer
from .. import helpers


# Module API

class XLSXWriter(Writer):
    """XLSX writer.

    Writes rows to an `.xlsx` workbook using an openpyxl write-only workbook
    with a single sheet.
    """

    # Public

    options = [
        'sheet',
    ]

    def __init__(self, **options):
        """Store writer options.

        Args:
            **options: writer options; `sheet` is used as the title of the
                created worksheet.
        """

        # Make bytes
        # NOTE(review): on Python 2 `str(value)` encodes with the default
        # codec, so a non-ASCII sheet title may fail here - confirm.
        if six.PY2:
            for key, value in options.items():
                if isinstance(value, six.string_types):
                    options[key] = str(value)

        # Set attributes
        self.__options = options

    def write(self, source, target, headers, encoding=None):
        """Write `headers` and the rows of `source` to the file `target`.

        Args:
            source: iterable of rows; each row is appended to the sheet.
            target: path of the workbook to create; its directory is created
                if missing.
            headers: header row, written first; skipped when empty or None.
            encoding: accepted for interface compatibility; not used.
        """
        helpers.ensure_dir(target)
        wb = openpyxl.Workbook(write_only=True)
        ws = wb.create_sheet(title=self.__options.get('sheet'))
        # Headers may be None/empty when the stream has no header row;
        # appending them unconditionally would fail or emit a bogus row.
        if headers:
            ws.append(headers)
        for row in source:
            ws.append(row)
        wb.save(target)
27 changes: 27 additions & 0 deletions tests/formats/test_html.py
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals

import io
import pytest
from mock import Mock
from six import StringIO
from tabulator import exceptions, Stream


# Stream

@pytest.mark.parametrize('source, selector', [
    ('data/table1.html', 'table'),
    ('data/table2.html', 'table'),
    ('data/table3.html', '.mememe'),
])
def test_stream_html(source, selector):
    """Each fixture must yield the same headers and the same two keyed rows."""
    expected_headers = ['id', 'name']
    expected_rows = [
        {'id': '1', 'name': 'english'},
        {'id': '2', 'name': '中国人'},
    ]
    with Stream(source, selector=selector, headers=1) as stream:
        assert stream.headers == expected_headers
        assert stream.read(keyed=True) == expected_rows

0 comments on commit b4d62bd

Please sign in to comment.