This repository has been archived by the owner on Jul 11, 2023. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
317 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
<!DOCTYPE html> | ||
<html> | ||
<head> | ||
<meta charset="UTF-8"> | ||
</head> | ||
<body> | ||
<table> | ||
<tr> | ||
<td>id</td> | ||
<td>name</td> | ||
</tr> | ||
<tr> | ||
<td>1</td> | ||
<td>english</td> | ||
</tr> | ||
<tr> | ||
<td>2</td> | ||
<td>中国人</td> | ||
</tr> | ||
</table> | ||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
<!DOCTYPE html> | ||
<html> | ||
|
||
<head> | ||
<meta charset="UTF-8"> | ||
</head> | ||
|
||
<body> | ||
<table> | ||
<thead> | ||
<tr> | ||
<th>id</th> | ||
<th>name</th> | ||
</tr> | ||
</thead> | ||
<tbody> | ||
<tr> | ||
<td>1</td> | ||
<td>english</td> | ||
</tr> | ||
<tr> | ||
<td>2</td> | ||
<td>中国人</td> | ||
</tr> | ||
</tbody> | ||
</table> | ||
</body> | ||
|
||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
<!DOCTYPE html> | ||
<html> | ||
<head> | ||
<meta charset="UTF-8"> | ||
</head> | ||
<body> | ||
<table class='notme'> | ||
<tr> | ||
<td>id</td> | ||
<td>name</td> | ||
</tr> | ||
<tr> | ||
<td>3</td> | ||
<td>french</td> | ||
</tr> | ||
<tr> | ||
<td>4</td> | ||
<td>עברית</td> | ||
</tr> | ||
</table> | ||
<table class='mememe'> | ||
<tr> | ||
<td>id</td> | ||
<td>name</td> | ||
</tr> | ||
<tr> | ||
<td>1</td> | ||
<td>english</td> | ||
</tr> | ||
<tr> | ||
<td>2</td> | ||
<td>中国人</td> | ||
</tr> | ||
</table> | ||
<table class='notme'> | ||
<tr> | ||
<td>id</td> | ||
<td>name</td> | ||
</tr> | ||
<tr> | ||
<td>3</td> | ||
<td>french</td> | ||
</tr> | ||
<tr> | ||
<td>4</td> | ||
<td>עברית</td> | ||
</tr> | ||
</table> | ||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
# -*- coding: utf-8 -*- | ||
from __future__ import division | ||
from __future__ import print_function | ||
from __future__ import absolute_import | ||
from __future__ import unicode_literals | ||
|
||
from pyquery import PyQuery as pq | ||
from ..parser import Parser | ||
from .. import helpers | ||
|
||
|
||
# Module API | ||
|
||
class HTMLTableParser(Parser):
    """Parser to extract data out of HTML tables.

    The first element matching ``selector`` is treated as the table. Its
    first row (from ``thead``, direct ``tr`` children or ``tbody``, in that
    order) supplies the headers; every following row is emitted as data.
    """

    # Public

    options = [
        'selector',
    ]

    def __init__(self, loader, force_parse=False, selector='table'):
        """Store the loader and options; nothing is loaded here.

        Args:
            loader: object whose ``load(source, mode=..., encoding=...)``
                returns a readable byte stream.
            force_parse: stored only; this parser never reads it.
            selector: selector handed to PyQuery's ``find`` to locate the
                table (default: the first ``table`` element).
        """
        self.__loader = loader
        self.__selector = selector
        self.__force_parse = force_parse
        self.__extended_rows = None
        self.__encoding = None
        self.__bytes = None

    @property
    def closed(self):
        """True when nothing has been opened or the byte stream is closed."""
        return self.__bytes is None or self.__bytes.closed

    def open(self, source, encoding=None):
        """Close any previous stream, load ``source`` and prepare iteration.

        Args:
            source: passed through to the loader unchanged.
            encoding: optional encoding name; passed to the loader as given
                and exposed lower-cased through ``encoding``.
        """
        self.close()
        # ``str.lower()`` returns a new string, so the result has to be
        # stored; calling it for its side effect normalizes nothing.
        self.__encoding = encoding.lower() if encoding else encoding
        self.__bytes = self.__loader.load(source, mode='b', encoding=encoding)
        self.reset()

    def close(self):
        """Close the underlying byte stream if it is open."""
        if not self.closed:
            self.__bytes.close()

    def reset(self):
        """Rewind the byte stream and start a fresh row iterator."""
        helpers.reset_stream(self.__bytes)
        self.__extended_rows = self.__iter_extended_rows()

    @property
    def encoding(self):
        """Encoding given to ``open`` (lower-cased), or None."""
        return self.__encoding

    @property
    def extended_rows(self):
        """Iterator of ``(row_number, headers, row)`` tuples, or None
        before ``open``/``reset`` has been called."""
        return self.__extended_rows

    # Private

    def __iter_extended_rows(self):
        """Yield ``(row_number, headers, row)`` for each data row.

        Row numbers start at 1 for the first row after the header row.
        Yields nothing when the table has no rows at all.

        Raises:
            IndexError: on first iteration, if the selector matches nothing.
        """

        # Get Page content
        page = pq(self.__bytes.read())

        # Find required table
        table = pq(page.find(self.__selector)[0])

        # Collect rows: thead rows first, then direct rows, then tbody rows
        rows = (
            table.children('thead').children('tr') +
            table.children('tr') +
            table.children('tbody').children('tr')
        )
        rows = [pq(r) for r in rows]

        # An empty table has neither headers nor data
        if not rows:
            return

        # Extract headers (header cells may be th or td)
        first_row = rows.pop(0)
        headers = [pq(th).text() for th in first_row.find('th,td')]

        # Yield rows, extracting cell text lazily one row at a time
        for row_number, tr in enumerate(rows, start=1):
            row = [pq(td).text() for td in tr.find('td')]
            yield (row_number, headers, row)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# -*- coding: utf-8 -*- | ||
from __future__ import division | ||
from __future__ import print_function | ||
from __future__ import absolute_import | ||
from __future__ import unicode_literals | ||
|
||
import six | ||
import openpyxl | ||
from ..writer import Writer | ||
from .. import helpers | ||
|
||
|
||
# Module API | ||
|
||
class XLSXWriter(Writer):
    """XLSX writer.

    Writes a header row followed by data rows into a single sheet of a
    new workbook using openpyxl's write-only mode.
    """

    # Public

    options = [
        'sheet',
    ]

    def __init__(self, **options):
        """Store writer options.

        Args:
            **options: writer options; ``sheet`` is used as the title of
                the created worksheet.
        """

        # Make bytes
        if six.PY2:
            # Iterate over a snapshot so reassigning values is always safe
            for key, value in list(options.items()):
                if isinstance(value, six.string_types):
                    options[key] = str(value)

        # Set attributes
        self.__options = options

    def write(self, source, target, headers, encoding=None):
        """Write ``headers`` and every row of ``source`` to ``target``.

        Args:
            source: iterable of rows; each row is appended as given.
            target: destination handed to ``Workbook.save``.
            headers: header row, or None to write no header row.
            encoding: accepted for interface compatibility; unused here.
        """
        # presumably creates the parent directory of target - see helpers
        helpers.ensure_dir(target)
        wb = openpyxl.Workbook(write_only=True)
        ws = wb.create_sheet(title=self.__options.get('sheet'))
        # Appending None is not a valid row, so skip the header row then
        if headers is not None:
            ws.append(headers)
        for row in source:
            ws.append(row)
        wb.save(target)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# -*- coding: utf-8 -*- | ||
from __future__ import division | ||
from __future__ import print_function | ||
from __future__ import absolute_import | ||
from __future__ import unicode_literals | ||
|
||
import io | ||
import pytest | ||
from mock import Mock | ||
from six import StringIO | ||
from tabulator import exceptions, Stream | ||
|
||
|
||
# Stream | ||
|
||
@pytest.mark.parametrize('source, selector', [
    ('data/table1.html', 'table'),
    ('data/table2.html', 'table'),
    ('data/table3.html', '.mememe'),
])
def test_stream_html(source, selector):
    """Each HTML fixture yields the same headers and the same two rows."""
    expected_headers = ['id', 'name']
    expected_rows = [
        {'id': '1', 'name': 'english'},
        {'id': '2', 'name': '中国人'},
    ]
    with Stream(source, selector=selector, headers=1) as stream:
        assert stream.headers == expected_headers
        assert stream.read(keyed=True) == expected_rows
|
Oops, something went wrong.