Skip to content
This repository has been archived by the owner on Jul 11, 2023. It is now read-only.

Commit

Permalink
Merge 12ca33c into 48ccbe1
Browse files Browse the repository at this point in the history
  • Loading branch information
akariv committed May 15, 2020
2 parents 48ccbe1 + 12ca33c commit 839b651
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 15 deletions.
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -667,7 +667,7 @@ stream = Stream('http://example.com/some/page.aspx', format='html', selector='.co

**Options**

- **selector**: CSS selector for specifying which `table` element to extract. By default it's `table`, which takes the first `table` element in the document.
- **selector**: CSS selector for specifying which `table` element to extract. By default it's `table`, which takes the first `table` element in the document. If empty, the entire page is assumed to be the table to extract (useful with some Excel formats).

### Custom file sources and formats

Expand Down
4 changes: 2 additions & 2 deletions data/table2.html
Expand Up @@ -7,12 +7,12 @@

<body>
<table>
<thead>
<THEAD>
<tr>
<th>id</th>
<th>name</th>
</tr>
</thead>
</THEAD>
<tbody>
<tr>
<td>1</td>
Expand Down
14 changes: 14 additions & 0 deletions data/table4.html
@@ -0,0 +1,14 @@
<TABLE>
<THEAD>
<th>id</th>
<th>name</th>
</THEAD>
<tr>
<td>1</td>
<td>english</td>
</tr>
<tr>
<td>2</td>
<td>中国人</td>
</tr>
</TABLE>
26 changes: 15 additions & 11 deletions tabulator/parsers/html.py
Expand Up @@ -27,26 +27,26 @@ def __init__(self, loader, force_parse=False, selector='table'):
self.__force_parse = force_parse
self.__extended_rows = None
self.__encoding = None
self.__bytes = None
self.__chars = None

@property
def closed(self):
return self.__bytes is None or self.__bytes.closed
return self.__chars is None or self.__chars.closed

def open(self, source, encoding=None):
self.close()
self.__encoding = encoding
self.__bytes = self.__loader.load(source, mode='b', encoding=encoding)
self.__chars = self.__loader.load(source, encoding=encoding)
if self.__encoding:
self.__encoding.lower()
self.reset()

def close(self):
if not self.closed:
self.__bytes.close()
self.__chars.close()

def reset(self):
helpers.reset_stream(self.__bytes)
helpers.reset_stream(self.__chars)
self.__extended_rows = self.__iter_extended_rows()

@property
Expand All @@ -62,25 +62,29 @@ def extended_rows(self):
def __iter_extended_rows(self):

# Get Page content
page = pq(self.__bytes.read())
page = pq(self.__chars.read(), parser='html')

# Find required table
table = pq(page.find(self.__selector)[0])
if self.__selector:
table = pq(page.find(self.__selector)[0])
else:
table = page

# Extract headers
rows = (
table.children('thead').children('tr') +
table.children('thead') +
table.children('tr') +
table.children('tbody').children('tr')
)
rows = [pq(r) for r in rows]
rows = [pq(r) for r in rows if len(r) > 0]
first_row = rows.pop(0)
headers = [pq(th).text() for th in first_row.find('th,td')]

# Extract rows
rows = [[pq(td).text()
for td in pq(tr).find('td')]
for tr in rows]
rows = [pq(tr).find('td') for tr in rows]
rows = [[pq(td).text() for td in tr]
for tr in rows if len(tr) > 0]

# Yield rows
for row_number, row in enumerate(rows, start=1):
Expand Down
3 changes: 2 additions & 1 deletion tests/formats/test_html.py
Expand Up @@ -17,9 +17,10 @@
('data/table1.html', 'table'),
('data/table2.html', 'table'),
('data/table3.html', '.mememe'),
('data/table4.html', ''),
])
def test_stream_html(source, selector):
with Stream(source, selector=selector, headers=1) as stream:
with Stream(source, selector=selector, headers=1, encoding='utf8') as stream:
assert stream.headers == ['id', 'name']
assert stream.read(keyed=True) == [
{'id': '1', 'name': 'english'},
Expand Down

0 comments on commit 839b651

Please sign in to comment.