Skip to content
This repository has been archived by the owner on Jul 11, 2023. It is now read-only.

Commit

Permalink
Improved html detection algorithm (#101)
Browse files Browse the repository at this point in the history
Replace html detection using BeautifulSoup with a simple regex.

The new implementation tries to detect only the most likely HTML candidates.

See: #101
  • Loading branch information
sirex authored and roll committed Oct 28, 2016
1 parent 3e771f2 commit ea9dfdf
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 4 deletions.
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ def read(*paths):
'chardet>=2.0,<3.0a',
'openpyxl>=2.4,<3.0a',
'requests>=2.8,<3.0a',
'beautifulsoup4>=4.4,<5.0a',
'linear-tsv>=1.0,<2.0a',
'unicodecsv>=0.14,<1.0a',
'jsonlines>=1.1,<1.2',
Expand Down
7 changes: 4 additions & 3 deletions tabulator/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,13 @@ def detect_encoding(bytes, encoding=None):
return encoding


# Matches optional leading whitespace followed by "<!doctype" or "<html",
# in any letter case. Written as a raw string so that "\s" reaches the regex
# engine verbatim instead of being treated as an (invalid) string escape.
detect_html_re = re.compile(r'\s*<(!doctype|html)', re.IGNORECASE)


def detect_html(text):
    """Detect if text is HTML.

    The check is a cheap heuristic: the text counts as HTML only when its
    first non-whitespace content is a doctype declaration or an opening
    ``<html`` tag. Markup that appears later in the text (for example inside
    a CSV cell) is deliberately not treated as HTML.

    Args:
        text: Text sample to inspect.

    Returns:
        bool: True if the text starts like an HTML document, False otherwise.
    """
    # `match` anchors at the beginning of the text, so only a leading
    # doctype/html tag is considered.
    return bool(detect_html_re.match(text))


def reset_stream(stream):
Expand Down
11 changes: 11 additions & 0 deletions tests/test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,14 @@ def test_extract_options():
extracted_options = helpers.extract_options(options, names)
assert options == {'opt3': 3}
assert extracted_options == {'opt1': 1, 'opt2': 2}


@pytest.mark.parametrize('sample', [
    ('\n\n\t <html>', True),
    ('<!DOCTYPE html>', True),
    ('col1,col2\nval1,<html>', False),
    ('val1,<html>', False),
])
def test_detect_html(sample):
    """Check detect_html against (text, expected result) samples."""
    # Each sample pairs an input text with the expected detection outcome.
    source, expected = sample
    detected = helpers.detect_html(source)
    assert detected is expected

0 comments on commit ea9dfdf

Please sign in to comment.