Skip to content

Commit

Permalink
Always specify parser for Beautiful Soup; fixes extraneous tags
Browse files Browse the repository at this point in the history
The bug that triggered this for me was an extra <html><body> wrapper
around the output of page.content.

Thanks to Avaris and winlu in #pelican for helping track down the root cause of
this bug.
  • Loading branch information
aqw committed Jul 18, 2015
1 parent 0bbf089 commit 873c525
Show file tree
Hide file tree
Showing 5 changed files with 6 additions and 6 deletions.
2 changes: 1 addition & 1 deletion better_figures_and_images/better_figures_and_images.py
Expand Up @@ -28,7 +28,7 @@ def content_object_init(instance):

if instance._content is not None:
content = instance._content
soup = BeautifulSoup(content)
soup = BeautifulSoup(content, 'html.parser')

if 'img' in content:
for img in soup('img'):
Expand Down
2 changes: 1 addition & 1 deletion extract_toc/extract_toc.py
Expand Up @@ -29,7 +29,7 @@ def extract_toc(content):
toc = soup.find('div', class_='contents topic')
if toc: toc.extract()
if toc:
tag=BeautifulSoup(str(toc))
tag=BeautifulSoup(str(toc), 'html.parser')
tag.div['class']='toc'
tag.div['id']=''
p=tag.find('p', class_='topic-title first')
Expand Down
2 changes: 1 addition & 1 deletion post_stats/post_stats.py
Expand Up @@ -31,7 +31,7 @@ def calculate_stats(instance):
WPM = 250

# Use BeautifulSoup to get readable/visible text
raw_text = BeautifulSoup(content).getText()
raw_text = BeautifulSoup(content, 'html.parser').getText()

# Process the text to remove entities
entities = r'\&\#?.+?;'
Expand Down
2 changes: 1 addition & 1 deletion slim/slim.py
Expand Up @@ -75,7 +75,7 @@ def _render_using_plim(filename, localcontext):
if ('SLIM_OPTIONS' in self.settings and
'PRETTYIFY' in self.settings['SLIM_OPTIONS'] and
self.settings['SLIM_OPTIONS']['PRETTYIFY']):
output = bs(output).prettify() # prettify the html
output = bs(output, 'html.parser').prettify() # prettify the html
else:
output = minify(output) # minify the html
return output
Expand Down
4 changes: 2 additions & 2 deletions tipue_search/tipue_search.py
Expand Up @@ -40,10 +40,10 @@ def create_json_node(self, page):
if getattr(page, 'status', 'published') != 'published':
return

soup_title = BeautifulSoup(page.title.replace('&nbsp;', ' '))
soup_title = BeautifulSoup(page.title.replace('&nbsp;', ' '), 'html.parser')
page_title = soup_title.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('^', '&#94;')

soup_text = BeautifulSoup(page.content)
soup_text = BeautifulSoup(page.content, 'html.parser')
page_text = soup_text.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('¶', ' ').replace('^', '&#94;')
page_text = ' '.join(page_text.split())

Expand Down

0 comments on commit 873c525

Please sign in to comment.