Skip to content
Browse files
Check img|source[@srcset] as part of check -l
Create phantom img[@src] for each *[@srcset] item, allowing lxml
to understand each item as a link.

Resolves issue #1989.
  • Loading branch information
da2x committed Sep 1, 2015
1 parent 9ba5c01 commit a970c13d86821691ccca484814202861063cd90b
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 2 deletions.
@@ -27,6 +27,7 @@ Features

* Check ``img|source[@srcset]`` as part of ``check -l`` (Issue #1989)
* Clean up translations for third party components
* ``pagekind["main_index"]`` is set on the main indexes to differentiate
them from all the other indexes.
@@ -1104,7 +1104,7 @@ def rewrite_links(self, doc, src, lang):
doc.rewrite_links(lambda dst: self.url_replacer(src, dst, lang), resolve_base_href=False)

# lxml ignores srcset in img and source elements, so rewrite those URLs by hand
objs = list(doc.findall('*//img')) + list(doc.findall('*//source'))
objs = list(doc.findall('(*//img|*//source)'))
for obj in objs:
if 'srcset' in obj.attrib:
urls = [u.strip() for u in obj.attrib['srcset'].split(',')]
@@ -204,7 +204,15 @@ def analyze(self, fname, find_sources=False, check_remote=False):
return False

d = lxml.html.fromstring(open(filename, 'rb').read())
for l in d.iterlinks():
extra_objs = lxml.html.fromstring('html')

# Turn elements with a srcset attribute into individual img elements with src attributes
for obj in list(d.xpath('(*//img|*//source)')):
if 'srcset' in obj.attrib:
for srcset_item in obj.attrib['srcset'].split(','):
extra_objs.append(lxml.etree.Element('img', src=srcset_item.strip().split(' ')[-0]))

This comment has been minimized.

Copy link

ralsina Sep 1, 2015



for l in list(d.iterlinks()) + list(extra_objs.iterlinks()):
target = l[2]
if target == "#":

0 comments on commit a970c13

Please sign in to comment.