Skip to content

Commit

Permalink
scraper.html: Scrape links from Open Graph meta tags
Browse files Browse the repository at this point in the history
Closes #255
  • Loading branch information
chfoo committed Mar 25, 2015
1 parent ae42a72 commit ad12b12
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 0 deletions.
1 change: 1 addition & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Unreleased
==========

* Fixed: ``--regex-type`` to accept ``pcre`` instead of ``posix``. Regular expressions always use Python's regex library. Posix regex is not supported.
* Added: Link extraction from Open Graph and Twitter Card ``meta`` elements.


1.0 (2015-03-14)
Expand Down
34 changes: 34 additions & 0 deletions wpull/scraper/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,14 @@ class ElementWalker(object):
'''Mapping of element tag names to attributes containing links.'''
DYNAMIC_ATTRIBUTES = ('onkey', 'oncli', 'onmou')
'''Attributes that contain JavaScript.'''
OPEN_GRAPH_MEDIA_NAMES = (
    'og:image', 'og:image:url', 'og:image:secure_url',
    'og:audio', 'og:audio:url', 'og:audio:secure_url',
    'og:video', 'og:video:url', 'og:video:secure_url',
    'twitter:image', 'twitter:image:src',
    'twitter:image0', 'twitter:image1',
    'twitter:image2', 'twitter:image3', 'twitter:player:stream',
)
'''Open Graph and Twitter Card names whose content is a media URL.

The ``:url`` and ``:secure_url`` forms are the Open Graph structured
properties that carry the same kind of URL as their parent property.
``twitter:image`` is listed alongside the older ``twitter:image:src``.
Names are lowercase because they are compared against lowercased
attribute values.
'''
OPEN_GRAPH_LINK_NAMES = (
    'og:url', 'twitter:player',
)
'''Open Graph and Twitter Card names whose content is a page URL.'''

'''Iterate elements looking for links.
Expand Down Expand Up @@ -427,6 +435,32 @@ def iter_links_meta_element(cls, element):
value_type='refresh',
link_type=None # treat it as a redirect
)
else:
for link_info in cls.iter_links_open_graph_meta(element):
yield link_info

@classmethod
def iter_links_open_graph_meta(cls, element):
    '''Iterate a ``meta`` element for Open Graph and Twitter Card links.

    Open Graph markup puts the name in the ``property`` attribute, while
    Twitter Card markup is commonly written with the ``name`` attribute.
    Both are checked, ``property`` first, and the first one holding a
    recognized name is used.

    Args:
        element: The ``meta`` element. Only its ``tag`` and ``attrib``
            members are read.

    Yields:
        LinkInfo: At most one item, produced when the name is listed in
        ``OPEN_GRAPH_MEDIA_NAMES`` or ``OPEN_GRAPH_LINK_NAMES`` and the
        ``content`` attribute is non-empty. Media names get
        ``LinkType.media``; other names get a link type of ``None``.
    '''
    for attrib_name in ('property', 'name'):
        # Names are compared case-insensitively against the tables.
        name = element.attrib.get(attrib_name, '').lower()
        is_media = name in cls.OPEN_GRAPH_MEDIA_NAMES

        if is_media or name in cls.OPEN_GRAPH_LINK_NAMES:
            break
    else:
        # Neither attribute holds a recognized name.
        return

    link = element.attrib.get('content')

    if not link:
        return

    yield LinkInfo(
        # ``attrib`` reports the attribute that identified the element,
        # which is ``property`` for every case the original matched.
        element=element, tag=element.tag, attrib=attrib_name,
        link=link,
        inline=False, linked=True,
        base_link=None,
        value_type='plain',
        link_type=LinkType.media if is_media else None
    )

@classmethod
def iter_links_object_element(cls, element):
Expand Down
11 changes: 11 additions & 0 deletions wpull/scraper/html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,17 @@ def test_html_scraper_links(self):
inline_urls
)
self.assertEqual({
'http://example.com/og_image.png',
'http://example.com/og_url.html',
'http://example.com/og_audio.mp3',
'http://example.com/og_video.webm',
'http://example.com/twitter_image.png',
'http://example.com/twitter_image0.png',
'http://example.com/twitter_image1.png',
'http://example.com/twitter_image2.png',
'http://example.com/twitter_image3.png',
'http://example.com/twitter_player.html',
'http://example.com/twitter_stream.mp4',
'http://example.net/soup.html',
'http://example.com/a_href.html',
'http://example.com/area_href.html',
Expand Down
11 changes: 11 additions & 0 deletions wpull/testing/samples/many_urls.html
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,17 @@
</style>
<script type="text/javascript" src="script.js"></script>
<link rel="stylesheet" href="link_href.css">
<meta property="og:image" content="og_image.png">
<meta property="og:url" content="og_url.html">
<meta property="og:audio" content="og_audio.mp3">
<meta property="og:video" content="og_video.webm">
<meta property="twitter:image:src" content="twitter_image.png">
<meta property="twitter:image0" content="twitter_image0.png">
<meta property="twitter:image1" content="twitter_image1.png">
<meta property="twitter:image2" content="twitter_image2.png">
<meta property="twitter:image3" content="twitter_image3.png">
<meta property="twitter:player" content="twitter_player.html">
<meta property="twitter:player:stream" content="twitter_stream.mp4">
</head>
<body background="body_background.png">
<table background="images/table_background.png">
Expand Down

0 comments on commit ad12b12

Please sign in to comment.