scraper.html: Scrape links from Open Graph meta tags

Closes #255
ArchiveTeam · Mar 25, 2015 · ad12b12 · ad12b12
1 parent ae42a72
commit ad12b12
Show file tree

Hide file tree

Showing 4 changed files with 57 additions and 0 deletions.
diff --git a/doc/changelog.rst b/doc/changelog.rst
@@ -11,6 +11,7 @@ Unreleased
 ==========
 
 * Fixed: ``--regex-type`` to accept ``pcre`` instead of ``posix``. Regular expressions always use Python's regex library. Posix regex is not supported.
+* Added: Extraction of links from Open Graph and Twitter Card ``meta`` elements.
 
 
 1.0 (2015-03-14)

diff --git a/wpull/scraper/html.py b/wpull/scraper/html.py
@@ -298,6 +298,14 @@ class ElementWalker(object):
     '''Mapping of element tag names to attributes containing links.'''
     DYNAMIC_ATTRIBUTES = ('onkey', 'oncli', 'onmou')
     '''Attributes that contain JavaScript.'''
+    # ``meta`` tag names whose ``content`` points at a media resource; links
+    # found under these names are reported with ``LinkType.media``.
+    OPEN_GRAPH_MEDIA_NAMES = (
+        'og:image', 'og:audio', 'og:video',
+        'twitter:image:src', 'twitter:image0', 'twitter:image1',
+        'twitter:image2', 'twitter:image3', 'twitter:player:stream',
+    )
+    # ``meta`` tag names whose ``content`` is a page link; links found under
+    # these names are reported with ``link_type=None``.
+    OPEN_GRAPH_LINK_NAMES = (
+        'og:url', 'twitter:player'
+    )
 
     '''Iterate elements looking for links.
 
@@ -427,6 +435,32 @@ def iter_links_meta_element(cls, element):
                         value_type='refresh',
                         link_type=None  # treat it as a redirect
                     )
+        else:
+            for link_info in cls.iter_links_open_graph_meta(element):
+                yield link_info
+
+    @classmethod
+    def iter_links_open_graph_meta(cls, element):
+        '''Iterate a ``meta`` element for Open Graph and Twitter Card links.
+
+        The tag name is read from the ``property`` attribute and, failing
+        that, from the ``name`` attribute, which is the attribute Twitter
+        Card markup normally uses. Matching is case-insensitive. The link is
+        taken from the ``content`` attribute; an element with an unrecognised
+        tag name or with missing/empty content yields nothing.
+
+        Args:
+            element: The ``meta`` element. It must expose ``attrib`` and
+                ``tag``.
+
+        Yields:
+            LinkInfo: At most one item. Names in ``OPEN_GRAPH_MEDIA_NAMES``
+            are given ``LinkType.media``; names in ``OPEN_GRAPH_LINK_NAMES``
+            are given ``link_type=None``.
+        '''
+        link = element.attrib.get('content')
+
+        if not link:
+            return
+
+        # ``property`` is checked first so that elements which already
+        # matched before the ``name`` fallback existed produce the same result.
+        for attrib_name in ('property', 'name'):
+            name = element.attrib.get(attrib_name, '').lower()
+
+            if name in cls.OPEN_GRAPH_MEDIA_NAMES:
+                link_type = LinkType.media
+            elif name in cls.OPEN_GRAPH_LINK_NAMES:
+                link_type = None
+            else:
+                continue
+
+            yield LinkInfo(
+                element=element, tag=element.tag, attrib=attrib_name,
+                link=link,
+                inline=False, linked=True,
+                base_link=None,
+                value_type='plain',
+                link_type=link_type
+            )
+            # One element carries a single link; do not emit it twice when
+            # both attributes happen to hold a recognised name.
+            return
 
     @classmethod
     def iter_links_object_element(cls, element):

diff --git a/wpull/scraper/html_test.py b/wpull/scraper/html_test.py
@@ -91,6 +91,17 @@ def test_html_scraper_links(self):
             inline_urls
         )
         self.assertEqual({
+            'http://example.com/og_image.png',
+            'http://example.com/og_url.html',
+            'http://example.com/og_audio.mp3',
+            'http://example.com/og_video.webm',
+            'http://example.com/twitter_image.png',
+            'http://example.com/twitter_image0.png',
+            'http://example.com/twitter_image1.png',
+            'http://example.com/twitter_image2.png',
+            'http://example.com/twitter_image3.png',
+            'http://example.com/twitter_player.html',
+            'http://example.com/twitter_stream.mp4',
             'http://example.net/soup.html',
             'http://example.com/a_href.html',
             'http://example.com/area_href.html',

diff --git a/wpull/testing/samples/many_urls.html b/wpull/testing/samples/many_urls.html
@@ -11,6 +11,17 @@
 </style>
 <script type="text/javascript" src="script.js"></script>
 <link rel="stylesheet" href="link_href.css">
+<meta property="og:image" content="og_image.png">
+<meta property="og:url" content="og_url.html">
+<meta property="og:audio" content="og_audio.mp3">
+<meta property="og:video" content="og_video.webm">
+<meta property="twitter:image:src" content="twitter_image.png">
+<meta property="twitter:image0" content="twitter_image0.png">
+<meta property="twitter:image1" content="twitter_image1.png">
+<meta property="twitter:image2" content="twitter_image2.png">
+<meta property="twitter:image3" content="twitter_image3.png">
+<meta property="twitter:player" content="twitter_player.html">
+<meta property="twitter:player:stream" content="twitter_stream.mp4">
 </head>
 <body background="body_background.png">
 	<table background="images/table_background.png">