Merge pull request #102 from getnikola/import_page

import_page plugin
getnikola · Sep 2, 2015 · 08a9392 · 08a9392
2 parents 763f278 + 1452f90
commit 08a9392
Show file tree

Hide file tree

Showing 5 changed files with 109 additions and 0 deletions.
diff --git a/v7/import_page/README.md b/v7/import_page/README.md
@@ -0,0 +1,11 @@
+Plugin to import arbitrary web pages.
+
+Usage:
+
+```
+nikola import_page http://en.wikipedia.org/wiki/Information_extraction
+```
+
+That will produce an information-extraction-wikipedia-the-free-encyclopedia.html file that you can edit
+and move into your stories/ folder.
+
diff --git a/v7/import_page/conf.py.sample b/v7/import_page/conf.py.sample
@@ -0,0 +1,4 @@
+# Sample setting for the pages produced by this plugin.
+# NOTE(review): each entry appears to be (source glob, output folder,
+# template name), following Nikola's PAGES setting -- confirm against the
+# Nikola documentation for the version in use.
+PAGES = (
+    ("stories/*.html", "stories", "story.tmpl"),
+)
+
diff --git a/v7/import_page/import_page.plugin b/v7/import_page/import_page.plugin
@@ -0,0 +1,9 @@
+[Core]
+Name = import_page
+Module = import_page
+
+[Documentation]
+Author = Roberto Alsina
+Version = 0.1
+Website = http://plugins.getnikola.com/#import_page
+Description = Try to import arbitrary web content
diff --git a/v7/import_page/import_page.py b/v7/import_page/import_page.py
@@ -0,0 +1,83 @@
+# -*- coding: utf-8 -*-
+
+# Copyright © 2015 Roberto Alsina and others
+
+# Permission is hereby granted, free of charge, to any
+# person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the
+# Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the
+# Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice
+# shall be included in all copies or substantial portions of
+# the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
+# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+from __future__ import unicode_literals, print_function
+
+import codecs
+
+import libextract.api
+import lxml.html
+import requests
+
+from nikola.plugin_categories import Command
+from nikola import utils
+
+# Module-level logger named 'import_page', created through Nikola's utils
+# with its STDERR handler.
+LOGGER = utils.get_logger('import_page', utils.STDERR_HANDLER)
+
+
+# Skeleton of every generated file: an HTML comment carrying the ``title``
+# and ``slug`` fields, followed by the extracted ``content``. The three
+# placeholders are filled in with str.format() by CommandImportPage.
+# NOTE(review): presumably Nikola reads the leading comment as page
+# metadata -- confirm against the Nikola documentation.
+doc_template = '''<!--
+.. title: {title}
+.. slug: {slug}
+-->
+
+{content}
+'''
+
+
+class CommandImportPage(Command):
+    """Import a Page."""
+
+    name = "import_page"
+    needs_config = False
+    doc_usage = "[options] page_url [page_url,...]"
+    doc_purpose = "import arbitrary web pages"
+
+    def _execute(self, options, args):
+        """Import a Page."""
+        for url in args:
+            self._import_page(url)
+
+    def _import_page(self, url):
+        r = requests.get(url)
+        if 199 < r.status_code < 300:  # Got it
+            # Use the page's title
+            doc = lxml.html.fromstring(r.content)
+            title = doc.find('*//title').text_content().decode('utf-8')
+            slug = utils.slugify(title)
+            nodes = list(libextract.api.extract(r.content))
+            # Let's assume the node with more text is the good one
+            lengths = [len(n.text_content()) for n in nodes]
+            node = nodes[lengths.index(max(lengths))]
+            document = doc_template.format(
+                title=title,
+                slug=slug,
+                content=lxml.html.tostring(node, encoding='utf8', method='html', pretty_print=True).decode('utf8')
+            )
+            with codecs.open(slug + '.html', 'w+', encoding='utf-8') as outf:
+                outf.write(document)
+
+        else:
+            LOGGER.error('Error fetching URL: {}'.format(url))
diff --git a/v7/import_page/requirements.txt b/v7/import_page/requirements.txt
@@ -0,0 +1,2 @@
+requests
+libextract