Merge 43b901b into a223140

collective · Aug 17, 2015 · 1b0b979 · 1b0b979
2 parents a223140 + 43b901b
commit 1b0b979
Show file tree

Hide file tree

Showing 24 changed files with 1,291 additions and 20 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -0,0 +1,8 @@
+[report]
+omit =
+    /home/*/.buildout/eggs/*
+    /usr/*
+    bin/test
+    buildout-cache/eggs/*
+    eggs/*
+    parts/*
diff --git a/.gitignore b/.gitignore
@@ -14,6 +14,7 @@ develop-eggs/
 dist/
 downloads/
 eggs/
+htmlcov/
 include/
 lib/
 local.cfg

diff --git a/.travis.yml b/.travis.yml
@@ -15,5 +15,10 @@ install:
 - bin/buildout -Nq
 script:
 - bin/code-analysis
+- bin/test
+after_success:
+- bin/createcoverage
+- pip install -q coveralls
+- coveralls
 notifications:
   irc: irc.freenode.org#plone-testing
diff --git a/CONTRIBUTORS.rst b/CONTRIBUTORS.rst
@@ -3,3 +3,5 @@ Contributors
 
 - David Glick
 - Cris Ewing
+- Héctor Velarde
+- Rodrigo Ferreira de Souza
diff --git a/CSV.rst b/CSV.rst
@@ -0,0 +1,186 @@
+Introduction
+============
+
+This part of the documentation refers to the usage of the CSVSource blueprint section.
+
+Prerequisites
+=============
+
+Export the following tables from your WordPress site as CSV using the phpMyAdmin interface:
+
+* wp_posts
+* wp_term_relationships
+* wp_term_taxonomy
+* wp_terms
+* wp_users
+
+Use the following options for all:
+
+* Fields terminated by '\\t'
+* Remove CRLF characters within fields
+* Put field names in the first row
+
+For more information see: http://stackoverflow.com/a/31460534/644075
+
+Importing polls
+---------------
+
+If you want to import polls created with the `WP-Polls`_ plugin you must use the `transmogrify.wppolls`_ package.
+
+.. _`transmogrify.wppolls`: https://pypi.python.org/pypi/transmogrify.wppolls
+.. _`WP-Polls`: https://wordpress.org/plugins/wp-polls/
+
+Sections
+========
+
+transmogrify.wordpress.csvsource
+--------------------------------
+
+Currently this section only imports posts, pages and attachments; comments are ignored.
+
+A typical configuration for this section should be like this:
+
+.. code-block:: ini
+
+    [csvsource]
+    blueprint = transmogrify.wordpress.csvsource
+    source = /home/customer/site/data/
+    type = collective.nitf.content
+    skip = 146989
+
+source:
+    is the path where all CSV files are stored
+
+type:
+    is the content type to be used for blog posts;
+    pages are imported as `Page` and attachments as `Image` or `File`, depending on their MIME type, by using a `mimeencapsulator` section before the constructor:
+
+.. code-block:: ini
+
+    [mimeencapsulator]
+    blueprint = plone.app.transmogrifier.mimeencapsulator
+    mimetype = item/_mimetype
+    field = python:'image' if item['portal_type'] == 'Image' else 'file'
+
+skip:
+    is a comma-separated list of posts to be explicitly ignored;
+    this is useful if the body of the post contains illegal characters that lead to import errors
+
+.. code-block:: ini
+
+    [csvsource]
+    blueprint = transmogrify.wordpress.csvsource
+    ...
+    skip = 146989,151344,151517
+
+field-size-limit:
+    an integer specifying the CSV field size limit;
+    this is useful to avoid `Error: field larger than field limit (131072)`.
+    If you run into this issue, use the following command and set the value to an integer larger than the number returned:
+
+.. code-block:: bash
+
+    # wc -L wp_posts.csv
+    687948 wp_posts.csv
+
+.. code-block:: ini
+
+    [csvsource]
+    blueprint = transmogrify.wordpress.csvsource
+    ...
+    field-size-limit = 700000
+
+transmogrify.wordpress.fetchattachment
+--------------------------------------
+
+Fetches attachments from the original site by requesting the content and setting the `_data` field of the item.
+If the item already has data and the size of it is equal to the size of the remote object, it will be skipped assuming both are the same.
+If a status code different from `200` is received, the item is skipped and a warning message is logged.
+
+.. code-block:: ini
+
+    [fetchattachment]
+    blueprint = transmogrify.wordpress.fetchattachment
+    log-level = error
+
+log-level:
+    sets the log level to one of the following options: 'error', 'info' or 'debug'
+
+TODO: add caching feature
+
+transmogrify.wordpress.embedyoutube
+-----------------------------------
+
+Replaces the YouTube pseudo-tag `[youtube id="NwTxjNhGpOM"]` with an iframe that embeds the YouTube video in the document.
+
+.. code-block:: ini
+
+    [embedyoutube]
+    blueprint = transmogrify.wordpress.youtube
+
+transmogrify.wordpress.defaultview
+----------------------------------
+
+Sets the default view of a content item.
+You can specify an optional ``condition`` option;
+if given, the view is only changed when the condition, which is a TALES expression, is true.
+
+.. code-block:: ini
+
+    [defaultview]
+    blueprint = transmogrify.wordpress.defaultview
+    view = text_only_view
+    condition = python:item.get('portal_type') == 'collective.nitf.content'
+
+transmogrify.wordpress.resolveuid
+---------------------------------
+
+This is a post-processing section that fixes internal links;
+it replaces the paths of internal links (those that refer to the same domain we're importing) with calls to `resolveuid`.
+It also updates the reference catalog so we can search for references and take care of site integrity.
+
+.. code-block:: ini
+
+    [resolveuid]
+    blueprint = transmogrify.wordpress.resolveuid
+    type = collective.nitf.content
+    domain = wordpress.com
+
+type:
+    content type of the items whose URLs are going to be fixed.
+
+domain:
+    domain name of the site we're importing;
+    this is used to specify links that are going to be treated as internal.
+
+transmogrify.wordpress.relatecontent
+------------------------------------
+
+This is a post-processing section that adds related items to objects;
+it looks at the WordPress `pinged` column and adds internal URLs as related content (if imported).
+
+.. code-block:: ini
+
+    [relatecontent]
+    blueprint = transmogrify.wordpress.relatecontent
+    domain = wordpress.com
+
+domain:
+    domain name of the site we're importing;
+    this is used to specify links that are going to be treated as internal.
+
+transmogrify.wordpress.moveattachment
+-------------------------------------
+
+This is a post-processing section that moves images and files into the specified container type;
+it looks in the reference catalog and checks if an attachment is referenced by only one object of the specified type.
+This pipeline section must be placed after resolveuid, where those references are updated.
+
+.. code-block:: ini
+
+    [moveattachment]
+    blueprint = transmogrify.wordpress.moveattachment
+    type = collective.nitf.content
+
+type:
+    container data type we are moving images and files into.
diff --git a/buildout.cfg b/buildout.cfg
@@ -4,8 +4,11 @@ extends =
     https://raw.github.com/collective/buildout.plonetest/master/qa.cfg
 
 package-name = transmogrify.wordpress
+package-extras = [test]
 
-parts = code-analysis
+parts +=
+    code-analysis
+    createcoverage
 
 [code-analysis]
 directory = ${buildout:directory}/transmogrify/wordpress
@@ -21,5 +24,6 @@ multiprocessing = True
 return-status-codes = False
 
 [versions]
+plone.recipe.codeanalysis = 2.0b1
 # use latest version of setuptools
 setuptools =
diff --git a/setup.py b/setup.py
@@ -13,8 +13,6 @@
     open('CHANGES.rst').read()
 )
 
-tests_require = ['zope.testing']
-
 setup(name='transmogrify.wordpress',
       version=version,
       description=description,
@@ -37,12 +35,15 @@
                         'collective.transmogrifier',
                         'plone.app.transmogrifier',
                         'lxml',
-                        'phpserialize'
-                        # -*- Extra requirements: -*-
+                        'phpserialize',
+                        'requests',
                         ],
-      tests_require=tests_require,
-      extras_require=dict(tests=tests_require),
-      test_suite='transmogrify.wordpress.tests.test_docs.test_suite',
+      extras_require={
+          'test': [
+              'plone.app.testing',
+              'plone.testing',
+          ],
+      },
       entry_points="""
       # -*- entry_points -*-
       [z3c.autoinclude.plugin]

diff --git a/transmogrify/wordpress/blueprints.py → ...smogrify/wordpress/blueprints/__init__.py b/transmogrify/wordpress/blueprints.py → ...smogrify/wordpress/blueprints/__init__.py
@@ -32,7 +32,7 @@ def get_meta_values_by_key(node, meta_key):
     for postmeta in node.iterfind(WP + 'postmeta'):
         if postmeta.find(WP + 'meta_key').text == meta_key:
             yield postmeta.find(WP + 'meta_value').text
-    
+
 
 
 class WXRSource(object):
@@ -99,9 +99,9 @@ def __iter__(self):
             item['_disqus_thread_id'] = self.extract_disqus_thread_id(node)
             # capture image attachments as represented by the 'Image' postmeta
             # key.  Ensure that the image urls are unique so we don't download
-            # any of them more than once.  
+            # any of them more than once.
             item['_postmeta_images'] = self.extract_postmeta_images(node)
-            # capture wordpress attachments as represented by the 
+            # capture wordpress attachments as represented by the
             # 'wp:attachment_url' tag and associated post metadata
             item['_wordpress_attachments'] = self.extract_wp_attachments(node)
 
@@ -223,15 +223,18 @@ def __iter__(self):
             yield item
 
     PRE_RE = re.compile(r'(<pre>.*?</pre>)', re.IGNORECASE | re.DOTALL)
+    CAPTION_RE = re.compile(r'\[/?caption.*?\]', re.IGNORECASE | re.DOTALL)
     def cleanup_text(self, text):
         # - encode if necessary
         # - normalize newlines
         # - replace double-newlines with paragraph tags
         # - replace single newlines with linebreak tags
+        # - remove custom caption tag
         if isinstance(text, unicode):
             text = text.encode('utf8')
         text = self.PRE_RE.sub(lambda x: x.group(1).replace('\r\n\r\n', '\n\n'), text)
         text = text.replace('\r\n\r\n', '<p>').replace('\r\n','\n').replace('\n', '<br />\n')
+        text = self.CAPTION_RE.sub('', text)
         return text
 
         # TODO: handle [googlevideo] links, [gallery], [caption], others?
@@ -408,8 +411,8 @@ def __iter__(self):
 class WPPostmetaEnclosureSource(object):
     """download and insert into the pipeline any files referenced in 'enclosure'
     postmeta tags
-    
-    enclosures will be a list of dicts with the keys 'url', 'size' and 
+
+    enclosures will be a list of dicts with the keys 'url', 'size' and
     'mimetype'
     """
     classProvides(ISectionBlueprint)
@@ -430,7 +433,7 @@ def __iter__(self):
 
             # XXX: it would be good to add a relationship between enclosures
             # and the posts they are related to.  How might we do this?
-            # 
+            #
             item['_enclosure_internal_paths'] = []
             for enclosure in item[self.enclosure_key]:
                 res = safe_urlopen(enclosure['url'])
@@ -449,21 +452,21 @@ def __iter__(self):
                         encl['portal_type'] = 'File'
                         filetitle = 'file'
                         item_key = 'file'
-                    # wrap the data so it'll get added with the correct 
+                    # wrap the data so it'll get added with the correct
                     # filename & mimetype
-                    data = File(filename, filetitle, StringIO(res.read()), 
+                    data = File(filename, filetitle, StringIO(res.read()),
                                 enclosure['mimetype'])
                     path = '/'.join([self.base_path, filename])
                     # XXX avoid collisions
                     encl['_path'] = path
                     encl[item_key] = data
                     logger.info('Importing %s' % path)
                     # add the location where this enclosure will be added
-                    # to the list of internal enclosures.  We can use this 
+                    # to the list of internal enclosures.  We can use this
                     # later as a way of connecting the original item to the
                     # enclosure.
                     item['_enclosure_internal_paths'].append(path)
-                    # yield the enclosure first so it will exist when the 
+                    # yield the enclosure first so it will exist when the
                     # containing item is created.
                     yield encl