Skip to content

Commit

Permalink
Support indexing the ISO files from the IOOS Catalog (#574)
Browse files Browse the repository at this point in the history
* Catch errors when loading ISO files and improve error logging

Carry through the actual exception when trying to insert records

* Don't run VACUUM ANALYZE in a transaction

* Ignore some shell magic files (direnv, autoenv) and pytest cache

* Allow folders of ISO files in the functionaltests data folder

* Sort the allowed resource types response so it can be tested reliably

* Raise an appropriate exception for response on HTTP exceptions

This turns

```
<ows:ExceptionText>Harvest failed: record parsing failed: local variable 'md' referenced before assignment</ows:ExceptionText>
```

into

```
<ows:ExceptionText>Harvest failed: record parsing failed: 404 Client Error: Not Found for url: http://maps.cera.govt.nz/arcgis/services/CERA/CERA_TechClasses_WGS84/MapServer/WFSServer?service=WFS&amp;request=GetCapabilities&amp;version=2.0.0</ows:ExceptionText>
```

* Add functional test with duplicate fileID records

* Parse both bytes and str meta records. fixes #561

* Better sanitizing and testing of the identifier on export. fixes #566

Return the loaded and exported records from the admin functions

Export records in functional tests if an "export" directory exists

I spent a good amount of time trying to integrate this into the testing
suite more cleanly (not in the _initialize_database function) but came
up with no solution that didn't require a large refactor of the testing
code.
  • Loading branch information
kwilcox authored and tomkralidis committed Aug 27, 2018
1 parent d008973 commit f82fa0f
Show file tree
Hide file tree
Showing 25 changed files with 6,843 additions and 31 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,14 @@ tests/results
**.cache
.coverage
.tox
.pytest_cache

# test configurations
/default.cfg

# pycharm ide
.idea

# shell files
.env
.envrc
67 changes: 53 additions & 14 deletions pycsw/core/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,10 +310,13 @@ def setup_db(database, table, home, create_sfsql_tables=True, create_plpythonu_f

def load_records(context, database, table, xml_dirpath, recursive=False, force_update=False):
"""Load metadata records from directory of files to database"""
from sqlalchemy.exc import DBAPIError

repo = repository.Repository(database, context, table=table)

file_list = []

loaded_files = set()
if os.path.isfile(xml_dirpath):
file_list.append(xml_dirpath)
elif recursive:
Expand All @@ -334,11 +337,18 @@ def load_records(context, database, table, xml_dirpath, recursive=False, force_u
# read document
try:
exml = etree.parse(recfile, context.parser)
except etree.XMLSyntaxError as err:
LOGGER.error('XML document "%s" is not well-formed', recfile)
continue
except Exception as err:
LOGGER.exception('XML document is not well-formed')
LOGGER.exception('XML document "%s" is not well-formed', recfile)
continue

record = metadata.parse_record(context, exml, repo)
try:
record = metadata.parse_record(context, exml, repo)
except Exception as err:
LOGGER.exception('Could not parse "%s" as an XML record', recfile)
continue

for rec in record:
LOGGER.info('Inserting %s %s into database %s, table %s ....',
Expand All @@ -347,14 +357,23 @@ def load_records(context, database, table, xml_dirpath, recursive=False, force_u
# TODO: do this as CSW Harvest
try:
repo.insert(rec, 'local', util.get_today_and_now())
LOGGER.info('Inserted')
except RuntimeError as err:
loaded_files.add(recfile)
LOGGER.info('Inserted %s', recfile)
except Exception as err:
if force_update:
LOGGER.info('Record exists. Updating.')
repo.update(rec)
LOGGER.info('Updated')
LOGGER.info('Updated %s', recfile)
loaded_files.add(recfile)
else:
LOGGER.error('ERROR: not inserted %s', err)
if isinstance(err, DBAPIError) and err.args:
# Pull a decent database error message and not the full SQL that was run
# since INSERT SQL statements are rather large.
LOGGER.error('ERROR: %s not inserted: %s', recfile, err.args[0])
else:
LOGGER.error('ERROR: %s not inserted: %s', recfile, err)

return tuple(loaded_files)


def export_records(context, database, table, xml_dirpath):
Expand All @@ -370,6 +389,8 @@ def export_records(context, database, table, xml_dirpath):

dirpath = os.path.abspath(xml_dirpath)

exported_files = set()

if not os.path.exists(dirpath):
LOGGER.info('Directory %s does not exist. Creating...', dirpath)
try:
Expand All @@ -384,11 +405,9 @@ def export_records(context, database, table, xml_dirpath):
context.md_core_model['mappings']['pycsw:Identifier'])

LOGGER.info('Processing %s', identifier)
if identifier.find(':') != -1: # it's a URN
# sanitize identifier
LOGGER.info(' Sanitizing identifier')
identifier = identifier.split(':')[-1]

# sanitize identifier
identifier = util.secure_filename(identifier)
# write to XML document
filename = os.path.join(dirpath, '%s.xml' % identifier)
try:
Expand All @@ -400,10 +419,17 @@ def export_records(context, database, table, xml_dirpath):
with open(filename, 'w') as xml:
xml.write('<?xml version="1.0" encoding="UTF-8"?>\n')
xml.write(str_xml)

except Exception as err:
LOGGER.exception('Error writing to disk')
raise RuntimeError("Error writing to %s" % filename, err)
# Something went wrong so skip over this file but log an error
LOGGER.exception('Error writing %s to disk', filename)
# If we wrote a partial file or created an empty file make sure it is removed
if os.path.exists(filename):
os.remove(filename)
continue
else:
exported_files.add(filename)

return tuple(exported_files)


def refresh_harvested_records(context, database, table, url):
Expand Down Expand Up @@ -452,10 +478,23 @@ def rebuild_db_indexes(database, table):

def optimize_db(context, database, table):
"""Optimize database"""
from sqlalchemy.exc import ArgumentError, OperationalError

LOGGER.info('Optimizing database %s', database)
repos = repository.Repository(database, context, table=table)
repos.engine.connect().execute('VACUUM ANALYZE').close()
connection = repos.engine.connect()
try:
# PostgreSQL
connection.execution_options(isolation_level="AUTOCOMMIT")
connection.execute('VACUUM ANALYZE')
except (ArgumentError, OperationalError):
# SQLite
connection.autocommit = True
connection.execute('VACUUM')
connection.execute('ANALYZE')
finally:
connection.close()
LOGGER.info('Done')


def gen_sitemap(context, database, table, url, output_file):
Expand Down
6 changes: 5 additions & 1 deletion pycsw/core/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import logging
import uuid
from six.moves import range
from six import text_type, binary_type
from six.moves.urllib.parse import urlparse

from geolinks import sniff_link
Expand Down Expand Up @@ -124,7 +125,7 @@ def _set(context, obj, name, value):
def _parse_metadata(context, repos, record):
"""parse metadata formats"""

if isinstance(record, str):
if isinstance(record, binary_type) or isinstance(record, text_type):
exml = etree.fromstring(record, context.parser)
else: # already serialized to lxml
if hasattr(record, 'getroot'): # standalone document
Expand Down Expand Up @@ -584,6 +585,7 @@ def _parse_wmts(context, repos, record, identifier):

def _parse_wfs(context, repos, record, identifier, version):

import requests
from owslib.wfs import WebFeatureService

bboxs = []
Expand All @@ -592,6 +594,8 @@ def _parse_wfs(context, repos, record, identifier, version):

try:
md = WebFeatureService(record, version)
except requests.exceptions.HTTPError as err:
raise
except Exception as err:
if version == '1.1.0':
md = WebFeatureService(record, '1.0.0')
Expand Down
4 changes: 1 addition & 3 deletions pycsw/core/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,9 +300,7 @@ def insert(self, record, source, insert_date):
self.session.commit()
except Exception as err:
self.session.rollback()
msg = 'Cannot commit to repository'
LOGGER.exception(msg)
raise RuntimeError(msg)
raise

def update(self, record=None, recprops=None, constraint=None):
''' Update a record in the repository based on identifier '''
Expand Down
54 changes: 54 additions & 0 deletions pycsw/core/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
#
# =================================================================

import os
import re
import datetime
import logging
import time
Expand All @@ -51,6 +53,12 @@
ranking_pass = False
ranking_query_geometry = ''

# Lookups for the secure_filename function
# https://github.com/pallets/werkzeug/blob/778f482d1ac0c9e8e98f774d2595e9074e6984d7/werkzeug/utils.py#L30-L31
_filename_ascii_strip_re = re.compile(r'[^A-Za-z0-9_.-]')
_windows_device_files = ('CON', 'AUX', 'COM1', 'COM2', 'COM3', 'COM4', 'LPT1',
'LPT2', 'LPT3', 'PRN', 'NUL')


def get_today_and_now():
"""Get the date, right now, in ISO8601"""
Expand Down Expand Up @@ -338,3 +346,49 @@ def get_anytext(bag):
bag = etree.fromstring(bag, PARSER)
# get all XML element content
return ' '.join([value.strip() for value in bag.xpath('//text()')])


# https://github.com/pallets/werkzeug/blob/778f482d1ac0c9e8e98f774d2595e9074e6984d7/werkzeug/utils.py#L253
def secure_filename(filename):
    r"""Return an ASCII-only, filesystem-safe version of *filename*.

    The result can safely be stored on a regular file system and passed
    to :func:`os.path.join` without escaping the target directory.  On
    Windows the function also avoids names reserved for special device
    files.

    >>> secure_filename("My cool movie.mov")
    'My_cool_movie.mov'
    >>> secure_filename("../../../etc/passwd")
    'etc_passwd'
    >>> secure_filename(u'i contain cool \xfcml\xe4uts.txt')
    'i_contain_cool_umlauts.txt'

    The function might return an empty filename. It's your responsibility
    to ensure that the filename is unique and that you abort or
    generate a random filename if the function returned an empty one.
    .. versionadded:: 0.5
    :param filename: the filename to secure
    """
    if isinstance(filename, six.text_type):
        # Transliterate to ASCII, silently dropping characters that
        # have no ASCII equivalent after NFKD decomposition.
        from unicodedata import normalize
        filename = normalize('NFKD', filename).encode('ascii', 'ignore')
        if not six.PY2:
            filename = filename.decode('ascii')

    # Turn path separators into spaces so they collapse to '_' below.
    for separator in (os.path.sep, os.path.altsep):
        if separator:
            filename = filename.replace(separator, ' ')

    # Join whitespace-separated pieces with underscores, strip any
    # remaining unsafe characters, then trim leading/trailing '.'/'_'.
    joined = '_'.join(filename.split())
    filename = str(_filename_ascii_strip_re.sub('', joined)).strip('._')

    # Windows treats device names (CON, NUL, ...) as special in every
    # folder; prefix an underscore so the result never matches one.
    if os.name == 'nt' and filename and \
            filename.split('.')[0].upper() in _windows_device_files:
        filename = '_' + filename

    return filename
4 changes: 2 additions & 2 deletions pycsw/ogc/csw/csw2.py
Original file line number Diff line number Diff line change
Expand Up @@ -1258,8 +1258,8 @@ def harvest(self):
return self.exceptionreport('InvalidParameterValue',
'resourcetype', 'Invalid resource type parameter: %s.\
Allowable resourcetype values: %s' % (self.parent.kvp['resourcetype'],
','.join(self.parent.context.model['operations']['Harvest']['parameters']
['ResourceType']['values'])))
','.join(sorted(self.parent.context.model['operations']['Harvest']['parameters']
['ResourceType']['values']))))

if (self.parent.kvp['resourcetype'].find('opengis.net') == -1 and
self.parent.kvp['resourcetype'].find('urn:geoss:waf') == -1):
Expand Down
4 changes: 2 additions & 2 deletions pycsw/ogc/csw/csw3.py
Original file line number Diff line number Diff line change
Expand Up @@ -1321,8 +1321,8 @@ def harvest(self):
return self.exceptionreport('InvalidParameterValue',
'resourcetype', 'Invalid resource type parameter: %s.\
Allowable resourcetype values: %s' % (self.parent.kvp['resourcetype'],
','.join(self.parent.context.model['operations']['Harvest']['parameters']
['ResourceType']['values'])))
','.join(sorted(self.parent.context.model['operations']['Harvest']['parameters']
['ResourceType']['values']))))

if (self.parent.kvp['resourcetype'].find('opengis.net') == -1 and
self.parent.kvp['resourcetype'].find('urn:geoss:waf') == -1):
Expand Down

0 comments on commit f82fa0f

Please sign in to comment.