Skip to content

Commit

Permalink
Support indexing the ISO files from the IOOS Catalog (#574)
Browse files Browse the repository at this point in the history
* Catch errors when loading ISO files and improve error logging

Carry through the actual exception when trying to insert records

* Don't run VACUUM ANALYZE in a transaction

* Ignore some shell magic files (direnv, autoenv) and pytest cache

* Allow folders of ISO files in the functionaltests data folder

* Sort the allowed resource types response so it can be tested reliably

* Raise an appropriate exception for response on HTTP exceptions

This turns

```
<ows:ExceptionText>Harvest failed: record parsing failed: local variable 'md' referenced before assignment</ows:ExceptionText>
```

into

```
<ows:ExceptionText>Harvest failed: record parsing failed: 404 Client Error: Not Found for url: http://maps.cera.govt.nz/arcgis/services/CERA/CERA_TechClasses_WGS84/MapServer/WFSServer?service=WFS&amp;request=GetCapabilities&amp;version=2.0.0</ows:ExceptionText>
```

* Add functional test with duplicate fileID records

* Parse both bytes and str meta records. fixes #561

* Better sanitizing and testing of the identifier on export. fixes #566

Return the loaded and exported records from the admin functions

Export records in functional tests if an "export" directory exists

I spent a good amount of time trying to integrate this into the testing
suite more cleanly (not in the _initialize_database function) but came
up with no solution that didn't require a large refactor of the testing
code.
  • Loading branch information
kwilcox authored and tomkralidis committed Aug 27, 2018
1 parent d008973 commit f82fa0f
Show file tree
Hide file tree
Showing 25 changed files with 6,843 additions and 31 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,14 @@ tests/results
**.cache
.coverage
.tox
.pytest_cache

# test configurations
/default.cfg

# pycharm ide
.idea

# shell files
.env
.envrc
67 changes: 53 additions & 14 deletions pycsw/core/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,10 +310,13 @@ def setup_db(database, table, home, create_sfsql_tables=True, create_plpythonu_f

def load_records(context, database, table, xml_dirpath, recursive=False, force_update=False):
"""Load metadata records from directory of files to database"""
from sqlalchemy.exc import DBAPIError

repo = repository.Repository(database, context, table=table)

file_list = []

loaded_files = set()
if os.path.isfile(xml_dirpath):
file_list.append(xml_dirpath)
elif recursive:
Expand All @@ -334,11 +337,18 @@ def load_records(context, database, table, xml_dirpath, recursive=False, force_u
# read document
try:
exml = etree.parse(recfile, context.parser)
except etree.XMLSyntaxError as err:
LOGGER.error('XML document "%s" is not well-formed', recfile)
continue
except Exception as err:
LOGGER.exception('XML document is not well-formed')
LOGGER.exception('XML document "%s" is not well-formed', recfile)
continue

record = metadata.parse_record(context, exml, repo)
try:
record = metadata.parse_record(context, exml, repo)
except Exception as err:
LOGGER.exception('Could not parse "%s" as an XML record', recfile)
continue

for rec in record:
LOGGER.info('Inserting %s %s into database %s, table %s ....',
Expand All @@ -347,14 +357,23 @@ def load_records(context, database, table, xml_dirpath, recursive=False, force_u
# TODO: do this as CSW Harvest
try:
repo.insert(rec, 'local', util.get_today_and_now())
LOGGER.info('Inserted')
except RuntimeError as err:
loaded_files.add(recfile)
LOGGER.info('Inserted %s', recfile)
except Exception as err:
if force_update:
LOGGER.info('Record exists. Updating.')
repo.update(rec)
LOGGER.info('Updated')
LOGGER.info('Updated %s', recfile)
loaded_files.add(recfile)
else:
LOGGER.error('ERROR: not inserted %s', err)
if isinstance(err, DBAPIError) and err.args:
# Pull a decent database error message and not the full SQL that was run
# since INSERT SQL statements are rather large.
LOGGER.error('ERROR: %s not inserted: %s', recfile, err.args[0])
else:
LOGGER.error('ERROR: %s not inserted: %s', recfile, err)

return tuple(loaded_files)


def export_records(context, database, table, xml_dirpath):
Expand All @@ -370,6 +389,8 @@ def export_records(context, database, table, xml_dirpath):

dirpath = os.path.abspath(xml_dirpath)

exported_files = set()

if not os.path.exists(dirpath):
LOGGER.info('Directory %s does not exist. Creating...', dirpath)
try:
Expand All @@ -384,11 +405,9 @@ def export_records(context, database, table, xml_dirpath):
context.md_core_model['mappings']['pycsw:Identifier'])

LOGGER.info('Processing %s', identifier)
if identifier.find(':') != -1: # it's a URN
# sanitize identifier
LOGGER.info(' Sanitizing identifier')
identifier = identifier.split(':')[-1]

# sanitize identifier
identifier = util.secure_filename(identifier)
# write to XML document
filename = os.path.join(dirpath, '%s.xml' % identifier)
try:
Expand All @@ -400,10 +419,17 @@ def export_records(context, database, table, xml_dirpath):
with open(filename, 'w') as xml:
xml.write('<?xml version="1.0" encoding="UTF-8"?>\n')
xml.write(str_xml)

except Exception as err:
LOGGER.exception('Error writing to disk')
raise RuntimeError("Error writing to %s" % filename, err)
# Something went wrong so skip over this file but log an error
LOGGER.exception('Error writing %s to disk', filename)
# If we wrote a partial file or created an empty file make sure it is removed
if os.path.exists(filename):
os.remove(filename)
continue
else:
exported_files.add(filename)

return tuple(exported_files)


def refresh_harvested_records(context, database, table, url):
Expand Down Expand Up @@ -452,10 +478,23 @@ def rebuild_db_indexes(database, table):

def optimize_db(context, database, table):
"""Optimize database"""
from sqlalchemy.exc import ArgumentError, OperationalError

LOGGER.info('Optimizing database %s', database)
repos = repository.Repository(database, context, table=table)
repos.engine.connect().execute('VACUUM ANALYZE').close()
connection = repos.engine.connect()
try:
# PostgreSQL
connection.execution_options(isolation_level="AUTOCOMMIT")
connection.execute('VACUUM ANALYZE')
except (ArgumentError, OperationalError):
# SQLite
connection.autocommit = True
connection.execute('VACUUM')
connection.execute('ANALYZE')
finally:
connection.close()
LOGGER.info('Done')


def gen_sitemap(context, database, table, url, output_file):
Expand Down
6 changes: 5 additions & 1 deletion pycsw/core/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import logging
import uuid
from six.moves import range
from six import text_type, binary_type
from six.moves.urllib.parse import urlparse

from geolinks import sniff_link
Expand Down Expand Up @@ -124,7 +125,7 @@ def _set(context, obj, name, value):
def _parse_metadata(context, repos, record):
"""parse metadata formats"""

if isinstance(record, str):
if isinstance(record, binary_type) or isinstance(record, text_type):
exml = etree.fromstring(record, context.parser)
else: # already serialized to lxml
if hasattr(record, 'getroot'): # standalone document
Expand Down Expand Up @@ -584,6 +585,7 @@ def _parse_wmts(context, repos, record, identifier):

def _parse_wfs(context, repos, record, identifier, version):

import requests
from owslib.wfs import WebFeatureService

bboxs = []
Expand All @@ -592,6 +594,8 @@ def _parse_wfs(context, repos, record, identifier, version):

try:
md = WebFeatureService(record, version)
except requests.exceptions.HTTPError as err:
raise
except Exception as err:
if version == '1.1.0':
md = WebFeatureService(record, '1.0.0')
Expand Down
4 changes: 1 addition & 3 deletions pycsw/core/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,9 +300,7 @@ def insert(self, record, source, insert_date):
self.session.commit()
except Exception as err:
self.session.rollback()
msg = 'Cannot commit to repository'
LOGGER.exception(msg)
raise RuntimeError(msg)
raise

def update(self, record=None, recprops=None, constraint=None):
''' Update a record in the repository based on identifier '''
Expand Down
54 changes: 54 additions & 0 deletions pycsw/core/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
#
# =================================================================

import os
import re
import datetime
import logging
import time
Expand All @@ -51,6 +53,12 @@
ranking_pass = False
ranking_query_geometry = ''

# Lookups for the secure_filename function
# https://github.com/pallets/werkzeug/blob/778f482d1ac0c9e8e98f774d2595e9074e6984d7/werkzeug/utils.py#L30-L31
_filename_ascii_strip_re = re.compile(r'[^A-Za-z0-9_.-]')
_windows_device_files = ('CON', 'AUX', 'COM1', 'COM2', 'COM3', 'COM4', 'LPT1',
'LPT2', 'LPT3', 'PRN', 'NUL')


def get_today_and_now():
"""Get the date, right now, in ISO8601"""
Expand Down Expand Up @@ -338,3 +346,49 @@ def get_anytext(bag):
bag = etree.fromstring(bag, PARSER)
# get all XML element content
return ' '.join([value.strip() for value in bag.xpath('//text()')])


# https://github.com/pallets/werkzeug/blob/778f482d1ac0c9e8e98f774d2595e9074e6984d7/werkzeug/utils.py#L253
def secure_filename(filename):
    r"""Return an ASCII-only, filesystem-safe version of *filename*.

    The result can safely be stored on a regular file system and passed
    to :func:`os.path.join` without escaping the target directory.  On
    Windows the function also avoids names reserved for special device
    files.

    >>> secure_filename("My cool movie.mov")
    'My_cool_movie.mov'
    >>> secure_filename("../../../etc/passwd")
    'etc_passwd'
    >>> secure_filename(u'i contain cool \xfcml\xe4uts.txt')
    'i_contain_cool_umlauts.txt'

    The function might return an empty filename. It's your responsibility
    to ensure that the filename is unique and that you abort or
    generate a random filename if the function returned an empty one.
    .. versionadded:: 0.5
    :param filename: the filename to secure
    """
    if isinstance(filename, six.text_type):
        # Transliterate to ASCII, silently dropping characters that
        # have no ASCII equivalent after NFKD decomposition.
        from unicodedata import normalize
        filename = normalize('NFKD', filename).encode('ascii', 'ignore')
        if not six.PY2:
            filename = filename.decode('ascii')

    # Turn path separators into spaces so they collapse to '_' below.
    for separator in (os.path.sep, os.path.altsep):
        if separator:
            filename = filename.replace(separator, ' ')

    # Join whitespace-separated pieces with underscores, strip any
    # remaining unsafe characters, then trim leading/trailing '.'/'_'.
    joined = '_'.join(filename.split())
    filename = str(_filename_ascii_strip_re.sub('', joined)).strip('._')

    # Windows treats device names (CON, NUL, ...) as special in every
    # folder; prefix an underscore so the result never matches one.
    if os.name == 'nt' and filename and \
            filename.split('.')[0].upper() in _windows_device_files:
        filename = '_' + filename

    return filename
4 changes: 2 additions & 2 deletions pycsw/ogc/csw/csw2.py
Original file line number Diff line number Diff line change
Expand Up @@ -1258,8 +1258,8 @@ def harvest(self):
return self.exceptionreport('InvalidParameterValue',
'resourcetype', 'Invalid resource type parameter: %s.\
Allowable resourcetype values: %s' % (self.parent.kvp['resourcetype'],
','.join(self.parent.context.model['operations']['Harvest']['parameters']
['ResourceType']['values'])))
','.join(sorted(self.parent.context.model['operations']['Harvest']['parameters']
['ResourceType']['values']))))

if (self.parent.kvp['resourcetype'].find('opengis.net') == -1 and
self.parent.kvp['resourcetype'].find('urn:geoss:waf') == -1):
Expand Down
4 changes: 2 additions & 2 deletions pycsw/ogc/csw/csw3.py
Original file line number Diff line number Diff line change
Expand Up @@ -1321,8 +1321,8 @@ def harvest(self):
return self.exceptionreport('InvalidParameterValue',
'resourcetype', 'Invalid resource type parameter: %s.\
Allowable resourcetype values: %s' % (self.parent.kvp['resourcetype'],
','.join(self.parent.context.model['operations']['Harvest']['parameters']
['ResourceType']['values'])))
','.join(sorted(self.parent.context.model['operations']['Harvest']['parameters']
['ResourceType']['values']))))

if (self.parent.kvp['resourcetype'].find('opengis.net') == -1 and
self.parent.kvp['resourcetype'].find('urn:geoss:waf') == -1):
Expand Down

0 comments on commit f82fa0f

Please sign in to comment.