From 1edd8c43de9457321fc95c7b40655b8f917b6006 Mon Sep 17 00:00:00 2001 From: Ian Ward Date: Sat, 15 Jul 2017 15:15:30 -0400 Subject: [PATCH 01/10] [#3556] faster _table_metadata view definition --- ckanext/datastore/set_permissions.sql | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ckanext/datastore/set_permissions.sql b/ckanext/datastore/set_permissions.sql index 79f0cba0c3a..56949d000db 100644 --- a/ckanext/datastore/set_permissions.sql +++ b/ckanext/datastore/set_permissions.sql @@ -55,7 +55,6 @@ CREATE OR REPLACE VIEW "_table_metadata" AS dependee.relname AS name, dependee.oid AS oid, dependent.relname AS alias_of - -- dependent.oid AS oid FROM pg_class AS dependee LEFT OUTER JOIN pg_rewrite AS r ON r.ev_class = dependee.oid @@ -63,9 +62,11 @@ CREATE OR REPLACE VIEW "_table_metadata" AS LEFT OUTER JOIN pg_class AS dependent ON d.refobjid = dependent.oid WHERE (dependee.oid != dependent.oid OR dependent.oid IS NULL) AND - (dependee.relname IN (SELECT tablename FROM pg_catalog.pg_tables) - OR dependee.relname IN (SELECT viewname FROM pg_catalog.pg_views)) AND - dependee.relnamespace = (SELECT oid FROM pg_namespace WHERE nspname='public') + -- is a table (from pg_tables view definition) + -- or is a view (from pg_views view definition) + (dependee.relkind = 'r'::"char" OR dependee.relkind = 'v'::"char") + AND dependee.relnamespace = ( + SELECT oid FROM pg_namespace WHERE nspname='public') ORDER BY dependee.oid DESC; ALTER VIEW "_table_metadata" OWNER TO {writeuser}; GRANT SELECT ON "_table_metadata" TO {readuser}; From 83542a9608a87e00ba7fa2183e2820cf236f143f Mon Sep 17 00:00:00 2001 From: Ian Ward Date: Sat, 15 Jul 2017 18:56:39 -0400 Subject: [PATCH 02/10] [#3556] populate_full_text_trigger + migration --- ckanext/datastore/set_permissions.sql | 43 ++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/ckanext/datastore/set_permissions.sql b/ckanext/datastore/set_permissions.sql index 56949d000db..cfea440478a 100644 --- a/ckanext/datastore/set_permissions.sql +++ b/ckanext/datastore/set_permissions.sql @@ -49,6 +49,7 @@ GRANT SELECT ON ALL TABLES IN SCHEMA public TO {readuser}; ALTER DEFAULT PRIVILEGES FOR USER {writeuser} IN SCHEMA public GRANT SELECT ON TABLES TO {readuser}; +-- a view for listing valid table (resource id) and view names CREATE OR REPLACE VIEW "_table_metadata" AS SELECT DISTINCT substr(md5(dependee.relname || COALESCE(dependent.relname, '')), 0, 17) AS "_id", @@ -62,11 +63,45 @@ CREATE OR REPLACE VIEW "_table_metadata" AS LEFT OUTER JOIN pg_class AS dependent ON d.refobjid = dependent.oid WHERE (dependee.oid != dependent.oid OR dependent.oid IS NULL) AND - -- is a table (from pg_tables view definition) - -- or is a view (from pg_views view definition) + -- is a table (from pg_tables view definition) + -- or is a view (from pg_views view definition) (dependee.relkind = 'r'::"char" OR dependee.relkind = 'v'::"char") - AND dependee.relnamespace = ( - SELECT oid FROM pg_namespace WHERE nspname='public') + AND dependee.relnamespace = ( + SELECT oid FROM pg_namespace WHERE nspname='public') ORDER BY dependee.oid DESC; ALTER VIEW "_table_metadata" OWNER TO {writeuser}; GRANT SELECT ON "_table_metadata" TO {readuser}; + +-- _full_text fields are now updated by a trigger when set to NULL +CREATE OR REPLACE FUNCTION populate_full_text_trigger() RETURNS trigger +AS $body$ + BEGIN + IF NEW._full_text IS NOT NULL THEN + RETURN NEW; + END IF; + NEW._full_text := ( + SELECT to_tsvector(string_agg(value, ' ')) + FROM json_each_text(row_to_json(NEW.*)) + WHERE key NOT LIKE '\_%'); + RETURN NEW; + END; +$body$ LANGUAGE plpgsql; + +-- migrate existing tables that don't have full text trigger applied +DO $body$ + BEGIN + EXECUTE coalesce( + (SELECT string_agg( + 'CREATE TRIGGER zfulltext BEFORE INSERT OR UPDATE ON ' || + quote_ident(relname) || 'FOR EACH ROW EXECUTE PROCEDURE ' || + 'populate_full_text_trigger();', ' ') + FROM pg_class + LEFT OUTER JOIN pg_trigger AS t + ON t.tgrelid = relname::regclass AND t.tgname = 'zfulltext' + WHERE relkind = 'r'::"char" AND t.tgname IS NULL + AND relnamespace = ( + SELECT oid FROM pg_namespace WHERE nspname='public')), + 'SELECT 1;'); + END; +$body$; + From 228b9327cb182ce061db191bf24d451f7543faac Mon Sep 17 00:00:00 2001 From: Ian Ward Date: Sat, 15 Jul 2017 19:45:27 -0400 Subject: [PATCH 03/10] [#3556] populate_full_text_trigger owner --- ckanext/datastore/set_permissions.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/ckanext/datastore/set_permissions.sql b/ckanext/datastore/set_permissions.sql index cfea440478a..fd42fc1901b 100644 --- a/ckanext/datastore/set_permissions.sql +++ b/ckanext/datastore/set_permissions.sql @@ -86,6 +86,7 @@ AS $body$ RETURN NEW; END; $body$ LANGUAGE plpgsql; +ALTER FUNCTION populate_full_text_trigger() OWNER TO {writeuser}; -- migrate existing tables that don't have full text trigger applied DO $body$ From 3ee86e9bd1ee9a2bc7285b53c214490ceac80999 Mon Sep 17 00:00:00 2001 From: Ian Ward Date: Sat, 15 Jul 2017 20:17:36 -0400 Subject: [PATCH 04/10] [#3556] datastore_search update whitelist check _table_metadata no longer appears in _table_metadata (and thats just fine) --- ckanext/datastore/logic/action.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ckanext/datastore/logic/action.py b/ckanext/datastore/logic/action.py index 9221f0d751f..d40067bdbe0 100644 --- a/ckanext/datastore/logic/action.py +++ b/ckanext/datastore/logic/action.py @@ -450,16 +450,16 @@ def datastore_search(context, data_dict): res_id = data_dict['resource_id'] - res_exists, real_id = backend.resource_id_from_alias(res_id) - # Resource only has to exist in the datastore (because it could be an - # alias) + if data_dict['resource_id'] not in WHITELISTED_RESOURCES: + res_exists, real_id = backend.resource_id_from_alias(res_id) + # Resource only has to exist in the datastore (because it could be an + # alias) - if not res_exists: - raise p.toolkit.ObjectNotFound(p.toolkit._( - 'Resource "{0}" was not found.'.format(res_id) - )) + if not res_exists: + raise p.toolkit.ObjectNotFound(p.toolkit._( + 'Resource "{0}" was not found.'.format(res_id) + )) - if data_dict['resource_id'] not in WHITELISTED_RESOURCES: # Replace potential alias with real id to simplify access checks if real_id: data_dict['resource_id'] = real_id From eafb8603485f47e3998bd3ffe26271ce394b7e70 Mon Sep 17 00:00:00 2001 From: Ian Ward Date: Sat, 15 Jul 2017 21:37:25 -0400 Subject: [PATCH 05/10] [#3556] rely on populate_full_text_trigger remove _to_full_text, json_get_values in python --- ckanext/datastore/backend/postgres.py | 77 +++++++++++---------------- ckanext/datastore/tests/helpers.py | 2 +- ckanext/datastore/tests/test_db.py | 20 ------- 3 files changed, 31 insertions(+), 68 deletions(-) diff --git a/ckanext/datastore/backend/postgres.py b/ckanext/datastore/backend/postgres.py index e3ecd97b1cb..292ae8074d4 100644 --- a/ckanext/datastore/backend/postgres.py +++ b/ckanext/datastore/backend/postgres.py @@ -349,22 +349,6 @@ def _validate_record(record, num, field_names): }) -def _to_full_text(fields, record): - full_text = [] - ft_types = ['int8', 'int4', 'int2', 'float4', 'float8', 'date', 'time', - 'timetz', 'timestamp', 'numeric', 'text'] - for field in fields: - value = record.get(field['id']) - if not value: - continue - - if field['type'].lower() in ft_types and unicode(value): - full_text.append(unicode(value)) - else: - full_text.extend(json_get_values(value)) - return ' '.join(set(full_text)) - - def _where_clauses(data_dict, fields_types): filters = data_dict.get('filters', {}) clauses = [] @@ -755,19 +739,6 @@ def convert(data, type_name): return unicode(data) -def json_get_values(obj, current_list=None): - if current_list is None: - current_list = [] - if isinstance(obj, list) or isinstance(obj, tuple): - for item in obj: - json_get_values(item, current_list) - elif isinstance(obj, dict): - json_get_values(obj.items(), current_list) - elif obj: - current_list.append(unicode(obj)) - return current_list - - def check_fields(context, fields): '''Check if field types are valid.''' for field in fields: @@ -1027,8 +998,11 @@ def upsert_data(context, data_dict): fields = _get_fields(context, data_dict) field_names = _pluck('id', fields) records = data_dict['records'] - sql_columns = ", ".join(['"%s"' % name.replace( - '%', '%%') for name in field_names] + ['"_full_text"']) + sql_columns = ", ".join( + identifier(name) for name in field_names) + sql_full_text = " || ' ' || ".join( + identifier(name) + "::text" + for name in field_names) if method == _INSERT: rows = [] @@ -1042,13 +1016,12 @@ def upsert_data(context, data_dict): # a tuple with an empty second value value = (json.dumps(value), '') row.append(value) - row.append(_to_full_text(fields, record)) rows.append(row) - sql_string = u'''INSERT INTO "{res_id}" ({columns}) - VALUES ({values}, to_tsvector(%s));'''.format( - res_id=data_dict['resource_id'], - columns=sql_columns, + sql_string = u'''INSERT INTO {res_id} ({columns}) + VALUES ({values});'''.format( + res_id=identifier(data_dict['resource_id']), + columns=sql_columns.replace('%', '%%'), values=', '.join(['%s' for field in field_names]) ) @@ -1104,18 +1077,16 @@ def upsert_data(context, data_dict): used_values = [record[field] for field in used_field_names] - full_text = _to_full_text(fields, record) - if method == _UPDATE: sql_string = u''' UPDATE "{res_id}" - SET ({columns}, "_full_text") = ({values}, to_tsvector(%s)) + SET ({columns}, "_full_text") = ({values}, NULL) WHERE ({primary_key}) = ({primary_value}); '''.format( res_id=data_dict['resource_id'], columns=u', '.join( - [u'"{0}"'.format(field) - for field in used_field_names]), + [identifier(field) + for field in used_field_names]).replace('%', '%%'), values=u', '.join( ['%s' for _ in used_field_names]), primary_key=u','.join( @@ -1124,7 +1095,7 @@ def upsert_data(context, data_dict): ) try: results = context['connection'].execute( - sql_string, used_values + [full_text] + unique_values) + sql_string, used_values + unique_values) except sqlalchemy.exc.DatabaseError as err: raise ValidationError({ u'records': [_programming_error_summary(err)], @@ -1139,10 +1110,10 @@ def upsert_data(context, data_dict): elif method == _UPSERT: sql_string = u''' UPDATE "{res_id}" - SET ({columns}, "_full_text") = ({values}, to_tsvector(%s)) + SET ({columns}, "_full_text") = ({values}, NULL) WHERE ({primary_key}) = ({primary_value}); - INSERT INTO "{res_id}" ({columns}, "_full_text") - SELECT {values}, to_tsvector(%s) + INSERT INTO "{res_id}" ({columns}) + SELECT {values} WHERE NOT EXISTS (SELECT 1 FROM "{res_id}" WHERE ({primary_key}) = ({primary_value})); '''.format( @@ -1159,7 +1130,7 @@ def upsert_data(context, data_dict): try: context['connection'].execute( sql_string, - (used_values + [full_text] + unique_values) * 2) + (used_values + unique_values) * 2) except sqlalchemy.exc.DatabaseError as err: raise ValidationError({ u'records': [_programming_error_summary(err)], @@ -1391,7 +1362,8 @@ def _create_triggers(connection, resource_id, triggers): or "for_each" parameters from triggers list. ''' existing = connection.execute( - u'SELECT tgname FROM pg_trigger WHERE tgrelid = %s::regclass', + u"""SELECT tgname FROM pg_trigger + WHERE tgrelid = %s::regclass AND tgname LIKE 't___'""", resource_id) sql_list = ( [u'DROP TRIGGER {name} ON {table}'.format( @@ -1413,6 +1385,14 @@ def _create_triggers(connection, resource_id, triggers): raise ValidationError({u'triggers': [_programming_error_summary(pe)]}) +def _create_fulltext_trigger(connection, resource_id): + connection.execute( + u'''CREATE TRIGGER zfulltext + BEFORE INSERT OR UPDATE ON {table} + FOR EACH ROW EXECUTE PROCEDURE populate_full_text_trigger()'''.format( + table=identifier(resource_id))) + + def upsert(context, data_dict): ''' This method combines upsert insert and update on the datastore. The method @@ -1804,6 +1784,9 @@ def create(self, context, data_dict): ).fetchone() if not result: create_table(context, data_dict) + _create_fulltext_trigger( + context['connection'], + data_dict['resource_id']) else: alter_table(context, data_dict) if 'triggers' in data_dict: diff --git a/ckanext/datastore/tests/helpers.py b/ckanext/datastore/tests/helpers.py index eb7f19da6c2..de4a2597153 100644 --- a/ckanext/datastore/tests/helpers.py +++ b/ckanext/datastore/tests/helpers.py @@ -26,7 +26,7 @@ def clear_db(Session): SELECT 'drop function ' || quote_ident(proname) || '();' FROM pg_proc INNER JOIN pg_namespace ns ON (pg_proc.pronamespace = ns.oid) - WHERE ns.nspname = 'public' + WHERE ns.nspname = 'public' AND proname != 'populate_full_text_trigger' ''' drop_functions = u''.join(r[0] for r in c.execute(drop_functions_sql)) if drop_functions: diff --git a/ckanext/datastore/tests/test_db.py b/ckanext/datastore/tests/test_db.py index 069bd446e43..c7136091735 100644 --- a/ckanext/datastore/tests/test_db.py +++ b/ckanext/datastore/tests/test_db.py @@ -173,26 +173,6 @@ def test_upsert_with_insert_method_and_invalid_data( backend.InvalidDataError, db.upsert_data, context, data_dict) -class TestJsonGetValues(object): - def test_returns_empty_list_if_called_with_none(self): - assert_equal(db.json_get_values(None), []) - - def test_returns_list_with_value_if_called_with_string(self): - assert_equal(db.json_get_values('foo'), ['foo']) - - def test_returns_list_with_only_the_original_truthy_values_if_called(self): - data = [None, 'foo', 42, 'bar', {}, []] - assert_equal(db.json_get_values(data), ['foo', '42', 'bar']) - - def test_returns_flattened_list(self): - data = ['foo', ['bar', ('baz', 42)]] - assert_equal(db.json_get_values(data), ['foo', 'bar', 'baz', '42']) - - def test_returns_only_truthy_values_from_dict(self): - data = {'foo': 'bar', 'baz': [42, None, {}, [], 'hey']} - assert_equal(db.json_get_values(data), ['foo', 'bar', 'baz', '42', 'hey']) - - class TestGetAllResourcesIdsInDatastore(object): @classmethod def setup_class(cls): From d4384761340178daf0e466a626e74af284a1434a Mon Sep 17 00:00:00 2001 From: David Read Date: Fri, 25 Aug 2017 15:56:34 +0000 Subject: [PATCH 06/10] [#3785] Fix whitespace needed after column names. --- ckanext/datastore/set_permissions.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/datastore/set_permissions.sql b/ckanext/datastore/set_permissions.sql index fd42fc1901b..e1a8747c95a 100644 --- a/ckanext/datastore/set_permissions.sql +++ b/ckanext/datastore/set_permissions.sql @@ -94,7 +94,7 @@ DO $body$ EXECUTE coalesce( (SELECT string_agg( 'CREATE TRIGGER zfulltext BEFORE INSERT OR UPDATE ON ' || - quote_ident(relname) || 'FOR EACH ROW EXECUTE PROCEDURE ' || + quote_ident(relname) || ' FOR EACH ROW EXECUTE PROCEDURE ' || 'populate_full_text_trigger();', ' ') FROM pg_class LEFT OUTER JOIN pg_trigger AS t From 021d3d96b54c885cd6182cdb3a60d622204ff936 Mon Sep 17 00:00:00 2001 From: David Read Date: Mon, 28 Aug 2017 08:33:49 +0000 Subject: [PATCH 07/10] [#3786] Add release notes about running set-permissions again after upgrading. --- CHANGELOG.rst | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 30b5a1d0526..aa23d03be11 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,32 @@ Changelog --------- +v.X.X (TBA) +=========== + +Note: Those upgrading to this version of CKAN who run the DataStore need to + re-run the 'datastore set-permissions' command. For a package install run it + like this: + + sudo ckan datastore set-permissions | sudo -u postgres psql --set ON_ERROR_STOP=1 + + Or for a source install: + + . /usr/lib/ckan/default/bin/activate + paster --plugin=ckan datastore set-permissions -c /etc/ckan/default/development.ini + + Developers who run the tests should also do: + + paster datastore set-permissions -c test-core.ini | sudo -u postgres psql + + Failure to run 'datastore set-permissions' will find that new and updated + datasets will not be searchable in DataStore and the logs will contain this + error: + + ProgrammingError: (psycopg2.ProgrammingError) function populate_full_text_trigger() does not exist + [SQL: 'CREATE TRIGGER zfulltext\n BEFORE INSERT OR UPDATE ON "78bf9251-cc2a-4fcf-b761-c2e08b074f18"\n FOR EACH ROW EXECUTE PROCEDURE populate_full_text_trigger()'] + + v2.7.0 2017-08-02 ================= From fa1ad07db8e18e60c5090071947af918212d1280 Mon Sep 17 00:00:00 2001 From: David Read Date: Mon, 28 Aug 2017 08:45:09 +0000 Subject: [PATCH 08/10] [#3786] Release notes tightened up. --- CHANGELOG.rst | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index aa23d03be11..b7a18890082 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,30 +7,20 @@ Changelog --------- -v.X.X (TBA) -=========== - -Note: Those upgrading to this version of CKAN who run the DataStore need to - re-run the 'datastore set-permissions' command. For a package install run it - like this: - - sudo ckan datastore set-permissions | sudo -u postgres psql --set ON_ERROR_STOP=1 - - Or for a source install: - - . /usr/lib/ckan/default/bin/activate - paster --plugin=ckan datastore set-permissions -c /etc/ckan/default/development.ini - - Developers who run the tests should also do: +vX.X.X (TBA) +============ - paster datastore set-permissions -c test-core.ini | sudo -u postgres psql +Note: This version requires re-running the 'datastore set-permissions' command + (assuming you run DataStore). See: + http://docs.ckan.org/en/ckan-2.7.0/maintaining/datastore.html#set-permissions - Failure to run 'datastore set-permissions' will find that new and updated - datasets will not be searchable in DataStore and the logs will contain this - error: + Otherwise new and updated datasets will not be searchable in DataStore and + the logs will contain this error: ProgrammingError: (psycopg2.ProgrammingError) function populate_full_text_trigger() does not exist - [SQL: 'CREATE TRIGGER zfulltext\n BEFORE INSERT OR UPDATE ON "78bf9251-cc2a-4fcf-b761-c2e08b074f18"\n FOR EACH ROW EXECUTE PROCEDURE populate_full_text_trigger()'] + + CKAN developers should also re-run set-permissions on the test database: + http://docs.ckan.org/en/ckan-2.7.0/contributing/test.html#set-up-the-test-databases v2.7.0 2017-08-02 From 557f2eaa59aef8d027c88dc877503e4b3f722a15 Mon Sep 17 00:00:00 2001 From: David Read Date: Mon, 28 Aug 2017 08:59:14 +0000 Subject: [PATCH 09/10] [#3786] Release notes links fixed. --- CHANGELOG.rst | 11 +++++------ doc/contributing/test.rst | 4 ++++ doc/maintaining/datastore.rst | 2 ++ 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b7a18890082..69fdb28f345 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,20 +7,19 @@ Changelog --------- -vX.X.X (TBA) -============ +v?? (TBA) +========= Note: This version requires re-running the 'datastore set-permissions' command - (assuming you run DataStore). See: - http://docs.ckan.org/en/ckan-2.7.0/maintaining/datastore.html#set-permissions + (assuming you run DataStore). See: :ref:`datastore-set-permissions` Otherwise new and updated datasets will not be searchable in DataStore and - the logs will contain this error: + the logs will contain this error:: ProgrammingError: (psycopg2.ProgrammingError) function populate_full_text_trigger() does not exist CKAN developers should also re-run set-permissions on the test database: - http://docs.ckan.org/en/ckan-2.7.0/contributing/test.html#set-up-the-test-databases + :ref:`datastore-test-set-permissions` v2.7.0 2017-08-02 diff --git a/doc/contributing/test.rst b/doc/contributing/test.rst index 11bfa226233..20d61b74a06 100644 --- a/doc/contributing/test.rst +++ b/doc/contributing/test.rst @@ -40,6 +40,7 @@ environment: pip install -r |virtualenv|/src/ckan/dev-requirements.txt +.. _datastore-test-set-permissions: ~~~~~~~~~~~~~~~~~~~~~~~~~ Set up the test databases @@ -55,6 +56,9 @@ Create test databases: sudo -u postgres createdb -O |database_user| |test_database| -E utf-8 sudo -u postgres createdb -O |database_user| |test_datastore| -E utf-8 + +Set the permissions:: + paster datastore set-permissions -c test-core.ini | sudo -u postgres psql This database connection is specified in the ``test-core.ini`` file by the diff --git a/doc/maintaining/datastore.rst b/doc/maintaining/datastore.rst index d402aa6c354..7eda592b5e6 100644 --- a/doc/maintaining/datastore.rst +++ b/doc/maintaining/datastore.rst @@ -120,6 +120,8 @@ if necessary, for example: Replace ``pass`` with the passwords you created for your |database_user| and |datastore_user| database users. +.. _datastore-set-permissions: + Set permissions --------------- From 3ce191e8d3396857100f765a3b36d3f2bedf25f6 Mon Sep 17 00:00:00 2001 From: Ian Ward Date: Thu, 31 Aug 2017 08:25:29 -0400 Subject: [PATCH 10/10] [#3786] remove unused variable --- ckanext/datastore/backend/postgres.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/ckanext/datastore/backend/postgres.py b/ckanext/datastore/backend/postgres.py index 292ae8074d4..8abcf6b00ec 100644 --- a/ckanext/datastore/backend/postgres.py +++ b/ckanext/datastore/backend/postgres.py @@ -1000,9 +1000,6 @@ def upsert_data(context, data_dict): records = data_dict['records'] sql_columns = ", ".join( identifier(name) for name in field_names) - sql_full_text = " || ' ' || ".join( - identifier(name) + "::text" - for name in field_names) if method == _INSERT: rows = []