From 7afe194339f1d0c35f9c8edc1a58b0380ca7d331 Mon Sep 17 00:00:00 2001
From: Andrei Duhnea
Date: Tue, 12 Jun 2018 16:49:33 +0300
Subject: [PATCH] [refs #95885] Extract languages and organizations

---
 search/management/commands/_metadata.py     | 40 ++++++++++++++++++---
 search/management/commands/load_metadata.py | 10 ++++++
 2 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/search/management/commands/_metadata.py b/search/management/commands/_metadata.py
index 356aadd..ce64266 100644
--- a/search/management/commands/_metadata.py
+++ b/search/management/commands/_metadata.py
@@ -12,6 +12,8 @@
     DDataSource,
     DNutsLevel,
     DKeyword,
+    DLanguage,
+    Organization,
 )
 
 
@@ -32,7 +34,11 @@ def float_or_none(value):
 
 
 def comma_string_to_list(value):
-    return [e.strip() for e in value.split(',') if e.strip()]
+    """
+    Splits `value` on commas and `and`s.
+    """
+    data = [e.strip() for e in value.split(',') if e.strip()]
+    return [e.strip() for tok in data for e in tok.split(' and ') if e.strip()]
 
 
 @attr.s
@@ -53,11 +59,11 @@ class MetadataRecord:
     resource_locator_internal = attr.ib()
     resource_locator_internal2 = attr.ib()
     resource_locator_external = attr.ib()
-    responsible_organisation = attr.ib()
-    organisation_email = attr.ib()
+    responsible_organization = attr.ib()
+    organization_email = attr.ib()
     resource_title = attr.ib()
     resource_description = attr.ib()
-    language = attr.ib()
+    languages = attr.ib(converter=comma_string_to_list)
     year_published = attr.ib(converter=int_or_none)
     year_data_collection_start = attr.ib(converter=int_or_none)
     year_data_collection_end = attr.ib(converter=int_or_none)
@@ -190,3 +196,29 @@ def update_keywords(records):
     existing = [o.name for o in DKeyword.objects.only('name').filter(name__in=data)]
     data = [d for d in data if d not in existing]
     return _update_data(DKeyword, 'name', data)
+
+
+def update_languages(records):
+    data = set([lang for r in records for lang in r.languages])
+    existing = [o.name for o in DLanguage.objects.only('name').filter(name__in=data)]
+    data = [d for d in data if d not in existing]
+    return _update_data(DLanguage, 'name', data)
+
+
+def update_organizations(records):
+    # Organization.responsible_person is not populated, as the Excel data
+    # seems to include that in the email column, with no consistent format.
+    orgs = set(
+        [
+            (r.responsible_organization.strip(), r.organization_email.strip())
+            for r in records
+            if r.responsible_organization.strip()
+        ]
+    )
+
+    new = 0
+    for o in orgs:
+        if not Organization.objects.filter(name=o[0], email=o[1]).exists():
+            Organization.objects.create(name=o[0], email=o[1])
+            new += 1
+    return new
diff --git a/search/management/commands/load_metadata.py b/search/management/commands/load_metadata.py
index 795e511..073a285 100644
--- a/search/management/commands/load_metadata.py
+++ b/search/management/commands/load_metadata.py
@@ -15,6 +15,8 @@
     update_data_sources,
     update_nuts_levels,
     update_keywords,
+    update_languages,
+    update_organizations,
 )
 
 defusedxml.defuse_stdlib()
@@ -66,6 +68,14 @@ def update_dictionaries(self, records):
         if new > 0:
             self.stdout.write(f'Added {new} keywords')
 
+        new = update_languages(records)
+        if new > 0:
+            self.stdout.write(f'Added {new} languages')
+
+        new = update_organizations(records)
+        if new > 0:
+            self.stdout.write(f'Added {new} organizations')
+
     def handle(self, *args, **options):
         start_row = options['startrow']
         try: