From 7afe194339f1d0c35f9c8edc1a58b0380ca7d331 Mon Sep 17 00:00:00 2001
From: Andrei Duhnea <andrei@duhnea.net>
Date: Tue, 12 Jun 2018 16:49:33 +0300
Subject: [PATCH] [refs #95885] Extract languages and organizations

---
 search/management/commands/_metadata.py     | 40 ++++++++++++++++++---
 search/management/commands/load_metadata.py | 10 ++++++
 2 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/search/management/commands/_metadata.py b/search/management/commands/_metadata.py
index 356aadd..ce64266 100644
--- a/search/management/commands/_metadata.py
+++ b/search/management/commands/_metadata.py
@@ -12,6 +12,8 @@
     DDataSource,
     DNutsLevel,
     DKeyword,
+    DLanguage,
+    Organization,
 )
 
 
@@ -32,7 +34,11 @@ def float_or_none(value):
 
 
 def comma_string_to_list(value):
-    return [e.strip() for e in value.split(',') if e.strip()]
+    """
+    Split `value` on commas and on " and ", dropping empty items.
+    """
+    data = [e.strip() for e in value.split(',') if e.strip()]
+    return [e.strip() for tok in data for e in tok.split(' and ') if e.strip()]
 
 
 @attr.s
@@ -53,11 +59,11 @@ class MetadataRecord:
     resource_locator_internal = attr.ib()
     resource_locator_internal2 = attr.ib()
     resource_locator_external = attr.ib()
-    responsible_organisation = attr.ib()
-    organisation_email = attr.ib()
+    responsible_organization = attr.ib()
+    organization_email = attr.ib()
     resource_title = attr.ib()
     resource_description = attr.ib()
-    language = attr.ib()
+    languages = attr.ib(converter=comma_string_to_list)
     year_published = attr.ib(converter=int_or_none)
     year_data_collection_start = attr.ib(converter=int_or_none)
     year_data_collection_end = attr.ib(converter=int_or_none)
@@ -190,3 +196,29 @@ def update_keywords(records):
     existing = [o.name for o in DKeyword.objects.only('name').filter(name__in=data)]
     data = [d for d in data if d not in existing]
     return _update_data(DKeyword, 'name', data)
+
+
+def update_languages(records):
+    data = set([lang for r in records for lang in r.languages])
+    existing = [o.name for o in DLanguage.objects.only('name').filter(name__in=data)]
+    data = [d for d in data if d not in existing]
+    return _update_data(DLanguage, 'name', data)
+
+
+def update_organizations(records):
+    # Organization.responsible_person is not populated: the Excel data seems to
+    # put that person in the email column, in no consistent format.
+    orgs = set(
+        [
+            (r.responsible_organization.strip(), r.organization_email.strip())
+            for r in records
+            if r.responsible_organization.strip()
+        ]
+    )
+
+    new = 0
+    for o in orgs:
+        if not Organization.objects.filter(name=o[0], email=o[1]).exists():
+            Organization.objects.create(name=o[0], email=o[1])
+            new += 1
+    return new
diff --git a/search/management/commands/load_metadata.py b/search/management/commands/load_metadata.py
index 795e511..073a285 100644
--- a/search/management/commands/load_metadata.py
+++ b/search/management/commands/load_metadata.py
@@ -15,6 +15,8 @@
     update_data_sources,
     update_nuts_levels,
     update_keywords,
+    update_languages,
+    update_organizations,
 )
 
 defusedxml.defuse_stdlib()
@@ -66,6 +68,14 @@ def update_dictionaries(self, records):
         if new > 0:
             self.stdout.write(f'Added {new} keywords')
 
+        new = update_languages(records)
+        if new > 0:
+            self.stdout.write(f'Added {new} languages')
+
+        new = update_organizations(records)
+        if new > 0:
+            self.stdout.write(f'Added {new} organizations')
+
     def handle(self, *args, **options):
         start_row = options['startrow']
         try: