# Metadata fra Geodata-info.dk
Denne notebook er tænkt til at hente data fra [geodata-info.dk](https://www.geodata-info.dk) (GDI), så egne geodata kan beriges ud fra den nøgle (Identifikator), GDI tildeler deres datasæt. Det kræver således, at man selv har tagget sine geodata med den nøgle, og at disse kan udstilles, så data kan beriges med metadata. Vi arbejder med [Mapcentias GeoCloud2](http://www.mapcentia.com/dk/produkt/), som bruger [PostgreSQL](https://www.postgresql.org/), så derfor vil denne notebook være rettet mod at snakke med denne platform.
    
### Om Geodata-info.dk
Geodata-info.dk er den danske geoportal, der gør det muligt for professionelle brugere samt borgere med interesse for geodata at søge efter geodatasæt og geodatatjenester. Geodata-info.dk omfatter desuden den danske søgetjeneste i henhold til INSPIRE-direktivet (Forordning nr. 976/2009 for så vidt angår nettjenesterne).

In [1]:
import requests
import xml.etree.ElementTree as ET
import json
import sys
import pandas as pd
sys.path.append('/python/')
import connections as con

In [38]:
class Geometadata:
    """
    Fetch metadata about tables from geodata-info.dk.

    A record is downloaded as XML by its identifier and selected values
    (file identifier, title, abstract, organisation name) are read from it
    using the element paths stored on the instance.
    """

    # Prefix -> namespace URI used to resolve the ``gmd:``/``gco:`` prefixes
    # in the element paths below.
    _NAMESPACES = {
        'gmd': 'http://www.isotc211.org/2005/gmd',
        'gco': 'http://www.isotc211.org/2005/gco'
    }

    def __init__(self):
        # Element paths, relative to the record's root element.
        self.fileIdentifier_path = "gmd:fileIdentifier/gco:CharacterString"
        self.title_path = "gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/gmd:CI_Citation/gmd:title/gco:CharacterString"
        self.abstract_path = "gmd:identificationInfo/gmd:MD_DataIdentification/gmd:abstract/gco:CharacterString"
        self.organisationName_path = "gmd:contact/gmd:CI_ResponsibleParty/gmd:organisationName/gco:CharacterString"

    def get_xml(self, gid, timeout=30):
        """
        Download the XML record for ``gid`` and return its parsed root element.

        Args:
            gid: Record identifier inserted into the geodata-info.dk URL.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The root ``xml.etree.ElementTree.Element`` of the response.

        Raises:
            requests.RequestException: On connection problems, timeouts or a
                non-2xx HTTP status.
            xml.etree.ElementTree.ParseError: If the response is not
                well-formed XML.
        """
        url = f"https://geodata-info.dk/srv/api/records/{gid}/formatters/xml"
        # Let request errors propagate: printing and carrying on would only
        # fail a line later with a NameError on the unset response.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Parse the raw bytes so the encoding declared in the XML itself is
        # honoured instead of the one guessed from the HTTP headers.
        return ET.fromstring(response.content)

    def find_property(self, xml, xml_path):
        """
        Return the text of the first element matching ``xml_path``.

        Args:
            xml: Element to search from (typically the record root).
            xml_path: ElementTree path using the ``gmd``/``gco`` prefixes.

        Returns:
            The element's text, or ``None`` when no element matches (or the
            matching element has no text).
        """
        element = xml.find(xml_path, self._NAMESPACES)
        # Compare with None explicitly: an Element without children is falsy.
        if element is None:
            return None
        return element.text

    def extract_metadata_from_xml(self, gid, properties=('fileIdentifier', 'title', 'abstract', 'organisationName')):
        """
        Fetch the record ``gid`` from geodata-info.dk and extract metadata.

        Supported property names (matched case-insensitively):
        fileIdentifier, title, abstract, organisationName.
        Unknown names are ignored.

        Args:
            gid: Record identifier.
            properties: Iterable of property names to extract.

        Returns:
            dict mapping the canonical property name to its text (``None``
            when the element is missing), in the order the names were given.
        """
        root = self.get_xml(gid)

        # Lower-cased name -> (canonical output key, element path). The paths
        # are read from the instance at call time so overrides are respected.
        known = {
            "fileidentifier": ("fileIdentifier", self.fileIdentifier_path),
            "title": ("title", self.title_path),
            "abstract": ("abstract", self.abstract_path),
            "organisationname": ("organisationName", self.organisationName_path),
        }

        props = {}
        for prop in properties:
            entry = known.get(str(prop).lower())
            if entry is None:
                continue
            name, path = entry
            props[name] = self.find_property(root, path)

        return props

## Hent data fra Postgres
Henter data ind og tilretter DF

In [39]:
# SQL text selecting every column and row of metadata.geodatainfo.
query = """
    select *
    from metadata.geodatainfo
    """

In [40]:
# Run the query via the project-local `connections` helper against the
# 'production' connection. NOTE(review): presumably returns a pandas
# DataFrame (it is indexed like one below) - confirm in connections.py.
df = con.sql_to_dataframe('production', query)

In [41]:
# Keep only rows that have a geodatainfo_id and drop the 'id' and 'web_doku'
# columns.
gdi = df[df.geodatainfo_id.notnull()].drop(['id', 'web_doku'], axis=1)

In [43]:
# Preview the first rows of the filtered frame.
gdi.head()

Unnamed: 0,schema_name,table_name,geodatainfo_id
0,job_dagi,komkoder,f8fa213b-20a7-4912-8a40-7867b98d4df0
1,job_dagi,kommune,f8fa213b-20a7-4912-8a40-7867b98d4df0
2,job_dagi,retskr,f8fa213b-20a7-4912-8a40-7867b98d4df0
3,job_dagi,region,f8fa213b-20a7-4912-8a40-7867b98d4df0
4,job_dagi,postnummer,f8fa213b-20a7-4912-8a40-7867b98d4df0


## Berig med metadata fra Geodata-info

In [47]:
# Shared Geometadata instance used by the cells below.
meta = Geometadata()

In [32]:
# Metadata fields to fetch; also used as the new column names on `gdi`.
properties=['title', 'abstract', 'organisationName']

Hent data fra geodatainfo

In [49]:
# For every geodatainfo_id, fetch the record and wrap the returned dict's
# values in a Series, then assign the result to the columns named in
# `properties`. The values come back in the order of `properties` because
# extract_metadata_from_xml fills its dict in that order.
# One HTTP request is made per row, so repeated ids are fetched repeatedly.
# NOTE(review): how pd.Series treats a dict_values object appears to depend on
# the pandas version; the next cell suggests each cell ended up holding the
# whole set of values - verify before changing either cell.
gdi[properties] = gdi['geodatainfo_id'].apply(lambda x: pd.Series(meta.extract_metadata_from_xml(x, properties).values()))

Opdater kolonner med tilsvarende data

In [51]:
# Pick element 0, 1 and 2 out of each cell for title, abstract and
# organisationName respectively.
# NOTE(review): this presumes every cell holds the full sequence of fetched
# values rather than a scalar. If the previous cell already produced plain
# strings, list(x)[i] would return a single character - confirm against the
# pandas version in use.
gdi['title'] = gdi.title.apply(lambda x: list(x)[0]) 
gdi['abstract'] = gdi.abstract.apply(lambda x: list(x)[1]) 
gdi['organisationName'] = gdi.organisationName.apply(lambda x: list(x)[2]) 

In [53]:
# Preview the frame after the metadata columns have been added.
gdi.head()

Unnamed: 0,schema_name,table_name,geodatainfo_id,title,abstract,organisationName
0,job_dagi,komkoder,f8fa213b-20a7-4912-8a40-7867b98d4df0,DAGI_REF,Danmarks Administrative Geografiske Inddeling ...,Styrelsen for Dataforsyning og Effektivisering...
1,job_dagi,kommune,f8fa213b-20a7-4912-8a40-7867b98d4df0,DAGI_REF,Danmarks Administrative Geografiske Inddeling ...,Styrelsen for Dataforsyning og Effektivisering...
2,job_dagi,retskr,f8fa213b-20a7-4912-8a40-7867b98d4df0,DAGI_REF,Danmarks Administrative Geografiske Inddeling ...,Styrelsen for Dataforsyning og Effektivisering...
3,job_dagi,region,f8fa213b-20a7-4912-8a40-7867b98d4df0,DAGI_REF,Danmarks Administrative Geografiske Inddeling ...,Styrelsen for Dataforsyning og Effektivisering...
4,job_dagi,postnummer,f8fa213b-20a7-4912-8a40-7867b98d4df0,DAGI_REF,Danmarks Administrative Geografiske Inddeling ...,Styrelsen for Dataforsyning og Effektivisering...


### Output til Postgres

In [54]:
# Obtain a database engine for the 'production' connection from the
# project-local `connections` helper; it is passed to DataFrame.to_sql below.
engine = con.engine('production')

In [55]:
# Write the enriched frame to table gdi_meta in schema proj_anba14.
# No if_exists argument is given, so pandas' default behaviour for an
# already-existing table applies.
gdi.to_sql('gdi_meta', con=engine, schema='proj_anba14')

## Opdatering i Postgres

Opdater meta i GC2 settings-tabellen (`settings.geometry_columns_join`)
```sql
-- Copy the abstract fetched from geodata-info.dk into the "meta_desc" key of
-- the meta column, for every layer tagged with '_gdi:<geodatainfo_id>'.
with cte as (
	-- Collapse newlines/tabs to a single space and strip double quotes.
	select "_key_",  replace(regexp_replace(c.abstract, E'[\\n\\r\\t]+', ' ', 'g' ), '"', '') abstract
	from SETTINGS.GEOMETRY_COLUMNS_JOIN a, jsonb_array_elements_text(tags) b
	join proj_anba14.gdi_meta c 
	on concat('_gdi:', c.geodatainfo_id) = b.value
)
update SETTINGS.GEOMETRY_COLUMNS_JOIN as a set meta = 
	-- jsonb_build_object takes key/value pairs and escapes the value itself,
	-- so the JSON no longer has to be assembled (and cast) from a string.
	-- coalesce on meta: start from an empty object when meta is null.
	-- coalesce on abstract: keep the empty string a null abstract gave before.
	coalesce(a.meta, '{}'::jsonb) || jsonb_build_object('meta_desc', coalesce(cte.abstract, ''))
from cte
where a."_key_" = cte._key_

```

# Gammelt

In [None]:
# SQL text listing "_key_" together with the id part of every tag that matches
# '_gdi:%'. RIGHT(value, length(value) - 5) drops the first five characters,
# i.e. the '_gdi:' prefix.
# NOTE(review): the doubled '%%' is presumably an escaped '%' for the database
# driver's parameter formatting - confirm how the query is executed.
query = """
    select "_key_", RIGHT(value, length(value) - 5) id
    from SETTINGS.GEOMETRY_COLUMNS_JOIN, jsonb_array_elements_text(tags)
    where value like '_gdi:%%'
"""

In [None]:
def add_metadata(gid, properties, df):
    """
    Add metadata columns to a dataframe.

    Fetches the metadata for ``gid`` through the module-level ``meta``
    (a ``Geometadata`` instance), prints it, and assigns each value to the
    column of the same name in ``df``. Each value is a scalar, so it is
    written to every row. ``df`` is modified in place; nothing is returned.

    Args:
        gid: Record identifier on geodata-info.dk.
        properties: Property names to fetch.
        df: DataFrame that receives one column per fetched property.
    """
    # Geometadata has no get_meta_data method; extract_metadata_from_xml is
    # the method that takes (gid, properties) and returns a dict.
    data = meta.extract_metadata_from_xml(gid, properties)
    print(data)

    for key, value in data.items():
        df[key] = value

# HENT DATA WFS

Tabelnavn (typename) fra WFS'er GC2 scheduler jobs udtrækkes

In [None]:
# Load the WFS list from data/wfs.csv; the cells below read its 'url' column.
wfs = pd.read_csv('data/wfs.csv')

In [None]:
def typename(url):
    """
    Return the value of the ``typename`` query parameter in ``url``.

    The parameter name is matched case-insensitively and the value is
    returned exactly as written in the URL (no percent-decoding).

    Args:
        url: A URL with a query string, or a bare query string.

    Returns:
        The parameter's value ('' if the key is present without a value),
        or ``None`` when there is no ``typename`` parameter.
    """
    # Only look at the part after '?'. Otherwise the first parameter is still
    # glued to the base URL ("http://host/wfs?typename") and never matches.
    _, has_query, query = url.partition('?')
    if not has_query:
        query = url

    for item in query.split('&'):
        # partition never raises on a key that has no '='.
        key, _, value = item.partition('=')
        if key.lower() == 'typename':
            return value
    return None

In [None]:
# Derive the table name for every row from the 'typename' parameter of its URL.
wfs['typename'] = wfs['url'].apply(typename)

In [None]:
# Preview the first rows, now including the 'typename' column.
wfs.head()

## FIND uuid for hver typename

In [14]:
def get_uuid(typename):
    """
    Search geodata-info.dk for ``typename`` and print the JSON response.

    Sends a free-text search (the ``any`` parameter) and asks for at most the
    first 20 hits sorted by relevance. The decoded response is only printed;
    the function returns ``None``.
    """
    search_params = {
        "_content_type": "json",
        "any": typename,
        "bucket": "s101",
        "facet.q": "",
        "fast": "index",
        "from": "1",
        "resultType": "details",
        "sortBy": "relevance",
        "to": "20",
    }

    reply = requests.get(
        "https://www.geodata-info.dk/srv/dan/q",
        headers={'Cache-Control': "no-cache"},
        params=search_params,
    )
    payload = reply.json()
    print(payload)

    # An earlier attempt to return payload['metadata']['geonet:info']['uuid']
    # is disabled in this version; the response is only inspected by eye.

In [15]:
# Try the search with one table name; the printed response is shown below.
get_uuid('bes_naturtyper')

{'@from': '1', '@to': '3', '@selected': '0', 'summary': {'@count': '3', '@type': 'local', 'dimension': [{'@name': 'type', '@label': 'types', 'category': [{'@value': 'service', '@label': 'Service', '@count': '2'}, {'@value': 'service-ERSTAT MED view ELLER download', '@label': 'service-ERSTAT MED view ELLER download', '@count': '2'}, {'@value': 'dataset', '@label': 'Datasæt', '@count': '1'}]}, {'@name': 'topicCat', '@label': 'topicCats', 'category': {'@value': 'environment', '@label': 'Miljø', '@count': '1'}}, {'@name': 'inspireThemeURI', '@label': 'inspireThemesURI', 'category': [{'@value': 'http://rdfdata.eionet.europa.eu/inspirethemes/themes/17', '@label': 'http://rdfdata.eionet.europa.eu/inspirethemes/themes/17', '@count': '1'}, {'@value': 'http://rdfdata.eionet.europa.eu/inspirethemes/themes/9', '@label': 'http://rdfdata.eionet.europa.eu/inspirethemes/themes/9', '@count': '1'}]}, {'@name': 'inspireThemeCluster', '@label': 'inspireThemeCluster', 'category': [{'@value': 'bioManagement

In [None]:
# Store the result of get_uuid for every typename in a new 'uuid' column.
# NOTE(review): the get_uuid defined above only prints and returns None, so
# this column would hold only missing values with that definition.
wfs['uuid'] = wfs.typename.apply(get_uuid)

In [None]:
# Show only the rows for which a uuid was found. Missing results are real
# None/NaN values, not the string 'None', so test for nulls instead of
# comparing against that string.
wfs[wfs.uuid.notnull()]

In [None]:
def get_uuid(typename):
    """
    Search geodata-info.dk for ``typename`` and return the decoded JSON.

    Sends a free-text search (the ``any`` parameter) and asks for at most the
    first 20 hits sorted by relevance.

    Returns:
        The response body decoded from JSON.
    """
    search_params = {
        "_content_type": "json",
        "any": typename,
        "bucket": "s101",
        "facet.q": "",
        "fast": "index",
        "from": "1",
        "resultType": "details",
        "sortBy": "relevance",
        "to": "20",
    }

    reply = requests.get(
        "https://www.geodata-info.dk/srv/dan/q",
        headers={'Cache-Control': "no-cache"},
        params=search_params,
    )
    return reply.json()

In [None]:
# Try the search with a namespaced typename.
get_uuid('dmp:AA_BES_LINJER')