In [11]:
!pip install xmltodict




In [12]:
import pandas as pd
import xmltodict
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Let's define some functions to easily retrieve data from NERC


In [13]:
# Some function to deal with NERC
def get_nvs_variable_info(id=None,
                          variable=None,
                          vocabulary=None,
                          nvs_url="http://vocab.nerc.ac.uk/collection/",
                          version="current",
                          format_output="?_profile=skos&_mediatype=application/ld+json",
                          timeout=(10, 300)
                          ):
    """
    Request a vocabulary collection, or a single term, from the NERC
    Vocabulary Server (NVS) and return the decoded JSON payload.

    Parameters
    ----------
    id : str, optional
        Full URL of the resource to request. When given, it is used as is and
        `nvs_url`, `vocabulary`, `version` and `variable` are ignored.
    variable : str, optional
        Term identifier appended after the version; when omitted the URL
        stops at the version level.
    vocabulary : str, optional
        Vocabulary identifier (e.g. 'P01'). Required when `id` is not given.
    nvs_url : str
        Base URL of the NVS collections.
    version : str
        Version segment of the URL.
    format_output : str
        Suffix appended after the final '/' of the URL (by default a query
        string selecting the SKOS profile as JSON-LD).
    timeout : float or tuple
        Timeout forwarded to `requests`; a `(connect, read)` tuple is accepted.

    Returns
    -------
    The value returned by `response.json()`.

    Raises
    ------
    ValueError
        If neither `id` nor `vocabulary` is provided.
    requests.HTTPError
        If the server answers with an error status.
    """
    if id:
        url = id
    elif vocabulary:
        # Build <base>/<vocabulary>/<version>[/<variable>] without doubling
        # the slashes when a segment already starts or ends with one.
        segments = [nvs_url.rstrip('/'), vocabulary.strip('/'), version.strip('/')]
        if variable:
            segments.append(variable.strip('/'))
        url = '/'.join(segments)
    else:
        raise ValueError("Either 'id' or 'vocabulary' must be provided")

    # Exactly one '/' between the resource path and the output suffix,
    # whether or not the URL (e.g. a term id) already ends with a slash.
    url = url.rstrip('/') + '/' + format_output

    # Retry failed connections with an increasing delay between attempts
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)

    # The context manager closes the session once the payload is decoded
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        response = session.get(url, timeout=timeout)
        # Fail on HTTP errors instead of trying to decode an error page
        response.raise_for_status()
        return response.json()

def split_nerc_id(id_url):
    """
    Break a NERC term URL into its '/'-separated pieces and label each piece.

    The pieces are paired, in order, with the labels 'http', 'empty',
    'nerc_url', 'type', 'vocabulary', 'version', 'variable' and 'unknown'.
    Pairing stops at the shorter of the two sequences, so a short URL yields
    fewer keys and any pieces beyond the eighth are ignored.
    """
    labels = ('http', 'empty', 'nerc_url', 'type',
              'vocabulary', 'version', 'variable', 'unknown')
    pieces = id_url.split('/')
    return {label: piece for label, piece in zip(labels, pieces)}


# Combine MEDS and BIO Variables 
Merge the MEDS and BIO vocabulary lists and retrieve additional information for each variable.

In [14]:
# Format MEDS and BIO data

meds_list = 'https://raw.githubusercontent.com/cioos-siooc/cioos-siooc_data_transform/odf_transform/odf_transform/vocabulary/meds_pcodes_20191212_mods_utf8.csv'
bio_list = 'https://raw.githubusercontent.com/cioos-siooc/cioos-siooc_data_transform/odf_transform/odf_transform/vocabulary/bio_gf3_p01_mapping_2.7.2.xlsx'

# Get the MEDS list: drop fully empty rows and use the common GF3_CODE column name
df_meds = (
    pd.read_csv(meds_list)
    .dropna(how='all', axis='index')
    .rename({'CODE': 'GF3_CODE'}, axis='columns')
)

# Get the BIO list and align its column names with the MEDS ones
df_bio = (
    pd.read_excel(bio_list)
    .rename({'GF3(BIO) code': 'GF3_CODE',
             'standard_name': 'CF_CODE'}, axis='columns')
    .dropna(how='all', axis='index')
)
# Flag every row of the BIO file as owned by BIO
df_bio['OWNER'] = 'BIO'

# Match the BIO GF3 codes to the MEDS entries owned by BIO.
# The prefixes keep track of which source each column comes from.
df_meds_bio = (
    df_meds.loc[df_meds['OWNER'] == 'BIO']
    .add_prefix('MEDS:')
    .merge(df_bio.add_prefix('BIO:'),
           how='outer',
           left_on='MEDS:GF3_CODE',
           right_on='BIO:GF3_CODE')
)

# We assume that when the same code exists for other owners with the same
# MEDS units, it carries the same BIO information as the BIO-owned entry.
matching_variables = ['MEDS:GF3_CODE', 'MEDS:UNITS_DESCRIPTOR_E']

# Select the merge keys plus every BIO column with a list: pandas rejects
# set indexers, and a list keeps the column order identical between runs.
bio_columns = list(df_bio.add_prefix('BIO:').columns)
df_dfo = pd.merge(df_meds.reset_index().add_prefix('MEDS:'),
                  df_meds_bio[matching_variables + bio_columns],
                  how='outer',
                  on=matching_variables)

# Build unprefixed OWNER and GF3_CODE columns, preferring the MEDS value
# and falling back to the BIO one when MEDS has none.
df_dfo['OWNER'] = df_dfo['MEDS:OWNER'].fillna(df_dfo['BIO:OWNER'])
df_dfo['GF3_CODE'] = df_dfo['MEDS:GF3_CODE'].fillna(df_dfo['BIO:GF3_CODE'])
df_dfo.head()

Unnamed: 0,MEDS:index,MEDS:GF3_CODE,MEDS:OWNER,MEDS:STATUS,MEDS:CATEGORY_MEANING_E,MEDS:CATEGORY_MEANING_F,MEDS:UNITS_DESCRIPTOR_E,MEDS:UNITS_DESCRIPTOR_F,MEDS:CF_CODE,MEDS:NETCDF_LONG_NAME,MEDS:WMO_CODE_TABLE_ID,MEDS:CONVENTION,BIO:P06 name,BIO:GF3_CODE,BIO:P06 urn,BIO:units,BIO:P01 name,BIO:P01 urn,BIO:CF_CODE,BIO:OWNER,OWNER,GF3_CODE
0,0.0,ABA$,MEDS,A,Instrument characteristics,Caractéristiques de l'instrument,number,nombre,,,,,,,,,,,,,MEDS,ABA$
1,1.0,ABSH,BIO,A,Meteorology,Météorologie,gram per metre cube,gramme par mètre cube,,,,,grams per cubic metre,ABSH,UGMC,g/m^3,Relative humidity of the atmosphere,CHUMZZ01,specific_humidity,BIO,BIO,ABSH
2,2.0,ABSH,IML,A,Meteorology,Météorologie,gram per metre cube,gramme par mètre cube,,,,,grams per cubic metre,ABSH,UGMC,g/m^3,Relative humidity of the atmosphere,CHUMZZ01,specific_humidity,BIO,IML,ABSH
3,3.0,ABSH,MEDS,A,Meteorology,Météorologie,gram per metre cube,gramme par mètre cube,,absolute_humidity,,,grams per cubic metre,ABSH,UGMC,g/m^3,Relative humidity of the atmosphere,CHUMZZ01,specific_humidity,BIO,MEDS,ABSH
4,4.0,AC2$,MEDS,A,General purpose,Tout usage,text,texte,,unique_number_HHMMSS_profNo,,,,,,,,,,,MEDS,AC2$


# Retrieve the CF Standard Name Table and compare it to the DFO standard names

In [15]:
# Get CF variable list and alias
response = requests.get('https://cfconventions.org/Data/cf-standard-names/77/src/cf-standard-name-table.xml',
                        timeout=60)
# Stop here on an HTTP error rather than feeding an error page to the XML parser
response.raise_for_status()
# Hand the raw bytes to the parser so the decoding does not depend on the
# character set guessed by requests for `response.text`.
cf_dict = xmltodict.parse(response.content)
cf_table = cf_dict['standard_name_table']

# Convert to dataframes
# Map each alias id to the standard name entry it points to
cf_alias_dict = {entry['@id']: entry['entry_id'] for entry in cf_table['alias']}
df_cf = pd.DataFrame(cf_table['entry'])
# Repeat the table-level metadata on every row
for table_attribute in ['version_number', 'last_modified', 'institution', 'contact']:
    df_cf[table_attribute] = cf_table[table_attribute]
print(str(len(df_cf))+' standard_name available')

4460 standard_name available


In [16]:
# Review the CF names provided by MEDS and BIO
# Any value that is a known CF alias is swapped for the entry it maps to
cf_source_columns = ['MEDS:CF_CODE', 'BIO:CF_CODE']
df_dfo[cf_source_columns] = df_dfo[cf_source_columns].replace(cf_alias_dict)

# Build a general CF_CODE column from the names found in the CF table.
# MEDS names are applied first ...
meds_name_in_cf = df_dfo['MEDS:CF_CODE'].isin(df_cf['@id'])
df_dfo.loc[meds_name_in_cf, 'CF_CODE'] = df_dfo.loc[meds_name_in_cf, 'MEDS:CF_CODE']

# ... then BIO names fill only the rows that still have no CF_CODE
bio_name_in_cf = df_dfo['BIO:CF_CODE'].isin(df_cf['@id'])
bio_fills_gap = bio_name_in_cf & df_dfo['CF_CODE'].isna()
df_dfo.loc[bio_fills_gap, 'CF_CODE'] = df_dfo.loc[bio_fills_gap, 'BIO:CF_CODE']

df_dfo.head()

Unnamed: 0,MEDS:index,MEDS:GF3_CODE,MEDS:OWNER,MEDS:STATUS,MEDS:CATEGORY_MEANING_E,MEDS:CATEGORY_MEANING_F,MEDS:UNITS_DESCRIPTOR_E,MEDS:UNITS_DESCRIPTOR_F,MEDS:CF_CODE,MEDS:NETCDF_LONG_NAME,MEDS:WMO_CODE_TABLE_ID,MEDS:CONVENTION,BIO:P06 name,BIO:GF3_CODE,BIO:P06 urn,BIO:units,BIO:P01 name,BIO:P01 urn,BIO:CF_CODE,BIO:OWNER,OWNER,GF3_CODE,CF_CODE
0,0.0,ABA$,MEDS,A,Instrument characteristics,Caractéristiques de l'instrument,number,nombre,,,,,,,,,,,,,MEDS,ABA$,
1,1.0,ABSH,BIO,A,Meteorology,Météorologie,gram per metre cube,gramme par mètre cube,,,,,grams per cubic metre,ABSH,UGMC,g/m^3,Relative humidity of the atmosphere,CHUMZZ01,specific_humidity,BIO,BIO,ABSH,specific_humidity
2,2.0,ABSH,IML,A,Meteorology,Météorologie,gram per metre cube,gramme par mètre cube,,,,,grams per cubic metre,ABSH,UGMC,g/m^3,Relative humidity of the atmosphere,CHUMZZ01,specific_humidity,BIO,IML,ABSH,specific_humidity
3,3.0,ABSH,MEDS,A,Meteorology,Météorologie,gram per metre cube,gramme par mètre cube,,absolute_humidity,,,grams per cubic metre,ABSH,UGMC,g/m^3,Relative humidity of the atmosphere,CHUMZZ01,specific_humidity,BIO,MEDS,ABSH,specific_humidity
4,4.0,AC2$,MEDS,A,General purpose,Tout usage,text,texte,,unique_number_HHMMSS_profNo,,,,,,,,,,,MEDS,AC2$,


# Retrieve the NERC ID related to each standard name in P07

In [17]:
# Retrieve NERC P07 matching ID
p07_dict = get_nvs_variable_info(vocabulary='P07')
df_P07 = pd.DataFrame.from_dict(p07_dict)

# Expand each prefLabel cell into columns and keep column 0, then expand
# that element in turn and keep its '@value' field as the CF name.
pref_label_key = 'http://www.w3.org/2004/02/skos/core#prefLabel'
first_pref_label = df_P07[pref_label_key].apply(pd.Series)[0]
df_P07['CF_CODE'] = first_pref_label.apply(pd.Series)['@value']

# Attach the P07 term id to every row that shares the same CF_CODE
p07_lookup = df_P07[['@id','CF_CODE']]
df_dfo = df_dfo.merge(
    p07_lookup,
    how='left',
    on='CF_CODE',
).rename(columns={'@id':'NERC:P07 id'})


# Retrieve NERC ID for each P01 available

In [18]:
# Retrieve NERC P01 ID from BIO:P01 urn
p01_dict = get_nvs_variable_info(vocabulary='P01')
df_P01 = pd.DataFrame.from_dict(p01_dict)

# Take the trailing '/<alphanumeric>/' segment of each '@id' and strip its
# slashes to obtain the code compared against 'BIO:P01 urn'.
trailing_segment = df_P01['@id'].str.extract('(/[A-Z0-9a-z]*/$)')[0]
df_P01['NERC:P01 urn'] = trailing_segment.str.replace('/','')

# Attach the P01 term id to every row whose BIO P01 code matches
p01_lookup = df_P01[['@id','NERC:P01 urn']]
df_dfo = df_dfo.merge(
    p01_lookup,
    how='left',
    left_on='BIO:P01 urn',
    right_on='NERC:P01 urn',
).rename(columns={'@id':'NERC:P01 id'})

In [19]:
# Preview the first rows of the combined table
df_dfo.head()

Unnamed: 0,MEDS:index,MEDS:GF3_CODE,MEDS:OWNER,MEDS:STATUS,MEDS:CATEGORY_MEANING_E,MEDS:CATEGORY_MEANING_F,MEDS:UNITS_DESCRIPTOR_E,MEDS:UNITS_DESCRIPTOR_F,MEDS:CF_CODE,MEDS:NETCDF_LONG_NAME,MEDS:WMO_CODE_TABLE_ID,MEDS:CONVENTION,BIO:P06 name,BIO:GF3_CODE,BIO:P06 urn,BIO:units,BIO:P01 name,BIO:P01 urn,BIO:CF_CODE,BIO:OWNER,OWNER,GF3_CODE,CF_CODE,NERC:P07 id,NERC:P01 id,NERC:P01 urn
0,0.0,ABA$,MEDS,A,Instrument characteristics,Caractéristiques de l'instrument,number,nombre,,,,,,,,,,,,,MEDS,ABA$,,,,
1,1.0,ABSH,BIO,A,Meteorology,Météorologie,gram per metre cube,gramme par mètre cube,,,,,grams per cubic metre,ABSH,UGMC,g/m^3,Relative humidity of the atmosphere,CHUMZZ01,specific_humidity,BIO,BIO,ABSH,specific_humidity,http://vocab.nerc.ac.uk/collection/P07/current...,http://vocab.nerc.ac.uk/collection/P01/current...,CHUMZZ01
2,2.0,ABSH,IML,A,Meteorology,Météorologie,gram per metre cube,gramme par mètre cube,,,,,grams per cubic metre,ABSH,UGMC,g/m^3,Relative humidity of the atmosphere,CHUMZZ01,specific_humidity,BIO,IML,ABSH,specific_humidity,http://vocab.nerc.ac.uk/collection/P07/current...,http://vocab.nerc.ac.uk/collection/P01/current...,CHUMZZ01
3,3.0,ABSH,MEDS,A,Meteorology,Météorologie,gram per metre cube,gramme par mètre cube,,absolute_humidity,,,grams per cubic metre,ABSH,UGMC,g/m^3,Relative humidity of the atmosphere,CHUMZZ01,specific_humidity,BIO,MEDS,ABSH,specific_humidity,http://vocab.nerc.ac.uk/collection/P07/current...,http://vocab.nerc.ac.uk/collection/P01/current...,CHUMZZ01
4,4.0,AC2$,MEDS,A,General purpose,Tout usage,text,texte,,unique_number_HHMMSS_profNo,,,,,,,,,,,MEDS,AC2$,,,,


In [20]:
# Sort the table by OWNER (organization) and GF3_CODE once, for both exports
df_dfo_sorted = df_dfo.set_index(['OWNER','GF3_CODE']).sort_index()

# Save the result to a JSON list of records searchable by OWNER and GF3_CODE.
# orient='records' never writes the index, so the two keys are moved back to
# regular columns first; otherwise they would be missing from every record.
df_dfo_sorted.reset_index().to_json('dfo_vocabulary_list.json',orient='records')

# And a CSV file to easily review the result (the index is written as the first two columns)
df_dfo_sorted.to_csv('dfo_vocabulary_list.csv')