In [69]:
import copy
import json
import os
import re
import urllib
import urllib.parse
from glob import glob

import numpy as np
import pandas as pd
import requests
import yaml
from requests.exceptions import HTTPError
from tqdm import tqdm

from utils import utils

- OWL ontologies
- Websites:
  - OLS
  - Ontobee


# Functions


In [70]:
# Clean list columns into single string
def join_strings(string):
    """Join an iterable of strings into one comma-separated string.

    Args:
        string: Iterable of ``str`` (for example a list stored in a
            DataFrame cell).

    Returns:
        str: The items joined with ``","``, or ``""`` when the value cannot
        be joined (not iterable, such as ``None``/NaN, or it contains
        non-string items).
    """
    try:
        return ",".join(string)
    except TypeError:
        # str.join raises TypeError for non-iterables and for non-str items;
        # anything else (e.g. KeyboardInterrupt) is no longer swallowed.
        return ""


def sort_lists(l):
    """Return the unique items of ``l`` as a sorted, comma-separated string.

    Args:
        l: Collection of strings (for example a list stored in a DataFrame
            cell).

    Returns:
        str: Unique items, sorted and joined with ``","``. If the value
        cannot be processed that way (for example it is a number/NaN or holds
        non-string items) it is returned unchanged.
    """
    try:
        return ",".join(sorted(np.unique(l)))
    except Exception:
        # Deliberate best-effort: leave values that are not lists of strings
        # untouched, but do not trap KeyboardInterrupt/SystemExit.
        return l


# Maps a simple type name to the validation-rule string stored in the data
# model. Raw strings keep the regex backslash literal without relying on
# Python passing an unknown escape sequence ("\.") through unchanged.
validation_coder = {
    "number": r"regex search ([0-9]+\.[0-9]*.?)|([0-9]+)",
    "integer": "regex search ([0-9]+)",
    "string": "",
}

In [71]:
# parse url into api call
def url_to_api_call(url):
    """Convert an OLS4 browser URL into the matching OLS4 REST API URL.

    The ``iri`` query parameter is URL-encoded twice and appended to the path
    as its last segment, ``ols4`` becomes ``ols4/api``, ``classes`` becomes
    ``terms`` and the query string is dropped.

    Args:
        url (str): Browser URL carrying an ``iri`` query parameter, e.g.
            ``https://www.ebi.ac.uk/ols4/ontologies/ms/terms?iri=<encoded IRI>``.

    Returns:
        str: The API URL for the same term.

    Raises:
        KeyError: If ``url`` has no ``iri`` query parameter.
    """
    parsed = urllib.parse.urlparse(url)

    # Read the IRI from the untouched query string, so an IRI that itself
    # contains "ols4" is not rewritten by the path substitution below.
    iri = urllib.parse.parse_qs(parsed.query)["iri"][0]
    encoded_iri = urllib.parse.quote_plus(urllib.parse.quote_plus(iri))

    path = parsed.path
    # Skip URLs that already point at the API to avoid ".../ols4/api/api/...".
    if "ols4/api" not in path:
        path = path.replace("ols4", "ols4/api")
    path = path.replace("classes", "terms")

    url_parts = list(parsed)
    url_parts[2] = "/".join([path, encoded_iri])
    url_parts[4] = ""  # the IRI now lives in the path; drop the query string

    return urllib.parse.urlunparse(url_parts)

In [72]:
def get_response(u, params=None, timeout=30):
    """Send a GET request and return the response only if it succeeded.

    Args:
        u (str): URL to request.
        params (dict, optional): Query-string parameters forwarded to
            ``requests.get``. Defaults to None.
        timeout (float or tuple, optional): Seconds to wait for the server,
            forwarded to ``requests.get`` so a stalled connection cannot hang
            the notebook indefinitely. Defaults to 30.

    Returns:
        requests.Response or None: The response when the request completed
        without an error status. ``None`` when an HTTP error status or any
        other exception occurred; the error is printed, not raised, so callers
        must handle a ``None`` result.
    """
    try:
        response = requests.get(u, params=params, timeout=timeout)

        # If the response was successful, no Exception will be raised
        response.raise_for_status()

    except HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")  # Python 3.6
        return None
    except Exception as err:
        print(f"Other error occurred: {err}")  # Python 3.6
        return None

    print(response)
    return response

# Pull in current data model


In [73]:
# Load the current data model via the project helper, which is also given a
# backup output directory.
dm = utils.load_and_backup_dm("../EL.data.model.csv", output_dir="../backups")

# these ontologies are too large
# Rows whose Source mentions the MONDO, MAXO or HP ontology pages (ols4 URLs
# only): their valid values are not enumerated in the data model.
indexes = dm[
    dm["Source"].str.contains(
        "https://www.ebi.ac.uk/ols4/ontologies/mondo|https://www.ebi.ac.uk/ols4/ontologies/maxo|https://www.ebi.ac.uk/ols4/ontologies/hp",
        regex=True,
        na=False,
    )
].index.tolist()

# For those rows: clear the valid values, point readers at the ontology and
# set the validation rule to "str".
# NOTE(review): the sentence is appended unconditionally, so re-running this
# notebook on a file it has already written appends it again — confirm.
dm.loc[indexes, "Valid Values"] = np.nan
dm.loc[indexes, "Description"] = (
    dm.loc[indexes, "Description"] + " Please see the source ontology."
)
dm.loc[indexes, "Validation Rules"] = "str"

# Rows whose Properties is "Valid Value" get no validation rule.
dm.loc[dm["Properties"] == "Valid Value", "Validation Rules"] = np.nan

# Collapse the merged "False,True" Required value to "True".
dm["Required"] = dm["Required"].replace("False,True", "True")

# Exact-value replacement: split one Source cell in which two URLs were
# concatenated without a separator into a ", "-separated pair.
dm["Source"] = dm["Source"].replace(
    "https://ontobee.org/ontology/NCITiri=http://purl.obolibrary.org/obo/NCIT_C62690https://www.ebi.ac.uk/ols4/ontologies/edam/terms?iri=http%3A%2F%2Fedamontology.org%2Fdata_1045",
    "https://ontobee.org/ontology/NCITiri=http://purl.obolibrary.org/obo/NCIT_C62690, https://www.ebi.ac.uk/ols4/ontologies/edam/terms?iri=http%3A%2F%2Fedamontology.org%2Fdata_1045",
)

# Remove the stray "-The%20life%20stage" fragment wherever it occurs in the
# frame (regex replace over all string cells).
dm = dm.replace("-The%20life%20stage", "", regex=True)

In [74]:
# Attributes with ontologies
# Select the attributes to refresh: Source holds a URL that mentions a PURL or
# EBI address, and the current Valid Values contain "not" (case-insensitive,
# e.g. "Not applicable" / "Not collected").
dm_test = dm[
    (
        dm["Source"]
        .fillna("")
        .str.contains("http", regex=True, flags=re.IGNORECASE, na=False)
    )
    & (dm["Source"].str.contains("purl|ebi", regex=True, na=False))
    & (
        dm["Valid Values"].str.contains(
            "not", flags=re.IGNORECASE, regex=True, na=False
        )
    )
].reset_index(drop=True)

# Upgrade legacy ".../ols/..." URLs to ".../ols4/...". Anchoring on the path
# segment keeps the step idempotent ("/ols4" is left alone, so a re-run cannot
# produce "ols44") and stops it from touching unrelated text containing "ols".
dm_test["Source"] = dm_test["Source"].str.replace(
    r"/ols(?!\w)", "/ols4", regex=True)
# Restore the "?" missing between "terms" and "iri=" in stored URLs; URLs that
# already read "terms?iri=" are left unchanged.
dm_test["Source"] = dm_test["Source"].str.replace(
    r"terms(?=iri=)", "terms?", regex=True)

# Valid Values become lists so fetched labels can be appended to them.
dm_test["Valid Values"] = dm_test["Valid Values"].str.split(",")

with pd.option_context("display.max_colwidth", None):
    display(dm_test)

Unnamed: 0,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue,UsedIn,module
0,acquisitionMode,"The specific aspect of a mass spectrometer method by which mass ranges are selected and possibly dissociated (full scan, MSn, SIM, MRM, etc.).","[Not Specified, Not applicable, Not collected, Unknown]",True,,,unspecified,,"Metabolomics Human,Proteomics",unspecified,https://www.ebi.ac.uk/ols4/ontologies/ms/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1003213,STRING,https://www.ebi.ac.uk/ols/ontologies/ms/termsiri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1003213,False,,
1,acquisitionSoftware,The name of the acquisition software used,"[Not Specified, Not applicable, Not collected, Unknown]",True,,,unspecified,,"Metabolomics Human,Proteomics",unspecified,http://purl.obolibrary.org/obo/MS_1001455,STRING,http://purl.obolibrary.org/obo/MS_1001455,False,,
2,cellType,Indicate the cell type.,"[Not applicable, Not collected, OtherCellType, Unknown]",True,,,unspecified,,"Biospecimen human,Biospecimen nonHuman",unspecified,"https://bioportal.bioontology.org/ontologies/CCF/p=classes&conceptid=http%3A%2F%2Fpurl.org%2Fccf%2Fcell_type,https://www.ebi.ac.uk/ols4/ontologies/ccf/terms?iri=http%3A%2F%2Fpurl.org%2Fccf%2Fcell_type",STRING,"https://bioportal.bioontology.org/ontologies/CCF/p=classes&conceptid=http%3A%2F%2Fpurl.org%2Fccf%2Fcell_type,https://www.ebi.ac.uk/ols/ontologies/ccf/termsiri=http%3A%2F%2Fpurl.org%2Fccf%2Fcell_type",False,,
3,commonName,"The biological species common name the individual belongs to (ex. ""Horned Lark"").note: As a default, the valid scientific name for the species should be indicated.","[Not applicable, Not collected, Unknown]",True,,,unspecified,,Individual nonHuman,unspecified,https://www.ebi.ac.uk/ols4/ontologies/edam/terms?iri=http%3A%2F%2Fedamontology.org%2Fdata_1874,STRING,https://www.ebi.ac.uk/ols/ontologies/edam/termsiri=http%3A%2F%2Fedamontology.org%2Fdata_1874,False,,
4,databaseName,"The name of the search database (nr, SwissProt or est_human, and/or mass spectral library).","[HMDB, MassBank of North America (MoNA), Metlin, NIST17, Not Specified, Not applicable, Not collected, OtherDatabaseNameDatabaseName, Unknown]",True,,,unspecified,,"Metabolomics Human,Proteomics",unspecified,http://purl.obolibrary.org/obo/MS_1001013,STRING,http://purl.obolibrary.org/obo/MS_1001013,False,,
5,databaseSource,"The name of the organization, project, or laboratory from where the database is obtained (UniProt, NCBI, EBI, other).","[Not Specified, Not applicable, Not collected, OtherDatabaseSourceDatabaseSource, Unknown]",True,,,unspecified,,"Metabolomics Human,Proteomics",unspecified,http://purl.obolibrary.org/obo/MS_1001012,STRING,http://purl.obolibrary.org/obo/MS_1001012,False,,
6,diagnosis,Indicate the disease or condition.,"[Not applicable, Not collected, OtherDiagnosis, Unknown]",False,,,DataProperty,,Individual Human,unspecified,"https://www.ebi.ac.uk/ols4/ontologies/hp,https://www.ebi.ac.uk/ols4/ontologies/maxo,https://www.ebi.ac.uk/ols4/ontologies/mondo",STRING,"https://www.ebi.ac.uk/ols/ontologies/hp,https://www.ebi.ac.uk/ols/ontologies/maxo,https://www.ebi.ac.uk/ols/ontologies/mondo",True,,
7,extractionMethod,The name of the process used to separate a desired component of an input material from the remainder,"[Not Specified, Not applicable, Not collected, OtherExtractionMethodExtractionMethod, Unknown]",True,,,unspecified,,"Metabolomics Human,Microbiome",unspecified,http://purl.obolibrary.org/obo/OBI_0302884,STRING,http://purl.obolibrary.org/obo/OBI_0302884,False,,
8,lifeStage,The life stage of the individual. note: Other values are possible depending on life stage terminology for individual species. Please let the data curation team know.,"[Adult, Juvenile, Not applicable, Not collected, OtherLifeStage, Post-Juvenile, Unknown]",True,,,unspecified,,Individual nonHuman,unspecified,"-The%20life%20stage,Biological Collections Ontology,https://www.animalaudiograms.org/audiogram_metadata_scheme#:~:text=range%3A%20female%3B%20male-,https://www.ebi.ac.uk/ols4/ontologies/pato,life%20stage",STRING,"-The%20life%20stage,Biological Collections Ontology,https://www.animalaudiograms.org/audiogram_metadata_scheme#:~:text=range%3A%20female%3B%20male-,https://www.ebi.ac.uk/ols/ontologies/pato,life%20stage",False,,
9,modificationParameters,Modification parameters for the search engine run. [ PSI: PI http://www.w3.org/2002/07/owl#Axiom ],"[Not Specified, Not applicable, Not collected, OtherModificationParametersModificationParameters, Unknown]",True,,,unspecified,,Proteomics,unspecified,https://www.ebi.ac.uk/ols4/ontologies/ms/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001055,STRING,https://www.ebi.ac.uk/ols/ontologies/ms/termsiri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001055,False,,


# OLS

https://www.ebi.ac.uk/ols4/help  
Source URLs that still use the legacy `ols` path must be rewritten to `ols4` before they are queried.


In [75]:
# OLS4 terms endpoint; iri_request queries it with an `iri` parameter to look
# a term up by its IRI.
base_url = "http://www.ebi.ac.uk/ols4/api/terms"

## Functions


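Helpers for the PURL case: sources given as a bare PURL IRI (e.g. `http://purl.obolibrary.org/obo/MS_1001455`), which are resolved through the OLS4 terms endpoint.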


In [76]:
def simplify_response(response):
    """Pull the embedded terms out of an OLS response.

    Returns the single term dict when the response embeds exactly one term;
    otherwise prints how many terms were found and returns the whole list.
    """
    embedded_terms = response.json()["_embedded"]["terms"]

    if len(embedded_terms) != 1:
        print(f"Response has {len(embedded_terms)} terms")
        return embedded_terms

    return embedded_terms[0]


def iri_request(iri):
    """Query the terms endpoint (``base_url``) for ``iri`` and return whatever ``get_response`` returns."""
    return get_response(base_url, params={"iri": iri})


def get_iri(url):
    """Return the first (decoded) ``iri`` query parameter of ``url``.

    Raises ``KeyError`` when the URL has no ``iri`` parameter.
    """
    query_string = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query_string)["iri"][0]


def find_defining_ontology(terms):
    """Pick the term that comes from its defining ontology.

    Returns ``None`` for an empty list and the only term for a one-element
    list. With several terms, returns the first whose
    ``is_defining_ontology`` equals ``True``, or ``None`` if there is none.
    """
    count = len(terms)

    if count == 0:
        return None
    if count == 1:
        return terms[0]

    return next(
        (candidate for candidate in terms if candidate["is_defining_ontology"] == True),
        None,
    )


# term = find_defining_ontology(terms)
# term.


def get_all_terms(url, json_response, timeout=30):
    """Collect the terms from every page of a paged OLS listing.

    Args:
        url (str): URL of the paged listing (for example a term's
            ``hierarchicalDescendants`` link).
        json_response (dict): Decoded JSON of a first request to ``url``; only
            ``page.totalPages`` and ``page.size`` are read from it.
        timeout (float or tuple, optional): Seconds to wait for each page,
            forwarded to ``requests.get``. Defaults to 30.

    Returns:
        list: The term dicts of all pages, in page order.

    Raises:
        requests.exceptions.HTTPError: If a page request returns an error
            status. Without this check, a failed page surfaced later as an
            opaque JSON or KeyError.
        KeyError: If a page has no ``_embedded.terms`` entry.
    """
    pages = json_response["page"]["totalPages"]
    size = json_response["page"]["size"]

    terms_list = []

    for i in tqdm(range(pages)):
        response = requests.get(
            url, params={"page": i, "size": size}, timeout=timeout
        )
        response.raise_for_status()

        terms_list.extend(response.json()["_embedded"]["terms"])

    print("Length of terms: ", len(terms_list), sep="\t")

    return terms_list


# terms_list = get_all_terms(url, json_response)
def get_labels_from_terms_list(terms_list):
    """Return the sorted, de-duplicated ``label`` values of the given terms."""
    all_labels = [term["label"] for term in terms_list]

    return sorted(np.unique(all_labels))


def purl_main(iri):
    """Return the sorted labels of all hierarchical descendants of ``iri``.

    Looks the IRI up through ``iri_request``, selects the term with
    ``find_defining_ontology``, follows that term's
    ``hierarchicalDescendants`` link and gathers the labels of every page.
    Any failed step raises (for example when a request returned ``None`` or
    the term has no descendants link); callers are expected to catch that.
    """
    lookup = iri_request(iri).json()
    defining_term = find_defining_ontology(lookup["_embedded"]["terms"])

    # get descendants
    descendants_url = defining_term["_links"]["hierarchicalDescendants"]["href"]
    first_page = get_response(descendants_url).json()

    descendant_terms = get_all_terms(descendants_url, first_page)

    return get_labels_from_terms_list(descendant_terms)

# Start extraction


In [77]:
with pd.option_context("display.max_colwidth", 0):
    display(dm_test.iloc[4, :])

Attribute              databaseName                                                                                                                                  
Description            The name of the search database (nr, SwissProt or est_human, and/or mass spectral library).                                                   
Valid Values           [HMDB, MassBank of North America (MoNA), Metlin, NIST17, Not Specified, Not applicable, Not collected, OtherDatabaseNameDatabaseName, Unknown]
Required               True                                                                                                                                          
DependsOn              NaN                                                                                                                                           
DependsOn Component    NaN                                                                                                                                           
Prop

In [78]:
# {
#     # could replace with this link to get higher level terms
#     "https://www.ebi.ac.uk/ols4/ontologies/maxo": "http://purl.obolibrary.org/obo/MONDO_0700096"
# }

# dm_test["Source"] = dm_test["Source"].str.replace(
#     "https://www.ebi.ac.uk/ols4/ontologies/mondo|https://www.ebi.ac.uk/ols4/ontologies/maxo|https://www.ebi.ac.uk/ols4/ontologies/hp",
#     "LLFS data dictionary",
#     regex=True,
# )

In [79]:
# Per attribute: the unique, sorted Source entries that mention an EBI or PURL
# address. Entries are stripped first because Source lists may be separated by
# ", ", which would otherwise leave a leading space on the URL and prevent
# identical URLs from being de-duplicated.
dm_test["purl"] = (
    dm_test["Source"]
    .str.split(",")
    .apply(
        lambda x: sorted(
            {y.strip() for y in x if re.search("ebi|purl", y)})
    )
)

In [80]:
other_vvs = ["Other", "Unknown", "Not Available", "Not Given"]

In [81]:
dm.query('Attribute.str.contains("diagnosis")', engine="python")

Unnamed: 0,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue,UsedIn,module
395,diagnosis,Indicate the disease or condition.,"Not applicable,Not collected,OtherDiagnosis,Un...",False,,,DataProperty,,Individual Human,unspecified,"https://www.ebi.ac.uk/ols/ontologies/hp,https:...",STRING,"https://www.ebi.ac.uk/ols/ontologies/hp,https:...",True,,
396,diagnosisStatus,Whether the individual has been diagnosed with...,"False,Not applicable,Not collected,TrueDiagnos...",True,,,unspecified,,Individual Human,unspecified,Sage Bionetworks,STRING,Sage Bionetworks,True,,


## Main extraction


In [86]:
# Per-attribute outcome of the extraction: "", "done" or "error".
dm_test["extraction_status"] = ""

# URLs that match none of the supported source patterns.
errors = []

for i, v in dm_test.iterrows():
    # Values present before extraction; every successful branch merges the
    # fetched values into this original list.
    storage = dm_test.loc[i, "Valid Values"]
    for url in v["purl"]:
        print(url)

        # Only the first source that succeeds is used for an attribute; later
        # URLs are echoed but not fetched. (The original used a bare `next`
        # here, which is a no-op expression rather than `continue`.)
        if dm_test.at[i, "extraction_status"] != "done":
            # for world countries
            if (
                url
                == "https://wits.worldbank.org/countryprofile/metadata/en/country/all"
            ):
                x = pd.read_excel(
                    io="http://wits.worldbank.org/data/public/WITSCountryProfile-Country_Indicator_ProductMetada-en.xlsx",
                    sheet_name="Country-Metadata",
                )

                dm_test.at[i, "Valid Values"] = sorted(
                    storage + x["Country Code"].tolist()
                )

                dm_test.at[i, "extraction_status"] = "done"

            elif "terms" in url and "ebi.ac.uk" in url:
                try:
                    # Inside the try so a URL without an `iri` parameter is
                    # reported for this attribute instead of aborting the run.
                    api_url = url_to_api_call(url)

                    # get_response returns None on failure; the resulting
                    # AttributeError is reported by the handler below.
                    json_result = get_response(api_url).json()

                    url_descendents = json_result["_links"]["hierarchicalDescendants"][
                        "href"
                    ]

                    json_result = get_response(url_descendents).json()

                    terms_list = get_all_terms(url_descendents, json_result)

                    result = get_labels_from_terms_list(terms_list)

                    dm_test.at[i, "Valid Values"] = sorted(storage + result)

                    dm_test.at[i, "extraction_status"] = "done"

                except Exception as e:
                    dm_test.at[i, "extraction_status"] = "error"
                    print(e)

            elif "http://purl" in url:
                try:
                    result = purl_main(url)

                    dm_test.at[i, "Valid Values"] = sorted(storage + result)

                    dm_test.at[i, "extraction_status"] = "done"
                except Exception as e:
                    # Record the failure like the EBI branch does.
                    dm_test.at[i, "extraction_status"] = "error"
                    print(e)

            else:
                dm_test.at[i, "extraction_status"] = "error"
                print("Error")
                errors.append(url)

        print("-" * 20)

# Back to comma-separated strings, normalised by the project helper.
dm_test["Valid Values"] = (
    dm_test["Valid Values"].apply(lambda x: ",".join(x)).apply(utils.clean_list)
)

errors

https://www.ebi.ac.uk/ols4/ontologies/ms/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1003213


<Response [200]>
<Response [200]>


100%|██████████| 1/1 [00:00<00:00,  1.18it/s]


Length of terms: 	8
--------------------
http://purl.obolibrary.org/obo/MS_1001455
<Response [200]>
<Response [200]>


100%|██████████| 2/2 [00:01<00:00,  1.09it/s]


Length of terms: 	25
--------------------
https://bioportal.bioontology.org/ontologies/CCF/p=classes&conceptid=http%3A%2F%2Fpurl.org%2Fccf%2Fcell_type
Error
--------------------
https://www.ebi.ac.uk/ols4/ontologies/ccf/terms?iri=http%3A%2F%2Fpurl.org%2Fccf%2Fcell_type
<Response [200]>
<Response [200]>


100%|██████████| 67/67 [01:28<00:00,  1.32s/it]


Length of terms: 	1321
--------------------
https://www.ebi.ac.uk/ols4/ontologies/edam/terms?iri=http%3A%2F%2Fedamontology.org%2Fdata_1874
<Response [200]>
'hierarchicalDescendants'
--------------------
http://purl.obolibrary.org/obo/MS_1001013
<Response [200]>
<Response [200]>


100%|██████████| 1/1 [00:00<00:00,  1.18it/s]


Length of terms: 	11
--------------------
http://purl.obolibrary.org/obo/MS_1001012
<Response [200]>
<Response [200]>


100%|██████████| 1/1 [00:00<00:00,  1.32it/s]


Length of terms: 	3
--------------------
https://www.ebi.ac.uk/ols4/ontologies/hp
Error
--------------------
https://www.ebi.ac.uk/ols4/ontologies/maxo
Error
--------------------
https://www.ebi.ac.uk/ols4/ontologies/mondo
Error
--------------------
http://purl.obolibrary.org/obo/OBI_0302884
<Response [200]>
<Response [200]>


100%|██████████| 1/1 [00:00<00:00,  1.16it/s]


Length of terms: 	10
--------------------
https://www.ebi.ac.uk/ols4/ontologies/pato
Error
--------------------
https://www.ebi.ac.uk/ols4/ontologies/ms/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001055
<Response [200]>
<Response [200]>


100%|██████████| 2/2 [00:01<00:00,  1.18it/s]


Length of terms: 	21
--------------------
http://purl.obolibrary.org/obo/NCIT_C156434
<Response [200]>
<Response [200]>


100%|██████████| 1/1 [00:00<00:00,  1.44it/s]


Length of terms: 	1
--------------------
http://purl.obolibrary.org/obo/NCIT_C13018
<Response [200]>
<Response [200]>


100%|██████████| 18/18 [00:21<00:00,  1.20s/it]


Length of terms: 	359
--------------------
https://www.ebi.ac.uk/ols4/ontologies/pride/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FPRIDE_0000514&viewMode=All&siblings=false
HTTP error occurred: 404 Client Error:  for url: https://www.ebi.ac.uk/ols4/api/ontologies/pride/terms/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FPRIDE_0000514
'NoneType' object has no attribute 'json'
--------------------
http://purl.obolibrary.org/obo/NCIT_C28421
<Response [200]>
<Response [200]>


100%|██████████| 1/1 [00:00<00:00,  1.46it/s]


Length of terms: 	2
--------------------
https://ontobee.org/ontology/NCITiri=http://purl.obolibrary.org/obo/NCIT_C62690https://www.ebi.ac.uk/ols4/ontologies/edam/terms?iri=http%3A%2F%2Fedamontology.org%2Fdata_1045
HTTP error occurred: 500 Server Error: Internal Server Error for url: https://ontobee.org/ontology/NCITiri=http://purl.obolibrary.org/obo/NCIT_C62690https://www.ebi.ac.uk/ols4/api/ontologies/edam/terms/http%253A%252F%252Fedamontology.org%252Fdata_1045
'NoneType' object has no attribute 'json'
--------------------
https://www.ebi.ac.uk/ols4/ontologies/edam/terms?iri=http%3A%2F%2Fedamontology.org%2Fdata_1045
<Response [200]>
'hierarchicalDescendants'
--------------------
https://www.ebi.ac.uk/ols4/ontologies/ccf/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FUBERON_0004537&lang=en&viewMode=PreferredRoots&siblings=false
<Response [200]>
<Response [200]>


100%|██████████| 2/2 [00:02<00:00,  1.29s/it]


Length of terms: 	22
--------------------
http://purl.obolibrary.org/obo/NCIT_C70713
<Response [200]>
<Response [200]>


100%|██████████| 1/1 [00:01<00:00,  1.18s/it]


Length of terms: 	17
--------------------
http://purl.obolibrary.org/obo/TAXRANK_0000010
<Response [200]>
'hierarchicalDescendants'
--------------------
http://purl.obolibrary.org/obo/NCIT_C45378
<Response [200]>
'hierarchicalDescendants'
--------------------
http://purl.obolibrary.org/obo/NCIT_C12801
<Response [200]>
<Response [200]>


100%|██████████| 26/26 [00:31<00:00,  1.20s/it]

Length of terms: 	507
--------------------
http://purl.obolibrary.org/obo/NCIT_C70713
--------------------





['https://bioportal.bioontology.org/ontologies/CCF/p=classes&conceptid=http%3A%2F%2Fpurl.org%2Fccf%2Fcell_type',
 'https://www.ebi.ac.uk/ols4/ontologies/hp',
 'https://www.ebi.ac.uk/ols4/ontologies/maxo',
 'https://www.ebi.ac.uk/ols4/ontologies/mondo',
 'https://www.ebi.ac.uk/ols4/ontologies/pato']

In [97]:
# Attributes for which no enumerated list is kept: blank their Valid Values,
# discarding whatever the extraction produced for them.
no_vvs = ["diagnosis", "extractionMethod", "taxon", "commonName"]
dm_test.loc[dm_test["Attribute"].isin(no_vvs), "Valid Values"] = np.nan

In [None]:
# dm_test['Required'] = dm_test['Required'].replace('False,True', 'True')

In [99]:
# drop extra columns
dm_test = dm_test.drop(columns=["purl", "extraction_status"])

In [100]:
# join new valid values df with current dm
dm_test

Unnamed: 0,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue,UsedIn,module
0,acquisitionMode,The specific aspect of a mass spectrometer met...,"Not Specified,Not applicable,Not collected,Unk...",True,,,unspecified,,"Metabolomics Human,Proteomics",unspecified,https://www.ebi.ac.uk/ols4/ontologies/ms/terms...,STRING,https://www.ebi.ac.uk/ols/ontologies/ms/termsi...,False,,
1,acquisitionSoftware,The name of the acquisition software used,"4000 Series Explorer Software,4700 Explorer,63...",True,,,unspecified,,"Metabolomics Human,Proteomics",unspecified,http://purl.obolibrary.org/obo/MS_1001455,STRING,http://purl.obolibrary.org/obo/MS_1001455,False,,
2,cellType,Indicate the cell type.,"A2 amacrine cell,Acinar cell,Adipocyte,Afferen...",True,,,unspecified,,"Biospecimen human,Biospecimen nonHuman",unspecified,https://bioportal.bioontology.org/ontologies/C...,STRING,https://bioportal.bioontology.org/ontologies/C...,False,,
3,commonName,The biological species common name the individ...,,True,,,unspecified,,Individual nonHuman,unspecified,https://www.ebi.ac.uk/ols4/ontologies/edam/ter...,STRING,https://www.ebi.ac.uk/ols/ontologies/edam/term...,False,,
4,databaseName,"The name of the search database (nr, SwissProt...","HMDB,MassBank of North America (MoNA),Metlin,N...",True,,,unspecified,,"Metabolomics Human,Proteomics",unspecified,http://purl.obolibrary.org/obo/MS_1001013,STRING,http://purl.obolibrary.org/obo/MS_1001013,False,,
5,databaseSource,"The name of the organization, project, or labo...","DB source EBI,DB source NCBI,DB source UniProt...",True,,,unspecified,,"Metabolomics Human,Proteomics",unspecified,http://purl.obolibrary.org/obo/MS_1001012,STRING,http://purl.obolibrary.org/obo/MS_1001012,False,,
6,diagnosis,Indicate the disease or condition.,,False,,,DataProperty,,Individual Human,unspecified,"https://www.ebi.ac.uk/ols4/ontologies/hp,https...",STRING,"https://www.ebi.ac.uk/ols/ontologies/hp,https:...",True,,
7,extractionMethod,The name of the process used to separate a des...,,True,,,unspecified,,"Metabolomics Human,Microbiome",unspecified,http://purl.obolibrary.org/obo/OBI_0302884,STRING,http://purl.obolibrary.org/obo/OBI_0302884,False,,
8,lifeStage,The life stage of the individual. note: Other ...,"Adult,Juvenile,Not applicable,Not collected,Ot...",True,,,unspecified,,Individual nonHuman,unspecified,"-The%20life%20stage,Biological Collections Ont...",STRING,"-The%20life%20stage,Biological Collections Ont...",False,,
9,modificationParameters,Modification parameters for the search engine ...,"Fixed modification,H2O neutral loss,H3PO4 neut...",True,,,unspecified,,Proteomics,unspecified,https://www.ebi.ac.uk/ols4/ontologies/ms/terms...,STRING,https://www.ebi.ac.uk/ols/ontologies/ms/termsi...,False,,


In [105]:
# Nested mapping {column name: {Attribute: value}} of the refreshed rows
# (DataFrame.to_dict with its default orientation), keyed by Attribute.
replacements = dm_test.set_index("Attribute").to_dict()

In [114]:
# Copy every refreshed cell back into dm, matching rows on Attribute.
for column, value_by_attribute in replacements.items():
    for attribute, new_value in value_by_attribute.items():
        dm.loc[dm["Attribute"] == attribute, column] = new_value

In [115]:
dm.loc[dm["Attribute"].isin(dm_test["Attribute"])]

Unnamed: 0,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue,UsedIn,module
359,acquisitionMode,The specific aspect of a mass spectrometer met...,"Not Specified,Not applicable,Not collected,Unk...",True,,,unspecified,,"Metabolomics Human,Proteomics",unspecified,https://www.ebi.ac.uk/ols4/ontologies/ms/terms...,STRING,https://www.ebi.ac.uk/ols/ontologies/ms/termsi...,False,,
360,acquisitionSoftware,The name of the acquisition software used,"4000 Series Explorer Software,4700 Explorer,63...",True,,,unspecified,,"Metabolomics Human,Proteomics",unspecified,http://purl.obolibrary.org/obo/MS_1001455,STRING,http://purl.obolibrary.org/obo/MS_1001455,False,,
379,cellType,Indicate the cell type.,"A2 amacrine cell,Acinar cell,Adipocyte,Afferen...",True,,,unspecified,,"Biospecimen human,Biospecimen nonHuman",unspecified,https://bioportal.bioontology.org/ontologies/C...,STRING,https://bioportal.bioontology.org/ontologies/C...,False,,
382,commonName,The biological species common name the individ...,,True,,,unspecified,,Individual nonHuman,unspecified,https://www.ebi.ac.uk/ols4/ontologies/edam/ter...,STRING,https://www.ebi.ac.uk/ols/ontologies/edam/term...,False,,
391,databaseName,"The name of the search database (nr, SwissProt...","HMDB,MassBank of North America (MoNA),Metlin,N...",True,,,unspecified,,"Metabolomics Human,Proteomics",unspecified,http://purl.obolibrary.org/obo/MS_1001013,STRING,http://purl.obolibrary.org/obo/MS_1001013,False,,
392,databaseSource,"The name of the organization, project, or labo...","DB source EBI,DB source NCBI,DB source UniProt...",True,,,unspecified,,"Metabolomics Human,Proteomics",unspecified,http://purl.obolibrary.org/obo/MS_1001012,STRING,http://purl.obolibrary.org/obo/MS_1001012,False,,
395,diagnosis,Indicate the disease or condition.,,False,,,DataProperty,,Individual Human,unspecified,"https://www.ebi.ac.uk/ols4/ontologies/hp,https...",STRING,"https://www.ebi.ac.uk/ols/ontologies/hp,https:...",True,,
407,extractionMethod,The name of the process used to separate a des...,,True,,,unspecified,,"Metabolomics Human,Microbiome",unspecified,http://purl.obolibrary.org/obo/OBI_0302884,STRING,http://purl.obolibrary.org/obo/OBI_0302884,False,,
439,lifeStage,The life stage of the individual. note: Other ...,"Adult,Juvenile,Not applicable,Not collected,Ot...",True,,,unspecified,,Individual nonHuman,unspecified,"-The%20life%20stage,Biological Collections Ont...",STRING,"-The%20life%20stage,Biological Collections Ont...",False,,
455,modificationParameters,Modification parameters for the search engine ...,"Fixed modification,H2O neutral loss,H3PO4 neut...",True,,,unspecified,,Proteomics,unspecified,https://www.ebi.ac.uk/ols4/ontologies/ms/terms...,STRING,https://www.ebi.ac.uk/ols/ontologies/ms/termsi...,False,,


In [124]:
# Remove the stray "-The%20life%20stage" fragment again, since the write-back
# above copied Source values from dm_test into dm.
dm = dm.replace("-The%20life%20stage", "", regex=True)

# Normalise the Source and Ontology cells with utils.clean_list. NaN is turned
# into "" for the helper, and empty results are turned back into NaN.
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 in favour of
# DataFrame.map — confirm the pinned pandas version before switching.
dm[["Source", "Ontology"]] = (
    dm[["Source", "Ontology"]].fillna("").applymap(utils.clean_list).replace("", np.nan)
)

# Update data model


In [127]:
sum(dm.duplicated(subset="Attribute"))

0

In [None]:
dm.reset_index(drop=True, inplace=True)

In [None]:
# write out new data model
# This overwrites the same file that was loaded at the top of the notebook.
# NOTE(review): to_csv writes the index as an unnamed first column by default —
# confirm utils.load_and_backup_dm expects that column when reading it back.
dm.to_csv("../EL.data.model.csv")