# Setup


- Fix inconsistencies in parent column


In [1]:
import pandas as pd
import numpy as np
import os

from glob import glob
import yaml
import re
import copy
import json
from thefuzz import fuzz

# Custom package
from utils import utils

In [2]:
with open("./local_configs/notebook_config.yaml", "r") as f:
    config = yaml.safe_load(f)

# paths to import files
schematic_config = config["paths"]["schematic"]
csv_model = config["file_names"]["csv_model"]
json_model = config["file_names"]["json_model"]

print(
    "Schematic config: ",
    schematic_config,
    "\n",
    "CSV model: ",
    csv_model,
    "\n",
    "JSON LD Model: ",
    json_model,
)

Schematic config:  ./config.yml 
 CSV model:  EL.data.model.csv 
 JSON LD Model:  EL.data.model.jsonld


## Hard Coded stuff


In [3]:
validation_coder = {
    "number": "regex search ([0-9]+\.[0-9]*.?)|([0-9]+)",
    "integer": "regex search ([0-9]+)",
    "string": "",
}

In [4]:
base_cols = [
    "Attribute",
    "Description",
    "Valid Values",
    "DependsOn",
    "Properties",
    "Required",
    "Template",
    "Parent",
    "DependsOn Component",
    "Source",
    "Validation Rules",
]

keep_cols = [
    "Attribute",
    "Description",
    "Valid Values",
    "Required",
    "DependsOn",
    "DependsOn Component",
    "Properties",
    "Validation Rules",
    "Template",
    "Parent",
    "Source",
    "Type",
    "Ontology",
    "multivalue",
]

list_cols = [
    "UsedIn",
    "DependsOn",
    "Properties",
    "Validation Rules",
    "Template",
    "Parent",
    "Source",
    "Type",
    "Ontology",
    "Required",
    "multivalue",
]

hard_coded_valid_values = [
    {
        "attribute": "visitCode",
        "valid_value": "1,2,3,4,Other,Unknown,Not collected,Not applicable",
    },
    {"attribute": "tissueWeight", "valid_value": ""},
    {"attribute": "consentGroupID", "valid_value": "1,2,3"},
    {"attribute": "samplingAge", "valid_value": ""},
    {"attribute": "specimenAge", "valid_value": ""},
    {"attribute": "age", "valid_value": ""},
]

template_hard_coded_attrs = {
    "Attribute": "",
    "Description": "",
    "Valid Values": "",
    "Required": "False",
    "Validation Rules": "",
    "Template": "",
    "Parent": "",
    "Source": "",
    "Type": "",
    "Ontology": "",
    "UsedIn": "",
}

hard_coded_attrs = [
    {
        "Attribute": "Other",
        "Description": "When value is not apart of the list",
        "Valid Values": "",
        "Required": "False",
        "Validation Rules": "",
        "Template": "",
        "Parent": "",
        "Source": "",
        "Type": "",
        "Ontology": "",
        "UsedIn": "",
    },
    {
        "Attribute": "",
        "Description": "",
        "Valid Values": "",
        "Required": "False",
        "Validation Rules": "",
        "Template": "",
        "Parent": "",
        "Source": "",
        "Type": "",
        "Ontology": "",
        "UsedIn": "",
    },
    {
        "Attribute": "",
        "Description": "",
        "Valid Values": "",
        "Required": "False",
        "Validation Rules": "",
        "Template": "",
        "Parent": "",
        "Source": "",
        "Type": "",
        "Ontology": "",
        "UsedIn": "",
    },
    {
        "Attribute": "",
        "Description": "",
        "Valid Values": "",
        "Required": "False",
        "Validation Rules": "",
        "Template": "",
        "Parent": "",
        "Source": "",
        "Type": "",
        "Ontology": "",
        "UsedIn": "",
    },
]

In [5]:
# hard coded dictionary
recoder_valid_values = {
    re.compile("Not Specified", flags=re.IGNORECASE): "Not Specified",
    re.compile("(Other$)", flags=re.IGNORECASE): "Other",
    re.compile("lipid", re.IGNORECASE): "Lipid",
    re.compile("plasma", re.IGNORECASE): "Plasma",
    re.compile("protein", re.IGNORECASE): "Protein",
    re.compile("saliva", re.IGNORECASE): "Saliva",
    re.compile("serum", re.IGNORECASE): "Serum",
    re.compile("sputum", re.IGNORECASE): "Sputum",
    re.compile("urine", re.IGNORECASE): "Urine",
    re.compile(
        "^0x Visium Spatial Gene Expression"
    ): "10x Visium Spatial Gene Expression",
    re.compile("falseFalseFALSEtrueTrueTRUE	"): "True, False",
    re.compile("TRUE|TRUEDiagnosisStatus", re.IGNORECASE): "True",
    re.compile("FALSE|FASLSE", re.IGNORECASE): "False",
    re.compile("UnknownNot collected"): "Unknown, Not collected",
    re.compile(r"\u200b\u200b"): "",
    re.compile(
        "The Health,Aging,and Body Composition Study \(HealthABC\)"
    ): "The Health and Aging and Body Composition Study (HealthABC)",
    re.compile("Not Hispanic or latinoEthnicity"): "Not Hispanic or latino",
    re.compile("Hispanic or latinoEthnicity"): "Hispanic or latino",
    re.compile(
        "HPO, MONDO, MAXO codes or labels \(not listed for purposes of this RFC\)"
    ): "HPO and MONDO and MAXO codes or labels (not listed for purposes of this RFC)",
    re.escape(
        r"Possible values are listed under the instrument model term.OtherMsInstrumentModel"
    ): "OtherMsInstrumentModel",
    re.compile(
        r"Possible values are listed under the cleavage agent nameOtherCleavageAgents"
    ): "OtherCleavageAgents",
    re.compile(r"Possible values are listed under modification parameters"): "",
    re.compile("Uknown"): "Unknown",
    re.compile("OtherControlType", re.IGNORECASE): "OtherControlType",
    re.compile("OtherMsAnalyteType", re.IGNORECASE): "OtherMsAnalyteType",
}

# Functions


In [6]:
# Clean list columns into single string
def join_strings(string):
    try:
        return ",".join(string)
    except:
        return ""


def clean_list(string):
    """Takes a list represented as a string and returns only unique values found

    Args:
        string (str): list represented as string

    Returns:
        string: list as string of unique values
    """

    new_list = string.split(",")
    new_list = [n.strip() for n in new_list if n != "nan"]
    new_list = ",".join(sorted(list(np.unique(new_list)))).strip(",")
    return new_list


def search_df(df, pattern):
    mask = np.column_stack(
        [df[col].str.contains(pattern, na=False, flags=re.IGNORECASE) for col in df]
    )

    df = df.loc[mask.any(axis=1)]

    with pd.option_context("display.max_colwidth", None):
        display(df)

    return df


def find_row(df, attribute):
    """Get indexes of the dataframe"""
    indexes = df.index[
        df["Attribute"].str.contains(
            "(^" + re.escape(attribute) + "$)", flags=re.IGNORECASE
        )
    ].tolist()
    if len(indexes) != 0:
        return indexes
    else:
        print(attribute)
        return None


def replace_valid_value(df, indexes, regex_dict, attribute):
    """Alter the dataframe valid values with the replacement value"""
    if indexes == None:
        return df

    elif len(indexes) > 0:
        regex_dict = regex_dict[attribute]

        for index in indexes:
            df.loc[index, "Valid Values"] = re.sub(
                **regex_dict, string=df.loc[index, "Valid Values"]
            )

            # print(df.loc[index, 'Valid Values'])
        return df
    else:
        return df


def code_equals_values(df, regex_dict, attribute):
    print("attribute: ", attribute)

    indexes = find_row(df, attribute)

    print("Index: ", indexes)

    if indexes == None:
        return df
    else:
        df = replace_valid_value(df, indexes, regex_dict, attribute)
        return df


def rewrite_df_value(df, col_name, search_term, col_value, new_value):
    try:
        df.loc[df[df[col_name] == search_term].index[0], col_value] = new_value
        return df
    except:
        return df

In [7]:
# Unzip compressed folder if downloaded from Google Drive
# %unzip 'RFC Tables-20230620T181152Z-001.zip'

# Collect RFCs


In [8]:
# Get all the RFC file paths
file_paths = glob("../_data/RFC Tables/*")

Create Data Model for Schematic


In [9]:
dm = pd.DataFrame()

# parse through files to create complete data model
for fp in file_paths:
    file_name = os.path.basename(fp)

    temp = pd.read_excel(fp)

    # Create file_name column to check
    temp.insert(loc=0, column="file_name", value=file_name)

    # Create new columnn for data model name
    temp.insert(
        loc=1,
        column="dm",
        value=re.sub(
            "\s\s+",
            " ",
            re.sub(
                "_",
                " ",
                re.sub(
                    "(EL)|(RFC)|(\.xlsx)|([Aa]ssay)|([Dd]ata [Mm]odel)", "", file_name
                ),
            ).strip(),
        ),
    )

    dm = pd.concat([dm, temp])

In [10]:
# initial cleaning
dm[["required", "multivalue"]] = (
    dm[["required", "multivalue"]]
    .fillna(False)
    .astype(str)
    .replace({"1.0": True, "0.0": False})
)

dm.fillna("")
dm.reset_index(drop=True, inplace=True)
dm.head()

Unnamed: 0,file_name,dm,key,description,valid values,required,requires,multivalue,type,concept source ontology,note,Unnamed: 9,Unnamed: 10,ontology,term id
0,EL Assay_ scRNAseq data model.xlsx,scRNAseq,specimenID,Identifying string linked to a particular samp...,n/a (unique to each data contributor),True,"scRNAseq, Biospecimen",False,string,Sage Bionetworks,,,Sage Bionetworks,,
1,EL Assay_ scRNAseq data model.xlsx,scRNAseq,sampleType,The type of sample collected,"Amniotic Fluid,\nAppendix,\nB cell,\nBasophils...",True,scRNAseq,False,string,"Sage Bionetworks,\nImmPort","The sample types are adopted from Uberon, Cell...",,,,
2,EL Assay_ scRNAseq data model.xlsx,scRNAseq,specifySampleType,"If ""other"" is selected list the type of sample",,False,"scRNAseq,\nsampleType = other",False,string,Sage Bionetworks,,,,,
3,EL Assay_ scRNAseq data model.xlsx,scRNAseq,measurementTechnique,The measurement technique describing the assay...,"16S rRNA gene sequencing,\n1D Gel,\n2D Gel,\nA...",True,scRNAseq,False,string,Sage Bionetworks,,,,,
4,EL Assay_ scRNAseq data model.xlsx,scRNAseq,specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,False,"scRNAseq,\nmeasurementTechnique = other",False,string,Sage Bionetworks,,,,,


Data model clean up


In [11]:
# collapse presumed ontology columns and join with existing
dm.loc[:, "ontology"] = dm.iloc[:, 11:].bfill(axis=1).iloc[:, 0]

dm["ontology"] = (
    dm[["concept source ontology", "ontology"]]
    .fillna("")
    .apply(lambda x: ",".join([y.strip() for y in x.unique() if len(y) > 0]), axis=1)
)

# if unique values are provided by data contributor then add this note in the ontology

dm["ontology"] = (
    dm.loc[
        dm["valid values"].str.contains(
            "(n/a \(unique to each data contributor\))", na=False
        ),
        "ontology",
    ]
    + ","
    + "Data Contributor"
)

  dm["valid values"].str.contains(


In [12]:
# new ontologies:
ontology_list = []
for i, v in dm[["concept source ontology", "ontology"]].fillna("").iterrows():
    ontology_list.append(",".join(v))

dm["ontology"] = ontology_list
dm["ontology"] = dm["ontology"].str.strip(",")

In [13]:
dm = dm.apply(
    lambda x: x.str.replace(
        pat="\n|(n/a \(unique to each data contributor\))", repl="", regex=True
    ).str.split(","),
    axis=1,
)

dm.head()

Unnamed: 0,file_name,dm,key,description,valid values,required,requires,multivalue,type,concept source ontology,note,Unnamed: 9,Unnamed: 10,ontology,term id
0,[EL Assay_ scRNAseq data model.xlsx],[scRNAseq],[specimenID],[Identifying string linked to a particular sam...,[],[True],"[scRNAseq, Biospecimen]",[False],[string],[Sage Bionetworks],,,[Sage Bionetworks],"[Sage Bionetworks, Sage Bionetworks, Data Cont...",
1,[EL Assay_ scRNAseq data model.xlsx],[scRNAseq],[sampleType],[The type of sample collected],"[Amniotic Fluid, Appendix, B cell, Basophils, ...",[True],[scRNAseq],[False],[string],"[Sage Bionetworks, ImmPort]","[The sample types are adopted from Uberon, Ce...",,,"[Sage Bionetworks, ImmPort]",
2,[EL Assay_ scRNAseq data model.xlsx],[scRNAseq],[specifySampleType],"[If ""other"" is selected list the type of sample]",,[False],"[scRNAseq, sampleType = other]",[False],[string],[Sage Bionetworks],,,,[Sage Bionetworks],
3,[EL Assay_ scRNAseq data model.xlsx],[scRNAseq],[measurementTechnique],[The measurement technique describing the assa...,"[16S rRNA gene sequencing, 1D Gel, 2D Gel, Arr...",[True],[scRNAseq],[False],[string],[Sage Bionetworks],,,,[Sage Bionetworks],
4,[EL Assay_ scRNAseq data model.xlsx],[scRNAseq],[specifyMeasurementTechnique],"[If ""other"" is selected list the name of the m...",,[False],"[scRNAseq, measurementTechnique = other]",[False],[string],[Sage Bionetworks],,,,[Sage Bionetworks],


In [14]:
# revert lists back to strings
dm = dm.applymap(lambda x: join_strings(x))

# Rename columns with DCA standards
dm_schema_cols = {
    "dm": "Template",
    "key": "Attribute",
    "description": "Description",
    "valid values": "Valid Values",
    "required": "Required",
    "requires": "DependsOn Component",
    "concept source ontology": "Source",
    "ontology": "Ontology",
    "type": "Type",
}

dm = dm.rename(dm_schema_cols, axis=1)

# drop unimportant columns
r = re.compile("Unnamed*", re.IGNORECASE)

# Add additional required columns for DCA
dm["Properties"] = ""
dm["Validation Rules"] = dm["Type"].map(validation_coder)
dm["DependsOn"] = ""
dm["Parent"] = ""
# dm['DependsOn Component'] = ""

dm = dm[keep_cols]

dm.head()

Unnamed: 0,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue
0,specimenID,Identifying string linked to a particular samp...,,True,,"scRNAseq, Biospecimen",,,scRNAseq,,Sage Bionetworks,string,"Sage Bionetworks,Sage Bionetworks,Data Contrib...",False
1,sampleType,The type of sample collected,"Amniotic Fluid,Appendix,B cell,Basophils,Bone,...",True,,scRNAseq,,,scRNAseq,,"Sage Bionetworks,ImmPort",string,"Sage Bionetworks,ImmPort",False
2,specifySampleType,"If ""other"" is selected list the type of sample",,False,,"scRNAseq,sampleType = other",,,scRNAseq,,Sage Bionetworks,string,Sage Bionetworks,False
3,measurementTechnique,The measurement technique describing the assay...,"16S rRNA gene sequencing,1D Gel,2D Gel,Array,B...",True,,scRNAseq,,,scRNAseq,,Sage Bionetworks,string,Sage Bionetworks,False
4,specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,False,,"scRNAseq,measurementTechnique = other",,,scRNAseq,,Sage Bionetworks,string,Sage Bionetworks,False


In [15]:
# fixing "not listed" columns
dm["Valid Values"] = dm["Valid Values"].str.replace(
    "Genbank common names (not listed for purposes of this RFC)Unknown",
    "Genbank common names (not listed for purposes of this RFC),Unknown",
    regex=False,
)

dm.loc[dm["Valid Values"].str.contains("not listed"), "Valid Values"] = (
    dm.loc[dm["Valid Values"].str.contains("not listed"), "Valid Values"]
    .str.split(")")
    .apply(
        lambda x: ",".join(
            [y.strip(",") for y in x if not bool(re.search("not listed", y))]
        )
    )
)

# QC
dm.loc[dm["Valid Values"].str.contains("not listed", na=False), "Valid Values"]

Series([], Name: Valid Values, dtype: object)

In [16]:
# Dropping measurement technique
dm = dm.drop(
    index=dm.query('Attribute == "measurementTechnique"').index.values
).reset_index(drop=True)

In [17]:
# combine duplicated attributes
dm = dm.groupby("Attribute").agg(lambda x: ",".join(set(x.astype(str)))).reset_index()

# found extra commas in strings at beginning and end
dm = dm.applymap(lambda x: x.strip(","))

template_recoder = {
    re.compile("genotyping", re.IGNORECASE): "Genotyping",
    re.compile("proteomics", re.IGNORECASE): "Proteomics",
}

dm = dm.replace(template_recoder, regex=True)

In [18]:
# QA Check
dm["Template"].unique()

array(['Metabolomics Human', 'Metabolomics Human,Proteomics',
       'Individual Human', 'Biospecimen nonHuman', 'Individual nonHuman',
       'Biospecimen nonHuman,Biospecimen human', 'Proteomics',
       'Individual Human,Individual nonHuman',
       'bsSeq (bisulfite-seq WGBS methylseq methylomics)',
       'Microbiome,bsSeq (bisulfite-seq WGBS methylseq methylomics)',
       'Microbiome,Metabolomics Human',
       'Individual Human,Biospecimen nonHuman,Individual nonHuman,Biospecimen human',
       'Microbiome,RNAseq,scRNAseq,bsSeq (bisulfite-seq WGBS methylseq methylomics),Whole Genome Sequencing',
       'scRNAseq,Whole Genome Sequencing,Microbiome,bsSeq (bisulfite-seq WGBS methylseq methylomics)',
       'Microbiome,Metabolomics Human,RNAseq,scRNAseq,bsSeq (bisulfite-seq WGBS methylseq methylomics),Whole Genome Sequencing,Proteomics',
       'Genotyping',
       'scRNAseq,Whole Genome Sequencing,RNAseq,bsSeq (bisulfite-seq WGBS methylseq methylomics)',
       'scRNAseq,Whole Gen

In [19]:
display(dm.dtypes)
display(dm.head())
display(dm.Template.unique())

Attribute              object
Description            object
Valid Values           object
Required               object
DependsOn              object
DependsOn Component    object
Properties             object
Validation Rules       object
Template               object
Parent                 object
Source                 object
Type                   object
Ontology               object
multivalue             object
dtype: object

Unnamed: 0,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue
0,acquisitionBatchID,"Acquisition batch identifier, provided by the ...",,False,,mass spec metabolmics,,,Metabolomics Human,,Sage Bionetworks,string,Sage Bionetworks,
1,acquisitionBatchSize,The number of samples,,False,,mass spec metabolmics,,,Metabolomics Human,,Sage Bionetworks,string,Sage Bionetworks,
2,acquisitionBatchSizeUnit,The unit of measurement for number of samples ...,"AFU,AI,AU/ml,DK units/ml,bpg/dl,g/l,gm,HAU,IU,...",False,,"mass spec metabolmics,acquisitionBatchSize",,,Metabolomics Human,,Sage Bionetworks,string,Sage Bionetworks,
3,acquisitionMode,The specific aspect of a mass spectrometer met...,"Unknown,Not collected,Not applicable,Not speci...",True,,"mass spec Proteomics,mass spec metabolmics",,,"Metabolomics Human,Proteomics",,https://www.ebi.ac.uk/ols/ontologies/ms/terms?...,string,https://www.ebi.ac.uk/ols/ontologies/ms/terms?...,False
4,acquisitionSoftware,The name of the acquisition software used,"Unknown,Not collected,Not applicable,Not speci...",True,,"mass spec Proteomics,mass spec metabolmics",,,"Metabolomics Human,Proteomics",,http://purl.obolibrary.org/obo/MS_1001455,string,http://purl.obolibrary.org/obo/MS_1001455,False


array(['Metabolomics Human', 'Metabolomics Human,Proteomics',
       'Individual Human', 'Biospecimen nonHuman', 'Individual nonHuman',
       'Biospecimen nonHuman,Biospecimen human', 'Proteomics',
       'Individual Human,Individual nonHuman',
       'bsSeq (bisulfite-seq WGBS methylseq methylomics)',
       'Microbiome,bsSeq (bisulfite-seq WGBS methylseq methylomics)',
       'Microbiome,Metabolomics Human',
       'Individual Human,Biospecimen nonHuman,Individual nonHuman,Biospecimen human',
       'Microbiome,RNAseq,scRNAseq,bsSeq (bisulfite-seq WGBS methylseq methylomics),Whole Genome Sequencing',
       'scRNAseq,Whole Genome Sequencing,Microbiome,bsSeq (bisulfite-seq WGBS methylseq methylomics)',
       'Microbiome,Metabolomics Human,RNAseq,scRNAseq,bsSeq (bisulfite-seq WGBS methylseq methylomics),Whole Genome Sequencing,Proteomics',
       'Genotyping',
       'scRNAseq,Whole Genome Sequencing,RNAseq,bsSeq (bisulfite-seq WGBS methylseq methylomics)',
       'scRNAseq,Whole Gen

In [20]:
# Reorder Columns based on DCA Standards
dm = dm.loc[:, keep_cols]

# Clean up

## DependsOn Component


In [21]:
recoder = {
    "metabolmics": "metabolomics",
    "(mass spec proteomics)": "Proteomics",
    "(mass spec metabolomics)": "Metabolomics Human",
    "(assay_otheruseTreatment? = Yes)": "assay_other, useTreatment? = Yes",
    "OtherUnknown": "Other, Unknown",
    "falseFalseFALSEtrueTrueTRUE": "TRUE, FALSE",
    "Hispanic or latinoEthnicity": "Hispanic or Latino",
    re.compile("Forwardreverse", flags=re.IGNORECASE): "forward,reverse",
    re.compile("singleEndpairedEnd"): "singleEnd, pairedEnd",
    re.compile("(WGS)"): "Whole Genome Sequencing",
    re.compile("\?"): "",
    "Zeiss LSM 980Other": "Zeiss LSM 980,Other",
    "bsSeqsampleType = other": "bsSeq, sampleType = other",
    re.compile(
        "HPO, MONDO, MAXO codes or labels \(not listed for purposes of this RFC\)"
    ): "HPO and MONDO and MAXO codes or labels (not listed for purposes of this RFC)",
    "The Health, Aging, and Body Composition Study \(HealthABC\)": "The Health and Aging and Body Composition Study (HealthABC)",
}

# 'mass spec metabolomics,measurementTechnique = other'
# falseFalseFALSEtrueTrueTRUE

In [22]:
dm = dm.apply(lambda x: x.replace(recoder, regex=True), axis=1)

In [23]:
with pd.option_context("display.max_colwidth", None):
    display(dm[dm["Valid Values"].str.contains("and Body Composition Study")])

Unnamed: 0,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue
16,cohort,Name of the cohort the individual belongs to,"Other, Unknown,Not collected,Not applicable,Centenarian, US Family, Denmark Family, The Osteoporotic Fractures in Men (MrOS) Study, Study of Osteoporotic Fractures (SOF), The Health and Aging and Body Composition Study (HealthABC), Cardiovascular Health Study (CHS),Other,Unknown,Not collected,Not applicable",True,,Individual,,,"Individual Human,Individual nonHuman",,Sage Bionetworks,string,Sage Bionetworks,True


In [24]:
with pd.option_context("display.max_colwidth", None):
    display(dm.query('Attribute == "ethnicity"'))

Unnamed: 0,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue
37,ethnicity,Ethnicity of individual,"Not Hispanic or Latino, Hispanic or Latino, Prefer not to answer,Other,Unknown,Not collected,Not applicable",True,,Individual,,,Individual Human,,"Sage Bionetworks,https://www.synapse.org/#!Synapse:syn25878249",string,"Sage Bionetworks,https://www.synapse.org/#!Synapse:syn25878249",False


In [25]:
# QA check
dm[dm["DependsOn Component"].str.contains(
    "metabolomics", case=False, na=False)]

Unnamed: 0,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue
0,acquisitionBatchID,"Acquisition batch identifier, provided by the ...",,False,,mass spec metabolomics,,,Metabolomics Human,,Sage Bionetworks,string,Sage Bionetworks,
1,acquisitionBatchSize,The number of samples,,False,,mass spec metabolomics,,,Metabolomics Human,,Sage Bionetworks,string,Sage Bionetworks,
2,acquisitionBatchSizeUnit,The unit of measurement for number of samples ...,"AFU,AI,AU/ml,DK units/ml,bpg/dl,g/l,gm,HAU,IU,...",False,,"mass spec metabolomics,acquisitionBatchSize",,,Metabolomics Human,,Sage Bionetworks,string,Sage Bionetworks,
3,acquisitionMode,The specific aspect of a mass spectrometer met...,"Unknown,Not collected,Not applicable,Not speci...",True,,"mass spec Proteomics,mass spec metabolomics",,,"Metabolomics Human,Proteomics",,https://www.ebi.ac.uk/ols/ontologies/ms/termsi...,string,https://www.ebi.ac.uk/ols/ontologies/ms/termsi...,False
4,acquisitionSoftware,The name of the acquisition software used,"Unknown,Not collected,Not applicable,Not speci...",True,,"mass spec Proteomics,mass spec metabolomics",,,"Metabolomics Human,Proteomics",,http://purl.obolibrary.org/obo/MS_1001455,string,http://purl.obolibrary.org/obo/MS_1001455,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,specimenID,Identifying string linked to a particular samp...,,True,,"Biospecimen,Genotyping ,Biospecimen,Biospecime...",,,"Microbiome,Metabolomics Human,Biospecimen nonH...",,Sage Bionetworks,string,"Sage Bionetworks,Sage Bionetworks,Data Contrib...","false,False"
156,spectrometerFrequency,The frequency at which a spectrometer causes h...,,True,,"mass spec Proteomics,mass spec metabolomics",,regex search ([0-9]+\.[0-9]*.)|([0-9]+),"Metabolomics Human,Proteomics",,"Sage Bionetworks,DSLWG,Sage Bionetworks",number,"Sage Bionetworks,DSLWG,Sage Bionetworks",False
159,technologyPlatformVersion,"The specific version (application, manufacture...","0x Visium Spatial Gene Expression,​​Affymetrix...",True,,"mass spec metabolomics,RNAseq,scRNAseq,bsSeq,m...",,,"Microbiome,Metabolomics Human,RNAseq,scRNAseq,...",,"http://purl.obolibrary.org/obo/NCIT_C45378,Sag...",string,"http://purl.obolibrary.org/obo/NCIT_C45378,Sag...","true,False"
175,vacuumPressure,The recorded vacuum pressure value,,False,,"mass spec metabolomics,hasIonizationSource = Yes",,regex search ([0-9]+\.[0-9]*.)|([0-9]+),Metabolomics Human,,"Sage Bionetworks,Proposed minimum metadata rel...",number,"Sage Bionetworks,Proposed minimum metadata rel...",


## Cleaning other values and equal values

Removing illegal characters


Remove any special characters


In [26]:
dm["Attribute"] = dm["Attribute"].str.replace("\(|\)|\?", "", regex=True)

In [27]:
dm["Valid Values"] = (
    dm["Valid Values"]
    .apply(
        lambda x: re.sub(
            "^0x Visium Spatial Gene Expression",
            "10x Visium Spatial Gene Expression",
            x,
        )
    )
    .apply(clean_list)
)

Clean up equals in depends on


In [28]:
def create_new_value(old_value):
    new_vals = old_value.split("=")
    new_vals = [nv.strip() for nv in new_vals]
    # convert to camel case
    nv = new_vals[1].capitalize() + new_vals[0][0].upper() + new_vals[0][1:]
    return nv

In [29]:
def recode_yes_no(v):
    if v.lower() == "yes":
        return "TRUE"
    elif v.lower() == "no":
        return "FALSE"
    else:
        return v


dm["Valid Values"] = (
    dm["Valid Values"]
    .apply(lambda x: x.split(","))
    .apply(lambda x: ",".join([recode_yes_no(y) for y in x]))
)

In [30]:
# Split list to process other values
# Find the other columns in the data model
others = dm[dm["DependsOn Component"].str.contains("=", na=False)].copy()

others["DependsOn Component Original"] = others["DependsOn Component"].str.split(",")

# Create series of equals values to use for new attributes/ valid values relationship
others["equals_series"] = others["DependsOn Component Original"].apply(
    lambda x: [y for y in x if bool(re.search("=", y))][0]
)

# others["equals_attribute"] = others["equals_series"].apply(create_new_value)

others[["baseAttribute", "equalsValue"]] = (
    others["equals_series"]
    .str.split("=", expand=True)
    .apply(lambda x: [y.strip() for y in x])
    .rename({0: "base_attribute", 1: "equalsValue"}, axis=1)
)

# Deciding to use true and false for all yes/no values
recoder = {
    re.compile("^[Yy]es", flags=re.IGNORECASE): "TRUE",
    re.compile("true", flags=re.IGNORECASE): "TRUE",
    re.compile("^[Nn]o", flags=re.IGNORECASE): "FALSE",
}

others["equalsValue"] = others["equalsValue"].replace(recoder, regex=True)

others["newDescription"] = others[["baseAttribute", "equalsValue"]].apply(
    lambda x: f"When {x[0].strip()} = {x[1].strip()}", axis=1
)

others["equalsAttribute"] = others[["baseAttribute", "equalsValue"]].apply(
    lambda x: f"{x[1].strip()}{x[0].strip()[0].upper()+x[0].strip()[1:]}", axis=1
)
others["DependsOn Component"] = ""
others["Properties"] = "dataProperty"
others.loc[others["equalsValue"] == "other", "Parent"] = "Specification"

In [31]:
equals_df = others.copy()
equals_df = equals_df.drop(columns=["DependsOn", "Description"])

equals_df = equals_df.rename(
    {
        "Attribute": "DependsOn",
        "newDescription": "Description",
        "equalsAttribute": "Attribute",
    },
    axis=1,
)[base_cols]

equals_df["DependsOn Component"] = ""

equals_df["Valid Values"] = ""

equals_df["Properties"] = "ValidValue"

Update base attribute equals values


In [32]:
# Create mapping
temp = others["equals_series"].str.split("=", expand=True)
temp = temp.apply(lambda x: x.str.strip(), axis=1)
temp = temp.rename({0: "base_attribute", 1: "value_to_replace"}, axis=1)
temp["value_to_replace"] = temp["value_to_replace"].str.capitalize()

# new value
temp["new_value"] = others["equalsAttribute"]
temp.reset_index(drop=True, inplace=True)

temp = temp.drop_duplicates()

In [33]:
replacements = {}

for i, x in temp.iterrows():
    ba, vtr, nv = x
    replacements[ba] = {
        "pattern": re.compile("(" + vtr + ")", flags=re.IGNORECASE),
        "repl": nv,
    }

replacements

{'captivityStatus': {'pattern': re.compile(r'(Captive)',
             re.IGNORECASE|re.UNICODE),
  'repl': 'captiveCaptivityStatus'},
 'hasAssayControl': {'pattern': re.compile(r'(Yes)', re.IGNORECASE|re.UNICODE),
  'repl': 'TRUEHasAssayControl'},
 'diagnosisStatus': {'pattern': re.compile(r'(True)',
             re.IGNORECASE|re.UNICODE),
  'repl': 'TRUEDiagnosisStatus'},
 'ethnicity': {'pattern': re.compile(r'(Hispanic or latino)',
             re.IGNORECASE|re.UNICODE),
  'repl': 'Hispanic or LatinoEthnicity'},
 'hasIonizationSource': {'pattern': re.compile(r'(Yes)',
             re.IGNORECASE|re.UNICODE),
  'repl': 'TRUEHasIonizationSource'},
 'msTarget': {'pattern': re.compile(r'(Targeted)', re.IGNORECASE|re.UNICODE),
  'repl': 'TargetedMsTarget'},
 'useReagent': {'pattern': re.compile(r'(Yes)', re.IGNORECASE|re.UNICODE),
  'repl': 'TRUEUseReagent'},
 'acquisitionBatchSizeUnit': {'pattern': re.compile(r'(Other)',
             re.IGNORECASE|re.UNICODE),
  'repl': 'otherAcquisitionB

In [34]:
for attribute in replacements.keys():
    dm = code_equals_values(dm, replacements, attribute)
    print("-" * 20)

attribute:  captivityStatus
Index:  [13]
--------------------
attribute:  hasAssayControl
Index:  [49]
--------------------
attribute:  diagnosisStatus
Index:  [29]
--------------------
attribute:  ethnicity
Index:  [37]
--------------------
attribute:  hasIonizationSource
Index:  [50]
--------------------
attribute:  msTarget
Index:  [71]
--------------------
attribute:  useReagent
Index:  [173]
--------------------
attribute:  acquisitionBatchSizeUnit
Index:  [2]
--------------------
attribute:  batchSizeUnit
Index:  [11]
--------------------
attribute:  controlType
Index:  [19]
--------------------
attribute:  dnaBatchSizeUnit
Index:  [34]
--------------------
attribute:  databaseName
Index:  [25]
--------------------
attribute:  databaseSource
Index:  [26]
--------------------
attribute:  digestionMethod
Index:  [30]
--------------------
attribute:  experimentalBatchSizeUnit
experimentalBatchSizeUnit
Index:  None
--------------------
attribute:  extractionMethod
Index:  [40]
------

  df["Attribute"].str.contains(


## Add derived attributes from "=" valid values


In [35]:
# dm["Properties"] = "dataProperty"
dm.update(others[base_cols])
dm = pd.concat([dm, equals_df], ignore_index=True)

In [36]:
# Do not need
dm["DependsOn Component"] = ""

# Valid Values Work


In [37]:
# # Not sure what happened here in the RFCs
dm["Valid Values"] = dm["Valid Values"].replace(recoder_valid_values)

In [38]:
# valid values that contain other
pattern = "([Oo]ther)"

pure_others = dm[
    dm["Valid Values"].str.contains(pattern, flags=re.IGNORECASE, regex=True)
].copy()

pure_others.loc[:, "replacement_value"] = pure_others.loc[:, "Attribute"].apply(
    lambda x: "Other" + (x[0].upper() + x[1:])
)

regex_dict = {}

for i, r in pure_others.iterrows():
    regex_dict[r["Attribute"]] = {
        "pattern": re.compile(pattern, flags=re.IGNORECASE),
        "repl": r["replacement_value"],
    }

# json_formatted = json.dumps(regex_dict, indent=4)
# print(json_formatted)

for attribute in regex_dict.keys():
    dm = code_equals_values(dm, regex_dict, attribute)
    print("-" * 20)

attribute:  acquisitionBatchSizeUnit
Index:  [2]
--------------------
attribute:  batchSizeUnit
Index:  [11]
--------------------
attribute:  captivityStatus
Index:  [13]
--------------------
attribute:  cellType
Index:  [14]
--------------------
attribute:  cleavageAgents
Index:  [15]
--------------------
attribute:  cohort
Index:  [16]
--------------------
attribute:  consentGroupID
Index:  [18]
--------------------
attribute:  controlType
Index:  [19]
--------------------
attribute:  conversionRatioUnits
Index:  [21]
--------------------
attribute:  countryCode
Index:  [22]
--------------------
attribute:  databaseName
Index:  [25]
--------------------
attribute:  databaseSource
Index:  [26]
--------------------
attribute:  diagnosis
Index:  [28]
--------------------
attribute:  digestionMethod
Index:  [30]
--------------------
attribute:  dnaBatchID
Index:  [32]
--------------------
attribute:  dnaBatchSizeUnit
Index:  [34]
--------------------
attribute:  enrichmentMethod
Index:  

  dm["Valid Values"].str.contains(pattern, flags=re.IGNORECASE, regex=True)
  df["Attribute"].str.contains(


## Cleanup valid values


In [39]:
# Fuzzy matching to find misspellings
# Fuzzy matching

valid_values = ",".join(dm["Valid Values"])
valid_values = valid_values.split(",")
valid_values = list(np.unique(valid_values))
valid_values = [v.strip() for v in valid_values if len(v) > 0]


scores = {}
for v in valid_values:
    scores[v] = {}
    for v2 in valid_values:
        if v == v2:
            next
        else:
            score = fuzz.ratio(v.lower(), v2.lower())
            if score == 100:
                scores[v][v2] = score
    if len(scores[v]) == 0:
        scores.pop(v)

# create recoding variables off fuzzy matching
new_values_recoded = []

for v in scores.values():
    new_values_recoded.append(list(v.keys())[0].title())

new_values_recoded = np.unique(new_values_recoded)

# recoder_valid_values = {}
for nv in new_values_recoded:
    recoder_valid_values[re.compile(nv, flags=re.IGNORECASE)] = nv
    # recoder_valid_values.append(value_add)

dm[["Valid Values", "multivalue"]] = (
    dm[["Valid Values", "multivalue"]]
    .replace(recoder_valid_values, regex=True)
    .fillna("")
    .applymap(lambda x: clean_list(x))
)

In [40]:
dm["Valid Values"] = (
    dm["Valid Values"].apply(lambda x: clean_list(x)
                             ).apply(lambda x: x.split(","))
)

# Expand each valid value into its own row
dm_vv = dm.explode("Valid Values")

# join valid values back together in data model as string
dm["Valid Values"] = dm["Valid Values"].apply(lambda x: ",".join(x).strip(","))

# Group valid values to create unique attribute and trace where value is used in another attribute and template
dm_vv = (
    dm_vv.dropna(subset="Valid Values")
    .groupby("Valid Values")
    .agg(lambda x: ",".join(set(x.astype(str))).strip(","))
    .reset_index()
)

dm_vv["Properties"] = "ValidValue"
dm_vv["Required"] = "False"

# rename for concatenating with data model
dm_vv = dm_vv.rename({"Valid Values": "Attribute",
                     "Attribute": "UsedIn"}, axis=1)

dm_vv[list_cols] = dm_vv[list_cols].applymap(clean_list)

dm_vv = dm_vv[dm_vv["Attribute"] != ""].reset_index(drop=True)

# clean up type
dm_vv["Type"] = "STRING"

# cleanup multivalue
dm_vv["multivalue"] = "False"

dm_vv.head()

Unnamed: 0,Attribute,UsedIn,Description,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue
0,10x,libraryPreparationMethod,Method by which library was prepared,False,,,ValidValue,,"Microbiome,RNAseq,Whole Genome Sequencing,bsSe...",,Sage Bionetworks,STRING,Sage Bionetworks,False
1,10x Visium Spatial Gene Expression,"platformLocation,technologyPlatformVersion","The specific version (application, manufacture...",False,,,ValidValue,,"Metabolomics Human,Microbiome,Proteomics,RNAse...",,"DSLWG,Sage Bionetworks,http://purl.obolibrary....",STRING,"DSLWG,Sage Bionetworks,http://purl.obolibrary....",False
2,AFU,"acquisitionBatchSizeUnit,batchSizeUnit,convers...","Unit of treatment amount,Unit of lens voltages...",False,,,ValidValue,regex search ([0-9]+\.[0-9]*.)|([0-9]+),"Genotyping,Metabolomics Human,Microbiome,RNAse...",,"ImmPort,Proposed minimum metadata relative to ...",STRING,"ImmPort,Proposed minimum metadata relative to ...",False
3,AI,"acquisitionBatchSizeUnit,batchSizeUnit,convers...","Unit of treatment amount,Unit of lens voltages...",False,,,ValidValue,regex search ([0-9]+\.[0-9]*.)|([0-9]+),"Genotyping,Metabolomics Human,Microbiome,RNAse...",,"ImmPort,Proposed minimum metadata relative to ...",STRING,"ImmPort,Proposed minimum metadata relative to ...",False
4,AIBL pool,controlType,Control samples suitable for normalization and...,False,,,ValidValue,,Metabolomics Human,,https://doi.org/10.1101/2020.05.19.105197https...,STRING,https://doi.org/10.1101/2020.05.19.105197https...,False


In [41]:
dm_vv.shape

(435, 14)

In [42]:
# dm["Valid Values"] = (
#     dm["Valid Values"]
#     .replace(recoder_valid_values, regex=True)
#     .apply(lambda x: clean_list(x))
# )

# valid_values = list(np.unique(",".join(dm["Valid Values"]).split(",")))

# valid_values = [v.strip() for v in valid_values if len(v) > 0]

# valid_values_df = pd.DataFrame({"Attribute": pd.Series(valid_values)})

# valid_values_df["Properties"] = "validValue"
# valid_values_df["Required"] = "False"

# valid_values_df = valid_values_df[
#     ~valid_values_df["Attribute"].isin(dm["Attribute"].tolist())
# ]

# valid_values_df

In [43]:
# adding valid values found in attribute columns
print(dm.shape)

dm2 = pd.concat([dm, dm_vv], axis=0, ignore_index=True)

display(dm2.shape)

(239, 14)


(674, 15)

In [44]:
print(sum(dm2.duplicated(subset="Attribute", keep=False)))

# dm2.loc[dm2.duplicated(subset="Attribute", keep=False),].sort_values(by="Attribute")

29


In [45]:
# Create measurement unit attributes
# Separate out measurement units
r = re.compile("(^Other)")

measurement_units = np.unique(
    ",".join(
        dm.loc[
            dm["Valid Values"].str.contains("units", regex=True), "Valid Values"
        ].values.tolist()
    ).split(",")
)

measurement_units = [
    x
    for x in measurement_units
    if x not in ["Not Specified", "Other", "Unknown", "Not Available"]
    and not bool(r.search(x))
]

dm2.loc[dm2["Attribute"].isin(measurement_units), "Parent"] = "MeasurementUnit"
dm2.loc[dm2["Attribute"].isin(measurement_units), "Description"] = "Measurement unit"
dm2.loc[dm2["Attribute"].isin(measurement_units), "Type"] = "STRING"
dm2.loc[dm2["Attribute"].isin(measurement_units), "multivalue"] = False

In [46]:
# Nonsense attributes
dm2 = dm2.drop(
    index=dm2.loc[dm2["Attribute"].str.contains(
        "Possible values"),].index.tolist()
)

In [47]:
# Fuzzy matching to find misspellings
# Fuzzy matching

valid_values = dm2["Attribute"].replace(
    recoder_valid_values, regex=True).tolist()

scores = {}
for v in valid_values:
    scores[v] = {}
    for v2 in valid_values:
        if v == v2:
            next
        else:
            score = fuzz.ratio(v.lower(), v2.lower())
            if score == 100:
                scores[v][v2] = score
    if len(scores[v]) == 0:
        scores.pop(v)

scores

{'f': {'F': 100}, 'F': {'f': 100}}

In [48]:
dm2["Attribute"] = dm2["Attribute"].replace(recoder_valid_values, regex=True)

dm2[["Valid Values", "multivalue"]] = (
    dm2[["Valid Values", "multivalue"]]
    .fillna("")
    .astype(str)
    .replace(recoder_valid_values, regex=True)
    .applymap(lambda x: clean_list(x))
)

In [49]:
dm2 = (
    dm2.dropna(subset="Attribute")
    .groupby("Attribute")
    .agg(lambda x: ",".join(set(x.astype(str))).strip(","))
    .reset_index()
)

dm2 = dm2.drop(index=dm2[dm2["Attribute"] == "f"].index.tolist()).reset_index(drop=True)

dm2[list_cols] = dm2[list_cols].applymap(clean_list)

print(f"dm2 shape: {dm2.shape}")
print(f'Duplicates: {sum(dm2.duplicated(subset="Attribute", keep=False))}')

dm2 shape: (648, 15)
Duplicates: 0


# Check columns for Speical Characters


In [50]:
check_cols = ["Attribute"]

mask = np.column_stack(
    [dm2[col].str.contains("\(|\)", na=False) for col in dm2[check_cols]]
)

with pd.option_context("display.max_colwidth", None):
    display(dm2[check_cols].loc[mask.any(axis=1)])

Unnamed: 0,Attribute
39,Cardiovascular Health Study (CHS)
43,Cerebrospinal Fluid (CSF)
111,Illumina Infinium MethylationEPIC BeadChip v1.0 (850k)
112,Illumina Infinium MethylationEPIC BeadChip v2.0 (935k)Illumina MiSeq
159,MassBank of North America (MoNA)
309,Study of Osteoporotic Fractures (SOF)
320,The Health and Aging and Body Composition Study (HealthABC)
321,The Osteoporotic Fractures in Men (MrOS) Study


# Create Manifests in data model


In [51]:
unique_templates = np.unique(
    ",".join(dm2["Template"].dropna().values.tolist()).split(",")
)
templates = pd.DataFrame({"Attribute": unique_templates})
templates["Required"] = "True"
templates["Properties"] = "dataType"
templates["Parent"] = "Template"
templates["Description"] = templates["Attribute"].apply(
    lambda x: f"Metadata template for {x}"
)

# Adjust bseq
templates.loc[
    templates["Attribute"] == "bsSeq (bisulfite-seq WGBS methylseq methylomics)",
    "Description",
] = "Template for bisulfite-seq WGBS methylseq methylomics"
templates.loc[
    templates["Attribute"] == "bsSeq (bisulfite-seq WGBS methylseq methylomics)",
    "Attribute",
] = "bsSeq"

# update templates column for other attributes
dm2["Template"] = dm2["Template"].str.replace(
    "bsSeq (bisulfite-seq WGBS methylseq methylomics)", "bsSeq", regex=False
)

# create depends On column for templates
for i, m in enumerate(templates["Attribute"]):
    dm2["Template"].str.contains(m)
    templates.loc[i, "DependsOn"] = ",".join(
        dm2.loc[
            dm2["Template"].str.contains(m, regex=False, na=False), "Attribute"
        ].tolist()
    )

In [52]:
dm2 = pd.concat([dm2, templates], axis=0, ignore_index=True)

print(dm2.shape)

(660, 15)


In [53]:
dm2[["Valid Values", "Template", "Ontology", "Source"]] = (
    dm2[["Valid Values", "Template", "Ontology", "Source"]]
    .fillna("")
    .applymap(clean_list)
)

In [54]:
# Recode required columns and fix spelling mistakes
required_recoder = {"0.0": "False", "1.0": "True", "FASLSE": "False"}

dm2["Required"] = dm2["Required"].replace(required_recoder)

Last bit of cleanup


In [55]:
# Remove measurement technique dependency from biospecimen human
dm2.loc[dm2["Attribute"] == "Biospecimen human", "DependsOn"] = (
    dm2.loc[dm2["Attribute"] == "Biospecimen human", "DependsOn"]
    .values[0]
    .replace("measurementTechnique,", "")
)
bio_measure_technique_index = dm2.query(
    'Attribute == "measurementTechnique" and Parent == "Biospecimen human"'
)

if len(bio_measure_technique_index.index) > 0:
    dm2 = dm2.drop(
        index=bio_measure_technique_index.index[0]).reset_index(drop=True)

In [56]:
# Remove measurement technique dependency from biospecimen human
dm2.loc[dm2["Attribute"] == "Biospecimen human", "DependsOn"] = (
    dm2.loc[dm2["Attribute"] == "Biospecimen human", "DependsOn"]
    .values[0]
    .replace("specifyMeasurementTechnique,", "")
)
bio_measure_technique_index = dm2.query(
    'Attribute == "specifyMeasurementTechnique" and Parent == "Biospecimen human"'
)


if len(bio_measure_technique_index.index) > 0:
    dm2 = dm2.drop(
        index=bio_measure_technique_index.index[0]).reset_index(drop=True)

In [57]:
# Remove measurement technique dependency from biospecimen human
dm2.loc[dm2["Attribute"] == "Biospecimen human", "DependsOn"] = (
    dm2.loc[dm2["Attribute"] == "Biospecimen human", "DependsOn"]
    .values[0]
    .replace("OtherMeasurementTechnique,", "")
)
bio_measure_technique_index = dm2.query(
    'Attribute == "OtherMeasurementTechnique"')


if len(bio_measure_technique_index.index) > 0:
    dm2 = dm2.drop(
        index=bio_measure_technique_index.index[0]).reset_index(drop=True)

In [58]:
dm2.loc[dm2[dm2["Attribute"] == "visitCode"].index.values[0], "Valid Values"] = ""

dm2.loc[dm2[dm2["Attribute"] == "visitCode"].index.values[0],
        "Validation Rules"] = ""

Extra comma at beginning of valid values


# Validation Rules


In [59]:
mixed_attrs = [
    {"attribute": "tissueWeight", "val_type": "mixed float", "regex": "regex search"},
    {"attribute": "tissueVolume", "val_type": "mixed float", "regex": "regex search"},
    {"attribute": "specimenAge", "val_type": "mixed integer", "regex": "regex search"},
    {"attribute": "samplingAge", "val_type": "mixed integer", "regex": "regex search"},
    {"attribute": "age", "val_type": "mixed integer", "regex": "regex search"},
]

for ma in mixed_attrs:
    attribute = ma["attribute"]
    val_type = ma["val_type"]

    # get indexes for new validation rules based on attribute
    indexes = dm2[dm2["Attribute"] == attribute].index.tolist()

    for i in indexes:
        if val_type == "integer":
            first_part = "[0-9]+"
        elif val_type == "float":
            first_part = "^\d*?\.?\d$"
        elif val_type == "mixed integer":
            regex = "regex search"
            num_match = "^\d*?"
        elif val_type == "mixed float":
            regex = "regex search"
            num_match = "^\d*?\.?\d$"
            # All valid values are applicable
            new_string = (
                regex
                + num_match
                + "|"
                + ""
                + "|".join(dm2.loc[i, "Valid Values"].split(","))
            )

        dm2.loc[i, "Validation Rules"] = new_string

In [60]:
dm2["Validation Rules"].unique().tolist()

['',
 'regex search ([0-9]+\\.[0-9]*.)|([0-9]+)',
 'regex search ([0-9]+),regex search ([0-9]+\\.[0-9]*.)|([0-9]+)',
 'regex search ([0-9]+)',
 'regex search^\\d*?\\.?\\d$|Not applicable|Not collected|Unknown',
 nan]

## Building Dependencies


In [61]:
dependencies = {
    "specimenID": "matchAtLeastOne Biospecimenhuman.specimenID value",
    "individualID": "matchExactlyOne IndividualHuman.individualID set",
}

In [62]:
for k, v in dependencies.items():
    indexes = dm2[dm2["Attribute"] == k]["Validation Rules"].index.values
    dm2.loc[indexes, "Validation Rules"] = v

In [63]:
dm2.loc[3, "Valid Values"] = dm2.loc[3, "Valid Values"].replace(
    ",Whole Genome Sequencing", ""
)

# Create File Annotations Attributes


In [64]:
# base file annotations
base_file_annotations = {"resourceType": "",
                         "isReleased": False, "fileType": ""}

In [65]:
with pd.option_context("display.max_colwidth", None):
    display(dm2.query('Attribute == "visitCode"'))

Unnamed: 0,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue,UsedIn
646,visitCode,"Indicate which longitudinal visit for the individual the data comes from, provided by the data contributor's data dictionary",,True,,,,,Individual Human,,Sage Bionetworks,number,"Data Contributor,Sage Bionetworks",False,


# Write out new data model


In [66]:
for h in hard_coded_valid_values:
    dm2.loc[
        dm2[dm2["Attribute"] == h["attribute"]].index.values[0], "Valid Values"
    ] = h["valid_value"]

In [67]:
checks = [c["attribute"] for c in hard_coded_valid_values]

with pd.option_context("display.max_colwidth", None):
    display(dm2.query(f"Attribute in @checks"))

Unnamed: 0,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue,UsedIn
362,age,"Age of the individual (age in years of the individual at first recorded study event (enrollment, visit, observation, sample collection, survey completion, etc.)",,True,,,,regex search^\d*?\.?\d$|Not applicable|Not collected|Unknown,Individual Human,,Sage Bionetworks,string,Sage Bionetworks,False,
383,consentGroupID,"Indicate the consent group for the individual, provided by the data contributor's data dictionary",123,True,,,,,Individual Human,,Sage Bionetworks,string,"Data Contributor,Sage Bionetworks",False,
552,samplingAge,"The calculated age of the sample, measurement is determined or coded by the data contributor.",,True,,,,regex search^\d*?\.?\d$|Not applicable|Not collected|Unknown,"Biospecimen human,Biospecimen nonHuman",,Sage Bionetworks,string,Sage Bionetworks,False,
603,specimenAge,The subject's age at the time of specimen extraction is the individual's age (since birth) at the time a given specimen was extracted. Measured as age in years.,,True,,,,regex search^\d*?\.?\d$|Not applicable|Not collected|Unknown,"Biospecimen human,Biospecimen nonHuman",,Sage Bionetworks,string,Sage Bionetworks,False,
618,tissueWeight,The mass of the tissue specimen. Measured in mg.,,True,,,,regex search^\d*?\.?\d$|Not applicable|Not collected|Unknown,"Biospecimen human,Biospecimen nonHuman",,Sage Bionetworks,string,Sage Bionetworks,False,
646,visitCode,"Indicate which longitudinal visit for the individual the data comes from, provided by the data contributor's data dictionary","1,2,3,4,Other,Unknown,Not collected,Not applicable",True,,,,,Individual Human,,Sage Bionetworks,number,"Data Contributor,Sage Bionetworks",False,


In [68]:
print(dm2.shape)

dm2 = (
    dm2.fillna("").drop_duplicates(subset=["Attribute"]).reset_index(drop=True)
)  # 'DependsOn', 'Properties',

display(dm2.shape)

(660, 15)


(660, 15)

In [69]:
dm2["Type"] = dm2["Type"].str.upper()

In [70]:
dm2[dm2["Attribute"] == "Other"]

Unnamed: 0,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue,UsedIn


In [71]:
dm2.sample(10)

Unnamed: 0,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue,UsedIn
405,experiementalBatchSize,The number of samples,,False,,,,regex search ([0-9]+\.[0-9]*.)|([0-9]+),Metabolomics Human,,Sage Bionetworks,NUMBER,Sage Bionetworks,,
510,pcrCycles,Number of PCR cycles to amplify transposased D...,,True,,,,regex search ([0-9]+\.[0-9]*.)|([0-9]+),bsSeq,,Sage Bionetworks,NUMBER,Sage Bionetworks,False,
141,Leukocytes,The type of sample collected,,False,,,ValidValue,,"Genotyping,Metabolomics Human,Microbiome,Prote...",,"ImmPort,Sage Bionetworks",STRING,"ImmPort,Sage Bionetworks",False,sampleType
621,transcriptType,The type of transcript reported,"Not Specified,Not applicable,Not collected,Oth...",True,,,,,"RNAseq,Whole Genome Sequencing,scRNAseq",,ImmPort,STRING,ImmPort,False,
325,TruSeq,Method by which library was prepared,,False,,,ValidValue,,"Microbiome,RNAseq,Whole Genome Sequencing,bsSe...",,Sage Bionetworks,STRING,Sage Bionetworks,False,libraryPreparationMethod
239,OtherRepositoryNameRepositoryName,The public repository name for the transcript ...,,False,,,ValidValue,,"RNAseq,Whole Genome Sequencing,scRNAseq",,Sage Bionetworks,STRING,Sage Bionetworks,False,repositoryName
207,OtherConsentGroupID,"Indicate the consent group for the individual,...",,False,,,ValidValue,,Individual Human,,Sage Bionetworks,STRING,"Data Contributor,Sage Bionetworks",False,consentGroupID
323,Tonsil,The type of sample collected,,False,,,ValidValue,,"Genotyping,Metabolomics Human,Microbiome,Prote...",,"ImmPort,Sage Bionetworks",STRING,"ImmPort,Sage Bionetworks",False,sampleType
550,samplePrepProtocol,An internet address that may provide more deta...,,False,,,,,"Metabolomics Human,Proteomics",,"DSLWG,Sage Bionetworks",STRING,"DSLWG,Sage Bionetworks",False,
411,fractionIdentifier,Identifier string that describes the sample fr...,,True,,,,regex search ([0-9]+\.[0-9]*.)|([0-9]+),Proteomics,,"MS Ontology,Sage Bionetworks,The Proteomics Sa...",NUMBER,"MS Ontology,Sage Bionetworks,The Proteomics Sa...",False,


# Add ADKP attrs to dm


In [72]:
df = pd.read_csv(
    "https://raw.githubusercontent.com/adknowledgeportal/data-models/main/AD.model.csv"
)

# preprocess AD data model to remove duplicates
df = df.sort_values(by=["Attribute", "Valid Values"]).reset_index(drop=True)
df = df.drop_duplicates(keep="first", subset=["Attribute"])

df.loc[
    df.query('Attribute.str.contains("template")',
             engine="python").index.tolist(),
    "Properties",
] = "template"

attrs_interest = [
    "analysisType",
    "analysisType",
    "analytical covariates",
    "assay",
    "assay",
    "biospecimen",
    "consortium",
    "data dictionary",
    "dataSubtype",
    "dataType",
    "fileFormat",
    "grant",
    "ID mapping",
    "individual",
    "isConsortiumAnalysis",
    "isModelSystem",
    "isMultiSpecimen",
    "libraryPrep",
    "libraryType",
    "manifest",
    "manifest",
    "metadata",
    "metadataType",
    "modelSystemName",
    "modelSystemType",
    "platform",
    "project",
    "protocol",
    "protocol",
    "resourceType",
    "type",
]

metadataTypes = [
    "analytical covariates",
    "assay",
    "biospecimen",
    "data dictionary",
    "ID mapping",
    "individual",
    "manifest",
    "protocol",
]

df_new_attrs = df.query("Attribute in @attrs_interest").copy()

df_new_attrs["Properties"] = "BaseAnnotation"

## Cleanup data model attributes to fit ELITE data model


In [73]:
df_new_attrs = rewrite_df_value(
    df_new_attrs, "Attribute", "study", "Valid Values", "LLFS,ILO,LG,LC"
)

df_new_attrs = rewrite_df_value(
    df_new_attrs, "Attribute", "consortium", "Valid Values", "ELITE"
)

df_new_attrs = rewrite_df_value(
    df_new_attrs,
    "Attribute",
    "metadataType",
    "Valid Values",
    "analytical covariates, assay, biospecimen, data dictionary, ID mapping, individual, manifest, protocol",
)

In [74]:
# recode Parent
recoder = {
    "metadataType": "MetadataType",
    "dataProperty": "DataProperty",
    "dataType": "DataType",
    "dataSubtype": "DataSubtype",
}

df_new_attrs = df_new_attrs.replace(recoder)

df_new_attrs = df_new_attrs.rename(
    columns={"Parent": "Properties", "Properties": "Parent"}
)
df_new_attrs.head()

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Parent,Required,Properties,DependsOn Component,Source,Validation Rules,columnType,module
295,ID mapping,A file or table that maps data identifiers to ...,,,BaseAnnotation,,MetadataType,,http://edamontology.org/operation_3282,,string,experimentalData
730,analysisType,Type of analysis,"ANOVA, assessment, batch effect correction, ch...",,BaseAnnotation,False,DataProperty,,sage.annotations-analysis.analysisType-0.0.14,,string,analysis
731,analytical covariates,A file that contains a combination or subset o...,,,BaseAnnotation,,MetadataType,,Sage Bionetworks,,string,experimentalData
745,assay,The technology used to generate the data in th...,"10x multiome, 16SrRNAseq, active avoidance lea...",,BaseAnnotation,True,DataProperty,,sage.annotations-experimentalData.assay-0.0.26,,string,experimentalData
794,biospecimen,Metadata describing properties of specimens co...,,,BaseAnnotation,,MetadataType,,Sage Bionetworks,,string,experimentalData


# Merge new attributes with existiing data model


In [75]:
dm2.query("Attribute in @attrs_interest")

Unnamed: 0,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue,UsedIn
364,assay,The analysis or technology used to generate th...,TBD,,,,,,Biospecimen nonHuman,,Sage Bionetworks,STRING,Sage Bionetworks,,
436,libraryPrep,The general strategy by which the library was ...,"Chromium Single Cell 3',DNALibraryConstruction...",True,,,,,"Microbiome,Whole Genome Sequencing,bsSeq,scRNAseq",,Sage Bionetworks,STRING,Sage Bionetworks,False,


In [76]:
# # Only needed for the first time since the Valid values were TBD
dm2 = rewrite_df_value(dm2, "Attribute", "assay", "Valid Values", np.nan)

In [77]:
dm2 = dm2.replace("", np.nan)

In [78]:
# Add valid values from the AD model
dm2["Valid Values"] = dm2["Valid Values"].fillna(
    dm2["Attribute"].map(df_new_attrs.set_index("Attribute")["Valid Values"])
)

In [79]:
# Add new attributes from AD model
dm_new = pd.concat([dm2, df_new_attrs])

In [80]:
dm_new

Unnamed: 0,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue,UsedIn,columnType,module
0,10x,Method by which library was prepared,,False,,,ValidValue,,"Microbiome,RNAseq,Whole Genome Sequencing,bsSe...",,Sage Bionetworks,STRING,Sage Bionetworks,False,libraryPreparationMethod,,
1,10x Visium Spatial Gene Expression,"The specific version (application, manufacture...",,False,,,ValidValue,,"Metabolomics Human,Microbiome,Proteomics,RNAse...",,"DSLWG,Sage Bionetworks,http://purl.obolibrary....",STRING,"DSLWG,Sage Bionetworks,http://purl.obolibrary....",False,"platformLocation,technologyPlatformVersion",,
2,AFU,Measurement unit,,False,,,ValidValue,regex search ([0-9]+\.[0-9]*.)|([0-9]+),"Genotyping,Metabolomics Human,Microbiome,RNAse...",MeasurementUnit,"ImmPort,Proposed minimum metadata relative to ...",STRING,"ImmPort,Proposed minimum metadata relative to ...",False,"acquisitionBatchSizeUnit,batchSizeUnit,convers...",,
3,AI,Measurement unit,,False,,,ValidValue,regex search ([0-9]+\.[0-9]*.)|([0-9]+),"Genotyping,Metabolomics Human,Microbiome,RNAse...",MeasurementUnit,"ImmPort,Proposed minimum metadata relative to ...",STRING,"ImmPort,Proposed minimum metadata relative to ...",False,"acquisitionBatchSizeUnit,batchSizeUnit,convers...",,
4,AIBL pool,Control samples suitable for normalization and...,,False,,,ValidValue,,Metabolomics Human,,https://doi.org/10.1101/2020.05.19.105197https...,STRING,https://doi.org/10.1101/2020.05.19.105197https...,False,controlType,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,modelSystemName,,"3xTg-AD, 5XFAD, AB42, Abca7A1527GAPOE4Trem2R47...",False,,,DataProperty,,,BaseAnnotation,sage.annotations-neuro.modelSystemName-0.0.8,,,,,string,neuro
1124,modelSystemType,Type of model system.,"animal, cerebral organoid, immortalized cell l...",False,,,DataProperty,,,BaseAnnotation,sage.annotations-experimentalData.modelSystemT...,,,,,string,experimentalData
1207,platform,"The specific version (manufacturer, model, etc...","Affy5.0, Affy6.0, Affymetrix Human Gene 1.0 ST...",True,,,DataProperty,,,BaseAnnotation,sage.annotations-experimentalData.platform-0.0.26,,,,,string,experimentalData
1233,protocol,A plan specification which has sufficient leve...,,,,,MetadataType,,,BaseAnnotation,http://purl.obolibrary.org/obo/OBI_0000272,,,,,string,experimentalData


In [81]:
dm_new.query('Properties == "DataProperty"')

Unnamed: 0,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue,UsedIn,columnType,module
730,analysisType,Type of analysis,"ANOVA, assessment, batch effect correction, ch...",False,,,DataProperty,,,BaseAnnotation,sage.annotations-analysis.analysisType-0.0.14,,,,,string,analysis
745,assay,The technology used to generate the data in th...,"10x multiome, 16SrRNAseq, active avoidance lea...",True,,,DataProperty,,,BaseAnnotation,sage.annotations-experimentalData.assay-0.0.26,,,,,string,experimentalData
848,consortium,The name of the consortium,ELITE,True,,,DataProperty,,,BaseAnnotation,sage.annotations-sageCommunity.consortium-0.0.5,,,,,string,sageCommunity
870,DataSubtype,"Further qualification of dataType, which may b...","dataMatrix, metadata, normalized, processed, r...",False,,,DataProperty,,,BaseAnnotation,sage.annotations-experimentalData.dataSubtype-...,,,,,string,experimentalData
871,DataType,Types of input/output data in bioinformatics p...,"Volume, Weight, Pharmacokinetic Study, genomic...",False,,,DataProperty,,,BaseAnnotation,sage.annotations-experimentalData.dataType-0.0.3,,,,,string,experimentalData
936,fileFormat,"Defined format of the data file, typically cor...","7z, ab1, abf, avi, bai, bam, bash script, bcf,...",False,,,DataProperty,,,BaseAnnotation,sage.annotations-sageCommunity.fileFormat-0.0.12,,,,,string,sageCommunity
972,grant,"Grant number including activity code, institut...",,False,,,DataProperty,,,BaseAnnotation,sage.annotations-neuro.grant-0.0.2,,,,,string,neuro
1028,isModelSystem,,"TRUE, FALSE",False,,,DataProperty,,,BaseAnnotation,sage.annotations-neuro.isModelSystem-0.0.2,,,,,boolean,neuro
1029,isMultiSpecimen,Boolean flag indicating whether or not a file ...,"TRUE, FALSE",False,,,DataProperty,,,BaseAnnotation,sage.annotations-experimentalData.isMultiSpeci...,,,,,boolean,experimentalData
1049,libraryPrep,The general strategy by which the library was ...,"amplicon, cellHashing, Chromium Single Cell 3'...",True,,,DataProperty,,,BaseAnnotation,sage.annotations-ngs.libraryPrep-0.0.13,,,,,string,ngs


In [82]:
dm_new.query('Attribute == "studyCode"')

Unnamed: 0,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue,UsedIn,columnType,module
611,studyCode,"Unique identifier for the study, assigned by t...","ILO,LC,LG,LLFS",True,,,,,"Individual Human,Individual nonHuman",,Sage Bionetworks,STRING,Sage Bionetworks,True,,,


In [83]:
print(sum(dm_new.duplicated(subset="Attribute")))
dm_new[dm_new.duplicated(subset="Attribute", keep=False)]

2


Unnamed: 0,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue,UsedIn,columnType,module
364,assay,The analysis or technology used to generate th...,"10x multiome, 16SrRNAseq, active avoidance lea...",,,,,,Biospecimen nonHuman,,Sage Bionetworks,STRING,Sage Bionetworks,,,,
436,libraryPrep,The general strategy by which the library was ...,"Chromium Single Cell 3',DNALibraryConstruction...",True,,,,,"Microbiome,Whole Genome Sequencing,bsSeq,scRNAseq",,Sage Bionetworks,STRING,Sage Bionetworks,False,,,
745,assay,The technology used to generate the data in th...,"10x multiome, 16SrRNAseq, active avoidance lea...",True,,,DataProperty,,,BaseAnnotation,sage.annotations-experimentalData.assay-0.0.26,,,,,string,experimentalData
1049,libraryPrep,The general strategy by which the library was ...,"amplicon, cellHashing, Chromium Single Cell 3'...",True,,,DataProperty,,,BaseAnnotation,sage.annotations-ngs.libraryPrep-0.0.13,,,,,string,ngs


In [84]:
dm_new["Validation Rules"] = dm_new["Validation Rules"].replace(
    "regex search ([0-9]+),regex search ([0-9]+\\.[0-9]*.)|([0-9]+)",
    "regex search ([0-9]+\\.[0-9]*.)|([0-9]+)",
)
dm_new[
    dm_new["Validation Rules"]
    == "regex search ([0-9]+),regex search ([0-9]+\\.[0-9]*.)|([0-9]+)"
]

# recode Parent
recoder = {
    "metadataType": "MetadataType",
    "dataProperty": "DataProperty",
    "dataType": "DataType",
    "dataSubtype": "DataSubtype",
}

dm_new = dm_new.replace(recoder)

In [85]:
dm_new.drop(columns="columnType", inplace=True)

In [86]:
print(dm_new.shape)

dm_new = dm_new.drop_duplicates(subset=["Attribute"], keep="first")

print(dm_new.shape)

(684, 16)
(682, 16)


In [87]:
dm_new[["Parent", "Properties", "Type"]] = dm_new[
    ["Parent", "Properties", "Type"]
].fillna("unspecified")

dm_new["Required"] = dm_new["Required"].fillna("False")

In [88]:
# clean up template depends on
dm_new["DependsOn"] = (
    dm_new["DependsOn"].fillna("").apply(clean_list).replace("", np.nan)
)

In [89]:
remove_values = ["False", "Not applicable", "Not collected", "Unknown", "Other"]

dm_new.loc[dm_new["Parent"] == "Template", "DependsOn"] = dm_new.loc[
    dm_new["Parent"] == "Template", "DependsOn"
].apply(lambda x: ",".join([y for y in x.split(",") if y not in remove_values]))

In [90]:
dm_new.loc[dm_new["Parent"] == "Template",]

Unnamed: 0,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Template,Parent,Source,Type,Ontology,multivalue,UsedIn,module
648,Biospecimen human,Metadata template for Biospecimen human,,True,"Cerebrospinal Fluid (CSF),OtherCellType,OtherF...",,DataType,,,Template,,unspecified,,,,
649,Biospecimen nonHuman,Metadata template for Biospecimen nonHuman,,True,"Cerebrospinal Fluid (CSF),OtherCellType,OtherF...",,DataType,,,Template,,unspecified,,,,
650,Genotyping,Metadata template for Genotyping,,True,"AFU,AI,AU/ml,Amniotic Fluid,Appendix,B cell,Ba...",,DataType,,,Template,,unspecified,,,,
651,Individual Human,Metadata template for Individual Human,,True,"American Indian or Alaska Native,Asian,BU,Bird...",,DataType,,,Template,,unspecified,,,,
652,Individual nonHuman,Metadata template for Individual nonHuman,,True,"Adult,Bird,Cardiovascular Health Study (CHS),C...",,DataType,,,Template,,unspecified,,,,
653,Metabolomics Human,Metadata template for Metabolomics Human,,True,"10x Visium Spatial Gene Expression,AFU,AI,AIBL...",,DataType,,,Template,,unspecified,,,,
654,Microbiome,Metadata template for Microbiome,,True,"10x,10x Visium Spatial Gene Expression,AFU,AI,...",,DataType,,,Template,,unspecified,,,,
655,Proteomics,Metadata template for Proteomics,,True,"10x Visium Spatial Gene Expression,Affymetrix ...",,DataType,,,Template,,unspecified,,,,
656,RNAseq,Metadata template for RNAseq,,True,"10x,10x Visium Spatial Gene Expression,AFU,AI,...",,DataType,,,Template,,unspecified,,,,
657,Whole Genome Sequencing,Metadata template for Whole Genome Sequencing,,True,"10x,10x Visium Spatial Gene Expression,AFU,AI,...",,DataType,,,Template,,unspecified,,,,


In [91]:
dm_new = dm_new.reset_index(drop=True)
dm_new.to_csv("../" + csv_model, index=False)

In [92]:
# Convert CSV to JSON LD
print(f'schematic schema convert {"../" + csv_model} --output_jsonld {json_model}')

!schematic schema convert {"../" + csv_model} --output_jsonld {"../" + json_model}

schematic schema convert ../EL.data.model.csv --output_jsonld EL.data.model.jsonld
Starting schematic...
Done adding requirements and value ranges to attributes
The Data Model was created and saved to '../EL.data.model.jsonld' location.
