In [43]:
import os
import pandas as pd
import numpy as np
import re
import datetime
import pathlib

# custom package(s)
from utils import utils, synapse_login

In [44]:
# Run the project's Synapse login helper and keep whatever it returns in `syn`.
# NOTE(review): presumably an authenticated Synapse client object; `syn` is not
# used by any cell visible in this notebook - confirm it is still needed.
syn = synapse_login.main()


UPGRADE AVAILABLE

A more recent version of the Synapse Client (3.0.0) is available. Your version (2.7.2) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 3.0.0 release notes

https://python-docs.synapse.org/build/html/news.html



Welcome, Nicholas Lee!



In [45]:
# Location of the data model CSV, relative to the notebook's working directory.
dm_name = "../EL.data.model.csv"
dm_path = pathlib.Path(dm_name).resolve()

# Build the backup location from the file NAME only. Joining "../backups" with
# the full relative path ("../EL.data.model.csv") produces
# "../backups/../EL.data.model.csv", which resolves straight back to the data
# model itself instead of to a file inside the backups directory.
output_path = (pathlib.Path("../backups") / dm_path.name).resolve()

# The backup directory may not exist on a fresh checkout; create it up front.
output_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): load_and_backup_dm is a project helper; presumably it reads the
# CSV at dm_path, writes a backup based on output_path and returns the model as
# a DataFrame - confirm against utils.
dm = utils.load_and_backup_dm(dm_path, output_path)

In [46]:
# Give every attribute that has no module name an explicit "Unspecified" label
# (an attempt to remove the bracket tab in the webpage).
module_is_missing = dm["Module"].isna()
dm.loc[module_is_missing, "Module"] = "Unspecified"

In [47]:
# Alphabetical list of the distinct module names currently in the model.
distinct_modules = dm["Module"].dropna().unique()
np.sort(distinct_modules)

array(['Assay', 'BaseAnnotation', 'Instrument', 'Metadata',
       'Model Organism', 'Ontology', 'Other', 'Project',
       'Sample Metadata', 'Template', 'Unit', 'Unspecified',
       'Valid Value'], dtype=object)

In [48]:
# Fold the "Boolean" module into "Valid Value".
is_boolean_module = dm["Module"].eq("Boolean")
dm.loc[is_boolean_module, "Module"] = "Valid Value"

In [9]:
# Update descriptions: every attribute in the listed module gets the same
# description text (module name -> description).
module_descriptions = {
    "Instrument": "Instrument Model Name",
    "Study": "Project name",
}
for module_name, description_text in module_descriptions.items():
    in_module = dm["Module"].eq(module_name)
    dm.loc[in_module, "Description"] = description_text

In [10]:
# Rename the "Study" module to "Project".
is_study_module = dm["Module"].eq("Study")
dm.loc[is_study_module, "Module"] = "Project"

In [11]:
# Show every attribute that declares a DependsOn value.
has_dependency = dm["DependsOn"].notna()
dm.loc[has_dependency]

Unnamed: 0.1,Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Type,Ontology
178,178,OtherSampleType,"If ""other"" is selected list the type of sample",,specifySampleType,,False,ValidValue,,Sage Bionetworks,,Other,,
179,179,OtherTechnologyPlatformVersion,"If ""other"" list the name of the platform version",,specifyPlatformVersion,,False,ValidValue,,Sage Bionetworks,,Other,,
180,180,OtherPlatformLocation,"If ""other"" list the name of the platform location",,specifyPlatformLocation,,False,ValidValue,,Sage Bionetworks,,Other,,
181,181,OtherDnaBatchSizeUnit,"If ""other"" list unit of measure",,specifyDNABatchSizeUnit,,False,ValidValue,,Sage Bionetworks,,Other,,
182,182,OtherLibraryPrep,"If ""other"" list the name of the general strate...",,specifyLibraryPrep,,False,ValidValue,,Sage Bionetworks,,Other,,
183,183,OtherLibraryPreparationMethod,"If ""other"" list the name of the library prepar...",,specifyLibraryPreparationMethod,,False,ValidValue,,Sage Bionetworks,,Other,,
184,184,OtherLibraryVersion,"If ""other"" list the name of the library version",,specifyLibraryVersion,,False,ValidValue,,Sage Bionetworks,,Other,,
185,185,OtherSequencingBatchSizeUnit,"If ""other"" list unit of measure",,specifySequencingBatchSizeUnit,,False,ValidValue,,Sage Bionetworks,,Other,,
186,186,OtherReadLengthUnits,"If ""other"" provide the unit of measure",,specifyReadLengthUnits,,False,ValidValue,,Sage Bionetworks,,Other,,
187,187,OtherRepositoryName,"If ""other"" list the name of the repository",,specifyRepositoryName,,False,ValidValue,,Sage Bionetworks,,Other,,


In [12]:
# TODO: Pull grant information (placeholder - this cell contains no code yet)

In [13]:
# Fix Other values: any attribute whose name starts with "other" or "specify"
# (case-insensitive) belongs to the "Other" module.
is_other_or_specify = dm["Attribute"].str.contains(
    "^other|^specify", regex=True, flags=re.IGNORECASE
)
dm.loc[is_other_or_specify, "Module"] = "Other"

In [14]:
# Fix "specify" values: attributes whose name starts with "specify"
# (case-insensitive) get "specification" as their parent.
starts_with_specify = dm["Attribute"].str.contains(
    "^specify", regex=True, flags=re.IGNORECASE
)
dm.loc[starts_with_specify, "Parent"] = "specification"

In [15]:
# Review the rows touched by the two fixes above: attributes whose name starts
# with "other" or "specify" (case-insensitive).
other_or_specify_rows = dm["Attribute"].str.contains(
    "^other|^specify", regex=True, flags=re.IGNORECASE
)
dm.loc[other_or_specify_rows]

Unnamed: 0.1,Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Type,Ontology
2,2,specifySampleType,Value is determined by the data contributor,,,,False,specification,,Sage Bionetworks,,Other,String,
3,3,specifyMeasurementTechnique,Value is determined by the data contributor,,,,False,specification,,Sage Bionetworks,,Other,String,
5,5,specifyPlatformVersion,Value is determined by the data contributor,,,,False,specification,,Sage Bionetworks,,Other,String,
7,7,specifyPlatformLocation,Value is determined by the data contributor,,,,False,specification,,Sage Bionetworks,,Other,String,
12,12,specifyDNABatchSizeUnit,Value is determined by the data contributor,,,,False,specification,,Sage Bionetworks,,Other,String,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438,438,OtherConversionRatioUnits,,,,,False,ValidValue,,,,Other,,
439,439,OtherExperiementalBatchSizeUnit,,,,,False,ValidValue,,,,Other,,
440,440,OtherLensVoltagesUnit,,,,,False,ValidValue,,,,Other,,
441,441,OtherVacuumPressureUnit,,,,,False,ValidValue,,,,Other,,


In [16]:
# Fix metadata module annotation: modules mentioning "Race" or "Ethnicity"
# (case-insensitive) are folded into "Metadata". Rows with no module are skipped.
is_race_or_ethnicity = dm["Module"].str.contains(
    "Race|Ethnicity", regex=True, flags=re.IGNORECASE, na=False
)
dm.loc[is_race_or_ethnicity, "Module"] = "Metadata"

In [17]:
# Look for leftover placeholder text ("Possible values are ...") in either the
# attribute name or the valid-values column and show the matching rows in full.
# Both lookups pass na=False so that a missing cell counts as "no match" instead
# of turning the boolean mask into one that cannot be used for row selection.
with pd.option_context("display.max_colwidth", None):
    for column_name in ("Attribute", "Valid Values"):
        has_placeholder = dm[column_name].str.contains(
            "Possible values are", na=False
        )
        display(dm[has_placeholder])

Unnamed: 0.1,Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Type,Ontology


Unnamed: 0.1,Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Type,Ontology


In [18]:
# fixing more values

# Record of text keyed by attribute name; the keys match the three attributes
# whose "Valid Values" are blanked further down in this cell.
# NOTE(review): presumably the previous "Valid Values" contents kept for
# reference - `old_values` is not read by any code visible in this notebook.
# NOTE(review): the "cleavageAgents" entry ends with a tab character before the
# closing quote - confirm whether that is intentional.
old_values = {
    "msInstrumentModel": "Possible values are listed under the instrument model term.OtherMsInstrumentModel,Unknown,Not collected,Not applicable,Not Specified",
    "modificationParameters": "Possible values are listed under modification parameters,OtherModificationParameters,Unknown,Not collected,Not applicable,Not Specified",
    "cleavageAgents": "Possible values are listed under the cleavage agent nameOtherCleavageAgents,Unknown,Not collected,Not applicable,Not Specified	",
}


# fixing values: these attributes become free-text fields - their valid-value
# list is blanked and their validation rule is set to "str".
for attribute_name in ("msInstrumentModel", "modificationParameters", "cleavageAgents"):
    is_target = dm["Attribute"] == attribute_name
    dm.loc[is_target, "Valid Values"] = ""
    dm.loc[is_target, "Validation Rules"] = "str"

# cleavageAgents additionally has its parent blanked.
dm.loc[dm["Attribute"] == "cleavageAgents", "Parent"] = ""

In [19]:
# Remove rows whose attribute name is itself placeholder text
# ("Possible values are ..."), then renumber the index from zero.
is_placeholder = dm["Attribute"].str.contains("Possible values are")
placeholder_labels = dm.loc[is_placeholder].index
dm = dm.drop(index=placeholder_labels).reset_index(drop=True)

In [20]:
# fix measurement units and change to just units
is_measurement_unit = dm["Module"].eq("Measurement Unit")
dm.loc[is_measurement_unit, "Module"] = "Unit"

# Evaluated after the rename so the freshly renamed rows are included.
is_unit = dm["Module"].eq("Unit")
dm.loc[is_unit, "Type"] = "NUMERIC"

In [21]:
# Inspect the existing attributes whose name contains "study" (this cell only
# displays rows; it does not modify the model).
mentions_study = dm["Attribute"].str.contains("study")
dm[mentions_study]

Unnamed: 0.1,Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Type,Ontology
158,158,studyCode,"Unique identifier for the study, assigned by t...","LC,LG,ILO,LLFS",,,True,DataProperty,,Sage Bionetworks,,Metadata,,
596,596,study pool,,,,,False,ValidValue,,,,Unspecified,,
646,646,studyName,Name of studies found in project,"MRGWAS,ELPSCRNA,Aging-PheWAS,Organoid scRNAseq",,,,validValues,,,,Unspecified,,


In [22]:
# Same lookup as the previous cell: attributes whose name contains "study".
study_name_matches = dm["Attribute"].str.contains("study")
dm[study_name_matches]

Unnamed: 0.1,Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Type,Ontology
158,158,studyCode,"Unique identifier for the study, assigned by t...","LC,LG,ILO,LLFS",,,True,DataProperty,,Sage Bionetworks,,Metadata,,
596,596,study pool,,,,,False,ValidValue,,,,Unspecified,,
646,646,studyName,Name of studies found in project,"MRGWAS,ELPSCRNA,Aging-PheWAS,Organoid scRNAseq",,,,validValues,,,,Unspecified,,


In [23]:
# Later get from synapse
# Single-row definition of the studyName attribute (one list entry per column).
new_attribute = {
    "Attribute": ["studyName"],
    "Description": ["Name of studies found in project"],
    "Valid Values": ["MRGWAS,ELPSCRNA,Aging-PheWAS,Organoid scRNAseq"],
    "DependsOn": [np.nan],
    "Properties": [np.nan],
    "Required": [np.nan],
    "Parent": ["validValues"],
    "DependsOn Component": [np.nan],
    "Source": [np.nan],
    "Validation Rules": [np.nan],
    "Module": [np.nan],
    "Type": [np.nan],
    "Ontology": [np.nan],
}

# Append the row only when the attribute is not in the model yet. An
# unconditional concat adds a second studyName row every time the notebook is
# run against a model that already contains it.
# ignore_index=True gives the new row the next free index label instead of a
# second label 0, keeping the index unique for later label-based operations.
if not dm["Attribute"].isin(new_attribute["Attribute"]).any():
    dm = pd.concat(
        [dm, pd.DataFrame.from_dict(new_attribute)],
        ignore_index=True,
    )

In [24]:
# Normalise the capitalisation of Parent values (old spelling -> new spelling).
# Values that are not listed here are left exactly as they are.
recode_parent = dict(
    [
        ("dataProperty", "DataProperty"),
        ("specification", "Specification"),
        ("validValue", "ValidValue"),
        ("template", "Template"),
    ]
)

recoded_parent = dm["Parent"].replace(recode_parent)
dm["Parent"] = recoded_parent

'Metadata', 'Unspecified', 'Other', 'Omics', 'Assay Metadata',
'Instrument', 'Unit', 'Assay', 'Sample Type', 'Model Organism',
'Ontology', 'Project', 'Organ', 'Boolean', 'Tissue', 'Template',
'BaseAnnotation',


In [25]:
# Merge the organ, tissue and sample-type modules into "Sample Metadata".
sample_modules = ["Organ", "Tissue", "Sample Type"]
is_sample_module = dm["Module"].isin(sample_modules)
dm.loc[is_sample_module, "Module"] = "Sample Metadata"

In [26]:
# Alphabetical list of the distinct module names after the merges above.
remaining_modules = dm["Module"].dropna().unique()
np.sort(remaining_modules)

array(['Assay', 'BaseAnnotation', 'Instrument', 'Metadata',
       'Model Organism', 'Ontology', 'Other', 'Project',
       'Sample Metadata', 'Template', 'Unit', 'Unspecified',
       'Valid Value'], dtype=object)

In [27]:
# Fold the "Omics" module into "Assay".
is_omics_module = dm["Module"].isin(["Omics"])
dm.loc[is_omics_module, "Module"] = "Assay"

In [28]:
# Fold the "Omnics" and "Assay Metadata" modules into "Assay" as well.
assay_aliases = ["Omnics", "Assay Metadata"]
is_assay_alias = dm["Module"].isin(assay_aliases)
dm.loc[is_assay_alias, "Module"] = "Assay"

In [29]:
# Columns to keep, in this order, when the data model is selected for writing.
keep_cols = [
    # what the attribute is
    "Attribute", "Description", "Valid Values",
    # how it relates to other attributes
    "DependsOn", "Properties", "Required", "Parent", "DependsOn Component",
    # provenance and validation
    "Source", "Validation Rules",
    # classification
    "Module", "Type", "Ontology",
]

In [30]:
# write out data model
# Keep only the exported columns, keep the first row for each attribute name,
# and renumber the index. Written as one chained expression that returns a new
# frame: calling drop_duplicates/reset_index with inplace=True on the
# column-subset copy returned by dm[keep_cols] can trigger pandas'
# SettingWithCopyWarning.
dm = (
    dm[keep_cols]
    .drop_duplicates(subset=["Attribute"])
    .reset_index(drop=True)
)

dm.to_csv("../EL.data.model.csv", index=False)

In [32]:
# Updates 2023-09-21
# Clear DependsOn on the row(s) whose attribute name is the literal "TRUE".
is_true_attribute = dm["Attribute"] == "TRUE"
dm.loc[is_true_attribute, "DependsOn"] = np.nan


def _unique_sorted_csv(parts):
    """Join the distinct entries of ``parts`` in sorted order, comma-separated."""
    return ",".join(sorted(np.unique(parts)))


def _stripped_csv(parts):
    """Join ``parts`` comma-separated after trimming whitespace on each entry."""
    return ",".join([part.strip() for part in parts])


# clean up source column: de-duplicate and sort the comma-separated sources
source_parts = dm["Source"].fillna("").str.split(",")
dm["Source"] = source_parts.apply(_unique_sorted_csv)

# trim whitespace around each comma-separated entry; missing cells become ""
for list_column in ("Valid Values", "DependsOn"):
    entry_parts = dm[list_column].fillna("").str.split(",")
    dm[list_column] = entry_parts.apply(_stripped_csv)

# Re-label every attribute whose name mentions "RFC" (case-insensitive) as an
# ontology entry. The mask is computed once, before the attribute names
# themselves are rewritten at the end of this block.
is_rfc = dm["Attribute"].str.contains(
    "RFC", regex=True, flags=re.IGNORECASE, na=False
)

rfc_column_values = {
    "Module": "Ontology",
    "Description": "External ontology used for populating values",
    "Parent": "Ontology",
    "Ontology": "Self",
}
for column_name, column_value in rfc_column_values.items():
    dm.loc[is_rfc, column_name] = column_value

# Keep only the part of the name before the first "(", without surrounding
# whitespace.
rfc_names = dm.loc[is_rfc, "Attribute"].str.split("(")
dm.loc[is_rfc, "Attribute"] = rfc_names.apply(lambda pieces: pieces[0].strip())

# Persist the updated model.
dm.to_csv("../EL.data.model.csv", index=False)

In [35]:
import yaml
import pathlib

# Read the notebook configuration; the model file names live under "file_names".
with open("./local_configs/notebook_config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Both model files are located one directory above the notebook.
configured_names = config["file_names"]
csv_model, json_model = (
    pathlib.Path("../" + configured_names[model_key]).resolve()
    for model_key in ("csv_model", "json_model")
)

In [36]:
# convert csv model to jsonld
!schematic schema convert {csv_model} --output_jsonld {json_model}

Starting schematic...
Done adding requirements and value ranges to attributes
The Data Model was created and saved to '/Users/nlee/Documents/Projects/ELITE/ELITE-data-models/EL.data.model.jsonld' location.
