In [1]:
import os
import pandas as pd
import numpy as np
import re
import datetime
import pathlib

# custom package(s)
from utils import utils, synapse_login

import yaml
import pathlib

# Read the notebook configuration and derive the resolved locations of the
# CSV and JSON-LD versions of the data model (paths are relative to the
# parent directory of this notebook).
with open("./local_configs/notebook_config.yaml", "r") as f:
    config = yaml.safe_load(f)

file_names = config["file_names"]
csv_model, json_model = (
    pathlib.Path("../" + file_names[key]).resolve()
    for key in ("csv_model", "json_model")
)

In [4]:
# Log in through the project's synapse_login helper. The returned object is
# used later in this notebook to run a Synapse table query (syn.tableQuery).
# NOTE(review): presumably a synapseclient.Synapse instance - confirm in utils.
syn = synapse_login.main()


UPGRADE AVAILABLE

A more recent version of the Synapse Client (3.1.0) is available. Your version (2.7.0) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 3.1.0 release notes

https://python-docs.synapse.org/build/html/news.html



Welcome, Nicholas Lee!



In [2]:
# Where backups go, and which data-model CSV is being edited.
output_dir = pathlib.Path("../backups").resolve()
dm_name = "../EL.data.model.csv"
dm_path = pathlib.Path(dm_name).resolve()

# Load the data model into `dm` via the project helper, which is given the
# backup directory as well.
# NOTE(review): the exact backup behaviour lives in utils - confirm there.
dm = utils.load_and_backup_dm(dm_path, output_dir)
# dm = dm.drop(columns=["Unnamed: 0"])

In [10]:
# Sorted list of the distinct non-null values in the "module" column.
# NOTE(review): every other cell in this notebook addresses this column as
# "Module" (capital M) - confirm which spelling the loaded CSV actually uses.
np.sort(dm["module"].dropna().unique())

array(['analysis', 'experimentalData', 'neuro', 'ngs', 'sageCommunity'],
      dtype=object)

Updates 2023-09-06


In [None]:
# Attributes that need to accept numerical values: their enumerated valid
# values are cleared and replaced by a regex validation rule.
# NOTE(review): `dm_elite` is not defined in any cell visible in this
# notebook - confirm where it is supposed to come from.
vv = ""
vr = "regex search ^[0-9]+|(Unknown)|(Not collected)|(Not applicable)|(Not Specified)"
atrs_to_fix = ["sequencingBatchID", "libraryVersion"]

for a in atrs_to_fix:
    # First row whose Attribute equals `a`; raises IndexError if there is none.
    fix_index = dm_elite.query("Attribute == @a").index[0]
    dm_elite.loc[fix_index, "Valid Values"] = vv
    dm_elite.loc[fix_index, "Validation Rules"] = vr

In [None]:
# Display the rows for the attributes listed in atrs_to_fix so the updated
# "Valid Values" / "Validation Rules" can be inspected.
dm_elite.query("Attribute in @atrs_to_fix")

In [None]:
# Updates 2023-09-23

In [55]:
# Give every "Instrument" row a fixed description.
is_instrument = dm["Module"].isin(["Instrument"])
dm.loc[is_instrument, "Description"] = "Instrument Model Name"

# "Study" rows get a fixed description and are renamed to the "Project" module.
# The mask is taken before the rename so both assignments hit the same rows.
is_study = dm["Module"].isin(["Study"])
dm.loc[is_study, "Description"] = "Project name"
dm.loc[is_study, "Module"] = "Project"

In [71]:
# Pull the grantNumber column from the Synapse table.
grants = syn.tableQuery("SELECT * FROM syn51209786").asDataFrame()["grantNumber"]

# Flatten every row's entries into one comma-separated string of trimmed IDs.
# NOTE(review): assumes each grantNumber cell is an iterable of strings
# (e.g. a list) - confirm against the table schema.
grantIds = ",".join([grant_id.strip() for row in grants for grant_id in row])

# Every attribute whose name contains "grant" (case-insensitive) gets the
# full list as its valid values.
is_grant_attribute = dm["Attribute"].str.contains("grant", flags=re.IGNORECASE)
dm.loc[is_grant_attribute, "Valid Values"] = grantIds

In [57]:
# Attributes whose name starts with "other" or "specify" (case-insensitive)
# are filed under the "Other" module.
starts_with_other_or_specify = dm["Attribute"].str.contains(
    "^other|^specify", regex=True, flags=re.IGNORECASE
)
dm.loc[starts_with_other_or_specify, "Module"] = "Other"

# Attributes whose name starts with "specify" additionally get the
# "Specification" parent.
starts_with_specify = dm["Attribute"].str.contains(
    "^specify", regex=True, flags=re.IGNORECASE
)
dm.loc[starts_with_specify, "Parent"] = "Specification"

In [58]:
# Any module label mentioning "Race" or "Ethnicity" (case-insensitive) is
# collapsed into the "Metadata" module; rows with a missing module are skipped.
is_race_or_ethnicity = dm["Module"].str.contains(
    "Race|Ethnicity", regex=True, flags=re.IGNORECASE, na=False
)
dm.loc[is_race_or_ethnicity, "Module"] = "Metadata"

In [59]:
# Inspect rows where the text "Possible values are" appears, either as the
# attribute name or inside the valid values. Display only; nothing is changed.
needle = "Possible values are"
with pd.option_context("display.max_colwidth", None):
    in_attribute = dm["Attribute"].str.contains(needle)
    display(dm[in_attribute])
    in_valid_values = dm["Valid Values"].str.contains(needle, na=False)
    display(dm[in_valid_values])

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Type,Ontology,UsedIn


Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Type,Ontology,UsedIn


In [60]:
# fixing more values

# Record of the "Valid Values" strings these three attributes carried, keyed
# by attribute name. Kept for reference: no visible cell in this notebook
# reads `old_values`.
old_values = {
    "msInstrumentModel": "Possible values are listed under the instrument model term.OtherMsInstrumentModel,Unknown,Not collected,Not applicable,Not Specified",
    "modificationParameters": "Possible values are listed under modification parameters,OtherModificationParameters,Unknown,Not collected,Not applicable,Not Specified",
    "cleavageAgents": "Possible values are listed under the cleavage agent nameOtherCleavageAgents,Unknown,Not collected,Not applicable,Not Specified	",
}

# These attributes lose their enumerated valid values and are validated as
# plain strings instead.
free_text_attrs = ["msInstrumentModel", "modificationParameters", "cleavageAgents"]
is_free_text = dm["Attribute"].isin(free_text_attrs)
dm.loc[is_free_text, "Valid Values"] = ""
dm.loc[is_free_text, "Validation Rules"] = "str"

# cleavageAgents additionally has its parent cleared.
dm.loc[dm["Attribute"] == "cleavageAgents", "Parent"] = ""

In [61]:
# Rows whose "attribute" is really a comment ("Possible values are ...")
# found in the data model are not actual attributes: remove them and
# renumber the index.
comment_rows = dm.loc[dm["Attribute"].str.contains("Possible values are")].index
dm = dm.drop(index=comment_rows).reset_index(drop=True)

In [62]:
# Rename the "Measurement Unit" module to "Unit", then mark every row of the
# "Unit" module (renamed or pre-existing) as NUMERIC.
was_measurement_unit = dm["Module"].eq("Measurement Unit")
dm.loc[was_measurement_unit, "Module"] = "Unit"

is_unit = dm["Module"].eq("Unit")
dm.loc[is_unit, "Type"] = "NUMERIC"

In [65]:
# Capitalize the first letter of the known lower-camel-case parent labels
# (e.g. "dataProperty" -> "DataProperty"); other Parent values are untouched.
recode_parent = {
    name: name[0].upper() + name[1:]
    for name in ("dataProperty", "specification", "validValue", "template")
}

dm["Parent"] = dm["Parent"].replace(recode_parent)

'Metadata', 'Unspecified', 'Other', 'Omics', 'Assay Metadata',
'Instrument', 'Unit', 'Assay', 'Sample Type', 'Model Organism',
'Ontology', 'Project', 'Organ', 'Boolean', 'Tissue', 'Template',
'BaseAnnotation',


In [66]:
# Merge the organ, tissue and sample-type modules into "Sample Metadata".
sample_modules = ["Organ", "Tissue", "Sample Type"]
dm.loc[dm["Module"].isin(sample_modules), "Module"] = "Sample Metadata"

In [67]:
# Sorted list of the distinct non-null Module labels, to check the recodes above.
np.sort(dm["Module"].dropna().unique())

array(['Assay', 'BaseAnnotation', 'Instrument', 'Metadata',
       'Model Organism', 'Ontology', 'Other', 'Project',
       'Sample Metadata', 'Template', 'Unit', 'Unspecified',
       'Valid Value'], dtype=object)

In [69]:
# Fold the omics and assay-metadata modules into "Assay".
# The module label is spelled "Omics" in the module listing shown earlier in
# this notebook; the previous spelling "Omnics" does not appear there, so
# those rows were never recoded.
dm.loc[dm["Module"].isin(["Omics", "Assay Metadata"]), "Module"] = "Assay"

In [73]:
# Prepare the data model for writing: keep only the columns in `keep_cols`,
# keep the first row for each Attribute, and renumber the index.
# NOTE(review): `keep_cols` is not defined in any cell visible in this
# notebook - confirm where it is supposed to come from.
dm = (
    dm[keep_cols]
    .drop_duplicates(subset=["Attribute"])
    .reset_index(drop=True)
)

In [74]:
# Updates 2023-09-21
# The "TRUE" attribute must not depend on anything.
dm.loc[dm["Attribute"] == "TRUE", "DependsOn"] = np.nan

# Source: missing -> "", then the comma-separated entries are de-duplicated
# and sorted (entries are not trimmed).
dm["Source"] = (
    dm["Source"]
    .fillna("")
    .str.split(",")
    .map(lambda entries: ",".join(sorted(np.unique(entries))))
)

# Valid Values and DependsOn: missing -> "", then surrounding whitespace is
# trimmed from each comma-separated entry (order and duplicates are kept).
for list_column in ("Valid Values", "DependsOn"):
    dm[list_column] = (
        dm[list_column]
        .fillna("")
        .str.split(",")
        .map(lambda entries: ",".join(entry.strip() for entry in entries))
    )

# Rows whose attribute name contains "RFC" (case-insensitive) are recast as
# ontology entries. The mask is computed once, before the attribute names
# themselves are rewritten on the last line.
is_rfc = dm["Attribute"].str.contains(
    "RFC", regex=True, flags=re.IGNORECASE, na=False
)

rfc_overrides = {
    "Module": "Ontology",
    "Description": "External ontology used for populating values",
    "Parent": "Ontology",
    "Ontology": "Self",
}
for column, value in rfc_overrides.items():
    dm.loc[is_rfc, column] = value

# Keep only the part of the attribute name before the first "(", trimmed.
dm.loc[is_rfc, "Attribute"] = (
    dm.loc[is_rfc, "Attribute"].str.split("(").str[0].str.strip()
)

In [10]:
# Replace "(", ")", the two-character sequence "|?", "/" and runs of
# whitespace with "_" in both the attribute names and the valid values.
# The pattern is a raw string so the regex escapes (\(, \s, ...) are not
# parsed as (invalid) Python string escapes; the regex itself is unchanged.
# NOTE(review): `\|\?` only matches a literal "|" immediately followed by
# "?" - confirm whether `\||\?` (either character) was intended.
dm[["Attribute", "Valid Values"]] = dm[["Attribute", "Valid Values"]].apply(
    lambda col: col.str.replace(r"\(|\)|\|\?|/|\s+", "_", regex=True)
)

In [13]:
# Preview (display only - the result is not assigned back to `dm`) of the
# attribute names and valid values with leading/trailing underscores removed.
# The previous pattern "_^" could never match: "^" only matches at the start
# of the string, so nothing can precede it.
# NOTE(review): trimming underscores at both ends is presumed to be the
# intent, mirroring the per-value strip("_") applied to "Valid Values" later.
dm[["Attribute", "Valid Values"]].apply(lambda col: col.str.strip("_"))

Unnamed: 0,Attribute,Valid Values
0,10x,
1,10x_Visium_Spatial_Gene_Expression,
2,AFU,
3,AI,
4,AIBL_pool,
...,...,...
677,modelSystemName,"3xTg-AD,_5XFAD,_AB42,_Abca7A1527GAPOE4Trem2R47..."
678,modelSystemType,"animal,_cerebral_organoid,_immortalized_cell_l..."
679,platform,"Affy5.0,_Affy6.0,_Affymetrix_Human_Gene_1.0_ST..."
680,protocol,


In [18]:
# Trim leading/trailing underscores from every comma-separated valid value.
# Missing cells are filled with "" (as in the earlier "Valid Values"
# clean-up) rather than the string "None", which would otherwise be written
# to the CSV as if it were a real valid value.
dm["Valid Values"] = (
    dm["Valid Values"]
    .fillna("")
    .str.split(",")
    .apply(lambda values: ",".join(value.strip("_") for value in values))
)

In [20]:
# Spot-check the cleaned valid values of the row labelled 677
# (shown as modelSystemName in the preview output above).
dm.loc[677, "Valid Values"]

'3xTg-AD,5XFAD,AB42,Abca7A1527GAPOE4Trem2R47H,Abca7KO,Abca7V1599M_rs117187003,Abi3S209F_Aduci,APOE2KI,APOE3KI,APOE4KI,APOE4Trem2R47H,ApoEKO,APPE693Q,AppKOAPOE4Trem2R47H,APPKM650,APPKM670,APPKM670-671NL-PSEN1deltaexon9,APPPS1,APPS1_Plexin-B1-KO,B6.Clu,B6.Gfap-APOE4,Bin1.B6,Bin1K358R_Aduci,BRI2-AB42,Cd2ap.B6,Ceacam1KO,Clasp2L163PSNP,CRND8,C57BL6J,hAbetaKI,hAPPAPOE4Trem2R47H,hCR1KIAPOE4Trem2,hTau,hTauTrem2,Il1rapKO,Kif21bT82TSNP,MAPT-P301K,Meox2KOHET,MthfrC677TSNP,Mtmr4V297GSNP,PicalmH458R,Plcg2KO,Plcg2M28LSNP,Plexin-B1-KO,rTg4510,Snx1D465NSNP,Sorl1A528TSNP,Spi1rs1377416,TAUPS19,TAUPS19_Plexin-B1-KO,TREM2,Trem2flox,Trem2KO,Trem2R47H,Trem2R47H_NSS,Trem2Y38C,TYROBP,TYROBP_KO.KO,TYROBP_WT,TYROBP_WT.KO'

In [11]:
# write out new model
# Overwrites the same relative path the model was loaded from (dm_name),
# without the index column.
# NOTE(review): the path is hard-coded here, while the convert step reads
# `csv_model` from the config - confirm both point at the same file.
dm.to_csv("../EL.data.model.csv", index=False)

In [3]:
# convert csv model to jsonld
# Shells out to the schematic CLI; {csv_model} and {json_model} are the
# resolved paths built from notebook_config.yaml in the first cell.
# NOTE(review): the recorded output of this cell is a traceback - the
# conversion did not complete in the saved run.
!schematic schema convert {csv_model} --output_jsonld {json_model}

Starting schematic...
Traceback (most recent call last):
  File "/Users/nlee/Library/Caches/pypoetry/virtualenvs/cohort-builder-data-dictionary-fOfjgXmU-py3.10/bin/schematic", line 8, in <module>
    sys.exit(main())
  File "/Users/nlee/Library/Caches/pypoetry/virtualenvs/cohort-builder-data-dictionary-fOfjgXmU-py3.10/lib/python3.10/site-packages/click/core.py", line 1157, in __call__
    return self.main(*args, **kwargs)
  File "/Users/nlee/Library/Caches/pypoetry/virtualenvs/cohort-builder-data-dictionary-fOfjgXmU-py3.10/lib/python3.10/site-packages/click/core.py", line 1078, in main
    rv = self.invoke(ctx)
  File "/Users/nlee/Library/Caches/pypoetry/virtualenvs/cohort-builder-data-dictionary-fOfjgXmU-py3.10/lib/python3.10/site-packages/click/core.py", line 1688, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/Users/nlee/Library/Caches/pypoetry/virtualenvs/cohort-builder-data-dictionary-fOfjgXmU-py3.10/lib/python3.10/site-packages/click/core.py", line