In [120]:
import os
import pandas as pd
import numpy as np
import re
import datetime
import pathlib

# custom package(s)
from utils import utils, synapse_login

import yaml
import pathlib

# Read the notebook configuration, then turn the two model file names it
# provides into absolute paths (each name is prefixed with "../" first).
with open("./local_configs/notebook_config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

csv_model, json_model = (
    pathlib.Path("../" + config["file_names"][model_key]).resolve()
    for model_key in ("csv_model", "json_model")
)

In [121]:
# Resolve the backup directory and the data model CSV to absolute paths.
output_dir = pathlib.Path("../backups").resolve()
dm_name = "../EL.data.model.csv"
dm_path = pathlib.Path(dm_name).resolve()

# Load the data model through the project helper. Judging by its name it also
# writes a backup copy into output_dir -- the helper is not shown here, confirm.
dm = utils.load_and_backup_dm(dm_path, output_dir)

In [123]:
# Clean up the text columns of the data model.
#
# Both patterns below are regular expressions, so regex=True is passed
# explicitly: since pandas 2.0 Series.str.replace defaults to regex=False and
# would otherwise search for the pattern text literally, leaving every value
# untouched.
pattern = r"\(|\)|\.|-|_|:|;|/"
text_cols = ["Attribute", "DependsOn", "Valid Values"]
dm[text_cols] = dm[text_cols].apply(
    # Applied column by column (default axis). The object cast keeps the .str
    # accessor usable for a column that was read in as all-NaN floats.
    lambda col: col.astype(object)
    .str.replace(pattern, "", regex=True)
    .str.strip()
    .str.replace(r"\s+", "", regex=True)
)
dm["Attribute"] = dm["Attribute"].str.strip('"')

# Plain-text rename inside the Parent column.
dm["Parent"] = dm["Parent"].str.replace("Template", "DataType", regex=False)

# Export the measurement-unit rows to their own file, then remove them from
# the model. Rows with a missing Parent are not measurement units and stay.
is_measurement_unit = dm["Parent"] == "MeasurementUnit"
dm.loc[is_measurement_unit].to_csv("./measurementUnits.csv")
dm = dm.loc[~is_measurement_unit].reset_index(drop=True)

# Remove the text "DataType" wherever it appears in Properties.
dm["Properties"] = dm["Properties"].str.replace("DataType", "", regex=False)

In [124]:
# Discard rows whose Properties is exactly "ValidValue" (rows with a missing
# Properties value are kept), then sort by the Attribute column and renumber.
dm = (
    dm.loc[~dm["Properties"].eq("ValidValue")]
    .sort_values("Attribute")
    .reset_index(drop=True)
)

In [125]:
# Display the filtered, sorted data model for visual inspection.
dm

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
0,BiospecimenHuman,Metadata template for Biospecimen human,,"Component,cellType,fastingState,individualID,i...",,True,DataType,,,
1,BiospecimenNonHuman,Metadata template for Biospecimen nonHuman,,"Component,assay,cellType,dataGenerationSite,fa...",,True,DataType,,,
2,Component,Used to generate manifests,,,,True,,,,
3,DataSubtype,"Further qualification of dataType, which may b...","dataMatrix,metadata,normalized,processed,raw,r...",,DataProperty,False,BaseAnnotation,,sage.annotations-experimentalData.dataSubtype-...,
4,Genotyping,Metadata template for genotyping,,"Component,measurementTechnique,reagentCatalogN...",,True,DataType,,,
...,...,...,...,...,...,...,...,...,...,...
174,useTreatment,Was a treatment applied to the sample,"False,True",,unspecified,False,unspecified,,"ImmPort,Sage Bionetworks",
175,vacuumPressure,The recorded vacuum pressure value,,,DataProperty,False,unspecified,,Proposed minimum metadata relative to mass spe...,regex search ([0-9]+\.[0-9]*.)|([0-9]+)
176,vacuumPressureUnit,Unit of vacuum pressure value,,,DataProperty,False,unspecified,,Proposed minimum metadata relative to mass spe...,
177,valueReported,The count or gene count for the transcript,,,unspecified,True,unspecified,,Sage Bionetworks,regex search ([0-9]+\.[0-9]*.)|([0-9]+)


In [4]:
# Columns retained in the data model, in this order; any other column is
# dropped by the selection below.
base_cols = [
    "Attribute", "Description", "Valid Values", "DependsOn", "Properties",
    "Required", "Parent", "DependsOn Component", "Source", "Validation Rules",
]

dm = dm.loc[:, base_cols]

In [59]:
# Blank out the validation rules of the "age" and "race" attributes.
dm.loc[dm["Attribute"].isin(["age", "race"]), "Validation Rules"] = np.nan

In [60]:
# Split the comma-separated DependsOn cell of the first IndividualHuman row
# into a list of attribute names.
test = (
    dm.loc[dm["Attribute"].eq("IndividualHuman"), "DependsOn"]
    .iloc[0]
    .split(",")
)

In [67]:
# Display the attribute names that IndividualHuman depends on.
test

['individualID',
 'cohort',
 'project',
 'fieldCenterCode',
 'visitCode',
 'countryCode',
 'consentGroupID',
 'speciesGroup',
 'sex',
 'race',
 'ethnicity',
 'ethnicGroupCode',
 'age',
 'diagnosisStatus',
 'diagnosis',
 'Component']

In [70]:
# Select the IndividualHuman row together with every attribute named in
# `test`, so the number of names can be compared with the number of rows found.
individual_human_attrs = [*test, "IndividualHuman"]
individual_human_rows = dm.loc[dm["Attribute"].isin(individual_human_attrs)]

print(len(test))
print(individual_human_rows.shape)

individual_human_rows

16
(16, 10)


Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
2,Component,Used to generate manifests,,,,,,,,
6,IndividualHuman,Metadata template for Individual Human,,"individualID,cohort,project,fieldCenterCode,vi...",MetadataType,False,DataType,,,
20,age,Age of the individual (age in years of the ind...,,,unspecified,True,unspecified,,Sage Bionetworks,
34,cohort,Name of the cohort the individual belongs to,"CardiovascularHealthStudyCHS,Centenarian,Denma...",,unspecified,True,unspecified,,Sage Bionetworks,
36,consentGroupID,"Indicate the consent group for the individual,...",123,,unspecified,True,unspecified,,Sage Bionetworks,
41,countryCode,Indicate the geographic region for the individ...,"Notapplicable,Notcollected,OtherCountryCode,Un...",,unspecified,True,unspecified,,https://wits.worldbank.org/countryprofile/meta...,
48,diagnosis,Indicate the disease or condition.,,,DataProperty,False,unspecified,,"https://www.ebi.ac.uk/ols4/ontologies/hp,https...",
49,diagnosisStatus,Whether the individual has been diagnosed with...,"False,Notapplicable,Notcollected,TrueDiagnosis...",,unspecified,True,unspecified,,Sage Bionetworks,
56,ethnicGroupCode,A coded value specifying the self-declared eth...,,,DataProperty,False,unspecified,,https://ncithesaurus.nci.nih.gov/ncitbrowser/p...,
57,ethnicity,Ethnicity of individual,"HispanicorLatinoEthnicity,NotHispanicorLatinoE...",,unspecified,True,unspecified,,"Sage Bionetworks,https://www.synapse.org/#!Syn...",


In [62]:
# Replace every missing value in the data model with an empty string.
dm = dm.fillna(value="")

In [69]:
# Turn empty strings back into missing values (NaN).
dm = dm.replace(to_replace="", value=np.nan)

In [64]:
# Normalise the Required flags to upper-case text (e.g. True -> "TRUE").
# Missing values are left missing: a bare astype(str) turns NaN into the text
# "nan", which upper-cases to "NAN" and would be written out as a real value.
required = dm["Required"]
dm["Required"] = required.astype(str).str.upper().where(required.notna())

In [24]:
# Clear the validation rule for individualID. NaN is used as the missing-value
# marker, the same one this notebook uses when clearing the age and race
# rules, so the column does not mix "" and NaN for "no rule". Both are written
# as an empty field by DataFrame.to_csv.
dm.loc[dm["Attribute"] == "individualID", "Validation Rules"] = np.nan

In [42]:
# Pin the ordered, comma-separated attribute list of the IndividualHuman row.
# The adjacent string pieces are concatenated into one value by the parser.
dm.loc[dm["Attribute"].eq("IndividualHuman"), "DependsOn"] = (
    "individualID,cohort,project,fieldCenterCode,visitCode,"
    "countryCode,consentGroupID,speciesGroup,sex,race,"
    "ethnicity,ethnicGroupCode,age,diagnosisStatus,diagnosis,Component"
)

In [91]:
# check all dependsOn are in attributes
# Collect the distinct attribute names referenced by any DependsOn cell.
# Each cell is split on its own and blank tokens are skipped, so an empty
# column or a stray trailing comma cannot produce an empty "attribute" name
# (joining everything first and splitting afterwards would yield '').
depends_on_names = [
    name.strip()
    for cell in dm["DependsOn"].dropna()
    for name in cell.split(",")
    if name.strip()
]
# An explicit str dtype keeps the result a string array even when it is empty.
dependsOn = np.unique(np.array(depends_on_names, dtype=str))

In [100]:
# Find the DependsOn names that have no Attribute row of their own.
# The attribute names are put in a set once, so each lookup is a hash check
# instead of a fresh scan of the whole column.
known_attributes = set(dm["Attribute"])
not_found = []

for d in dependsOn:
    if d not in known_attributes:
        # Same report line as before: the membership result, then the name.
        print(False, "--", d)
        not_found.append(d)

False -- hasIonizationSource?
False -- measurementTechnique
False -- project
False -- tissue
False -- useReagent?
False -- useTreatment?


In [110]:
# One placeholder row per missing attribute, each marked as not required.
new_df = pd.DataFrame(not_found, columns=["Attribute"]).assign(Required="FALSE")

In [112]:
# Append the placeholder rows for the missing attributes. ignore_index=True
# gives the combined frame a fresh 0..n-1 index; without it the appended rows
# keep their own labels 0, 1, ... which already exist in dm, so a label-based
# lookup or drop would hit two rows at once.
dm = pd.concat([dm, new_df], ignore_index=True)

In [114]:
# Restrict the model to the IndividualHuman row and the attributes in `test`.
dm = dm[dm["Attribute"].isin([*test, "IndividualHuman"])]

In [126]:
# Rows whose Attribute repeats that of an earlier row; an empty result means
# every attribute name is unique.
dm.loc[dm.duplicated(subset="Attribute")]

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules


In [127]:
# write out new model
# The row index is not written to the file.
# NOTE(review): the conversion cell reads the CSV path from the notebook
# config (csv_model) -- confirm that it points at the file written here.
dm.to_csv(path_or_buf="../EL.data.model.test.csv", index=False)

In [None]:
# convert csv model to jsonld
!schematic schema convert {csv_model} --output_jsonld {json_model}