In [39]:
import os
import pandas as pd
import numpy as np
import re
import datetime
import pathlib

# custom package(s)
from utils import utils, synapse_login

In [40]:
# Run the project's Synapse login helper and keep whatever it returns in `syn`.
# NOTE(review): `syn` is not referenced again in the cells visible here — confirm it is still needed.
syn = synapse_login.main()

Welcome, Nicholas Lee!



In [41]:
# Location of the data model CSV, relative to the notebook's working directory.
dm_name = "../EL.data.model.csv"

# Join only the file name onto the backup directory. Joining the full relative
# path ("../backups" + "../EL.data.model.csv") collapses back onto the data
# model itself once resolved, so the backup path would be the very file that is
# being edited.
# NOTE(review): assumes ../backups exists or is created by the loader — confirm.
output_path = os.path.join("../backups", os.path.basename(dm_name))

# Absolute paths, so later cells do not depend on the working directory.
dm_path = pathlib.Path(dm_name).resolve()
output_path = pathlib.Path(output_path).resolve()

In [42]:
# Load the data model through the project helper, handing it both the source path and the backup path.
# NOTE(review): judging by its name the helper presumably also writes a backup copy to output_path — confirm in utils.
dm = utils.load_and_backup_dm(dm_path, output_path)

In [43]:
# Attributes that have no module name yet are labelled "Unspecified"
# (an attempt to remove the bracket tab on the webpage).
module_missing = dm["Module"].isnull()
dm.loc[module_missing, "Module"] = "Unspecified"

In [67]:
# List the distinct, non-missing module names in sorted order.
# NOTE(review): this cell's execution count (67) is higher than that of the cells below it,
# so the saved output was produced after those later cells had already run.
np.sort(dm["Module"].dropna().unique())

array(['Assay', 'Assay Metadata', 'BaseAnnotation', 'Instrument',
       'Metadata', 'Model Organism', 'Omics', 'Ontology', 'Other',
       'Project', 'Sample Metadata', 'Template', 'Unit', 'Unspecified',
       'Valid Value'], dtype=object)

In [46]:
# Rows filed under the "Boolean" module are moved to the "Valid Value" module.
in_boolean_module = dm["Module"].eq("Boolean")
dm.loc[in_boolean_module, "Module"] = "Valid Value"

In [47]:
# Show the rows of the Project, Instrument and Assay Metadata modules, ordered by module.
modules_to_inspect = ["Project", "Instrument", "Assay Metadata"]
in_inspected_modules = dm["Module"].isin(modules_to_inspect)
dm.loc[in_inspected_modules].sort_values(by="Module")

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Type,Ontology
52,52,52,reagentID(s),"One or more identifiers, separated by a semico...",,,,False,DataProperty,,"Sage Bionetworks,ImmPort",,Assay Metadata,,
53,53,53,reagentName,The reagent name is an alternative to the Reag...,,,,False,DataProperty,,"Sage Bionetworks,ImmPort",,Assay Metadata,,
54,54,54,reagentManufacturer,The manufacturer is the source of a reagent an...,,,,False,DataProperty,,"Sage Bionetworks,ImmPort",,Assay Metadata,,
55,55,55,reagentCatalogNumber,"If the assay reagent is a commercial product, ...",,,,False,DataProperty,,"Sage Bionetworks,ImmPort",,Assay Metadata,,
56,56,56,reagentLotNumber,The lot number is often provided by a reagent ...,,,,False,DataProperty,,"Sage Bionetworks,ImmPort",,Assay Metadata,,
57,57,57,reagentWeblink,An internet address that may provide details o...,,,,False,DataProperty,,"Sage Bionetworks,ImmPort",,Assay Metadata,,
58,58,58,reagentContact,The contact information is particularly helpfu...,,,,False,DataProperty,,"Sage Bionetworks,ImmPort",,Assay Metadata,,
342,342,342,Illumina HumanOmniExpress-24 v1.2 BeadChip,Instrument Model Name,,,,False,ValidValue,,,,Instrument,,
343,343,343,Illumina Infinium MethylationEPIC BeadChip v1....,Instrument Model Name,,,,False,ValidValue,,,,Instrument,,
344,344,344,Illumina Infinium MethylationEPIC BeadChip v2....,Instrument Model Name,,,,False,ValidValue,,,,Instrument,,


In [48]:
# Give every row of a module the same generic description.
instrument_rows = dm["Module"].eq("Instrument")
study_rows = dm["Module"].eq("Study")

dm.loc[instrument_rows, "Description"] = "Instrument Model Name"
dm.loc[study_rows, "Description"] = "Project name"

In [49]:
# Fold the "Study" module into "Project".
in_study_module = dm["Module"].eq("Study")
dm.loc[in_study_module, "Module"] = "Project"

In [50]:
# Show the rows that have a DependsOn value.
has_dependency = dm["DependsOn"].notna()
dm.loc[has_dependency]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Type,Ontology
178,178,178,OtherSampleType,"If ""other"" is selected list the type of sample",,specifySampleType,,False,ValidValue,,Sage Bionetworks,,Other,,
179,179,179,OtherTechnologyPlatformVersion,"If ""other"" list the name of the platform version",,specifyPlatformVersion,,False,ValidValue,,Sage Bionetworks,,Other,,
180,180,180,OtherPlatformLocation,"If ""other"" list the name of the platform location",,specifyPlatformLocation,,False,ValidValue,,Sage Bionetworks,,Other,,
181,181,181,OtherDnaBatchSizeUnit,"If ""other"" list unit of measure",,specifyDNABatchSizeUnit,,False,ValidValue,,Sage Bionetworks,,Other,,
182,182,182,OtherLibraryPrep,"If ""other"" list the name of the general strate...",,specifyLibraryPrep,,False,ValidValue,,Sage Bionetworks,,Other,,
183,183,183,OtherLibraryPreparationMethod,"If ""other"" list the name of the library prepar...",,specifyLibraryPreparationMethod,,False,ValidValue,,Sage Bionetworks,,Other,,
184,184,184,OtherLibraryVersion,"If ""other"" list the name of the library version",,specifyLibraryVersion,,False,ValidValue,,Sage Bionetworks,,Other,,
185,185,185,OtherSequencingBatchSizeUnit,"If ""other"" list unit of measure",,specifySequencingBatchSizeUnit,,False,ValidValue,,Sage Bionetworks,,Other,,
186,186,186,OtherReadLengthUnits,"If ""other"" provide the unit of measure",,specifyReadLengthUnits,,False,ValidValue,,Sage Bionetworks,,Other,,
187,187,187,OtherRepositoryName,"If ""other"" list the name of the repository",,specifyRepositoryName,,False,ValidValue,,Sage Bionetworks,,Other,,


In [51]:
# TODO: pull grant information (this cell is a placeholder and contains no code yet)

In [52]:
# Fix Other values
# Attributes whose name starts with "other" or "specify" (case-insensitive) are
# grouped under the "Other" module. na=False treats a missing attribute name as
# a non-match; without it a single NaN leaves the mask non-boolean and .loc raises.
starts_with_other_or_specify = dm["Attribute"].str.contains(
    "^other|^specify", regex=True, flags=re.IGNORECASE, na=False
)
dm.loc[starts_with_other_or_specify, "Module"] = "Other"

In [53]:
# Fix "specify" values
# Attributes whose name starts with "specify" (case-insensitive) get the parent
# "specification". na=False treats a missing attribute name as a non-match;
# without it a single NaN leaves the mask non-boolean and .loc raises.
starts_with_specify = dm["Attribute"].str.contains(
    "^specify", regex=True, flags=re.IGNORECASE, na=False
)
dm.loc[starts_with_specify, "Parent"] = "specification"

In [54]:
# Show the attributes whose name starts with "other" or "specify"
# (case-insensitive). na=False keeps the mask strictly boolean even if an
# attribute name is missing, so the lookup cannot fail on NaN.
other_or_specify_rows = dm["Attribute"].str.contains(
    "^other|^specify", regex=True, flags=re.IGNORECASE, na=False
)
dm.loc[other_or_specify_rows]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Type,Ontology
2,2,2,specifySampleType,Value is determined by the data contributor,,,,False,specification,,Sage Bionetworks,,Other,String,
3,3,3,specifyMeasurementTechnique,Value is determined by the data contributor,,,,False,specification,,Sage Bionetworks,,Other,String,
5,5,5,specifyPlatformVersion,Value is determined by the data contributor,,,,False,specification,,Sage Bionetworks,,Other,String,
7,7,7,specifyPlatformLocation,Value is determined by the data contributor,,,,False,specification,,Sage Bionetworks,,Other,String,
12,12,12,specifyDNABatchSizeUnit,Value is determined by the data contributor,,,,False,specification,,Sage Bionetworks,,Other,String,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438,438,438,OtherConversionRatioUnits,,,,,False,ValidValue,,,,Other,,
439,439,439,OtherExperiementalBatchSizeUnit,,,,,False,ValidValue,,,,Other,,
440,440,440,OtherLensVoltagesUnit,,,,,False,ValidValue,,,,Other,,
441,441,441,OtherVacuumPressureUnit,,,,,False,ValidValue,,,,Other,,


In [55]:
# Any module whose name mentions race or ethnicity (case-insensitive) is
# relabelled "Metadata"; rows without a module never match.
mentions_race_or_ethnicity = dm["Module"].str.contains(
    "Race|Ethnicity", regex=True, flags=re.IGNORECASE, na=False
)
dm.loc[mentions_race_or_ethnicity, "Module"] = "Metadata"

In [56]:
# Look for the placeholder phrase "Possible values are", first in the attribute
# names and then inside the Valid Values cells, without truncating column text.
with pd.option_context("display.max_colwidth", None):
    phrase_in_attribute = dm["Attribute"].str.contains("Possible values are")
    display(dm[phrase_in_attribute])

    phrase_in_valid_values = dm["Valid Values"].str.contains(
        "Possible values are", na=False
    )
    display(dm[phrase_in_valid_values])

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Type,Ontology


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Type,Ontology


In [57]:
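# Previous "Valid Values" strings of the three attributes reset below, keyed by attribute name.
# Kept for reference only: old_values is not read by any cell visible in this notebook.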

old_values = {
    "msInstrumentModel": "Possible values are listed under the instrument model term.OtherMsInstrumentModel,Unknown,Not collected,Not applicable,Not Specified",
    "modificationParameters": "Possible values are listed under modification parameters,OtherModificationParameters,Unknown,Not collected,Not applicable,Not Specified",
    "cleavageAgents": "Possible values are listed under the cleavage agent nameOtherCleavageAgents,Unknown,Not collected,Not applicable,Not Specified	",
}


# For each of the three attributes: blank out Valid Values and set the
# Validation Rules to "str".
for reset_attribute in ("msInstrumentModel", "modificationParameters", "cleavageAgents"):
    reset_rows = dm["Attribute"] == reset_attribute
    dm.loc[reset_rows, "Valid Values"] = ""
    dm.loc[reset_rows, "Validation Rules"] = "str"

# cleavageAgents additionally has its Parent cleared.
dm.loc[dm["Attribute"] == "cleavageAgents", "Parent"] = ""

In [58]:
# Remove rows whose attribute name contains the placeholder phrase
# "Possible values are", then renumber the remaining rows.
# Filtering with the boolean mask (instead of dropping by index label) removes
# exactly the matching rows even when row labels repeat. regex=False matches the
# phrase literally and na=False treats a missing attribute name as a non-match.
is_placeholder_row = dm["Attribute"].str.contains(
    "Possible values are", regex=False, na=False
)
dm = dm.loc[~is_placeholder_row].reset_index(drop=True)

In [59]:
# Rename the "Measurement Unit" module to "Unit", then mark every row of the
# "Unit" module (renamed or pre-existing) with Type "NUMERIC".
in_measurement_unit = dm["Module"].eq("Measurement Unit")
dm.loc[in_measurement_unit, "Module"] = "Unit"

in_unit = dm["Module"].eq("Unit")
dm.loc[in_unit, "Type"] = "NUMERIC"

In [60]:
# Show the attributes whose name contains "study" (case-sensitive).
# This cell only displays rows; it does not add anything to the data model.
name_contains_study = dm["Attribute"].str.contains("study")
dm.loc[name_contains_study]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Type,Ontology
158,158,158,studyCode,"Unique identifier for the study, assigned by t...","LC,LG,ILO,LLFS",,,True,DataProperty,,Sage Bionetworks,,Metadata,,
596,596,596,study pool,,,,,False,ValidValue,,,,Unspecified,,
648,648,648,studyName,Name of studies found in project,"MRGWAS,ELPSCRNA,Aging-PheWAS,Organoid scRNAseq",,,,validValues,,,,Unspecified,,


In [61]:
# Same filter as the previous cell: show the attributes whose name contains "study".
dm.loc[dm["Attribute"].str.contains("study"),]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Type,Ontology
158,158,158,studyCode,"Unique identifier for the study, assigned by t...","LC,LG,ILO,LLFS",,,True,DataProperty,,Sage Bionetworks,,Metadata,,
596,596,596,study pool,,,,,False,ValidValue,,,,Unspecified,,
648,648,648,studyName,Name of studies found in project,"MRGWAS,ELPSCRNA,Aging-PheWAS,Organoid scRNAseq",,,,validValues,,,,Unspecified,,


In [62]:
# Later get from synapse
# One-row definition of the "studyName" attribute; columns that have no value
# yet are left as NaN.
new_attribute = {
    "Attribute": ["studyName"],
    "Description": ["Name of studies found in project"],
    "Valid Values": ["MRGWAS,ELPSCRNA,Aging-PheWAS,Organoid scRNAseq"],
    "DependsOn": [np.nan],
    "Properties": [np.nan],
    "Required": [np.nan],
    "Parent": ["validValues"],
    "DependsOn Component": [np.nan],
    "Source": [np.nan],
    "Validation Rules": [np.nan],
    "Module": [np.nan],
    "Type": [np.nan],
    "Ontology": [np.nan],
}

new_rows = pd.DataFrame.from_dict(new_attribute)

# Append only attributes the data model does not contain yet. The final cell
# writes the result back to ../EL.data.model.csv, the same path used as dm_name,
# so on a later run "studyName" is already present and appending it again would
# only add a duplicate row.
new_rows = new_rows[~new_rows["Attribute"].isin(dm["Attribute"])]

if not new_rows.empty:
    # ignore_index=True renumbers the rows; otherwise the appended row keeps
    # label 0 and collides with the first row of dm.
    dm = pd.concat([dm, new_rows], ignore_index=True)

In [63]:
# Map lower-camel-case Parent labels to their capitalised spelling.
# Parent values that are not keys of this mapping are left unchanged.
recode_parent = {
    "dataProperty": "DataProperty",
    "specification": "Specification",
    "validValue": "ValidValue",
    "template": "Template",
}

parent_recoded = dm["Parent"].replace(recode_parent)
dm["Parent"] = parent_recoded

'Metadata', 'Unspecified', 'Other', 'Omics', 'Assay Metadata',
'Instrument', 'Unit', 'Assay', 'Sample Type', 'Model Organism',
'Ontology', 'Project', 'Organ', 'Boolean', 'Tissue', 'Template',
'BaseAnnotation',


In [65]:
# Merge the "Organ", "Tissue" and "Sample Type" modules into "Sample Metadata".
sample_modules = ["Organ", "Tissue", "Sample Type"]
in_sample_module = dm["Module"].isin(sample_modules)
dm.loc[in_sample_module, "Module"] = "Sample Metadata"

In [68]:
# List the distinct, non-missing module names in sorted order to check the regrouping so far.
np.sort(dm["Module"].dropna().unique())

array(['Assay', 'Assay Metadata', 'BaseAnnotation', 'Instrument',
       'Metadata', 'Model Organism', 'Omics', 'Ontology', 'Other',
       'Project', 'Sample Metadata', 'Template', 'Unit', 'Unspecified',
       'Valid Value'], dtype=object)

In [None]:
# Fold the "Omics" module into "Assay".
in_omics_module = dm["Module"].eq("Omics")
dm.loc[in_omics_module, "Module"] = "Assay"

In [71]:
# Show the rows of the "Assay" and "Assay Metadata" modules.
assay_modules = ["Assay", "Assay Metadata"]
in_assay_modules = dm["Module"].isin(assay_modules)
dm.loc[in_assay_modules]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Type,Ontology
52,52.0,52.0,reagentID(s),"One or more identifiers, separated by a semico...",,,,False,DataProperty,,"Sage Bionetworks,ImmPort",,Assay Metadata,,
53,53.0,53.0,reagentName,The reagent name is an alternative to the Reag...,,,,False,DataProperty,,"Sage Bionetworks,ImmPort",,Assay Metadata,,
54,54.0,54.0,reagentManufacturer,The manufacturer is the source of a reagent an...,,,,False,DataProperty,,"Sage Bionetworks,ImmPort",,Assay Metadata,,
55,55.0,55.0,reagentCatalogNumber,"If the assay reagent is a commercial product, ...",,,,False,DataProperty,,"Sage Bionetworks,ImmPort",,Assay Metadata,,
56,56.0,56.0,reagentLotNumber,The lot number is often provided by a reagent ...,,,,False,DataProperty,,"Sage Bionetworks,ImmPort",,Assay Metadata,,
57,57.0,57.0,reagentWeblink,An internet address that may provide details o...,,,,False,DataProperty,,"Sage Bionetworks,ImmPort",,Assay Metadata,,
58,58.0,58.0,reagentContact,The contact information is particularly helpfu...,,,,False,DataProperty,,"Sage Bionetworks,ImmPort",,Assay Metadata,,
245,245.0,245.0,Affymetrix Genome-Wide Human SNP 6.0 Array,,,,,False,ValidValue,,,,Assay,,
246,246.0,246.0,Affymetrix Human Gene 1.0 ST Array,,,,,False,ValidValue,,,,Assay,,
247,247.0,247.0,Affymetrix Human Genome U133 Plus 2.0 Array,,,,,False,ValidValue,,,,Assay,,


In [None]:
# write out data model
# Keep the first row for each attribute name and renumber the rows. Rebinding
# dm (instead of inplace=True) leaves any earlier reference to the frame intact.
dm = dm.drop_duplicates(subset=["Attribute"]).reset_index(drop=True)

# Drop the "Unnamed: 0", "Unnamed: 0.1", ... columns. pandas gives these names
# to header-less columns when reading a CSV, which is what a previously saved
# row index looks like, so they carry no data-model content.
# NOTE(review): confirm nothing downstream reads these columns.
leftover_index_columns = [
    column for column in dm.columns if str(column).startswith("Unnamed:")
]
dm = dm.drop(columns=leftover_index_columns)

# Write back to the resolved path defined in the configuration cell (the same
# ../EL.data.model.csv file) rather than repeating the literal here.
dm.to_csv(dm_path, index=False)