In [1]:
import os
import pandas as pd
import numpy as np
import re

In [2]:
import datetime

# Date stamp (YYYY-MM-DD) used to name today's backup file.
today = datetime.date.today()
today_str = f"{today:%Y-%m-%d}"

In [3]:
# Load the current data model from the repository root.
dm = pd.read_csv(filepath_or_buffer="../EL.data.model.csv")

In [4]:
# write out old data model before changes
# index=False keeps the backup column-for-column identical to the frame that
# was loaded; writing the row index would prepend an extra unnamed column.
os.makedirs("../backups", exist_ok=True)  # to_csv does not create missing directories
dm.to_csv(f"../backups/EL.data.model.{today_str}.csv", index=False)

In [28]:
# List the distinct module names currently present in the data model.
pd.unique(dm["Module"])

array(['Metadata', 'Unspecified', 'Other', 'Omics', 'Assay Metadata',
       'Instrument', 'Unit', 'Assay', 'Sample Type', 'Model Organism',
       'Ontology', 'Study', 'Organ', 'Boolean', 'Tissue', 'Template',
       'BaseAnnotation'], dtype=object)

In [6]:
# TODO: pull grant information (this cell is currently an empty placeholder)

In [7]:
# Fix Other values: every attribute whose name starts with "other" or
# "specify" (case-insensitive) is assigned to the "Other" module.
starts_with_other_or_specify = dm["Attribute"].str.contains(
    "^other|^specify", regex=True, flags=re.IGNORECASE
)
dm.loc[starts_with_other_or_specify, "Module"] = "Other"

In [8]:
# Fix "specify" values: every attribute whose name starts with "specify"
# (case-insensitive) gets "specification" as its Parent.
starts_with_specify = dm["Attribute"].str.contains(
    "^specify", regex=True, flags=re.IGNORECASE
)
dm.loc[starts_with_specify, "Parent"] = "specification"

In [9]:
# Show every row whose Attribute starts with "other" or "specify"
# (case-insensitive) so the Module / Parent values can be checked by eye.
other_or_specify_rows = dm["Attribute"].str.contains(
    "^other|^specify", regex=True, flags=re.IGNORECASE
)
dm[other_or_specify_rows]

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Type,Ontology
2,specifySampleType,Value is determined by the data contributor,,,,False,specification,,Sage Bionetworks,,Other,String,
3,specifyMeasurementTechnique,Value is determined by the data contributor,,,,False,specification,,Sage Bionetworks,,Other,String,
5,specifyPlatformVersion,Value is determined by the data contributor,,,,False,specification,,Sage Bionetworks,,Other,String,
7,specifyPlatformLocation,Value is determined by the data contributor,,,,False,specification,,Sage Bionetworks,,Other,String,
12,specifyDNABatchSizeUnit,Value is determined by the data contributor,,,,False,specification,,Sage Bionetworks,,Other,String,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
438,OtherConversionRatioUnits,,,,,False,validValue,,,,Other,,
439,OtherExperiementalBatchSizeUnit,,,,,False,validValue,,,,Other,,
440,OtherLensVoltagesUnit,,,,,False,validValue,,,,Other,,
441,OtherVacuumPressureUnit,,,,,False,validValue,,,,Other,,


In [13]:
# Fix metadata module annotation: any row whose Module mentions "Race" or
# "Ethnicity" (case-insensitive) is moved to the "Metadata" module.
# na=False makes rows with a missing Module count as non-matching; without it
# the mask contains NaN and .loc raises when this cell runs before the missing
# module names have been filled in.
dm.loc[
    dm["Module"].str.contains(
        "Race|Ethnicity", regex=True, flags=re.IGNORECASE, na=False),
    "Module",
] = "Metadata"

In [10]:
# Attributes that do not currently have a module name get the placeholder
# "Unspecified" (an attempt to remove the bracket tab in the webpage).
dm["Module"] = dm["Module"].fillna("Unspecified")

In [14]:
# Locate the "Possible values are listed under ..." placeholder text, both
# where it appears as an Attribute name and where it appears inside a
# Valid Values list. Column width is unlimited so the full text is visible.
placeholder_text = "Possible values are"
in_attribute = dm["Attribute"].str.contains(placeholder_text)
in_valid_values = dm["Valid Values"].str.contains(placeholder_text, na=False)
with pd.option_context("display.max_colwidth", None):
    display(dm[in_attribute])
    display(dm[in_valid_values])

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Type,Ontology
462,Possible values are listed under modification parameters,,,,,False,validValue,,,,Unspecified,,
463,Possible values are listed under the cleavage agent nameOtherCleavageAgents,,,,,False,validValue,,,,Unspecified,,
464,Possible values are listed under the instrument model term.OtherMsInstrumentModel,,,,,False,validValue,,,,Unspecified,,


Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Type,Ontology
77,msInstrumentModel,The model of the mass spectrometer used,"Possible values are listed under the instrument model term.OtherMsInstrumentModel,Unknown,Not collected,Not applicable,Not Specified",,,True,dataProperty,,"Sage Bionetworks,Proposed minimum metadata relative to mass spectrometry",,Unspecified,,
131,modificationParameters,Modification parameters for the search engine run. [ PSI: PI http://www.w3.org/2002/07/owl#Axiom ],"Possible values are listed under modification parameters,OtherModificationParameters,Unknown,Not collected,Not applicable,Not Specified",,,True,dataProperty,,https://www.ebi.ac.uk/ols/ontologies/ms/termsiri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001055,,Unspecified,,
133,cleavageAgents,Name(s) of the enzyme used,"Possible values are listed under the cleavage agent nameOtherCleavageAgents,Unknown,Not collected,Not applicable,Not Specified",,,True,dataProperty,,Proteomics Sample Metadata,,Unspecified,,


In [16]:
# fixing more values

# Valid Values strings these attributes held before the fix. Kept for
# reference only: nothing in this cell reads the values, only the keys.
old_values = {
    "msInstrumentModel": "Possible values are listed under the instrument model term.OtherMsInstrumentModel,Unknown,Not collected,Not applicable,Not Specified",
    "modificationParameters": "Possible values are listed under modification parameters,OtherModificationParameters,Unknown,Not collected,Not applicable,Not Specified",
    "cleavageAgents": "Possible values are listed under the cleavage agent nameOtherCleavageAgents,Unknown,Not collected,Not applicable,Not Specified",
}

# fixing values
# For each of the attributes above, clear the Valid Values list and set the
# validation rule to "str". One mask replaces the per-attribute copies.
placeholder_attributes = dm["Attribute"].isin(list(old_values))
dm.loc[placeholder_attributes, "Valid Values"] = ""
dm.loc[placeholder_attributes, "Validation Rules"] = "str"

# cleavageAgents additionally has its Parent cleared.
# NOTE(review): only this attribute gets its Parent reset - confirm that the
# other two are meant to keep theirs.
dm.loc[dm["Attribute"] == "cleavageAgents", "Parent"] = ""

In [23]:
# Remove the rows whose Attribute is itself a "Possible values are ..."
# placeholder, then renumber the index from zero.
is_placeholder = dm["Attribute"].str.contains("Possible values are")
dm = dm[~is_placeholder].reset_index(drop=True)

In [26]:
# Rename the "Measurement Unit" module to just "Unit".
dm["Module"] = dm["Module"].replace("Measurement Unit", "Unit")

In [27]:
# Save the updated data model over the original file, without the row index.
dm.to_csv(path_or_buf="../EL.data.model.csv", index=False)