In [183]:
import pandas as pd
import numpy as np
import re

In [184]:
dm = pd.read_csv('../EL.data.model.csv').drop(columns = ['Unnamed: 0'], errors = 'ignore')

In [185]:
dm['Parent'].unique()

array(['ValidValue', 'Component', 'AnalysisType', 'MeasurementUnit',
       'ManifestColumn', 'Ontology', 'Repository', 'Demographics',
       'BaseAnnotation'], dtype=object)

In [186]:
other_values = ["Other","Unknown","Not collected","Not applicable","Not specified"]

In [187]:
p_df = pd.read_excel("../_data/RFC Tables/EL RFC_ Assay_proteomics Data Model.xlsx")

p_df = p_df.rename(
    columns={
        "key": "Attribute",
        "description": "Description",
        "required": "Required",
        "requires": "Module",
        "concept source ontology": "Ontology",
        "valid values": "Valid Values",
        "type": "columnType",
        "note": "Notes",
    }
)
p_df = p_df.fillna("")


In [188]:
p_df

Unnamed: 0,Attribute,Description,Valid Values,Required,Module,multivalue,columnType,Ontology,Notes
0,specimenID,Identifying string linked to a particular samp...,,True,"Biospecimen,\nproteomics",False,string,Sage Bionetworks,values unique to each data contributor)
1,sampleType,The type of sample collected or the term used ...,,True,proteomics,False,string,"Sage Bionetworks,\n",A data contributor should be able to write in ...
2,specifySampleType,"If ""other"" is selected list the type of sample",,False,"proteomics,\nsampleType = other",False,string,Sage Bionetworks,
3,experimentType,The type of experiment used.\n\nProvide a valu...,,True,proteomics,True,string,Proteomic Data Commons,A data contributor should be able to write in ...
4,measurementTechnique,The name of the measurement technique describi...,,True,proteomics,True,string,Sage Bionetworks,A data contributor should be able to write in ...
5,specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,False,"proteomics,\nmeasurementTechnique = other",True,string,Sage Bionetworks,A data contributor should be able to write in ...
6,technologyPlatformVersion,"The specific version (application, manufacture...",\n,True,proteomics,True,string,http://purl.obolibrary.org/obo/NCIT_C45378,A data contributor should be able to write in ...
7,specifyPlatformVersion,"If ""other"" list the name of the platform version",,False,"proteomics,\ntechnologyPlatformVersion = other",True,string,Sage Bionetworks,A data contributor should be able to write in ...
8,platformLocation,"The name of the laboratory, facility, vendor, ...",\n,True,proteomics,True,string,"Sage Bionetworks,\nDSLWG",A data contributor should be able to write in ...
9,specifyPlatformLocation,"If ""other"" list the name of the platform location",,False,"proteomics,\nplatformLocation = other",True,string,Sage Bionetworks,


In [189]:
# string clean up
# Do not need to explicitly add "other" type values. Will add back later for list type attributes
p_df["Valid Values"] = (
    p_df["Valid Values"]
    .replace("n/a (unique to each data contributor)", "", regex=False)
    .str.replace(
        "Other|Unknown|Not collected|Not applicable|Not specified", "", regex=True
    )
    .str.strip(",")
)

# replace note type values
p_df["Valid Values"] = p_df["Valid Values"].apply(
    lambda x: ",".join(
        [y.strip() for y in x.split(",") if not bool(re.search("Possible values", y))]
    ).strip(",")
)

p_df = p_df.replace(r"\n", ",", regex=True).replace(r",,", ",", regex=True)


In [190]:
p_df

Unnamed: 0,Attribute,Description,Valid Values,Required,Module,multivalue,columnType,Ontology,Notes
0,specimenID,Identifying string linked to a particular samp...,,True,"Biospecimen,proteomics",False,string,Sage Bionetworks,values unique to each data contributor)
1,sampleType,The type of sample collected or the term used ...,,True,proteomics,False,string,"Sage Bionetworks,",A data contributor should be able to write in ...
2,specifySampleType,"If ""other"" is selected list the type of sample",,False,"proteomics,sampleType = other",False,string,Sage Bionetworks,
3,experimentType,"The type of experiment used.,Provide a value O...",,True,proteomics,True,string,Proteomic Data Commons,A data contributor should be able to write in ...
4,measurementTechnique,The name of the measurement technique describi...,,True,proteomics,True,string,Sage Bionetworks,A data contributor should be able to write in ...
5,specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,False,"proteomics,measurementTechnique = other",True,string,Sage Bionetworks,A data contributor should be able to write in ...
6,technologyPlatformVersion,"The specific version (application, manufacture...",,True,proteomics,True,string,http://purl.obolibrary.org/obo/NCIT_C45378,A data contributor should be able to write in ...
7,specifyPlatformVersion,"If ""other"" list the name of the platform version",,False,"proteomics,technologyPlatformVersion = other",True,string,Sage Bionetworks,A data contributor should be able to write in ...
8,platformLocation,"The name of the laboratory, facility, vendor, ...",,True,proteomics,True,string,"Sage Bionetworks,DSLWG",A data contributor should be able to write in ...
9,specifyPlatformLocation,"If ""other"" list the name of the platform location",,False,"proteomics,platformLocation = other",True,string,Sage Bionetworks,


In [191]:

p_df["Required"] = (
    p_df["Required"].astype(str)
    .apply(lambda x: False if bool(re.search("f", x)) else True)
    .astype(bool)
)

p_df = p_df.drop(columns=["multivalue"])

p_df

Unnamed: 0,Attribute,Description,Valid Values,Required,Module,columnType,Ontology,Notes
0,specimenID,Identifying string linked to a particular samp...,,True,"Biospecimen,proteomics",string,Sage Bionetworks,values unique to each data contributor)
1,sampleType,The type of sample collected or the term used ...,,True,proteomics,string,"Sage Bionetworks,",A data contributor should be able to write in ...
2,specifySampleType,"If ""other"" is selected list the type of sample",,True,"proteomics,sampleType = other",string,Sage Bionetworks,
3,experimentType,"The type of experiment used.,Provide a value O...",,True,proteomics,string,Proteomic Data Commons,A data contributor should be able to write in ...
4,measurementTechnique,The name of the measurement technique describi...,,True,proteomics,string,Sage Bionetworks,A data contributor should be able to write in ...
5,specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,True,"proteomics,measurementTechnique = other",string,Sage Bionetworks,A data contributor should be able to write in ...
6,technologyPlatformVersion,"The specific version (application, manufacture...",,True,proteomics,string,http://purl.obolibrary.org/obo/NCIT_C45378,A data contributor should be able to write in ...
7,specifyPlatformVersion,"If ""other"" list the name of the platform version",,True,"proteomics,technologyPlatformVersion = other",string,Sage Bionetworks,A data contributor should be able to write in ...
8,platformLocation,"The name of the laboratory, facility, vendor, ...",,True,proteomics,string,"Sage Bionetworks,DSLWG",A data contributor should be able to write in ...
9,specifyPlatformLocation,"If ""other"" list the name of the platform location",,True,"proteomics,platformLocation = other",string,Sage Bionetworks,


In [192]:
# split the values out and create new attributes to add to the model.
# These will be the intermediate attributes that will allow the original column i.e. SampleType to then have the user fill in specifySampleType
p_df_others = p_df.loc[p_df["Module"].str.contains("=", na=False)].copy(deep=True)
p_df_others["others"] = (
    p_df_others["Module"]
    .str.split(",")
    .apply(lambda x: "".join([y.strip() for y in x if bool(re.search("=", y))]))
)

# in case there are multiple equals signs
p_df_others = p_df_others.explode("others")

p_df_others["Parent"] = (
    p_df_others["others"].str.split("=").apply(lambda x: x[0].strip())
)
p_df_others["OtherValue"] = (
    p_df_others["others"].str.split("=").apply(lambda x: x[1].strip())
)

p_df_others["others"] = (
    p_df_others["others"]
    .str.split("=")
    .apply(lambda x: x[1].strip().capitalize() + x[0][0].upper() + x[0][1:])
)
# Swap others -> Attribute and Attribute -> DependsOn
p_df_others = p_df_others.rename(
    columns={"Attribute": "DependsOn", "others": "Attribute"}
)
p_df_others["Required"] = False
p_df_others["Module"] = "Other"
p_df_others["Valid Values"] = ""
p_df_others

Unnamed: 0,DependsOn,Description,Valid Values,Required,Module,columnType,Ontology,Notes,Attribute,Parent,OtherValue
2,specifySampleType,"If ""other"" is selected list the type of sample",,False,Other,string,Sage Bionetworks,,OtherSampleType,sampleType,other
5,specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,False,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,OtherMeasurementTechnique,measurementTechnique,other
7,specifyPlatformVersion,"If ""other"" list the name of the platform version",,False,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,OtherTechnologyPlatformVersion,technologyPlatformVersion,other
9,specifyPlatformLocation,"If ""other"" list the name of the platform location",,False,Other,string,Sage Bionetworks,,OtherPlatformLocation,platformLocation,other
19,labelFreeQuantitation,"Type of label-free data analysis strategy ,Pro...",,False,Other,string,Proteomics Data Commons,,Label freeLabelQuantiation,labelQuantiation,Label Free
20,specifyLabelFreeQuantitation,"If ""other"" is selected please specify",,False,Other,string,Sage Bionetworks,,OtherLabelFreeQuantitation,labelFreeQuantitation,other
24,specifyMSAnalyzerTypeMS1,"If ""other"" is selected list the name of the ma...",,False,Other,string,Sage Bionetworks,,OtherMsAnalyzerTypeMS1,msAnalyzerTypeMS1,other
26,specifyMSAnalyzerTypeMS2,"If ""other"" is selected list the name of the ma...",,False,Other,string,Sage Bionetworks,,OtherMsAnalyzerTypeMS2,msAnalyzerTypeMS2,other
28,specifyMSAssayTechnique,"If ""other"" is selected list the name of the ma...",,False,Other,string,Sage Bionetworks,,OtherMsAssayTechnique,msAssayTechnique,other
30,specifyMSAnalyteType,"If ""other"" list the type of analyte",,False,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,OtherMsAnalyteType,msAnalyteType,other


In [193]:
# Assign data modeling variables
p_df_others = p_df_others.assign(
    Description=p_df_others["DependsOn"].apply(
        lambda x: f"When {re.sub('specify|other', '',x, flags=re.IGNORECASE)} = `other`, add your custom value to the cell"
    ),
    columnType="string",
    Ontology="Sage Bionetworks",
    Required=False,
    Properties="ValidValue",
)

In [194]:
p_df_others

Unnamed: 0,DependsOn,Description,Valid Values,Required,Module,columnType,Ontology,Notes,Attribute,Parent,OtherValue,Properties
2,specifySampleType,"When SampleType = `other`, add your custom val...",,False,Other,string,Sage Bionetworks,,OtherSampleType,sampleType,other,ValidValue
5,specifyMeasurementTechnique,"When MeasurementTechnique = `other`, add your ...",,False,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,OtherMeasurementTechnique,measurementTechnique,other,ValidValue
7,specifyPlatformVersion,"When PlatformVersion = `other`, add your custo...",,False,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,OtherTechnologyPlatformVersion,technologyPlatformVersion,other,ValidValue
9,specifyPlatformLocation,"When PlatformLocation = `other`, add your cust...",,False,Other,string,Sage Bionetworks,,OtherPlatformLocation,platformLocation,other,ValidValue
19,labelFreeQuantitation,"When labelFreeQuantitation = `other`, add your...",,False,Other,string,Sage Bionetworks,,Label freeLabelQuantiation,labelQuantiation,Label Free,ValidValue
20,specifyLabelFreeQuantitation,"When LabelFreeQuantitation = `other`, add your...",,False,Other,string,Sage Bionetworks,,OtherLabelFreeQuantitation,labelFreeQuantitation,other,ValidValue
24,specifyMSAnalyzerTypeMS1,"When MSAnalyzerTypeMS1 = `other`, add your cus...",,False,Other,string,Sage Bionetworks,,OtherMsAnalyzerTypeMS1,msAnalyzerTypeMS1,other,ValidValue
26,specifyMSAnalyzerTypeMS2,"When MSAnalyzerTypeMS2 = `other`, add your cus...",,False,Other,string,Sage Bionetworks,,OtherMsAnalyzerTypeMS2,msAnalyzerTypeMS2,other,ValidValue
28,specifyMSAssayTechnique,"When MSAssayTechnique = `other`, add your cust...",,False,Other,string,Sage Bionetworks,,OtherMsAssayTechnique,msAssayTechnique,other,ValidValue
30,specifyMSAnalyteType,"When MSAnalyteType = `other`, add your custom ...",,False,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,OtherMsAnalyteType,msAnalyteType,other,ValidValue


In [195]:
# Add other attributes to the list of valid values
p_df.loc[~p_df.replace("", np.nan)["Valid Values"].isna(), 'Valid Values'] = p_df.loc[~p_df.replace("", np.nan)["Valid Values"].isna()].apply(lambda x: re.sub(',+', ',', ','.join(x['Valid Values'] + ',' + p_df_others.loc[p_df_others['Parent'] == x['Attribute'], 'Attribute'].values)), axis = 1)

In [196]:
# Last bit of cleanup
p_df["Properties"] = "ManifestColumn"
p_df["Module"] = p_df["Attribute"].apply(
    lambda x: "Other" if bool(re.search("specify", x)) else "Metadata"
)


In [197]:
p_df

Unnamed: 0,Attribute,Description,Valid Values,Required,Module,columnType,Ontology,Notes,Properties
0,specimenID,Identifying string linked to a particular samp...,,True,Metadata,string,Sage Bionetworks,values unique to each data contributor),ManifestColumn
1,sampleType,The type of sample collected or the term used ...,,True,Metadata,string,"Sage Bionetworks,",A data contributor should be able to write in ...,ManifestColumn
2,specifySampleType,"If ""other"" is selected list the type of sample",,True,Other,string,Sage Bionetworks,,ManifestColumn
3,experimentType,"The type of experiment used.,Provide a value O...",,True,Metadata,string,Proteomic Data Commons,A data contributor should be able to write in ...,ManifestColumn
4,measurementTechnique,The name of the measurement technique describi...,,True,Metadata,string,Sage Bionetworks,A data contributor should be able to write in ...,ManifestColumn
5,specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,True,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,ManifestColumn
6,technologyPlatformVersion,"The specific version (application, manufacture...",,True,Metadata,string,http://purl.obolibrary.org/obo/NCIT_C45378,A data contributor should be able to write in ...,ManifestColumn
7,specifyPlatformVersion,"If ""other"" list the name of the platform version",,True,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,ManifestColumn
8,platformLocation,"The name of the laboratory, facility, vendor, ...",,True,Metadata,string,"Sage Bionetworks,DSLWG",A data contributor should be able to write in ...,ManifestColumn
9,specifyPlatformLocation,"If ""other"" list the name of the platform location",,True,Other,string,Sage Bionetworks,,ManifestColumn


In [198]:
print("Shape of original data frame:", p_df.shape)
print("Shape of others data frame:", p_df_others.shape)

p_df_final = pd.concat([p_df, p_df_others]).reset_index(drop=True).replace("", np.nan)
p_df_final = p_df_final.set_index("Attribute")

print("Shape of final data frame:", p_df_final.shape)

p_df_final.info()

Shape of original data frame: (56, 9)
Shape of others data frame: (19, 12)
Shape of final data frame: (75, 11)
<class 'pandas.core.frame.DataFrame'>
Index: 75 entries, specimenID to OtherCleavageAgents 
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Description   75 non-null     object
 1   Valid Values  4 non-null      object
 2   Required      75 non-null     bool  
 3   Module        75 non-null     object
 4   columnType    75 non-null     object
 5   Ontology      75 non-null     object
 6   Notes         41 non-null     object
 7   Properties    75 non-null     object
 8   DependsOn     19 non-null     object
 9   Parent        19 non-null     object
 10  OtherValue    19 non-null     object
dtypes: bool(1), object(10)
memory usage: 6.5+ KB


In [199]:
p_df_final['UsedIn'] = 'proteomics'

In [200]:
# Add proteomics template to the data model updated with the attributes in DependsOn
dm.loc[dm["Attribute"] == "proteomics", "DependsOn"] = (
    "Component," + "Filename," + ",".join(p_df["Attribute"])
)

dm.loc[dm["Attribute"] == "proteomics"]

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Ontology,UsedIn,columnType
38,proteomics,Template used for contributing metadata to the...,,"Component,Filename,specimenID,sampleType,speci...",,True,Component,,,,Template,,,


In [201]:
dm['Module'].unique()

array(['Instrument', 'Template', 'Analysis', 'ValidValues', 'Assay',
       'Other', 'Consortium', 'Ontology', 'Repository', 'SampleType',
       'Unspecified', 'Unit', 'Metadata', 'Model Organism',
       'BaseAnnotation'], dtype=object)

In [202]:
dm["Parent"].unique()

array(['ValidValue', 'Component', 'AnalysisType', 'MeasurementUnit',
       'ManifestColumn', 'Ontology', 'Repository', 'Demographics',
       'BaseAnnotation'], dtype=object)

In [203]:
dm = dm.set_index('Attribute')

In [204]:
dm.shape

(700, 13)

In [205]:
pd.concat([dm, p_df_final])

Unnamed: 0_level_0,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Ontology,UsedIn,columnType,Notes,OtherValue
Attribute,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
10x Visium Spatial Gene Expression,Instrument Model Name,,,,False,ValidValue,,,,Instrument,,,,,
EnVision 2103 Multiplate Reader,Instrument Model Name,,,,False,ValidValue,,,,Instrument,,,,,
Exploris 240Exploris 480quadrupole time-of-flight,Instrument Model Name,,,,False,ValidValue,,,,Instrument,,,,,
FIA-MSMS,Instrument Model Name,,,,False,ValidValue,,,,Instrument,,,,,
Illumina 1M,Instrument Model Name,,,,False,ValidValue,,,,Instrument,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OtherDatabaseName,"When DatabaseName = `other`, add your custom v...",,specifyDatabaseName,ValidValue,False,databaseName,,,,Other,Sage Bionetworks,proteomics,string,A data contributor should be able to write in ...,other
OtherDatabaseSource,"When DatabaseSource = `other`, add your custom...",,specifyDatabaseSource,ValidValue,False,databaseSource,,,,Other,Sage Bionetworks,proteomics,string,A data contributor should be able to write in ...,other
OtherDataFile,"When DataFile = `other`, add your custom value...",,specifyOtherDataFile,ValidValue,False,dataFile,,,,Other,Sage Bionetworks,proteomics,string,,other
OtherModificationParameters,"When ModificationParameters = `other`, add you...",,specifyOtherModificationParameters,ValidValue,False,modificationParameters,,,,Other,Sage Bionetworks,proteomics,string,,other


In [206]:
dm.loc[dm['UsedIn'].str.contains('proteomics', na = False)]

Unnamed: 0_level_0,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Ontology,UsedIn,columnType
Attribute,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
platformLocation,"The name of the laboratory, facility, vendor, ...","OtherPlatformLocation, Unknown, Not collected,...",,,True,ManifestColumn,,Sage Bionetworks,,Assay,,"Metabolomics Human,Microbiome,RNAseq,Whole Gen...",
technologyPlatformVersion,"The specific version (application, manufacture...","10x Visium Spatial Gene Expression, Affymetrix...",,,True,ManifestColumn,,Sage Bionetworks,,Assay,,"Metabolomics Human,Microbiome,RNAseq,Whole Gen...",
specifyDatabaseName,Value is determined by the data contributor,,,,False,ManifestColumn,,Sage Bionetworks,,Other,,"Metabolomics Human,proteomics",String
specifyDatabaseSource,Value is determined by the data contributor,,,,False,ManifestColumn,,Sage Bionetworks,,Other,,"Metabolomics Human,proteomics",String
specifyDigestionMethod,Value is determined by the data contributor,,,,False,ManifestColumn,,Sage Bionetworks,,Other,,proteomics,String
specifyMeasurementTechnique,Value is determined by the data contributor,,,,False,ManifestColumn,,Sage Bionetworks,,Other,,"Metabolomics Human,Microbiome,RNAseq,Whole Gen...",String
specifyMSAnalyteType,Value is determined by the data contributor,,,,False,ManifestColumn,,Sage Bionetworks,,Other,,"Metabolomics Human,proteomics",String
specifyMSAnalyzerTypeMS1,Value is determined by the data contributor,,,,False,ManifestColumn,,Sage Bionetworks,,Other,,"Metabolomics Human,proteomics",String
specifyMSAnalyzerTypeMS2,Value is determined by the data contributor,,,,False,ManifestColumn,,Sage Bionetworks,,Other,,"Metabolomics Human,proteomics",String
specifyMSAssayTechnique,Value is determined by the data contributor,,,,False,ManifestColumn,,Sage Bionetworks,,Other,,"Metabolomics Human,proteomics",String


In [207]:
dm.loc[dm["UsedIn"].str.contains("proteomics", na=False)].merge(p_df_final, how = 'outer', on = 'Attribute')

Unnamed: 0_level_0,Description_x,Valid Values_x,DependsOn_x,Properties_x,Required_x,Parent_x,DependsOn Component,Source,Validation Rules,Module_x,...,Required_y,Module_y,columnType_y,Ontology_y,Notes,Properties_y,DependsOn_y,Parent_y,OtherValue,UsedIn_y
Attribute,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
platformLocation,"The name of the laboratory, facility, vendor, ...","OtherPlatformLocation, Unknown, Not collected,...",,,True,ManifestColumn,,Sage Bionetworks,,Assay,...,True,Metadata,string,"Sage Bionetworks,DSLWG",A data contributor should be able to write in ...,ManifestColumn,,,,proteomics
technologyPlatformVersion,"The specific version (application, manufacture...","10x Visium Spatial Gene Expression, Affymetrix...",,,True,ManifestColumn,,Sage Bionetworks,,Assay,...,True,Metadata,string,http://purl.obolibrary.org/obo/NCIT_C45378,A data contributor should be able to write in ...,ManifestColumn,,,,proteomics
specifyDatabaseName,Value is determined by the data contributor,,,,False,ManifestColumn,,Sage Bionetworks,,Other,...,True,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,ManifestColumn,,,,proteomics
specifyDatabaseSource,Value is determined by the data contributor,,,,False,ManifestColumn,,Sage Bionetworks,,Other,...,True,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,ManifestColumn,,,,proteomics
specifyDigestionMethod,Value is determined by the data contributor,,,,False,ManifestColumn,,Sage Bionetworks,,Other,...,True,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,ManifestColumn,,,,proteomics
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OtherDatabaseName,,,,,,,,,,,...,False,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,ValidValue,specifyDatabaseName,databaseName,other,proteomics
OtherDatabaseSource,,,,,,,,,,,...,False,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,ValidValue,specifyDatabaseSource,databaseSource,other,proteomics
OtherDataFile,,,,,,,,,,,...,False,Other,string,Sage Bionetworks,,ValidValue,specifyOtherDataFile,dataFile,other,proteomics
OtherModificationParameters,,,,,,,,,,,...,False,Other,string,Sage Bionetworks,,ValidValue,specifyOtherModificationParameters,modificationParameters,other,proteomics


In [208]:
# combine duplicated rows
p_df_final = p_df_final.fillna("").astype(str).groupby(level=0).apply(
    lambda x: x.apply(lambda y: ",".join(y))
)

In [209]:
p_df_final.index.is_unique

True

In [210]:
# Update empty values with the new dataframe
dm.update(p_df_final, overwrite=False, errors = 'ignore')

In [212]:
dm.loc[dm['UsedIn'].str.contains('proteomics', na = False)].shape

(41, 13)

In [231]:
# join in the new template
dm_final = pd.concat([dm, p_df_final.loc[~p_df_final.index.isin(dm.index)]])
dm_final.shape

(733, 15)

In [232]:
# Replace any other values to leave open for contributors to add
dm_final.loc["platformLocation", "Valid Values"] = np.nan
dm_final.loc["dataFile", "Valid Values"] = np.nan
dm_final.loc["acquisitionMode", "Valid Values"] = np.nan
dm_final.loc["acquisitionSoftware", "Valid Values"] = np.nan

In [233]:
dm_final.loc[p_df_final.index]

Unnamed: 0_level_0,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Ontology,UsedIn,columnType,Notes,OtherValue
Attribute,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Label freeLabelQuantiation,"When labelFreeQuantitation = `other`, add your...",,labelFreeQuantitation,ValidValue,False,labelQuantiation,,,,Other,Sage Bionetworks,proteomics,string,,Label Free
OtherCleavageAgents,"When CleavageAgent = `other`, add your custom ...",,specifyOtherCleavageAgent,ValidValue,False,cleavageAgents,,,,Other,Sage Bionetworks,proteomics,string,,other
OtherDataFile,"When DataFile = `other`, add your custom value...",,specifyOtherDataFile,ValidValue,False,dataFile,,,,Other,Sage Bionetworks,proteomics,string,,other
OtherDatabaseName,"When DatabaseName = `other`, add your custom v...",,specifyDatabaseName,ValidValue,False,databaseName,,,,Other,Sage Bionetworks,proteomics,string,A data contributor should be able to write in ...,other
OtherDatabaseSource,"When DatabaseSource = `other`, add your custom...",,specifyDatabaseSource,ValidValue,False,databaseSource,,,,Other,Sage Bionetworks,proteomics,string,A data contributor should be able to write in ...,other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
specifyProteomicsAssayType,Value is determined by the data contributor,,,ManifestColumn,False,ManifestColumn,,Sage Bionetworks,,Other,Sage Bionetworks,proteomics,String,,
specifySampleType,Value is determined by the data contributor,,,ManifestColumn,False,ManifestColumn,,Sage Bionetworks,,Other,Sage Bionetworks,"Metabolomics Human,Microbiome,RNAseq,Whole Gen...",String,,
specimenID,Identifying string linked to a particular samp...,,,ManifestColumn,True,ManifestColumn,,Sage Bionetworks,,Metadata,Sage Bionetworks,"Biospecimen human,Biospecimen nonHuman,Metabol...",string,,
spectrometerFrequency,The frequency at which a spectrometer causes h...,,,ManifestColumn,True,ManifestColumn,,Sage Bionetworks,,Unspecified,"Sage Bionetworks,DSLWG","Metabolomics Human,proteomics",number,,


In [234]:
# qa check
with pd.option_context('display.max_rows', None):
    display(dm_final[dm_final.index.duplicated(keep=False)].sort_index())

Unnamed: 0_level_0,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Ontology,UsedIn,columnType,Notes,OtherValue
Attribute,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1


In [235]:
# Check there are unique attributes
dm_final.index.is_unique

True

In [225]:
# no longer need column
dm_final = dm_final.drop(columns=["OtherValue"])

In [236]:
# Write out data model
dm_final.to_csv('../EL.data.model.csv')