In [82]:
import pandas as pd
import numpy as np
import re

RFC = "Metabolomics"

In [195]:
dm = pd.read_csv('../EL.data.model.csv').drop(columns = ['Unnamed: 0'], errors = 'ignore')

In [84]:
dm['Parent'].unique()

array(['ValidValue', 'ManifestColumn', 'MeasurementUnit', 'Demographics',
       'BaseAnnotation', 'AnalysisType', 'Component', 'Ontology',
       'Template', nan, 'labelQuantiation', 'Repository',
       'cleavageAgents', 'databaseName', 'databaseSource', 'dataFile',
       'digestionMethod', 'labelFreeQuantitation', 'measurementTechnique',
       'modificationParameters', 'msAnalyteType', 'msAnalyzerTypeMS1',
       'msAnalyzerTypeMS2', 'msAssayTechnique', 'msInstrumentModel',
       'platformLocation', 'proteomics digestionReagent',
       'proteomicsAssayType', 'sampleType', 'technologyPlatformVersion'],
      dtype=object)

In [85]:
other_values = ["Other","Unknown","Not collected","Not applicable","Not specified"]

In [86]:
p_df = pd.read_excel("../_data/RFC Tables/EL RFC Metabolomics Human Data Model.xlsx")

p_df = p_df.rename(
    columns={
        "key": "Attribute",
        "description": "Description",
        "required": "Required",
        "requires": "Module",
        "concept source ontology": "Ontology",
        "valid values": "Valid Values",
        "type": "columnType",
        "note": "Notes",
    }
)
p_df = p_df.fillna("")

p_df

Unnamed: 0,Attribute,Description,Valid Values,Required,Module,multivalue,columnType,Ontology,Notes
0,specimenID,Identifying string linked to a particular samp...,,True,"Biospecimen,\nmetabolomics",0.0,string,Sage Bionetworks,values unique to each data contributor)
1,sampleType,The type of sample collected or the term used ...,,True,metabolomics,0.0,string,"Sage Bionetworks,\n",A data contributor should be able to write in ...
2,specifySampleType,"If ""other"" is selected list the type of sample",,False,"metabolomics,\nsampleType = other",0.0,string,Sage Bionetworks,
3,measurementTechnique,The name of the measurement technique describi...,,True,metabolomics,1.0,string,Sage Bionetworks,A data contributor should be able to write in ...
4,specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,False,metabolomics\nmeasurementTechnique = other,0.0,string,Sage Bionetworks,A data contributor should be able to write in ...
...,...,...,...,...,...,...,...,...,...
64,processingBatchID,"Processing batch identifier, provided by the d...",\n,False,metabolomics,0.0,string,Sage Bionetworks,
65,processingBatchSize,The number of samples,,False,metabolomics,0.0,string,Sage Bionetworks,
66,processingBatchSizeUnit,The unit of measurement for number of samples ...,"AFU,\nAI,\nAU/ml,\nDK units/ml,\nbp\ng/dl,\ng/...",False,"metabolomics,\nprocessingBatchSize",0.0,string,Sage Bionetworks,
67,specifyProcessingBatchSizeUnit,"If ""other"" list units of measure",,False,"metabolomics,\nProcessingBatchSizeUnit = other",0.0,string,Sage Bionetworks,


In [87]:
# string clean up
# Do not need to explicitly add "other" type values. Will add back later for list type attributes
p_df["Valid Values"] = (
    p_df["Valid Values"]
    .replace("n/a (unique to each data contributor)", "", regex=False)
    .str.replace(
        "Other|Unknown|Not collected|Not applicable|Not specified", "", regex=True
    )
    .str.strip(",")
)

# replace note type values
p_df["Valid Values"] = p_df["Valid Values"].apply(
    lambda x: ",".join(
        [y.strip() for y in x.split(",") if not bool(re.search("Possible values", y))]
    ).strip(",")
)

p_df = p_df.replace(r"\n", ",", regex=True).replace(r",,", ",", regex=True)

p_df

Unnamed: 0,Attribute,Description,Valid Values,Required,Module,multivalue,columnType,Ontology,Notes
0,specimenID,Identifying string linked to a particular samp...,,True,"Biospecimen,metabolomics",0.0,string,Sage Bionetworks,values unique to each data contributor)
1,sampleType,The type of sample collected or the term used ...,,True,metabolomics,0.0,string,"Sage Bionetworks,",A data contributor should be able to write in ...
2,specifySampleType,"If ""other"" is selected list the type of sample",,False,"metabolomics,sampleType = other",0.0,string,Sage Bionetworks,
3,measurementTechnique,The name of the measurement technique describi...,,True,metabolomics,1.0,string,Sage Bionetworks,A data contributor should be able to write in ...
4,specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,False,"metabolomics,measurementTechnique = other",0.0,string,Sage Bionetworks,A data contributor should be able to write in ...
...,...,...,...,...,...,...,...,...,...
64,processingBatchID,"Processing batch identifier, provided by the d...",,False,metabolomics,0.0,string,Sage Bionetworks,
65,processingBatchSize,The number of samples,,False,metabolomics,0.0,string,Sage Bionetworks,
66,processingBatchSizeUnit,The unit of measurement for number of samples ...,"AFU,AI,AU/ml,DK units/ml,bp,g/dl,g/l,gm,HAU,IU...",False,"metabolomics,processingBatchSize",0.0,string,Sage Bionetworks,
67,specifyProcessingBatchSizeUnit,"If ""other"" list units of measure",,False,"metabolomics,ProcessingBatchSizeUnit = other",0.0,string,Sage Bionetworks,


In [88]:
p_df["Required"] = (
    p_df["Required"].astype(str)
    .apply(lambda x: False if bool(re.search("f", x)) else True)
    .astype(bool)
)

p_df = p_df.drop(columns=["multivalue"])

p_df

Unnamed: 0,Attribute,Description,Valid Values,Required,Module,columnType,Ontology,Notes
0,specimenID,Identifying string linked to a particular samp...,,True,"Biospecimen,metabolomics",string,Sage Bionetworks,values unique to each data contributor)
1,sampleType,The type of sample collected or the term used ...,,True,metabolomics,string,"Sage Bionetworks,",A data contributor should be able to write in ...
2,specifySampleType,"If ""other"" is selected list the type of sample",,True,"metabolomics,sampleType = other",string,Sage Bionetworks,
3,measurementTechnique,The name of the measurement technique describi...,,True,metabolomics,string,Sage Bionetworks,A data contributor should be able to write in ...
4,specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,True,"metabolomics,measurementTechnique = other",string,Sage Bionetworks,A data contributor should be able to write in ...
...,...,...,...,...,...,...,...,...
64,processingBatchID,"Processing batch identifier, provided by the d...",,True,metabolomics,string,Sage Bionetworks,
65,processingBatchSize,The number of samples,,True,metabolomics,string,Sage Bionetworks,
66,processingBatchSizeUnit,The unit of measurement for number of samples ...,"AFU,AI,AU/ml,DK units/ml,bp,g/dl,g/l,gm,HAU,IU...",True,"metabolomics,processingBatchSize",string,Sage Bionetworks,
67,specifyProcessingBatchSizeUnit,"If ""other"" list units of measure",,True,"metabolomics,ProcessingBatchSizeUnit = other",string,Sage Bionetworks,


In [89]:
# split the values out and create new attributes to add to the model.
# These will be the intermediate attributes that will allow the original column i.e. SampleType to then have the user fill in specifySampleType
p_df_others = p_df.loc[p_df["Module"].str.contains("=", na=False)].copy(deep=True)
p_df_others["others"] = (
    p_df_others["Module"]
    .str.split(",")
    .apply(lambda x: "".join([y.strip() for y in x if bool(re.search("=", y))]))
)

# in case there are multiple equals signs
p_df_others = p_df_others.explode("others")

p_df_others["Parent"] = (
    p_df_others["others"].str.split("=").apply(lambda x: x[0].strip())
)
p_df_others["OtherValue"] = (
    p_df_others["others"].str.split("=").apply(lambda x: x[1].strip())
)

p_df_others["others"] = (
    p_df_others["others"]
    .str.split("=")
    .apply(lambda x: x[1].strip().capitalize() + x[0][0].upper() + x[0][1:])
)
# Swap others -> Attribute and Attribute -> DependsOn
p_df_others = p_df_others.rename(
    columns={"Attribute": "DependsOn", "others": "Attribute"}
)
p_df_others["Required"] = False
p_df_others["Module"] = "Other"
p_df_others["Valid Values"] = ""
p_df_others

Unnamed: 0,DependsOn,Description,Valid Values,Required,Module,columnType,Ontology,Notes,Attribute,Parent,OtherValue
2,specifySampleType,"If ""other"" is selected list the type of sample",,False,Other,string,Sage Bionetworks,,OtherSampleType,sampleType,other
4,specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,False,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,OtherMeasurementTechnique,measurementTechnique,other
6,specifyPlatformVersion,"If ""other"" list the name of the platform version",,False,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,OtherTechnologyPlatformVersion,technologyPlatformVersion,other
9,specifyPlatformLocation,"If ""other"" list the name of the platform location",,False,Other,string,Sage Bionetworks,,OtherPlatformLocation,platformLocation,other
17,specifyMSAnalyzerTypeMS1,"If ""other"" is selected list the name of the ma...",,False,Other,string,Sage Bionetworks,,OtherMsAnalyzerTypeMS1,msAnalyzerTypeMS1,other
19,specifyMSAnalyzerTypeMS2,"If ""other"" is selected list the name of the ma...",,False,Other,string,Sage Bionetworks,,OtherMsAnalyzerTypeMS2,msAnalyzerTypeMS2,other
21,specifyMSAssayTechnique,"If ""other"" is selected list the name of the ma...",,False,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,OtherMsAssayTechnique,msAssayTechnique,other
22,msAnalyteType,The type of biospecimen subjected to analysis ...,,False,Other,string,http://purl.obolibrary.org/obo/NCIT_C156434,A data contributor should be able to write in ...,TargetedMsTarget,msTarget,Targeted
23,specifyMSAnalyteType,"If ""other"" list the type of analyte",,False,Other,string,Sage Bionetworks,,OtherMsAnalyteType,msAnalyteType,other
24,specifyMSInstrumentModel,"If ""other"" is selected list the name of the in...",,False,Other,string,Sage Bionetworks,,OtherMsInstrumentModel,msInstrumentModel,other


In [90]:
# Assign data modeling variables
p_df_others = p_df_others.assign(
    Description=p_df_others["DependsOn"].apply(
        lambda x: f"When {re.sub('specify|other', '',x, flags=re.IGNORECASE)} = `other`, add your custom value to the cell"
    ),
    columnType="string",
    Ontology="Sage Bionetworks",
    Required=False,
    Properties="ValidValue",
)

In [91]:
p_df_others

Unnamed: 0,DependsOn,Description,Valid Values,Required,Module,columnType,Ontology,Notes,Attribute,Parent,OtherValue,Properties
2,specifySampleType,"When SampleType = `other`, add your custom val...",,False,Other,string,Sage Bionetworks,,OtherSampleType,sampleType,other,ValidValue
4,specifyMeasurementTechnique,"When MeasurementTechnique = `other`, add your ...",,False,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,OtherMeasurementTechnique,measurementTechnique,other,ValidValue
6,specifyPlatformVersion,"When PlatformVersion = `other`, add your custo...",,False,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,OtherTechnologyPlatformVersion,technologyPlatformVersion,other,ValidValue
9,specifyPlatformLocation,"When PlatformLocation = `other`, add your cust...",,False,Other,string,Sage Bionetworks,,OtherPlatformLocation,platformLocation,other,ValidValue
17,specifyMSAnalyzerTypeMS1,"When MSAnalyzerTypeMS1 = `other`, add your cus...",,False,Other,string,Sage Bionetworks,,OtherMsAnalyzerTypeMS1,msAnalyzerTypeMS1,other,ValidValue
19,specifyMSAnalyzerTypeMS2,"When MSAnalyzerTypeMS2 = `other`, add your cus...",,False,Other,string,Sage Bionetworks,,OtherMsAnalyzerTypeMS2,msAnalyzerTypeMS2,other,ValidValue
21,specifyMSAssayTechnique,"When MSAssayTechnique = `other`, add your cust...",,False,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,OtherMsAssayTechnique,msAssayTechnique,other,ValidValue
22,msAnalyteType,"When msAnalyteType = `other`, add your custom ...",,False,Other,string,Sage Bionetworks,A data contributor should be able to write in ...,TargetedMsTarget,msTarget,Targeted,ValidValue
23,specifyMSAnalyteType,"When MSAnalyteType = `other`, add your custom ...",,False,Other,string,Sage Bionetworks,,OtherMsAnalyteType,msAnalyteType,other,ValidValue
24,specifyMSInstrumentModel,"When MSInstrumentModel = `other`, add your cus...",,False,Other,string,Sage Bionetworks,,OtherMsInstrumentModel,msInstrumentModel,other,ValidValue


In [92]:
# Add other attributes to the list of valid values
p_df.loc[~p_df.replace("", np.nan)["Valid Values"].isna(), 'Valid Values'] = p_df.loc[~p_df.replace("", np.nan)["Valid Values"].isna()].apply(lambda x: re.sub(',+', ',', ','.join(x['Valid Values'] + ',' + p_df_others.loc[p_df_others['Parent'] == x['Attribute'], 'Attribute'].values)), axis = 1)

In [93]:
# Last bit of cleanup
p_df["Properties"] = "ManifestColumn"
p_df["Module"] = p_df["Attribute"].apply(
    lambda x: "Other" if bool(re.search("specify", x)) else "Metadata"
)


In [94]:
print("Shape of original data frame:", p_df.shape)
print("Shape of others data frame:", p_df_others.shape)

p_df_final = pd.concat([p_df, p_df_others]).reset_index(drop=True).replace("", np.nan)
p_df_final = p_df_final.set_index("Attribute")

print("Shape of final data frame:", p_df_final.shape)

p_df_final.info()

Shape of original data frame: (69, 9)
Shape of others data frame: (28, 12)
Shape of final data frame: (97, 11)
<class 'pandas.core.frame.DataFrame'>
Index: 97 entries, specimenID to OtherDataFile 
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Description   97 non-null     object
 1   Valid Values  8 non-null      object
 2   Required      97 non-null     bool  
 3   Module        97 non-null     object
 4   columnType    95 non-null     object
 5   Ontology      95 non-null     object
 6   Notes         32 non-null     object
 7   Properties    97 non-null     object
 8   DependsOn     28 non-null     object
 9   Parent        28 non-null     object
 10  OtherValue    28 non-null     object
dtypes: bool(1), object(10)
memory usage: 8.4+ KB


In [95]:
p_df_final["UsedIn"] = RFC

In [96]:
# Add proteomics template to the data model updated with the attributes in DependsOn
dm.loc[dm["Attribute"] == RFC, "DependsOn"] = (
    "Component," + "Filename," + ",".join(p_df["Attribute"])
)

dm.loc[dm["Attribute"] == "proteomics"]

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Ontology,UsedIn,columnType,Notes,multivalue
515,proteomics,Template used for contributing metadata to the...,,"Component,Filename,specimenID,sampleType,speci...",,True,Component,,,,Template,,,,,


In [97]:
dm['Module'].unique()

array(['ValidValues', 'Instrument', 'Assay', 'Unspecified', 'Metadata',
       'SampleType', 'BaseAnnotation', 'Analysis', 'Template',
       'Model Organism', 'Ontology', 'Tool', 'Unit', 'Consortium',
       'Other', 'Repository'], dtype=object)

In [190]:
dm = dm.set_index('Attribute')

In [191]:
dm.shape

(755, 15)

In [100]:
# combine duplicated rows
p_df_final = p_df_final.fillna("").astype(str).groupby(level=0).apply(
    lambda x: x.apply(lambda y: ",".join(y))
)

In [101]:
p_df_final.index.is_unique

True

In [102]:
# Update empty values with the new dataframe
dm.update(p_df_final, overwrite=False, errors = 'ignore')

In [129]:
dm['Description'].update(p_df_final['Description'])

In [130]:
dm.loc[dm['UsedIn'].str.contains(RFC, na = False)].shape

(65, 15)

In [167]:
# join in the new template
dm_final = pd.concat([dm, p_df_final])
dm_final.shape

(832, 16)

In [168]:
# Replace any other values to leave open for contributors to add
dm_final.loc["platformLocation", "Valid Values"] = np.nan
dm_final.loc["dataFile", "Valid Values"] = np.nan
dm_final.loc["acquisitionMode", "Valid Values"] = np.nan
dm_final.loc["acquisitionSoftware", "Valid Values"] = np.nan

In [169]:
# qa check
with pd.option_context('display.max_rows', None):
    display(dm_final[dm_final.index.duplicated(keep=False)].sort_index())

Unnamed: 0_level_0,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Ontology,UsedIn,columnType,Notes,multivalue,OtherValue
Attribute,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
OtherDataFile,"When DataFile = `other`, add your custom value...",,specifyOtherDataFile,ValidValue,False,dataFile,,,,Other,Sage Bionetworks,proteomics,string,,,
OtherDataFile,"When DataFile = `other`, add your custom value...",,specifyOtherDataFile,ValidValue,False,dataFile,,,,Other,Sage Bionetworks,Metabolomics,string,,,other
OtherMeasurementTechnique,"When MeasurementTechnique = `other`, add your ...",,specifyMeasurementTechnique,ValidValue,False,measurementTechnique,,,,Other,Sage Bionetworks,Metabolomics,string,A data contributor should be able to write in ...,,other
OtherMeasurementTechnique,"When MeasurementTechnique = `other`, add your ...",,specifyMeasurementTechnique,ValidValue,False,measurementTechnique,,,,Other,Sage Bionetworks,proteomics,string,A data contributor should be able to write in ...,,
OtherMsAnalyteType,"When MSAnalyteType = `other`, add your custom ...",,specifyMSAnalyteType,ValidValue,False,msAnalyteType,,,,Other,Sage Bionetworks,proteomics,string,A data contributor should be able to write in ...,,
OtherMsAnalyteType,"When MSAnalyteType = `other`, add your custom ...",,specifyMSAnalyteType,ValidValue,False,msAnalyteType,,,,Other,Sage Bionetworks,Metabolomics,string,,,other
OtherMsAnalyzerTypeMS1,"When MSAnalyzerTypeMS1 = `other`, add your cus...",,specifyMSAnalyzerTypeMS1,ValidValue,False,msAnalyzerTypeMS1,,,,Other,Sage Bionetworks,proteomics,string,,,
OtherMsAnalyzerTypeMS1,"When MSAnalyzerTypeMS1 = `other`, add your cus...",,specifyMSAnalyzerTypeMS1,ValidValue,False,msAnalyzerTypeMS1,,,,Other,Sage Bionetworks,Metabolomics,string,,,other
OtherMsAnalyzerTypeMS2,"When MSAnalyzerTypeMS2 = `other`, add your cus...",,specifyMSAnalyzerTypeMS2,ValidValue,False,msAnalyzerTypeMS2,,,,Other,Sage Bionetworks,Metabolomics,string,,,other
OtherMsAnalyzerTypeMS2,"When MSAnalyzerTypeMS2 = `other`, add your cus...",,specifyMSAnalyzerTypeMS2,ValidValue,False,msAnalyzerTypeMS2,,,,Other,Sage Bionetworks,proteomics,string,,,


In [170]:
apply_cols = [x for x in dm_final.columns if x not in ['Description']]

In [171]:
# join duplicate values and then get unique list. Then join for single string. Could break if there are commas in teh description or something
dm_final[apply_cols] = (
    dm_final[apply_cols]
    .fillna("")
    .astype(str)
    .groupby(level=0)
    .apply(
        lambda x: x.apply(
            lambda y: ",".join(np.unique(",".join(y).split(",")))
        ).str.strip(",| ")
    )
)

dm_final[["Module", "UsedIn"]] = (
    dm_final[["Module", "UsedIn"]]
    .fillna("")
    .apply(
        lambda c: c.str.split(",").apply(
            lambda x: ",".join(
                [y for y in x if y not in ["Unspecified", "Metabolomics Human"]]
            )
        )
    )
)

In [172]:
dm_final.loc["msTarget"]

Unnamed: 0_level_0,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Ontology,UsedIn,columnType,Notes,multivalue,OtherValue
Attribute,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
msTarget,Specifies whether or not a specific molecule(s...,"Not Specified, Not applicable, Not collected, ...",,ManifestColumn,True,ManifestColumn,,Sage Bionetworks,,Metadata,"DSLWG,Sage Bionetworks","Metabolomics,proteomics",string,,,
msTarget,Specifies whether or not a specific molecule(s...,"Not Specified, Not applicable, Not collected, ...",,ManifestColumn,True,ManifestColumn,,Sage Bionetworks,,Metadata,"DSLWG,Sage Bionetworks","Metabolomics,proteomics",string,,,


In [173]:
dm_final.shape

(832, 16)

In [192]:
# Check there are unique attributes
dm_final.index.is_unique

True

In [214]:
# remove duplicate rows
dm_final = dm_final[~dm_final.index.duplicated(keep="first")]

In [212]:
dm_final = dm_final.loc[~dm_final.duplicated(subset="Attribute", keep="first")]
dm_final = dm_final.set_index('Attribute')

In [215]:
dm_final.shape

(730, 15)

In [216]:
dm_final = dm_final.sort_index(key=lambda x: x.str.lower())

In [217]:
# no longer need column
dm_final = dm_final.drop(columns=["OtherValue"], errors='ignore')

In [218]:
# Write out data model
dm_final.to_csv('../EL.data.model.csv')