In [214]:
import pandas as pd
import numpy as np
import re

In [215]:
# Load the current data model; an "Unnamed: 0" column is dropped when present.
dm = pd.read_csv("../EL.data.model.csv")
dm = dm.drop(columns=["Unnamed: 0"], errors="ignore")

In [216]:
# Distinct values currently stored in the model's "Parent" column.
pd.unique(dm["Parent"])

array(['ValidValue', 'ManifestColumn', 'MeasurementUnit', 'Demographics',
       'BaseAnnotation', 'AnalysisType', 'Component', 'Ontology'],
      dtype=object)

In [217]:
# Catch-all answers; the same five strings are removed from "Valid Values"
# during the string clean-up of the RFC table.
other_values = [
    "Other",
    "Unknown",
    "Not collected",
    "Not applicable",
    "Not specified",
]

In [218]:
# Load the proteomics RFC table and rename its headers to the data-model column names.
p_df = pd.read_excel("../_data/RFC Tables/EL RFC_ Assay_proteomics Data Model.xlsx")

p_df = p_df.rename(
    columns={
        "key": "Attribute",
        "description": "Description",
        "required": "Required",
        "requires": "Module",
        "concept source ontology": "Ontology",
        "valid values": "Valid Values",
        "type": "columnType",
        "note": "Notes",
    }
)
p_df = p_df.fillna("")

# string clean up
# Do not need to explicitly add "other" type values. Will add back later for list type attributes
# Only whole entries (delimited by a comma, a newline or the cell boundary) are removed,
# so a value that merely contains one of these words is left untouched.
other_value_pattern = (
    r"(?<![^,\n])[^\S\n]*(?:"
    + "|".join(re.escape(value) for value in other_values)
    + r")[^\S\n]*(?![^,\n])"
)
p_df["Valid Values"] = (
    p_df["Valid Values"]
    .replace("n/a (unique to each data contributor)", "", regex=False)
    # zero-width characters are invisible, but would end up inside the valid values
    .str.replace("[\u200b\ufeff]", "", regex=True)
    .str.replace(other_value_pattern, "", regex=True)
    .str.strip(",")
)

# replace note type values
p_df["Valid Values"] = p_df["Valid Values"].apply(
    lambda x: ",".join(
        token.strip() for token in x.split(",") if "Possible values" not in token
    ).strip(",")
)

# Newlines become separators; `,+` collapses a run of any length (a single pass of
# `,,` -> `,` leaves `,,` behind whenever three or more commas are adjacent).
p_df = p_df.replace(r"\n", ",", regex=True).replace(r",+", ",", regex=True)
# The newline conversion can re-introduce a leading/trailing separator.
p_df["Valid Values"] = p_df["Valid Values"].str.strip(",")

# Anything containing an "f" (e.g. "false") is optional; everything else, blanks
# included, is required. str() + IGNORECASE also cover "False"/"FALSE" and real booleans.
p_df["Required"] = (
    p_df["Required"]
    .apply(lambda x: not re.search("f", str(x), flags=re.IGNORECASE))
    .astype(bool)
)

# errors="ignore" keeps the cell runnable when the sheet has no "multivalue" column.
p_df = p_df.drop(columns=["multivalue"], errors="ignore")

p_df

Unnamed: 0,Attribute,Description,Valid Values,Required,Module,columnType,Ontology,Notes
0,specimenID,Identifying string linked to a particular samp...,,True,"Biospecimen,proteomics",string,Sage Bionetworks,
1,sampleType,The type of sample collected,"Amniotic Fluid,Appendix,B cell,Basophils,Bone,...",True,proteomics,string,"Sage Bionetworks,ImmPort","The sample types are adopted from Uberon, Cell..."
2,specifySampleType,"If ""other"" is selected list the type of sample",,False,"proteomics,sampleType = other",string,Sage Bionetworks,
3,measurementTechnique,The measurement technique describing the assay...,"16S rRNA gene sequencing,1D Gel,2D Gel,Array,B...",True,proteomics,string,Sage Bionetworks,
4,specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,False,"proteomics,measurementTechnique = other",string,Sage Bionetworks,
5,technologyPlatformVersion,"The specific version (application, manufacture...","10x Visium Spatial Gene Expression,​​Affymetri...",True,proteomics,string,http://purl.obolibrary.org/obo/NCIT_C45378,
6,specifyPlatformVersion,"If ""other"" list the name of the platform version",,False,"proteomics,technologyPlatformVersion = other",string,Sage Bionetworks,
7,platformLocation,"The name of the laboratory, facility, vendor, ...",,True,proteomics,string,"Sage Bionetworks,DSLWG",
8,specifyPlatformLocation,"If ""other"" list the name of the platform location",,False,"proteomics,platformLocation = other",string,Sage Bionetworks,
9,msTarget,Specifies whether or not a specific molecule(s...,"Targeted,Untargeted",True,proteomics,string,"Sage Bionetworks,DSLWG",


In [219]:
# The proteomics template row depends on Component, Filename and every attribute of the RFC table.
is_proteomics_template = dm["Attribute"] == "proteomics"
rfc_attributes = ",".join(p_df["Attribute"])
dm.loc[is_proteomics_template, "DependsOn"] = "Component,Filename," + rfc_attributes

dm.loc[is_proteomics_template]

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Ontology,UsedIn,columnType
476,proteomics,Template used for contributing metadata to the...,,"Component,Filename,specimenID,sampleType,speci...",,True,Component,,,,Template,,,


In [220]:
# split the values out and create new attributes to add to the model.
# These will be the intermediate attributes that will allow the original column i.e. SampleType to then have the user fill in specifySampleType
p_df_others = p_df.loc[p_df["Module"].str.contains("=", na=False)].copy(deep=True)

# Keep every "<parent> = <value>" condition as its own list entry; joining them into a
# single string would make the explode below a no-op and corrupt the split on "=".
p_df_others["others"] = p_df_others["Module"].apply(
    lambda module: [part.strip() for part in module.split(",") if "=" in part]
)
# in case there are multiple equals signs: one row per condition
p_df_others = p_df_others.explode("others")

# Both sides of the condition are stripped so that "sampleType = other" yields
# "sampleType" / "other" without surrounding blanks.
condition_parts = p_df_others["others"].str.split("=")
p_df_others["Parent"] = condition_parts.str[0].str.strip()
p_df_others["OtherValue"] = condition_parts.str[1].str.strip()

# Generated attribute name: capitalised value + parent name with its first letter
# upper-cased, e.g. "other" + "sampleType" -> "OtherSampleType".
p_df_others["others"] = (
    p_df_others["OtherValue"].str.capitalize()
    + p_df_others["Parent"].str[:1].str.upper()
    + p_df_others["Parent"].str[1:]
)
# Swap others -> Attribute and Attribute -> DependsOn
p_df_others = p_df_others.rename(
    columns={"Attribute": "DependsOn", "others": "Attribute"}
)
p_df_others["Required"] = False
p_df_others["Module"] = "Other"
p_df_others["Valid Values"] = ""
p_df_others

Unnamed: 0,DependsOn,Description,Valid Values,Required,Module,columnType,Ontology,Notes,Attribute,Parent,OtherValue
2,specifySampleType,"If ""other"" is selected list the type of sample",,False,Other,string,Sage Bionetworks,,OtherSampleType,sampleType,other
4,specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,False,Other,string,Sage Bionetworks,,OtherMeasurementTechnique,measurementTechnique,other
6,specifyPlatformVersion,"If ""other"" list the name of the platform version",,False,Other,string,Sage Bionetworks,,OtherTechnologyPlatformVersion,technologyPlatformVersion,other
8,specifyPlatformLocation,"If ""other"" list the name of the platform location",,False,Other,string,Sage Bionetworks,,OtherPlatformLocation,platformLocation,other
11,specifyMSAnalyzerTypeMS1,"If ""other"" is selected list the name of the ma...",,False,Other,string,Sage Bionetworks,,OtherMsAnalyzerTypeMS1,msAnalyzerTypeMS1,other
13,specifyMSAnalyzerTypeMS2,"If ""other"" is selected list the name of the ma...",,False,Other,string,Sage Bionetworks,,OtherMsAnalyzerTypeMS2,msAnalyzerTypeMS2,other
15,specifyMSAssayTechnique,"If ""other"" is selected list the name of the ma...",,False,Other,string,Sage Bionetworks,,OtherMsAssayTechnique,msAssayTechnique,other
16,msAnalyteType,the type of biospecimen subjected to analysis,,False,Other,string,http://purl.obolibrary.org/obo/NCIT_C156434,,TargetedMsTarget,msTarget,Targeted
17,specifyMSAnalyteType,"If ""other"" list the type of analyte",,False,Other,string,Sage Bionetworks,,OtherMsAnalyteType,msAnalyteType,other
19,specifyProteomicsAssayType,"If ""other"" is selected list the name of the ty...",,False,Other,string,Sage Bionetworks,,OtherProteomicsAssayType,proteomicsAssayType,other


In [221]:
# Assign data modeling variables
# The description is built from the actual triggering condition (Parent = OtherValue).
# Deriving it from DependsOn with a hard-coded `other` mislabels rows such as
# msTarget = Targeted, whose dependent column is msAnalyteType.
p_df_others = p_df_others.assign(
    Description=(
        "When "
        + p_df_others["Parent"]
        + " = `"
        + p_df_others["OtherValue"]
        + "`, add your custom value to the cell"
    ),
    columnType="string",
    Ontology="Sage Bionetworks",
    Required=False,
    Properties="ValidValue",
)

In [222]:
# Add other attributes to the list of valid values
# Generated attribute names grouped by the attribute whose valid-value list they extend.
others_by_parent = p_df_others.groupby("Parent")["Attribute"].agg(list).to_dict()


def _with_other_values(row):
    """Return the row's valid values followed by its generated attributes.

    Existing values are always kept, including when the attribute has no generated
    children; a generated name that is already listed is not appended again, so
    re-running the cell does not duplicate entries.
    """
    values = [value for value in row["Valid Values"].split(",") if value]
    for generated in others_by_parent.get(row["Attribute"], []):
        if generated not in values:
            values.append(generated)
    return ",".join(values)


# Only attributes that already have a valid-value list are extended.
has_valid_values = p_df["Valid Values"].fillna("").ne("")
if has_valid_values.any():
    p_df.loc[has_valid_values, "Valid Values"] = p_df.loc[has_valid_values].apply(
        _with_other_values, axis=1
    )

In [223]:
# Distinct values currently stored in the model's "Module" column.
pd.unique(dm["Module"])

array(['ValidValues', 'Instrument', 'Assay', 'Unspecified', 'Metadata',
       'SampleType', 'BaseAnnotation', 'Analysis', 'Template',
       'Model Organism', 'Ontology', 'Unit', 'Project', 'Other'],
      dtype=object)

In [224]:
# Distinct values currently stored in the model's "Parent" column.
pd.unique(dm["Parent"])

array(['ValidValue', 'ManifestColumn', 'MeasurementUnit', 'Demographics',
       'BaseAnnotation', 'AnalysisType', 'Component', 'Ontology'],
      dtype=object)

In [225]:
# Last bit of cleanup
# NOTE(review): the unique() output above shows 'ManifestColumn' living in the model's
# "Parent" column, while this cell writes it to "Properties" - confirm which is intended.
p_df["Properties"] = "ManifestColumn"


def _module_for(attribute_name):
    """Return "Other" for names containing "specify", otherwise "Metadata"."""
    if "specify" in attribute_name:
        return "Other"
    return "Metadata"


p_df["Module"] = p_df["Attribute"].apply(_module_for)


In [226]:
# Stack the RFC attributes and the generated attributes, renumber the rows,
# and turn empty strings into missing values.
p_df_combined = pd.concat([p_df, p_df_others], ignore_index=True)
p_df_final = p_df_combined.replace("", np.nan)

In [227]:
# Every row assembled here is tagged with the proteomics template.
p_df_final = p_df_final.assign(UsedIn="proteomics")

In [228]:
# Key the new rows by attribute name so they can be matched against the model's index.
p_df_final = p_df_final.set_index(keys="Attribute")

In [229]:
# Key the existing model by attribute name as well.
# Not re-runnable: "Attribute" stops being a column once this has executed.
dm = dm.set_index(keys="Attribute")

In [230]:
# (rows, columns) of the existing model before the new rows are added.
len(dm.index), len(dm.columns)

(695, 13)

In [236]:
# Rows of the existing model whose attribute is redefined by the new table are
# dropped, then the new rows are appended.
redefined = dm.index.isin(p_df_final.index)
dm_kept = dm.loc[~redefined]
dm_final = pd.concat([dm_kept, p_df_final])
dm_final.shape

(713, 15)

In [237]:
# Show the newly added rows as they appear in the merged model.
dm_final.loc[p_df_final.index, :]

Unnamed: 0_level_0,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Ontology,UsedIn,columnType,Notes,OtherValue
Attribute,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
specimenID,Identifying string linked to a particular samp...,,,ManifestColumn,True,,,,,Metadata,Sage Bionetworks,proteomics,string,,
sampleType,The type of sample collected,"Amniotic Fluid,Appendix,B cell,Basophils,Bone,...",,ManifestColumn,True,,,,,Metadata,"Sage Bionetworks,ImmPort",proteomics,string,"The sample types are adopted from Uberon, Cell...",
specifySampleType,"If ""other"" is selected list the type of sample",,,ManifestColumn,False,,,,,Other,Sage Bionetworks,proteomics,string,,
measurementTechnique,The measurement technique describing the assay...,"16S rRNA gene sequencing,1D Gel,2D Gel,Array,B...",,ManifestColumn,True,,,,,Metadata,Sage Bionetworks,proteomics,string,,
specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,,ManifestColumn,False,,,,,Other,Sage Bionetworks,proteomics,string,,
technologyPlatformVersion,"The specific version (application, manufacture...","10x Visium Spatial Gene Expression,​​Affymetri...",,ManifestColumn,True,,,,,Metadata,http://purl.obolibrary.org/obo/NCIT_C45378,proteomics,string,,
specifyPlatformVersion,"If ""other"" list the name of the platform version",,,ManifestColumn,False,,,,,Other,Sage Bionetworks,proteomics,string,,
platformLocation,"The name of the laboratory, facility, vendor, ...",,,ManifestColumn,True,,,,,Metadata,"Sage Bionetworks,DSLWG",proteomics,string,,
specifyPlatformLocation,"If ""other"" list the name of the platform location",,,ManifestColumn,False,,,,,Other,Sage Bionetworks,proteomics,string,,
msTarget,Specifies whether or not a specific molecule(s...,"Targeted,Untargeted,TargetedMsTarget",,ManifestColumn,True,,,,,Metadata,"Sage Bionetworks,DSLWG",proteomics,string,,


In [241]:
# qa check: list every row whose attribute name occurs more than once in the index
repeated_index = dm_final.index.duplicated(keep=False)
duplicate_rows = dm_final[repeated_index].sort_index()
with pd.option_context('display.max_rows', None):
    display(duplicate_rows)

Unnamed: 0_level_0,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,Module,Ontology,UsedIn,columnType,Notes,OtherValue
Attribute,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1


In [242]:
# Count duplicated attribute names among rows that mention "proteomics"
# (case-insensitive) in any column. Column-wise str.contains replaces
# DataFrame.applymap, which is deprecated in recent pandas releases.
mentions_proteomics = (
    dm_final.astype(str)
    .apply(lambda column: column.str.contains("proteomics", case=False, regex=False))
    .any(axis=1)
)
dm_final[mentions_proteomics].index.duplicated().sum()

0

In [245]:
# "OtherValue" was only a helper column for building the generated attributes.
dm_final = dm_final.drop(labels="OtherValue", axis="columns")

In [246]:
# Write the merged model back to the same path it was loaded from; the index
# (attribute names) is written as the first column.
dm_final.to_csv("../EL.data.model.csv", index=True)