- Fix inconsistencies in parent column


In [2]:
import pandas as pd
import numpy as np
import os

from glob import glob
import yaml
import re
import copy
import json

# Functions


In [3]:
# Clean list columns into single string
def join_strings(string):
    try:
        return ','.join(string)
    except:
        return ''

In [4]:
def search_df(df, pattern):
    mask = np.column_stack([df[col].str.contains(
        pattern, na=False, flags=re.IGNORECASE) for col in df])

    df = df.loc[mask.any(axis=1)]

    with pd.option_context("display.max_colwidth", None):
        display(df)

    return df

In [5]:
def find_row(df, attribute):
    """ Get indexes of the dataframe"""
    indexes = df.index[df['Attribute'].str.contains(
        "(^" + re.escape(attribute) + "$)", flags=re.IGNORECASE)].tolist()
    if len(indexes) != 0:
        return indexes
    else:
        print(attribute)
        return None


def replace_valid_value(df, indexes, regex_dict, attribute):
    """ Alter the dataframe valid values with the replacement value """
    if indexes == None:
        return df

    elif len(indexes) > 0:
        regex_dict = regex_dict[attribute]

        for index in indexes:

            df.loc[index, 'Valid Values'] = re.sub(
                **regex_dict, string=df.loc[index, 'Valid Values'])

            # print(df.loc[index, 'Valid Values'])
        return df
    else:
        return df


def code_equals_values(df, regex_dict, attribute):

    print('attribute: ', attribute)

    indexes = find_row(df, attribute)

    print('Index: ', indexes)

    # regex = replace_value(df, replacements, indexes)

    # print("Regex: ", regex)
    if indexes == None:
        return df
    else:
        df = replace_valid_value(df, indexes, regex_dict, attribute)
        return df

In [6]:
base_cols = ['Attribute',
             'Description',
             'Valid Values',
             'DependsOn',
             'Properties',
             'Required',
             'Parent',
             'DependsOn Component',
             'Source',
             'Validation Rules']

In [7]:
# Unzip compressed folder if downloaded from Google Drive
# %unzip 'RFC Tables-20230620T181152Z-001.zip'

# Collect RFCs


In [8]:
with open("../configs/notebook_config.yaml", 'r') as f:
    config = yaml.safe_load(f)

In [9]:
# Get all the RFC file paths
file_paths = glob(
    r"C:\Users\nlee\Documents\Projects\ELITE-DCC\ELITE-data-models\RFC Tables\*")
file_paths

['C:\\Users\\nlee\\Documents\\Projects\\ELITE-DCC\\ELITE-data-models\\RFC Tables\\EL Assay_ bsSeq (bisulfite-seq_WGBS_methylseq_methylomics) data model.xlsx',
 'C:\\Users\\nlee\\Documents\\Projects\\ELITE-DCC\\ELITE-data-models\\RFC Tables\\EL Assay_ RNAseq data model.xlsx',
 'C:\\Users\\nlee\\Documents\\Projects\\ELITE-DCC\\ELITE-data-models\\RFC Tables\\EL Assay_ scRNAseq data model.xlsx',
 'C:\\Users\\nlee\\Documents\\Projects\\ELITE-DCC\\ELITE-data-models\\RFC Tables\\EL RFC Assay_Whole Genome Sequencing Data Model.xlsx',
 'C:\\Users\\nlee\\Documents\\Projects\\ELITE-DCC\\ELITE-data-models\\RFC Tables\\EL RFC genotyping_assay.xlsx',
 'C:\\Users\\nlee\\Documents\\Projects\\ELITE-DCC\\ELITE-data-models\\RFC Tables\\EL RFC Metabolomics Human Data Model.xlsx',
 'C:\\Users\\nlee\\Documents\\Projects\\ELITE-DCC\\ELITE-data-models\\RFC Tables\\EL RFC Microbiome Data Model.xlsx',
 'C:\\Users\\nlee\\Documents\\Projects\\ELITE-DCC\\ELITE-data-models\\RFC Tables\\EL RFC_ Assay_proteomics Data

Create Data Model for Schematic


In [10]:
# file_path = 'C:\\Users\\nlee\\Documents\\Projects\\ELITE-DCC\\RFC Tables\\EL RFC Assay_Whole Genome Sequencing Data Model.xlsx'
# file_name = os.path.basename(file_path)

# dm = pd.read_excel(file_path)


# # Create file_name column to check
# dm.insert(loc=0,
#           column='file_name',
#           value=file_name
#           )


# # Create new columnn for data model name
# dm.insert(loc=1,
#           column='dm',
#           value=re.sub("\s\s+", " ", re.sub("_", " ",
#                                             re.sub('(EL)|(RFC)|(\.xlsx)|([Aa]ssay)|([Dd]ata [Mm]odel)', "", file_name)).strip())
#           )

# dm.fillna("")
# dm.reset_index(drop=True, inplace=True)
# dm.head()

In [11]:
dm = pd.DataFrame()

# parse through files to create complete data model
for fp in file_paths:

    file_name = os.path.basename(fp)

    temp = pd.read_excel(fp)

    # Create file_name column to check
    temp.insert(loc=0,
                column='file_name',
                value=file_name
                )

    # Create new columnn for data model name
    temp.insert(loc=1,
                column='dm',
                value=re.sub("\s\s+", " ", re.sub("_", " ",
                                                  re.sub('(EL)|(RFC)|(\.xlsx)|([Aa]ssay)|([Dd]ata [Mm]odel)', "", file_name)).strip())
                )

    dm = pd.concat([dm, temp])

dm.fillna("")
dm.reset_index(drop=True, inplace=True)
dm.head()

Unnamed: 0,file_name,dm,key,description,valid values,required,requires,multivalue,type,concept source ontology,note,Unnamed: 9,Unnamed: 10,Unnamed: 8,ontology,term id
0,EL Assay_ bsSeq (bisulfite-seq_WGBS_methylseq_...,bsSeq (bisulfite-seq WGBS methylseq methylomics),specimenID,Identifying string linked to a particular samp...,n/a (unique to each data contributor),True,"Biospecimen,\nbsSeq",False,string,Sage Bionetworks,,,,,,
1,EL Assay_ bsSeq (bisulfite-seq_WGBS_methylseq_...,bsSeq (bisulfite-seq WGBS methylseq methylomics),sampleType,The type of sample collected,"Amniotic Fluid,\nAppendix,\nB cell,\nBasophils...",True,bsSeq,False,string,"Sage Bionetworks,\nImmPort","The sample types are adopted from Uberon, Cell...",,,,,
2,EL Assay_ bsSeq (bisulfite-seq_WGBS_methylseq_...,bsSeq (bisulfite-seq WGBS methylseq methylomics),specifySampleType,"If ""other"" is selected list the type of sample",,False,bsSeq\nsampleType = other,False,string,Sage Bionetworks,,,,,,
3,EL Assay_ bsSeq (bisulfite-seq_WGBS_methylseq_...,bsSeq (bisulfite-seq WGBS methylseq methylomics),measurementTechnique,The measurement technique describing the assay...,"16S rRNA gene sequencing,\n1D Gel,\n2D Gel,\nA...",True,bsSeq,False,string,Sage Bionetworks,,,,,,
4,EL Assay_ bsSeq (bisulfite-seq_WGBS_methylseq_...,bsSeq (bisulfite-seq WGBS methylseq methylomics),specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,False,"bsSeq,\nmeasurementTechnique = other",False,string,Sage Bionetworks,,,,,,


In [12]:
validation_coder = {
    'number': 'regex search ([0-9]+\.[0-9]*.?)|([0-9]+)',
    'integer': 'regex search ([0-9]+)',
    'string': ''
}

Data model clean up


In [13]:
dm[['required', 'multivalue']] = dm[[
    'required', 'multivalue']].fillna(False).astype(str)

In [14]:
dm = dm.apply(lambda x: x.str.replace(
    pat='\n|(n/a \(unique to each data contributor\))', repl='', regex=True).str.split(","), axis=1)

dm.head()

Unnamed: 0,file_name,dm,key,description,valid values,required,requires,multivalue,type,concept source ontology,note,Unnamed: 9,Unnamed: 10,Unnamed: 8,ontology,term id
0,[EL Assay_ bsSeq (bisulfite-seq_WGBS_methylseq...,[bsSeq (bisulfite-seq WGBS methylseq methylomi...,[specimenID],[Identifying string linked to a particular sam...,[],[True],"[Biospecimen, bsSeq]",[False],[string],[Sage Bionetworks],,,,,,
1,[EL Assay_ bsSeq (bisulfite-seq_WGBS_methylseq...,[bsSeq (bisulfite-seq WGBS methylseq methylomi...,[sampleType],[The type of sample collected],"[Amniotic Fluid, Appendix, B cell, Basophils, ...",[True],[bsSeq],[False],[string],"[Sage Bionetworks, ImmPort]","[The sample types are adopted from Uberon, Ce...",,,,,
2,[EL Assay_ bsSeq (bisulfite-seq_WGBS_methylseq...,[bsSeq (bisulfite-seq WGBS methylseq methylomi...,[specifySampleType],"[If ""other"" is selected list the type of sample]",,[False],[bsSeqsampleType = other],[False],[string],[Sage Bionetworks],,,,,,
3,[EL Assay_ bsSeq (bisulfite-seq_WGBS_methylseq...,[bsSeq (bisulfite-seq WGBS methylseq methylomi...,[measurementTechnique],[The measurement technique describing the assa...,"[16S rRNA gene sequencing, 1D Gel, 2D Gel, Arr...",[True],[bsSeq ],[False],[string],[Sage Bionetworks],,,,,,
4,[EL Assay_ bsSeq (bisulfite-seq_WGBS_methylseq...,[bsSeq (bisulfite-seq WGBS methylseq methylomi...,[specifyMeasurementTechnique],"[If ""other"" is selected list the name of the m...",,[False],"[bsSeq, measurementTechnique = other]",[False],[string],[Sage Bionetworks],,,,,,


In [15]:
# revert lists back to strings
dm = dm.applymap(lambda x: join_strings(x))

# Rename columns with DCA standards
dm_schema_cols = {
    "dm": "Parent",
    "key": "Attribute",
    "description": "Description",
    "valid values": "Valid Values",
    "required": "Required",
    'requires': "DependsOn Component",
    # "Parent": None,
    "concept source ontology": "Source"
    # "Validation Rules": None
}

dm = dm.rename(dm_schema_cols, axis=1)

# drop unimportant columns
r = re.compile("Unnamed*", re.IGNORECASE)

# drop_cols = list(filter(r.match, dm.columns.tolist())) + \
#     ['file_name', 'term id', 'ontology', 'multivalue', 'type', 'note']


# Add additional required columns for DCA
dm['Properties'] = ""
dm['Validation Rules'] = dm['type'].map(validation_coder)
dm['DependsOn'] = ""
# dm['DependsOn Component'] = ""

keep_cols = ['Parent', 'Attribute', 'Description',
             'Valid Values', 'Required', 'DependsOn', 'DependsOn Component', 'Properties', 'Validation Rules', 'Source']

dm = dm[keep_cols]

In [16]:
dm.head()

Unnamed: 0,Parent,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Source
0,bsSeq (bisulfite-seq WGBS methylseq methylomics),specimenID,Identifying string linked to a particular samp...,,True,,"Biospecimen,bsSeq",,,Sage Bionetworks
1,bsSeq (bisulfite-seq WGBS methylseq methylomics),sampleType,The type of sample collected,"Amniotic Fluid,Appendix,B cell,Basophils,Bone,...",True,,bsSeq,,,"Sage Bionetworks,ImmPort"
2,bsSeq (bisulfite-seq WGBS methylseq methylomics),specifySampleType,"If ""other"" is selected list the type of sample",,False,,bsSeqsampleType = other,,,Sage Bionetworks
3,bsSeq (bisulfite-seq WGBS methylseq methylomics),measurementTechnique,The measurement technique describing the assay...,"16S rRNA gene sequencing,1D Gel,2D Gel,Array,B...",True,,bsSeq,,,Sage Bionetworks
4,bsSeq (bisulfite-seq WGBS methylseq methylomics),specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,False,,"bsSeq,measurementTechnique = other",,,Sage Bionetworks


In [17]:
# Dropping measurement technique
dm = dm.drop(index=dm.query(
    'Attribute == "measurementTechnique"').index.values).reset_index(drop=True)

In [18]:
dm.reset_index(drop=True, inplace=True)

# QA Check
dm['Parent'].unique()

array(['bsSeq (bisulfite-seq WGBS methylseq methylomics)', 'RNAseq',
       'scRNAseq', 'Whole Genome Sequencing', 'genotyping',
       'Metabolomics Human', 'Microbiome', 'proteomics',
       'Biospecimen human', 'Biospecimen nonHuman', 'Individual Human',
       'Individual nonHuman'], dtype=object)

In [19]:
display(dm.dtypes)
display(dm.head())
display(dm.Parent.unique())

Parent                 object
Attribute              object
Description            object
Valid Values           object
Required               object
DependsOn              object
DependsOn Component    object
Properties             object
Validation Rules       object
Source                 object
dtype: object

Unnamed: 0,Parent,Attribute,Description,Valid Values,Required,DependsOn,DependsOn Component,Properties,Validation Rules,Source
0,bsSeq (bisulfite-seq WGBS methylseq methylomics),specimenID,Identifying string linked to a particular samp...,,True,,"Biospecimen,bsSeq",,,Sage Bionetworks
1,bsSeq (bisulfite-seq WGBS methylseq methylomics),sampleType,The type of sample collected,"Amniotic Fluid,Appendix,B cell,Basophils,Bone,...",True,,bsSeq,,,"Sage Bionetworks,ImmPort"
2,bsSeq (bisulfite-seq WGBS methylseq methylomics),specifySampleType,"If ""other"" is selected list the type of sample",,False,,bsSeqsampleType = other,,,Sage Bionetworks
3,bsSeq (bisulfite-seq WGBS methylseq methylomics),specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,False,,"bsSeq,measurementTechnique = other",,,Sage Bionetworks
4,bsSeq (bisulfite-seq WGBS methylseq methylomics),technologyPlatformVersion,"The specific version (application, manufacture...","0x Visium Spatial Gene Expression,​​Affymetrix...",True,,bsSeq,,,Sage Bionetworks


array(['bsSeq (bisulfite-seq WGBS methylseq methylomics)', 'RNAseq',
       'scRNAseq', 'Whole Genome Sequencing', 'genotyping',
       'Metabolomics Human', 'Microbiome', 'proteomics',
       'Biospecimen human', 'Biospecimen nonHuman', 'Individual Human',
       'Individual nonHuman'], dtype=object)

In [20]:
# Reorder Columns based on DCA Standards
dm = dm.loc[:, base_cols]

# Clean up

- Cleanup `DependsOn Component`


In [21]:
recoder = {
    'metabolmics': 'metabolomics',
    "(mass spec proteomics)": "Proteomics",
    '(mass spec metabolomics)': 'Metabolomics Human',
    "(assay_otheruseTreatment? = Yes)": "assay_other, useTreatment? = Yes",
    'OtherUnknown': 'Other, Unknown',
    'falseFalseFALSEtrueTrueTRUE': 'TRUE, FALSE',
    re.compile('Forwardreverse', flags=re.IGNORECASE): 'forward,reverse',
    re.compile('singleEndpairedEnd'): 'singleEnd, pairedEnd',
    re.compile('(WGS)'): 'Whole Genome Sequencing',
    re.compile('\?'): '',
    'Zeiss LSM 980Other': 'Zeiss LSM 980',
    'bsSeqsampleType = other': 'bsSeq, sampleType = other',
    re.compile('HPO, MONDO, MAXO codes or labels (not listed for purposes of this RFC)'): 'HPO and MONDO and MAXO codes or labels (not listed for purposes of this RFC)'
}

# 'mass spec metabolomics,measurementTechnique = other'
# falseFalseFALSEtrueTrueTRUE

In [22]:
with pd.option_context("display.max_colwidth", None):
    display(dm[dm['Valid Values'].str.contains("and Body Composition Study")])

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
335,cohort,Name of the cohort the individual belongs to,"Centenarian, US Family, Denmark Family, The Osteoporotic Fractures in Men (MrOS) Study, Study of Osteoporotic Fractures (SOF), The Health, Aging, and Body Composition Study (HealthABC), Cardiovascular Health Study (CHS),Other,Unknown,Not collected,Not applicable",,,True,Individual Human,Individual,Sage Bionetworks,


In [23]:
with pd.option_context("display.max_colwidth", None):
    display(dm.query('Attribute == "ethnicity"'))

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
344,ethnicity,Ethnicity of individual,"Not Hispanic or Latino, Hispanic or Latino, Prefer not to answer,Other,Unknown,Not collected,Not applicable",,,True,Individual Human,Individual,"Sage Bionetworks,https://www.synapse.org/#!Synapse:syn25878249",


In [24]:
# Inconsistency with parent and depends on component
dm['DependsOn Component'] = dm['DependsOn Component'].replace(
    recoder, regex=True)

dm['Valid Values'] = dm['Valid Values'].replace(
    recoder, regex=True)

In [25]:
dm = dm.apply(lambda x: x.replace(recoder, regex=True), axis=1)

In [26]:
# QA check
dm[dm['DependsOn Component'].str.contains(
    'metabolomics', case=False, na=False)]

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
161,specimenID,Identifying string linked to a particular samp...,,,,True,Metabolomics Human,"Biospecimen,Metabolomics Human",Sage Bionetworks,
162,sampleType,The type of sample collected,"Amniotic Fluid,Appendix,B cell,Basophils,Bone,...",,,True,Metabolomics Human,Metabolomics Human,"Sage Bionetworks,ImmPort",
163,specifySampleType,"If ""other"" is selected list the type of sample",,,,False,Metabolomics Human,"Metabolomics Human,sampleType = other",Sage Bionetworks,
164,specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,,,False,Metabolomics Human,Metabolomics HumanmeasurementTechnique = other,Sage Bionetworks,
165,technologyPlatformVersion,"The specific version (application, manufacture...","Other, Unknown,Not collected,Not applicable,No...",,,True,Metabolomics Human,Metabolomics Human,http://purl.obolibrary.org/obo/NCIT_C45378,
...,...,...,...,...,...,...,...,...,...,...
222,processingBatchID,"Processing batch identifier, provided by the d...",,,,False,Metabolomics Human,Metabolomics Human,Sage Bionetworks,
223,processingBatchSize,The number of samples,,,,False,Metabolomics Human,Metabolomics Human,Sage Bionetworks,
224,processingBatchSizeUnit,The unit of measurement for number of samples ...,"AFU,AI,AU/ml,DK units/ml,bpg/dl,g/l,gm,HAU,IU,...",,,False,Metabolomics Human,"Metabolomics Human,processingBatchSize",Sage Bionetworks,
225,specifyProcessingBatchSizeUnit,"If ""other"" list units of measure",,,,False,Metabolomics Human,"Metabolomics Human,ProcessingBatchSizeUnit = o...",Sage Bionetworks,


## Cleaning other values and equal values

Removing illegal characters


Remove any special characters


In [27]:
old_values = dm[dm['Attribute'].astype(
    str).str.contains("\(|\)|\?", na=False)]['Attribute'].unique().tolist()

old_values

['reagentID(s)', 'treatmentID(s)']

In [28]:
new_values = [re.sub("\(|\)|\?", "", t) for t in old_values]
new_values

['reagentIDs', 'treatmentIDs']

In [29]:
# dictionary of replacement values for data model
old_values = [re.compile(re.escape(ov)) for ov in old_values]

original_recoder = dict(zip(old_values, new_values))

original_recoder

{re.compile(r'reagentID\(s\)', re.UNICODE): 'reagentIDs',
 re.compile(r'treatmentID\(s\)', re.UNICODE): 'treatmentIDs'}

Clean up equals in depends on


In [30]:
# Split list to process other values
# Find the other columns in the data model
others = dm[dm['DependsOn Component'].str.contains(
    "=", na=False)].copy()

In [31]:
others['DependsOn Component Original'] = others['DependsOn Component'].str.split(
    ',')

In [32]:
# Create series of equals values to use for new attributes/ valid values relationship
others['equals_series'] = others['DependsOn Component Original'].apply(
    lambda x: [y for y in x if bool(re.search("=", y))][0])

others

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules,DependsOn Component Original,equals_series
2,specifySampleType,"If ""other"" is selected list the type of sample",,,,False,bsSeq (bisulfite-seq WGBS methylseq methylomics),"bsSeq, sampleType = other",Sage Bionetworks,,"[bsSeq, sampleType = other]",sampleType = other
3,specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,,,False,bsSeq (bisulfite-seq WGBS methylseq methylomics),"bsSeq,measurementTechnique = other",Sage Bionetworks,,"[bsSeq, measurementTechnique = other]",measurementTechnique = other
5,specifyPlatformVersion,"If ""other"" list the name of the platform version",,,,False,bsSeq (bisulfite-seq WGBS methylseq methylomics),"bsSeq,technologyPlatformVersion = other",Sage Bionetworks,,"[bsSeq, technologyPlatformVersion = other]",technologyPlatformVersion = other
7,specifyPlatformLocation,"If ""other"" list the name of the platform location",,,,False,bsSeq (bisulfite-seq WGBS methylseq methylomics),"bsSeq,platformLocation = other",Sage Bionetworks,,"[bsSeq, platformLocation = other]",platformLocation = other
13,specifyDNABatchSizeUnit,"If ""other"" list unit of measure",,,,False,bsSeq (bisulfite-seq WGBS methylseq methylomics),"bsSeq,dnaBatchSizeUnit = other",Sage Bionetworks,,"[bsSeq, dnaBatchSizeUnit = other]",dnaBatchSizeUnit = other
...,...,...,...,...,...,...,...,...,...,...,...,...
294,specifyOtherModificationParameters,"If ""other"" is selected, list the custom modifi...",,,,False,proteomics,"Proteomics,modificationParameters = other",Sage Bionetworks,,"[Proteomics, modificationParameters = other]",modificationParameters = other
296,specifyOtherCleavageAgent,"If ""other"" is selected, list the enzyme name.",,,,False,proteomics,"Proteomics,cleavageAgents = other",Sage Bionetworks,,"[Proteomics, cleavageAgents = other]",cleavageAgents = other
345,ethnicGroupCode,A coded value specifying the self-declared eth...,,,,False,Individual Human,"Individual,ethnicity = Hispanic or Latino",https://ncithesaurus.nci.nih.gov/ncitbrowser/p...,,"[Individual, ethnicity = Hispanic or Latino]",ethnicity = Hispanic or Latino
348,diagnosis,Indicate the disease or condition.,"HPO, MONDO, MAXO codes or labels (not listed f...",,,False,Individual Human,"Individual,diagnosisStatus = true","https://www.ebi.ac.uk/ols/ontologies/mondo,htt...",,"[Individual, diagnosisStatus = true]",diagnosisStatus = true


In [33]:
def create_new_value(old_value):
    new_vals = old_value.split('=')
    new_vals = [nv.strip() for nv in new_vals]
    # convert to camel case
    nv = new_vals[1].capitalize() + new_vals[0][0].upper()+new_vals[0][1:]
    return nv

In [34]:
others['equals_attribute'] = others['equals_series'].apply(create_new_value)

In [35]:
others['DependsOn Component'] = ''

In [36]:
equals_df = others.copy()

In [37]:
equals_df = equals_df.drop(columns='DependsOn')

In [38]:
equals_df = equals_df.rename({
    'Attribute': 'DependsOn',
    'equals_attribute': 'Attribute'
}, axis=1)[base_cols]

equals_df['DependsOn Component'] = ''

equals_df

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
2,OtherSampleType,"If ""other"" is selected list the type of sample",,specifySampleType,,False,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,
3,OtherMeasurementTechnique,"If ""other"" is selected list the name of the me...",,specifyMeasurementTechnique,,False,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,
5,OtherTechnologyPlatformVersion,"If ""other"" list the name of the platform version",,specifyPlatformVersion,,False,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,
7,OtherPlatformLocation,"If ""other"" list the name of the platform location",,specifyPlatformLocation,,False,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,
13,OtherDnaBatchSizeUnit,"If ""other"" list unit of measure",,specifyDNABatchSizeUnit,,False,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,
...,...,...,...,...,...,...,...,...,...,...
294,OtherModificationParameters,"If ""other"" is selected, list the custom modifi...",,specifyOtherModificationParameters,,False,proteomics,,Sage Bionetworks,
296,OtherCleavageAgents,"If ""other"" is selected, list the enzyme name.",,specifyOtherCleavageAgent,,False,proteomics,,Sage Bionetworks,
345,Hispanic or latinoEthnicity,A coded value specifying the self-declared eth...,,ethnicGroupCode,,False,Individual Human,,https://ncithesaurus.nci.nih.gov/ncitbrowser/p...,
348,TrueDiagnosisStatus,Indicate the disease or condition.,"HPO, MONDO, MAXO codes or labels (not listed f...",diagnosis,,False,Individual Human,,"https://www.ebi.ac.uk/ols/ontologies/mondo,htt...",


# Update base attribute equals values


In [39]:
others['base_attribute'] = others['equals_series'].str.split("=", expand=True)[
    0]

In [40]:
others['Properties'] = 'dataProperty'

In [41]:
# dm[dm['Parent'].str.contains('bsSeq ', regex=True)]

In [42]:
temp = others['equals_series'].str.split("=", expand=True)
temp = temp.apply(lambda x: x.str.strip(), axis=1)
temp = temp.rename({0: 'base_attribute', 1: 'value_to_replace'}, axis=1)
temp['value_to_replace'] = temp['value_to_replace'].str.capitalize()

# new value
temp['new_value'] = others['equals_attribute']

temp.reset_index(drop=True, inplace=True)

temp

Unnamed: 0,base_attribute,value_to_replace,new_value
0,sampleType,Other,OtherSampleType
1,measurementTechnique,Other,OtherMeasurementTechnique
2,technologyPlatformVersion,Other,OtherTechnologyPlatformVersion
3,platformLocation,Other,OtherPlatformLocation
4,dnaBatchSizeUnit,Other,OtherDnaBatchSizeUnit
...,...,...,...
112,modificationParameters,Other,OtherModificationParameters
113,cleavageAgents,Other,OtherCleavageAgents
114,ethnicity,Hispanic or latino,Hispanic or latinoEthnicity
115,diagnosisStatus,True,TrueDiagnosisStatus


In [43]:
temp = temp.drop_duplicates()

In [44]:
temp[temp['new_value'] == 'YesHasIonizationSource?']

Unnamed: 0,base_attribute,value_to_replace,new_value


In [45]:
replacements = {}

for i, x in temp.iterrows():
    ba, vtr, nv = x
    replacements[ba] = {
        'pattern': re.compile("(" + vtr + ")", flags=re.IGNORECASE),
        'repl': nv}

replacements

{'sampleType': {'pattern': re.compile(r'(Other)', re.IGNORECASE|re.UNICODE),
  'repl': 'OtherSampleType'},
 'measurementTechnique': {'pattern': re.compile(r'(Other)',
  re.IGNORECASE|re.UNICODE),
  'repl': 'OtherMeasurementTechnique'},
 'technologyPlatformVersion': {'pattern': re.compile(r'(Other)',
  re.IGNORECASE|re.UNICODE),
  'repl': 'OtherTechnologyPlatformVersion'},
 'platformLocation': {'pattern': re.compile(r'(Other)',
  re.IGNORECASE|re.UNICODE),
  'repl': 'OtherPlatformLocation'},
 'dnaBatchSizeUnit': {'pattern': re.compile(r'(Other)',
  re.IGNORECASE|re.UNICODE),
  'repl': 'OtherDnaBatchSizeUnit'},
 'libraryPrep': {'pattern': re.compile(r'(Other)', re.IGNORECASE|re.UNICODE),
  'repl': 'OtherLibraryPrep'},
 'libraryPreparationMethod': {'pattern': re.compile(r'(Other)',
  re.IGNORECASE|re.UNICODE),
  'repl': 'OtherLibraryPreparationMethod'},
 'libraryVersion': {'pattern': re.compile(r'(Other)',
  re.IGNORECASE|re.UNICODE),
  'repl': 'OtherLibraryVersion'},
 'sequencingBatchSiz

In [46]:
for attribute in replacements.keys():
    dm = code_equals_values(dm, replacements, attribute)
    print("-" * 20)

attribute:  sampleType
Index:  [1, 47, 73, 110, 141, 162, 228, 260]
--------------------
attribute:  measurementTechnique
measurementTechnique
Index:  None
--------------------
attribute:  technologyPlatformVersion
Index:  [4, 50, 76, 113, 165, 231, 263]
--------------------
attribute:  platformLocation
Index:  [6, 52, 78, 115, 167, 233, 265]
--------------------
attribute:  dnaBatchSizeUnit
Index:  [12, 244]
--------------------
attribute:  libraryPrep
Index:  [15, 96, 127, 246]
--------------------
attribute:  libraryPreparationMethod
Index:  [17, 25, 61, 98, 129, 248]
--------------------
attribute:  libraryVersion
Index:  [19, 27, 63, 100, 131, 250]
--------------------
attribute:  sequencingBatchSizeUnit
Index:  [23, 94, 240]
--------------------
attribute:  readLengthUnits
Index:  [32, 68, 105, 136, 255]
--------------------
attribute:  repositoryName
Index:  [55, 81, 118]
--------------------
attribute:  transcriptType
Index:  [57, 83, 120]
--------------------
attribute:  rnaBa

  after removing the cwd from sys.path.


In [47]:
# with pd.option_context('display.max_rows', None, 'display.max_colwidth', 0):
#     display(
#         dm[dm['Attribute'].str.contains(
#             'proteomicsAssayType', flags=re.IGNORECASE)]
#     )

In [48]:
equals_df['Properties'] = 'validValue'

# Add new attributes


In [49]:
dm['Properties'] = 'dataProperty'

In [50]:
dm.update(others[base_cols])

In [51]:
dm = pd.concat([dm, equals_df], ignore_index=True)

In [52]:
# Do not need
dm['DependsOn Component'] = ''

# Clean Up Valid Values and Add New Attributes


In [53]:
# valid values that contain other
pure_others = dm[dm['Valid Values'].str.contains(
    '([Oo]ther$)', flags=re.IGNORECASE)]

  This is separate from the ipykernel package so we can avoid doing imports until


In [54]:
pure_others['replacement_value'] = pure_others['Attribute'].apply(
    lambda x: "Other" + (x[0].upper() + x[1:]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [55]:
regex_dict = {}

pattern = '([Oo]ther$)'

for i, r in pure_others.iterrows():
    regex_dict[r['Attribute']] = {'pattern': re.compile(
        pattern, flags=re.IGNORECASE), 'repl': r['replacement_value']}

In [56]:
temp_dict = {}

for i, r in pure_others.iterrows():
    temp_dict[r['Attribute']] = {'pattern': str(re.compile(
        pattern, flags=re.IGNORECASE)), 'repl': r['replacement_value']}

json_formatted = json.dumps(temp_dict, indent=4)
print(json_formatted)

{
    "conversionRatioUnits": {
        "pattern": "re.compile('([Oo]ther$)', re.IGNORECASE)",
        "repl": "OtherConversionRatioUnits"
    },
    "vacuumPressureUnit": {
        "pattern": "re.compile('([Oo]ther$)', re.IGNORECASE)",
        "repl": "OtherVacuumPressureUnit"
    },
    "lensVoltagesUnit ": {
        "pattern": "re.compile('([Oo]ther$)', re.IGNORECASE)",
        "repl": "OtherLensVoltagesUnit "
    },
    "experiementalBatchSizeUnit": {
        "pattern": "re.compile('([Oo]ther$)', re.IGNORECASE)",
        "repl": "OtherExperiementalBatchSizeUnit"
    },
    "YesHasIonizationSource": {
        "pattern": "re.compile('([Oo]ther$)', re.IGNORECASE)",
        "repl": "OtherYesHasIonizationSource"
    }
}


In [57]:
for attribute in regex_dict.keys():
    dm = code_equals_values(dm, regex_dict, attribute)
    print("-" * 20)

attribute:  conversionRatioUnits
Index:  [42]
--------------------
attribute:  vacuumPressureUnit
Index:  [196]
--------------------
attribute:  lensVoltagesUnit 
Index:  [199]
--------------------
attribute:  experiementalBatchSizeUnit
Index:  [207]
--------------------
attribute:  YesHasIonizationSource
Index:  [432, 433, 434, 436, 437, 439, 440]
--------------------


  after removing the cwd from sys.path.


In [58]:
dm['Valid Values'] = dm['Valid Values'].str.replace(
    re.escape('(falseFalseFALSEtrueTrueTRUE)'), '', regex=True)

In [59]:
# with pd.option_context('display.max_rows', None, 'display.max_colwidth', 0):
#     display(dm[dm['Valid Values'].str.contains(
#         re.escape('(not listed for purposes of this RFC)'))].sort_values(by='Source'))

In [60]:
def create_list_from_lists(main_list, new_list):
    try:
        main_list = main_list + list
    except:
        pass
    return main_list

In [61]:
split_valid_values = dm['Valid Values'].str.split(',')
new_valid_values = [i for i in split_valid_values]

In [62]:
test = []
for nv in new_valid_values:
    # print(nv)
    try:
        test = test + nv
    except:
        pass

test = list(np.unique(test))

In [63]:
valid_values = ','.join(dm['Valid Values'])
valid_values = valid_values.split(",")
valid_values = list(np.unique(valid_values))
valid_values = [v.strip() for v in valid_values if len(v) > 0]

In [64]:
# Fuzzy matching to find misspellings
# Fuzzy matching
from thefuzz import fuzz

scores = {}
for v in valid_values:
    scores[v] = {}
    for v2 in valid_values:
        if v == v2:
            next
        else:
            score = fuzz.ratio(v.lower(), v2.lower())
            if score == 100:
                scores[v][v2] = score
    if len(scores[v]) == 0:
        scores.pop(v)

scores

{'FALSE': {'False': 100},
 'False': {'FALSE': 100},
 'Lipid': {'lipid': 100},
 'Not Specified': {'Not specified': 100},
 'Not specified': {'Not Specified': 100},
 'Plasma': {'plasma': 100},
 'Protein': {'protein': 100},
 'Saliva': {'saliva': 100},
 'Serum': {'serum': 100},
 'Sputum': {'sputum': 100},
 'TRUE': {'True': 100},
 'True': {'TRUE': 100},
 'Urine': {'urine': 100},
 'lipid': {'Lipid': 100},
 'plasma': {'Plasma': 100},
 'protein': {'Protein': 100},
 'saliva': {'Saliva': 100},
 'serum': {'Serum': 100},
 'sputum': {'Sputum': 100},
 'urine': {'Urine': 100}}

In [65]:
new_values_recoded = []
for v in scores.values():
    new_values_recoded.append(list(v.keys())[0].lower())

new_values_recoded = np.unique(new_values_recoded)

recoder_valid_values = []
for nv in new_values_recoded:
    value_add = {re.compile(nv, flags=re.IGNORECASE): nv}
    recoder_valid_values.append(value_add)

In [66]:
recoder_valid_values

[{re.compile(r'false', re.IGNORECASE|re.UNICODE): 'false'},
 {re.compile(r'lipid', re.IGNORECASE|re.UNICODE): 'lipid'},
 {re.compile(r'not specified', re.IGNORECASE|re.UNICODE): 'not specified'},
 {re.compile(r'plasma', re.IGNORECASE|re.UNICODE): 'plasma'},
 {re.compile(r'protein', re.IGNORECASE|re.UNICODE): 'protein'},
 {re.compile(r'saliva', re.IGNORECASE|re.UNICODE): 'saliva'},
 {re.compile(r'serum', re.IGNORECASE|re.UNICODE): 'serum'},
 {re.compile(r'sputum', re.IGNORECASE|re.UNICODE): 'sputum'},
 {re.compile(r'true', re.IGNORECASE|re.UNICODE): 'true'},
 {re.compile(r'urine', re.IGNORECASE|re.UNICODE): 'urine'}]

# Cleanup valid values


In [67]:
def clean_list(x):
    try:
        result = ','.join([y.strip() for y in x])
        return result
    except:
        return ''

In [68]:
recoder_valid_values = {
    re.compile('Not Specified', flags=re.IGNORECASE): 'Not Specified',
    re.compile('(Other$)', flags=re.IGNORECASE): "Other",
    re.compile('lipid', re.IGNORECASE): 'Lipid',
    re.compile('plasma', re.IGNORECASE): 'Plasma',
    re.compile('protein', re.IGNORECASE): 'Protein',
    re.compile('saliva', re.IGNORECASE): 'Saliva',
    re.compile('serum', re.IGNORECASE): 'Serum',
    re.compile('sputum', re.IGNORECASE): 'Sputum',
    re.compile('urine', re.IGNORECASE): 'Urine',
    re.compile('(^0x Visium Spatial Gene Expression)'): '10x Visium Spatial Gene Expression',
    re.compile('falseFalseFALSEtrueTrueTRUE	'): 'TRUE, FALSE',
    re.compile('TRUE|TRUEDiagnosisStatus', re.IGNORECASE): 'TRUE',
    re.compile('TRUEDiagnosisStatus', re.IGNORECASE): 'TRUE',
    re.compile('FALSE', re.IGNORECASE): 'FALSE',
    re.compile('$f^', re.IGNORECASE): 'F',
    re.compile('UnknownNot collected'): 'Unknown, Not collected',
    re.compile(r"\u200b\u200b"): "",
    re.compile("The Health,Aging,and Body Composition Study \(HealthABC\)"): "The Health and Aging and Body Composition Study (HealthABC)",
    re.compile('Not Hispanic or latinoEthnicity'): 'Not Hispanic or latino',
    re.compile('Hispanic or latinoEthnicity'): 'Hispanic or latino',
    re.compile('HPO, MONDO, MAXO codes or labels \(not listed for purposes of this RFC\)'): 'HPO and MONDO and MAXO codes or labels (not listed for purposes of this RFC)'
}

In [69]:
dm['Valid Values'] = dm['Valid Values'].replace(recoder_valid_values, regex=True).str.split(
    ',').apply(lambda x: clean_list(x))

valid_values = list(np.unique(','.join(test).split(',')))

valid_values = [v.strip() for v in valid_values if len(v) > 0]

valid_values_df = pd.DataFrame(
    {
        'Attribute': pd.Series(valid_values),
        'Properties': pd.Series(['validValue' for v in valid_values]),
        'Required': pd.Series(['False' for v in valid_values])
    }
)

valid_values_df = valid_values_df[~valid_values_df['Attribute'].isin(
    dm['Attribute'].tolist())]

valid_values_df

Unnamed: 0,Attribute,Properties,Required
0,Aging,validValue,False
1,Baker pool,validValue,False
2,Cardiovascular Health Study (CHS),validValue,False
3,Denmark Family,validValue,False
4,FALSE,validValue,False
...,...,...,...
439,ul,validValue,False
440,umol/l,validValue,False
441,units/ml,validValue,False
442,urine,validValue,False


In [70]:
# adding valid values found in attribute columns
dm2 = pd.concat([dm, valid_values_df], axis=0, ignore_index=True)

In [71]:
# Fuzzy matching to find misspellings
# Fuzzy matching

from thefuzz import fuzz

valid_values = dm2['Attribute'].replace(
    recoder_valid_values, regex=True).tolist()

scores = {}
for v in valid_values:
    scores[v] = {}
    for v2 in valid_values:
        if v == v2:
            next
        else:
            score = fuzz.ratio(v.lower(), v2.lower())
            if score == 100:
                scores[v][v2] = score
    if len(scores[v]) == 0:
        scores.pop(v)

scores

{'f': {'F': 100}, 'F': {'f': 100}}

In [72]:
dm2[['Attribute', 'DependsOn']] = dm2[['Attribute', 'DependsOn']].apply(
    lambda x: x.replace(recoder_valid_values, regex=True))

In [73]:
dm2[dm2.duplicated(subset=['Attribute', 'Parent', 'DependsOn'], keep=False)
    ].sort_values(by=['Parent', 'Attribute'])

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
367,OtherLibraryPreparationMethod,"If ""other"" list the name of the library prepar...",,specifyLibraryPreparationMethod,validValue,False,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,
370,OtherLibraryPreparationMethod,"If ""other"" list the name of the library prepar...",,specifyLibraryPreparationMethod,validValue,False,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,
368,OtherLibraryVersion,"If ""other"" list the name of the library version",,specifyLibraryVersion,validValue,False,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,
371,OtherLibraryVersion,"If ""other"" list the name of the library version",,specifyLibraryVersion,validValue,False,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,
9,dnaBatchID,DNA isolation batch,"Other,Unknown,Not collected,Not applicable",,dataProperty,True,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,
10,dnaBatchID,"DNA isolation batch identifier, provided by th...","Unknown,Not collected,Not applicable,Not Speci...",,dataProperty,True,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,
17,libraryPreparationMethod,Method by which library was prepared,"10x,Accel-NGS 2S Plus,Accel-NGS Methyl-Seq,CEL...",,dataProperty,True,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,
25,libraryPreparationMethod,Method by which library was prepared,"10x,Accel-NGS 2S Plus,Accel-NGS Methyl-Seq,CEL...",,dataProperty,True,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,
19,libraryVersion,"Library Version: for example, rnaSeq 10x libra...","OtherLibraryVersion,Unknown,Not collected,Not ...",,dataProperty,True,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,
27,libraryVersion,"Library Version: for example, rnaSeq 10x libra...","OtherLibraryVersion,Unknown,Not collected,Not ...",,dataProperty,True,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,


In [74]:
dm2.query('Attribute == "UnknownNot collected"')

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules


In [75]:
# print(dm2.shape)

# dm2 = dm2.drop_duplicates(
#     subset=['Attribute', 'Parent', 'DependsOn']).reset_index(drop=True)

# display(dm2.shape)

In [76]:
# with pd.option_context('display.max_rows', None, 'display.max_colwidth', 0):
#     display(dm[dm['Attribute'] == 'sampleType'])

# Check columns for Speical Characters


In [77]:
check_cols = ['Attribute']

mask = np.column_stack([dm2[col].str.contains(
    "\(|\)", na=False) for col in dm2[check_cols]])

with pd.option_context("display.max_colwidth", None):
    display(dm2[check_cols].loc[mask.any(axis=1)])

Unnamed: 0,Attribute
144,reagentID(s)
152,treatmentID(s)
480,Cardiovascular Health Study (CHS)
484,MAXO codes or labels (not listed for purposes of this RFC)
487,Study of Osteoporotic Fractures (SOF)
489,The Osteoporotic Fractures in Men (MrOS) Study
492,and Body Composition Study (HealthABC)
531,CCF-BSO values (not listed for purposes of this RFC)
537,Cerebrospinal Fluid (CSF)
578,Genbank common names (not listed for purposes of this RFC)Unknown


# Create Manifests in data model


In [78]:
# with pd.option_context('display.max_rows', None, 'display.max_colwidth', 0):
#     display(dm2.query('Properties == "dataProperty"').groupby(
#         'Parent').agg({'Attribute': list}).reset_index())

In [79]:
# Create the manifest templates based on the tables
manifests = dm2.query('Properties == "dataProperty"').groupby(
    'Parent').agg({'Attribute': list}).reset_index()

manifests['Attribute'] = manifests['Attribute'].apply(
    lambda x: x + ['Component'])

manifests['Required'] = 'True'
manifests['Properties'] = 'dataType'

manifests = manifests.rename(
    {'Attribute': 'DependsOn', 'Parent': 'Attribute'}, axis=1)
manifests['DependsOn'] = manifests['DependsOn'].apply(lambda x: ','.join(x))


# Store the data types in a list
dataTypes = manifests['Attribute'].tolist()

In [80]:
dm2 = dm2[~dm2['Attribute'].isin(dataTypes)]

dm2 = pd.concat([dm2, manifests], axis=0, ignore_index=True)

print(dm2.shape)

(904, 10)


In [81]:
dm2[dm2.duplicated(
    subset=['Attribute', 'DependsOn', 'Properties', 'Parent'], keep=False)].sort_values(by='Attribute')

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
497,10x Visium Spatial Gene Expression,,,,validValue,False,,,,
495,10x Visium Spatial Gene Expression,,,,validValue,False,,,,
568,FALSE,,,,validValue,False,,,,
483,FALSE,,,,validValue,False,,,,
482,FALSE,,,,validValue,False,,,,
825,Lipid,,,,validValue,False,,,,
639,Lipid,,,,validValue,False,,,,
690,Not Specified,,,,validValue,False,,,,
687,Not Specified,,,,validValue,False,,,,
367,OtherLibraryPreparationMethod,"If ""other"" list the name of the library prepar...",,specifyLibraryPreparationMethod,validValue,False,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,


In [82]:
# Another typo
dm2['Valid Values'] = dm2['Valid Values'].str.replace(
    'falseFalseFALSEtrueTrueTRUE', 'TRUE,FALSE')

In [83]:
dm2[dm2['Attribute'] == 'RNAseq']['DependsOn'].values[0].split(',')

['specimenID',
 'sampleType',
 'specifySampleType',
 'specifyMeasurementTechnique',
 'technologyPlatformVersion',
 'specifyPlatformVersion',
 'platformLocation',
 'specifyPlatformLocation',
 'referenceTranscriptID',
 'repositoryName',
 'specifyRepositoryName',
 'transcriptType',
 'specifyTranscriptType',
 'resultUnit',
 'valueReported',
 'libraryPreparationMethod',
 'specifyLibraryPreparationMethod',
 'libraryVersion',
 'specifyLibraryVersion',
 'isStranded',
 'readStrandOrigin',
 'readLength',
 'readLengthUnits',
 'specifyReadLengthUnits',
 'runType',
 'totalReads',
 'Component']

In [84]:
temp = dm2.drop_duplicates(
    subset=['Attribute', 'Parent']).reset_index(drop=True)

In [85]:
temp[temp['Attribute'] == 'RNAseq']['DependsOn'].values[0].split(',')

['specimenID',
 'sampleType',
 'specifySampleType',
 'specifyMeasurementTechnique',
 'technologyPlatformVersion',
 'specifyPlatformVersion',
 'platformLocation',
 'specifyPlatformLocation',
 'referenceTranscriptID',
 'repositoryName',
 'specifyRepositoryName',
 'transcriptType',
 'specifyTranscriptType',
 'resultUnit',
 'valueReported',
 'libraryPreparationMethod',
 'specifyLibraryPreparationMethod',
 'libraryVersion',
 'specifyLibraryVersion',
 'isStranded',
 'readStrandOrigin',
 'readLength',
 'readLengthUnits',
 'specifyReadLengthUnits',
 'runType',
 'totalReads',
 'Component']

In [86]:
print(dm2.shape)

dm2 = dm2.drop_duplicates(
    subset=['Attribute']).reset_index(drop=True)  # 'DependsOn', 'Properties',

display(dm2.shape)

(904, 10)


(632, 10)

In [87]:
# Recode required columns and fix spelling mistakes
required_recoder = {
    "0.0": "False",
    "1.0": "True",
    "FASLSE": "False"
}
dm2['Required'] = dm2['Required'].replace(required_recoder)

dm2[~dm2['Required'].str.contains("True|False", regex=True)]

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules


In [88]:
search_df(dm2, 'measurementTechnique')

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
3,specifyMeasurementTechnique,"If ""other"" is selected list the name of the measurement technique used",,,dataProperty,False,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,
180,OtherMeasurementTechnique,"If ""other"" is selected list the name of the measurement technique used",,specifyMeasurementTechnique,validValue,False,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,
194,OtherMetabolomics HumanmeasurementTechnique,"If ""other"" is selected list the name of the measurement technique used",,specifyMeasurementTechnique,validValue,False,Metabolomics Human,,Sage Bionetworks,
624,Metabolomics Human,,,"specimenID,sampleType,specifySampleType,specifyMeasurementTechnique,technologyPlatformVersion,specifyPlatformVersion,platformLocation,specifyPlatformLocation,msTarget,msAnalyzerTypeMS1,specifyMSAnalyzerTypeMS1,msAnalyzerTypeMS2,specifyMSAnalyzerTypeMS2,msAssayTechnique,specifyMSAssayTechnique,msAnalyteType,specifyMSAnalyteType,msInstrumentModel,specifyMSInstrumentModel,sampleIntroduction,specifySampleIntroduction,extractionMethod,acquisitionMode,acquisitionSoftware,acquistionSoftwareVersion,samplePrepProtocol,databaseName,specifyDatabaseName,databaseSource,specifyDatabaseSource,databaseWeblink,spectrometerFrequency,hasIonizationSource,ionProperty,vacuumPressure,vacuumPressureUnit,specifyVacuumPressueUnits,lensVoltages ,lensVoltagesUnit ,specifyLensVoltageUnits,gasFlowTemperature,gasFlowTemperatureUnit,hasAssayControl,controlType,specifyControlType,experiementalBatchSize,experiementalBatchSizeUnit,specifyExperimentalBatchSizeUnits,batchSize,batchSizeUnit,specifyBatchSizeUnits,batchID,batchLabel,sampleBatchID,sampleBatchSize,sampleBatchSizeUnit,specifySampleBatchSizeUnits,acquisitionBatchID,acquisitionBatchSize,acquisitionBatchSizeUnit,specifyAcquisitionBatchSizeUnits,processingBatchID,processingBatchSize,processingBatchSizeUnit,specifyProcessingBatchSizeUnit,f,Component",dataType,True,,,,
625,Microbiome,,,"specimenID,sampleType,specifySampleType,specifyMeasurementTechnique,technologyPlatformVersion,specifyPlatformVersion,platformLocation,specifyPlatformLocation,extractionMethod,specifyExtractionMethod,libraryBatchID,sequencingBatchID,sequencingBatchSize,sequencingBatchSizeUnit,specifySequencingBatchSizeUnit,dnaBatchID,dnaBatchSize,dnaBatchSizeUnit,specifyDNABatchSizeUnit,libraryPrep,specifyLibraryPrep,libraryPreparationMethod,specifyLibraryPreparationMethod,libraryVersion,specifyLibraryVersion,isStranded,readStrandOrigin,readLength,readLengthUnits,specifyReadLengthUnits,runType,totalReads,Component",dataType,True,,,,
626,RNAseq,,,"specimenID,sampleType,specifySampleType,specifyMeasurementTechnique,technologyPlatformVersion,specifyPlatformVersion,platformLocation,specifyPlatformLocation,referenceTranscriptID,repositoryName,specifyRepositoryName,transcriptType,specifyTranscriptType,resultUnit,valueReported,libraryPreparationMethod,specifyLibraryPreparationMethod,libraryVersion,specifyLibraryVersion,isStranded,readStrandOrigin,readLength,readLengthUnits,specifyReadLengthUnits,runType,totalReads,Component",dataType,True,,,,
627,Whole Genome Sequencing,,,"specimenID,sampleType,specifySampleType,specifyMeasurementTechnique,technologyPlatformVersion,specifyPlatformVersion,platformLocation,specifyPlatformLocation,referenceTranscriptID,repositoryName,specifyRepositoryName,transcriptType,specifyTranscriptType,resultUnit,valueReported,rnaBatchID,libraryBatchID,sequencingBatchID,libraryPrep,specifyLibraryPrep,libraryPreparationMethod,specifyLibraryPreparationMethod,libraryVersion,specifyLibraryVersion,isStranded,readStrandOrigin,readLength,readLengthUnits,specifyReadLengthUnits,runType,totalReads,Component",dataType,True,,,,
628,bsSeq (bisulfite-seq WGBS methylseq methylomics),,,"specimenID,sampleType,specifySampleType,specifyMeasurementTechnique,technologyPlatformVersion,specifyPlatformVersion,platformLocation,specifyPlatformLocation,referenceTranscriptID,dnaBatchID,dnaBatchID,dnaBatchSize,dnaBatchSizeUnit,specifyDNABatchSizeUnit,libraryBatchID,libraryPrep,specifyLibraryPrep,libraryPreparationMethod,specifyLibraryPreparationMethod,libraryVersion,specifyLibraryVersion,sequencingBatchID,sequencingBatchSize,sequencingBatchSizeUnit,specifySequencingBatchSizeUnit,libraryPreparationMethod,specifyLibraryPreparationMethod,libraryVersion,specifyLibraryVersion,isStranded,readStrandOrigin,readLength,readLengthUnits,specifyReadLengthUnits,runType,totalReads,directionalBSseqLibrary,gDNAconc,lambdaDNAconc,pcrCycles,meanCoverage,conversionRatio,conversionRatioUnits,specifyConversionRatioUnit,enrichmentMethod,specifyEnrichmentMethod,Component",dataType,True,,,,
629,genotyping,,,"specimenID,sampleType,specifySampleType,useReagent,reagentID(s),reagentName,reagentManufacturer,reagentCatalogNumber,reagentLotNumber,reagentWeblink,reagentContact,useTreatment,treatmentID(s),treatmentName,treatmentAmountValue,treatmentAmountUnit,treatmentDurationValue,treatmentDurationUnit,treatmentTemperatureValue,treatmentTemperatureUnit,specifyMeasurementTechnique,Component",dataType,True,,,,
630,proteomics,,,"specimenID,sampleType,specifySampleType,specifyMeasurementTechnique,technologyPlatformVersion,specifyPlatformVersion,platformLocation,specifyPlatformLocation,msTarget,msAnalyzerTypeMS1,specifyMSAnalyzerTypeMS1,msAnalyzerTypeMS2,specifyMSAnalyzerTypeMS2,msAssayTechnique,specifyMSAssayTechnique,msAnalyteType,specifyMSAnalyteType,protoemicsAssayType,specifyProteomicsAssayType,msInstrumentModel,specifyMSInstrumentModel,digestionMethod,specifyDigestionMethod,fractionIdentifier,acquisitionMode,acquisitionSoftware,acquistionSoftwareVersion,samplePrepProtocol,databaseName,specifyDatabaseName,databaseSource,specifyDatabaseSource,databaseWeblink,spectrometerFrequency,modificationParameters,specifyOtherModificationParameters,cleavageAgents,specifyOtherCleavageAgent,fragmentMassTolerance,precursorMassTolerance,dataFile,Component",dataType,True,,,,


Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
3,specifyMeasurementTechnique,"If ""other"" is selected list the name of the me...",,,dataProperty,False,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,
180,OtherMeasurementTechnique,"If ""other"" is selected list the name of the me...",,specifyMeasurementTechnique,validValue,False,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,
194,OtherMetabolomics HumanmeasurementTechnique,"If ""other"" is selected list the name of the me...",,specifyMeasurementTechnique,validValue,False,Metabolomics Human,,Sage Bionetworks,
624,Metabolomics Human,,,"specimenID,sampleType,specifySampleType,specif...",dataType,True,,,,
625,Microbiome,,,"specimenID,sampleType,specifySampleType,specif...",dataType,True,,,,
626,RNAseq,,,"specimenID,sampleType,specifySampleType,specif...",dataType,True,,,,
627,Whole Genome Sequencing,,,"specimenID,sampleType,specifySampleType,specif...",dataType,True,,,,
628,bsSeq (bisulfite-seq WGBS methylseq methylomics),,,"specimenID,sampleType,specifySampleType,specif...",dataType,True,,,,
629,genotyping,,,"specimenID,sampleType,specifySampleType,useRea...",dataType,True,,,,
630,proteomics,,,"specimenID,sampleType,specifySampleType,specif...",dataType,True,,,,


Last bit of cleanup


In [89]:
# Remove measurement technique dependency from biospecimen human
dm2.loc[dm2['Attribute'] == 'Biospecimen human', 'DependsOn'] = dm2.loc[dm2['Attribute']
                                                                        == 'Biospecimen human', 'DependsOn'].values[0].replace("measurementTechnique,", '')
bio_measure_technique_index = dm2.query(
    'Attribute == "measurementTechnique" and Parent == "Biospecimen human"')

if len(bio_measure_technique_index.index) > 0:
    dm2 = dm2.drop(
        index=bio_measure_technique_index.index[0]).reset_index(drop=True)

In [90]:
# Remove measurement technique dependency from biospecimen human
dm2.loc[dm2['Attribute'] == 'Biospecimen human', 'DependsOn'] = dm2.loc[dm2['Attribute']
                                                                        == 'Biospecimen human', 'DependsOn'].values[0].replace("specifyMeasurementTechnique,", '')
bio_measure_technique_index = dm2.query(
    'Attribute == "specifyMeasurementTechnique" and Parent == "Biospecimen human"')


if len(bio_measure_technique_index.index) > 0:
    dm2 = dm2.drop(
        index=bio_measure_technique_index.index[0]).reset_index(drop=True)

In [91]:
# Remove measurement technique dependency from biospecimen human
dm2.loc[dm2['Attribute'] == 'Biospecimen human', 'DependsOn'] = dm2.loc[dm2['Attribute']
                                                                        == 'Biospecimen human', 'DependsOn'].values[0].replace("OtherMeasurementTechnique,", '')
bio_measure_technique_index = dm2.query(
    'Attribute == "OtherMeasurementTechnique"')


if len(bio_measure_technique_index.index) > 0:
    dm2 = dm2.drop(
        index=bio_measure_technique_index.index[0]).reset_index(drop=True)

In [92]:
dm2.loc[dm2[dm2['Attribute'] == 'visitCode'].index.values[0],
        'Valid Values'] = "1,2,Other,Unknown,Not collected,Not applicable"

dm2.loc[dm2[dm2['Attribute'] == 'visitCode'].index.values[0],
        'Validation Rules'] = ""

Extra comma at beginning of valid values


# Validation Rules


In [93]:
mixed_attrs = [
    {'attribute': 'tissueWeight', 'val_type': 'mixed float', 'regex': 'regex search'},
    {'attribute': 'tissueVolume', 'val_type': 'mixed float', 'regex': 'regex search'},
    {'attribute': 'specimenAge', 'val_type': 'mixed integer', 'regex': 'regex search'},
    {'attribute': 'samplingAge', 'val_type': 'mixed integer', 'regex': 'regex search'},
    {'attribute': 'age', 'val_type': 'mixed integer', 'regex': 'regex search'}
]

for ma in mixed_attrs:
    attribute = ma['attribute']
    val_type = ma['val_type']

    # get indexes for new validation rules based on attribute
    indexes = dm2[dm2['Attribute'] == attribute].index.tolist()

    for i in indexes:
        if val_type == 'integer':
            first_part = '[0-9]+'
        elif val_type == 'float':
            first_part = '^\d*?\.?\d$'
        elif val_type == 'mixed integer':
            regex = 'regex search'
            num_match = '^\d*?'
        elif val_type == 'mixed float':
            regex = 'regex search'
            num_match = '^\d*?\.?\d$'
            # All valid values are applicable
            new_string = regex + num_match + '|' + \
                '' + '|'.join(dm2.loc[i, 'Valid Values'].split(',')
                              )

        dm2.loc[i, 'Validation Rules'] = new_string

In [94]:
dm2['Validation Rules'].unique().tolist()

['',
 'regex search ([0-9]+\\.[0-9]*.)|([0-9]+)',
 nan,
 'regex search^\\d*?\\.?\\d$|Unknown|Not collected|Not applicable',
 'regex search ([0-9]+)']

## Building Dependencies


In [95]:
dependencies = {
    'specimenID': 'matchAtLeastOne Biospecimenhuman.specimenID value',
    'individualID': 'matchExactlyOne IndividualHuman.individualID set'
}

In [96]:
for k, v in dependencies.items():
    indexes = dm2[dm2['Attribute'] == k]['Validation Rules'].index.values
    dm2.loc[indexes, 'Validation Rules'] = v

In [97]:
dm2.loc[3, 'Valid Values'] = dm2.loc[3, 'Valid Values'].replace(
    ',Whole Genome Sequencing', '')

# Create File Annotations Attributes


In [98]:
# base file annotations
base_file_annotations = {
    'resourceType': '',
    'isReleased': False,
    'fileType': ''
}

In [99]:
with pd.option_context("display.max_colwidth", None):
    display(dm2.query('Attribute == "visitCode"'))

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
161,visitCode,"Indicate which longitudinal visit for the individual the data comes from, provided by the data contributor's data dictionary","1,2,Other,Unknown,Not collected,Not applicable",,dataProperty,True,Individual Human,,Sage Bionetworks,


# Write out new data model


In [100]:
hard_coded_valid_values = [
    {'attribute': 'visitCode',
        'valid_value': "1,2,Other,Unknown,Not collected,Not applicable"},
    {'attribute': 'consentGroupID', 'valid_value': "1,2,3"},
    {'attribute': 'samplingAge', 'valid_value': ""},
    {'attribute': 'specimenAge', 'valid_value': ""},
    {'attribute': 'age', 'valid_value': ""},
]

In [101]:
for h in hard_coded_valid_values:
    dm2.loc[dm2[dm2['Attribute'] == h['attribute']].index.values[0],
            'Valid Values'] = h['valid_value']

In [102]:
checks = [c['attribute'] for c in hard_coded_valid_values]

with pd.option_context("display.max_colwidth", None):
    display(dm2.query(f'Attribute in @checks'))

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
141,specimenAge,The subject's age at the time of specimen extraction is the individual's age (since birth) at the time a given specimen was extracted. Measured as age in years.,,,dataProperty,True,Biospecimen human,,Sage Bionetworks,regex search^\d*?\.?\d$|Unknown|Not collected|Not applicable
142,samplingAge,"The calculated age of the sample, measurement is determined or coded by the data contributor.",,,dataProperty,True,Biospecimen human,,Sage Bionetworks,regex search^\d*?\.?\d$|Unknown|Not collected|Not applicable
161,visitCode,"Indicate which longitudinal visit for the individual the data comes from, provided by the data contributor's data dictionary","1,2,Other,Unknown,Not collected,Not applicable",,dataProperty,True,Individual Human,,Sage Bionetworks,
163,consentGroupID,"Indicate the consent group for the individual, provided by the data contributor's data dictionary",123,,dataProperty,True,Individual Human,,Sage Bionetworks,
169,age,"Age of the individual (age in years of the individual at first recorded study event (enrollment, visit, observation, sample collection, survey completion, etc.)",,,dataProperty,True,Individual Human,,"Sage Bionetworks,",regex search^\d*?\.?\d$|Unknown|Not collected|Not applicable


In [103]:
import yaml

In [104]:
with open("../configs/notebook_config.yaml", 'r') as f:
    config = yaml.safe_load(f)

# paths to import files
root_path = config['paths']['root']
schematic_config = config['paths']['schematic']
csv_model = config['file_names']['csv_model']
json_model = config['file_names']['json_model']

print(
    "Schematic config: ", schematic_config,
    "\n", "CSV model: ", csv_model,
    "\n", "JSON LD Model: ", json_model)

Schematic config:  C:/Users/nlee/Documents/Projects/schematic/schematic/config.yml 
 CSV model:  C:/Users/nlee/Documents/Projects/ELITE-DCC/ELITE-data-models/models/EL_data_model_v3.1.0.csv 
 JSON LD Model:  C:/Users/nlee/Documents/Projects/ELITE-DCC/ELITE-data-models/models/EL_data_model_v3.jsonld


In [107]:
dm2.columns.tolist()

['Attribute',
 'Description',
 'Valid Values',
 'DependsOn',
 'Properties',
 'Required',
 'Parent',
 'DependsOn Component',
 'Source',
 'Validation Rules']

In [5]:
# write out data model into csv
dm2.to_csv(csv_model)

NameError: name 'dm2' is not defined

# Convert CSV to JSON LD


In [6]:
print(f'schematic schema convert {csv_model} --output_jsonld {json_model}')

!schematic schema convert {csv_model} --output_jsonld {json_model}

schematic schema convert C:/Users/nlee/Documents/Projects/ELITE-DCC/ELITE-data-models/models/EL_data_model_v3.1.0.csv --output_jsonld C:/Users/nlee/Documents/Projects/ELITE-DCC/ELITE-data-models/models/EL_data_model_v3.1.0.jsonld
^C


The Data Model was created and saved to 'C:/Users/nlee/Documents/Projects/ELITE-DCC/ELITE-data-models/models/EL_data_model_v3.1.0.jsonld' location.


Starting schematic...
Done adding requirements and value ranges to attributes


## Get manifest names to generate manifests


In [7]:
import json

with open(json_model, 'r') as jf:
    jo = json.load(jf)

In [8]:
# Manifest names in data model
manifest_names_extracted = []

for i in jo['@graph']:
    if i['@id'] == "bts:dataType":
        manifest_names_extracted.append(
            i["schema:domainIncludes"]["@id"].replace('bts:', ''))

# display names extracted
manifest_display_names_extracted = []

for i in jo['@graph']:
    if i['@id'].replace("bts:", '') in (manifest_names_extracted):
        manifest_display_names_extracted.append(
            i["sms:displayName"])
manifest_display_names_extracted

# Create dictionary for lookup later
manifest_name_relationships = dict(
    zip(manifest_names_extracted, manifest_display_names_extracted))

manifest_name_relationships

{'Biospecimenhuman': 'Biospecimen human',
 'BiospecimennonHuman': 'Biospecimen nonHuman',
 'IndividualHuman': 'Individual Human',
 'IndividualnonHuman': 'Individual nonHuman',
 'MetabolomicsHuman': 'Metabolomics Human',
 'Microbiome': 'Microbiome',
 'RNAseq': 'RNAseq',
 'WholeGenomeSequencing': 'Whole Genome Sequencing',
 'BsSeq(bisulfite-seqWGBSmethylseqmethylomics)': 'bsSeq (bisulfite-seq WGBS methylseq methylomics)',
 'Genotyping': 'genotyping',
 'Proteomics': 'proteomics',
 'ScRNAseq': 'scRNAseq'}

# Create dca template configuration


In [9]:
def manifest_template(k, v, t='file'):
    manifest_template = {
        "display_name": v,
        "schema_name": k,
        "type": t
    }
    return manifest_template


dca_template = {
    "manifest_schemas": [],
    "service_version": "v23.1.1",
    "schema_version": "v1.2"
}

records = ['IndividualHuman', 'IndividualnonHuman',
           'Biospecimenhuman', 'BiospecimennonHuman']


for k, v in manifest_name_relationships.items():
    if k in records:
        t = 'records'
    else:
        t = 'file'
    dca_template['manifest_schemas'].append(manifest_template(k, v, t))

json_formatted_str = json.dumps(dca_template, indent=2)
print(json_formatted_str)

with open('C:/Users/nlee/Documents/Projects/ELITE-DCC/ELITE-data-models/manifests/dca_template.json', 'w') as f:
    f.write(json_formatted_str)

{
  "manifest_schemas": [
    {
      "display_name": "Biospecimen human",
      "schema_name": "Biospecimenhuman",
      "type": "records"
    },
    {
      "display_name": "Biospecimen nonHuman",
      "schema_name": "BiospecimennonHuman",
      "type": "records"
    },
    {
      "display_name": "Individual Human",
      "schema_name": "IndividualHuman",
      "type": "records"
    },
    {
      "display_name": "Individual nonHuman",
      "schema_name": "IndividualnonHuman",
      "type": "records"
    },
    {
      "display_name": "Metabolomics Human",
      "schema_name": "MetabolomicsHuman",
      "type": "file"
    },
    {
      "display_name": "Microbiome",
      "schema_name": "Microbiome",
      "type": "file"
    },
    {
      "display_name": "RNAseq",
      "schema_name": "RNAseq",
      "type": "file"
    },
    {
      "display_name": "Whole Genome Sequencing",
      "schema_name": "WholeGenomeSequencing",
      "type": "file"
    },
    {
      "display_name": "bs

In [None]:
# old_schema = {
#     "manifest_schemas": [
#         {
#             "display_name": "Biospecimen human",
#             "schema_name": "Biospecimenhuman",
#             "type": "record"
#         },
#         {
#             "display_name": "Biospecimen nonHuman",
#             "schema_name": "BiospecimennonHuman",
#             "type": "record"
#         },
#         {
#             "display_name": "Individual Human",
#             "schema_name": "IndividualHuman",
#             "type": "record"
#         },
#         {
#             "display_name": "Individual nonHuman",
#             "schema_name": "IndividualnonHuman",
#             "type": "record"
#         },
#         {
#             "display_name": "Metabolomics Human",
#             "schema_name": "MetabolomicsHuman",
#             "type": "file"
#         },
#         {
#             "display_name": "Microbiome",
#             "schema_name": "Microbiome",
#             "type": "file"
#         },
#         {
#             "display_name": "RNAseq",
#             "schema_name": "RNAseq",
#             "type": "file"
#         },
#         {
#             "display_name": "Study",
#             "schema_name": "Study",
#             "type": "file"
#         },
#         {
#             "display_name": "Whole Genome Sequencing",
#             "schema_name": "WholeGenomeSequencing",
#             "type": "file"
#         },
#         {
#             "display_name": "'bsSeq (bisulfite-seq WGBS methylseq methylomics)",
#             "schema_name": "'BsSeq(bisulfite-seqWGBSmethylseqmethylomics)",
#             "type": "file"
#         },
#         {
#             "display_name": "scRNAseq",
#             "schema_name": "ScRNAseq",
#             "type": "file"
#         },
#         {
#             "display_name": "Genotyping",
#             "schema_name": "Genotyping",
#             "type": "file"
#         },
#         {
#             "display_name": "Proteomics",
#             "schema_name": "Proteomics",
#             "type": "file"
#         }
#     ],
#     "service_version": "v23.1.1",
#     "schema_version": "v1.2"
# }

In [None]:
len(dca_template['manifest_schemas'])

In [None]:
# len(old_schema['manifest_schemas'])

## Generate Manifests


In [10]:
output_path = 'C:/Users/nlee/Documents/Projects/ELITE-DCC/ELITE-data-models/manifests/'

- valueErrors =
  - BsSeq(bisulfite-seqWGBSmethylseqmethylomics): [valueReported]
  - "RNAseq": [rnaBatchID, specifyLibraryPrep]
  - 'MetabolomicsHuman': ['specifyLibraryVersion']
  - Proteomics : ['specifyTranscriptType']


- Remove `(not listed for purposes of this RFC)`
- Change Assay to be blank or list of values
- Weight needs to be blank
- Uknown spelling error
- 'False / 'True -> remove '


In [11]:
with pd.option_context("display.max_colwidth", None):
    display(dm2.query('Attribute == "ethnicity"'))

NameError: name 'pd' is not defined

In [13]:
import os

In [14]:
manifest = 'IndividualHuman'

print(f"schematic manifest --config {schematic_config} get -dt {manifest} --output_csv {os.path.join(output_path, manifest + '.csv')} --title {'EL_Manifest_' + manifest} --sheet_url 2>&1 >> manifest_generation_results.txt")

!echo '{manifest}:' >> manifest_generation_results.txt
!schematic manifest --config {schematic_config} get -dt {manifest} --output_csv {os.path.join(output_path, manifest + '.xlsx')} --title {'EL_Manifest_' + manifest} --sheet_url 2>&1 >> manifest_generation_results.txt

schematic manifest --config C:/Users/nlee/Documents/Projects/schematic/schematic/config.yml get -dt IndividualHuman --output_csv C:/Users/nlee/Documents/Projects/ELITE-DCC/ELITE-data-models/manifests/IndividualHuman.csv --title EL_Manifest_IndividualHuman --sheet_url 2>&1 >> manifest_generation_results.txt
Starting schematic...
The (model > input > location) argument with value 'C:/Users/nlee/Documents/Projects/ELITE-DCC/ELITE-data-models/models/EL_data_model_v3.1.0.jsonld' is being read from the config file.
The '--jsonld' argument is being taken from configuration file (model > input > location), i.e., 'C:/Users/nlee/Documents/Projects/ELITE-DCC/ELITE-data-models/models/EL_data_model_v3.1.0.jsonld'.
JSON schema successfully generated from schema.org schema!
JSON schema file log stored as C:/Users/nlee/Documents/Projects/ELITE-DCC/ELITE-data-models/models/EL_data_model_v3.1.0.IndividualHuman.schema.json
Permission Id: anyoneWithLink
Find the manifest template using this Google Sheet U

In [None]:
!echo ----- newly_generated_manifests ------- 2>&1 >> manifest_generation_links.txt
for manifest in manifest_names_extracted:
    print(manifest)
    !echo {manifest}: >> manifest_generation_results.txt
    !schematic manifest --config {schematic_config} get -dt {manifest} --output_xlsx {os.path.join(output_path, manifest + '.xlsx')} --title {'EL_Manifest_' + manifest} --sheet_url 2>&1 >> manifest_generation_results.txt
    !echo ----------------------------------------------------------------

Add validation rules


In [None]:
def mixed_validation_rule(df, attribute):
    ...

# Validation of manifests


In [None]:
import random
import string
from pathlib import Path
import lorem

Hard coded variables


In [None]:
# number of rows to fill in
nrows = 10
random.seed = 27

# last minute chaos
chaos = False

Functions


In [None]:
def valid_values_to_list(df, attribute):

    valid_values = df.query(f'Attribute == "{attribute}"')[
        'Valid Values'].str.split(',').values[0]

    return valid_values

In [None]:
def get_random_value(list_of_vv):
    return random.choice(list_of_vv)

In [None]:
def get_rand_integer(min=0, max=100):
    return random.randint(min, max)

In [None]:
def get_rand_float(min=0, max=100):
    return round(random.uniform(0.0, 100.0), 2)

In [None]:
def get_random_string():
    t = lorem.sentence().split(" ")[0]
    return t

In [None]:
def introduce_random_NAs(df, N=5):
    """ Another test to see if columns can handle empty values or if they will flag the empty value"""

    rows, cols = df.shape

    row_index = [get_rand_integer(max=rows-1) for _ in range(N)]

    col_index = [get_rand_integer(max=cols - 1) for _ in range(N)]

    indexes = list(zip(row_index, col_index))

    # Print indexes to check where values got replaced
    print(indexes)
    # for i in indexes
    df.iloc[row_index, col_index] = np.NaN

    return df

In [None]:
# find attribute column, fill in with value
def fill_in_attribute(df, index, attribute, value):
    df.loc[index, attribute] = value
    return df

In [None]:
def gen_mixed_string_with_length(N=12):
    # initializing size of string

    # using random.choices()
    # generating random strings
    res = ''.join(random.choices(string.ascii_uppercase +
                                 string.digits, k=N))

    # print result
    return res

In [None]:
def random_change():
    # list of functions to choose from
    choices = [introduce_random_NAs, gen_mixed_string_with_length,
               get_rand_integer, get_random_string, get_rand_float]

    choice = random.choice(choices)

    print(choice.__name__)

    return choice()

In [None]:
def partition(list_in, n):
    random.shuffle(list_in)
    return [list_in[i::n] for i in range(n)]

Create individual and biospecimen ids from random text


In [None]:
# for individaul IDs and specimenIDs
individaulIds = [gen_mixed_string_with_length(N=5) for _ in range(1, 100)]
specimenIds = [gen_mixed_string_with_length() for _ in range(1, 1000)]

In [None]:
specimenIds_partitioned = partition(specimenIds, 100)

In [None]:
ind_bio_map = []

for i, v in enumerate(individaulIds):
    for s in specimenIds_partitioned[i]:
        ind_bio_map.append(
            {
                'individualID': v,
                'specimenID': s

            }
        )

ind_bio_map = pd.DataFrame(ind_bio_map)

In [None]:
# Manifests
# Get all the RFC file paths
manifest_paths = glob(
    r"C:\Users\nlee\Documents\Projects\ELITE-DCC\ELITE-data-models\manifests\*.csv")
manifest_paths

Tested:

- scRNAseq : Passed (Need to add validation for biospecimenId)


In [None]:
# load data model
data_model_path = r'C:\Users\nlee\Documents\Projects\ELITE-DCC\ELITE-data-models\models\EL_data_model_v3.csv'

dm = pd.read_csv(data_model_path).iloc[:, 1:].fillna('')

dm.head()

In [None]:
# load manifest
manifest_path = 'C:\\Users\\nlee\\Documents\\Projects\\ELITE-DCC\\ELITE-data-models\\manifests\\WholeGenomeSequencing.csv'

df = pd.read_csv(manifest_path)

df.head()

In [None]:
manifest_name = Path(manifest_path).stem

parent_name = manifest_name_relationships[manifest_name]

df['Component'] = manifest_name

In [None]:
# Attributes from data model
attrs_to_fill = dm[dm['Attribute'] == (
    parent_name)]['DependsOn'].values[0].split(',')

attrs_to_fill

In [None]:
dm_attrs = dm[(dm['Attribute'].isin(attrs_to_fill))
              ].drop_duplicates(subset=['Attribute']).copy()

dm_attrs

In [None]:
# find values in dm_attrrs with valid values filled in
vv_attrs = dm_attrs[dm_attrs['Valid Values'] != '']['Attribute'].tolist()

In [None]:
# fill in values for the mainfest with the data model
for attribute in vv_attrs:
    for i in range(nrows):
        index = i
        temp_vv = valid_values_to_list(dm_attrs, attribute)
        new_val = get_random_value(temp_vv)
        df = fill_in_attribute(df, index, attribute, new_val)

In [None]:
# update df with sample of individual and biospecimen sample ids
ind_sample = ind_bio_map.sample(nrows, replace=True)[
    ['individualID', 'specimenID']].reset_index(drop=True)

df.update(ind_sample)

In [None]:
swap_validation_coder = {v: k for k, v in validation_coder.items()}

# column coding for values
dm_attrs['swapper'] = dm_attrs['Validation Rules'].fillna(
    'number').replace(swap_validation_coder)

dm_attrs['swapper'] = dm_attrs['swapper'].apply(
    lambda x: re.sub('^\d*?\.?\d$', 'number', x))

# need a better regex later
dm_attrs['swapper'] = dm_attrs['swapper'].apply(lambda x: re.sub(
    'number numbernumbernumbernumbernumbernumbernumbernumbernumbernumbernumbernumbernumbernumbernumbernumbernumbernumbernumbernumbernumbernumbernumbernumbernumbernumber', 'number', x))

print(dm_attrs['swapper'].unique())

dm_attrs.head()

In [None]:
free_form_attrs = dm_attrs[(dm_attrs['Valid Values'] == '') & (
    ~dm_attrs['Attribute'].isin(['individualID', 'specimenID']))]

free_form_attrs

In [None]:
def generate_values(value):
    if value == 'string':
        # generate string
        return get_random_string()
    elif value == 'number':
        # generate random number
        return get_rand_integer()
    elif value == 'mixed'
        # split string into regex expressions i.e. numbers and valid values
        choices = ['string', 'number']
        # chose random to fill in cell

In [None]:
test = '([0-9]+\\.[0-9]*.)|([0-9]+)'
test2 = '^\\d*?\\.?\\d$|Other|Unknown|Not collected|Not applicable'

# def match_pattern(rule):

pattern_checker = {
    'strip_patterns': 'regex|search|match',
    'number': '(?![a-zA-Z]+)(\[0-9\])',
    'string': '([a-zA-Z]+)(?!\[0-9\])',
    'mixed': '(?:[a-zA-Z]+\d+)'
}

result = False

while result == False:
    for k, v in pattern_checker.items():
        result = bool(re.search(pattern=v, string=test))
        print(pattern)
        print(result)


# pattern = '(?![a-zA-Z]+)(\[0-9\])'
# results = bool(re.search(pattern = pattern, string = test))
# results

In [None]:
for attribute in free_form_attrs['Attribute'].tolist():
    for i in range(nrows):
        index = i
        test = free_form_attrs.query(f'Attribute == "{attribute}"')
        new_val = test['swapper'].apply(generate_values).values[0]
        df = fill_in_attribute(df, index, attribute, new_val)

In [None]:
# write out test manifest to file for testing in DCA
df['Component'] = manifest_name

if chaos:
    df = introduce_random_NAs(df)

In [None]:
# Write out manifest
csv_path = os.path.join(
    r'C:\Users\nlee\Documents\Projects\ELITE-DCC\ELITE-data-models\test_manifests', manifest_name + '_test.csv')

df.to_csv(csv_path)

# Run Validation Test


In [None]:
# manifest_path = "C:/Users/nlee/Documents/Projects/schematic/schematic/tests/data/mock_manifests/example_biospecimen_test.csv"
print(
    f"schematic model --config {schematic_config} validate --manifest_path {csv_path} --data_type {manifest_name}")
!schematic model --config {schematic_config} validate --manifest_path {csv_path} --data_type {manifest_name}

# Submit Manifest


In [None]:
print(csv_path)
print(schematic_config)
print(manifest_name)
dataset_id = 'syn51753850'

- d option is the project


In [None]:
dataset_id = 'syn51753844'

!schematic model --config C:/Users/nlee/Documents/Projects/schematic/schematic/config.yml submit -mp C:/Users/nlee/Documents/Projects/ELITE-DCC/ELITE-data-models/test_manifests/WholeGenomeSequencing_test.csv -d syn51753850 -vc WholeGenomeSequencing -mrt both

In [None]:
search_df(dm2, 'valueReported')

# Store Manifest In Dataset For Testing


In [None]:
import synapseclient
from synapseclient import File

manifest_folder_id = 'syn51728840'

%run C:\Users\nlee\Documents\Projects\utils\syanpse_login.py


In [None]:
csv_entity = File(csv_path, description=f'Test manifest for {manifest_name}', parent=data_folder, annotations={
                  'resourceType': 'manifest', 'manifestType': {manifest_name}})

csv_entity = syn.store(csv_entity)

In [None]:
s = syn.getColumns('syn51748558')

In [None]:
for i in s:
    print(i)