In [26]:
import pandas as pd
import numpy as np
import os

from glob import glob
import yaml
import re

In [27]:
# Fetch the AD data model CSV from the adknowledgeportal/data-models repository.
ad_model_raw = pd.read_csv(
    "https://raw.githubusercontent.com/adknowledgeportal/data-models/main/AD.model.csv")

# Keep exactly one row per Attribute. Rows are ordered by Attribute and then by
# Valid Values first, so the row retained for each Attribute is the first one in
# that ordering (pandas sorts missing values last by default).
df = (
    ad_model_raw
    .sort_values(by=['Attribute', 'Valid Values'])
    .reset_index(drop=True)
    .drop_duplicates(subset=['Attribute'], keep='first')
)

In [28]:
# Collect the index labels of every row whose Attribute contains "template",
# then set the Properties column of those rows to 'template'.
template_rows = df.query(
    'Attribute.str.contains("template")', engine='python').index.tolist()
df.loc[template_rows, 'Properties'] = 'template'

In [30]:
# Attribute names to carry over from the AD data model. The list is used for
# membership filters (`Attribute in @attrs_interest`), so each name appears
# exactly once; repeated entries add nothing and hide the real size of the list.
attrs_interest = [
    'analysisType',
    'analytical covariates',
    'assay',
    'biospecimen',
    'consortium',
    'data dictionary',
    'dataSubtype',
    'dataType',
    'fileFormat',
    'grant',
    'ID mapping',
    'individual',
    'isConsortiumAnalysis',
    'isModelSystem',
    'isMultiSpecimen',
    'libraryPrep',
    'libraryType',
    'manifest',
    'metadata',
    'metadataType',
    'modelSystemName',
    'modelSystemType',
    'platform',
    'project',
    'protocol',
    'resourceType',
    'type'
]

In [31]:
# Metadata type names. The same eight names, comma-separated, are written as
# the `metadataType` valid values later in this notebook.
metadataTypes = [
    'analytical covariates', 'assay', 'biospecimen', 'data dictionary',
    'ID mapping', 'individual', 'manifest', 'protocol',
]

In [32]:
# Independent copy of the AD-model rows whose Attribute is one of the
# attributes of interest; later edits to it leave `df` untouched.
df_new_attrs = df[df['Attribute'].isin(attrs_interest)].copy()

In [33]:
# Give every selected AD-model row the same Properties value.
df_new_attrs = df_new_attrs.assign(Properties='BaseAnnotation')

In [34]:
# get existing annotations from ELITE portal

# Clean up data model attributes to fit the ELITE data model


In [35]:
def rewrite_df_value(df, col_name, search_term, col_value, new_value):
    """Overwrite one cell of ``df``, located by matching a value in another column.

    The first row where ``df[col_name] == search_term`` gets ``new_value``
    written into its ``col_value`` column. Only that first matching row is
    touched, even if several rows match.

    The lookup is best-effort: if ``col_name`` is not a column of ``df`` or no
    row matches ``search_term``, nothing is changed and a ``UserWarning`` is
    emitted so the no-op is visible. Any other error (for example from the
    assignment itself) propagates to the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. It is modified in place.
    col_name : str
        Column searched for ``search_term``.
    search_term : object
        Value to look for in ``col_name`` (compared with ``==``).
    col_value : str
        Column that receives ``new_value`` in the matching row.
    new_value : object
        Replacement value.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    import warnings

    if col_name not in df.columns:
        warnings.warn(
            f"rewrite_df_value: column {col_name!r} not found; nothing changed",
            stacklevel=2)
        return df

    matching_labels = df.index[df[col_name] == search_term]
    if len(matching_labels) == 0:
        warnings.warn(
            f"rewrite_df_value: no row with {col_name} == {search_term!r}; "
            "nothing changed",
            stacklevel=2)
        return df

    # Label-based write on the first match only.
    df.loc[matching_labels[0], col_value] = new_value
    return df

In [36]:
# ELITE-specific replacements for the 'Valid Values' of selected attributes,
# as (Attribute, new Valid Values) pairs applied in order.
# NOTE(review): 'study' is not listed in `attrs_interest`; if `df_new_attrs`
# only holds those attributes, the first override matches no row. Confirm
# whether a different attribute name (e.g. 'studyCode') was intended.
valid_value_overrides = [
    ('study', 'LLFS,ILO,LG,LC'),
    ('consortium', 'ELITE'),
    ('metadataType',
     'analytical covariates, assay, biospecimen, data dictionary, ID mapping, individual, manifest, protocol'),
]

for attribute_name, elite_values in valid_value_overrides:
    df_new_attrs = rewrite_df_value(
        df_new_attrs, 'Attribute', attribute_name, 'Valid Values', elite_values)

In [37]:
# Recode capitalised names to their lower-camel-case spelling.
# The mapping is passed to DataFrame.replace without a column selection, so a
# cell equal to one of the keys is replaced in whichever column it occurs.
recoder = {
    'ValidValue': 'validValue',
    'DataProperty': 'dataProperty',
    'DataType': 'dataType',
}

df_new_attrs = df_new_attrs.replace(to_replace=recoder)

# Merge new attributes with existing data model


In [38]:
# Load the current ELITE data model, dropping the first CSV column.
# This notebook writes the model with `to_csv` and its default index column,
# so the dropped column is presumably that saved row index — confirm.
# NOTE(review): this is an absolute, user-specific path; the notebook will not
# run on another machine without editing it.
dm_elite = pd.read_csv(
    "C:/Users/nlee/Documents/Projects/ELITE-DCC/ELITE-data-models/models/EL_data_model_v3.csv").iloc[:, 1:]

# Turn empty strings into NaN so they count as missing in later `fillna` calls.
# The argument must be a dict ({old: new}); a set literal such as
# {"", np.nan} looks similar but is not a replacement mapping.
dm_elite = dm_elite.replace({"": np.nan})

In [39]:
# Show the ELITE rows whose Attribute is also one of the attributes of interest.
dm_elite[dm_elite['Attribute'].isin(attrs_interest)]

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
14,libraryPrep,The general strategy by which the library was ...,"amplicon,cellHashing,Chromium Single Cell 3',D...",,dataProperty,True,bsSeq (bisulfite-seq WGBS methylseq methylomics),,Sage Bionetworks,
154,assay,The analysis or technology used to generate th...,TBD,,dataProperty,True,Biospecimen nonHuman,,Sage Bionetworks,


In [40]:
# Only needed the first time, while the ELITE 'assay' valid values were "TBD":
# set them to NaN so that a later `fillna` on 'Valid Values' treats them as
# missing.
dm_elite = rewrite_df_value(
    df=dm_elite,
    col_name='Attribute',
    search_term='assay',
    col_value='Valid Values',
    new_value=np.nan)

In [41]:
# Fill missing ELITE 'Valid Values' from the AD model.
# Build an Attribute -> Valid Values lookup from the AD rows, map it onto the
# ELITE attributes, and use the result only where ELITE has no value yet.
ad_valid_values = df_new_attrs.set_index('Attribute')['Valid Values']
ad_values_for_elite = dm_elite['Attribute'].map(ad_valid_values)
dm_elite['Valid Values'] = dm_elite['Valid Values'].fillna(ad_values_for_elite)

In [42]:
# Add new attributes from the AD model.
# Only attributes that the ELITE model does not already define are appended;
# appending all AD rows would duplicate every Attribute present in both frames.
# `ignore_index=True` renumbers the result, since the two frames carry
# unrelated index labels that could otherwise collide.
is_new_attribute = ~df_new_attrs['Attribute'].isin(dm_elite['Attribute'])
dm_new = pd.concat(
    [dm_elite, df_new_attrs[is_new_attribute]], ignore_index=True)

In [43]:
# Move the Properties values into Parent, then blank out Properties.
# NOTE(review): this overwrites whatever Parent held before, for every row.
# The model file read earlier is also overwritten at the end of this notebook,
# so a second run would presumably copy the now-empty Properties into Parent —
# confirm before re-running.
dm_new = dm_new.assign(Parent=dm_new['Properties'], Properties=np.nan)

In [44]:
# Show the rows whose Parent is "dataType".
dm_new.loc[dm_new['Parent'] == "dataType"]

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
619,Biospecimen human,,,"individualID,specimenID,specimenType,specimenA...",,True,dataType,,,
620,Biospecimen nonHuman,,,"individualID,specimenID,specimenIdSource,dataG...",,True,dataType,,,
621,Individual Human,,,"individualID,cohort,studyCode,fieldCenterCode,...",,True,dataType,,,
622,Individual nonHuman,,,"individualID,cohort,studyCode,taxon,speciesGro...",,True,dataType,,,
623,Metabolomics Human,,,"specimenID,sampleType,specifySampleType,specif...",,True,dataType,,,
624,Microbiome,,,"specimenID,sampleType,specifySampleType,specif...",,True,dataType,,,
625,RNAseq,,,"specimenID,sampleType,specifySampleType,specif...",,True,dataType,,,
626,Whole Genome Sequencing,,,"specimenID,sampleType,specifySampleType,specif...",,True,dataType,,,
627,bsSeq (bisulfite-seq WGBS methylseq methylomics),,,"specimenID,sampleType,specifySampleType,specif...",,True,dataType,,,
628,genotyping,,,"specimenID,sampleType,specifySampleType,useRea...",,True,dataType,,,


In [25]:
# Show the studyCode row(s) of the merged model.
dm_new.loc[dm_new['Attribute'] == "studyCode"]

Unnamed: 0,Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
159,studyCode,"Unique identifier for the study, assigned by t...","LC,LG,ILO,LLFS",,,True,,,Sage Bionetworks,


In [45]:
# Write the merged model twice, in this order:
#   1. a versioned copy under old_models/
#   2. the current model under models/, the same path this notebook reads from
# Both files include the DataFrame index as their first column (to_csv default).
for output_path in (
    'C:/Users/nlee/Documents/Projects/ELITE-DCC/ELITE-data-models/old_models/EL_data_model_v3.2.0.csv',
    'C:/Users/nlee/Documents/Projects/ELITE-DCC/ELITE-data-models/models/EL_data_model_v3.csv',
):
    dm_new.to_csv(output_path)