# Prepare STARGEO queries

In [1]:
import pandas

In [2]:
# Load the Disease Ontology term -> case/control tag mapping (tab-separated).
mapping_df = pandas.read_csv('data/DO-tag-mapping.tsv', sep='\t')
mapping_df.head()

Unnamed: 0,doid_id,doid_name,min_distance,case_tag,control_tag,notes
0,DOID:10652,Alzheimer's disease,0,AD,AD_Control,
1,DOID:9206,Barrett's esophagus,0,BE_Tissue,EAC_Non_Tumor,
2,DOID:13241,Behcet's disease,0,Behcet,Behcet_control,
3,DOID:11949,Creutzfeldt-Jakob disease,0,CJD,CJD_control,
4,DOID:8778,Crohn's disease,0,CD,CD_control,


In [3]:
# Slim-term propagation table, read from a URL pinned to a specific commit hash.
url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/slim-terms-prop.tsv'
do_df = (
    pandas.read_table(url)
    # Rename the "subsumed" columns so they line up with the mapping table's names.
    .rename(columns={'subsumed_id': 'doid_id', 'subsumed_name': 'doid_name'})
    # Keep only the slim term and the DO term it subsumes.
    .loc[:, ['slim_id', 'slim_name', 'doid_id']]
)
do_df.head(5)

Unnamed: 0,slim_id,slim_name,doid_id
0,DOID:0050156,idiopathic pulmonary fibrosis,DOID:0050156
1,DOID:0050425,restless legs syndrome,DOID:0050425
2,DOID:0050741,alcohol dependence,DOID:0050741
3,DOID:0050742,nicotine dependence,DOID:0050742
4,DOID:0060073,lymphatic system cancer,DOID:0060073


In [4]:
# Attach the tag mapping to every (slim term, subsumed term) pair.
# The join key is spelled out: with no `on=`, pandas joins on *all* columns the
# two frames share, so an extra shared column would silently change the result.
# Terms without a mapping row are dropped (inner join).
merged_df = do_df.merge(mapping_df, on='doid_id', how='inner')
merged_df.head()

Unnamed: 0,slim_id,slim_name,doid_id,doid_name,min_distance,case_tag,control_tag,notes
0,DOID:0050156,idiopathic pulmonary fibrosis,DOID:0050156,idiopathic pulmonary fibrosis,0,IPF,IPF_control,
1,DOID:0050425,restless legs syndrome,DOID:0050425,restless legs syndrome,0,,,no series to tag - Omar
2,DOID:0050741,alcohol dependence,DOID:0050741,alcohol dependence,0,alcoholism,alcoholism_control,
3,DOID:0050742,nicotine dependence,DOID:0050742,nicotine dependence,0,Smoker,Nonsmoker,
4,DOID:0060073,lymphatic system cancer,DOID:0060073,lymphatic system cancer,0,,,


In [7]:
# Every distinct tag seen by taggregate(); reset whenever this cell is re-run.
all_tags = set()

def tags_to_query(tags):
    """
    Build a query string that matches any of the given tags.

    Each tag contributes one clause of the form ``tag == 'tag'``; clauses are
    emitted in sorted tag order and combined with ``or``. An empty collection
    of tags yields the empty string.
    """
    clauses = []
    for tag in sorted(tags):
        clauses.append("{tag} == '{tag}'".format(tag=tag))
    return ' or '.join(clauses)

def taggregate(df):
    """
    Collapse the case/control tags of a group of rows into two query strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with ``case_tag`` and ``control_tag`` columns. Each non-null cell
        holds one or more tags separated by ``;``.

    Returns
    -------
    pandas.Series
        ``case_query`` and ``control_query`` entries built with
        ``tags_to_query``; an entry is ``''`` when the group has no tags of
        that kind.

    Side effects
    ------------
    Every tag encountered is added to the module-level ``all_tags`` set.
    """
    tags_by_kind = {'case': set(), 'control': set()}
    for kind, collected in tags_by_kind.items():
        # dropna() skips rows that have no tag of this kind.
        for field in df[kind + '_tag'].dropna():
            # Strip whitespace and drop empty tokens so that 'A; B' or a
            # trailing ';' cannot yield malformed clauses or bogus tags.
            parsed = {token.strip() for token in field.split(';')}
            parsed.discard('')
            collected.update(parsed)
            # In-place mutation of the module-level set (no rebinding needed).
            all_tags.update(parsed)
    queries = {
        kind + '_query': tags_to_query(collected)
        for kind, collected in tags_by_kind.items()
    }
    return pandas.Series(queries)

# One row per slim term, carrying its aggregated case and control queries.
slim_groups = merged_df.groupby(['slim_id', 'slim_name'])
query_df = slim_groups.apply(taggregate).reset_index()

# Keep only the terms that have both a case query and a control query.
has_both_queries = (query_df.case_query != '') & (query_df.control_query != '')
query_df = query_df[has_both_queries]

query_df.to_csv('data/queries.tsv', sep='\t', index=False)
query_df.head()

Unnamed: 0,slim_id,slim_name,case_query,control_query
0,DOID:0050156,idiopathic pulmonary fibrosis,IPF == 'IPF',IPF_control == 'IPF_control'
2,DOID:0050741,alcohol dependence,alcoholism == 'alcoholism',alcoholism_control == 'alcoholism_control'
3,DOID:0050742,nicotine dependence,Smoker == 'Smoker',Nonsmoker == 'Nonsmoker'
8,DOID:1024,leprosy,borderline_leprosy == 'borderline_leprosy' or ...,leprosy_control == 'leprosy_control'
9,DOID:10283,prostate cancer,PC == 'PC' or PC_tissue_case == 'PC_tissue_case',PC_Control == 'PC_Control' or PC_tissue_contro...


## Identify tags in mapping that are not in the database

In [12]:
# Tags used in the mapping that do not appear in the downloaded tag table.
tag_df = pandas.read_table('download/tag.tsv')
db_tags = set(tag_df['tag_name'])
all_tags.difference(db_tags)

{'PC_Control'}