# Combine analyses across diseases

In [1]:
import os
import itertools

import pandas
from statsmodels.sandbox.stats.multicomp import multipletests

In [2]:
# Read entrez_df
url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/5352b31e04ec136e99d25a0ba63e8867aa71b69f/data/symbols-human.tsv'
entrez_df = pandas.read_table(url)
entrez_to_symbols = {entrez_id: set(df.symbol) for entrez_id, df in entrez_df.groupby('GeneID')}

In [3]:
# Read queries
query_df = pandas.read_table('data/queries.tsv')
query_df['doid'] = query_df.slim_id.map(lambda x: x.replace(':', '_'))

In [4]:
# Read files
rows = list()
filenames = ['log.txt', 'samples.tsv', 'balanced_permutation.tsv.gz']
for doid in os.listdir('data/doslim'):
    if not doid.startswith('DOID'):
        continue
    row = [os.path.exists(os.path.join('data', 'doslim', doid, filename)) for filename in filenames]
    rows.append([doid] + row)

file_df = pandas.DataFrame(rows, columns=['doid'] + filenames)
file_df = file_df.merge(query_df)

In [5]:
doids = file_df[file_df['balanced_permutation.tsv.gz']].doid
dfs = list()
for doid in doids:
    path = os.path.join('data', 'doslim', doid, 'balanced_permutation.tsv.gz')
    df = pandas.read_table(path)
    df = df[['mygene_sym', 'mygene_entrez']]
    df['doid'] = doid
    dfs.append(df)

genemap_df = pandas.concat(dfs)
genemap_df.head(2)

Unnamed: 0,mygene_sym,mygene_entrez,doid
0,A1BG,1,DOID_635
1,A1CF,29974,DOID_635


## GeneID-Symbol pairs in STARGEO that do not match Entrez Gene

In [6]:
# Subset to genes whose symbols do not match their GeneIDs according to Entrez Gene
validate = lambda x: x.mygene_sym in entrez_to_symbols.get(x.mygene_entrez, '')
discord_df = genemap_df[-genemap_df.apply(validate, axis='columns')]
discord_df = discord_df.merge(query_df)
discord_df.to_csv('diagnose/discord.tsv', index=False, sep='\t')
discord_df.head(5)

Unnamed: 0,mygene_sym,mygene_entrez,doid,slim_id,slim_name,case_query,control_query
0,CTB-174D11.1,728095,DOID_635,DOID:635,acquired immunodeficiency syndrome,HIV_Stage1 == 'HIV_Stage1' or HIV_stage2 == 'H...,HIV_healthycontrol == 'HIV_healthycontrol'
1,CXorf22,170063,DOID_635,DOID:635,acquired immunodeficiency syndrome,HIV_Stage1 == 'HIV_Stage1' or HIV_stage2 == 'H...,HIV_healthycontrol == 'HIV_healthycontrol'
2,CXorf30,645090,DOID_635,DOID:635,acquired immunodeficiency syndrome,HIV_Stage1 == 'HIV_Stage1' or HIV_stage2 == 'H...,HIV_healthycontrol == 'HIV_healthycontrol'
3,FLJ11292,55338,DOID_635,DOID:635,acquired immunodeficiency syndrome,HIV_Stage1 == 'HIV_Stage1' or HIV_stage2 == 'H...,HIV_healthycontrol == 'HIV_healthycontrol'
4,FLJ22184,80164,DOID_635,DOID:635,acquired immunodeficiency syndrome,HIV_Stage1 == 'HIV_Stage1' or HIV_stage2 == 'H...,HIV_healthycontrol == 'HIV_healthycontrol'


In [7]:
# Number of discordant pairs per disease
discord_df.slim_name.value_counts()

breast cancer                         121
obesity                               118
Parkinson's disease                   107
kidney cancer                         102
allergic rhinitis                     101
amyotrophic lateral sclerosis         101
pancreatic cancer                      98
ulcerative colitis                     98
endogenous depression                  92
polycystic ovary syndrome              90
malaria                                88
bipolar disorder                       87
ovarian cancer                         84
atopic dermatitis                      84
type 1 diabetes mellitus               84
hematologic cancer                     84
alopecia areata                        84
nicotine dependence                    84
psoriatic arthritis                    82
idiopathic pulmonary fibrosis          77
Crohn's disease                        75
melanoma                               73
hypertension                           53
azoospermia                       

## Entrez GeneIDs that appear multiple times for the same disease

In [8]:
def remove_unique_id(df):
    counts = df.mygene_entrez.value_counts()
    return df[df.mygene_entrez.isin(counts.index[counts > 1])]
    
duplicate_id_df = genemap_df.groupby('doid').apply(remove_unique_id).merge(query_df)
duplicate_id_df.to_csv('diagnose/duplicate_ids.tsv', index=False, sep='\t')
duplicate_id_df.head()

Unnamed: 0,mygene_sym,mygene_entrez,doid,slim_id,slim_name,case_query,control_query
0,C22ORF15,150248,DOID_263,DOID:263,kidney cancer,RCC == 'RCC',RCC_Control == 'RCC_Control'
1,C22orf15,150248,DOID_263,DOID:263,kidney cancer,RCC == 'RCC',RCC_Control == 'RCC_Control'


## Symbols that appear multiple times for the same disease

In [9]:
def remove_unique_sym(df):
    counts = df.mygene_sym.value_counts()
    return df[df.mygene_sym.isin(counts.index[counts > 1])]
    
duplicate_symbol_df = genemap_df.groupby('doid').apply(remove_unique_sym).merge(query_df)
duplicate_symbol_df.to_csv('diagnose/duplicate_symbols.tsv', index=False, sep='\t')
duplicate_symbol_df.head()

Unnamed: 0,mygene_sym,mygene_entrez,doid,slim_id,slim_name,case_query,control_query
0,LINC01002,399844,DOID_0050156,DOID:0050156,idiopathic pulmonary fibrosis,IPF == 'IPF',IPF_control == 'IPF_control'
1,LINC01002,729737,DOID_0050156,DOID:0050156,idiopathic pulmonary fibrosis,IPF == 'IPF',IPF_control == 'IPF_control'
2,SFPQ,6421,DOID_0050156,DOID:0050156,idiopathic pulmonary fibrosis,IPF == 'IPF',IPF_control == 'IPF_control'
3,SFPQ,654780,DOID_0050156,DOID:0050156,idiopathic pulmonary fibrosis,IPF == 'IPF',IPF_control == 'IPF_control'
4,SFPQ,6421,DOID_0050742,DOID:0050742,nicotine dependence,Smoker == 'Smoker',Nonsmoker == 'Nonsmoker'
