# Combine analyses across diseases

In [1]:
import os
import itertools

import pandas
from statsmodels.sandbox.stats.multicomp import multipletests

In [2]:
# Read entrez_df
url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/5352b31e04ec136e99d25a0ba63e8867aa71b69f/data/genes-human.tsv'
entrez_df = pandas.read_table(url)
entrez_df = entrez_df[entrez_df.type_of_gene == 'protein-coding']

In [3]:
# Read queries
query_df = pandas.read_table('data/queries.tsv')
query_df['doid'] = query_df.slim_id.map(lambda x: x.replace(':', '_'))

In [4]:
# Read files
rows = list()
filenames = ['log.txt', 'samples.tsv', 'balanced_permutation.tsv.gz']
for doid in os.listdir('data/doslim'):
    if not doid.startswith('DOID'):
        continue
    row = [os.path.exists(os.path.join('data', 'doslim', doid, filename)) for filename in filenames]
    rows.append([doid] + row)

file_df = pandas.DataFrame(rows, columns=['doid'] + filenames)
file_df = file_df.merge(query_df)
file_df.to_csv('data/files.tsv', sep='\t', index=False)

In [5]:
# Restrict to diseases with meta-analyses that returned results
doids = file_df[file_df['balanced_permutation.tsv.gz']].doid
len(doids)

49

In [6]:
# Combine meta-analyses signicant genes
rows = list()
for doid in doids:
    path = os.path.join('data', 'doslim', doid, 'balanced_permutation.tsv.gz')
    meta_df = pandas.read_table(path)
    meta_df = meta_df[meta_df.mygene_entrez.isin(entrez_df.GeneID)]
    mult_tests = multipletests(meta_df.random_pval, alpha=0.05, method='fdr_bh')
    reject, pvals_corrected, alphacSidak, alphacBonf = mult_tests
    meta_df['random_pval_corrected'] = pvals_corrected
    meta_df = meta_df[reject]
    for i, row in meta_df.iterrows():
        direction = 'up' if row.random_TE > 0 else 'down'
        rows.append([doid, direction, row.mygene_entrez, row.random_TE, row.random_pval_corrected])

diffex_df = pandas.DataFrame(rows, columns=['doid', 'direction', 'entrez_gene_id', 'log2_fold_change', 'p_adjusted'])
diffex_df.entrez_gene_id = diffex_df.entrez_gene_id.astype(int)
diffex_df = diffex_df.sort_values(['doid', 'direction', 'entrez_gene_id'])
assert not diffex_df.duplicated(['doid', 'entrez_gene_id']).any()

In [7]:
diffex_df = query_df[['doid', 'slim_id', 'slim_name']].merge(
    entrez_df[['GeneID', 'Symbol']].rename(
        columns={'GeneID': 'entrez_gene_id', 'Symbol': 'gene_symbol'}
    ).merge(diffex_df))

diffex_df = diffex_df.sort_values(['slim_name', 'direction', 'gene_symbol']).drop('doid', 1)
diffex_df.to_csv('data/diffex.tsv', sep='\t', index=False, float_format='%.5g')
diffex_df.head()

Unnamed: 0,slim_id,slim_name,entrez_gene_id,gene_symbol,direction,log2_fold_change,p_adjusted
15440,DOID:10652,Alzheimer's disease,65985,AACS,down,-0.036,0.0
15551,DOID:10652,Alzheimer's disease,79719,AAGAB,down,-0.051823,0.0
11603,DOID:10652,Alzheimer's disease,16,AARS,down,-0.013714,2.006613e-11
15292,DOID:10652,Alzheimer's disease,60496,AASDHPPT,down,-0.022448,0.0
11604,DOID:10652,Alzheimer's disease,21,ABCA3,down,-0.005883,0.006815468


In [8]:
count_df = diffex_df.groupby(['slim_id', 'slim_name', 'direction']).count().reset_index()
count_df = count_df.pivot_table(values='entrez_gene_id', index=['slim_id', 'slim_name'], columns='direction').fillna(0).reset_index()
count_df.to_csv('data/summary.tsv', sep='\t', index=False)
count_df.head()

direction,slim_id,slim_name,down,up
0,DOID:0050156,idiopathic pulmonary fibrosis,799,963
1,DOID:0050741,alcohol dependence,143,163
2,DOID:0050742,nicotine dependence,5178,4277
3,DOID:10283,prostate cancer,37,43
4,DOID:10652,Alzheimer's disease,2218,2189


## Disease similarities

In [9]:
matrix_df = diffex_df.pivot('slim_id', 'entrez_gene_id', 'direction')
matrix_df = matrix_df.fillna(0).replace(['up', 'down'], [1, -1])
matrix_df = matrix_df[(matrix_df != 0).sum(axis=1) >= 500]

In [10]:
def get_overlap_coef(series_0, series_1):
    """Jaccard coefficient"""
    numerator = sum((series_0 == series_1) & (series_0 * series_1 != 0))
    denominator = sum((series_0 != 0) | (series_1 != 0))
    return numerator / denominator

In [11]:
rows = list()
for doid_0, doid_1 in itertools.permutations(matrix_df.index, 2):
    series_0 = matrix_df.loc[doid_0, :]
    series_1 = matrix_df.loc[doid_1, :]
    coef = get_overlap_coef(series_0, series_1)
    row = doid_0, doid_1, coef
    rows.append(row)

similarity_df = pandas.DataFrame(rows, columns=['doid_0', 'doid_1', 'similarity'])

In [12]:
similarity_df = query_df[['slim_id', 'slim_name']].rename(columns={'slim_id': 'doid_0', 'slim_name': 'disease_0'}).merge(
query_df[['slim_id', 'slim_name']].rename(columns={'slim_id': 'doid_1', 'slim_name': 'disease_1'}).merge(similarity_df))
similarity_df = similarity_df.sort_values(['doid_0', 'similarity'], ascending=False)
similarity_df.to_csv('data/similarity.tsv', sep='\t', index=False, float_format='%.4g')

In [13]:
similarity_df.sort_values('similarity', ascending=False).iloc[::2, :].head(20)

Unnamed: 0,doid_0,disease_0,doid_1,disease_1,similarity
200,DOID:1612,breast cancer,DOID:263,kidney cancer,0.28185
579,DOID:8577,ulcerative colitis,DOID:263,kidney cancer,0.271839
40,DOID:0050742,nicotine dependence,DOID:3083,chronic obstructive pulmonary disease,0.25659
38,DOID:0050742,nicotine dependence,DOID:263,kidney cancer,0.229478
33,DOID:0050742,nicotine dependence,DOID:1612,breast cancer,0.210564
209,DOID:1612,breast cancer,DOID:8577,ulcerative colitis,0.206316
510,DOID:635,acquired immunodeficiency syndrome,DOID:9074,systemic lupus erythematosus,0.205436
329,DOID:263,kidney cancer,DOID:14227,azoospermia,0.200949
47,DOID:0050742,nicotine dependence,DOID:8577,ulcerative colitis,0.180694
336,DOID:263,kidney cancer,DOID:2986,IgA glomerulonephritis,0.164418
