# Combine analyses across diseases

In [1]:
import os
import itertools

import pandas
from statsmodels.sandbox.stats.multicomp import multipletests

In [2]:
# Load the human Entrez Gene table (pinned to a fixed commit of
# dhimmel/entrez-gene) and retain only the protein-coding genes.
url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/5352b31e04ec136e99d25a0ba63e8867aa71b69f/data/genes-human.tsv'
entrez_df = pandas.read_table(url).loc[
    lambda gene_df: gene_df.type_of_gene == 'protein-coding'
]

In [3]:
# Load the disease queries and add a 'doid' column: the slim_id with
# ':' swapped for '_' (e.g. 'DOID:1324' -> 'DOID_1324').
query_df = pandas.read_table('data/queries.tsv')
query_df['doid'] = [slim_id.replace(':', '_') for slim_id in query_df.slim_id]

In [4]:
# Read files
# For every disease directory under data/doslim, record which of the
# expected output files exist, then annotate with the query metadata.
filenames = ['log.txt', 'samples.tsv', 'balanced_permutation.tsv.gz']
rows = list()
# os.listdir returns entries in arbitrary order; sorting keeps the row
# order of data/files.tsv reproducible across runs and machines.
for doid in sorted(os.listdir('data/doslim')):
    # Skip anything that is not a disease directory (e.g. hidden files)
    if not doid.startswith('DOID'):
        continue
    row = [os.path.exists(os.path.join('data', 'doslim', doid, filename)) for filename in filenames]
    rows.append([doid] + row)

file_df = pandas.DataFrame(rows, columns=['doid'] + filenames)
# Inner merge on the shared 'doid' column: directories without a
# matching query are dropped.
file_df = file_df.merge(query_df)
file_df.to_csv('data/files.tsv', sep='\t', index=False)

In [5]:
# Diseases for which the balanced permutation results file exists
doids = file_df.loc[file_df['balanced_permutation.tsv.gz'], 'doid']

In [6]:
#assert not any(meta_df.mygene_entrez.duplicated())

In [7]:
# Collect (disease, direction, gene) triples for every protein-coding gene
# that is significantly differentially expressed after FDR correction.
rows = list()
for doid in doids:
    path = os.path.join('data', 'doslim', doid, 'balanced_permutation.tsv.gz')
    meta_df = pandas.read_table(path)
    # Restrict to genes present in entrez_df
    meta_df = meta_df[meta_df.mygene_entrez.isin(entrez_df.GeneID)]
    if meta_df.empty:
        # Nothing to test for this disease. multipletests divides by the
        # number of p-values, so it cannot be called on an empty input.
        continue
    # Benjamini-Hochberg correction; only the rejection mask is needed
    reject = multipletests(meta_df.random_pval, alpha=0.05, method='fdr_bh')[0]
    meta_df = meta_df[reject]
    # Positive effect -> 'up'; anything else (including NaN) -> 'down'
    directions = meta_df.random_TE.gt(0).map({True: 'up', False: 'down'})
    rows.extend(zip(itertools.repeat(doid), directions, meta_df.mygene_entrez))

diffex_df = pandas.DataFrame(rows, columns=['doid', 'direction', 'entrez_gene_id'])
diffex_df = diffex_df.sort_values(['doid', 'direction', 'entrez_gene_id'])
diffex_df = diffex_df.drop_duplicates()
# After exact duplicates are removed, a gene that still appears twice for
# one disease is reported as both 'up' and 'down'; discard every such row.
conflicting = diffex_df.duplicated(['doid', 'entrez_gene_id'], keep=False)
diffex_df = diffex_df[~conflicting].copy()
diffex_df['entrez_gene_id'] = diffex_df['entrez_gene_id'].astype(int)

In [8]:
# Number of down- and up-regulated genes per disease, one row per disease
# with a column for each direction (missing combinations become 0).
count_df = (
    diffex_df
    .groupby(['doid', 'direction'])
    .count()
    .reset_index()
    .pivot_table(values='entrez_gene_id', index='doid', columns='direction')
    .fillna(0)
    .reset_index()
)

In [9]:
# Attach disease identifiers and names to the per-direction gene counts
# and save the result.
summary_df = pandas.merge(query_df[['doid', 'slim_id', 'slim_name']], count_df)
summary_df.to_csv('data/summary.tsv', sep='\t', index=False)

In [10]:
# Display the per-disease summary (the same table saved to data/summary.tsv)
summary_df

Unnamed: 0,doid,slim_id,slim_name,down,up
0,DOID_0050156,DOID:0050156,idiopathic pulmonary fibrosis,799,963
1,DOID_0050741,DOID:0050741,alcohol dependence,143,163
2,DOID_0050742,DOID:0050742,nicotine dependence,5178,4277
3,DOID_10283,DOID:10283,prostate cancer,37,43
4,DOID_10652,DOID:10652,Alzheimer's disease,2218,2189
5,DOID_10763,DOID:10763,hypertension,54,50
6,DOID_11612,DOID:11612,polycystic ovary syndrome,328,358
7,DOID_12365,DOID:12365,malaria,406,407
8,DOID_13223,DOID:13223,uterine fibroid,160,235
9,DOID_1324,DOID:1324,lung cancer,132,343


In [11]:
# Disease x gene matrix of differential expression:
# 1 = up, -1 = down, 0 = gene not differentially expressed in that disease.
# Directions are converted to numbers before pivoting so the matrix is
# integer-valued. pivot is called with keyword arguments because positional
# arguments are not accepted by recent pandas versions.
direction_sign = diffex_df.direction.map({'up': 1, 'down': -1})
matrix_df = diffex_df.assign(direction=direction_sign).pivot(
    index='doid', columns='entrez_gene_id', values='direction')
matrix_df = matrix_df.fillna(0).astype(int)
# Keep only diseases with at least 500 differentially expressed genes
matrix_df = matrix_df[(matrix_df != 0).sum(axis=1) >= 500]

## Disease similarities

In [12]:
def get_overlap_coef(series_0, series_1):
    """
    Sign-aware Jaccard similarity between two signed profiles.

    Both arguments are equally indexed pandas Series (or numpy arrays of
    equal length) in which 0 means "absent" and any non-zero value is a
    signed call (the notebook uses 1 for up and -1 for down).

    The numerator counts positions where both profiles hold the same
    non-zero value; the denominator counts positions that are non-zero in
    at least one profile.

    Returns the ratio as a float, or 0.0 when neither profile has any
    non-zero position (the ratio is undefined in that case).
    """
    # Equal values where one side is non-zero implies both are non-zero.
    concordant = (series_0 == series_1) & (series_0 != 0)
    either = (series_0 != 0) | (series_1 != 0)
    # Vectorised .sum() instead of the builtin sum, which would iterate
    # element by element in Python.
    numerator = int(concordant.sum())
    denominator = int(either.sum())
    if denominator == 0:
        return 0.0
    return numerator / denominator

In [13]:
# Score every ordered pair of diseases, so each unordered pair appears
# twice: once as (a, b) and once as (b, a).
rows = [
    (doid_0, doid_1, get_overlap_coef(matrix_df.loc[doid_0, :], matrix_df.loc[doid_1, :]))
    for doid_0, doid_1 in itertools.permutations(matrix_df.index, 2)
]

similarity_df = pandas.DataFrame(rows, columns=['doid_0', 'doid_1', 'similarity'])

In [14]:
# Add the disease name for each side of every pair.
disease_name_df = query_df[['doid', 'slim_name']]
names_0_df = disease_name_df.rename(columns={'doid': 'doid_0', 'slim_name': 'disease_0'})
names_1_df = disease_name_df.rename(columns={'doid': 'doid_1', 'slim_name': 'disease_1'})
# Inner merge joins on 'doid_1'; the outer merge then joins on 'doid_0'.
similarity_df = names_0_df.merge(names_1_df.merge(similarity_df))

In [15]:
# Order by disease (descending) and, within a disease, from most to least
# similar partner; then save.
similarity_df = similarity_df.sort_values(
    by=['doid_0', 'similarity'],
    ascending=[False, False],
)
similarity_df.to_csv('data/similarity.tsv', sep='\t', index=False)

In [16]:
# Top 20 most similar disease pairs. Every pair is stored in both
# orientations with the same score, so every second row is taken after
# sorting -- presumably to show each pair only once (this relies on the
# two orientations ending up adjacent).
(
    similarity_df
    .sort_values('similarity', ascending=False)
    .iloc[::2]
    .head(20)
)

Unnamed: 0,doid_0,disease_0,doid_1,disease_1,similarity
319,DOID_263,kidney cancer,DOID_1612,breast cancer,0.28185
532,DOID_8577,ulcerative colitis,DOID_263,kidney cancer,0.271839
37,DOID_0050742,nicotine dependence,DOID_263,kidney cancer,0.229478
183,DOID_1612,breast cancer,DOID_0050742,nicotine dependence,0.210564
201,DOID_1612,breast cancer,DOID_8577,ulcerative colitis,0.206316
465,DOID_635,acquired immunodeficiency syndrome,DOID_9074,systemic lupus erythematosus,0.205436
317,DOID_263,kidney cancer,DOID_14227,azoospermia,0.200949
45,DOID_0050742,nicotine dependence,DOID_8577,ulcerative colitis,0.180694
350,DOID_2986,IgA glomerulonephritis,DOID_263,kidney cancer,0.164418
30,DOID_0050742,nicotine dependence,DOID_14227,azoospermia,0.162609


In [17]:
# Annotate every differentially expressed gene with its disease identifiers
# and gene symbol, then save the detailed table.
gene_symbol_df = entrez_df[['GeneID', 'Symbol']].rename(
    columns={'GeneID': 'entrez_gene_id', 'Symbol': 'gene_symbol'})
diffex_detailed_df = (
    query_df[['doid', 'slim_id', 'slim_name']]
    .merge(diffex_df)        # joins on 'doid'
    .merge(gene_symbol_df)   # joins on 'entrez_gene_id'
    .sort_values(['slim_name', 'direction', 'gene_symbol'])
    # axis is passed by keyword: recent pandas versions no longer accept
    # it as a positional argument to drop.
    .drop('doid', axis=1)
)
diffex_detailed_df.to_csv('data/diffex.tsv', sep='\t', index=False)