# Combine analyses across diseases

In [7]:
import os
import itertools

import pandas
from statsmodels.sandbox.stats.multicomp import multipletests

In [8]:
# Read entrez_df
url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/5352b31e04ec136e99d25a0ba63e8867aa71b69f/data/genes-human.tsv'
entrez_df = pandas.read_table(url)
entrez_df = entrez_df[entrez_df.type_of_gene == 'protein-coding']

In [9]:
# Read queries
commit = '153688869fb87412dd716ac27994e51baeb16078'
url = 'https://raw.githubusercontent.com/dhimmel/stargeo/{}/data/queries.tsv'.format(commit)
query_df = pandas.read_table(url)
query_df['doid'] = query_df.slim_id.map(lambda x: x.replace(':', '_'))

In [10]:
# Read files
rows = list()
filenames = ['log.txt', 'samples.tsv', 'meta.tsv']
for doid in os.listdir('data/doslim'):
    if not doid.startswith('DOID'):
        continue
    row = [os.path.exists(os.path.join('data', 'doslim', doid, filename)) for filename in filenames]
    rows.append([doid] + row)

file_df = pandas.DataFrame(rows, columns=['doid'] + filenames)
file_df = query_df.merge(file_df)
file_df.to_csv('data/files.tsv', sep='\t', index=False)

In [11]:
doids = file_df[file_df['meta.tsv']].doid

In [12]:
doid = doids[0]

In [13]:
#assert not any(meta_df.mygene_entrez.duplicated())

In [14]:
rows = list()
for doid in doids:
    path = os.path.join('data', 'doslim', doid, 'meta.tsv')
    meta_df = pandas.read_table(path)
    meta_df = meta_df[meta_df.mygene_entrez.isin(entrez_df.GeneID)]
    mult_tests = multipletests(meta_df.random_pval, method='fdr_bh')
    reject, pvals_corrected, alphacSidak, alphacBonf = mult_tests
    meta_df = meta_df[reject]
    for i, row in meta_df.iterrows():
        direction = 'up' if row.random_TE > 0 else 'down'
        rows.append([doid, direction, row.mygene_entrez])

diffex_df = pandas.DataFrame(rows, columns=['doid', 'direction', 'entrez_gene_id'])
diffex_df = diffex_df.sort_values(['doid', 'direction', 'entrez_gene_id'])
diffex_df = diffex_df.drop_duplicates()
diffex_df = diffex_df[-(
    diffex_df.duplicated(['doid', 'entrez_gene_id'], 'first') |
    diffex_df.duplicated(['doid', 'entrez_gene_id'], 'last')
    )]
diffex_df.entrez_gene_id = diffex_df.entrez_gene_id.astype(int)

In [15]:
count_df = diffex_df.groupby(['doid', 'direction']).count().reset_index()
count_df = count_df.pivot_table('entrez_gene_id', 'doid', 'direction').fillna(0).reset_index()

In [43]:
summary_df = query_df[['doid', 'slim_id', 'slim_name']].merge(count_df)
summary_df.to_csv('summary.tsv', sep='\t', index=False)

In [17]:
summary_df

Unnamed: 0,doid,slim_id,slim_name,down,up
0,DOID_0050156,DOID:0050156,idiopathic pulmonary fibrosis,592,977
1,DOID_0050741,DOID:0050741,alcohol dependence,210,199
2,DOID_0050742,DOID:0050742,nicotine dependence,4734,3072
3,DOID_10283,DOID:10283,prostate cancer,1496,1489
4,DOID_10652,DOID:10652,Alzheimer's disease,1580,1612
5,DOID_11612,DOID:11612,polycystic ovary syndrome,285,255
6,DOID_12365,DOID:12365,malaria,324,382
7,DOID_1324,DOID:1324,lung cancer,99,321
8,DOID_14227,DOID:14227,azoospermia,3319,2462
9,DOID_14330,DOID:14330,Parkinson's disease,564,387


In [27]:
matrix_df = diffex_df.pivot('doid', 'entrez_gene_id', 'direction')
matrix_df = matrix_df.fillna(0).replace(['up', 'down'], [1, -1])
matrix_df = matrix_df[(matrix_df != 0).sum(axis=1) >= 500]

## Disease similarities

In [28]:
def get_overlap_coef(series_0, series_1):
    """jaccard"""
    numerator = sum((series_0 == series_1) & (series_0 * series_1 != 0))
    denominator = sum((series_0 != 0) | (series_1 != 0))
    return numerator / denominator

In [34]:
rows = list()
for doid_0, doid_1 in itertools.permutations(matrix_df.index, 2):
    series_0 = matrix_df.loc[doid_0, :]
    series_1 = matrix_df.loc[doid_1, :]
    coef = get_overlap_coef(series_0, series_1)
    row = doid_0, doid_1, coef
    rows.append(row)

similarity_df = pandas.DataFrame(rows, columns=['doid_0', 'doid_1', 'similarity'])

In [37]:
similarity_df = query_df[['doid', 'slim_name']].rename(columns={'doid': 'doid_0', 'slim_name': 'disease_0'}).merge(
query_df[['doid', 'slim_name']].rename(columns={'doid': 'doid_1', 'slim_name': 'disease_1'}).merge(similarity_df))

In [40]:
similarity_df = similarity_df.sort_values(['doid_0', 'similarity'], ascending=False)
similarity_df.to_csv('data/similarity.tsv', sep='\t', index=False)

In [42]:
similarity_df.sort_values('similarity', ascending=False).iloc[::2, :].head(20)

Unnamed: 0,doid_0,disease_0,doid_1,disease_1,similarity
281,DOID_8577,ulcerative colitis,DOID_263,kidney cancer,0.27999
199,DOID_263,kidney cancer,DOID_0050742,nicotine dependence,0.233395
209,DOID_263,kidney cancer,DOID_2986,IgA glomerulonephritis,0.200397
204,DOID_263,kidney cancer,DOID_14227,azoospermia,0.197451
285,DOID_8577,ulcerative colitis,DOID_8778,Crohn's disease,0.190311
271,DOID_8577,ulcerative colitis,DOID_0050742,nicotine dependence,0.185342
23,DOID_0050742,nicotine dependence,DOID_14227,azoospermia,0.176065
317,DOID_9206,Barrett's esophagus,DOID_263,kidney cancer,0.163544
280,DOID_8577,ulcerative colitis,DOID_219,colon cancer,0.161671
190,DOID_219,colon cancer,DOID_263,kidney cancer,0.158751


In [26]:
diffex_detailed_df = query_df[['doid', 'slim_id', 'slim_name']].merge(diffex_df).merge(
    entrez_df[['GeneID', 'Symbol']].rename(
    columns={'GeneID': 'entrez_gene_id', 'Symbol': 'gene_symbol'})
    ).sort_values(['slim_name', 'direction', 'gene_symbol']).drop('doid', 1)
diffex_detailed_df.to_csv('data/diffex.tsv', sep='\t', index=False)