# Combine analyses across diseases

In [1]:
import os
import itertools

import pandas
from statsmodels.sandbox.stats.multicomp import multipletests

In [2]:
# Read entrez_df
url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/5352b31e04ec136e99d25a0ba63e8867aa71b69f/data/genes-human.tsv'
entrez_df = pandas.read_table(url)
entrez_df = entrez_df[entrez_df.type_of_gene == 'protein-coding']

In [3]:
# Read queries
commit = '153688869fb87412dd716ac27994e51baeb16078'
url = 'https://raw.githubusercontent.com/dhimmel/stargeo/{}/data/queries.tsv'.format(commit)
query_df = pandas.read_table(url)
query_df['doid'] = query_df.slim_id.map(lambda x: x.replace(':', '_'))

In [5]:
# Read files
rows = list()
filenames = ['log.txt', 'samples.tsv', 'balanced_permutation.tsv']
for doid in os.listdir('data/doslim'):
    if not doid.startswith('DOID'):
        continue
    row = [os.path.exists(os.path.join('data', 'doslim', doid, filename)) for filename in filenames]
    rows.append([doid] + row)

file_df = pandas.DataFrame(rows, columns=['doid'] + filenames)
file_df = file_df.merge(query_df)
file_df.to_csv('data/files.tsv', sep='\t', index=False)

In [6]:
doids = file_df[file_df['balanced_permutation.tsv']].doid

In [7]:
#assert not any(meta_df.mygene_entrez.duplicated())

In [9]:
rows = list()
for doid in doids:
    path = os.path.join('data', 'doslim', doid, 'balanced_permutation.tsv')
    meta_df = pandas.read_table(path)
    meta_df = meta_df[meta_df.mygene_entrez.isin(entrez_df.GeneID)]
    mult_tests = multipletests(meta_df.random_pval, alpha=0.05, method='fdr_bh')
    reject, pvals_corrected, alphacSidak, alphacBonf = mult_tests
    meta_df = meta_df[reject]
    for i, row in meta_df.iterrows():
        direction = 'up' if row.random_TE > 0 else 'down'
        rows.append([doid, direction, row.mygene_entrez])

diffex_df = pandas.DataFrame(rows, columns=['doid', 'direction', 'entrez_gene_id'])
diffex_df = diffex_df.sort_values(['doid', 'direction', 'entrez_gene_id'])
diffex_df = diffex_df.drop_duplicates()
diffex_df = diffex_df[-(
    diffex_df.duplicated(['doid', 'entrez_gene_id'], 'first') |
    diffex_df.duplicated(['doid', 'entrez_gene_id'], 'last')
    )]
diffex_df.entrez_gene_id = diffex_df.entrez_gene_id.astype(int)

AttributeError: 'DataFrame' object has no attribute 'mygene_entrez'

In [None]:
count_df = diffex_df.groupby(['doid', 'direction']).count().reset_index()
count_df = count_df.pivot_table('entrez_gene_id', 'doid', 'direction').fillna(0).reset_index()

In [None]:
summary_df = query_df[['doid', 'slim_id', 'slim_name']].merge(count_df)
summary_df.to_csv('data/summary.tsv', sep='\t', index=False)

In [None]:
summary_df

In [None]:
matrix_df = diffex_df.pivot('doid', 'entrez_gene_id', 'direction')
matrix_df = matrix_df.fillna(0).replace(['up', 'down'], [1, -1])
matrix_df = matrix_df[(matrix_df != 0).sum(axis=1) >= 500]

## Disease similarities

In [None]:
def get_overlap_coef(series_0, series_1):
    """jaccard"""
    numerator = sum((series_0 == series_1) & (series_0 * series_1 != 0))
    denominator = sum((series_0 != 0) | (series_1 != 0))
    return numerator / denominator

In [None]:
rows = list()
for doid_0, doid_1 in itertools.permutations(matrix_df.index, 2):
    series_0 = matrix_df.loc[doid_0, :]
    series_1 = matrix_df.loc[doid_1, :]
    coef = get_overlap_coef(series_0, series_1)
    row = doid_0, doid_1, coef
    rows.append(row)

similarity_df = pandas.DataFrame(rows, columns=['doid_0', 'doid_1', 'similarity'])

In [None]:
similarity_df = query_df[['doid', 'slim_name']].rename(columns={'doid': 'doid_0', 'slim_name': 'disease_0'}).merge(
query_df[['doid', 'slim_name']].rename(columns={'doid': 'doid_1', 'slim_name': 'disease_1'}).merge(similarity_df))

In [None]:
similarity_df = similarity_df.sort_values(['doid_0', 'similarity'], ascending=False)
similarity_df.to_csv('data/similarity.tsv', sep='\t', index=False)

In [None]:
similarity_df.sort_values('similarity', ascending=False).iloc[::2, :].head(20)

In [None]:
diffex_detailed_df = query_df[['doid', 'slim_id', 'slim_name']].merge(diffex_df).merge(
    entrez_df[['GeneID', 'Symbol']].rename(
    columns={'GeneID': 'entrez_gene_id', 'Symbol': 'gene_symbol'})
    ).sort_values(['slim_name', 'direction', 'gene_symbol']).drop('doid', 1)
diffex_detailed_df.to_csv('data/diffex.tsv', sep='\t', index=False)