# ADEPTUS -- differential gene expression signatures of disease

This notebook processes the differential expression gene sets from ADEPTUS, "Annotated Disease Expression Profiles Transformed into a Unified Suite" ([publication](https://dx.doi.org/10.1093/nar/gkv810), [website](http://acgt.cs.tau.ac.il/adeptus/)).

In [1]:
# Raw ADEPTUS files to fetch; the cells below read them back from the
# `download/` directory.
urls = [
    'http://acgt.cs.tau.ac.il/adeptus/data/supp_table_1.txt',
    'http://acgt.cs.tau.ac.il/adeptus/data/gene2name.txt',
]
for url in urls:
    # IPython shell escape: `{url}` is interpolated from the Python variable.
    # --timestamping lets wget skip files whose local copy is already current;
    # --directory-prefix stores the files under `download/`.
    ! wget --no-verbose --timestamping --directory-prefix download {url}

In [2]:
import re

import requests
import pandas

## Mappings

In [3]:
# Gene symbol <-> Entrez gene id lookup table.
# The file is read as tab-separated and its columns are labelled here.
gene_map = pandas.read_csv(
    'download/gene2name.txt',
    sep='\t',
    names=['entrez_gene_id', 'Gene'],
)

In [4]:
# Disease Ontology term names, read from a commit-pinned copy of
# dhimmel/disease-ontology. The `doid` and `name` columns are renamed to
# `doid_id` and `Disease`.
url = 'https://github.com/dhimmel/disease-ontology/blob/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/term-names.tsv?raw=true'
do_map_df = pandas.read_table(url).rename(
    columns={'doid': 'doid_id', 'name': 'Disease'},
)

In [5]:
# Disease Ontology slim propagation table from the same pinned commit.
# `subsumed_id` is renamed to `doid_id`.
url = 'https://github.com/dhimmel/disease-ontology/blob/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/slim-terms-prop.tsv?raw=true'
doslim_map_df = pandas.read_table(url).rename(
    columns={'subsumed_id': 'doid_id'},
)
# Preview the first two rows.
doslim_map_df.head(2)

Unnamed: 0,slim_id,slim_name,doid_id,subsumed_name,min_distance
0,DOID:0050156,idiopathic pulmonary fibrosis,DOID:0050156,idiopathic pulmonary fibrosis,0
1,DOID:0050425,restless legs syndrome,DOID:0050425,restless legs syndrome,0


## Read gene sets

In [6]:
# Load the ADEPTUS supplementary table (tab-separated).
adeptus_df = pandas.read_table('download/supp_table_1.txt')
# Pull the token that follows "vs negatives: " out of the free-text
# 'Differential Expression' column (e.g. "up" in the rows previewed below).
# The pattern is a raw string so `\S` is not treated as an invalid string
# escape, and expand=False makes extract return a Series rather than a
# one-column DataFrame. Rows without a match get NaN.
adeptus_df['direction'] = adeptus_df['Differential Expression'].str.extract(
    r'vs negatives: (\S+)',
    expand=False,
)
adeptus_df.head()

Unnamed: 0,Disease,Gene,Differential Expression,PB-ROC,PN-ROC,direction
0,cancer,RFC2,"vs bgc: up, vs negatives: up",0.7,0.71,up
1,cancer,CBX3,"vs bgc: up, vs negatives: up",0.68,0.72,up
2,cancer,ERH,"vs bgc: up, vs negatives: up",0.65,0.66,up
3,cancer,ZNF146,"vs bgc: up, vs negatives: up",0.66,0.68,up
4,cancer,ILF2,"vs bgc: up, vs negatives: up",0.73,0.71,up


In [7]:
# Attach Entrez gene ids, then Disease Ontology ids. Both merges are inner
# joins on whatever column names the two frames share, so rows without a
# gene or disease mapping are dropped.
adeptus_df = do_map_df.merge(gene_map.merge(adeptus_df))
# Keep one row per (disease, gene, direction) combination.
adeptus_df = adeptus_df.drop_duplicates(['doid_id', 'entrez_gene_id', 'direction'])
# Remove columns that are no longer needed. `columns=` is used because
# passing the axis positionally (`drop(labels, 1)`) is rejected by pandas >= 2.0.
adeptus_df = adeptus_df.drop(columns=['Differential Expression', 'type'])
adeptus_df.head()

Unnamed: 0,doid_id,Disease,entrez_gene_id,Gene,PB-ROC,PN-ROC,direction
0,DOID:331,central nervous system disease,91137,SLC25A46,0.67,0.72,down
1,DOID:331,central nervous system disease,79590,MRPL24,0.68,0.69,down
2,DOID:331,central nervous system disease,10237,SLC35B1,0.68,0.75,down
3,DOID:331,central nervous system disease,4118,MAL,0.65,0.8,up
4,DOID:331,central nervous system disease,79084,WDR77,0.66,0.7,down


In [8]:
# Export the mapped gene sets as a tab-separated file without the row index.
adeptus_df.to_csv(
    path_or_buf='data/gene-sets.tsv',
    index=False,
    sep='\t',
)

In [9]:
# Differential expression summary: count of 'up' and 'down' rows
# for each (doid_id, Disease) group.
adeptus_df.groupby(['doid_id', 'Disease']).apply(
    lambda group: pandas.Series(
        {label: sum(group.direction == label) for label in ('up', 'down')}
    )
).reset_index()

Unnamed: 0,doid_id,Disease,down,up
0,DOID:0050686,organ system cancer,24,149
1,DOID:0050687,cell type cancer,162,135
2,DOID:1037,lymphoblastic leukemia,55,50
3,DOID:1240,leukemia,28,53
4,DOID:1287,cardiovascular system disease,165,229
5,DOID:1289,neurodegenerative disease,42,22
6,DOID:1612,breast cancer,61,68
7,DOID:162,cancer,36,222
8,DOID:2531,hematologic cancer,24,9
9,DOID:3119,gastrointestinal system cancer,177,286


## DO slim

In [10]:
# Join the gene sets to the DO slim table (inner join on the shared column
# names), keep only the slim-level columns, and drop the duplicate rows
# that result.
slim_columns = ['slim_id', 'slim_name', 'entrez_gene_id', 'Gene', 'direction']
slim_df = doslim_map_df.merge(adeptus_df)
slim_df = slim_df[slim_columns]
slim_df = slim_df.drop_duplicates()
# Export as a tab-separated file without the row index.
slim_df.to_csv('data/gene-sets-slim.tsv', index=False, sep='\t')

In [11]:
# Differential expression summary: count of 'up' and 'down' rows
# for each (slim_id, slim_name) group.
slim_df.groupby(['slim_id', 'slim_name']).apply(
    lambda group: pandas.Series(
        {label: sum(group.direction == label) for label in ('up', 'down')}
    )
).reset_index()

Unnamed: 0,slim_id,slim_name,down,up
0,DOID:1324,lung cancer,101,211
1,DOID:1612,breast cancer,61,68
2,DOID:2531,hematologic cancer,512,631
