In [161]:
import numpy
import pandas

import rwwr

In [162]:
# Restart probability for the random walk.
# Passed as the first argument to rwwr.walk in the per-disease loop below.
r = 0.2

In [163]:
# Load the genetic-overlap (similarity) table between diseases.
# The first column of the file is used as the row index.
dice_df = pandas.read_table(
    'data/disease-similarity.tsv',
    index_col=0,
)
# Show how many rows were loaded
dice_df.shape[0]

92

In [164]:
# Drop diseases that have no similarity to any other disease.
# A column is kept only when it has more than one positive entry
# (presumably one of them is the disease's self-similarity -- TODO confirm).
doid_ids = dice_df.columns[dice_df.gt(0).sum().gt(1)]
# Restrict both rows and columns to the retained diseases
dice_df = dice_df.loc[doid_ids, doid_ids]
# Show how many diseases remain
dice_df.shape[0]

84

In [165]:
# Run a random walk for each disease.
# For every disease, its own row and column are removed from the similarity
# table; its column (minus its own row) is used as the seed, and the
# remaining square table is the matrix handed to rwwr.walk.
rows = list()

for doid_id in dice_df.columns:
    # drop() returns a new frame, so dice_df itself is never modified
    df = dice_df.drop(doid_id, axis=0) # drop row
    seed = df[doid_id]
    df = df.drop(doid_id, axis=1) # drop column
    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0
    mat = df.values
    # rwwr.walk returns a pair; only probs is used here. probs is paired
    # positionally with the remaining columns, so it is assumed to hold one
    # value per remaining disease -- TODO confirm against rwwr.
    probs, steps = rwwr.walk(r, seed, mat)
    rows.extend(
        (doid_id, target_id, prob)
        for target_id, prob in zip(df.columns, probs)
    )

# One row per (source disease, target disease) pair
rw_df = pandas.DataFrame(rows, columns=['source_id', 'target_id', 'proximity'])

In [166]:
# Add Disease Ontology names
# The term table is fetched from a URL pinned to a specific commit.
url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/term-names.tsv'
# Keep only rows whose type is 'name', then remove the 'type' column.
# axis is passed by keyword: the positional form drop('type', 1) is rejected
# by pandas >= 2.0.
doid_df = pandas.read_table(url).query("type == 'name'").drop('type', axis=1)
# Two renamed views of the same table: one labels sources, one labels targets
s_df = doid_df.rename(columns={'doid': 'source_id', 'name': 'source_name'})
t_df = doid_df.rename(columns={'doid': 'target_id', 'name': 'target_name'})
# Both merges join on the column names the frames share and use pandas'
# default inner join, so pairs whose id has no name row are dropped.
rw_df = s_df.merge(t_df.merge(rw_df))

In [167]:
# Sort by source disease name (ascending), then by proximity (descending).
# DataFrame.sort was removed from pandas; sort_values is its replacement.
rw_df = rw_df.sort_values(['source_name', 'proximity'], ascending=[True, False])
# Save as a tab-separated file without the index, floats to six decimals
rw_df.to_csv('data/proximities.tsv', sep='\t', index=False, float_format='%.6f')

In [168]:
# Preview the first 15 rows whose source disease is multiple sclerosis
# (rows are shown in the current order of rw_df)
rw_df[rw_df['source_name'] == 'multiple sclerosis'].iloc[:15]

Unnamed: 0,source_id,source_name,target_id,target_name,proximity
1464,DOID:2377,multiple sclerosis,DOID:0050589,inflammatory bowel disease,0.054966
1445,DOID:2377,multiple sclerosis,DOID:10608,celiac disease,0.052474
1419,DOID:2377,multiple sclerosis,DOID:8778,Crohn's disease,0.050761
1490,DOID:2377,multiple sclerosis,DOID:2841,asthma,0.050183
1493,DOID:2377,multiple sclerosis,DOID:9744,type 1 diabetes mellitus,0.046992
1438,DOID:2377,multiple sclerosis,DOID:8577,ulcerative colitis,0.044306
1429,DOID:2377,multiple sclerosis,DOID:7148,rheumatoid arthritis,0.040536
1443,DOID:2377,multiple sclerosis,DOID:4481,allergic rhinitis,0.038726
1441,DOID:2377,multiple sclerosis,DOID:12236,primary biliary cirrhosis,0.03685
1488,DOID:2377,multiple sclerosis,DOID:9074,systemic lupus erythematosus,0.031171
