# Compute the prior probability of treatment using permutation 

In [1]:
import itertools
import statistics

import pandas

from hetio.permute import permute_pair_list

In [2]:
# Read treatments
treatment_df = pandas.read_table('../summary/indications.tsv')
treatment_df = treatment_df.query("rel_type == 'TREATS_CtD'")
treatment_df.head(2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,rel_type
0,DB01048,Abacavir,DOID:635,acquired immunodeficiency syndrome,TREATS_CtD
1,DB05812,Abiraterone,DOID:10283,prostate cancer,TREATS_CtD


In [3]:
# Create node to degree dictionaries
compound_to_degree = dict(treatment_df.compound_id.value_counts())
disease_to_degree = dict(treatment_df.disease_id.value_counts())

In [4]:
# A degree (compound_degree, disease_degree) to all potential edges with that degree
degree_to_edges = dict()

rows = list()
for (c, c_deg), (d, d_deg) in itertools.product(compound_to_degree.items(), disease_to_degree.items()):
    rows.append((c, d, c_deg, d_deg))
    degree = c_deg, d_deg
    edge = c, d
    degree_to_edges.setdefault(degree, set()).add(edge)

pair_df = pandas.DataFrame(rows, columns=['compound_id', 'disease_id', 'compound_treats', 'disease_treats'])
pair_df = pair_df.sort_values(['compound_id', 'disease_id'])

In [5]:
treatments = list(zip(treatment_df.compound_id, treatment_df.disease_id))

In [6]:
# Burn In
pair_list, stats = permute_pair_list(treatments, multiplier=10)
pandas.DataFrame(stats)

Unnamed: 0,attempts,complete,cumulative_attempts,duplicate,excluded,same_edge,self_loop,unchanged,undirected_duplicate
0,756,0.100132,755,0.191799,0.0,0.001323,0.0,0.254305,0.0
1,755,0.200132,1510,0.229139,0.0,0.001325,0.0,0.144371,0.0
2,755,0.300132,2265,0.201325,0.0,0.002649,0.0,0.115232,0.0
3,755,0.400132,3020,0.210596,0.0,0.001325,0.0,0.09404,0.0
4,755,0.500132,3775,0.239735,0.0,0.0,0.0,0.103311,0.0
5,755,0.600132,4530,0.215894,0.0,0.0,0.0,0.112583,0.0
6,755,0.700132,5285,0.231788,0.0,0.002649,0.0,0.127152,0.0
7,755,0.800132,6040,0.225166,0.0,0.001325,0.0,0.112583,0.0
8,755,0.900132,6795,0.2,0.0,0.001325,0.0,0.101987,0.0
9,754,1.0,7549,0.238727,0.0,0.001326,0.0,0.107285,0.0


In [7]:
# Set the multiplier based on the burn in stats
multiplier = 3

In [8]:
# Calculate the number of perms
n_perm = treatment_df.compound_id.nunique() * treatment_df.disease_id.nunique()
n_perm = int(n_perm * 25)
n_perm

744975

In [9]:
%%time

# Initialize a dictionary of degree to empirical probability list
degree_to_probs = {x: list() for x in degree_to_edges}

# Perform n_perm permutations
for i in range(n_perm):
    # Permute
    pair_list, stats = permute_pair_list(pair_list, multiplier=multiplier, seed=i)
    
    # Update
    pair_set = set(pair_list)
    for degree, probs in degree_to_probs.items():
        edges = degree_to_edges[degree]
        probs.append(len(edges & pair_set) / len(edges))

CPU times: user 3h 54min 20s, sys: 18.7 s, total: 3h 54min 39s
Wall time: 3h 54min 42s


In [25]:
%%time
rows = list()
for (c_deg, d_deg), probs in degree_to_probs.items():
    mean = statistics.mean(probs)
    std_error = statistics.stdev(probs) / len(probs) ** 0.5
    rows.append((c_deg, d_deg, mean, std_error))
perm_df = pandas.DataFrame(rows, columns=['compound_treats', 'disease_treats', 'prior_perm', 'prior_perm_stderr'])
perm_df = perm_df.sort_values(['compound_treats', 'disease_treats'])

CPU times: user 59min 23s, sys: 15.7 s, total: 59min 39s
Wall time: 59min 39s


In [26]:
# Add unpermuted treatment prevalence columns
rows = list()
treatment_set = set(treatments)
for (c_deg, d_deg), edges in degree_to_edges.items():
    n_treatments = len(edges & treatment_set)
    rows.append((c_deg, d_deg, n_treatments, len(edges)))
degree_prior_df = pandas.DataFrame(rows, columns=['compound_treats', 'disease_treats', 'n_treatments', 'n_possible'])
degree_prior_df = perm_df.merge(degree_prior_df)
degree_prior_df = degree_prior_df.sort_values(['compound_treats', 'disease_treats'])

In [27]:
degree_prior_df.tail(2)

Unnamed: 0,compound_treats,disease_treats,prior_perm,prior_perm_stderr,n_treatments,n_possible
406,19,51,0.733662,0.000512,0,1
407,19,68,0.795997,0.000467,1,1


In [28]:
degree_prior_df.to_csv('data/degree-prior.tsv', sep='\t', index=False, float_format='%.6g')

In [29]:
obs_prior_df = pair_df.merge(perm_df)

In [30]:
obs_prior_df.head(2)

Unnamed: 0,compound_id,disease_id,compound_treats,disease_treats,prior_perm,prior_perm_stderr
0,DB00014,DOID:0050741,2,4,0.009801,5e-06
1,DB00014,DOID:10652,2,4,0.009801,5e-06


In [31]:
len(obs_prior_df)

29799

In [32]:
obs_prior_df.to_csv('data/observation-prior.tsv', sep='\t', index=False, float_format='%.6g')