# Example Query for Survival Probability of 1-hop Queries

Queries our system in the form of:<br>
$P(survival\_time > X | Drug \wedge Disease)$<br>
Returned is a knowledge graph containing probability of survival time and genes that contributed strongly to the question of survival time w.r.t a drug and disease. It is our hope that these have some indication of gene sensitivites.

In [1]:
import requests
import json
import csv

# /predicate functionality example
By running /predicates you can extract a json object with the following predicates:<br>
1.) biolink:affects<br>
2.) biolink:has_phenotype<br>

The above predicates link the following biolink entities:<br>
1.) biolink:Gene<br>
2.) biolink:Drug<br>
3.) biolink:Disease<br>
4.) biolink:PhenotypicFeature<br>

In [2]:
r = requests.get('http://chp.thayer.dartmouth.edu/predicates/')
json_formatted_str = json.dumps(json.loads(r.content), indent=2)
print(json_formatted_str)

{
  "biolink:Disease": {
    "biolink:PhenotypicFeature": [
      "biolink:has_phenotype"
    ]
  },
  "biolink:Drug": {
    "biolink:Disease": [
      "biolink:affects"
    ],
    "biolink:Gene": [
      "biolink:affects"
    ]
  },
  "biolink:Gene": {
    "biolink:Disease": [
      "biolink:affects"
    ]
  }
}


# Build Query
Constructs a json query object and can take in a survival time, a disease and a drug.

In [3]:
# Function: buildQuery
#
# Input:
# -----------
# survival time, a disease and a drug
#
# Output:
# -----------
# A query graph that asks this probablistic question: 
# P(survival_time > X | Drug = d1 and Disease = Breast Cancer)

def buildQuery(st, disease, drug):
    
    # empty response
    reasoner_std = { "query_graph": dict()
                   }
    # empty query graph
    reasoner_std["query_graph"] = { "edges": dict(),
                                    "nodes": dict()
                                  }
    
    node_count = 0
    edge_count = 0
    
    # wildcard gene slot
    reasoner_std['query_graph']['nodes']['n{}'.format(node_count)] = { 'category':'biolink:Gene',
                                                                     }
    node_count += 1
    
    # drug
    reasoner_std['query_graph']['nodes']['n{}'.format(node_count)] = { 'category':'biolink:Drug',
                                                                       'id':'{}'.format(drug[1])
                                                                     }
    node_count += 1
    
    # add in disease node
    reasoner_std['query_graph']['nodes']['n{}'.format(node_count)] = { 'category':'biolink:Disease',
                                                                       'id':'{}'.format(disease[1])
                                                                     }
    node_count += 1
    
    # link gene evidence to disease
    reasoner_std['query_graph']['edges']['e{}'.format(edge_count)] = { 'predicate':'biolink:affects',
                                                                       'subject': 'n{}'.format(node_count -3),
                                                                       'object': 'n{}'.format(node_count -1)   # should be disease node
                                                                      }
    edge_count += 1
    
    # link drug evidence to disease
    reasoner_std['query_graph']['edges']['e{}'.format(edge_count)] = { 'predicate':'biolink:affects',
                                                                       'subject': 'n{}'.format(node_count -2),
                                                                       'object': 'n{}'.format(node_count -1)  # should be disease node
                                                                     }
    edge_count += 1
            
    # add target survival node
    phenotype = ('Survival_Time', 'EFO:0000714')
    reasoner_std['query_graph']['nodes']['n{}'.format(node_count)] = { 'category': 'biolink:PhenotypicFeature',
                                                                       'id': '{}'.format(phenotype[1]),
                                                                     }
    node_count += 1
    
    # link disease to target
    reasoner_std['query_graph']['edges']['e{}'.format(edge_count)] = { 'predicate':'biolink:has_phenotype',
                                                                       'subject': 'n{}'.format(node_count-2),
                                                                       'object': 'n{}'.format(node_count-1),
                                                                       'properties': { 'qualifier':'>=',
                                                                                       'days': st
                                                                                     }
                                                                     }
    return reasoner_std

# Read Drugs
Functionality to read in our set of available drugs with respective chembl curie IDs.

In [4]:
def readDrugs():
    with open('drug_curie_map.csv', 'r') as drug_file:
        reader = csv.reader(drug_file)
        next(reader)
        rows = [(row[0],row[1]) for row in reader]
    return rows

# Constructing the Query and pinging CHP
You can use the commented out functionality to check which drugs are available. Disease and drug are passed in as a tuple shown below. Currently only breast cancer can be used as the disease.

In [5]:
# list of drugs (and curies) we can query over
#drug_list = readDrugs()

survival_time = 1000
disease = ('Breast_Cancer', 'MONDO:0007254')
drug = ('CYCLOPHOSPHAMIDE', 'CHEMBL:CHEMBL88')

query = buildQuery(survival_time, disease, drug)
payload = {'message': query}

#increase max_results
payload['max_results'] = 100

r = requests.post('http://chp.thayer.dartmouth.edu/query/', json=payload)
chp_res = json.loads(r.content)

# Extract sensitive genes
The very first result will be for the predicted survivability. Every result thereafter contains a gene with its respective sensitivity. Sensitivty values range between -1 and 1. Genes closer to -1 can be thought of as having contributed more to the false assignment of $P(survival\_time > X | Drug \wedge Disease)$. Similarly genes closer to 1 can be thought of as having contributed more to the true assignment. Gene sensitivities are order by their absolute value.

## 1. Extract predicted survivability

In [6]:
KG = chp_res['message']['knowledge_graph']
QG = chp_res['message']['query_graph']
results = chp_res['message']['results']

# holds probability of survival
survival_result = results[0]

for qge_id in survival_result['edge_bindings'].keys():
    if QG['edges'][qge_id]['predicate'] == 'biolink:has_phenotype':
        kge_id = survival_result['edge_bindings'][qge_id][0]['id']
        probability = KG['edges'][kge_id]['attributes'][0]['value']
        
print("P(survival_time > {} | drug & disease):".format(survival_time),probability)

P(survival_time > 1000 | drug & disease): 0.5300454864137873


## 2. Extract sensitive gene rankings

In [7]:
KG = chp_res['message']['knowledge_graph']
QG = chp_res['message']['query_graph']
results = chp_res['message']['results']

# holds gene sensitivites
sensitivity_results = results[1:]

genes = []
for sr in sensitivity_results:
    for qge_id in sr['edge_bindings'].keys():
        if QG['edges'][qge_id]['predicate'] == 'biolink:affects':
            kge_id = sr['edge_bindings'][qge_id][0]['id']
            sensitivity = KG['edges'][kge_id]
            gene_curie = sensitivity['subject']
            gene_weight = sensitivity['attributes'][0]['value']    
    for qgn_id in sr['node_bindings'].keys():
        if QG['nodes'][qgn_id]['category'] == 'biolink:Gene':
            kgn_id = sr['node_bindings'][qgn_id][0]['id']
            gene_name = KG['nodes'][kgn_id]['name']
    genes.append((gene_name, gene_curie, gene_weight))
    
for gene in genes:
    print(gene)

('TP53', 'CHEMBL:CHEMBL88', -0.033660234999710104)
('VPS13C', 'CHEMBL:CHEMBL88', -0.031878458032680954)
('RYR2', 'CHEMBL:CHEMBL88', 0.028681867597806714)
('WNK3', 'CHEMBL:CHEMBL88', -0.028082711800518235)
('CHD4', 'CHEMBL:CHEMBL88', -0.027544443554162527)
('ROBO1', 'CHEMBL:CHEMBL88', -0.027317485660542835)
('MYCBP2', 'CHEMBL:CHEMBL88', -0.026678035049294144)
('SGIP1', 'CHEMBL:CHEMBL88', -0.026346134957570193)
('SPTA1', 'CHEMBL:CHEMBL88', 0.02588583100880638)
('ERBB2', 'CHEMBL:CHEMBL88', -0.023752457155211085)
('ARID1B', 'CHEMBL:CHEMBL88', -0.023371729944973774)
('DNAH17', 'CHEMBL:CHEMBL88', 0.02304638426048171)
('NRK', 'CHEMBL:CHEMBL88', 0.022920967940909094)
('COL5A2', 'CHEMBL:CHEMBL88', -0.022218245042049008)
('UBR4', 'CHEMBL:CHEMBL88', -0.02216565214717962)
('BPTF', 'CHEMBL:CHEMBL88', -0.02201212047905177)
('GRIN2A', 'CHEMBL:CHEMBL88', -0.021934934420348655)
('KCNA4', 'CHEMBL:CHEMBL88', -0.021934934420348655)
('MUC16', 'CHEMBL:CHEMBL88', -0.021731907069193528)
('LRP1B', 'CHEMBL:CHEM