# Example Query for Survival Probability of 1-hop Queries

Queries our system in the form of:<br>
$P(survival\_time > X | Drug \wedge Disease)$<br>
The response is a knowledge graph containing the survival probability together with the genes/drugs that contributed most strongly to it.

In [1]:
import requests
import json
import csv

# /predicates functionality example
Calling the /predicates endpoint returns a JSON object describing the following predicates:<br>
1.) biolink:treats<br>
2.) biolink:gene_associated_with_condition<br>
3.) biolink:interacts_with<br>
4.) biolink:has_phenotype<br>

The above predicates link the following biolink entities:<br>
1.) biolink:Gene<br>
2.) biolink:Drug<br>
3.) biolink:Disease<br>
4.) biolink:PhenotypicFeature<br>

In [2]:
# Ask the service which predicates it supports between biolink entity types.
r = requests.get('http://chp.thayer.dartmouth.edu/predicates/')
# Stop on an HTTP error status instead of trying to parse an error body as JSON.
r.raise_for_status()
json_formatted_str = json.dumps(r.json(), indent=2)
print(json_formatted_str)

{
  "biolink:Gene": {
    "biolink:Disease": [
      "biolink:gene_associated_with_condition"
    ],
    "biolink:Drug": [
      "biolink:interacts_with"
    ]
  },
  "biolink:Drug": {
    "biolink:Disease": [
      "biolink:treats"
    ],
    "biolink:Gene": [
      "biolink:interacts_with"
    ]
  },
  "biolink:Disease": {
    "biolink:PhenotypicFeature": [
      "biolink:has_phenotype"
    ]
  }
}


# Build Query
Constructs a JSON query object from a disease and, optionally, a gene, a drug and a survival time.

In [3]:
def buildQuery(disease, gene=None, gene_node=False, drug=None, drug_node=False, survival_time=None, survival_node=True):
    """Build a one-hop query graph around a disease.

    The graph asks this probabilistic question:
    P(survival_time >= X | drug and/or mutated_gene and Disease)

    Input:
    -----------
    disease       -- identifier stored as the 'id' of the Disease node.
    gene          -- optional identifier for the Gene node; only used when
                     gene_node is truthy.
    gene_node     -- when truthy, add a Gene node linked to the disease by
                     'biolink:gene_associated_with_condition'. Without a
                     gene identifier the node has no 'id' (a wildcard).
    drug          -- optional identifier for the Drug node; only used when
                     drug_node is truthy.
    drug_node     -- when truthy, add a Drug node linked to the disease by
                     'biolink:treats'. Without a drug identifier the node
                     has no 'id' (a wildcard).
    survival_time -- optional number of days; only used when survival_node
                     is truthy. Stored on the has_phenotype edge as
                     {'qualifier': '>=', 'days': survival_time}.
    survival_node -- when truthy, add the survival-time PhenotypicFeature
                     node ('EFO:0000714') linked from the disease by
                     'biolink:has_phenotype'.

    Output:
    -----------
    A dict of the form {'query_graph': {'edges': {...}, 'nodes': {...}}}.
    Nodes are keyed 'n0', 'n1', ... in the order gene, drug, disease,
    survival; edges are keyed 'e0', 'e1', ... in the order gene, drug,
    survival.
    """
    nodes = dict()
    edges = dict()

    def add_node(node):
        # Keys are assigned sequentially, so the current size is the next index.
        node_key = 'n{}'.format(len(nodes))
        nodes[node_key] = node
        return node_key

    def add_edge(predicate, subject_key, object_key):
        edge = {'predicate': predicate,
                'subject': subject_key,
                'object': object_key}
        edges['e{}'.format(len(edges))] = edge
        return edge

    def evidence_node(category, curie):
        # A node without an 'id' is left unbound (wildcard).
        node = {'category': category}
        if curie is not None:
            node['id'] = curie
        return node

    # add gene node
    gene_key = add_node(evidence_node('biolink:Gene', gene)) if gene_node else None

    # add drug node
    drug_key = add_node(evidence_node('biolink:Drug', drug)) if drug_node else None

    # add in disease node (always present, always carries the given id)
    disease_key = add_node({'category': 'biolink:Disease',
                            'id': disease})

    # add survival node
    survival_key = None
    if survival_node:
        survival_key = add_node({'category': 'biolink:PhenotypicFeature',
                                 'id': 'EFO:0000714'})

    # link evidence to disease node
    if gene_key is not None:
        add_edge('biolink:gene_associated_with_condition', gene_key, disease_key)
    if drug_key is not None:
        add_edge('biolink:treats', drug_key, disease_key)

    # link disease to survival node
    if survival_node:
        survival_edge = add_edge('biolink:has_phenotype', disease_key, survival_key)
        if survival_time is not None:
            survival_edge['properties'] = {'qualifier': '>=',
                                           'days': survival_time}

    return {'query_graph': {'edges': edges,
                            'nodes': nodes}}

# Constructing the Query and pinging CHP
# Example 1 - Drug wildcard to disease one hop

In [4]:
# Drug wildcard: an unbound Drug node linked to the disease, no survival node.
query = buildQuery('MONDO:0007254', drug_node=True, survival_node=False)

# increase max_results
payload = {'message': query, 'max_results': 10}

r = requests.post('http://chp.thayer.dartmouth.edu/query/', json=payload)
# Stop on an HTTP error status instead of parsing an error body as a result.
r.raise_for_status()
chp_res = r.json()

# Extract Contributing Drugs
Contribution values range between -1 and 1. Drugs closer to -1 can be thought of as having contributed more to the false assignment of $P(survival\_time > X | Disease)$. Similarly, drugs closer to 1 can be thought of as having contributed more to the true assignment. Drugs are ordered by the absolute value of their contribution.

In [5]:
KG = chp_res['message']['knowledge_graph']
QG = chp_res['message']['query_graph']
results = chp_res['message']['results']

# Collect one (name, curie, weight) tuple per result.
drugs = []
for sr in results:
    # Reset per result so a missing binding shows up as None instead of
    # silently reusing the value left over from the previous result.
    drug_name = drug_curie = drug_weight = None
    for qge_id, edge_bindings in sr['edge_bindings'].items():
        if QG['edges'][qge_id]['predicate'] == 'biolink:treats':
            # The weight is read from the first attribute of the bound edge.
            sensitivity = KG['edges'][edge_bindings[0]['id']]
            drug_curie = sensitivity['subject']
            drug_weight = sensitivity['attributes'][0]['value']
    for qgn_id, node_bindings in sr['node_bindings'].items():
        if QG['nodes'][qgn_id]['category'] == 'biolink:Drug':
            drug_name = KG['nodes'][node_bindings[0]['id']]['name']
    drugs.append((drug_name, drug_curie, drug_weight))

for drug in drugs:
    print(drug)

('DOXORUBICIN', 'CHEMBL:CHEMBL53463', 0.22874944370271488)
('PACLITAXEL', 'CHEMBL:CHEMBL428647', 0.17985333101139664)
('CYCLOPHOSPHAMIDE', 'CHEMBL:CHEMBL88', 0.12557806543990868)
('TAMOXIFEN', 'CHEMBL:CHEMBL83', 0.05811613552369377)
('TAXOTERE', 'CHEMBL:CHEMBL92', -0.056432731564791616)
('HERCEPTIN', 'CHEMBL:CHEMBL1201585', 0.03390994756293421)
('ARIMIDEX', 'CHEMBL:CHEMBL1399', 0.03325206555600718)
('FLUOROURACIL', 'CHEMBL:CHEMBL185', -0.028405023122617584)
('LETROZOLE', 'CHEMBL:CHEMBL1444', 0.027698767438710595)
('ADRIAMYCIN-CYCLOPHOSPHAMIDE', 'CHEMBL:CHEMBL1200796', -0.024941467850854274)


# Example 2 - Gene wildcard to disease one hop

In [6]:
# Gene wildcard: an unbound Gene node linked to the disease, no survival node.
query = buildQuery('MONDO:0007254', gene_node=True, survival_node=False)

# increase max_results
payload = {'message': query, 'max_results': 10}

r = requests.post('http://chp.thayer.dartmouth.edu/query/', json=payload)
# Stop on an HTTP error status instead of parsing an error body as a result.
r.raise_for_status()
chp_res = r.json()

# Extract Contributing Genes

In [7]:
KG = chp_res['message']['knowledge_graph']
QG = chp_res['message']['query_graph']
results = chp_res['message']['results']

# Collect one (name, curie, weight) tuple per result.
genes = []
for sr in results:
    # Reset per result so a missing binding shows up as None instead of
    # silently reusing the value left over from the previous result.
    gene_name = gene_curie = gene_weight = None
    for qge_id, edge_bindings in sr['edge_bindings'].items():
        if QG['edges'][qge_id]['predicate'] == 'biolink:gene_associated_with_condition':
            # The weight is read from the first attribute of the bound edge.
            sensitivity = KG['edges'][edge_bindings[0]['id']]
            gene_curie = sensitivity['subject']
            gene_weight = sensitivity['attributes'][0]['value']
    for qgn_id, node_bindings in sr['node_bindings'].items():
        if QG['nodes'][qgn_id]['category'] == 'biolink:Gene':
            gene_name = KG['nodes'][node_bindings[0]['id']]['name']
    genes.append((gene_name, gene_curie, gene_weight))

for gene in genes:
    print(gene)

('PIK3CA', 'ENSEMBL:ENSG00000121879', -0.04130144540546843)
('MUC16', 'ENSEMBL:ENSG00000181143', -0.021980998819682317)
('ERBB2', 'ENSEMBL:ENSG00000141736', -0.0187109382558387)
('PDZD2', 'ENSEMBL:ENSG00000133401', -0.0187109382558387)
('MYO7B', 'ENSEMBL:ENSG00000169994', -0.018701263520442716)
('MYCBP2', 'ENSEMBL:ENSG00000005810', -0.018701263520442716)
('WNK3', 'ENSEMBL:ENSG00000196632', -0.018701263520442716)
('BPTF', 'ENSEMBL:ENSG00000171634', -0.018701263520442716)
('CR1', 'ENSEMBL:ENSG00000203710', -0.018691588785046728)
('NIN', 'ENSEMBL:ENSG00000100503', -0.018691588785046728)


# Example 3 - Drug wildcard with gene/disease evidence

In [8]:
# Survival threshold in days; also reused when printing the probability below.
survival_time = 1000
# Drug wildcard with a fixed gene and the disease as evidence.
query = buildQuery('MONDO:0007254', drug_node=True, gene_node=True, gene='ENSEMBL:ENSG00000132155', survival_node=True, survival_time=survival_time)

# increase max_results
payload = {'message': query, 'max_results': 10}

r = requests.post('http://chp.thayer.dartmouth.edu/query/', json=payload)
# Stop on an HTTP error status instead of parsing an error body as a result.
r.raise_for_status()
chp_res = r.json()

# Extracting probability of survival

In [9]:
message = chp_res['message']
KG, QG, results = message['knowledge_graph'], message['query_graph'], message['results']

# The first result is the one holding the probability of survival.
survival_result = results[0]

for qge_id, bound_edges in survival_result['edge_bindings'].items():
    # Only the disease -> survival edge carries the probability.
    if QG['edges'][qge_id]['predicate'] != 'biolink:has_phenotype':
        continue
    kge_id = bound_edges[0]['id']
    probability = KG['edges'][kge_id]['attributes'][0]['value']

print("P(survival_time > {} | gene & disease):".format(survival_time), probability)

P(survival_time > 1000 | gene & disease): 0.3334874322159554


# Extracting Contributing Drugs

In [10]:
# holds drug sensitivities (every result after the survival result)
sensitivity_results = results[1:]

# Collect one (name, curie, weight) tuple per sensitivity result.
drugs = []
for sr in sensitivity_results:
    # Reset per result so a missing binding shows up as None instead of
    # silently reusing the value left over from the previous result.
    drug_name = drug_curie = drug_weight = None
    for qge_id, edge_bindings in sr['edge_bindings'].items():
        if QG['edges'][qge_id]['predicate'] == 'biolink:treats':
            # The weight is read from the first attribute of the bound edge.
            sensitivity = KG['edges'][edge_bindings[0]['id']]
            drug_curie = sensitivity['subject']
            drug_weight = sensitivity['attributes'][0]['value']
    for qgn_id, node_bindings in sr['node_bindings'].items():
        if QG['nodes'][qgn_id]['category'] == 'biolink:Drug':
            drug_name = KG['nodes'][node_bindings[0]['id']]['name']
    drugs.append((drug_name, drug_curie, drug_weight))

for drug in drugs:
    print(drug)

('DOXORUBICIN', 'CHEMBL:CHEMBL53463', 0.004673195938649988)
('TAXOL', 'CHEMBL:CHEMBL306601', 0.004670816547534668)
('CYCLOPHOSPHAMIDE', 'CHEMBL:CHEMBL88', 0.0023397734021146275)
('ARIMIDEX', 'CHEMBL:CHEMBL1399', -0.0023358729371045343)
('TAXOTERE', 'CHEMBL:CHEMBL92', -0.002333196806875803)
('PACLITAXEL', 'CHEMBL:CHEMBL428647', 1.5224821212680814e-06)
('TAMOXIFEN', 'CHEMBL:CHEMBL83', 1.2625307856160299e-06)
('LETROZOLE', 'CHEMBL:CHEMBL1444', 4.960920307140109e-07)
('HERCEPTIN', 'CHEMBL:CHEMBL1201585', 4.735175000614108e-07)
('FLUOROURACIL', 'CHEMBL:CHEMBL185', 2.9288996091814156e-07)


# Example 4 - Gene wildcard with drug/disease evidence

In [11]:
# survival_time is reused from the earlier example cell; uncomment to set it here.
#survival_time = 1000
# Gene wildcard with a fixed drug and the disease as evidence.
query = buildQuery('MONDO:0007254', drug_node=True, drug='CHEMBL:CHEMBL88', gene_node=True, survival_node=True, survival_time=survival_time)

# increase max_results
payload = {'message': query, 'max_results': 10}

r = requests.post('http://chp.thayer.dartmouth.edu/query/', json=payload)
# Stop on an HTTP error status instead of parsing an error body as a result.
r.raise_for_status()
chp_res = r.json()

# Extracting probability of survival

In [12]:
message = chp_res['message']
KG, QG, results = message['knowledge_graph'], message['query_graph'], message['results']

# The first result is the one holding the probability of survival.
survival_result = results[0]

for qge_id, bound_edges in survival_result['edge_bindings'].items():
    # Only the disease -> survival edge carries the probability.
    if QG['edges'][qge_id]['predicate'] != 'biolink:has_phenotype':
        continue
    kge_id = bound_edges[0]['id']
    probability = KG['edges'][kge_id]['attributes'][0]['value']

print("P(survival_time > {} | drug & disease):".format(survival_time), probability)

P(survival_time > 1000 | drug & disease): 0.5300454864137873


# Extracting Contributing Genes

In [13]:
# holds gene sensitivities (every result after the survival result)
sensitivity_results = results[1:]

# Collect one (name, curie, weight) tuple per sensitivity result.
genes = []
for sr in sensitivity_results:
    # Reset per result so a missing binding shows up as None instead of
    # silently reusing the value left over from the previous result.
    gene_name = gene_curie = gene_weight = None
    for qge_id, edge_bindings in sr['edge_bindings'].items():
        if QG['edges'][qge_id]['predicate'] == 'biolink:gene_associated_with_condition':
            # The weight is read from the first attribute of the bound edge.
            sensitivity = KG['edges'][edge_bindings[0]['id']]
            gene_curie = sensitivity['subject']
            gene_weight = sensitivity['attributes'][0]['value']
    for qgn_id, node_bindings in sr['node_bindings'].items():
        if QG['nodes'][qgn_id]['category'] == 'biolink:Gene':
            gene_name = KG['nodes'][node_bindings[0]['id']]['name']
    genes.append((gene_name, gene_curie, gene_weight))

for gene in genes:
    print(gene)

('TP53', 'ENSEMBL:ENSG00000141510', -0.02228686754997919)
('VPS13C', 'ENSEMBL:ENSG00000129003', -0.021107130472441893)
('RYR2', 'ENSEMBL:ENSG00000198626', 0.0189906274939515)
('WNK3', 'ENSEMBL:ENSG00000196632', -0.018593918858492306)
('CHD4', 'ENSEMBL:ENSG00000111642', -0.018237524641013057)
('ROBO1', 'ENSEMBL:ENSG00000169855', -0.01808725294758703)
('MYCBP2', 'ENSEMBL:ENSG00000005810', -0.0176638646058902)
('SGIP1', 'ENSEMBL:ENSG00000118473', -0.01744410935509824)
('SPTA1', 'ENSEMBL:ENSG00000163554', 0.017139336285661264)
('ERBB2', 'ENSEMBL:ENSG00000141736', -0.015726802460211825)
