# /query and /predicate functionality for CHP

Queries for our system are in the probabilistic form:<br>
$P(survival\_time > X | Mut\_g_1 = True, Mut\_g_2 = True, ..., Mut\_g_n = True, Drug = d_1, Drug = d_2, ..., Drug = d_n)$<br>
The response is a knowledge graph containing the probability of the queried survival time and, if requested through the query graph's additional properties, a contribution analysis.

In [1]:
import requests
import json
import csv

# /predicate functionality example
By running /predicates you can extract a json object with the following predicates:<br>
1.) biolink:treats<br>
2.) biolink:gene_associated_with_condition<br>
3.) biolink:interacts_with<br>
4.) biolink:has_phenotype<br>

The above predicates link the following biolink entities:<br>
1.) biolink:Gene<br>
2.) biolink:Drug<br>
3.) biolink:Disease<br>
4.) biolink:PhenotypicFeature<br>

In [2]:
# Ask the CHP service for its supported predicates and pretty-print the reply.
r = requests.get('http://chp.thayer.dartmouth.edu/predicates/')
predicates = json.loads(r.content)
json_formatted_str = json.dumps(predicates, indent=2)
print(json_formatted_str)

{
  "biolink:Gene": {
    "biolink:Disease": [
      "biolink:gene_associated_with_condition"
    ],
    "biolink:Drug": [
      "biolink:interacts_with"
    ]
  },
  "biolink:Drug": {
    "biolink:Disease": [
      "biolink:treats"
    ],
    "biolink:Gene": [
      "biolink:interacts_with"
    ]
  },
  "biolink:Disease": {
    "biolink:PhenotypicFeature": [
      "biolink:has_phenotype"
    ]
  }
}


# Build Query
Constructs a JSON query object from a survival time, a disease, and optional sets of genes and drugs.

In [3]:
def buildQuery(st, disease, genes=None, drugs=None):
    """Build a CHP query graph for a survival-time probability question.

    The graph encodes the probabilistic question

        P(survival_time >= st | Mut_g1 = True, ..., Mut_gn = True,
                                Drug = d1, ..., Drug = dn)

    Every gene and drug is linked to the disease node as evidence, and the
    disease node is linked to a survival-time phenotype node that carries
    the threshold.

    Args:
        st: Survival time threshold; stored under ``'days'`` in the
            properties of the ``biolink:has_phenotype`` edge.
        disease: Indexable pair whose element ``[1]`` is used as the
            disease node id, e.g. ``('Breast_Cancer', 'MONDO:0007254')``.
        genes: Optional iterable of indexable pairs; element ``[1]`` of each
            is used as a gene node id. ``None`` or empty means no genes.
        drugs: Optional iterable of indexable pairs; element ``[1]`` of each
            is used as a drug node id. ``None`` or empty means no drugs.

    Returns:
        dict: ``{'query_graph': {'edges': {...}, 'nodes': {...}}}`` with
        nodes keyed ``n0, n1, ...`` (genes, drugs, disease, phenotype) and
        edges keyed ``e0, e1, ...`` (gene edges, drug edges, phenotype edge).
    """
    # None defaults avoid sharing one mutable list object between calls.
    genes = [] if genes is None else genes
    drugs = [] if drugs is None else drugs

    nodes = dict()
    edges = dict()

    def add_node(category, curie):
        # Register a node under the next free 'n<i>' key and return the key.
        node_id = 'n{}'.format(len(nodes))
        nodes[node_id] = {'category': category,
                          'id': '{}'.format(curie)}
        return node_id

    def add_edge(predicate, subject, target, properties=None):
        # Register an edge under the next free 'e<i>' key.
        edge = {'predicate': predicate,
                'subject': subject,
                'object': target}
        if properties is not None:
            edge['properties'] = properties
        edges['e{}'.format(len(edges))] = edge

    # evidence nodes first (genes, then drugs), then the disease node
    gene_ids = []
    for gene in genes:
        gene_ids.append(add_node('biolink:Gene', gene[1]))
    drug_ids = []
    for drug in drugs:
        drug_ids.append(add_node('biolink:Drug', drug[1]))
    disease_id = add_node('biolink:Disease', disease[1])

    # link all evidence to the disease; the disease id is held explicitly
    # instead of being recomputed from a running node counter
    for gene_id in gene_ids:
        add_edge('biolink:gene_associated_with_condition', gene_id, disease_id)
    for drug_id in drug_ids:
        add_edge('biolink:treats', drug_id, disease_id)

    # add target survival node
    phenotype = ('survival_time', 'EFO:0000714')
    phenotype_id = add_node('biolink:PhenotypicFeature', phenotype[1])

    # link disease to target
    # properties is optional for the service - if not specified the default
    # qualifier is '>=', it can also be '<='
    add_edge('biolink:has_phenotype', disease_id, phenotype_id,
             properties={'qualifier': '>=',
                         'days': st})

    return {'query_graph': {'edges': edges,
                            'nodes': nodes}}

# Read Genes and Drugs
Functions that read in our sets of available genes and drugs together with their respective Ensembl and ChEMBL CURIE IDs.

In [4]:
def readGenes():
    """Load the genes listed in ``gene_curie_map.csv``.

    The first row is treated as a header and skipped. An empty file yields
    an empty list, and blank lines are ignored.

    Returns:
        list: One 2-tuple per data row, built from the row's first two
        columns (presumably gene name and CURIE -- confirm against the CSV).
    """
    # newline='' leaves newline handling to the csv module, which the csv
    # documentation requires so quoted fields with embedded newlines survive.
    with open('gene_curie_map.csv', 'r', newline='') as gene_file:
        reader = csv.reader(gene_file)
        next(reader, None)  # skip the header; tolerate an empty file
        return [(row[0], row[1]) for row in reader if row]

In [5]:
def readDrugs():
    """Load the drugs listed in ``drug_curie_map.csv``.

    The first row is treated as a header and skipped. An empty file yields
    an empty list, and blank lines are ignored.

    Returns:
        list: One 2-tuple per data row, built from the row's first two
        columns (presumably drug name and CURIE -- confirm against the CSV).
    """
    # newline='' leaves newline handling to the csv module, which the csv
    # documentation requires so quoted fields with embedded newlines survive.
    with open('drug_curie_map.csv', 'r', newline='') as drug_file:
        reader = csv.reader(drug_file)
        next(reader, None)  # skip the header; tolerate an empty file
        return [(row[0], row[1]) for row in reader if row]

# Constructing the Query and pinging CHP
Uncomment the `readGenes()` / `readDrugs()` calls below to see which genes and drugs are available. Survival time, genes and drugs are passed in as evidence; genes and drugs are each given as a list of (name, CURIE) tuples. Currently only breast cancer can be used as the disease.

In [6]:
# Full lists of (name, CURIE) pairs that can be queried over; uncomment to load.
#gene_list = readGenes()
#drug_list = readDrugs()

# Evidence for this query, hand-picked from the lists above.
survival_time = 970 #days
disease = ('Breast_Cancer', 'MONDO:0007254')
genes = [
    ('RAF1', 'ENSEMBL:ENSG00000132155'),
    ('MAP3K13','ENSEMBL:ENSG00000073803'),
]
drugs = [
    ('CYCLOPHOSPHAMIDE', 'CHEMBL:CHEMBL88'),
]

# genes= and drugs= may be omitted when there is no such evidence
query = buildQuery(survival_time, disease, genes=genes, drugs=drugs)

# wrap the query graph in a message and send it to CHP
payload = {'message': query}
r = requests.post('http://chp.thayer.dartmouth.edu/query/', json=payload)

# Extract end probability and contributions

Extract the end probabilities and contributions (if specified in QG additional properties).

In [7]:
# Parse the CHP response and pull out the knowledge graph.
chp_res = json.loads(r.content)
KG = chp_res['message']['knowledge_graph']

# The survival probability is the first attribute of the has_phenotype edge.
# Start from None so a response without such an edge is reported clearly
# instead of failing later with a NameError.
p_survival = None
for edge in KG['edges'].values():
    if edge['predicate'] == 'biolink:has_phenotype':
        p_survival = edge['attributes'][0]['value']

if p_survival is None:
    raise ValueError("CHP response contains no 'biolink:has_phenotype' edge; "
                     "cannot extract the survival probability")

# probability of survival given QG specification
print("Probability of survival > {} days is:".format(survival_time), p_survival)

Probability of survival > 970 days is: 0.6653032869408916
