In [None]:
import html

import ipywidgets as ipw
import networkx as nx
import numpy as np
import pandas as pd

In [None]:
#simple helper functions
def gmean(data, axis=0):
    """Return the geometric mean of `data` along `axis`.

    The mean is taken in log space: the values are log-transformed,
    averaged along `axis`, and the average is exponentiated back.
    """
    log_values = np.log(data)
    mean_of_logs = np.mean(log_values, axis=axis)
    return np.exp(mean_of_logs)

def combine_ranks(data, hpo_terms):
    """Combine the ranks of the given HPO terms into one score per row.

    Terms that are not columns of `data` are ignored. The remaining columns
    are combined row-wise with the geometric mean and returned as a Series
    that carries the index of `data`.
    """
    # only terms that are actually present in the dataset can be used
    available_terms = [term for term in hpo_terms if term in data.columns]
    combined = gmean(data.loc[:, available_terms], axis=1)
    return pd.Series(combined, index=data.index)

def reduce_terms(data, hpo_tree, hpo_terms):
    """Map each HPO term to the nearest term that is a column of `data`.

    For every term the shortest path from the term to 'HP:0000118' is walked
    in order, and the first node on it that is a column of `data` (possibly
    the term itself) is kept. Terms whose path contains no such node are
    dropped. Returns the set of kept terms.
    """
    known_columns = data.columns
    closest_terms = set()
    for term in hpo_terms:
        lineage = nx.shortest_path(hpo_tree, term, 'HP:0000118')
        # first node along the path that the dataset actually has, if any
        match = next((node for node in lineage if node in known_columns), None)
        if match is not None:
            closest_terms.add(match)
    return closest_terms

def build_table(df):
    """Render scored genes as an HTML table.

    `df` is expected to be a Series of scores indexed by gene ID (this is
    what `on_submit` passes in). For each gene the display name is read from
    the 'name' attribute of its node in `gene_annotation`, and the gene
    symbol is taken to be the first neighbour of that node. Returns the
    table as an HTML string with one row per entry of `df`.
    """
    header = """
    <table width='100%'>
    <tr>
    <th>Entrez ID</th>
    <th>Gene Symbol</th>
    <th>Name</th>
    <th>Score</th>
    </tr>"""
    rows = []
    # iterate over (gene, score) pairs instead of looking every score up with
    # .loc, which would return a Series (and break '%.4f') on duplicate labels
    for gene, score in df.items():
        # Graph.node was removed in networkx 2.4; Graph.nodes is the supported spelling
        g_name = gene_annotation.nodes[gene]['name']
        # a gene without any neighbour simply gets an empty symbol cell
        g_symbol = next(iter(gene_annotation.neighbors(gene)), '')
        rows.append("<tr><td>%s</td><td>%s</td><td>%s</td><td>%.4f</td></tr>" % (gene, g_symbol, g_name, score))
    return header + "".join(rows) + "</table>"

def get_valid_genes(gene_string):
    """Split the user's gene list into usable gene IDs and rejected entries.

    `gene_string` holds one gene per line. Surrounding whitespace is removed
    and blank lines are skipped, so trailing newlines or Windows line endings
    do not show up as "excluded" items. Each remaining entry is looked up in
    `gene_annotation`:

    * entries that are not nodes of the graph are rejected;
    * nodes whose 'type' is 'entrez' are kept as they are;
    * any other node is treated as a gene symbol and replaced by its
      neighbours in the graph.

    Returns a tuple `(final_genes, genes_out)`: a set of accepted gene IDs
    and a list of the rejected entries in input order.
    """
    final_genes = set()
    genes_out = []
    for line in gene_string.splitlines():
        gene = line.strip()
        if not gene:
            continue
        if gene not in gene_annotation:
            genes_out.append(gene)
        # Graph.node was removed in networkx 2.4; Graph.nodes is the supported spelling
        elif gene_annotation.nodes[gene]['type'] == 'entrez':
            final_genes.add(gene)
        else:
            #a gene symbol is given
            final_genes.update(gene_annotation.neighbors(gene))
    return final_genes, genes_out

def add_warning_excl(genes_out):
    """Build the HTML warning that lists the entries excluded from the analysis.

    `genes_out` is an iterable of the rejected entries. They come straight
    from what the user typed, so each one is HTML-escaped before being placed
    in the markup. Returns the warning block as an HTML string.
    """
    header = '<hr><div class="excluded"><b>Warning!</b> The following items were excluded from the analysis:<br>'
    items = ''.join('%s<br>' % html.escape(str(g)) for g in genes_out)
    return header + items + "</div>"

In [None]:
#load data
#github does not allow files larger than 100 MB, so the model is stored as nine
#pickled chunks (00-08); they are read and concatenated in a single call so the
#growing frame is not re-copied for every chunk
model_ranks = pd.concat(
    [pd.read_pickle("data/global_ind_pheno_tumor_munge.best_pred.%02d.pickle" % chunk) for chunk in range(9)]
)

#scored terms with some annotation
hp_annot = pd.read_pickle("data/global_ind_pheno_tumor_munge.best.pickle")
#NOTE: all files below are pickles, which can run arbitrary code when loaded;
#only use data files that ship with this notebook.
#nx.read_gpickle is the public name of the reader (it was removed in networkx 3.0)
#combined ranks for OMIM diseases
disease_net = nx.read_gpickle("data/disease_annot.gpickle")
#the HPO obo tree
hpo_net = nx.read_gpickle("data/hp.180127.obo.gpickle")
#gene annotation to have gene symbol and/or entrez ID
gene_annotation = nx.read_gpickle("data/gene_annotation.gpickle")

In [None]:
#HPO terms offered for selection: the nodes that have a path to 'HP:0000118'
hpo_ids = sorted(x for x in hpo_net.nodes() if nx.has_path(hpo_net, x, 'HP:0000118'))
#NOTE(review): this drops whichever ID sorts first. If the intent is to hide
#'HP:0000118' itself, confirm that it really is the first ID after sorting.
hpo_ids = hpo_ids[1:]
#Graph.node was removed in networkx 2.4; Graph.nodes is the supported spelling
hpo_names = [hpo_net.nodes[x]['name'] for x in hpo_ids]
#labels shown in the widgets have the form "<id>: <name>"
hpo_terms = ["%s: %s" % pair for pair in zip(hpo_ids, hpo_names)]
#diseases offered for selection: the nodes of disease_net whose type is 'disease'
disease_ids = sorted(x for x in disease_net.nodes() if disease_net.nodes[x]['type'] == 'disease')
disease_names = [disease_net.nodes[x]['name'] for x in disease_ids]
diseases = ["%s: %s" % pair for pair in zip(disease_ids, disease_names)]
selected_terms = set()

#define the widget for HPO search
#the elements
gene_list = ipw.Textarea(placeholder='Enter your gene list', description='Genes:', disabled=False)

#free-text boxes used to filter the two lists below
search_hp_widget = ipw.Text(placeholder='Start typing phenotypes...') #to search the phenotypes
search_dis_widget = ipw.Text(placeholder='Start typing description...') #to search the diseases

#lists to pick from, and the list of terms chosen so far
options_widget = ipw.SelectMultiple(options=hpo_terms) #this lists all terms to be selected
disease_widget = ipw.Select(options=diseases)
selected_widget = ipw.SelectMultiple()

#buttons
add_hp_button = ipw.Button(description='Add term')
add_dis_button = ipw.Button(description='Add disease')
remove_button = ipw.Button(description='Remove term')
reset_button = ipw.Button(description='Reset')
submit_button = ipw.Button(description='Submit')

#output areas
out = ipw.Output()
results = ipw.HTML()

#stitch together
hp_search = ipw.VBox(children=[
    search_hp_widget,
    options_widget,
    add_hp_button,
])
dis_search = ipw.VBox(children=[
    search_dis_widget,
    disease_widget,
    add_dis_button,
])
chosen_area = ipw.VBox(children=[selected_widget, remove_button])
bottom_area = ipw.HBox(children=[reset_button, submit_button])
selection_area = ipw.HBox(children=[
    hp_search,
    ipw.HTML(value="<b> OR </b>"),
    dis_search,
])
multi_select = ipw.VBox(children=[selection_area, chosen_area])

#define actions
def on_hp_search(change):
    """Filter the HPO term list while the user types in the phenotype search box.

    `change` is the change notification of the search box; its 'new' entry is
    the current text. An empty text restores the full list, otherwise only the
    terms containing the text (case-insensitive) are shown.
    """
    query = change['new']
    if query != '':
        needle = query.lower()
        options_widget.options = [term for term in hpo_terms if needle in term.lower()]
    else:
        # search box cleared: show everything again
        options_widget.options = hpo_terms

def on_dis_search(change):
    """Filter the disease list while the user types in the disease search box.

    `change` is the change notification of the search box; its 'new' entry is
    the current text. An empty text restores the full list, otherwise only the
    diseases containing the text (case-insensitive) are shown.
    """
    query = change['new']
    if query != '':
        needle = query.lower()
        disease_widget.options = [entry for entry in diseases if needle in entry.lower()]
    else:
        # search box cleared: show everything again
        disease_widget.options = diseases

    
def on_add_hp(b):
    """Add the terms highlighted in the HPO list to the list of chosen terms.

    The chosen list is kept free of duplicates and sorted. `b` is the clicked
    button and is not used.
    """
    merged = set(selected_widget.options)
    merged.update(options_widget.value)
    selected_widget.options = sorted(merged)

def on_add_dis(b):
    """Add the HPO terms linked to the selected disease to the chosen terms.

    The disease ID is the part of the selected label before the first ': '
    (labels are built as "<id>: <name>"). Its neighbours in `disease_net` are
    kept when they are nodes of `hpo_net` with a path to 'HP:0000118', turned
    into "<id>: <name>" labels and merged, without duplicates and sorted, into
    the list of chosen terms. Nothing happens when no disease is selected,
    e.g. when the search box has filtered the list down to nothing. `b` is the
    clicked button and is not used.
    """
    selection = disease_widget.value
    if not selection:
        return
    dis_id = selection.split(': ', 1)[0]
    # has_path raises for nodes that are missing from the graph, so check membership first
    hp_map = [x for x in disease_net.neighbors(dis_id)
              if x in hpo_net and nx.has_path(hpo_net, x, 'HP:0000118')]
    # Graph.node was removed in networkx 2.4; Graph.nodes is the supported spelling
    hp_terms = ["%s: %s" % (x, hpo_net.nodes[x]['name']) for x in hp_map]
    selected_widget.options = sorted(set(selected_widget.options).union(hp_terms))

    
    
def on_remove(b):
    """Remove the highlighted entries from the list of chosen terms.

    `b` is the clicked button and is not used.
    """
    highlighted = selected_widget.value
    remaining = []
    for term in selected_widget.options:
        if term in highlighted:
            continue
        remaining.append(term)
    selected_widget.options = remaining

def on_submit(b):
    """Score the entered genes against the chosen HPO terms and show the result.

    The result (or a message explaining what is missing) is written to the
    `results` widget as HTML. Messages are chosen in this order: no gene text
    entered, no usable gene, no HPO term chosen. Entries that could not be
    used are listed in a warning appended to the output: both the ones
    rejected by `get_valid_genes` and recognised genes that have no row in
    `model_ranks`. `b` is the clicked button and is not used.
    """
    input_genes = set()
    excluded_genes = []
    if gene_list.value:
        input_genes, excluded_genes = get_valid_genes(gene_list.value)
        excluded_genes = list(excluded_genes)
    #labels are "<id>: <name>"; the first 10 characters are taken as the HPO ID
    input_terms = [x[:10] for x in selected_widget.options]

    #a gene can only be scored if the model has a row for it; .loc raises on
    #missing labels, and recent pandas also rejects a set as indexer, so build a list
    scorable_genes = [g for g in input_genes if g in model_ranks.index]
    excluded_genes += [g for g in input_genes if g not in model_ranks.index]

    if not gene_list.value:
        output = "<b>You should provide some genes</b>"
    elif not scorable_genes:
        output = "<b>None of the genes provided is valid</b>"
    elif not input_terms:
        output = "<b>You should provide some HPO terms</b>"
    else:
        #we have genes and terms
        subset_ranks = model_ranks.loc[scorable_genes] #subset data to perform less calculation
        ranked_genes = combine_ranks(subset_ranks, reduce_terms(subset_ranks, hpo_net, input_terms)).sort_values()
        output = build_table(ranked_genes)
    if excluded_genes:
        output += add_warning_excl(excluded_genes)
    results.value = output


def on_reset(b):
    """Clear the search boxes, the gene list, the chosen terms and the results.

    `b` is the clicked button and is not used.
    """
    # empty every text input, in the same order as before
    for text_input in (search_hp_widget, search_dis_widget, gene_list):
        text_input.value = ''
    selected_widget.options = ()
    results.value = ''

#link actions
#buttons call their handler on click
for _button, _handler in (
    (add_hp_button, on_add_hp),
    (add_dis_button, on_add_dis),
    (remove_button, on_remove),
    (submit_button, on_submit),
    (reset_button, on_reset),
):
    _button.on_click(_handler)
#search boxes call their handler whenever their text changes
for _search_box, _handler in (
    (search_hp_widget, on_hp_search),
    (search_dis_widget, on_dis_search),
):
    _search_box.observe(_handler, names='value')
#the widgets themselves are displayed by the cells further down



## DISCAN notebook
This is a simple implementation of [DISCAN](https://doi.org/10.1101/120121) using a Jupyter interactive notebook. 

This web application is intended to score a gene list against a given set of phenotypes encoded in the [Human Phenotype Ontology](http://human-phenotype-ontology.github.io). Phenotypes can also be added by selecting a specific disease from the disease menu.

---
**Warning**

*If you are using this notebook on [binder](http://mybinder.org), you may experience poor performance*

----

#### 1) Enter a list of genes here

In [None]:
#show the text area where the gene list is entered
display(gene_list)

#### 2) Choose some phenotypes

In [None]:
#show the phenotype/disease pickers together with the list of chosen terms
display(multi_select)

#### 3) Submit your query and check results

In [None]:
#show the Reset/Submit buttons followed by the HTML area that receives the results
display(bottom_area, results)

----
<small>
Davide Cittaro<br>
[Center for Translational Genomics and Bioinformatics, Milano](http://www.hsr.it/research/organization/divisions-centers/center-for-translational-genomics-and-bioinformatics/)
</small>