# Functionally Annotate a Hierarchical Model

This notebook performs functional analysis on a hierarchical model.

Results of this analysis: 
 - communities in the model are annotated with functional enrichment results. 
  - different enrichment sources are treated separately; nodes are annotated with each source's results in separate properties
 - a tentative node name is computed
 - annotations are added to nodes from a tabular data file
 - a hierarchy style is applied

Inputs:
 - NDEx UUID of the hierarchical model network created by the CyCommunityDetection app
 - NDEx UUID of a style template network

Outputs:
 - An annotated copy of the hierarchical model network

 


### Imports

In [None]:
from ndex2.nice_cx_network import NiceCXNetwork
import ndex2.client as nc
import ndex2
print("ndex2 version: " + ndex2.__version__)
import networkx as nx
print("networkx version: " + nx.__version__)
import pandas as pd
print("pandas version: " + pd.__version__)
import os
import sys

# functional enrichment service
from gprofiler import GProfiler
gp = GProfiler(
    user_agent='hierarchical model analysis', #optional user agent
    return_dataframe=True, #return pandas dataframe or plain python structures    
    )

# used to prompt user for NDEx password in this notebook
import getpass
import plotly.graph_objects as go


### Setup the NDEx Account for Output
Be sure to hit enter in the field to set the value.

In [None]:
# used to prompt user for NDEx password in this notebook
import getpass
my_account = getpass.getpass(prompt='Enter the NDEx account name: ')
my_password = getpass.getpass(prompt='Enter the NDEx account password ')
my_server = "http://public.ndexbio.org" # edit this if you want to use an NDEx server other than the public server
# Validate the account and password by connecting and requesting the server status.
try:
    my_ndex = nc.Ndex2(my_server, my_account, my_password)
    my_ndex.update_status()
    print("Success.  Please continue.")
except Exception as inst:
    # Never echo the password: cell output is stored in the notebook file and
    # travels with it whenever the notebook is shared or committed.
    print("Could not access account %s on %s" % (my_account, my_server))
    print(inst.args)

### Functionally Annotate a Community

In [None]:
annotation_sources = []  # NOTE(review): never read or filled anywhere in the visible notebook
def gprofiler_annotate_community(node_id, model):
    """Annotate one community node with its top g:Profiler enrichment terms.

    The community's members are read from the node's "CD_MemberList"
    attribute and profiled with the notebook-level ``gp`` client. Only the
    first 20 rows of the result are considered. The first significant term
    becomes the node name, and for each enrichment source the first
    significant term is stored on the node under an attribute named after
    that source.

    :param node_id: id of the community node in ``model``
    :param model: network holding the node (expected to be a NiceCXNetwork)
    :return: None; ``model`` is modified in place
    """
    members = model.get_node_attribute_value(node_id, "CD_MemberList")
    if isinstance(members, str):
        # Other cells of this notebook split CD_MemberList on spaces, so a
        # string value is treated as a whitespace-separated list of genes.
        members = members.split()
    if not members:
        # Nothing to profile for a node without members.
        return

    profile = gp.profile(organism='hsapiens',
                         query=members,
                         no_evidences=False).head(20)

    # for each source, keep the first (top) significant annotation
    annotation_map = {}
    name = None
    for index, row in profile.iterrows():
        # bool() instead of "is True": the flag may come back as a numpy
        # boolean, for which an identity test against True fails.
        if not bool(row["significant"]):
            continue
        term_name = row["name"]
        if name is None:
            # the first significant annotation is the node name; the name is
            # the 'n' field of the node record (the same field other cells
            # read through nodes.get(...).get("n"))
            name = term_name
            model.nodes[node_id]["n"] = name
        annotation_map.setdefault(row["source"], term_name)

    for source, annotation in annotation_map.items():
        model.set_node_attribute(node_id, source, annotation)

    
def protein_data_annotate_community(node_id, model, pd_df, attribute_column_name):
    """Record which members of a community appear in a column of a data table.

    Two attributes are written to the node:
    ``attribute_column_name + "_genes"`` (the matching members, as a
    list_of_string) and ``attribute_column_name + "_count"`` (how many
    matched, as an integer).

    :param node_id: id of the community node in ``model``
    :param model: network holding the node (expected to be a NiceCXNetwork)
    :param pd_df: pandas DataFrame with the annotation table
    :param attribute_column_name: column of ``pd_df`` whose values are
        compared with the member names (assumes the column holds gene
        names - TODO confirm against the annotation file)
    :return: ``model``, modified in place
    """
    members = model.get_node_attribute_value(node_id, "CD_MemberList")
    if isinstance(members, str):
        # CD_MemberList is split on spaces elsewhere in this notebook;
        # iterating the raw string would walk it character by character.
        members = members.split()
    elif members is None:
        members = []

    # `x in series` tests the index labels, not the values, so membership is
    # checked against a set of the column's (non-missing) values instead.
    annotated_genes = set(pd_df[attribute_column_name].dropna())
    attribute_member_names = attribute_column_name + "_genes"
    attribute_member_count = attribute_column_name + "_count"

    annotated_members = [member for member in members if member in annotated_genes]

    model.set_node_attribute(node_id, attribute_member_names, annotated_members,
                             type="list_of_string")
    model.set_node_attribute(node_id, attribute_member_count, len(annotated_members),
                             type="integer")
    return model
        

### Get the Model

In [None]:
# NDEx UUID of the hierarchical model network to annotate.
model_uuid = "5d97a04a-6fab-11ea-bfdc-0ac135e8bacf"

# Download the network from the public NDEx server with the credentials
# entered in the account setup cell.
model = ndex2.create_nice_cx_from_server(
    server='public.ndexbio.org',
    username=my_account,
    password=my_password,
    uuid=model_uuid,
)

### Get the Annotation Table

In [None]:
# Path to the tabular (CSV) protein annotation file. Set this before running.
protein_annotation_file = ""
if not protein_annotation_file:
    # Fail with an actionable message instead of letting read_csv choke on "".
    raise ValueError("Set protein_annotation_file to the path of the annotation CSV "
                     "before running this cell.")
protein_annotation_df = pd.read_csv(protein_annotation_file)

### Get the Style Template

In [None]:
# NDEx UUID of the style template network.
# NOTE(review): this is the same UUID as model_uuid above - looks like a
# placeholder; confirm the real template UUID.
style_template_uuid = "5d97a04a-6fab-11ea-bfdc-0ac135e8bacf"
# Variable name corrected from the misspelled "stylt_template".
# TODO: the template is downloaded here but never applied to the model in
# the visible notebook.
style_template = ndex2.create_nice_cx_from_server(server='public.ndexbio.org',
                                                  uuid=style_template_uuid,
                                                  password=my_password,
                                                  username=my_account)

### Annotate Communities

In [None]:
# Column of protein_annotation_df whose values are matched against the
# community members. It was never defined in the notebook, so the loop below
# failed with a NameError. TODO: set this before running.
attribute_column_name = ""
if not attribute_column_name:
    raise ValueError("Set attribute_column_name to a column of protein_annotation_df "
                     "before running this cell.")

# iterate over the nodes (communities) in the model
for node_id, node in model.get_nodes():
    gprofiler_annotate_community(node_id, model)
    protein_data_annotate_community(node_id, model, protein_annotation_df, attribute_column_name)


### Perform Functional Annotation of All Communities

In [None]:
# iterate over the nodes (communities) in the model
# `annotate_community` is not defined anywhere in this notebook;
# gprofiler_annotate_community is the functional annotator taking the same
# (node_id, model) arguments.
# NOTE(review): the "Annotate Communities" cell already makes this call for
# every node - run only one of the two cells.
for node_id, node in model.get_nodes():
    gprofiler_annotate_community(node_id, model)


### Save the Annotated Model to NDEx

In [None]:
# Upload the annotated model to the server and account entered in the
# account setup cell.
upload_message = model.upload_to(my_server, my_account, my_password)
# Show whatever upload_to returned as the cell output.
upload_message

## OLD STUFF AFTER HERE

In [None]:
# NDEx UUID of the interaction network.
interaction_network_uuid = "5d97a04a-6fab-11ea-bfdc-0ac135e8bacf"
# The uuid argument previously named an undefined variable
# (sars_cov2_interactions_uuid); it now uses the UUID defined just above.
interaction_map = ndex2.create_nice_cx_from_server(server='public.ndexbio.org',
                                                   uuid=interaction_network_uuid,
                                                   password=my_password,
                                                   username=my_account)
# Later cells read the interaction network under this name.
sars_cov2_interactions = interaction_map

### Utilities

In [None]:
 def nice_cx_to_node_dataframe(nice_cx_network):
     """
     Create a Pandas DataFrame in which each row is a node and columns are node attributes.

     The first two columns are always "name" (the node's 'n' field) and
     "node_id"; the remaining columns are the node attribute names in the
     order they are first encountered, so the layout is the same on every run.
     Nodes lacking an attribute get a missing value in that column.

      Example:
         ``df = nice_cx_to_node_dataframe(my_nice_cx_network) # df is now a pandas dataframe``

     :param nice_cx_network: network exposing a ``nodes`` mapping and ``get_node_attributes(node_id)``
     :return: Pandas dataframe
     :rtype: Pandas dataframe
     """
     rows = []
     if sys.version_info.major == 3:
         node_items = nice_cx_network.nodes.items()
     else:
         node_items = nice_cx_network.nodes.iteritems()

     # Column names are kept in a list (plus a set for fast lookup): building
     # the column list from a set alone gave a different order from run to run.
     df_columns = ["name", "node_id"]
     seen_columns = set(df_columns)

     # v is the node item values
     for node_id, v in node_items:
         row = {"name": v.get('n'), "node_id": node_id}
         # "or []": tolerate a network that reports no attribute list for a node
         for attribute in nice_cx_network.get_node_attributes(node_id) or []:
             attribute_name = attribute.get('n')
             row[attribute_name] = attribute.get('v')
             if attribute_name not in seen_columns:
                 seen_columns.add(attribute_name)
                 df_columns.append(attribute_name)
         rows.append(row)

     return pd.DataFrame(rows, columns=df_columns)

def jaccard_similarity_lists(list1, list2):
    """Return the Jaccard similarity of the distinct items of two lists.

    Both inputs are reduced to sets first, so repeated items do not inflate
    the union (the previous length-based union did).

    :param list1: iterable of hashable items
    :param list2: iterable of hashable items
    :return: size of the intersection divided by size of the union, as a
        float; 0.0 when both inputs are empty
    """
    items1, items2 = set(list1), set(list2)
    union = len(items1 | items2)
    if union == 0:
        # Two empty inputs: avoid dividing by zero.
        return 0.0
    return float(len(items1 & items2)) / union

def jaccard_similarity_sets(set1, set2):
    """Return the Jaccard similarity of two sets.

    :param set1: a set
    :param set2: a set
    :return: size of the intersection divided by size of the union, as a
        float; 0.0 when both sets are empty
    """
    intersection = len(set1.intersection(set2))
    union = (len(set1) + len(set2)) - intersection
    if union == 0:
        # Two empty sets: avoid dividing by zero.
        return 0.0
    return float(intersection) / union

### Add Query Shortcuts

### Save the Output Network

### Analysis Functions

Output Table:
- term name  (name is CD_CommunityName otherwise method:name)
- viral proteins
- curated: yes, no, similar, more specific, more general
- curated interactors
- hidef
- louvain
- infomap
- oslom
 
for each method:

- members
- interactor members
- interactor members direct
- google members


##### Annotate Communities with Interactor Proteins, both direct and all

##### Annotate Communities with Viral Proteins

#####

The goal is to have the viral proteins at the core of the network, connecting to their interactors. Each interactor is in turn connected to the terms in the hierarchy to which it is _directly_ annotated. Unfortunately, the direct annotations are not distinguished from the indirect ones in this data. We therefore need to compute them by finding the terms that contain the interactor AND that have no child node that contains the interactor.

We populate a dict, `int_to_com`, in which the keys are interactors and the values are sets of communities, initialized to empty sets.
- for each interactor:
-- for each community:
    if the interactor is in the community, add the community to the set associated with the interactor.
    
We now prune each set of communities. For each community C in the list, iterate over the edges in the network. We only want to keep C if it has no parents, i.e. if there is any edge in which C is the target, remove it from the set. 

When the set is pruned, add an edge from the interactor to each remaining community.


Issues:
- connecting the interactors to the communities isn't working - connections were all to corum root
- need to ditch the big nodes
- get rid of terms that don't connect to an interactor
- need a column for label
- need a column for type
- where are the viral protein links
- include all the bioplex edges between interactors

In [None]:
# Maps the name of an edge's target node (the prey) to the name of its source
# node (the bait), for every edge that carries a string "Preys" attribute.
interactor_viral_map = {}

for edge_id, edge in sars_cov2_interactions.get_edges():
    preys = sars_cov2_interactions.get_edge_attribute_value(edge_id, "Preys")
    if not isinstance(preys, str):
        # edges without a string "Preys" value contribute nothing
        continue
    bait = sars_cov2_interactions.nodes.get(edge["s"]).get("n")
    prey = sars_cov2_interactions.nodes.get(edge["t"]).get("n")
    interactor_viral_map[prey] = bait
    

def format_google_search(list_of_strings):
    """Build a spreadsheet HYPERLINK formula that searches Google for the terms.

    :param list_of_strings: search terms
    :return: a string of the form
        ``=HYPERLINK("https://www.google.com/search?q=TERM1+TERM2", "google")``
    """
    # Imported here so the function does not depend on the imports cell.
    from urllib.parse import quote_plus

    # Each term is URL-encoded so that spaces or quote characters inside a
    # term cannot break the URL or the surrounding formula string.
    search_string = "+".join(quote_plus(str(term)) for term in list_of_strings)
    # The search string was previously left inside the literal and never
    # substituted into the formula.
    return '=HYPERLINK("https://www.google.com/search?q={}", "google")'.format(search_string)

# Distinct viral proteins for a list of interactors, via a set comprehension.
def get_viral_proteins(interactor_list, interactor_viral_map):
    """Return the distinct values that ``interactor_viral_map`` holds for the interactors.

    Interactors with no entry in the map (or mapped to None) are skipped.

    :param interactor_list: iterable of interactor names
    :param interactor_viral_map: dict from interactor name to viral protein
    :return: list of distinct viral proteins, in no particular order
    """
    looked_up = (interactor_viral_map.get(interactor) for interactor in interactor_list)
    return list({protein for protein in looked_up if protein is not None})

def community_name_analysis(hierarchy_network, community_name_map, cd_method, interactors, interactor_viral_map):
    """Fill ``community_name_map`` with one entry per node of a hierarchy.

    Entries are keyed by the node's name ('n'). Each entry is a dict that
    receives "name" (the node's CD_CommunityName, or the node name when that
    attribute is "(none)") plus the member list, the members that are
    interactors, their count and the related viral proteins. When
    ``cd_method`` is None these are stored under "members", "interactors",
    "#int" and "viral"; otherwise the same keys are prefixed with
    ``cd_method + ":"`` and ``entry[cd_method]`` is set to True.

    :param hierarchy_network: network whose nodes are communities
    :param community_name_map: dict to update (existing entries are reused)
    :param cd_method: method label used as key prefix, or None
    :param interactors: iterable of interactor names
    :param interactor_viral_map: dict from interactor name to viral protein
    :return: ``community_name_map``
    """
    interactor_set = set(interactors)

    for node_id, node in hierarchy_network.get_nodes():
        # Name shown for the community: the curated name unless it is "(none)".
        community_name = hierarchy_network.get_node_attribute_value(node_id, "CD_CommunityName")
        if community_name == "(none)":
            community_name = node.get('n')

        # Fetch the entry for this node name, creating it on first sight.
        entry_key = node.get("n")
        cdict = community_name_map.get(entry_key)
        if cdict is None:
            cdict = {}
            community_name_map[entry_key] = cdict

        cdict["name"] = community_name

        member_names = hierarchy_network.get_node_attribute_value(node_id, "CD_MemberList").split(" ")
        member_interactors = list(set(member_names).intersection(interactor_set))

        if cd_method is None:
            key_prefix = ""
        else:
            cdict[cd_method] = True  # shows that this method found a term with this name
            key_prefix = cd_method + ":"

        cdict[key_prefix + "members"] = member_names
        cdict[key_prefix + "interactors"] = member_interactors
        cdict[key_prefix + "#int"] = len(member_interactors)
        cdict[key_prefix + "viral"] = get_viral_proteins(member_interactors, interactor_viral_map)
        # TODO: interactors directly annotated to term

    return community_name_map
              
def analyze_cd_methods(community_name_map, methods, interactor_viral_map, interactors=None):
    """Run ``community_name_analysis`` for several methods and list the result columns.

    After all methods have been processed, every entry of
    ``community_name_map`` that a method did not touch gets default values
    for that method (False, empty lists, 0), so all entries share the same keys.

    :param community_name_map: dict updated in place, one entry per community name
    :param methods: dict from method name to that method's hierarchy network
    :param interactor_viral_map: dict from interactor name to viral protein
    :param interactors: iterable of interactor names; when omitted, the
        notebook-level ``sars_cov2_interactors`` is used, as before
    :return: list of column names: method flags, then per-method
        viral / interactors / #int columns, then per-method member columns
    """
    if interactors is None:
        # Backward compatible fallback to the global the function used to
        # read implicitly.
        interactors = sars_cov2_interactors

    first_columns = []  # ["curated", "curated_interactors"]
    method_name_columns = []
    method_member_columns = []
    method_interactor_columns = []
    for method_name, method_model in methods.items():
        community_name_analysis(method_model, community_name_map, method_name,
                                interactors, interactor_viral_map)
        method_name_columns.append(method_name)
        method_member_columns.append(method_name + ":members")
        method_interactor_columns.append(method_name + ":viral")
        method_interactor_columns.append(method_name + ":interactors")
        method_interactor_columns.append(method_name + ":#int")

    # post-process: give communities a method did not find that method's defaults
    for method_name in methods:
        for entry in community_name_map.values():
            if entry.get(method_name) is None:
                entry[method_name] = False
                entry[method_name + ":members"] = []
                entry[method_name + ":interactors"] = []
                entry[method_name + ":#int"] = 0
                entry[method_name + ":viral"] = []

    return first_columns + method_name_columns + method_interactor_columns + method_member_columns



### Process One Hierarchy

In [None]:
# Analyse hidef_model alone (no method prefix on the keys), starting from an
# empty map; community_name_analysis hands the filled map back.
community_name_map = community_name_analysis(
    hidef_model,
    {},
    None,
    sars_cov2_interactors,
    interactor_viral_map,
)

# One row per community, ordered by number of interactors.
community_name_df = pd.DataFrame.from_dict(community_name_map, orient="index").sort_values(by=['#int'])

# Communities containing exactly two interactors.
only_two = community_name_df[community_name_df['#int'] == 2]
only_two

### Process Multiple Hierarchies

In [None]:
# Community detection methods to compare; the commented entries can be enabled
# once their models are loaded.
cd_methods = {
    "hidef": hidef_model,
    #"louvain": louvain_model,
    #"infomap": infomap_model,
    #"oslom" : oslom_model
}

# Start from an empty map; analyze_cd_methods fills it in place and returns
# the column order for the table.
community_name_map = {}
columns = analyze_cd_methods(community_name_map, cd_methods, interactor_viral_map)

# One row per community, restricted to the reported columns and ordered by the
# number of interactors found by hidef.
community_name_df = (
    pd.DataFrame.from_dict(community_name_map, orient="index")[columns]
    .sort_values(by=['hidef:#int'])
)

community_name_df

### Interpret one community
Use the g:Profiler client to get the top n terms enriched for the community

Create a dataframe in which the terms are rows sorted by overlap.
The columns are:

 - name (term_name)
 - id (term_id)
 - anot (count of annotated proteins)
 - anot_list
 - pv (enrichment p-value)
 - vir (viral proteins in anot_list)
 - int (count of interactors in anot_list)
 - columns for community
 -  value is 0, 1, 2 where 2 indicates an interactor gene
  - sorted by # occurrences across communities, or maybe just sum of the gene column


In [None]:


def make_community_interpretation(name, community_name_map, cd_method=None):
    """Build a term-by-gene table of g:Profiler enrichment for one community.

    The community's members are profiled with the notebook-level ``gp``
    client and the first 20 result rows are kept. The returned frame has the
    term columns "name", "id", "#anot" and "pv" followed by one column per
    gene. A gene cell is 2 when the gene is in the term's intersection and is
    an interactor of the community, 1 when it is in the intersection only,
    and 0 otherwise. Gene columns are ordered by how many terms contain the
    gene, most frequent first.

    :param name: key of the community in ``community_name_map``
    :param community_name_map: dict as filled by ``community_name_analysis``
    :param cd_method: method label used as key prefix when the map was built
        per method (e.g. "hidef" reads "hidef:members"); None reads the
        unprefixed "members" / "interactors" keys, as before
    :return: pandas DataFrame described above
    :raises KeyError: if ``name`` is not in ``community_name_map``
    :raises ValueError: if the community entry has no member list
    """
    community = community_name_map.get(name)
    if community is None:
        raise KeyError("no community named %r in community_name_map" % (name,))

    key_prefix = "" if cd_method is None else cd_method + ":"
    members = community.get(key_prefix + "members")
    if not members:
        raise ValueError("community %r has no %r entry; pass cd_method if the map "
                         "was built per method" % (name, key_prefix + "members"))
    interactors = set(community.get(key_prefix + "interactors") or [])

    profile = gp.profile(organism='hsapiens',
                         query=members,
                         no_evidences=False).head(20)

    # rename() returns a new frame, so the columns added below never write
    # into a slice of `profile` (no SettingWithCopyWarning, no inplace).
    i_df = profile.loc[:, ["name", "native", "intersection_size", "p_value"]].rename(
        columns={'native': 'id', 'intersection_size': '#anot', "p_value": "pv"})
    term_columns = list(i_df.columns)

    # one zero-filled column per member gene, plus a running count per gene
    annotation_counts = {}
    for gene_name in members:
        i_df[gene_name] = 0
        annotation_counts[gene_name] = 0

    for index, row in profile.iterrows():
        for gene_name in row["intersections"]:
            if gene_name not in annotation_counts:
                # The service reported a gene name that is not spelled like
                # any member; give it its own column instead of failing.
                i_df[gene_name] = 0
                annotation_counts[gene_name] = 0
            i_df.at[index, gene_name] = 2 if gene_name in interactors else 1
            annotation_counts[gene_name] += 1

    # most frequently annotated genes first (ties keep insertion order)
    gene_columns = sorted(annotation_counts, key=annotation_counts.get, reverse=True)
    return i_df[term_columns + gene_columns]
    
    

In [None]:
# Scratch check: dictionary keys ordered by descending value.
x = {"foo": 2, "bar": 4, "baz": 3, "qux": 1, "ick": 0}
sorted(x, key=x.get, reverse=True)

In [None]:
# Key of the community to interpret, as it appears in community_name_map.
comm_name = "C124261"
# NOTE(review): make_community_interpretation reads the entry's "members" and
# "interactors" keys, but the map rebuilt in the "Process Multiple Hierarchies"
# cell stores them as "hidef:members" / "hidef:interactors" - confirm which
# map this cell is meant to run against.
inter_df = make_community_interpretation(comm_name, community_name_map)


# Display the interpretation table as the cell output.
inter_df

In [None]:
# Expand "~" explicitly so the export does not rely on the Excel writer
# resolving the home directory itself.
inter_df.to_excel(os.path.join(os.path.expanduser("~"), "Desktop", comm_name + ".xlsx"))

In [None]:
# Render inter_df as a Plotly table: the header row lists the column names and
# each cell column holds the values of one DataFrame column.
fig = go.Figure()
fig.add_trace(
    go.Table(
        header=dict(
            values=list(inter_df.columns),
            fill_color='paleturquoise',
            align='left',
        ),
        cells=dict(
            values=inter_df.transpose().values.tolist(),
            fill_color='lavender',
            align='left',
        ),
    )
)
fig.show()

In [None]:
# Expand "~" explicitly so the export does not rely on the Excel writer
# resolving the home directory itself.
community_name_df.to_excel(os.path.join(os.path.expanduser("~"), "Desktop", "community_name_4.xlsx"))

In [None]:
        # Reconstructed from an incomplete cell: the original fragment was not
        # valid Python (a dangling "in", a mis-indented "else", and dictionaries
        # used before being created).
        # NOTE(review): the loop header and the source of `members` were missing
        # from the fragment; iterating hierarchy_network's nodes and reading
        # "CD_MemberList" follows what the rest of the notebook does - confirm.

        # interactor name -> set of ids of the communities that contain it
        int_to_com = {interactor_name: set() for interactor_name in sars_cov2_interactors}
        # the fragment used this (misspelled) name for the same mapping
        interactor_to_commmunity_map = int_to_com
        # interactor name -> node id, for nodes that are not communities
        interactor_name_to_id = {}

        for node_id, node in hierarchy_network.get_nodes():
            members = hierarchy_network.get_node_attribute_value(node_id, "CD_MemberList")
            if isinstance(members, str):
                # the node is a community if it has members
                for member_name in members.split(" "):
                    community_set = int_to_com.get(member_name)
                    if community_set is not None:
                        community_set.add(node_id)
            else:
                # the node is an interactor
                interactor_name_to_id[node["n"]] = node["@id"]

        interactor_name_to_id

In [None]:
# For every interactor: drop each community that is the source of an edge whose
# two endpoints are both in the interactor's community set, then add a
# 'member_of' edge from the interactor to every community that remains.
for interactor_name, community_set in int_to_com.items():
    communities_to_remove = {
        edge['s']
        for edge in hierarchy_network.edges.values()
        if edge['s'] in community_set and edge['t'] in community_set
    }
    print("communities to remove: ", len(communities_to_remove))
    communities_to_connect = community_set - communities_to_remove

    for community_id in communities_to_connect:
        # add edge from interactor_id to community_id
        interactor_id = interactor_name_to_id[interactor_name]
        hierarchy_network.create_edge(edge_source=interactor_id,
                                      edge_target=community_id,
                                      edge_interaction='member_of')


### Add New Attributes to Hierarchy

In [None]:
# node_df was never created anywhere in the notebook; build it from the
# hierarchy with the nice_cx_to_node_dataframe utility before exporting.
node_df = nice_cx_to_node_dataframe(hierarchy_network)
node_df.to_excel(hierarchy_network.get_name() + " nodes.xlsx")