### Imports

In [1]:
from ndex2.nice_cx_network import NiceCXNetwork
import ndex2.client as nc
import ndex2
print("ndex2 version: " + ndex2.__version__)
import networkx as nx
print("networkx version: " + nx.__version__)
import pandas as pd
print("pandas version: " + pd.__version__)
import os
import sys
from gprofiler import GProfiler
gp = GProfiler(
    user_agent='hierarchical model analysis', #optional user agent
    return_dataframe=True, #return pandas dataframe or plain python structures    
    )

import plotly.graph_objects as go


ndex2 version: 3.3.1
networkx version: 2.4
pandas version: 1.0.3


In [2]:
 def nice_cx_to_node_dataframe(nice_cx_network):
    """
    Create a Pandas DataFrame in which each row is a node and columns are node attributes.
     Example:
        ``df = nice_cx_to_node_dataframe(my_nice_cx_network) # df is now a pandas dataframe``
    :return: Pandas dataframe
    :rtype: Pandas dataframe
    """
    #TODO expand documentation
    rows = []
    node_items = None
    if sys.version_info.major == 3:
        node_items = nice_cx_network.nodes.items()
    else:
        node_items = nice_cx_network.nodes.iteritems()
    # v is the node item values
    # k is the index of the node
    attribute_names = {"name", "node_id"}
    for node_id, v in node_items:
        row = {}
        row["name"] = v.get('n')
        row["node_id"] = node_id
        for attribute in nice_cx_network.get_node_attributes(node_id):
            attribute_names.add(attribute.get('n'))
            row[attribute.get('n')] = attribute.get('v')
        rows.append(row)
    df_columns = list(attribute_names)
    return_df = pd.DataFrame(rows, columns=df_columns)
    return return_df

def jaccard_similarity_lists(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

def jaccard_similarity_sets(set1, set2):
    intersection = len(list(set1.intersection(set2)))
    union = (len(set1) + len(set2)) - intersection
    return float(intersection) / union

def get_interactors(df, column_name, value, negation=False):
    if negation:
        filtered_df = df.loc[df[column_name] != value]
    else:
        filtered_df = df.loc[df[column_name] == value]
    
    return list(filtered_df["name"])
    

### The NDEx Account

In [3]:
my_account="dexterpratt"
my_password="cytoscaperules"
my_server="http://public.ndexbio.org"

if my_account == 'enter your username here':
    print('*******WARNING!!!!*******')
    print('Please change the username and password before proceeding')
else:
    try:
        my_ndex=nc.Ndex2(my_server, my_account, my_password)
        my_ndex.update_status()
        print("Success.  Please continue.")
    except Exception as inst:
        print("Could not access account %s with password %s" % (my_account, my_password))
        print(inst.args)

Success.  Please continue.


### mTb Interaction Map Network

Penn et al. "An Mtb-Human Protein-Protein Interaction Map Identifies a Switch between Host Antiviral and Antibacterial Responses," Mol. Cell 2018 Aug 16;71(4):637-648.

In [6]:
mtb_interactions_uuid = "50b4abbf-392d-11e9-9f06-0ac135e8bacf" 
mtb_interactions = ndex2.create_nice_cx_from_server(server='public.ndexbio.org', 
                                                          uuid=mtb_interactions_uuid, 
                                                          password=my_password, 
                                                          username=my_account)
mtb_node_table = nice_cx_to_node_dataframe(mtb_interactions)
mtb_node_table

Unnamed: 0,canonicalName,cytoscape.alias.list,type,name,Column 1,uniprot,TBBAIT,tb.names,node_id
0,FUBP1,[],protein,FUBP1,,C9JSZ1,,,267
1,DOK3,[],protein,DOK3,,A0A024R7M5,,,266
2,SFPQ,[],protein,SFPQ,,H0Y9K7,,,265
3,NONO,[],protein,NONO,,A0A0S2Z4Z9,,,264
4,EIF2C2,[],protein,EIF2C2,,,,,262
...,...,...,...,...,...,...,...,...,...
204,Rv0999,[Rv0999],protein,Rv0999,Rv0999,,TBbait,,568
205,Q8N290-Putative uncharacterized,[],protein,Q8N290-Putative uncharacterized,,,,,567
206,Lpqw,[Lpqw],protein,Lpqw,Lpqw,,TBbait,,566
207,Lpqn,[Lpqn],protein,Lpqn,Lpqn,,TBbait,,565


### Hierarchical Models


In [26]:
hidef_mtb_model_uuid = "8bd05fff-84fe-11ea-aaef-0ac135e8bacf" #input network: biopax3 interconnect query

hidef_mtb_model = ndex2.create_nice_cx_from_server(server='public.ndexbio.org', uuid=hidef_mtb_model_uuid, 
                                                    password=my_password, username=my_account)


hidef_mtb_node_df = nice_cx_to_node_dataframe(hidef_mtb_model)
hidef_mtb_node_df

Unnamed: 0,CD_CommunityName,CD_MemberList,CD_MemberList_LogSize,CD_AnnotatedMembers,CD_AnnotatedMembers_SourceDB,CD_NonAnnotatedMembers,CD_AnnotatedMembers_Overlap,name,CD_MemberList_Size,CD_AnnotatedAlgorithm,CD_AnnotatedMembers_SourceTerm,CD_AnnotatedMembers_Size,selected,CD_AnnotatedMembers_Pvalue,node_id,CD_Labeled
0,(none),MRPL1 MRPL12 MRPL57,1.585,,,,0.0,C595960,3,Annotated by gProfiler [Docker: coleslawndex/c...,,0,,,622869,false
1,(none),GNB1 GNB2 PFDN6 WDR61,2.0,,,,0.0,C595959,4,Annotated by gProfiler [Docker: coleslawndex/c...,,0,,,622868,false
2,(none),FBXW4 PGAM4 TXNDC9 WDR86,2.0,,,,0.0,C595958,4,Annotated by gProfiler [Docker: coleslawndex/c...,,0,,,622867,false
3,(none),AHCYL2 CAPN10 FOXK2 ZBTB3,2.0,,,,0.0,C595957,4,Annotated by gProfiler [Docker: coleslawndex/c...,,0,,,622866,false
4,(none),H2AC17 NOL10 UTP18 ZNF768,2.0,,,,0.0,C595956,4,Annotated by gProfiler [Docker: coleslawndex/c...,,0,,,622865,false
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,RNA splicing,ABCF2 ABRA ACAP1 ACIN1 AGO4 AHCYL1 AHCYL2 AKAP...,8.948,ACIN1 AKAP17A AQR C14orf166 CACTIN CASC3 CDC40...,GO:BP,ABCF2 ABRA ACAP1 AGO4 AHCYL1 AHCYL2 ALDH16A1 A...,0.079,C595809,494,Annotated by gProfiler [Docker: coleslawndex/c...,GO:0008380,67,,9.093257352764108E-62,622718,true
152,ribonucleoprotein complex biogenesis,AATF ABCF2 ABRA ABT1 ACIN1 ADARB1 AGO4 AHCYL1 ...,9.05,AATF ABT1 AGO4 BMS1 BOP1 BRIX1 BTRC BYSL C1QBP...,GO:BP,ABCF2 ABRA ACIN1 ADARB1 AHCYL1 AHCYL2 AKAP17A ...,0.142,C595808,530,Annotated by gProfiler [Docker: coleslawndex/c...,GO:0022613,118,,8.151808311400743E-140,622717,true
153,Mitotic Prometaphase,ABRA ACAP1 ACBD3 ACSL4 ACTBL2 ACTR1A ACTR1B AC...,9.127,ACTR1A BUB3 CDC20 CENPM CENPQ CENPU DYNLL1 DYN...,REAC,ABRA ACAP1 ACBD3 ACSL4 ACTBL2 ACTR1B ACTR3 ACT...,0.058,C595807,559,Annotated by gProfiler [Docker: coleslawndex/c...,REAC:R-HSA-68877,39,,9.999386776922086E-39,622716,true
154,RNA binding,AATF ABCF2 ABRA ABT1 ACAP1 ACBD3 ACIN1 ACSL4 A...,10.05,AATF ABT1 ACIN1 ADARB1 AGO4 AHCYL1 AKAP17A AQR...,GO:MF,ABCF2 ABRA ACAP1 ACBD3 ACSL4 ACTBL2 ACTR1A ACT...,0.129,C595806,1060,Annotated by gProfiler [Docker: coleslawndex/c...,GO:0003723,333,,3.595687760769186E-242,622715,true


### Annotate community with interactors in community and their overlap with the members
i.e. which communities are enriched for interactors

In [30]:
def in_community(row, interactors):
    #print(len(row.get("CD_MemberList")))
    #result = list(set(row.get("CD_MemberList")).intersection(interactors))
    #if (len(result) > 0):
    #    print(result)
    return list(set(row.get("CD_MemberList").split()).intersection(set(interactors)))

def community_overlap(row, interactors):
    intersection = len(set(row.get("CD_MemberList").split()).intersection(set(interactors)))
    memberlist_size = int(row.get("CD_MemberList_Size"))
    if isinstance(memberlist_size, (int, float)):
        return intersection / memberlist_size
    else:
        return 0

def jacquard_interactors(row, interactors):
    return jaccard_similarity_sets(set(row.get("CD_MemberList").split()), interactors)
                            
hidef_mtb_node_df["CD_InteractorMembers"] = hidef_mtb_node_df.apply(lambda row: in_community(row, interactors), axis=1)
                            
hidef_mtb_node_df["CD_InteractorMemberOverlap"] = hidef_mtb_node_df.apply(lambda row: community_overlap(row, interactors), axis=1)
hidef_mtb_node_df

Unnamed: 0,CD_CommunityName,CD_MemberList,CD_MemberList_LogSize,CD_AnnotatedMembers,CD_AnnotatedMembers_SourceDB,CD_NonAnnotatedMembers,CD_AnnotatedMembers_Overlap,name,CD_MemberList_Size,CD_AnnotatedAlgorithm,CD_AnnotatedMembers_SourceTerm,CD_AnnotatedMembers_Size,selected,CD_AnnotatedMembers_Pvalue,node_id,CD_Labeled,CD_InteractorMembers,CD_InteractorMemberOverlap
0,(none),MRPL1 MRPL12 MRPL57,1.585,,,,0.0,C595960,3,Annotated by gProfiler [Docker: coleslawndex/c...,,0,,,622869,false,[],0.000000
1,(none),GNB1 GNB2 PFDN6 WDR61,2.0,,,,0.0,C595959,4,Annotated by gProfiler [Docker: coleslawndex/c...,,0,,,622868,false,[PFDN6],0.250000
2,(none),FBXW4 PGAM4 TXNDC9 WDR86,2.0,,,,0.0,C595958,4,Annotated by gProfiler [Docker: coleslawndex/c...,,0,,,622867,false,[],0.000000
3,(none),AHCYL2 CAPN10 FOXK2 ZBTB3,2.0,,,,0.0,C595957,4,Annotated by gProfiler [Docker: coleslawndex/c...,,0,,,622866,false,[],0.000000
4,(none),H2AC17 NOL10 UTP18 ZNF768,2.0,,,,0.0,C595956,4,Annotated by gProfiler [Docker: coleslawndex/c...,,0,,,622865,false,[],0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,RNA splicing,ABCF2 ABRA ACAP1 ACIN1 AGO4 AHCYL1 AHCYL2 AKAP...,8.948,ACIN1 AKAP17A AQR C14orf166 CACTIN CASC3 CDC40...,GO:BP,ABCF2 ABRA ACAP1 AGO4 AHCYL1 AHCYL2 ALDH16A1 A...,0.079,C595809,494,Annotated by gProfiler [Docker: coleslawndex/c...,GO:0008380,67,,9.093257352764108E-62,622718,true,"[FMNL1, SORBS1, ANKRD58, SNAPIN, ERO1L, FAM175...",0.178138
152,ribonucleoprotein complex biogenesis,AATF ABCF2 ABRA ABT1 ACIN1 ADARB1 AGO4 AHCYL1 ...,9.05,AATF ABT1 AGO4 BMS1 BOP1 BRIX1 BTRC BYSL C1QBP...,GO:BP,ABCF2 ABRA ACIN1 ADARB1 AHCYL1 AHCYL2 AKAP17A ...,0.142,C595808,530,Annotated by gProfiler [Docker: coleslawndex/c...,GO:0022613,118,,8.151808311400743E-140,622717,true,"[ZC3H18, MCM2, RPLP0, SF1, CIP29, SORBS1, PTMS...",0.084906
153,Mitotic Prometaphase,ABRA ACAP1 ACBD3 ACSL4 ACTBL2 ACTR1A ACTR1B AC...,9.127,ACTR1A BUB3 CDC20 CENPM CENPQ CENPU DYNLL1 DYN...,REAC,ABRA ACAP1 ACBD3 ACSL4 ACTBL2 ACTR1B ACTR3 ACT...,0.058,C595807,559,Annotated by gProfiler [Docker: coleslawndex/c...,REAC:R-HSA-68877,39,,9.999386776922086E-39,622716,true,"[RAB5A, FMNL1, SEC24D, ANKRD58, SNAPIN, ERO1L,...",0.214669
154,RNA binding,AATF ABCF2 ABRA ABT1 ACAP1 ACBD3 ACIN1 ACSL4 A...,10.05,AATF ABT1 ACIN1 ADARB1 AGO4 AHCYL1 AKAP17A AQR...,GO:MF,ABCF2 ABRA ACAP1 ACBD3 ACSL4 ACTBL2 ACTR1A ACT...,0.129,C595806,1060,Annotated by gProfiler [Docker: coleslawndex/c...,GO:0003723,333,,3.595687760769186E-242,622715,true,"[RAB5A, FMNL1, SEC24D, SORBS1, PTMS, ANKRD58, ...",0.161321


### Add interactor annotations to hierarchy

In [31]:
# for each of the new columns, iterate setting node attributes
for index, row in hidef_mtb_node_df.iterrows(): 
    hidef_mtb_model.set_node_attribute(row["node_id"],
                                         "CD_InteractorMembers", 
                                         row["CD_InteractorMembers"], 
                                         type="list_of_string" )
    hidef_mtb_model.set_node_attribute(row["node_id"],
                                         "CD_InteractorMemberOverlap", 
                                         row["CD_InteractorMemberOverlap"], 
                                         type="double" )

### Save annotated hierarchy back to NDEx

In [33]:
if my_account == 'enter your username here':
    print('*******WARNING!!!!*******')
    print('Please change the username and password before proceeding')
else:
    upload_message = hidef_mtb_model.upload_to(my_server, my_account, my_password)
    print(upload_message)

Generating CX
http://public.ndexbio.org/v2/network/44708856-851e-11ea-aaef-0ac135e8bacf


In [9]:
interactors = get_interactors(mtb_node_table, "TBBAIT", "TBbait", negation=True)
mtb_proteins = get_interactors(mtb_node_table, "TBBAIT", "TBbait", negation=False)
#mtb_proteins

In [12]:
# to look up the bacterial protein if you know the interactor
interactor_mtb_map = {}

for edge_id, edge in mtb_interactions.get_edges():
    bait = mtb_interactions.nodes.get(edge["s"]).get("n")
    prey = mtb_interactions.nodes.get(edge["t"]).get("n")
    interactor_mtb_map[prey] = bait

#interactor_mtb_map       

In [20]:


def format_google_search(list_of_strings):
    search_string = "+".join(list_of_strings)
    return '=HYPERLINK("https://www.google.com/search?q=" + search_string, "google")'

def community_name_analysis(hierarchy_network, community_name_map, interactors, interactor_map):
    interactor_set = set(interactors)
    for node_id, node in hierarchy_network.get_nodes():
        # determine the community name
        community_name = hierarchy_network.get_node_attribute_value(node_id, "CD_CommunityName")
        if community_name == "(none)":
            community_name = node.get('n')
        # community_name = hierarchy_network.nodes.get(node_id).get("n")
        cdict = community_name_map.get(node.get("n"))
        # if there is no entry in the community_name_map, make one
        if cdict is None:
            cdict = {}
            community_name_map[node.get("n")] = cdict
           # cdict["curated"] = "?"  # later, this will be filled in manually in the resulting spreadsheet
            #cdict["curated_interactors"] = "?"

        cdict["name"] = community_name
        members = hierarchy_network.get_node_attribute_value(node_id, "CD_MemberList").split(" ")
        community_interactors = list(set(members).intersection(interactor_set))
        cdict["members"] = members
        cdict["interactors"] = community_interactors
        cdict["#int"] = len(community_interactors)
        cdict["mtb"] = [interactor_map[x] for x in community_interactors]

    return community_name_map


### Process One Hierarchy

In [22]:
community_name_map = {}
community_name_map = community_name_analysis(hidef_mtb_model, 
                                             community_name_map, 
                                             interactors, 
                                             interactor_mtb_map)
community_name_df = pd.DataFrame.from_dict(community_name_map, orient="index")
community_name_df = community_name_df.sort_values(by=['#int'])
only_two = community_name_df.loc[community_name_df['#int'] == 2]
only_two

Unnamed: 0,name,members,interactors,#int,mtb
C595849,Bcl-2 family protein complex,"[BAX, BCL2L1, BCL2L11, DEDD, DYNLL1, DYNLL2, F...","[BAX, DYNLL1]",2,"[Pe25, Pe35]"
C595841,Bcl-2 family protein complex,"[BAX, BCL2L1, BCL2L11, DEDD, DYNLL1, DYNLL2, F...","[BAX, DYNLL1]",2,"[Pe25, Pe35]"
C595835,BORC complex,"[BCAS4, BLOC1S1, BLOC1S5, BLOC1S6, BORCS6, BOR...","[BLOC1S1, SNAPIN]",2,"[Rv1827, Rv1827]"
C595861,Circadian rhythm,"[ABRA, AMER1, BTRC, CREB3L3, DLD, FBXW11, PER2...","[DLD, FBXW11]",2,"[Rv1075c, Espr]"
C595862,profilin binding,"[ENAH, EVL, FBLIM1, KIAA1522, PFN2, TACC3, UTP...","[EVL, VASP]",2,"[Apa, Apa]"
C595864,C595864,"[CLLU1-AS1, COX5B, DPP8, DPP9, FOXF2, KEAP1, R...","[DPP9, USP11]",2,"[Rv3267, Rv2401]"
C595955,C595955,"[BHLHE23, CSNK2A1, FUCA1, Rv1816]","[CSNK2A1, FUCA1]",2,"[Rv1816, Rv1816]"
C595868,C595868,"[FAM175B, LURAP1, MICU2, MYO5A, PLD1, Rv2491, ...","[SCP2, FAM175B]",2,"[Rv2491, Rv2491]"
C595850,"eIF3 complex (EIF3S6, EIF3S5, EIF3S4, EIF3S3, ...","[EIF3A, EIF3C, EIF3D, EIF3E, EIF3F, EIF3G, EIF...","[EIF3A, EIF3C]",2,"[Lpqn, Lpqn]"
C595865,ILK-HSP90-CDC37 complex,"[CDC37, CDK11A, HSP90AA1, HSP90AA5P, IKBKG, Lp...","[PCBP1, CDC37]",2,"[Rv3033, Lpqr]"


### Interpret one community
Use the g:Profiler client to get the top n terms enriched for the community

Create a dataframe in which the the terms are rows sorted by overlap
The columns are:

 - name (term_name)
 - id (term_id)
 - anot (count of annotated proteins)
 - anot_list
 - pv (enrichment p-value)
 - vir (viral proteins in anot_list)
 - int (count of interactors in anot_list)
 - columns for community
 -  value is 0, 1, 2 where 2 indicates an interactor gene
  - sorted by # occurances across communities, or maybe just sum of the gene column


In [23]:


def make_community_interpretation(name, community_name_map):
    community = community_name_map.get(name)
    profile = gp.profile(organism='hsapiens',
                         query= community.get("members"),
                         no_evidences=False).head(20)
    
    #i_df = profile
    i_df = profile.loc[:, ("name", "native", "intersection_size", "p_value")]
    i_df.rename(columns={'native': 'id', 'intersection_size': '#anot', "p_value": "pv"},
                             inplace=True)
    term_columns = list(i_df.columns.values)
    annotation_counts = {}
    interactors = community.get("interactors")
    for gene_name in community.get("members"):
        # add a column to i_df for each gene_name
        i_df[gene_name] = 0
        annotation_counts[gene_name] = 0
           
    for index, row in profile.iterrows():
        #print(index)
        #print(row["name"])
        for gene_name in row["intersections"]:
            #print(i_df[gene_name][index])
            if gene_name in interactors:
                i_df.at[index, gene_name] = 2
            else:
                i_df.at[index, gene_name] = 1
            annotation_counts[gene_name] += 1
    
    gene_columns = [k for k, v in sorted(annotation_counts.items(), key=lambda item: item[1], reverse=True)]
    i_df_columns = term_columns + gene_columns
    i_df = i_df[i_df_columns]
                           
    return i_df
    
    

In [36]:
comm_name = "C595951"
inter_df = make_community_interpretation(comm_name, community_name_map)
inter_df

Unnamed: 0,name,id,#anot,pv,NPM1,RPLP2,SCYL2,TUFM,LUC7L2
0,Nop56p-associated pre-rRNA complex,CORUM:3055,4,2.9e-05,2,2,2,2,0
