# Libraries

In [1]:
import pandas as pd
import numpy as np
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
import matplotlib.pyplot as plt
# import pyreadr

# Load data

In [2]:
community = pd.read_csv ('inputs/community_db.csv', index_col=None)

In [3]:
community

Unnamed: 0.1,Unnamed: 0,True_LR,Pair.Name,Ligand,Ligand.Name,Receptor,Receptor.Name,complex_pair,source,target,...,is_inhibition,consensus_direction,consensus_stimulation,consensus_inhibition,sources,references,curation_effort,n_references,n_resources,annotation_strategy
0,1,True,S100A10_TRPV6,S100A10,S100 calcium binding protein A10,TRPV6,transient receptor potential cation channel su...,,P60903,Q9H1D0,...,0,1,1,0,CellTalkDB;HPRD;TRIP,CellTalkDB:18187190;HPRD:12660155;TRIP:1266015...,5,3,3,both
1,2,True,JAG2_NOTCH1,JAG2,jagged canonical Notch ligand 2,NOTCH1,notch receptor 1,,Q9Y219,P46531,...,1,1,1,0,Baccin2019;CellCall;CellChatDB;CellPhoneDB;Cel...,Baccin2019:1100613311006130;CellChatDB:2235346...,19,11,20,both
2,3,True,DLL1_NOTCH1,DLL1,delta like canonical Notch ligand 1,NOTCH1,notch receptor 1,,O00548,P46531,...,0,1,1,0,Baccin2019;CellCall;CellChatDB;CellPhoneDB;Cel...,Baccin2019:1006133;Baccin2019:98194281;CellCha...,18,9,20,both
3,4,True,IGF1_IGF1R,IGF1,insulin like growth factor 1,IGF1R,insulin like growth factor 1 receptor,,P05019,P08069,...,0,1,1,0,Baccin2019;CA1;CellCall;CellChatDB;CellPhoneDB...,Baccin2019:1852007;Baccin2019:2877871;CA1:8408...,27,15,26,both
4,5,True,JAG1_NOTCH1,JAG1,jagged canonical Notch ligand 1,NOTCH1,notch receptor 1,,P78504,P46531,...,1,1,1,0,ACSN;Baccin2019;BioGRID;CellCall;CellChatDB;Ce...,ACSN:22330899;ACSN:22363130;Baccin2019:7697721...,36,20,27,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6936,29501,False,FLT1_NRP1,FLT1,fms related receptor tyrosine kinase 1,NRP1,neuropilin 1,NRP1_FLT1_KDR,O14786,COMPLEX:P17948_P35968,...,0,1,1,0,CellChatDB-cofactors,,0,0,1,LR
6937,29521,False,NRP2_KDR,NRP2,neuropilin 2,KDR,kinase insert domain receptor,NRP2_FLT1_KDR,O60462,COMPLEX:P17948_P35968,...,0,1,1,0,CellChatDB-cofactors,,0,0,1,LR
6938,29531,False,FLT1_NRP2,FLT1,fms related receptor tyrosine kinase 1,NRP2,neuropilin 2,NRP2_FLT1_KDR,O60462,COMPLEX:P17948_P35968,...,0,1,1,0,CellChatDB-cofactors,,0,0,1,LR
6939,29541,False,FLT4_NRP2,FLT4,fms related receptor tyrosine kinase 4,NRP2,neuropilin 2,NRP2_FLT4_KDR,O60462,COMPLEX:P35916_P35968,...,0,1,1,0,CellChatDB-cofactors,,0,0,1,LR


**Note on CellPhoneDB:**

CellPhoneDB uses an SQL database, which is provided as a .db file containing 6 tables. To work with these tables, we extract their data as CSV files, as explained in detail in the notebook RestructureCPDB/BreakdownCPDB.ipynb.

One feature of CPDB is that it doesn't have directionality. To make the data easier to work with, we put the ligands of each pair in one column and the receptors in another column. However, if the pair consists of adhesion molecules that bind to each other, the order doesn't matter.

Another challenge we faced is that CPDB contains complex molecules, which are not present in the databases used by `community` and `NicheNet`. To address this, we break down each complex into its individual components and create pairwise links between them. For example, if we have a complex called R1_R2 (consisting of two receptors) that is paired with ligand L1, we break down this pair into L1_R1 and L1_R2.


In their default DB, a complex molecule is given an ID that is greater than 1282

In [4]:
cpdb = pd.read_csv ('inputs/cpdb_db.csv', index_col=None)

In [5]:
nnet_lr = pd.read_csv ('inputs/nichenetr_db.csv', index_col=None)

In [6]:
#create pairs in nnet
nnet_lr["pairs"]=nnet_lr["from"] + "_" + nnet_lr["to"]

In [7]:
cellchat = pd.read_csv('inputs/cellchat_db.csv')

In [8]:
cellchat

Unnamed: 0,interaction_name,pathway_name,ligand,receptor,agonist,antagonist,co_A_receptor,co_I_receptor,evidence,annotation,...,receptor.symbol,receptor.family,receptor.location,receptor.keyword,receptor.surfaceome_main,receptor.surfaceome_sub,receptor.adhesome,receptor.secreted_type,receptor.transmembrane,version
0,TGFB1_TGFBR1_TGFBR2,TGFb,TGFB1,TGFbR1_R2,TGFb agonist,TGFb antagonist,,TGFb inhibition receptor,KEGG: hsa04350,Secreted Signaling,...,"TGFBR2, TGFBR1","Protein kinase superfamily, TKL Ser/Thr protei...","Cell membrane, Secreted, Membrane raft, Cell s...","Membrane, Secreted, Disulfide bond, Kinase, Tr...",Receptors,Act.TGFB;Kinase,,,True,CellChatDB v1
1,TGFB2_TGFBR1_TGFBR2,TGFb,TGFB2,TGFbR1_R2,TGFb agonist,TGFb antagonist,,TGFb inhibition receptor,KEGG: hsa04350,Secreted Signaling,...,"TGFBR2, TGFBR1","Protein kinase superfamily, TKL Ser/Thr protei...","Cell membrane, Secreted, Membrane raft, Cell s...","Membrane, Secreted, Disulfide bond, Kinase, Tr...",Receptors,Act.TGFB;Kinase,,,True,CellChatDB v1
2,TGFB3_TGFBR1_TGFBR2,TGFb,TGFB3,TGFbR1_R2,TGFb agonist,TGFb antagonist,,TGFb inhibition receptor,KEGG: hsa04350,Secreted Signaling,...,"TGFBR2, TGFBR1","Protein kinase superfamily, TKL Ser/Thr protei...","Cell membrane, Secreted, Membrane raft, Cell s...","Membrane, Secreted, Disulfide bond, Kinase, Tr...",Receptors,Act.TGFB;Kinase,,,True,CellChatDB v1
3,TGFB1_ACVR1B_TGFBR2,TGFb,TGFB1,ACVR1B_TGFbR2,TGFb agonist,TGFb antagonist,,TGFb inhibition receptor,PMID: 27449815,Secreted Signaling,...,"TGFBR2, ACVR1B","Protein kinase superfamily, TKL Ser/Thr protei...","Cell membrane, Secreted, Membrane raft","Membrane, Secreted, Disulfide bond, Kinase, Tr...",Receptors,Act.TGFB;Kinase,,,True,CellChatDB v1
4,TGFB1_ACVR1C_TGFBR2,TGFb,TGFB1,ACVR1C_TGFbR2,TGFb agonist,TGFb antagonist,,TGFb inhibition receptor,PMID: 27449815,Secreted Signaling,...,"TGFBR2, ACVR1C","Protein kinase superfamily, TKL Ser/Thr protei...","Cell membrane, Secreted, Membrane raft, Membrane","Membrane, Secreted, Disulfide bond, Kinase, Tr...",Receptors,Act.TGFB;Kinase,,,True,CellChatDB v1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2234,TULP1_MERTK,TULP,TULP1,MERTK,,,,,PMID: 20978474,Secreted Signaling,...,MERTK,"Protein kinase superfamily, Tyr protein kinase",Cell membrane,"Membrane, Disulfide bond, Transmembrane, Kinas...",Receptors,Axl;Kinase,,,True,CellChatDB v2
2235,TUB_MERTK,TULP,TUB,MERTK,,,,,PMID: 20978475,Secreted Signaling,...,MERTK,"Protein kinase superfamily, Tyr protein kinase",Cell membrane,"Membrane, Disulfide bond, Transmembrane, Kinas...",Receptors,Axl;Kinase,,,True,CellChatDB v2
2236,PROS1_MERTK,PROS,PROS1,MERTK,,,,,PMID: 34631419,Secreted Signaling,...,MERTK,"Protein kinase superfamily, Tyr protein kinase",Cell membrane,"Membrane, Disulfide bond, Transmembrane, Kinas...",Receptors,Axl;Kinase,,,True,CellChatDB v2
2237,PLAU_PLAUR,PLAU,PLAU,PLAUR,,,,,uniprot,Secreted Signaling,...,PLAUR,,"Invadopodium membrane, Cell projection, Cell m...","Membrane, Disulfide bond, Secreted, Receptor, ...",,,Adhesion receptor,secreted,True,CellChatDB v2


In [9]:
cellchat[cellchat["ligand.symbol"]== "INHBA, INHBB"]

Unnamed: 0,interaction_name,pathway_name,ligand,receptor,agonist,antagonist,co_A_receptor,co_I_receptor,evidence,annotation,...,receptor.symbol,receptor.family,receptor.location,receptor.keyword,receptor.surfaceome_main,receptor.surfaceome_sub,receptor.adhesome,receptor.secreted_type,receptor.transmembrane,version
135,INHBABB_ACVR1B_ACVR2A,ACTIVIN,Activin AB,ACVR1B_ACVR2A,,ACTIVIN antagonist,,ACTIVIN inhibition receptor,PMID: 2457849; PMID: 22991378,Secreted Signaling,...,"ACVR1B, ACVR2A","Protein kinase superfamily, TKL Ser/Thr protei...","Cell membrane, Membrane","Membrane, Transmembrane, Kinase, ATP-binding, ...",Receptors,Act.TGFB;Kinase,,,True,CellChatDB v1
136,INHBABB_ACVR1B_ACVR2B,ACTIVIN,Activin AB,ACVR1B_ACVR2B,,ACTIVIN antagonist,,ACTIVIN inhibition receptor,PMID: 2457849; PMID: 22991378,Secreted Signaling,...,"ACVR1B, ACVR2B","Protein kinase superfamily, TKL Ser/Thr protei...",Cell membrane,"Membrane, Transmembrane, Kinase, ATP-binding, ...",Receptors,Act.TGFB;Kinase,,,True,CellChatDB v1
137,INHBABB_ACVR1C_ACVR2A,ACTIVIN,Activin AB,ACVR1C_ACVR2A,,ACTIVIN antagonist,,ACTIVIN inhibition receptor,PMID: 2457849; PMID: 22991378,Secreted Signaling,...,"ACVR1C, ACVR2A","Protein kinase superfamily, TKL Ser/Thr protei...",Membrane,"Membrane, Transmembrane, Kinase, ATP-binding, ...",Receptors,Act.TGFB;Kinase,,,True,CellChatDB v1
138,INHB_ACVR1C_ACVR2B,ACTIVIN,Activin AB,ACVR1C_ACVR2B,,ACTIVIN antagonist,,ACTIVIN inhibition receptor,PMID: 2457849; PMID: 22991378,Secreted Signaling,...,"ACVR1C, ACVR2B","Protein kinase superfamily, TKL Ser/Thr protei...","Membrane, Cell membrane","Membrane, Transmembrane, Kinase, ATP-binding, ...",Receptors,Act.TGFB;Kinase,,,True,CellChatDB v1


**Note on CellChat DB**

The interaction_name column indicates the pair name, which is concatenation of Ligand and Receptor with an underscore. However, when multiple ligands or receptors are involved, some of them may not be fully represented in the interaction name. For example, in the case of R = PTCH1_SMO, not all components are included. Additionally, some entries use abbreviations, such as ligands INHBA and INHBB, which are abbreviated as INHBABB_Receptor.

For this reason, we have split and produced individual pairs for each interaction, as NicheNet and Community do not accommodate complex molecules.

We also excluded non-protein signaling(i.e., metabolic and synaptic signaling) from CellChatDB


In [10]:
def generate_interaction_ids(row):
    # Split ligand and receptor symbols by commas
    ligands = row['ligand.symbol'].split(',')
    receptors = row['receptor.symbol'].split(',')
    
    # Generate all combinations of ligands and receptors
    interaction_ids = [f"{ligand}_{receptor}" for ligand, receptor in product(ligands, receptors)]
    
    return interaction_ids

In [11]:
from itertools import product

In [12]:
cellchat['interaction_ID'] = cellchat.apply(generate_interaction_ids, axis=1)

In [13]:
cellchat = cellchat.explode('interaction_ID').reset_index(drop=True)

In [14]:
# cellchat[cellchat["receptor"]== "ACVR1B_ACVR2A_CFC1"]

In [15]:
# check = cellchat[cellchat["ligand.symbol"]== "INHBA, INHBB"]

In [16]:
# check[["interaction_name", "interaction_ID"]]

# Ligand-receptor pairs

## Compare original databases

In [17]:
#get unique list of pairs in Commpackage
community_pairs = set(community['Pair.Name'])

In [18]:
#get unique list of pairs in cpdb

cpdb_pairs = set(cpdb['pairs'])

In [19]:
#get unique list of pairs in nnet

nnet_pairs = set(nnet_lr['pairs'])

In [20]:
cellchat_pairs = set(cellchat['interaction_ID'])

**Curated + predicted**

In [21]:
pairs_list = set(list(community_pairs)+list(cpdb_pairs)+list(nnet_pairs)+list(cellchat_pairs))

In [22]:
to_export = {
    'ID': list(pairs_list),
    'community': [True if pair in community_pairs else False for pair in pairs_list],
    'CPDB': [True if pair in cpdb_pairs else False for pair in pairs_list],
    'NicheNet': [True if pair in nnet_pairs else False for pair in pairs_list],
    'CellChat': [True if pair in cellchat_pairs else False for pair in pairs_list]
}

In [23]:
to_export = pd.DataFrame(to_export)

In [24]:
to_export.to_csv("inputs/pairs_curated_predicted.csv")

In [25]:
notin_community=to_export[to_export["community"]==False]

In [26]:
notin=list()
count=0
for x in notin_community["ID"]:
    rev=x.split('_')
    rev=rev[1]+"_"+rev[0]
    if rev not in community_pairs:
        notin.append(x)
    else:
        count += 1

We are checking if any pairs that are not in the community exist in reverse order in other databases. We have found that 289 pairs appear in other databases in reverse order.

In addition, we are investigating each database separately to determine why the pairs are not in our database. Specifically, we are currently examining CPDB to identify any reasons why certain pairs are missing.

The `pid1` and `pid2` columns in CPDB dataset indicate the original pairID from CPDB. Although CPDB assigns pairIDs starting from 1, complex molecules are assigned IDs greater than or equal to 1282. We have observed that the pairs that are not present in our community database are actually part of complex molecules in CPDB.

In [27]:
not_community_yes_cpdb=notin_community[notin_community["ID"].isin(notin) & notin_community["CPDB"] == True]

In [28]:
#retrieve the table from the original df, so we can check where these pairs are originating from
x=cpdb[cpdb["pairs"].isin(not_community_yes_cpdb["ID"])]

In [29]:
len((x['pid1'] > 1280 )| (x['pid2'] > 1280))

680

**Note:** We can do a deeper investigation for NN pairs

**Curated only**

In [30]:
#nnet predicted 10.629 pairs, lets see how does the comparison look without the predicted pairs
nnet_nopridcition=nnet_lr[~nnet_lr["database"].str.contains("ppi_")]
nopredict_pairs=set(nnet_nopridcition["pairs"])

In [31]:
#get curated pairs from community
community_curated = community[community["annotation_strategy"].isin(["both","curated"])]
community_curated = set(community_curated["Pair.Name"])

In [32]:
len(nnet_nopridcition)

2022

In [33]:
# pairs_list = set(list(community_pairs)+list(cpdb_pairs)+list(nnet_pairs)+list(cellchat_pairs))

# to_export = {
#     'ID': list(pairs_list),
#     'community': [True if pair in community_pairs else False for pair in pairs_list],
#     'CPDB': [True if pair in cpdb_pairs else False for pair in pairs_list],
#     'NicheNet': [True if pair in nnet_pairs else False for pair in pairs_list],
#     'CellChat': [True if pair in cellchat_pairs else False for pair in pairs_list],
#     'community_curated': [True if pair in community_curated else False for pair in pairs_list],
#     'NicheNet_curated': [True if pair in nopredict_pairs else False for pair in pairs_list],
    
# }

# to_export = pd.DataFrame(to_export)
# to_export.to_csv("inputs/pairs_full.csv")

In [34]:
pairs_list = set(list(community_curated)+list(cpdb_pairs)+list(nopredict_pairs)+list(cellchat_pairs))

to_export = {
    'ID': list(pairs_list),
    'community': [True if pair in community_curated else False for pair in pairs_list],
    'CPDB': [True if pair in cpdb_pairs else False for pair in pairs_list],
    'NicheNet': [True if pair in nopredict_pairs else False for pair in pairs_list],
    'CellChat': [True if pair in cellchat_pairs else False for pair in pairs_list]
    
}

to_export = pd.DataFrame(to_export)
to_export.to_csv("inputs/pairs_curated.csv")

In [35]:
to_export[to_export["NicheNet"]==True]

Unnamed: 0,ID,community,CPDB,NicheNet,CellChat
11,DLL4_NOTCH3,True,True,True,True
20,EFNB2_EPHB2,True,True,True,True
24,WNT4_FZD6,False,True,True,True
26,LAMC3_ITGA2,False,False,True,True
36,CALM2_PDE1B,True,False,True,False
...,...,...,...,...,...
6664,COL5A3_SDC3,True,False,True,False
6668,EFNA2_EPHA5,True,True,True,True
6671,CCL20_CCR6,True,True,True,True
6672,RLN2_RXFP1,True,True,True,False


# Ligand and receptor gene space

In [36]:
community_gene_space = set([substring for value in list(community_pairs) for substring in value.split('_')])

In [37]:
cpdb_gene_space = set([substring for value in list(cpdb_pairs) for substring in value.split('_')])

In [38]:
nnet_gene_space = set([substring for value in list(nnet_pairs) for substring in value.split('_')])

In [39]:
nnet_nopredict_gene_space = set([substring for value in list(nopredict_pairs) for substring in value.split('_')])

In [40]:
community_curated_gene_space = set([substring for value in list(community_curated) for substring in value.split('_')])

In [41]:
cellchat_gene_space = set([substring for value in list(cellchat_pairs) for substring in value.split('_')])

**Curated + predicted**

In [42]:
gene_list = set(list(community_gene_space)+list(cpdb_gene_space)+list(nnet_gene_space))

to_export = {
    'ID': list(gene_list),
    'community': [True if gene in community_gene_space else False for gene in gene_list],
    'CPDB': [True if gene in cpdb_gene_space else False for gene in gene_list],
    'NicheNet': [True if gene in nnet_gene_space else False for gene in gene_list],
    'CellChat': [True if gene in cellchat_gene_space else False for gene in gene_list]
}

to_export = pd.DataFrame(to_export)
to_export.to_csv("inputs/genes_curated_predicted.csv")

We are investigating why some genes are not included in our database. As previously mentioned, CPDB assigns `pid` values greater than 1282 if the protein is part of a complex. We have discovered that the genes that are not present in our database are once again coming from these complex molecules.

In [43]:
len(nnet_gene_space)

1430

In [44]:
880+283+932+197

2292

In [45]:
notin_community=to_export[to_export["community"]==False]

In [46]:
cpdb[cpdb["ligand"].isin(notin_community["ID"])]

Unnamed: 0,L,R,ligand,receptor,l_adhesion_mol,r_adhesion_mol,pid1,pid2,pairs
0,901,917,PTGR1,LTB4R,0,0,1282.0,917.0,PTGR1_LTB4R
1,315,917,LTA4H,LTB4R,0,0,1430.0,917.0,LTA4H_LTB4R
2,901,1189,PTGR1,LTB4R2,0,0,1282.0,1189.0,PTGR1_LTB4R2
3,315,1189,LTA4H,LTB4R2,0,0,1430.0,1189.0,LTA4H_LTB4R2
4,315,864,LTA4H,GPR17,0,0,1430.0,864.0,LTA4H_GPR17
...,...,...,...,...,...,...,...,...,...
2627,544,698,RXRB,ALDH1A3,1,0,1513.0,1481.0,RXRB_ALDH1A3
2628,698,372,ALDH1A3,RARG,1,0,1513.0,1482.0,ALDH1A3_RARG
2629,372,698,RARG,ALDH1A3,1,0,1513.0,1482.0,RARG_ALDH1A3
2630,698,708,ALDH1A3,RXRG,1,0,1513.0,1483.0,ALDH1A3_RXRG


Upon investigating why some genes are included in NicheNet but not in our database, we discovered that most of them were added based on predictions, mainly through Gene Ontology (GO). Additionally, we found that there are a few genes in NicheNet that originate from the Ramilowski dataset, but genesymbols are listed under alternative names according to genecards and uniprot. For instance, CTGF's approved gene symbol is CCN2, CYR61 is included as CCN1, and DEFB4A is included as DEFB4B.

In [47]:
nnet_lr[nnet_lr["from"].isin(notin_community["ID"])]["database"].value_counts()

ppi_prediction_go     180
ppi_prediction         27
ramilowski             14
kegg                    6
guide2pharmacology      4
Name: database, dtype: int64

In [48]:
x=nnet_lr[nnet_lr["from"].isin(notin_community["ID"])]

In [49]:
x[x["database"]=="ramilowski"]

Unnamed: 0.1,Unnamed: 0,from,to,source,database,pairs
1109,1110,CTGF,ITGAM,ramilowski_known,ramilowski,CTGF_ITGAM
1110,1111,CTGF,ITGB2,ramilowski_known,ramilowski,CTGF_ITGB2
1111,1112,CTGF,LRP6,ramilowski_known,ramilowski,CTGF_LRP6
1136,1137,CYR61,ITGA5,ramilowski_known,ramilowski,CYR61_ITGA5
1137,1138,CYR61,ITGAM,ramilowski_known,ramilowski,CYR61_ITGAM
1138,1139,CYR61,ITGB2,ramilowski_known,ramilowski,CYR61_ITGB2
1139,1140,CYR61,ITGB3,ramilowski_known,ramilowski,CYR61_ITGB3
1140,1141,DEFB4A,CCR6,ramilowski_known,ramilowski,DEFB4A_CCR6
1429,1430,IFNA13,IFNAR1,ramilowski_known,ramilowski,IFNA13_IFNAR1
1430,1431,IFNA13,IFNAR2,ramilowski_known,ramilowski,IFNA13_IFNAR2


**Curated only**

In [50]:
palette = {'NicheNet': '#cdad00', 'community': '#6e8b3d', 'CellPhoneDB': '#8b8878'}

In [51]:
gene_list = set(list(community_curated_gene_space)+list(cpdb_gene_space)+list(nnet_nopredict_gene_space)+list(cellchat_gene_space))

to_export = {
    'ID': list(gene_list),
    'community': [True if gene in community_curated_gene_space else False for gene in gene_list],
    'CPDB': [True if gene in cpdb_gene_space else False for gene in gene_list],
    'NicheNet': [True if gene in nnet_nopredict_gene_space else False for gene in gene_list],
    'CellChat': [True if gene in cellchat_gene_space else False for gene in gene_list]
}

to_export = pd.DataFrame(to_export)
to_export.to_csv("inputs/genes_curated.csv")

In [52]:
len(nnet_nopredict_gene_space)

997

In [53]:
len(nopredict_pairs)

1390