In [1]:
import pandas as pd
import numpy as np
# from unipressed import IdMappingClient

In [2]:
# Load the OmniPath ligand-receptor interaction pairs.
combined = pd.read_csv(
    '../OmniPath/L_R_OmniPathFull.csv',
    index_col=False,
)

In [3]:
# CellPhoneDB uses the column names protein_name_a / protein_name_b for the
# gene symbols; in our table they are called Ligand / Receptor.
cpdb_symbol_columns = {'Ligand': 'protein_name_a', 'Receptor': 'protein_name_b'}
combined.rename(columns=cpdb_symbol_columns, inplace=True)

In [4]:
# "source" is one of the fields CellPhoneDB requires to build a custom DB.
source_label = "OmniPath"
combined["source"] = source_label

In [5]:
#CellPhoneDB requires ENSEMBL IDs, so we retrieve them through the UniProt IDs

In [6]:
# Unique UniProt accessions found on the ligand side (partner_a).
unique_ligand_accessions = set(combined["partner_a"].values)
ligand_ids = list(unique_ligand_accessions)

In [7]:
# Map the ligand UniProt accessions to Ensembl ids with the UniProt ID-mapping service.
from unipressed import IdMappingClient
import time

request = IdMappingClient.submit(
    source="UniProtKB_AC-ID", dest="Ensembl", ids=ligand_ids
)

# The mapping job runs asynchronously on the server side. A fixed pause may be too
# short for a long id list, and results read from an unfinished job could leave
# accessions unmapped (they would then silently receive mock Ensembl ids later).
# Poll the job status instead and give up after MAX_WAIT_S seconds.
POLL_INTERVAL_S = 2.0
MAX_WAIT_S = 600.0
waited_s = 0.0
while request.get_status() in ("NEW", "RUNNING"):
    if waited_s >= MAX_WAIT_S:
        raise TimeoutError("UniProt ID-mapping job for the ligand ids did not finish in time")
    time.sleep(POLL_INTERVAL_S)
    waited_s += POLL_INTERVAL_S
if request.get_status() == "ERROR":
    raise RuntimeError("UniProt ID-mapping job for the ligand ids failed")

In [8]:
# Materialise every mapping result returned for the ligand job.
lig_list = [mapping for mapping in request.each_result()]

In [9]:
# Dictionary keyed by UniProt id with the Ensembl id as value; when an accession
# appears in several results the last one wins.
lig_dict = {mapping["from"]: mapping["to"] for mapping in lig_list}

In [10]:
# Some UniProt ids have no Ensembl id, so give those a mock id built from a
# running counter (the counter is continued for the receptors further down).
count = 1
for accession in ligand_ids:
    if accession in lig_dict:
        continue
    lig_dict[accession] = f"ENSG000000000{count}"
    count += 1

In [11]:
# Same for the receptor side: unique UniProt accessions found in partner_b.
unique_receptor_accessions = set(combined["partner_b"].values)
receptor_ids = list(unique_receptor_accessions)

In [12]:
# Map the receptor UniProt accessions to Ensembl ids (same service as for the ligands).
request = IdMappingClient.submit(
    source="UniProtKB_AC-ID", dest="Ensembl", ids=receptor_ids
)

# The mapping job runs asynchronously on the server side. A fixed pause may be too
# short for a long id list, and results read from an unfinished job could leave
# accessions unmapped (they would then silently receive mock Ensembl ids later).
# Poll the job status instead and give up after MAX_WAIT_S seconds.
POLL_INTERVAL_S = 2.0
MAX_WAIT_S = 600.0
waited_s = 0.0
while request.get_status() in ("NEW", "RUNNING"):
    if waited_s >= MAX_WAIT_S:
        raise TimeoutError("UniProt ID-mapping job for the receptor ids did not finish in time")
    time.sleep(POLL_INTERVAL_S)
    waited_s += POLL_INTERVAL_S
if request.get_status() == "ERROR":
    raise RuntimeError("UniProt ID-mapping job for the receptor ids failed")

In [13]:
# Materialise every mapping result returned for the receptor job.
rec_list = [mapping for mapping in request.each_result()]

In [14]:
# UniProt id -> Ensembl id for the receptors (last result wins per accession).
rec_dict = {mapping["from"]: mapping["to"] for mapping in rec_list}

# Receptors without an Ensembl id get a mock id; `count` carries on from the
# value reached while mocking the ligands.
for accession in receptor_ids:
    if accession in rec_dict:
        continue
    rec_dict[accession] = f"ENSG000000000{count}"
    count += 1

In [15]:
# Merge both mappings into rec_dict; for accessions present in both, the ligand
# entry replaces the receptor entry.
for accession, ensembl_id in lig_dict.items():
    rec_dict[accession] = ensembl_id

In [16]:
# Dictionary from gene symbol to UniProt id. A row only contributes when at
# least one of its two symbols has not been recorded yet; in that case both of
# its symbols are (re)written.
name2id = {}
symbol_accession_rows = zip(
    combined["protein_name_a"],
    combined["partner_a"],
    combined["protein_name_b"],
    combined["partner_b"],
)
for symbol_a, accession_a, symbol_b, accession_b in symbol_accession_rows:
    if symbol_a not in name2id or symbol_b not in name2id:
        name2id[symbol_a] = accession_a
        name2id[symbol_b] = accession_b
        

In [17]:
# CPDB needs a csv file with gene symbol, UniProt and Ensembl ids to generate a
# custom DB; start with one empty list per output column.
df = {
    column: []
    for column in ("gene_name", "uniprot", "hgnc_symbol", "ensembl")
}

In [18]:
# One output row per gene symbol; the symbol is used both as gene_name and as
# hgnc_symbol, and the Ensembl id comes from the merged UniProt -> Ensembl mapping.
symbol_and_accession_columns = (df["gene_name"], df["uniprot"], df["hgnc_symbol"])
for symbol, uniprot in name2id.items():
    for column_values, value in zip(symbol_and_accession_columns, (symbol, uniprot, symbol)):
        column_values.append(value)
    df["ensembl"].append(rec_dict[uniprot])

In [19]:
# Turn the collected columns into the gene table.
df = pd.DataFrame(data=df)

In [20]:
# Versions are not needed: keep only the part of each Ensembl id before the first ".".
ensembl_without_version = df["ensembl"].str.split(".").str[0]
df["ensembl"] = ensembl_without_version

In [21]:
# CPDB needs a second file holding the UniProt ids and the gene symbols.
protein_columns = ["uniprot", "hgnc_symbol"]
prot = df.loc[:, protein_columns]

In [22]:
# Protein names are written as "<symbol>_HUMAN".
suffixed_symbols = df["hgnc_symbol"] + "_HUMAN"
prot["hgnc_symbol"] = suffixed_symbols

In [23]:
# In the protein file the name column must be called protein_name.
prot = prot.rename({"hgnc_symbol": "protein_name"}, axis="columns")

In [24]:
# Tag the proteins that occur as receptor (partner_b) in the submitted pairs: 1 if so, else 0.
receptor_accessions = combined['partner_b'].values
prot['receptor'] = [
    int(accession in receptor_accessions) for accession in prot['uniprot']
]

In [25]:
# Spot check: show the gene-table rows for PIK3CD.
df.loc[df["gene_name"] == "PIK3CD"]

Unnamed: 0,gene_name,uniprot,hgnc_symbol,ensembl


In [26]:
# prot['other']=1
# prot['peripheral']=1
# prot['secreted']=1
# prot['integrin']=1
# prot['transmembrane']=1
# prot['tags']="To_comment"
# prot['tags_reason']="curation"
# prot['secreted_highlight']=1

In [27]:
# Write the gene table, the protein table and the interaction list to csv files
# (without the index column).
for frame, csv_path in (
    (df, 'gene_user2.csv'),
    (prot, 'prot_user2.csv'),
    (combined, 'newLR2.csv'),
):
    frame.to_csv(csv_path, index=False)

# generate custom DB with the below command

```cellphonedb database generate --user-interactions newLR2.csv --user-interactions-only --user-protein prot_user2.csv --user-gene gene_user2.csv --result-path combined18```

# Sanity check: let's see whether the edges match what we submitted to CPDB

Each table in CellPhoneDB's SQL database is converted into a csv file. This is explained in ```Breakdown CPDB.ipynb```

In [28]:
# Interaction table: contains the ids of the interaction partners (multi_id),
# which are assigned by CPDB. Only the columns at positions 2 and 3 are kept.
interaction = pd.read_csv(
    './combined18/csv/interaction_table.csv', index_col=False
).iloc[:, [2, 3]]

In [29]:
# Tables for gene symbols and UniProt ids; only the columns at the listed
# positions are kept from each file.
gene_table = (
    pd.read_csv('./combined18/csv/gene_table.csv', index_col=False)
    .iloc[:, [3, 4]]
    .copy()
)
protein_name = (
    pd.read_csv('./combined18/csv/protein_table.csv', index_col=False)
    .iloc[:, [1, 5]]
    .copy()
)

In [30]:
# multidata table: includes multi_id, name, receptor, receptor_desc, secreted etc.
# This is the table used to check whether a protein is a ligand or a receptor.
multi = pd.read_csv('./combined18/csv/multidata_table.csv', index_col=None)

# Separate receptors (receptor == 1) and ligands (receptor == 0) into different
# frames, keeping only the first two columns of each.
is_receptor_row = multi['receptor'] == 1
is_ligand_row = multi['receptor'] == 0
receptor = multi[is_receptor_row].iloc[:, [0, 1]]
ligand = multi[is_ligand_row].iloc[:, [0, 1]]

In [31]:
# Label every CellPhoneDB interaction with the protein names of its two partners
# and bring it into our layout: ligand under multidata_1_id, receptor under
# multidata_2_id. CellPhoneDB itself does not keep ligands and receptors in
# fixed columns.
#
# Columns written to `interaction`:
#   ligand / receptor - protein name with the "_HUMAN" suffix removed
#   reversed          - 1.0 where the two ids had to be swapped, NaN otherwise
#   r_adhesion_mol    - 1.0 where both partners are annotated as receptors, NaN otherwise
# Both flag columns are always created, so later cells can rely on them even
# when no row is flagged.
# NOTE: the swap is not idempotent with respect to `reversed`; rerun the loading
# cells before re-running this one.

# Work on an independent copy: `interaction` was produced by slicing another frame.
interaction = interaction.copy()

# Protein name for every multidata id (the first entry wins when an id is listed
# more than once).
symbol_by_multidata_id = (
    protein_name.drop_duplicates(subset="protein_multidata_id")
    .set_index("protein_multidata_id")["protein_name"]
    .str.replace("_HUMAN", "", regex=False)
)

first_ids = interaction["multidata_1_id"].values.copy()
second_ids = interaction["multidata_2_id"].values.copy()
receptor_multidata_ids = receptor["id_multidata"].values
ligand_multidata_ids = ligand["id_multidata"].values

first_is_receptor = np.isin(first_ids, receptor_multidata_ids)
# Receptor listed first and ligand second: the pair has to be swapped.
is_reversed = first_is_receptor & np.isin(second_ids, ligand_multidata_ids)
# Both partners annotated as receptors: keep the CellPhoneDB order and flag the pair.
is_receptor_pair = (
    first_is_receptor & ~is_reversed & np.isin(second_ids, receptor_multidata_ids)
)

interaction["multidata_1_id"] = np.where(is_reversed, second_ids, first_ids)
interaction["multidata_2_id"] = np.where(is_reversed, first_ids, second_ids)
interaction["ligand"] = interaction["multidata_1_id"].map(symbol_by_multidata_id)
interaction["receptor"] = interaction["multidata_2_id"].map(symbol_by_multidata_id)
interaction["r_adhesion_mol"] = np.where(is_receptor_pair, 1.0, np.nan)
interaction["reversed"] = np.where(is_reversed, 1.0, np.nan)

# Fail with a readable message when a partner has no protein name.
ligand_unnamed = interaction["ligand"].isna()
receptor_unnamed = interaction["receptor"].isna()
if ligand_unnamed.any() or receptor_unnamed.any():
    unnamed_ids = sorted(
        set(interaction.loc[ligand_unnamed, "multidata_1_id"])
        | set(interaction.loc[receptor_unnamed, "multidata_2_id"])
    )
    raise ValueError(f"No protein name found for multidata ids: {unnamed_ids}")


In [32]:
# Create the pair names as "<ligand>_<receptor>".
interaction["pairs"] = [
    "_".join(partners)
    for partners in zip(interaction["ligand"], interaction["receptor"])
]

In [33]:
# mask = i['r_adhesion_mol'] == 1
# rows_to_swap = i.loc[mask]
# i.loc[mask, 'ligand'], i.loc[mask, 'receptor'] = i.loc[mask, 'receptor'], i.loc[mask, 'ligand']
# i.loc[mask, 'multidata_1_id'], i.loc[mask, 'multidata_2_id'] = i.loc[mask, 'multidata_2_id'], i.loc[mask, 'multidata_1_id']

In [34]:
# Some pairs that we submitted are missing from the CellPhoneDB interaction table.
# A set makes every membership test constant-time instead of scanning the whole
# array of CellPhoneDB pair names for each submitted pair.
cpdb_pair_names = set(interaction.pairs.values)
notincpdb = [x for x in combined["Pair.Name"].values if x not in cpdb_pair_names]

In [35]:
# Number of submitted pairs that are missing from the CellPhoneDB interaction table.
len(notincpdb)

1484

In [36]:
# There are pairs in the CellPhoneDB interaction table that we did not submit
# when building the DB.
# A set makes every membership test constant-time instead of scanning the whole
# array of submitted pair names for each CellPhoneDB pair.
submitted_pair_names = set(combined["Pair.Name"].values)
notinours = [x for x in interaction.pairs.values if x not in submitted_pair_names]

In [37]:
# Number of CellPhoneDB pairs that are not among the submitted pairs.
len(notinours)

1103

In [38]:
# It seems that all the pairs we did not submit but that appear in CPDB are
# flagged r_adhesion_mol, i.e. both partners are receptors.
not_submitted = interaction['pairs'].isin(notinours)
interaction.loc[not_submitted, "r_adhesion_mol"].value_counts()

1.0    1103
Name: r_adhesion_mol, dtype: int64

In [39]:
# Show the interactions in which both partners are annotated as receptors.
interaction.loc[interaction["r_adhesion_mol"].eq(1)]

Unnamed: 0,multidata_1_id,multidata_2_id,ligand,receptor,r_adhesion_mol,reversed,pairs
4,419,799,GNAI2,LHCGR,1.0,,GNAI2_LHCGR
10,277,477,EGFR,ADRB2,1.0,,EGFR_ADRB2
11,15,477,PRNP,ADRB2,1.0,,PRNP_ADRB2
12,90,477,GPR37,ADRB2,1.0,,GPR37_ADRB2
14,301,477,EGF,ADRB2,1.0,,EGF_ADRB2
...,...,...,...,...,...,...,...
6858,2029,2055,LRRC4,PARD6A,1.0,,LRRC4_PARD6A
6859,2067,2084,RTN4,TNFRSF19,1.0,,RTN4_TNFRSF19
6862,2133,2225,NTM,THSD7A,1.0,,NTM_THSD7A
6863,2140,2182,PTGFRN,TMEM59L,1.0,,PTGFRN_TMEM59L


In [40]:
# Observation: CPDB assigns its own id to each gene, apparently by sorting the
# UniProt ids alphabetically, and puts the smaller id under multidata_1_id and
# the greater one under multidata_2_id.
#
# Print the first id of every non-submitted pair that violates that ordering.
letsee = interaction.loc[interaction['pairs'].isin(notinours)]
for first_id, second_id in zip(letsee["multidata_1_id"], letsee["multidata_2_id"]):
    if first_id > second_id:
        print(first_id)

In [41]:
# Check whether the CellPhoneDB-only pairs were submitted by us in the reversed
# orientation, and list those that are not flagged as receptor-receptor pairs.
#
# The rows are read from `interaction` directly instead of splitting the pair
# name on "_", so symbols that contain an underscore are handled correctly and a
# pair name occurring in several rows no longer breaks the check.
submitted_pair_names = set(combined["Pair.Name"].values)
cpdb_only = interaction.loc[interaction["pairs"].isin(notinours)]
for pair_name, ligand_symbol, receptor_symbol, adhesion_flag in zip(
    cpdb_only["pairs"],
    cpdb_only["ligand"],
    cpdb_only["receptor"],
    cpdb_only["r_adhesion_mol"],
):
    reversed_pair = f"{receptor_symbol}_{ligand_symbol}"
    if reversed_pair not in submitted_pair_names:
        print(pair_name, "was not submitted in the reversed orientation either")
    # r_adhesion_mol is 1.0 for receptor-receptor pairs and NaN otherwise (never 0),
    # so "not flagged" has to be tested as "not equal to 1".
    if adhesion_flag != 1:
        print(pair_name)
        

# we get the intersection of these two

In [42]:
# Pairs present both in the CellPhoneDB interaction table and in our submission.
# A set makes every membership test constant-time instead of scanning the whole
# array of submitted pair names for each CellPhoneDB pair.
submitted_pair_names = set(combined["Pair.Name"].values)
intersection = [x for x in interaction.pairs.values if x in submitted_pair_names]

In [43]:
# Keep the submitted rows whose pair name is part of the intersection.
in_intersection = combined['Pair.Name'].isin(intersection)
intersect_db = combined.loc[in_intersection]

In [44]:
# Renumber the rows from 0; the previous row labels are kept as an "index" column.
intersect_db = intersect_db.reset_index(drop=False)

In [45]:
# Display the intersection table (submitted rows whose pair is also in CellPhoneDB).
intersect_db

Unnamed: 0,index,Pair.Name,protein_name_a,Ligand.Name,protein_name_b,Receptor.Name,complex_pair,partner_a,partner_b,source,...,consensus_direction,consensus_stimulation,consensus_inhibition,sources,references,curation_effort,n_references,n_resources,annotation_strategy,db
0,0,A2M_LRP1,A2M,alpha-2-macroglobulin,LRP1,LDL receptor related protein 1,,P01023,Q07954,OmniPath,...,1,1,0,AlzPathway;Baccin2019;CellTalkDB;EMBRACE;Fanto...,AlzPathway:19026743;Baccin2019:10652313;Baccin...,11,4,11,LR,both
1,1,AANAT_MTNR1A,AANAT,aralkylamine N-acetyltransferase,MTNR1A,melatonin receptor 1A,,Q16613,P48039,OmniPath,...,1,1,0,Baccin2019;CellTalkDB;Fantom5_LRdb;HPMR;HPMR_L...,Baccin2019:12943195;CellTalkDB:12943195;HPMR:1...,5,1,9,LR,both
2,2,AANAT_MTNR1B,AANAT,aralkylamine N-acetyltransferase,MTNR1B,melatonin receptor 1B,,Q16613,P49286,OmniPath,...,1,1,0,Baccin2019;CellTalkDB;Fantom5_LRdb;HPMR_LRdb;H...,Baccin2019:12943195;CellTalkDB:12943195;LRdb:1...,4,1,8,LR,both
3,3,ABCA1_SHANK1,ABCA1,ATP binding cassette subfamily A member 1,SHANK1,SH3 and multiple ankyrin repeat domains 1,,O95477,Q9Y566,OmniPath,...,0,0,0,Baccin2019;HPRD;Ramilowski2015_Baccin2019,HPRD:16192279,1,1,2,LR,LR
4,4,ACE_AGTR2,ACE,angiotensin I converting enzyme,AGTR2,angiotensin II receptor type 2,,P12821,P50052,OmniPath,...,0,0,0,Baccin2019;CellTalkDB;Fantom5_LRdb;HPRD;HPRD_L...,Baccin2019:11459796;HPRD:11459796;LRdb:11459796,3,1,7,LR,LR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5757,7237,TNFRSF10B_TNFSF10,TNFRSF10B,TNF receptor superfamily member 10b,TNFSF10,TNF superfamily member 10,,O14763,P50591,OmniPath,...,0,0,0,BioGRID;CellCall;HPMR;HPRD;InnateDB;IntAct;Lit...,BioGRID:11094155;BioGRID:9311998;HPMR:15766588...,30,20,9,curated,curated
5758,7238,TNFRSF10C_TNFSF10,TNFRSF10C,TNF receptor superfamily member 10c,TNFSF10,TNF superfamily member 10,,O14798,P50591,OmniPath,...,0,0,0,HPMR;HPRD;Lit-BM-17;SIGNOR;SPIKE,HPMR:9325248;HPRD:9314565;HPRD:9325248;HPRD:96...,8,4,5,curated,curated
5759,7240,TNFRSF11B_TNFSF11,TNFRSF11B,TNF receptor superfamily member 11b,TNFSF11,TNF superfamily member 11,,O00300,O14788,OmniPath,...,0,0,0,CellPhoneDB;HPMR;HPRD;IntAct;NetPath;Wang,HPMR:9568710;HPRD:12221720;HPRD:9520411;IntAct...,6,5,6,curated,curated
5760,7243,TNFRSF6B_FASLG,TNFRSF6B,TNF receptor superfamily member 6b,FASLG,Fas ligand,,O95407,P48023,OmniPath,...,0,0,0,BioGRID;CellPhoneDB;HPMR;HPRD;SIGNOR;Wang,BioGRID:9872321;HPMR:14697332;HPRD:10318773;HP...,6,4,6,curated,curated


# Put the intersection DB into the structure needed to generate the CPDB database

In [46]:
# Gene symbol -> UniProt id for the intersection. A row only contributes when at
# least one of its two symbols has not been recorded yet; in that case both of
# its symbols are (re)written.
name2id = {}
symbol_accession_rows = zip(
    intersect_db["protein_name_a"],
    intersect_db["partner_a"],
    intersect_db["protein_name_b"],
    intersect_db["partner_b"],
)
for symbol_a, accession_a, symbol_b, accession_b in symbol_accession_rows:
    if symbol_a not in name2id or symbol_b not in name2id:
        name2id[symbol_a] = accession_a
        name2id[symbol_b] = accession_b
        

In [47]:
# Gene table for the intersection: the symbol is used as gene_name and as
# hgnc_symbol, the Ensembl id comes from the merged UniProt -> Ensembl mapping.
gene_rows = [
    (symbol, uniprot, symbol, rec_dict[uniprot])
    for symbol, uniprot in name2id.items()
]
df = pd.DataFrame(
    gene_rows, columns=["gene_name", "uniprot", "hgnc_symbol", "ensembl"]
)

# Keep only the part of each Ensembl id before the first ".".
df["ensembl"] = df["ensembl"].str.split(".").str[0]

# Protein table: UniProt id plus "<symbol>_HUMAN" as protein_name.
prot = pd.DataFrame(
    {
        "uniprot": df["uniprot"],
        "protein_name": df["hgnc_symbol"] + "_HUMAN",
    }
)

# 1 for proteins that occur as receptor (partner_b) in `combined`, else 0.
receptor_accessions = combined['partner_b'].values
prot['receptor'] = [
    int(accession in receptor_accessions) for accession in prot['uniprot']
]

# Write the three input files for the database generation (no index column).
for frame, csv_path in (
    (df, 'gene_user1.csv'),
    (prot, 'prot_user1.csv'),
    (intersect_db, 'intersect_db.csv'),
):
    frame.to_csv(csv_path, index=False)

# generate custom DB with the below command again. 

```cellphonedb database generate --user-interactions intersect_db.csv --user-interactions-only --user-protein prot_user1.csv --user-gene gene_user1.csv --result-path combined19```

In [48]:
# Reload the tables of the database generated from the intersection and label
# its interactions the same way as for the first database.

# Interaction table: contains the ids of the interaction partners (multi_id),
# which are assigned by CPDB. Only the columns at positions 2 and 3 are kept.
interaction = pd.read_csv('./combined19/csv/interaction_table.csv', index_col=False)
interaction = interaction.iloc[:, [2, 3]].copy()

# Tables for gene symbols and UniProt ids (gene_table is loaded for inspection
# only; the labelling below relies on the protein names).
gene_table = pd.read_csv('./combined19/csv/gene_table.csv', index_col=False)
gene_table = gene_table.iloc[:, [3, 4]].copy()
protein_name = pd.read_csv('./combined19/csv/protein_table.csv', index_col=False)
protein_name = protein_name.iloc[:, [1, 5]].copy()

# multidata table: includes multi_id, name, receptor, receptor_desc, secreted etc.
# This is the table used to check whether a protein is a ligand or a receptor.
multi = pd.read_csv('./combined19/csv/multidata_table.csv', index_col=None)

# Separate receptors (receptor == 1) and ligands (receptor == 0).
receptor = multi[multi['receptor'] == 1]
receptor = receptor.iloc[:, [0, 1]]
ligand = multi[multi['receptor'] == 0]
ligand = ligand.iloc[:, [0, 1]]

# Label every interaction with the protein names of its two partners and bring
# it into our layout: ligand under multidata_1_id, receptor under multidata_2_id.
# CellPhoneDB itself does not keep ligands and receptors in fixed columns.
#
# Columns written to `interaction`:
#   ligand / receptor - protein name with the "_HUMAN" suffix removed
#   reversed          - 1.0 where the two ids had to be swapped, NaN otherwise
#   r_adhesion_mol    - 1.0 where both partners are annotated as receptors, NaN otherwise
# Both flag columns are always created, even when no row is flagged.

# Protein name for every multidata id (the first entry wins when an id is listed
# more than once).
symbol_by_multidata_id = (
    protein_name.drop_duplicates(subset="protein_multidata_id")
    .set_index("protein_multidata_id")["protein_name"]
    .str.replace("_HUMAN", "", regex=False)
)

first_ids = interaction["multidata_1_id"].values.copy()
second_ids = interaction["multidata_2_id"].values.copy()
receptor_multidata_ids = receptor["id_multidata"].values
ligand_multidata_ids = ligand["id_multidata"].values

first_is_receptor = np.isin(first_ids, receptor_multidata_ids)
# Receptor listed first and ligand second: the pair has to be swapped.
is_reversed = first_is_receptor & np.isin(second_ids, ligand_multidata_ids)
# Both partners annotated as receptors: keep the CellPhoneDB order and flag the pair.
is_receptor_pair = (
    first_is_receptor & ~is_reversed & np.isin(second_ids, receptor_multidata_ids)
)

interaction["multidata_1_id"] = np.where(is_reversed, second_ids, first_ids)
interaction["multidata_2_id"] = np.where(is_reversed, first_ids, second_ids)
interaction["ligand"] = interaction["multidata_1_id"].map(symbol_by_multidata_id)
interaction["receptor"] = interaction["multidata_2_id"].map(symbol_by_multidata_id)
interaction["r_adhesion_mol"] = np.where(is_receptor_pair, 1.0, np.nan)
interaction["reversed"] = np.where(is_reversed, 1.0, np.nan)

# Fail with a readable message when a partner has no protein name.
ligand_unnamed = interaction["ligand"].isna()
receptor_unnamed = interaction["receptor"].isna()
if ligand_unnamed.any() or receptor_unnamed.any():
    unnamed_ids = sorted(
        set(interaction.loc[ligand_unnamed, "multidata_1_id"])
        | set(interaction.loc[receptor_unnamed, "multidata_2_id"])
    )
    raise ValueError(f"No protein name found for multidata ids: {unnamed_ids}")

# Create the pair names as "<ligand>_<receptor>".
interaction["pairs"] = interaction[["ligand", "receptor"]].apply("_".join, axis=1)

In [49]:
# Reload the list of interaction pairs, this time from the intersection file
# written earlier.
combined = pd.read_csv(
    'intersect_db.csv',
    index_col=False,
)

In [50]:
# Count the submitted pairs that are missing from the CellPhoneDB interaction table.
cpdb_pair_values = interaction.pairs.values
sum(1 for pair in combined["Pair.Name"].values if pair not in cpdb_pair_values)

0

In [51]:
# List the CellPhoneDB pairs that are not among the submitted pairs.
submitted_pair_values = combined["Pair.Name"].values
[pair for pair in interaction.pairs.values if pair not in submitted_pair_values]

[]

In [None]:
# NOTE(review): unexecuted leftover cell; as a bare name this presumably relies on
# IPython's automagic to show the working directory - confirm and consider removing.
pwd