# Pairing sequence embeddings for ligand-receptor database

1. Download the ligand-receptor interaction database from [OmniPath](https://omnipath.readthedocs.io/en/latest/installation.html).

2. Translate UniProt IDs to ProtT5 (ProtTrans) embeddings.

In [1]:
import pandas as pd
import numpy as np
import omnipath as op

In [2]:
# Fetch the ligand-receptor interaction table through the omnipath client.
# Later cells iterate over its rows and read the 'source' and 'target' columns.
# NOTE(review): presumably this performs a network request - confirm before running offline.
df = op.interactions.LigRecExtra.get()

In [3]:
# Load the precomputed embedding table; the first CSV column becomes the index.
# Later cells look up the interactions' 'source'/'target' values in this index,
# so it is presumably keyed by UniProt accession - TODO confirm.
# The path is relative to the current working directory.
hdf = pd.read_csv('ProtT5_embeddings/Human_preprocessed.csv', index_col=0)

In [4]:
# Collect every interaction whose two partners are both present in the
# embedding table. L_id / R_id hold the 'source' / 'target' identifiers and
# X holds the matching concatenated embedding vectors, all in the same order.
L_id, R_id, X = [], [], []

for _, interaction in df.iterrows():
    ligand = interaction['source']
    receptor = interaction['target']

    # Skip pairs where either partner has no row in the embedding table.
    if ligand not in hdf.index or receptor not in hdf.index:
        continue

    L_id.append(ligand)
    R_id.append(receptor)

    # Source embedding first, target embedding second.
    ligand_vec = hdf.loc[ligand].to_numpy()
    receptor_vec = hdf.loc[receptor].to_numpy()
    X.append(np.hstack((ligand_vec, receptor_vec)))

In [5]:
# Feature column names: 1024 source-side columns followed by 1024 target-side
# columns, matching the order in which the two vectors were stacked into X.
ligand_cols = ["L_PT5_%d" % i for i in range(1024)]
receptor_cols = ["R_PT5_%d" % i for i in range(1024)]

# One row per retained pair; the identifiers are appended as the last two columns.
pair_df = pd.DataFrame(X, columns=ligand_cols + receptor_cols)
pair_df['L UniProt'] = L_id
pair_df['R UniProt'] = R_id

pair_df

Unnamed: 0,L_PT5_0,L_PT5_1,L_PT5_2,L_PT5_3,L_PT5_4,L_PT5_5,L_PT5_6,L_PT5_7,L_PT5_8,L_PT5_9,...,R_PT5_1016,R_PT5_1017,R_PT5_1018,R_PT5_1019,R_PT5_1020,R_PT5_1021,R_PT5_1022,R_PT5_1023,L UniProt,R UniProt
0,0.206500,0.143800,-0.08920,0.087600,0.100200,-0.002910,-0.063050,-0.11255,-0.112550,-0.063230,...,-0.015330,0.02512,0.04855,0.019580,0.014496,-0.003325,-0.033170,0.006145,P0DP23,Q13507
1,0.018220,0.051030,0.05066,0.027760,-0.029160,0.005320,-0.011100,-0.03760,0.035740,-0.010345,...,-0.001158,-0.01534,0.02208,-0.003060,-0.025240,-0.002840,0.011650,0.028370,P46531,Q9Y219
2,0.005455,0.057800,0.02992,0.019350,-0.004814,-0.002748,0.006336,-0.05344,0.056820,-0.012700,...,-0.011380,-0.01611,0.04593,0.028210,-0.043730,-0.005104,0.003342,0.017910,Q9Y219,P46531
3,0.004310,0.066600,0.03260,0.019780,-0.003508,-0.000731,0.011980,-0.05740,0.057700,-0.022550,...,-0.011380,-0.01611,0.04593,0.028210,-0.043730,-0.005104,0.003342,0.017910,O00548,P46531
4,0.045780,0.028350,0.05770,0.016770,-0.053470,0.032620,-0.004917,-0.14950,-0.003765,-0.001492,...,0.001918,-0.01102,0.06134,0.012990,0.009730,0.012700,0.015045,0.020200,P05019,P08069
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4491,-0.002394,0.065700,0.05057,0.001718,-0.054700,0.020030,-0.046880,-0.05573,0.037230,0.037080,...,0.000333,-0.08200,0.03370,0.028820,0.000588,0.010254,-0.026930,-0.003159,P17936,Q6UXB1
4492,-0.007520,0.055540,0.06350,-0.007694,-0.044650,0.032560,-0.061500,-0.05720,0.015900,0.041230,...,0.000333,-0.08200,0.03370,0.028820,0.000588,0.010254,-0.026930,-0.003159,P18065,Q6UXB1
4493,0.031040,0.010155,0.02255,0.032620,-0.004540,0.007164,-0.028200,-0.02136,-0.015760,0.009560,...,-0.012860,-0.02338,-0.02720,0.028060,-0.009540,-0.041440,0.004772,0.041630,O75487,P01308
4494,-0.011480,0.059970,0.08180,0.010864,-0.075300,-0.013220,0.027920,-0.19930,-0.001354,0.001966,...,-0.042480,-0.02810,-0.03568,0.035030,0.009960,0.015880,0.041930,0.041960,Q969E1,Q9UBU3


In [6]:
# drop_duplicates() returns a new DataFrame and leaves the original untouched,
# so the result must be assigned back; otherwise the table written to disk
# later still contains any fully duplicated rows.
pair_df = pair_df.drop_duplicates()

pair_df

Unnamed: 0,L_PT5_0,L_PT5_1,L_PT5_2,L_PT5_3,L_PT5_4,L_PT5_5,L_PT5_6,L_PT5_7,L_PT5_8,L_PT5_9,...,R_PT5_1016,R_PT5_1017,R_PT5_1018,R_PT5_1019,R_PT5_1020,R_PT5_1021,R_PT5_1022,R_PT5_1023,L UniProt,R UniProt
0,0.206500,0.143800,-0.08920,0.087600,0.100200,-0.002910,-0.063050,-0.11255,-0.112550,-0.063230,...,-0.015330,0.02512,0.04855,0.019580,0.014496,-0.003325,-0.033170,0.006145,P0DP23,Q13507
1,0.018220,0.051030,0.05066,0.027760,-0.029160,0.005320,-0.011100,-0.03760,0.035740,-0.010345,...,-0.001158,-0.01534,0.02208,-0.003060,-0.025240,-0.002840,0.011650,0.028370,P46531,Q9Y219
2,0.005455,0.057800,0.02992,0.019350,-0.004814,-0.002748,0.006336,-0.05344,0.056820,-0.012700,...,-0.011380,-0.01611,0.04593,0.028210,-0.043730,-0.005104,0.003342,0.017910,Q9Y219,P46531
3,0.004310,0.066600,0.03260,0.019780,-0.003508,-0.000731,0.011980,-0.05740,0.057700,-0.022550,...,-0.011380,-0.01611,0.04593,0.028210,-0.043730,-0.005104,0.003342,0.017910,O00548,P46531
4,0.045780,0.028350,0.05770,0.016770,-0.053470,0.032620,-0.004917,-0.14950,-0.003765,-0.001492,...,0.001918,-0.01102,0.06134,0.012990,0.009730,0.012700,0.015045,0.020200,P05019,P08069
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4491,-0.002394,0.065700,0.05057,0.001718,-0.054700,0.020030,-0.046880,-0.05573,0.037230,0.037080,...,0.000333,-0.08200,0.03370,0.028820,0.000588,0.010254,-0.026930,-0.003159,P17936,Q6UXB1
4492,-0.007520,0.055540,0.06350,-0.007694,-0.044650,0.032560,-0.061500,-0.05720,0.015900,0.041230,...,0.000333,-0.08200,0.03370,0.028820,0.000588,0.010254,-0.026930,-0.003159,P18065,Q6UXB1
4493,0.031040,0.010155,0.02255,0.032620,-0.004540,0.007164,-0.028200,-0.02136,-0.015760,0.009560,...,-0.012860,-0.02338,-0.02720,0.028060,-0.009540,-0.041440,0.004772,0.041630,O75487,P01308
4494,-0.011480,0.059970,0.08180,0.010864,-0.075300,-0.013220,0.027920,-0.19930,-0.001354,0.001966,...,-0.042480,-0.02810,-0.03568,0.035030,0.009960,0.015880,0.041930,0.041960,Q969E1,Q9UBU3


In [7]:
# Persist the paired embedding table as CSV (path relative to the working directory).
# With pandas' default settings the row index is written as the first column.
pair_df.to_csv('ProtT5_embeddings/OP_LR.csv')