# Pairing sequence embeddings for kinase-target database

1. Download kinase database from [OmniPath](https://omnipath.readthedocs.io/en/latest/installation.html)

2. Translate UniProt IDs to ProtT5 (ProtTrans) embeddings.

In [1]:
import pandas as pd
import numpy as np
import omnipath as op

In [2]:
# Fetch the OmniPath KinaseExtra interaction table. Its 'source' and 'target'
# columns are read below as the kinase and target UniProt IDs of each pair.
df = op.interactions.KinaseExtra.get()

In [3]:
# Load the preprocessed human ProtT5 embedding table; the first CSV column
# becomes the index, which later cells use to look up 'source'/'target' IDs.
# NOTE(review): presumably one row per UniProt ID with 1024 embedding
# columns (the pairing cell labels 1024 + 1024 features) — confirm.
hdf = pd.read_csv('ProtT5_embeddings/Human_preprocessed.csv', index_col=0)

In [4]:
# Keep only the interactions for which BOTH partners have a row in hdf,
# then build one feature vector per retained pair by concatenating the
# kinase ('source') embedding with the target embedding.
both_known = df['source'].isin(hdf.index) & df['target'].isin(hdf.index)

# IDs of the retained pairs, in the same order as the rows of df.
L_id = df.loc[both_known, 'source'].tolist()
R_id = df.loc[both_known, 'target'].tolist()

# One concatenated [kinase | target] vector per retained pair.
X = [
    np.hstack((hdf.loc[kinase].to_numpy(), hdf.loc[target].to_numpy()))
    for kinase, target in zip(L_id, R_id)
]

In [5]:
# Column labels for the concatenated vector: 1024 kinase dimensions
# followed by 1024 target dimensions.
kinase_cols = ["Kinase_PT5_%d" % i for i in range(1024)]
target_cols = ["Target_PT5_%d" % i for i in range(1024)]

# Assemble the feature table and attach the UniProt IDs of each pair.
pair_df = pd.DataFrame(X, columns=kinase_cols + target_cols)
pair_df['Kinase UniProt'] = L_id
pair_df['Target UniProt'] = R_id

# Bare expression so the notebook renders the table.
pair_df

Unnamed: 0,Kinase_PT5_0,Kinase_PT5_1,Kinase_PT5_2,Kinase_PT5_3,Kinase_PT5_4,Kinase_PT5_5,Kinase_PT5_6,Kinase_PT5_7,Kinase_PT5_8,Kinase_PT5_9,...,Target_PT5_1016,Target_PT5_1017,Target_PT5_1018,Target_PT5_1019,Target_PT5_1020,Target_PT5_1021,Target_PT5_1022,Target_PT5_1023,Kinase UniProt,Target UniProt
0,0.054350,0.13370,0.043760,0.012240,-0.015300,0.02644,-0.045650,-0.06726,0.02415,-0.02481,...,-0.015330,0.02512,0.04855,0.01958,0.014496,-0.003325,-0.033170,0.006145,Q13976,Q13507
1,0.047760,0.10940,0.048220,0.013860,-0.004500,0.02605,-0.021160,-0.04140,-0.00449,-0.01134,...,-0.017350,0.02925,0.05130,0.01660,0.015550,-0.000610,-0.034700,0.003052,P06241,Q9Y210
2,0.054350,0.13370,0.043760,0.012240,-0.015300,0.02644,-0.045650,-0.06726,0.02415,-0.02481,...,-0.017350,0.02925,0.05130,0.01660,0.015550,-0.000610,-0.034700,0.003052,Q13976,Q9Y210
3,0.038000,0.08356,0.045620,0.007027,-0.016020,0.03108,-0.027980,-0.03867,-0.00390,-0.01342,...,-0.017350,0.02925,0.05130,0.01660,0.015550,-0.000610,-0.034700,0.003052,P12931,Q9Y210
4,0.061280,0.13620,-0.002756,0.024890,-0.019230,0.01135,-0.039400,-0.02681,-0.04138,-0.02881,...,-0.000175,-0.00861,0.04367,0.01729,0.008736,-0.014150,0.011055,0.020840,Q00535,Q8NER1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22224,0.038000,0.08356,0.045620,0.007027,-0.016020,0.03108,-0.027980,-0.03867,-0.00390,-0.01342,...,-0.084660,0.02959,0.02434,0.02911,-0.003397,-0.030230,-0.024730,0.042450,P12931,Q8TDD2
22225,0.057430,0.04740,0.096250,0.003628,0.006718,0.03647,-0.002949,-0.08154,-0.03226,-0.01266,...,0.030900,-0.12890,-0.06780,0.02168,0.027400,0.086200,0.034120,-0.054960,O00168,Q13794
22226,0.044000,0.06433,0.044250,0.005560,0.000768,0.02406,-0.018020,-0.04944,-0.00436,-0.01802,...,0.005833,0.02649,0.05814,-0.04483,-0.041700,-0.031650,-0.001902,0.020540,Q05397,Q5SRH9
22227,0.009430,-0.01784,-0.035600,0.033900,-0.043180,0.07886,-0.050100,-0.04977,0.00871,-0.00711,...,-0.057500,-0.01857,0.02472,0.05643,0.013145,-0.014670,0.004040,0.003717,P20248,Q8TDN4


In [6]:
# drop_duplicates() returns a new DataFrame rather than modifying in place,
# so the result must be rebound; otherwise the frame saved afterwards would
# still contain the repeated kinase/target rows. Original index labels are
# kept (no reset), matching what drop_duplicates() displays.
pair_df = pair_df.drop_duplicates()

# Bare expression so the notebook renders the de-duplicated table.
pair_df

Unnamed: 0,Kinase_PT5_0,Kinase_PT5_1,Kinase_PT5_2,Kinase_PT5_3,Kinase_PT5_4,Kinase_PT5_5,Kinase_PT5_6,Kinase_PT5_7,Kinase_PT5_8,Kinase_PT5_9,...,Target_PT5_1016,Target_PT5_1017,Target_PT5_1018,Target_PT5_1019,Target_PT5_1020,Target_PT5_1021,Target_PT5_1022,Target_PT5_1023,Kinase UniProt,Target UniProt
0,0.054350,0.13370,0.043760,0.012240,-0.015300,0.02644,-0.045650,-0.06726,0.02415,-0.02481,...,-0.015330,0.02512,0.04855,0.01958,0.014496,-0.003325,-0.033170,0.006145,Q13976,Q13507
1,0.047760,0.10940,0.048220,0.013860,-0.004500,0.02605,-0.021160,-0.04140,-0.00449,-0.01134,...,-0.017350,0.02925,0.05130,0.01660,0.015550,-0.000610,-0.034700,0.003052,P06241,Q9Y210
2,0.054350,0.13370,0.043760,0.012240,-0.015300,0.02644,-0.045650,-0.06726,0.02415,-0.02481,...,-0.017350,0.02925,0.05130,0.01660,0.015550,-0.000610,-0.034700,0.003052,Q13976,Q9Y210
3,0.038000,0.08356,0.045620,0.007027,-0.016020,0.03108,-0.027980,-0.03867,-0.00390,-0.01342,...,-0.017350,0.02925,0.05130,0.01660,0.015550,-0.000610,-0.034700,0.003052,P12931,Q9Y210
4,0.061280,0.13620,-0.002756,0.024890,-0.019230,0.01135,-0.039400,-0.02681,-0.04138,-0.02881,...,-0.000175,-0.00861,0.04367,0.01729,0.008736,-0.014150,0.011055,0.020840,Q00535,Q8NER1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22224,0.038000,0.08356,0.045620,0.007027,-0.016020,0.03108,-0.027980,-0.03867,-0.00390,-0.01342,...,-0.084660,0.02959,0.02434,0.02911,-0.003397,-0.030230,-0.024730,0.042450,P12931,Q8TDD2
22225,0.057430,0.04740,0.096250,0.003628,0.006718,0.03647,-0.002949,-0.08154,-0.03226,-0.01266,...,0.030900,-0.12890,-0.06780,0.02168,0.027400,0.086200,0.034120,-0.054960,O00168,Q13794
22226,0.044000,0.06433,0.044250,0.005560,0.000768,0.02406,-0.018020,-0.04944,-0.00436,-0.01802,...,0.005833,0.02649,0.05814,-0.04483,-0.041700,-0.031650,-0.001902,0.020540,Q05397,Q5SRH9
22227,0.009430,-0.01784,-0.035600,0.033900,-0.043180,0.07886,-0.050100,-0.04977,0.00871,-0.00711,...,-0.057500,-0.01857,0.02472,0.05643,0.013145,-0.014670,0.004040,0.003717,P20248,Q8TDN4


In [7]:
# Persist the paired embedding table next to the input embeddings.
# index=False is not passed, so the row index is written as the first column.
pair_df.to_csv('ProtT5_embeddings/OP_kinases.csv')