In [1]:
import warnings
# Ignore every warning raised from this point on (no category or module filter
# is given), so cell outputs are not interleaved with warning text.
warnings.filterwarnings("ignore")

from collections import Counter

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
# Imported explicitly: `torch` is used below and in later cells, and must not
# depend on being leaked into the namespace by one of the star imports.
import torch

import matplotlib.pyplot as plt

# Make the project's helper modules under ../src importable from this notebook.
sys.path.append("../src")

# NOTE(review): these star imports hide where helper names come from. Later
# cells call pad_sequence, process and compute_pred_labels, which presumably
# live in one of these two modules -- confirm and import them by name.
from utils import *
from inference import *

# Single-letter codes of the 20 standard amino acids, in alphabetical order.
global_aa = list("ACDEFGHIKLMNPQRSTVWY")
# Use the first CUDA device when one is available, otherwise fall back to CPU.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

## Apply model to example sequences

### Process data
- The example sequences are ELISA-verified specific and nonspecific binders to Xolair/Herceptin. The idea is that models trained on **unrelated** sticky targets can identify sequences that would bind promiscuously to many targets, including both Xolair and Herceptin.

In [18]:
# Load Table S1; row 0 of the file is used as the header (column names).
result = pd.read_csv(
    "../data/data/TableS1.csv",
    header=[0],
)

- One can apply these models to any sequences in the following format:

In [19]:
# Select the "Sequence" column of the loaded table as the model input.
test=result["Sequence"]
# Bare last expression: the notebook renders the selected column below.
test

0         EIRVPILIFFDY
1         KIHRRFVVSFDY
2         ELDKYPLVYFDY
3         ERRQVPQIWFDY
4      DTGFHDQDQSHYMDY
5         PAAPFYDEPFDY
6         ADPYVYHEWLDY
7        QWEKEWVEAQFDY
8         EIDYYPLIIFDY
9         HHHPKYWGGFDY
10        ELRLVPLIGFDY
11        EYSAWPLIYFDY
12        YGWWHWEAPFDY
13    DKWPDSTWYGFYEFDY
14         GATYYEEWMDY
15         ELVYYHEYLDY
16    EDRHQRHFQIQISFDY
17        EYRQAPLVDFDY
18    ELRGGWRIPVPIWFDY
19        ELSLWPLLIFDY
20         PGGYYDEAFDY
21        DPYYWWEWEFDY
22    DPWSWPSIDLYWGFDY
23        DWSDVLSPEFDY
24        EQDYHPLIWFDY
25        FSFRRFVQSLDY
26        DQSKYPLVYFDY
27        EERRPPLVIFDY
28    DYIYFDRGKRGQEFDY
29    ERFVERHWVGRKRFDY
30    DVPIVQVQGRSGVFDY
31    DYIYFLRPHRTHWFDY
32        EVRSPPQIQFDY
33        DTYRRFIDSFDY
34        ERAYYPLVYFDY
35        EIRKTPLVFFDY
36          DWPWYRALDY
37    DSVRPDIPQWKLSFDY
38        DQPWHPGATFDY
39    PTPVYFSLTSYGIFDY
40        EEQQWPLIYFDY
41        ESSQYPLIKFDY
42        EYTGWPLLYFDY
43        E

- The sequences are then prepared for the models by running the following set of processing transformations:

In [4]:
# Pad each sequence to length 20 with the filler character "J" (padding is
# added evenly on the sequence), keeping the result as an object array.
pad = np.vectorize(pad_sequence)
test = pad(test, "J", 20).astype("object")

# One-hot encode the padded sequences and wrap them in a float32 tensor.
dat = torch.tensor(np.array(process(test)), dtype=torch.float32)

### Compute unrelated target predictions
- Next, we generate predictions for each unrelated, sticky target using previously trained models.

In [5]:
# Run the previously trained model of each sticky target on the encoded
# sequences and flatten every result to a 1-D vector of predicted labels.
bsa, bv, tgfb = (
    compute_pred_labels(model_dir, dat).reshape(-1)
    for model_dir in (
        "../experiments/bsa/",
        "../experiments/bv/",
        "../experiments/tgfb/",
    )
)

- In this step, the final predictions are made. If a sequence is predicted to bind **any** of the three sticky targets, it is labeled nonspecific; otherwise, it is labeled specific. As shown in the manuscript, these predictions align with the ELISA results (the 'Ground-truth labels' column in `result`).

In [6]:
# The three per-target prediction vectors must describe the same sequences;
# fail loudly instead of silently mis-aligning them.
assert len(bsa) == len(bv) == len(tgfb), "per-target prediction vectors differ in length"

final_pred=[]
# Iterate over however many predictions were produced rather than a
# hard-coded count, so the cell keeps working when the input table changes.
for i in range(len(bsa)):
    # A positive call (label 1) for any one sticky target marks the sequence
    # as non-specific; only sequences negative for all three count as specific.
    if bsa[i]==1 or tgfb[i]==1 or bv[i]==1:
        final_pred.append("Non-specific")
    else:
        final_pred.append("Anti-id (specific)")