# Paratopes in Absolut

We explore the paratope data in Absolut.

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

from NegativeClassOptimization import utils, preprocessing

  from .autonotebook import tqdm as notebook_tqdm


## Dataset 4 data organization

It looks like `Task4_Merged_Slice_ParaEpi.zip` may be the only file that contains everything we currently need; the cells below read `Task4_Merged_Slice_ParaEpi.txt`.

In [2]:
# Tab-separated Task4 paratope/epitope table; preview the first two rows.
task4_path = "../data/Absolut/processed/paratope_epitope/Task4_Merged_Slice_ParaEpi.txt"
df = pd.read_csv(
    task4_path,
    sep="\t",
)
df.head(n=2)

Unnamed: 0,Slide,Label,hotspot_ID,seqAGEpitope,motifAGEpitope,agregatesAGEpitope,chemicalAGEpitope,seqABParatope,motifABParatope,agregatesABParatope,chemicalABParatope,AAcompoFullSlice,sizeCDR3
0,LLGLYWYFDVW,1ADQ_A,1ADQ_A_H1,S1W1V1D1V2Q1V3N1A1K1T2V1L2H1N1Y1,X1--X1--X1X1--X2X1X3--X1X1X1--X2X1X2X1--X1--X1,S1--W1--V1D1--V2Q1V3--N1A1K1--T2V1L2H1--N1--Y1,p1--r1--n1c1--n2p1n3--p1n1c1--p2n1n2c1--p1--r1,L3L4G3L1Y2Y2F2V2W2*,X3X4X3X1X2--X2X2--X2X2*,L3L4G3L1Y2--Y2F2--V2W2*,n3n4n3n1r2--r2r2--n2r2*,0_0_0.0909091_0_0.0909091_0.0909091_0_0_0_0.27...,15
1,YYSNYWYFDVW,1ADQ_A,1ADQ_A_H2,P1V1F1V3D3V1S1Q2Q1V1T1K1P1E1Y1R2V1V1,X1--X1X1--X3--X3X1X1X2--X1X1--X1X1X1--X1--X1X2...,P1--V1F1--V3--D3V1S1Q2--Q1V1--T1K1P1--E1--Y1R2...,n1--n1r1--n3--c3n1p1p2--p1n1--p1c1n1--c1--r1c2...,Y2Y3N2Y1W3Y1F4D1V4W3*,X2X3--X2X1X3X1X4X1X4X3*,Y2Y3--N2Y1W3Y1F4D1V4W3*,r2r3--p2r1r3r1r4c1n4r3*,0_0_0.0909091_0_0.0909091_0_0_0_0_0_0_0.090909...,14


In [3]:
# Number of distinct hotspot IDs per antigen label.
# `nunique` is the vectorised built-in for this count; `dropna=False` keeps NaN
# counted as a value, exactly as `len(x.unique())` did.
df.groupby("Label")["hotspot_ID"].nunique(dropna=False)

Label
1ADQ_A    4
1FBI_X    2
1FNS_A    1
1FSK_A    5
1H0D_C    3
         ..
5MES_A    1
5T5F_A    3
5TH9_A    1
5TLJ_X    2
5TZ2_C    1
Name: hotspot_ID, Length: 159, dtype: int64

In [4]:
# Number of distinct epitope strings per hotspot ID.
# `nunique` is the vectorised built-in for this count; `dropna=False` keeps NaN
# counted as a value, exactly as `len(x.unique())` did.
df.groupby("hotspot_ID")["seqAGEpitope"].nunique(dropna=False)

hotspot_ID
1ADQ_A_H1              23
1ADQ_A_H2              12
1ADQ_A_H3               3
1ADQ_A_H4               2
1FBI_X_H1               6
                     ... 
5T5F_A_H15T5F_A_H2      7
5T5F_A_H2               1
5TH9_A_H1               5
5TLJ_X_H1              13
Unknown               520
Name: seqAGEpitope, Length: 263, dtype: int64

In [5]:
# Number of distinct paratope strings per (hotspot, epitope) pair.
# `nunique` avoids one Python-level lambda call per group; `dropna=False`
# keeps NaN counted as a value, exactly as `len(x.unique())` did.
df.groupby(["hotspot_ID", "seqAGEpitope"])["seqABParatope"].nunique(dropna=False)

hotspot_ID  seqAGEpitope                            
1ADQ_A_H1   D1I1S1T1W1V1D1V1Q1V1A1K1T2V1L2H1Q1N1Y1          1
            D1I1S1T1W1V1D1V2Q1V2A1K1T2V1L2N1Y1              7
            K1D1I1S1T1W1V1D1V1Q1V1A1K1V1T3V1L2N1Y1        468
            K1D1I1S1T1W1V1D1V2Q1V2A1K1T2V1L2N1Y1E1         62
            K1D1I2S1T1W1V1D1V1Q1V1A1K1T2V1L2N1Y1E1H1       14
                                                        ...  
Unknown     Y1S1L1K1F2L1G1I2I1F1I1T1L2N1L1L1                3
            Y1S1L1K1F2L1G1I2I1F1I1T1L2N1S1L1E1L1           36
            Y1S1L1K1F2L1G1I2I1F1I1T2L2N1S1L1E1L1          592
            Y1S1L1K1F2L1G1I3I1F1I1L2K1Y1N2L1L1          10749
            Y1S2L1K1F2L1G1I3F1I1F1I1L1K1Y2                  1
Name: seqABParatope, Length: 3150, dtype: int64

In [6]:
# Total of the per-(hotspot, epitope) distinct paratope counts.
# `nunique(dropna=False)` matches the previous `len(x.unique())` semantics
# (NaN counted as a value) without a Python callback per group.
df.groupby(["hotspot_ID", "seqAGEpitope"])["seqABParatope"].nunique(dropna=False).sum()

7484259

## Paratope representations
We need a convenient way to represent a paratope, so that later we can easily derive a matrix from it and compare it with the xAI results.

In [7]:
# Take the first row of `df` and show its slide together with the paratope
# sequence and paratope motif strings, in that order.
s = df.iloc[0]
tuple(s[column] for column in ("Slide", "seqABParatope", "motifABParatope"))

('LLGLYWYFDVW', 'L3L4G3L1Y2Y2F2V2W2*', 'X3X4X3X1X2--X2X2--X2X2*')

In [8]:
# String lengths of the paratope sequence and the paratope motif for the same row.
tuple(len(s[column]) for column in ("seqABParatope", "motifABParatope"))

(19, 23)

In [9]:
# Derive the paratope string from the row's paratope sequence and motif.
# NOTE(review): judging by the helper's name this strips the per-residue degree
# digits — confirm against `preprocessing.get_no_degree_paratope`.
seq_paratope = s["seqABParatope"]
motif_paratope = s["motifABParatope"]
paratope = preprocessing.get_no_degree_paratope(seq_paratope, motif_paratope)
print(paratope)

# One-hot encode the derived paratope string and print the resulting array.
enc = preprocessing.onehot_encode_paratope(paratope)
print(enc)

LLGLY-YF-VW
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 0.]]


## Add paratope data to slide sequences

Until now we have not used paratope data. Here we fetch it from Dataset 4 and append it to the slide sequences. We also provide an exploratory analysis of paratopes in mini-Absolut.

In [35]:
# Antigen pair for the 1-vs-1 dataset. Defined once so that the dataset loader
# and the paratope filter below cannot drift apart.
ag_pos, ag_neg = "1ADQ", "3VRL"

# Only these Task4 columns are needed for the merge.
paratope_columns = [
    "Slide",
    "Label",
    "hotspot_ID",
    "agregatesAGEpitope",
    "agregatesABParatope",
]

df = utils.load_1v1_binary_dataset(
    ag_pos=ag_pos,
    ag_neg=ag_neg,
    num_samples=20000,
)
print(df.shape)

# Read just the required columns instead of loading the full table and
# discarding most of it; the trailing selection pins the column order.
df_para = pd.read_csv(
    "../data/Absolut/processed/paratope_epitope/Task4_Merged_Slice_ParaEpi.txt",
    sep="\t",
    usecols=paratope_columns,
)[paratope_columns]

# Label values such as "1ADQ_A" carry the antigen name before the first
# underscore; keep only paratope rows belonging to the two antigens above.
is_pair_antigen = df_para["Label"].str.split("_").str[0].isin([ag_pos, ag_neg])
df_para = df_para.loc[is_pair_antigen].copy()

# Left merge on the slide sequence. A slide with several paratope rows (e.g.
# one per antigen) yields several output rows, so the row count can grow
# (recorded run: 20000 -> 21495).
df = pd.merge(df, df_para, on="Slide", how="left")
print(df.shape)
df.head(2)

(20000, 8)
(21495, 12)


Unnamed: 0,ID_slide_Variant,CDR3,Best,Slide,Energy,Structure,UID,Antigen,Label,hotspot_ID,agregatesAGEpitope,agregatesABParatope
0,6981712_03a,CARTLLFPHWYFDVW,True,TLLFPHWYFDV,-99.34,128868-LSRRLLSUSU,1ADQ_6981712_03a,1ADQ,1ADQ_A,1ADQ_A_H1,L1--R1--P1--W1--V1D2--V1Q1V1--A1K1--T1V1L1H1Q3...,T2L3L4F3P1H1W1Y1F2D1V4*
1,4817889_07a,CAREWGAPLLWLRWAMDYW,True,PLLWLRWAMDY,-95.26,128868-LSRRLLSUSU,1ADQ_4817889_07a,1ADQ,1ADQ_A,1ADQ_A_H1,L1--R1--P1--W1--V1D2--V1Q1V1--A1K1--T1V1L1H1Q3...,P2L3L4W3L1R1W1A1M2D1Y4*


In [36]:
# List the columns of the merged frame (dataset columns plus the appended paratope columns).
df.columns

Index(['ID_slide_Variant', 'CDR3', 'Best', 'Slide', 'Energy', 'Structure',
       'UID', 'Antigen', 'Label', 'hotspot_ID', 'agregatesAGEpitope',
       'agregatesABParatope'],
      dtype='object')

The table below shows that, even though we removed the duplicated entries corresponding to `ag_neg` in `df`, this information is still present in `df_para`, which leads to duplicated slides after the merge. However, this lets us quickly assess whether the paratope is the same in the cross-binding setting!

In [37]:
# Rows whose slide occurs more than once in the merged frame (every occurrence
# is kept), sorted so that rows sharing a slide sit next to each other.
shared_slide_mask = df["Slide"].duplicated(keep=False)
df.loc[shared_slide_mask].sort_values(by="Slide")

Unnamed: 0,ID_slide_Variant,CDR3,Best,Slide,Energy,Structure,UID,Antigen,Label,hotspot_ID,agregatesAGEpitope,agregatesABParatope
15188,5856736_01a,CAFDLLWYYWYFDVW,True,AFDLLWYYWYF,-99.70,137570-BSDLRDDSLU,3VRL_5856736_01a,3VRL,1ADQ_A,1ADQ_A_H1,S1--W1--V1D1--V2Q1V3--N1A1K1--T2V1L2H1--N1--Y1,A2F2--L2L3W4Y3Y1W2--F2*
15189,5856736_01a,CAFDLLWYYWYFDVW,True,AFDLLWYYWYF,-99.70,137570-BSDLRDDSLU,3VRL_5856736_01a,3VRL,3VRL_C,3VRL_C_H1,D1T1L2--V1Q1--A1--P1--C3K1--L1--A1L3G1P2--A1T1,A2F1--L2L1W3Y3Y3W1Y2F4*
2165,3895421_05a,CARCGAFGLHWYFDVW,True,AFGLHWYFDVW,-99.18,137248-RRDLDUDRDR,1ADQ_3895421_05a,1ADQ,1ADQ_A,1ADQ_A_H2,P1--V1F1--V3--D3V1S1Q2--Q1V1--T1K1P1--E1--Y1R2...,A2F3--L2H1W3Y1F4D1V4W3*
2166,3895421_05a,CARCGAFGLHWYFDVW,True,AFGLHWYFDVW,-99.18,137248-RRDLDUDRDR,1ADQ_3895421_05a,1ADQ,3VRL_C,3VRL_C_H1,D1T1L2--V1Q1--A1--P1--C3K1--L1--A1L3G1P2--A1T1,A2F1--L2H1W3Y3F3D1V2W4*
4151,3932039_01a,CAFPLRWYFDVW,True,AFPLRWYFDVW,-99.00,137248-RRDLDUDRDR,1ADQ_3932039_01a,1ADQ,3VRL_C,3VRL_C_H1,D1T1L2--V1Q1--A1--P1--C3K1--L1--A1L3G1P2--A1T1,A2F1--L2R1W3Y3F3D1V2W4*
...,...,...,...,...,...,...,...,...,...,...,...,...
21323,5912492_05a,CARLGYYTLGWYFDVW,True,YYTLGWYFDVW,-97.46,137248-RRDLDUDRDR,1ADQ_5912492_05a,1ADQ,1ADQ_A,1ADQ_A_H2,P1--V1F1--V3--D3V1S1Q2--Q1V1--T1K1P1--E1--Y1R2...,Y2Y3--L2G1W3Y1F4D1V4W3*
20642,4214718_03a,CTRYYYGFLFFAYW,True,YYYGFLFFAYW,-94.77,132966-BDURSRRDUR,1ADQ_4214718_03a,1ADQ,3VRL_C,3VRL_C_H1,W1M1--D1T2L2--V2Q1--A1--P1--C2K1--L1--L1G1P1--...,Y1Y2Y1--F1L3F3F3A1Y2W4*
20641,4214718_03a,CTRYYYGFLFFAYW,True,YYYGFLFFAYW,-94.77,132966-BDURSRRDUR,1ADQ_4214718_03a,1ADQ,1ADQ_A,1ADQ_A_H1,S1--W1--V1D1--V2Q1V3--N1A1K1--T2V1L2H1--N1--Y1,Y2Y2--G2F3L4F3F1A2--W2*
8725,648337_05a,CARSDYYYGSILFYYFDYW,True,YYYGSILFYYF,-97.87,137375-SLSDSRRSDL,3VRL_648337_05a,3VRL,1ADQ_A,1ADQ_A_H1,S1--W1--V1D1--V2Q1V3--N1A1K1--T2V1L2H1--N1--Y1,Y2Y2--S2I2L3F4Y3Y1F2*


**TODO:** Track the duplicates and double-check why we are getting them here. In Task4, where are the binding energies? Are these slides also high binders? Make sure this makes sense.