# Paratopes in Absolut

We explore the paratope data in Absolut.

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

from NegativeClassOptimization import utils

  from .autonotebook import tqdm as notebook_tqdm


## Dataset 4 data organization

It looks like `Task4_Merged_Slice_ParaEpi.zip` may be the only file that contains everything we currently need.

In [2]:
# Load the tab-separated paratope/epitope table and preview its first two rows.
paraepi_path = Path("../data/Absolut/processed/paratope_epitope/Task4_Merged_Slice_ParaEpi.txt")
df = pd.read_csv(paraepi_path, sep="\t")
df.head(2)

Unnamed: 0,Slide,Label,hotspot_ID,seqAGEpitope,motifAGEpitope,agregatesAGEpitope,chemicalAGEpitope,seqABParatope,motifABParatope,agregatesABParatope,chemicalABParatope,AAcompoFullSlice,sizeCDR3
0,LLGLYWYFDVW,1ADQ_A,1ADQ_A_H1,S1W1V1D1V2Q1V3N1A1K1T2V1L2H1N1Y1,X1--X1--X1X1--X2X1X3--X1X1X1--X2X1X2X1--X1--X1,S1--W1--V1D1--V2Q1V3--N1A1K1--T2V1L2H1--N1--Y1,p1--r1--n1c1--n2p1n3--p1n1c1--p2n1n2c1--p1--r1,L3L4G3L1Y2Y2F2V2W2*,X3X4X3X1X2--X2X2--X2X2*,L3L4G3L1Y2--Y2F2--V2W2*,n3n4n3n1r2--r2r2--n2r2*,0_0_0.0909091_0_0.0909091_0.0909091_0_0_0_0.27...,15
1,YYSNYWYFDVW,1ADQ_A,1ADQ_A_H2,P1V1F1V3D3V1S1Q2Q1V1T1K1P1E1Y1R2V1V1,X1--X1X1--X3--X3X1X1X2--X1X1--X1X1X1--X1--X1X2...,P1--V1F1--V3--D3V1S1Q2--Q1V1--T1K1P1--E1--Y1R2...,n1--n1r1--n3--c3n1p1p2--p1n1--p1c1n1--c1--r1c2...,Y2Y3N2Y1W3Y1F4D1V4W3*,X2X3--X2X1X3X1X4X1X4X3*,Y2Y3--N2Y1W3Y1F4D1V4W3*,r2r3--p2r1r3r1r4c1n4r3*,0_0_0.0909091_0_0.0909091_0_0_0_0_0_0_0.090909...,14


In [3]:
# Distinct hotspot_ID values per Label. The built-in nunique avoids a Python-level
# lambda per group; dropna=False keeps NaN counted as a value, like len(x.unique()).
df.groupby("Label")["hotspot_ID"].nunique(dropna=False)

Label
1ADQ_A    4
1FBI_X    2
1FNS_A    1
1FSK_A    5
1H0D_C    3
         ..
5MES_A    1
5T5F_A    3
5TH9_A    1
5TLJ_X    2
5TZ2_C    1
Name: hotspot_ID, Length: 159, dtype: int64

In [4]:
# Distinct seqAGEpitope values per hotspot_ID. The built-in nunique avoids a Python-level
# lambda per group; dropna=False keeps NaN counted as a value, like len(x.unique()).
df.groupby("hotspot_ID")["seqAGEpitope"].nunique(dropna=False)

hotspot_ID
1ADQ_A_H1              23
1ADQ_A_H2              12
1ADQ_A_H3               3
1ADQ_A_H4               2
1FBI_X_H1               6
                     ... 
5T5F_A_H15T5F_A_H2      7
5T5F_A_H2               1
5TH9_A_H1               5
5TLJ_X_H1              13
Unknown               520
Name: seqAGEpitope, Length: 263, dtype: int64

In [5]:
# Distinct seqABParatope values per (hotspot_ID, seqAGEpitope) pair. The built-in nunique
# avoids a Python-level lambda per group; dropna=False matches len(x.unique()).
df.groupby(["hotspot_ID", "seqAGEpitope"])["seqABParatope"].nunique(dropna=False)

hotspot_ID  seqAGEpitope                            
1ADQ_A_H1   D1I1S1T1W1V1D1V1Q1V1A1K1T2V1L2H1Q1N1Y1          1
            D1I1S1T1W1V1D1V2Q1V2A1K1T2V1L2N1Y1              7
            K1D1I1S1T1W1V1D1V1Q1V1A1K1V1T3V1L2N1Y1        468
            K1D1I1S1T1W1V1D1V2Q1V2A1K1T2V1L2N1Y1E1         62
            K1D1I2S1T1W1V1D1V1Q1V1A1K1T2V1L2N1Y1E1H1       14
                                                        ...  
Unknown     Y1S1L1K1F2L1G1I2I1F1I1T1L2N1L1L1                3
            Y1S1L1K1F2L1G1I2I1F1I1T1L2N1S1L1E1L1           36
            Y1S1L1K1F2L1G1I2I1F1I1T2L2N1S1L1E1L1          592
            Y1S1L1K1F2L1G1I3I1F1I1L2K1Y1N2L1L1          10749
            Y1S2L1K1F2L1G1I3F1I1F1I1L1K1Y2                  1
Name: seqABParatope, Length: 3150, dtype: int64

In [6]:
# Total of the per-(hotspot_ID, seqAGEpitope) distinct-paratope counts. The built-in
# nunique avoids a Python-level lambda per group; dropna=False matches len(x.unique()).
df.groupby(["hotspot_ID", "seqAGEpitope"])["seqABParatope"].nunique(dropna=False).sum()

7484259

We need a convenient way to represent the paratope, so that we can later derive a matrix from it easily and compare it with the xAI results.

In [9]:
# Slide	seqABParatope	motifABParatope
s = df.iloc[0]
s["Slide"], s["seqABParatope"], s["motifABParatope"]

('LLGLYWYFDVW', 'L3L4G3L1Y2Y2F2V2W2*', 'X3X4X3X1X2--X2X2--X2X2*')

In [18]:
# String lengths of the paratope sequence and of its motif for the same row.
tuple(len(s[column]) for column in ("seqABParatope", "motifABParatope"))

(19, 23)

In [26]:
def get_no_degree_paratope(seqAB: str, motifAB: str) -> str:
    """Merge a paratope sequence into its motif and drop the degree digits.

    The residue letters of ``seqAB`` are written, in order, into the ``X``
    placeholders of ``motifAB``. The ``*`` terminator and all digits are then
    removed, and every ``--`` gap marker is shortened to a single ``-``.

    Example: ``("L3L4G3L1Y2Y2F2V2W2*", "X3X4X3X1X2--X2X2--X2X2*")`` gives
    ``"LLGLY-YF-VW"``.

    Args:
        seqAB: Residue letters, each followed by digits, ending in ``*``
            (the ``seqABParatope`` column).
        motifAB: Matching motif with ``X`` placeholders (the
            ``motifABParatope`` column).

    Returns:
        The motif with residues filled in, without digits or ``*``.

    Note:
        No length check is made: placeholders without a matching residue stay
        as ``X`` and surplus residues are ignored.
    """
    # Digits and the '*' terminator are annotations, not residues.
    residues = iter([char for char in seqAB if not (char.isdigit() or char == "*")])

    # Single left-to-right pass: each placeholder consumes exactly one residue,
    # so a residue that is itself the letter 'X' cannot be matched again by a
    # later substitution (which repeated str.replace("X", ..., 1) would do).
    filled = "".join(
        next(residues, "X") if char == "X" else char for char in motifAB
    )

    # Drop the terminator and shorten gap markers before stripping the digits.
    filled = filled.replace("*", "").replace("--", "-")
    return "".join(char for char in filled if not char.isdigit())

# Try the helper on the first row inspected above.
get_no_degree_paratope(
    seqAB=s["seqABParatope"],
    motifAB=s["motifABParatope"],
)

'LLGLY-YF-VW'

In [27]:
# ~One hot encode the paratope

SyntaxError: invalid syntax (441527.py, line 1)