# Explore data `slack_1`

In [1]:
from pathlib import Path
import pandas as pd
from typing import List

import NegativeClassOptimization.utils as utils
import NegativeClassOptimization.config as config

In [2]:
# Summarise the data files under DATA_SLACK_1 and order the listing by
# file type first, then by antigen, so related files sit next to each other.
df_files = (
    utils.summarize_data_files(config.DATA_SLACK_1)
    .sort_values(["filetype", "antigen"])
)
df_files

Unnamed: 0,filepath,filename,filetype,antigen,datatype
8,/data/sources/ab-negative-training/data/slack_...,1ADQ_top_70000_corpus.csv,csv,1ADQ,corpus
9,/data/sources/ab-negative-training/data/slack_...,1FBI_top_70000_corpus.csv,csv,1FBI,corpus
5,/data/sources/ab-negative-training/data/slack_...,1H0D_top_70000_corpus.csv,csv,1H0D,corpus
14,/data/sources/ab-negative-training/data/slack_...,1NSN_top_70000_corpus.csv,csv,1NSN,corpus
12,/data/sources/ab-negative-training/data/slack_...,1OB1_top_70000_corpus.csv,csv,1OB1,corpus
16,/data/sources/ab-negative-training/data/slack_...,1WEJ_top_70000_corpus.csv,csv,1WEJ,corpus
10,/data/sources/ab-negative-training/data/slack_...,2YPV_top_70000_corpus.csv,csv,2YPV,corpus
2,/data/sources/ab-negative-training/data/slack_...,3RAJ_top_70000_corpus.csv,csv,3RAJ,corpus
0,/data/sources/ab-negative-training/data/slack_...,3VRL_top_70000_corpus.csv,csv,3VRL,corpus
4,/data/sources/ab-negative-training/data/slack_...,5E94_top_70000_corpus.csv,csv,5E94,corpus


## File format

It looks like `features` files have more annotations for the data in `corpus` files.

In [7]:
# Load the data for a single antigen (3VRL) from the slack_1 directory.
ag_data = utils.AntigenData("3VRL", config.DATA_SLACK_1)

# Peek at the first two rows of `df_c`
# (presumably the table read from the `corpus` file — TODO confirm).
ag_data.df_c.head(n=2)

Unnamed: 0,ID_slide_Variant,CDR3,Best,Slide,Energy,Structure,UID
0,1873658_06a,CARPENLLLLLWYFDVW,True,LLLLLWYFDVW,-112.82,137442-BRDSLLUDLS,3VRL_1873658_06a
1,7116990_04a,CARGLLLLLWYFDVW,True,LLLLLWYFDVW,-112.82,137442-BRDSLLUDLS,3VRL_7116990_04a


## Fetch global dataset from all antigens

In [8]:
# Read the tab-separated global table and discard its first column by position
# (presumably a saved index column — TODO confirm).
df_global = (
    pd.read_csv(config.DATA_SLACK_1_GLOBAL, sep="\t")
    .iloc[:, 1:]
)
df_global

Unnamed: 0,CDR3,Best,Slide,Energy,Structure,UID,Antigen
0,CARPENLLLLLWYFDVW,True,LLLLLWYFDVW,-112.82,137442-BRDSLLUDLS,3VRL_1873658_06a,3VRL
1,CARGLLLLLWYFDVW,True,LLLLLWYFDVW,-112.82,137442-BRDSLLUDLS,3VRL_7116990_04a,3VRL
2,CARGGLLLLLWYFDVW,True,LLLLLWYFDVW,-112.82,137442-BRDSLLUDLS,3VRL_7147788_05a,3VRL
3,CARLLVWLLLFFDYW,True,LVWLLLFFDYW,-112.67,137442-BRDSLLUDLS,3VRL_4403113_04a,3VRL
4,CARFLWLLWYFDVW,True,FLWLLWYFDVW,-111.92,137442-BRDSLLUDLS,3VRL_1662693_03a,3VRL
...,...,...,...,...,...,...,...
699995,CAIGHFSTVDWYFDVW,True,IGHFSTVDWYF,-92.52,149536-DUDDRDLRDU,1OB1_5564830_02a,1OB1
699996,CALIPLIDYYGSSPCYW,True,CALIPLIDYYG,-92.52,149536-DUDDRDLRDU,1OB1_2217831_00a,1OB1
699997,CARHRDYSNIHWYFDVW,True,YSNIHWYFDVW,-92.52,137120-LUSLRDRDUD,1OB1_4076303_06a,1OB1
699998,CAREEVLLLGFAYW,True,EEVLLLGFAYW,-92.52,153631-DRLUDLULLR,1OB1_2481088_03a,1OB1


In [9]:
# Fraction of rows per CDR3 length: count how often each length occurs, then
# divide by the total count so the values sum to 1.
cdr3_lengths = (
    df_global["CDR3"]
    .str.len()
    .value_counts()
    .pipe(lambda length_counts: length_counts / length_counts.sum())
)
# Note: `cdr3_lengths.to_dict()` yields a form that can be recorded in the config.
cdr3_lengths

15    0.178611
14    0.178284
16    0.175533
17    0.131307
18    0.085324
13    0.072919
12    0.051557
19    0.046964
11    0.044991
20    0.020910
21    0.008224
22    0.003164
23    0.001284
24    0.000431
25    0.000160
28    0.000099
26    0.000090
27    0.000056
31    0.000019
30    0.000014
36    0.000011
29    0.000010
40    0.000009
32    0.000006
35    0.000006
39    0.000004
43    0.000004
33    0.000003
37    0.000003
34    0.000001
Name: CDR3, dtype: float64

How unique are the CDR3s in the dataset?

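- Many CDR3 sequences are redundant, mainly **between** antigens rather than within a single antigen: each antigen has close to 70,000 unique CDR3s, but only about 419k are unique across the whole dataset.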

In [10]:
# Distinct CDR3 sequences across the whole global table, in order of first appearance.
cdr3_uniq = pd.unique(df_global["CDR3"])
print(f"CDR3 unique shape: {cdr3_uniq.shape}\nValue counts:")

# Cell output: how many times each CDR3 sequence occurs, most frequent first.
df_global["CDR3"].value_counts()

CDR3 unique shape: (419199,)
Value counts:


CARSGLGLGLGLGLGLGLERRYYAMDYW    32
CASRWLLLFYWYFDVW                15
CARGLGLGFLLLWGDYW               13
CARDLLLLLYWYFDVW                12
CTSLLLWLLYWYFDVW                12
                                ..
CARLNWDGYFDVW                    1
CARHAYYGNFLRFW                   1
CARSEGSYYSIYWYFDVW               1
CTYYSIYWYFDVW                    1
CARHRDYSNIHWYFDVW                1
Name: CDR3, Length: 419199, dtype: int64

In [11]:
# Number of distinct CDR3 sequences for each antigen.
# Uses the built-in grouped `nunique` rather than a Python-level function per group.
# `dropna=False` counts a missing CDR3 as one distinct value (the same way
# `Series.unique()` reports it) instead of silently skipping it.
df_global.groupby(["Antigen"])["CDR3"].nunique(dropna=False)

Antigen
1ADQ    68713
1FBI    69976
1H0D    69550
1NSN    68242
1OB1    69743
1WEJ    69686
2YPV    69968
3RAJ    69961
3VRL    69707
5E94    69587
Name: CDR3, dtype: int64