# SN10 classifier and integrated gradients attribution

In this notebook, we develop the SN10 classifier used in `Absolut!` and the integrated-gradients method of attribution. We might also check other attribution methods.

In [1]:
import os
from pathlib import Path
from typing import List

import pandas as pd
import numpy as np

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import NegativeClassOptimization.config as config
import NegativeClassOptimization.preprocessing as preprocessing

  from .autonotebook import tqdm as notebook_tqdm


Let's load the data on which we are going to develop the binary classifier.

In [2]:
df = pd.read_csv(config.DATA_SLACK_1_GLOBAL, sep='\t')

ag_pos = "3VRL"
ag_neg = "1ADQ"
df = df.loc[df["Antigen"].isin([ag_pos, ag_neg])].copy()

df.head(2)

Unnamed: 0,ID_slide_Variant,CDR3,Best,Slide,Energy,Structure,UID,Antigen
0,1873658_06a,CARPENLLLLLWYFDVW,True,LLLLLWYFDVW,-112.82,137442-BRDSLLUDLS,3VRL_1873658_06a,3VRL
1,7116990_04a,CARGLLLLLWYFDVW,True,LLLLLWYFDVW,-112.82,137442-BRDSLLUDLS,3VRL_7116990_04a,3VRL


First, handle duplicates.

In [3]:
def prepare_data(df: pd.DataFrame, pos_ag: str) -> pd.DataFrame:
    """Prepare data for SN10 training and evaluation. 
    Most importantly - appropriately removes duplicates. 

    Args:
        df (pd.DataFrame): typical dataframe used in the project
        pos_ag (str): the antigen assuming the positive dataset role

    Returns:
        pd.DataFrame: df with new columns suitable for modelling.
    """
    
    def infer_antigen_from_duplicate_list(antigens: List[str], pos_antigen: str):
        assert len(antigens) <= 2, ">2 antigens not supported yet."
        if len(antigens) == 1:
            return antigens[0]
        else:
            if pos_antigen in antigens:
                return pos_antigen
            else:
                return list(set(antigens) - set([pos_antigen]))[0]

    df = df.groupby("Slide").apply(
        lambda df_: infer_antigen_from_duplicate_list(df_["Antigen"].unique().tolist(), pos_antigen=ag_pos)
    )
    df = pd.DataFrame(data=df, columns=["Antigen"])
    df = df.reset_index()
    return df


df = prepare_data(df, ag_pos)

preprocessing.onehot_encode_df(df);

df["X"] = df["Slide_onehot"]
df["y"] = np.where(df["Antigen"] == ag_pos, 1, 0)

df.head(3)

Unnamed: 0,Slide,Antigen,Slide_onehot
0,AAELFWYFDVW,3VRL,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,AAFITTVGWYF,1ADQ,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,AAFYGRWYFDV,1ADQ,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [58]:
class PairwiseDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        return (
            torch.tensor(self.df.loc[idx, "X"]),
            torch.tensor(self.df.loc[idx, "y"]), 
        )



batch_size = 64
train_frac = 0.8
df = df.sample(frac=1).reset_index(drop=True)  # shuffle

split_idx = int(df.shape[0] * train_frac)
df_train = df.loc[:split_idx].copy()
df_test = df.loc[split_idx:].copy()

train_data = PairwiseDataset(df_train)
test_data = PairwiseDataset(df_test)

train_loader = DataLoader(train_data, batch_size=64, shuffle=False)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

In [68]:
model(train_data[0][0])

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

Define the model.

In [59]:
class SN10(nn.Module):
    def __init__(self):
        super(SN10, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(11*20, 10),
            nn.ReLU(),
            nn.Linear(10, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")
model = SN10().to(device)
print(model)

Using cpu device
SN10(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=220, out_features=10, bias=True)
    (1): ReLU()
    (2): Linear(in_features=10, out_features=1, bias=True)
    (3): Sigmoid()
  )
)


Setup the optimization loop.

In [60]:
learning_rate = 0.01
epochs = 5

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
loss_fn = nn.BCELoss()

In [69]:
def train_loop(loader, model, loss_fn, optimizer):
    size = len(loader.dataset)
    for batch, (X, y) in enumerate(loader):
        print(X)
        y_pred = model(X)
        loss = loss_fn(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(loader, model, loss_fn):
    size = len(loader.dataset)
    num_batches = len(loader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in loader:
            y_pred = model(X)
            test_loss += loss_fn(y_pred, y).item()
            correct += (round(y_pred) == y).type(torch.float).sum().item()
    
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [None]:
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_loader, model, loss_fn, optimizer)
    test_loop(test_loader, model, loss_fn)