# Visualize Slides

We develop visualizations for `Slide` sequences.

In [1]:
from typing import List
from functools import reduce
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import umap

import NegativeClassOptimization.config as config
import NegativeClassOptimization.datasets as datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv(config.DATA_SLACK_1_GLOBAL, sep='\t')
df.head(2)

Unnamed: 0,ID_slide_Variant,CDR3,Best,Slide,Energy,Structure,UID,Antigen
0,1873658_06a,CARPENLLLLLWYFDVW,True,LLLLLWYFDVW,-112.82,137442-BRDSLLUDLS,3VRL_1873658_06a,3VRL
1,7116990_04a,CARGLLLLLWYFDVW,True,LLLLLWYFDVW,-112.82,137442-BRDSLLUDLS,3VRL_7116990_04a,3VRL


Are all the aminoacids used in `Slide`? Yes.

In [3]:
def get_aminoacids_list(df) -> List[str]:
    """An utility to extract all used aminoacids in a df['Slide']."""
    aminoacids = reduce(lambda s1, s2: set(s1).union(set(s2)), df["Slide"].to_list(), set([]))
    aminoacids = list(aminoacids)
    print(f"Amino acids used (N={len(aminoacids)}): {aminoacids}")
    return aminoacids

get_aminoacids_list(df);

Amino acids used (N=20): ['D', 'I', 'L', 'V', 'E', 'H', 'W', 'S', 'G', 'C', 'P', 'Y', 'Q', 'R', 'T', 'M', 'K', 'N', 'A', 'F']


One-hot encoding of `Slide`.

In [4]:
def get_one_hot_aa_encoder(aminoacids: List[str] = config.SLIDE_AMINOACIDS):
    """Get a OneHotEncoder fitted to the aminoacids characters used in the `Slide`s of `700k dataset`.

    Args:
        aminoacids (List[str], optional): default list of amino acids used in `Slide`. Defaults to config.SLIDE_AMINOACIDS.

    Returns:
        OneHotEncoder: fitted, ready to transform sequences.
    """    
    aa_as_array = np.array(aminoacids)
    encoder = OneHotEncoder(sparse=False).fit(aa_as_array.reshape(-1, 1))
    return encoder


def onehot_encode(
    string: str, 
    encoder = get_one_hot_aa_encoder()
    ) -> np.array:
    """Encode a string to onehot numpy array with shape (-1).

    Args:
        string (str)
        encoder (_type_, optional): fitted encoder. Defaults to get_one_hot_aa_encoder().

    Returns:
        np.array
    """    
    string_as_array = np.array(list(string))
    string_as_onehot = encoder.transform(string_as_array.reshape(-1, 1))
    return string_as_onehot.reshape(-1)

In [6]:
df["Slide"][:10].apply(lambda s: onehot_encode(s))

0    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4    [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
5    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
6    [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
7    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
8    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...
9    [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
Name: Slide, dtype: object

In [None]:
# TODO
# reducer = umap.UMAP()