# Generating Open Datasets.

In this notebook we explore the generation of open datasets.

In [1]:
from itertools import combinations
from pathlib import Path
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing

import NegativeClassOptimization.utils as utils
import NegativeClassOptimization.config as config
import NegativeClassOptimization.ml as ml
import NegativeClassOptimization.preprocessing as nco_preprocessing

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv(config.DATA_SLACK_1_GLOBAL, sep='\t')
df = df.groupby("Antigen").sample(10)
df.head(3)

Unnamed: 0,ID_slide_Variant,CDR3,Best,Slide,Energy,Structure,UID,Antigen
426075,4349486_03a,CARLRWLLLYYAMDYW,True,LRWLLLYYAMD,-98.8,124837-RRURSUUSLD,1ADQ_4349486_03a,1ADQ
450257,5108550_01a,CARALLLWFAYW,True,ARALLLWFAYW,-95.89,124837-RRURSUUSLD,1ADQ_5108550_01a,1ADQ
485497,6181962_02a,CASLLWLRREYYYAMDYW,True,SLLWLRREYYY,-94.36,128868-LSRRLLSUSU,1ADQ_6181962_02a,1ADQ


NDB1

In [3]:
df_ndb1_open = df.loc[df["Antigen"].isin(config.ANTIGENS_OPENSET)].copy()

for (ag_pos, ag_neg) in combinations(config.ANTIGENS_CLOSEDSET, 2):
    dataset_id = f"NDB1_{ag_pos}-vs-{ag_neg}"
    print(dataset_id)
    df_ndb1_i = df.loc[df["Antigen"].isin([ag_pos, ag_neg])].copy()
    df_ndb1_open_i = df_ndb1_open.loc[~df_ndb1_open["Slide"].isin(df_ndb1_i["Slide"])].copy()

    assert sum(df_ndb1_open_i["Slide"].isin(df_ndb1_i["Slide"])) == 0, (
        "Open set contains `Slide` sequences from the closed set."
    )
    
    data_for_pytorch = ml.preprocess_data_for_pytorch_binary(df_ndb1_i, ag_pos=[ag_pos], scale_onehot=True)
    break

NDB1_1FBI-vs-1NSN


NDBK

In [4]:
df_ndbk_open = df.loc[df["Antigen"].isin(config.ANTIGENS_OPENSET)].copy()

for ag_pos in config.ANTIGENS_CLOSEDSET:
    dataset_id = f"NDBK_{ag_pos}-vs-rest"
    print(dataset_id)
    df_ndbk_i = df.loc[df["Antigen"].isin(config.ANTIGENS_CLOSEDSET)].copy()
    df_ndbk_open_i = df_ndb1_open.loc[~df_ndb1_open["Slide"].isin(df_ndbk_i["Slide"])].copy()

    assert sum(df_ndbk_open_i["Slide"].isin(df_ndbk_i["Slide"])) == 0, (
        "Open set contains `Slide` sequences from the closed set."
    )

    data_for_pytorch = ml.preprocess_data_for_pytorch_binary(df_ndbk_i, ag_pos=[ag_pos], scale_onehot=True)
    break

NDBK_1FBI-vs-rest


OSK

In [12]:
df_osk_open = df.loc[df["Antigen"].isin(config.ANTIGENS_OPENSET)].copy()

dataset_id = f"OSK"
print(dataset_id)

df_osk = df.loc[df["Antigen"].isin(config.ANTIGENS_CLOSEDSET)].copy()
df_osk_open = df_osk_open.loc[~df_osk_open["Slide"].isin(df_osk["Slide"])].copy()

assert sum(df_osk_open["Slide"].isin(df_osk["Slide"])) == 0, (
    "Open set contains `Slide` sequences from the closed set."
)

data_for_pytorch = ml.preprocess_data_for_pytorch_multiclass(df_osk, scale_onehot=True)

OSK


NDM1

In [6]:
df_ndm1_open = df.loc[df["Antigen"].isin(config.ANTIGENS_OPENSET)].copy()

for ag_neg in config.ANTIGENS_CLOSEDSET:
    dataset_id = f"NDM1_rest-vs-{ag_neg}"
    print(dataset_id)
    df_ndm1_i = df.loc[df["Antigen"].isin(config.ANTIGENS_CLOSEDSET)].copy()
    df_ndm1_open_i = df_ndm1_open.loc[~df_ndm1_open["Slide"].isin(df_ndm1_i["Slide"])].copy()

    assert sum(df_ndm1_open_i["Slide"].isin(df_ndm1_i["Slide"])) == 0, (
        "Open set contains `Slide` sequences from the closed set."
    )

    ag_pos = list(set(config.ANTIGENS_CLOSEDSET) - set([ag_neg]))
    data_for_pytorch = ml.preprocess_data_for_pytorch_binary(df_ndm1_i, ag_pos=ag_pos, scale_onehot=True)
    break

NDM1_rest-vs-1FBI


NDMK

In [7]:
df_ndmk_open = df.loc[df["Antigen"].isin(config.ANTIGENS_OPENSET)].copy()

for ag_pos_set in combinations(config.ANTIGENS_CLOSEDSET, 3):
    string_repr = "-".join(sorted(ag_pos_set))
    dataset_id = f"NDM1_{string_repr}-vs-rest"
    print(dataset_id)
    df_ndmk_i = df.loc[df["Antigen"].isin(config.ANTIGENS_CLOSEDSET)].copy()
    df_ndmk_open_i = df_ndmk_open.loc[~df_ndmk_open["Slide"].isin(df_ndmk_i["Slide"])].copy()

    assert sum(df_ndmk_open_i["Slide"].isin(df_ndmk_i["Slide"])) == 0, (
        "Open set contains `Slide` sequences from the closed set."
    )

    ag_pos = list(set(config.ANTIGENS_CLOSEDSET) - set([ag_neg]))
    data_for_pytorch = ml.preprocess_data_for_pytorch_binary(df_ndmk_i, ag_pos=ag_pos, scale_onehot=True)
    break

NDM1_1FBI-1NSN-1OB1-vs-rest
