# 01c2 Epitopes into MiniAbsolut

We generate "new" antigens in MiniAbsolut and in the MiniAbsolut splits; these are in fact nothing other than individual epitopes of the existing antigens. We follow the same code pattern used in 01b, where we integrated experimental data into MiniAbsolut.

Plan: for each MiniAbsolut antigen and for each sequence type (high, weak, nonb), we combine train_15 + rest, select 15k sequences according to epitope/hotspot, check whether it makes sense to have extra splits (i.e. whether there is enough data), and generate new train_15* and rest* sets accordingly. The test set remains unchanged. Subsets of the test set based on its epitope can be analysed later.

In [4]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

import shutil

from NegativeClassOptimization import ml
from NegativeClassOptimization import utils, config
from NegativeClassOptimization import preprocessing



In [5]:
# Source MiniAbsolut antigen -> (name of the derived epitope-specific antigen,
# value of the `seqAGEpitope` column that selects the epitope's sequences).
epitope_based_ags_map = {}
epitope_based_ags_map["1WEJ"] = (
    "1WEJE1",
    "F1G2K2K1N1G3I1T2W1K2T1Y1A1T1N1",
)
epitope_based_ags_map["1H0D"] = (
    "1H0DE1",
    "P1Q1G1R1I2S1S2S1F1Q2V1G1F1V1H1L1F1",
)
epitope_based_ags_map["1OB1"] = (
    "1OB1E1",
    "S1N1S1G1L3V1N2K1I2C2C1P1F2D2",
)

In [6]:
# For every entry of `epitope_based_ags_map`, build an epitope-restricted
# "antigen" directory in MiniAbsolut:
#   1. load train_15000 + rest energy-contribution tables for the three binder
#      types (high, looserX, 95low) of the source antigen,
#   2. keep only the rows whose `seqAGEpitope` equals the selected epitope,
#   3. re-draw 15000 training rows per binder type, the remainder becomes rest,
#   4. write the new splits (plain and energy-contribution variants) and copy
#      the original test files unchanged.
for key, value in epitope_based_ags_map.items():

    print(key)

    ag = key
    ag_new, seqAGEpitope = value

    base_path = config.DATA_MINIABSOLUT / f"{ag}/energy_contributions"

    # header=1: column names are read from the second line of each file
    # (presumably the first line is a preamble -- confirm against the data).
    df_high_train = pd.read_csv(base_path / "high_train_15000_absolut_energy_contributions.tsv", sep='\t', header=1)
    df_high_rest = pd.read_csv(base_path / "high_rest_absolut_energy_contributions.tsv", sep='\t', header=1)

    df_weak_train = pd.read_csv(base_path / "looserX_train_15000_absolut_energy_contributions.tsv", sep='\t', header=1)
    df_weak_rest = pd.read_csv(base_path / "looserX_rest_absolut_energy_contributions.tsv", sep='\t', header=1)

    df_nonb_train = pd.read_csv(base_path / "95low_train_15000_absolut_energy_contributions.tsv", sep='\t', header=1)
    df_nonb_rest = pd.read_csv(base_path / "95low_rest_absolut_energy_contributions.tsv", sep='\t', header=1)

    # Tag every row with its binder type so the splits can be rebuilt after
    # train and rest have been pooled.
    binder_high = f"{ag}_high"
    binder_weak = f"{ag}_looserX"
    binder_nonb = f"{ag}_95low"

    df_high_train["binder_type"] = binder_high
    df_high_rest["binder_type"] = binder_high

    df_weak_train["binder_type"] = binder_weak
    df_weak_rest["binder_type"] = binder_weak

    df_nonb_train["binder_type"] = binder_nonb
    df_nonb_rest["binder_type"] = binder_nonb

    # Concatenate all.
    # ignore_index=True is essential: every file carries its own 0..n-1 index,
    # so without it index labels are duplicated between the train and rest
    # frames and the index-based "rest = not in train" selection below would
    # also discard rows that merely share a label with a sampled row.
    df = pd.concat(
        [
            df_high_train,
            df_high_rest,
            df_weak_train,
            df_weak_rest,
            df_nonb_train,
            df_nonb_rest,
        ],
        ignore_index=True,
    )

    # Keep only the sequences binding the selected epitope. `.copy()` makes the
    # result an independent frame so that the column assignment below is not a
    # chained assignment on a filtered view.
    df = df.query("seqAGEpitope == @seqAGEpitope").copy()
    df["Antigen"] = ag_new

    # Every binder type needs more than 15000 rows: 15000 for train plus a
    # non-empty rest. Reindex on the expected types so that a type that
    # disappeared completely after filtering counts as 0 instead of being
    # silently skipped by the groupby.
    counts = (
        df.groupby("binder_type")
        .size()
        .reindex([binder_high, binder_weak, binder_nonb], fill_value=0)
    )
    if not (counts > 15000).all():
        raise ValueError(
            f"Not enough sequences for epitope {seqAGEpitope} of {ag}: "
            f"{counts.to_dict()}"
        )

    # Rebuild the dataframes.
    # NOTE(review): sampling is unseeded, so the splits differ between runs;
    # consider passing `random_state` if reproducibility is required.
    is_high = df["binder_type"] == binder_high
    is_weak = df["binder_type"] == binder_weak
    is_nonb = df["binder_type"] == binder_nonb

    df_high_train = df.loc[is_high].sample(15000)
    df_high_rest = df.loc[is_high & ~df.index.isin(df_high_train.index)]
    df_weak_train = df.loc[is_weak].sample(15000)
    df_weak_rest = df.loc[is_weak & ~df.index.isin(df_weak_train.index)]
    df_nonb_train = df.loc[is_nonb].sample(15000)
    df_nonb_rest = df.loc[is_nonb & ~df.index.isin(df_nonb_train.index)]

    # Make the new directory in MiniAbsolut
    new_ag_dir = config.DATA_MINIABSOLUT / f"{ag_new}"
    new_ag_dir.mkdir(exist_ok=True)

    # Copy the test files from the original antigen; the test set is reused
    # as-is for the epitope-based antigen.
    for file in (config.DATA_MINIABSOLUT / f"{ag}").glob("*test*.tsv"):
        new_file = new_ag_dir / file.name
        shutil.copyfile(file, new_file)

    ## Save the new files in the main folder
    ## Columns for normal tsvs in MiniAbsolut
    cols_sel = ["ID_slide_Variant", "CDR3", "Best", "Slide", "Energy", "Structure", "Antigen"]
    df_high_train[cols_sel].to_csv(new_ag_dir / "high_train_15000.tsv", sep='\t', index=False)
    df_high_rest[cols_sel].to_csv(new_ag_dir / "high_rest.tsv", sep='\t', index=False)
    df_weak_train[cols_sel].to_csv(new_ag_dir / "looserX_train_15000.tsv", sep='\t', index=False)
    df_weak_rest[cols_sel].to_csv(new_ag_dir / "looserX_rest.tsv", sep='\t', index=False)
    df_nonb_train[cols_sel].to_csv(new_ag_dir / "95low_train_15000.tsv", sep='\t', index=False)
    df_nonb_rest[cols_sel].to_csv(new_ag_dir / "95low_rest.tsv", sep='\t', index=False)

    ###
    # Save the new files in the "*_energy_contributions" folder,
    # where other modules expect Absolut data regarding binding
    # energy.
    new_ag_energy_dir = new_ag_dir / "energy_contributions"
    new_ag_energy_dir.mkdir(exist_ok=True)

    # Copy the test files from the original antigen
    for file in (config.DATA_MINIABSOLUT / f"{ag}/energy_contributions").glob("*test*energy_contributions.tsv"):
        new_file = new_ag_energy_dir / file.name
        shutil.copyfile(file, new_file)

    # Energy-contribution files keep all columns (including `binder_type`).
    df_high_train.to_csv(new_ag_energy_dir / "high_train_15000_absolut_energy_contributions.tsv", sep='\t', index=False)
    df_high_rest.to_csv(new_ag_energy_dir / "high_rest_absolut_energy_contributions.tsv", sep='\t', index=False)
    df_weak_train.to_csv(new_ag_energy_dir / "looserX_train_15000_absolut_energy_contributions.tsv", sep='\t', index=False)
    df_weak_rest.to_csv(new_ag_energy_dir / "looserX_rest_absolut_energy_contributions.tsv", sep='\t', index=False)
    df_nonb_train.to_csv(new_ag_energy_dir / "95low_train_15000_absolut_energy_contributions.tsv", sep='\t', index=False)
    df_nonb_rest.to_csv(new_ag_energy_dir / "95low_rest_absolut_energy_contributions.tsv", sep='\t', index=False)


1WEJ
1H0D
1OB1
