In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from tqdm import tqdm
from os.path import join
import torch
from selfpeptide.utils.data_utils import load_immunogenicity_dataframes, BetaDistributionDataset, SequencesInteractionDataset
from selfpeptide.utils.training_utils import get_class_weights, find_optimal_sf_quantile, CustomKL_Loss
from selfpeptide.utils.training_utils import lr_schedule, warmup_constant_lr_schedule, eval_regression_metrics, eval_classification_metrics, CustomCMT_AllTriplets_Loss
from selfpeptide.utils.function_utils import get_alpha, get_beta
from selfpeptide.utils.model_utils import load_binding_model, load_sns_model
from selfpeptide.model.immunogenicity_classifier import JointPeptidesNetwork


In [4]:
# Run folder to inspect; read the config.json stored inside it.
folder = "../outputs/immunogenicity_models/expert-sweep-16"
config_path = join(folder, "config.json")
with open(config_path, "r") as config_file:
    config = json.load(config_file)

In [19]:
# Root of the local SelfPeptides checkout. Every path below is built from it,
# so running the notebook on another machine means editing this one line.
PROJECT_ROOT = "/home/gvisona/Projects/SelfPeptides"
processed_dir = join(PROJECT_ROOT, "processed_data")
trained_models_dir = join(PROJECT_ROOT, "trained_models")

# Point the data / model entries of the run config at files on this machine.
config.update({
    "immunogenicity_df": join(processed_dir, "Immunogenicity", "Processed_TCell_IEDB_Beta_noPrior.csv"),
    "pseudo_seq_file": join(PROJECT_ROOT, "data", "NetMHCpan_pseudoseq", "MHC_pseudo.dat"),
    "dhlap_df": join(processed_dir, "Immunogenicity", "DHLAP_immunogenicity_data.csv"),
    "hdf5_dataset": join(processed_dir, "Self_nonSelf", "pre_tokenized_peptides_dataset.hdf5"),
    "binding_model_config": join(trained_models_dir, "binding_model", "config.json"),
    "binding_model_checkpoint": join(trained_models_dir, "binding_model", "checkpoints", "001_checkpoint.pt"),
    "sns_model_config": join(trained_models_dir, "sns_model", "config.json"),
    "sns_model_checkpoint": join(trained_models_dir, "sns_model", "checkpoints", "001_checkpoint.pt"),
    "pretrained_aa_embeddings": join(processed_dir, "aa_embeddings", "normalized_learned_BA_AA_embeddings.npy"),
})

    

In [20]:

# Load the four frames (train / val / test splits plus the DHLAP frame)
# from the files referenced in `config`.
print("Loading immunogenicity data")
immunogenicity_splits = load_immunogenicity_dataframes(config)
train_df, val_df, test_df, dhlap_imm_df = immunogenicity_splits

Loading immunogenicity data
Applying chosen prior..


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30125/30125 [00:04<00:00, 6926.07it/s]


IEDB N. training samples: 23062
IEDB N. val samples: 2555
IEDB N. test samples: 4508


In [21]:
# Preview the training split returned by load_immunogenicity_dataframes.
train_df

Unnamed: 0,Peptide,HLA,Qualitative Measurement,Number of Subjects Tested,Number of Subjects Positive,Peptide length,Alpha,Beta,Distr. Mean,Distr. Variance,Allele Pseudo-sequence,Target,Sample,Peptide Length,Obs. Mean,Obs. Variance,Stratification_index
24546,RELLLEIIY,HLA-B37:01,Negative,1.0,0.0,9,1.0,37.0,0.026316,0.000657,YHSTYREISTNTYEDTLYIRSNFYTWAVDAYTWY,0,RELLLEIIY_HLA-B37:01,9,0.333333,0.055556,HLA-B37:01_0
10637,VTFRERYSY,HLA-A29:02,Negative,3.0,0.0,9,1.0,39.0,0.025000,0.000595,YTAMYLQNVAQTDANTLYIMYRDYTWAVLAYTWY,0,VTFRERYSY_HLA-A29:02,9,0.200000,0.026667,HLA-A29:02_0
21877,NWIQINFHI,HLA-A23:01,Negative,1.0,0.0,9,1.0,37.0,0.026316,0.000657,YSAMYEEKVAHTDENIAYLMFHYYTWAVLAYTGY,0,NWIQINFHI_HLA-A23:01,9,0.333333,0.055556,HLA-A23:01_0
11909,GFYMISLLRK,HLA-A03:01,Negative,3.0,0.0,10,1.0,39.0,0.025000,0.000595,YFAMYQENVAQTDVDTLYIIYRDYTWAELAYTWY,0,GFYMISLLRK_HLA-A03:01,10,0.200000,0.026667,HLA-A03:01_0
1197,KLLEQWNLV,HLA-A02:01,Positive-Low,24.0,24.0,9,56.0,8.0,0.875000,0.001683,YFAMYGEKVAHTHVDTLYVRYHYYTWAVLAYTWY,1,KLLEQWNLV_HLA-A02:01,9,0.961538,0.001370,HLA-A02:01_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28593,RPSPIGYLGL,HLA-B07:06,Positive-High,1.0,1.0,10,49.0,4.0,0.924528,0.001292,YYSEYRNIYAQTDESNLYLSYNYYTWAERAYEWY,1,RPSPIGYLGL_HLA-B07:06,10,0.666667,0.055556,HLA-B07:06_1
28911,RYYDGNIYEL,HLA-A24:07,Positive,1.0,1.0,10,37.0,4.0,0.902439,0.002096,YSAMYEEKVAQTDENIAYLMFHYYTWAVQAYTGY,1,RYYDGNIYEL_HLA-A24:07,10,0.666667,0.055556,HLA-A24:07_1
29098,VRMVMMTHF,HLA-B27:01,Negative,1.0,0.0,9,1.0,37.0,0.026316,0.000657,YHTEYREICAKTYENTAYLNYHDYTWAVLAYEWY,0,VRMVMMTHF_HLA-B27:01,9,0.333333,0.055556,HLA-B27:01_0
29264,ILKEPVHGVY,HLA-B15:10,Positive,1.0,1.0,10,37.0,4.0,0.902439,0.002096,YYSEYRNICTNTYESNLYLRYDYYTWAELAYLWY,1,ILKEPVHGVY_HLA-B15:10,10,0.666667,0.055556,HLA-B15:10_1


In [22]:
# Preview the test split returned by load_immunogenicity_dataframes.
test_df

Unnamed: 0,Peptide,HLA,Qualitative Measurement,Number of Subjects Tested,Number of Subjects Positive,Peptide length,Alpha,Beta,Distr. Mean,Distr. Variance,Allele Pseudo-sequence,Target,Sample,Peptide Length,Obs. Mean,Obs. Variance,Stratification_index
1862,DTDFVNEFY,HLA-A01:01,Positive,16.0,7.0,9,43.0,13.0,0.767857,0.003127,YFAMYQENMAHTDANTLYIIYRDYTWVARVYRGY,1,DTDFVNEFY_HLA-A01:01,9,0.444444,0.012995,HLA-A01:01_1
28000,PETSSKKPDY,HLA-B49:01,Negative,1.0,0.0,10,1.0,37.0,0.026316,0.000657,YHTKYREISTNTYENIAYWRYNLYTWAELAYLWY,0,PETSSKKPDY_HLA-B49:01,10,0.333333,0.055556,HLA-B49:01_0
25504,ITSKVKVINY,HLA-A30:02,Negative,1.0,0.0,10,1.0,37.0,0.026316,0.000657,YSAMYQENVAHTDENTLYIIYEHYTWARLAYTWY,0,ITSKVKVINY_HLA-A30:02,10,0.333333,0.055556,HLA-A30:02_0
10860,LSNGGLPAY,HLA-A29:02,Negative,3.0,0.0,9,1.0,39.0,0.025000,0.000595,YTAMYLQNVAQTDANTLYIMYRDYTWAVLAYTWY,0,LSNGGLPAY_HLA-A29:02,9,0.200000,0.026667,HLA-A29:02_0
16844,YEALYYVHSL,HLA-B40:02,Negative,2.0,0.0,10,1.0,38.0,0.025641,0.000625,YHTKYREISTNTYESNLYLSYNYYTWAVLAYEWY,0,YEALYYVHSL_HLA-B40:02,10,0.250000,0.037500,HLA-B40:02_0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3092,QIYKTPPIK,HLA-A03:01,Negative,10.0,0.0,9,1.0,46.0,0.021277,0.000434,YFAMYQENVAQTDVDTLYIIYRDYTWAELAYTWY,0,QIYKTPPIK_HLA-A03:01,9,0.083333,0.005876,HLA-A03:01_0
14245,PFESPNFTK,HLA-A11:01,Negative,2.0,0.0,9,1.0,38.0,0.025641,0.000625,YYAMYQENVAQTDVDTLYIIYRDYTWAAQAYRWY,0,PFESPNFTK_HLA-A11:01,9,0.250000,0.037500,HLA-A11:01_0
7318,NYMCVDNHL,HLA-A24:02,Negative,5.0,0.0,9,1.0,41.0,0.023810,0.000541,YSAMYEEKVAHTDENIAYLMFHYYTWAVQAYTGY,0,NYMCVDNHL_HLA-A24:02,9,0.142857,0.015306,HLA-A24:02_0
16047,VSDQFSVEF,HLA-C05:01,Negative,2.0,0.0,9,1.0,38.0,0.025641,0.000625,YYAGYREKYRQTDVNKLYLRYNFYTWAERAYTWY,0,VSDQFSVEF_HLA-C05:01,9,0.250000,0.037500,HLA-C05:01_0


In [23]:
# Build the joint network on CPU; the binding and SNS sub-models are
# configured from the config / checkpoint paths stored in `config`.
device = "cpu"
pretrained_checkpoints = {
    "binding_checkpoint": config["binding_model_checkpoint"],
    "sns_checkpoint": config["sns_model_checkpoint"],
}
model = JointPeptidesNetwork(
    config,
    config["binding_model_config"],
    config["sns_model_config"],
    **pretrained_checkpoints,
    device=device,
)

In [29]:
# Restore the trained weights of the joint network from the run's 001 checkpoint.
checkpoint_path = join(folder, "checkpoints", "001_checkpoint.pt")
# Map the stored tensors onto the same `device` the model was built on,
# rather than a second, separately hard-coded CPU device.
state_dict = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(state_dict)
# Switch the WHOLE network to eval mode. The binding and SNS sub-models are
# called directly in later cells, so toggling only immunogenicity_model would
# leave any dropout layers inside them in training mode (unless their
# constructors already put them in eval mode — not visible from here).
model.eval()
# Keep displaying the immunogenicity classifier architecture as the cell output.
model.immunogenicity_model

ImmunogenicityClassifier(
  (immunogenicity_aa_embedder): PeptideEmbedder(
    (tokenizer): AA_Tokenizer()
    (aa_embs): Embedding(23, 512)
    (transformer_encoder): TransformerEncoder(
      (pos_encoding): PositionalEncoding(
        (dropout): Dropout(p=0.05, inplace=False)
      )
      (dropout): Dropout(p=0.05, inplace=False)
      (encoder_layers): ModuleList(
        (0-1): 2 x TEncoderLayer(
          (multihead_attention): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (dropout1): Dropout(p=0.05, inplace=False)
          (res_norm1): ResNorm(
            (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          )
          (feed_forward): Sequential(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): ReLU()
            (2): Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout2): Dropout(p=0.05, inpl

In [33]:
# Restrict the experiment to the first ten test samples and pull out the
# peptide sequences and the matching HLA pseudo-sequences as plain lists.
df = test_df.head(10)
peptides = df["Peptide"].tolist()
hlas = df["Allele Pseudo-sequence"].tolist()

In [34]:
# Run the binding sub-model on the peptide/HLA pairs: it yields a score plus
# a (peptide embeddings, HLA embeddings) pair.
binding_score, binding_embs = model.binding_model(peptides, hlas)
binding_peptides_embs, binding_hlas_embs = binding_embs

# Run the self/non-self sub-model on the peptides, also requesting its score.
sns_outputs = model.sns_model(peptides, return_sns_score=True)
sns_peptides_projections, sns_peptides_embs, sns_scores = sns_outputs
        

In [36]:
# Score the ten peptide/HLA pairs with the immunogenicity classifier, feeding
# it the embeddings and scores produced by the binding and SNS sub-models.
# This is pure inference, so run it under no_grad: otherwise the result tracks
# gradients (a grad_fn shows up in its repr) and an autograd graph is kept
# alive for no reason.
with torch.no_grad():
    output = model.immunogenicity_model(
        peptides,
        hlas,
        binding_peptides_embs,
        binding_hlas_embs,
        sns_peptides_embs,
        binding_score,
        sns_scores,
    )

In [37]:
# Inspect the raw predictions: one row per peptide/HLA pair in `df`.
# NOTE(review): the two columns are presumably the predicted mean and variance
# of the immunogenicity distribution — confirm against ImmunogenicityClassifier.
output

tensor([[0.0492, 0.0029],
        [0.0181, 0.0010],
        [0.0205, 0.0010],
        [0.0155, 0.0011],
        [0.0375, 0.0027],
        [0.1317, 0.0086],
        [0.0389, 0.0017],
        [0.0287, 0.0013],
        [0.0248, 0.0011],
        [0.0401, 0.0016]], grad_fn=<CopySlices>)

In [38]:
# Labels of the same ten samples, for comparison with `output`.
df["Target"]

1862     1
28000    0
25504    0
10860    0
16844    0
12787    0
28149    0
20650    0
18471    0
24844    0
Name: Target, dtype: int64

In [39]:
# Peptide of the first test sample (the only one of the ten with Target == 1);
# it is the base sequence for the variants built in the next cell.
"DTDFVNEFY"

'DTDFVNEFY'

In [52]:
# Sub-sequences of the first test peptide. Only the peptide string varies:
# the HLA and every upstream embedding / score are copied from sample 0, so
# the classifier sees identical binding and SNS inputs for every variant.
peptides2 = [
    "DTDFVNEFY",
    "DTDFVNE",
    "DFVNEFY",
    "DDVEY",
    "TFNF",
    "DFVNE",
    "TDFVNEF",
]
n_variants = len(peptides2)


def _tile_first(tensor):
    """Return the first entry of `tensor` repeated via .repeat((n_variants, 1))."""
    return tensor[0].repeat((n_variants, 1))


hlas2 = [hlas[0]] * n_variants
binding_peptides_embs2 = _tile_first(binding_peptides_embs)
binding_hlas_embs2 = _tile_first(binding_hlas_embs)
sns_peptides_embs2 = _tile_first(sns_peptides_embs)
binding_score2 = _tile_first(binding_score)
sns_scores2 = _tile_first(sns_scores)

# Sanity check: there must be as many HLA entries as peptide variants.
print(len(peptides2), len(hlas2))
binding_peptides_embs2

7 7


tensor([[ 0.1216,  0.7268,  0.8552,  ..., -0.3487,  0.7804, -0.1403],
        [ 0.1216,  0.7268,  0.8552,  ..., -0.3487,  0.7804, -0.1403],
        [ 0.1216,  0.7268,  0.8552,  ..., -0.3487,  0.7804, -0.1403],
        ...,
        [ 0.1216,  0.7268,  0.8552,  ..., -0.3487,  0.7804, -0.1403],
        [ 0.1216,  0.7268,  0.8552,  ..., -0.3487,  0.7804, -0.1403],
        [ 0.1216,  0.7268,  0.8552,  ..., -0.3487,  0.7804, -0.1403]])

In [53]:
binding_peptides_embs

tensor([[ 0.1216,  0.7268,  0.8552,  ..., -0.3487,  0.7804, -0.1403],
        [-0.3316, -0.2180, -0.2093,  ..., -0.4204,  0.2032,  0.0131],
        [ 0.5682, -0.4412,  0.6588,  ...,  0.7988,  0.4902, -0.5472],
        ...,
        [-0.8319,  0.8530,  0.4181,  ..., -0.3463,  0.7489, -0.1277],
        [-2.1591,  0.8826, -0.4542,  ..., -0.3240,  0.2581,  0.4338],
        [-0.4487,  0.2386,  0.0709,  ...,  0.0581, -0.0285,  1.3244]])

In [55]:
# Score the peptide variants while holding the HLA and all binding / SNS
# inputs fixed to those of the first test sample.
# This is pure inference, so run it under no_grad: otherwise the result tracks
# gradients (a grad_fn shows up in its repr) and an autograd graph is kept
# alive for no reason.
with torch.no_grad():
    output2 = model.immunogenicity_model(
        peptides2,
        hlas2,
        binding_peptides_embs2,
        binding_hlas_embs2,
        sns_peptides_embs2,
        binding_score2,
        sns_scores2,
    )
output2

tensor([[0.0492, 0.0029],
        [0.0481, 0.0027],
        [0.0711, 0.0046],
        [0.0345, 0.0018],
        [0.0791, 0.0051],
        [0.0614, 0.0038],
        [0.0727, 0.0047]], grad_fn=<CopySlices>)