In [1]:
import json
import torch
import os
import pandas as pd
import numpy as np
from selfpeptide.model.binding_affinity_classifier import Peptide_HLA_BindingClassifier

In [5]:
def load_binding_model(folder, device="cpu"):
    """Rebuild the trained peptide-HLA binding classifier stored in ``folder``.

    ``folder`` must contain ``config.json`` (the configuration handed to the
    model constructor) and ``checkpoint.pt`` (passed to ``load_state_dict``).

    Args:
        folder: Directory holding the two files above.
        device: Forwarded to the model constructor and used as ``map_location``
            when reading the checkpoint. Defaults to ``"cpu"``.

    Returns:
        A ``Peptide_HLA_BindingClassifier`` with the checkpoint weights loaded.
    """
    config_path = os.path.join(folder, "config.json")
    with open(config_path, "r") as config_file:
        model_config = json.load(config_file)

    # Override whatever the saved config holds for this key. Presumably this
    # stops the constructor from looking for a pretrained-embedding file, as
    # the checkpoint is loaded right after — TODO confirm against the model code.
    model_config["pretrained_aa_embeddings"] = "none"

    classifier = Peptide_HLA_BindingClassifier(model_config, device)

    # NOTE(review): depending on the installed torch version's `weights_only`
    # default, torch.load may unpickle arbitrary objects; only load trusted files.
    state_dict = torch.load(
        os.path.join(folder, "checkpoint.pt"), map_location=device
    )
    classifier.load_state_dict(state_dict)
    return classifier

In [6]:
# Load the trained classifier. No device is passed, so the loader's default
# ("cpu") applies.
binding_model_dir = "../trained_models/binding_model"
binding_model = load_binding_model(binding_model_dir)

In [7]:
# Show the module tree of the loaded classifier.
binding_model

Peptide_HLA_BindingClassifier(
  (aa_sequence_embedder): PeptideEmbedder(
    (tokenizer): AA_Tokenizer()
    (aa_embs): Embedding(23, 512, padding_idx=22)
    (transformer_encoder): TransformerEncoder(
      (pos_encoding): PositionalEncoding(
        (dropout): Dropout(p=0.05, inplace=False)
      )
      (dropout): Dropout(p=0.05, inplace=False)
      (encoder_layers): ModuleList(
        (0-1): 2 x TEncoderLayer(
          (multihead_attention): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (dropout1): Dropout(p=0.05, inplace=False)
          (res_norm1): ResNorm(
            (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          )
          (feed_forward): Sequential(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): ReLU()
            (2): Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout2): Dropo

In [None]:
# Inspect the tokenizer's vocabulary mapping (token -> integer index, judging by its name).
binding_model.aa_sequence_embedder.tokenizer.token2idx

In [None]:
# Read the saved model configuration on its own (same file and same override
# that load_binding_model applies) so it can be inspected and reused below.
config_path = os.path.join("../trained_models/binding_model", "config.json")
with open(config_path, "r") as config_file:
    config = json.load(config_file)
config["pretrained_aa_embeddings"] = "none"
config


In [None]:
# Build a second classifier straight from the config, without loading any checkpoint.
# NOTE(review): the constructor is called without a device here, unlike in
# load_binding_model — this relies on the constructor having a default for it; confirm.
model2 = Peptide_HLA_BindingClassifier(config)
model2

In [None]:
# Recover the vocabulary as a list ordered by index, so that tokens[i] is the
# token whose index is i.
token2idx = binding_model.aa_sequence_embedder.tokenizer.token2idx
# Invert the mapping: index -> token.
idx2token = dict(zip(token2idx.values(), token2idx.keys()))
# Walk the indices 0..n-1; a gap in the index range raises KeyError here.
tokens = [idx2token[row] for row in range(len(idx2token))]
tokens

In [None]:
# Pull the learned amino-acid embedding matrix out of the model as a NumPy array.
# .detach() drops the autograd graph. .cpu() is a no-op for weights already on
# the CPU, but keeps this cell working if the model was loaded onto another
# device (Tensor.numpy() only accepts CPU tensors).
learned_aa_embeddings = (
    binding_model.aa_sequence_embedder.aa_embs.weight.detach().cpu().numpy()
)

In [None]:
# Persist the raw embedding matrix. This is the first write into the output
# directory, so create it if needed: np.save does not create missing parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs("../processed_data/aa_embeddings", exist_ok=True)
np.save("../processed_data/aa_embeddings/learned_BA_AA_embeddings.npy", learned_aa_embeddings)

In [None]:
# Label each embedding row with its token so the table is self-describing.
# Relies on `tokens` being ordered by vocabulary index, i.e. the same order as
# the rows of the embedding weight matrix.
embeddings_df = pd.DataFrame(learned_aa_embeddings, index=tokens)
embeddings_df

In [None]:
# Also store the token-labelled table as CSV (the token labels go in the first column).
embeddings_df.to_csv("../processed_data/aa_embeddings/learned_BA_AA_embeddings.csv")

In [None]:
# Reload the embeddings from disk; index_col=0 restores the token labels as the index.
# Note this rebinds `embeddings_df` to the CSV round-tripped copy.
embeddings_df = pd.read_csv("../processed_data/aa_embeddings/learned_BA_AA_embeddings.csv", index_col=0)
embeddings_df

In [None]:
# Preview the row-wise L2-normalised embeddings.
# The 1e-10 floor is applied to the *norm* (not to the individual components)
# so that all-zero rows divide safely while negative components still
# contribute to the norm.
embeddings_df.values / np.maximum(np.linalg.norm(embeddings_df.values, axis=1), 1e-10)[:, np.newaxis]

In [None]:
# L2-normalise every embedding row.
# The previous form clamped each *component* to >= 1e-10 before taking the norm,
# which discarded all negative components and produced rows that were not unit
# length. The floor belongs on the norm itself: it only guards against division
# by zero for all-zero rows, which stay all-zero.
row_norms = np.linalg.norm(embeddings_df.values, axis=1, keepdims=True)
norm_embeddings = embeddings_df.values / np.maximum(row_norms, 1e-10)
norm_embeddings

In [None]:
# Persist the normalised embedding matrix next to the raw one.
np.save("../processed_data/aa_embeddings/normalized_learned_BA_AA_embeddings.npy", norm_embeddings)

In [None]:
# Wrap the normalised matrix in a DataFrame, reusing the token labels of embeddings_df as the index.
norm_embeddings_df = pd.DataFrame(norm_embeddings, index=embeddings_df.index)
norm_embeddings_df

In [None]:
# Store the token-labelled normalised table as CSV.
norm_embeddings_df.to_csv("../processed_data/aa_embeddings/normalized_learned_BA_AA_embeddings.csv")

In [None]:
# Re-read the un-normalised embeddings CSV (same file as loaded earlier in the notebook)
# so the check below starts from what is on disk.
embeddings_df = pd.read_csv("../processed_data/aa_embeddings/learned_BA_AA_embeddings.csv", index_col=0)
embeddings_df

In [None]:
# Plain NumPy view of the table values (token labels dropped).
embeddings = embeddings_df.values

In [None]:
# L2 norm of each row of the un-normalised embeddings (one value per token).
np.linalg.norm(embeddings, axis=1)