In [31]:
import pandas as pd
import numpy as np
import re
import json
import os
from tqdm.notebook import tqdm

In [2]:
hla_prots = {}
hla_features = []

hla = None
prot = ""
with open("../data/IPD-IMGT_HLA/hla_prot.fasta", "r") as f:
    i = 0
    for line in f:
        ln = line.strip()
        if ln.startswith(">"):
            if hla is not None:
                hla_prots[hla] = prot
                prot = ""
            
            hla = ln.split(" ")[0][1:]
            hla_features.append(ln[1:].split(" "))
        else:
            prot += ln
    hla_prots[hla] = prot
    

In [3]:
# hla_prots

In [4]:
hla_features = pd.DataFrame(hla_features).iloc[:, :3]

hla_features

Unnamed: 0,0,1,2
0,HLA:HLA00001,A*01:01:01:01,365
1,HLA:HLA02169,A*01:01:01:02N,200
2,HLA:HLA14798,A*01:01:01:03,365
3,HLA:HLA15760,A*01:01:01:04,365
4,HLA:HLA16415,A*01:01:01:05,365
...,...,...,...
37418,HLA:HLA06641,TAP2*01:04,686
37419,HLA:HLA00962,TAP2*02:01:01,704
37420,HLA:HLA06635,TAP2*02:01:02:01,704
37421,HLA:HLA06638,TAP2*02:01:02:02,704


In [9]:
hla_features.columns = ["HLA_id", "HLA", "Bp"]
hla_features

Unnamed: 0,HLA_id,HLA,Bp
0,HLA:HLA00001,A*01:01:01:01,365
1,HLA:HLA02169,A*01:01:01:02N,200
2,HLA:HLA14798,A*01:01:01:03,365
3,HLA:HLA15760,A*01:01:01:04,365
4,HLA:HLA16415,A*01:01:01:05,365
...,...,...,...
25141,HLA:HLA32558,C*18:15,366
25142,HLA:HLA33923,C*18:16,366
25143,HLA:HLA36442,C*18:17,366
25144,HLA:HLA36862,C*18:18,366


In [10]:
hla_features[hla_features["HLA"].str.endswith("A*01:01")]

Unnamed: 0,HLA_id,HLA,Bp


In [11]:
# hla_features = hla_features[hla_features["HLA_allele"].str.contains(r"^[A-Ca-c].\d{2,3}:\d{2,3}$")]
hla_features = hla_features[hla_features["HLA"].str.contains(r"^[A-Ca-c]\*")]
hla_features = hla_features.sort_values(by="HLA").reset_index(drop=True)
hla_features

Unnamed: 0,HLA_id,HLA,Bp
0,HLA:HLA00001,A*01:01:01:01,365
1,HLA:HLA02169,A*01:01:01:02N,200
2,HLA:HLA14798,A*01:01:01:03,365
3,HLA:HLA15760,A*01:01:01:04,365
4,HLA:HLA16415,A*01:01:01:05,365
...,...,...,...
25141,HLA:HLA32558,C*18:15,366
25142,HLA:HLA33923,C*18:16,366
25143,HLA:HLA36442,C*18:17,366
25144,HLA:HLA36862,C*18:18,366


In [8]:
a0101_seqs = []
for hla_id in hla_features[hla_features["HLA_allele"].str.startswith("A*01:01")]["HLA_id"]:
    a0101_seqs.append(hla_prots[hla_id])
len(a0101_seqs)


259

In [12]:
pd.Series(a0101_seqs).value_counts()

MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQEGPEYWDQETRNMKAHSQTDRANLGTLRGYYNQSEDGSHTIQIMYGCDVGPDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAVHAAEQRRVYLEGRCVDGLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEITLTWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWELSSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRKSSDRKGGSYTQAASSDSAQGSDVSLTACKV    196
SHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQEGPEYWDQETRNMKAHSQTDRANLGTLRGYYNQSEDGSHTIQIMYGCDVGPDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAVHAAEQRRVYLEGRCVDGLRRYLENGKETLQRT                                                                                                                                                                                             48
SHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQEGPEYWDQETRNMKAHSQTDRANLGTLRGYYNQSEDGSHTIQIMYGCDVGPDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAVHAAEQRRVYLEGRCVDGLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEITLTWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQ

# Prot T5

In [40]:
from transformers import T5Tokenizer, T5EncoderModel
import torch
import pandas as pd
import numpy as np
import re
import json
from tqdm.notebook import tqdm

import h5py

In [10]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False)

# Load the model
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc").to(device)


In [13]:
hla_sequences = [hla_prots[p] for p in hla_features["HLA_id"]]
hla_sequences[:3]

['MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQEGPEYWDQETRNMKAHSQTDRANLGTLRGYYNQSEDGSHTIQIMYGCDVGPDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAVHAAEQLRAYLEGRCVDGLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEITLTWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWELSSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRKSSDRKGGSYTQAASSDSAQGSDVSLTACKV',
 'SHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQERPEYWDQETRNVKAHSQTDRENLGTLRGYYNQSEAGSHTIQIMYGCDVGPDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAVHAAEQRRVYLEGRCVDGLRRYLENGKETLQRT',
 'SHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQEGPEYWDQETRNMKAHSQTDRANLGTLRGYYNQSEDGSHTIQIMYGCDVGPDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAVHAAEQRRVYLEGWCVDGLRRYLENGKETLQRT']

In [17]:
hla_embeddings = []
for batch in tqdm(np.array_split(hla_sequences, 1300)):
        lengths = [len(p) for p in batch]
        tok_batch = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in batch]
        ids = tokenizer.batch_encode_plus(tok_batch, add_special_tokens=True, padding="longest")

        input_ids = torch.tensor(ids['input_ids']).to(device)
        attention_mask = torch.tensor(ids['attention_mask']).to(device)

        # generate embeddings
        with torch.no_grad():
            embedding_repr = model(input_ids=input_ids,attention_mask=attention_mask)
            
        for i, l in enumerate(lengths):
            seq_emb = embedding_repr.last_hidden_state[i,:l].mean(dim=0)
            # idx += 1
            hla_embeddings.append(seq_emb.numpy())
            # dset[idx,:]=seq_emb.numpy()
        break

  0%|          | 0/1300 [00:00<?, ?it/s]

In [20]:
hla_embeddings = np.vstack(hla_embeddings)

In [23]:
np.save("../processed_data/hla_embeddings.npy", hla_embeddings)
hla_features.to_csv("

array([1.7295066, 1.9745013, 1.9663278, 1.9899385, 1.7321826, 1.9711802,
       1.7215077, 1.723604 , 1.9752562, 1.7278218, 1.9821838],
      dtype=float32)

In [24]:
len(hla_embeddings1)

11

In [None]:
hla_embs = np.load("../data/NetMHCpan_pseudoseq/HLA_ProtT5_embeddings.npy")
hla_pseq_embs = np.mean(hla_embs[:,:-1,:], axis=1)
hla_pseq_embs.shape
np.save("../processed_data/HLA_embeddings/hla_pseudoseqs_ProtT5_embeddings.npy", hla_pseq_embs)

# Process and sort embeddings

In [13]:
hla_features

Unnamed: 0,HLA_id,HLA,Bp
0,HLA:HLA00001,A*01:01:01:01,365
1,HLA:HLA02169,A*01:01:01:02N,200
2,HLA:HLA14798,A*01:01:01:03,365
3,HLA:HLA15760,A*01:01:01:04,365
4,HLA:HLA16415,A*01:01:01:05,365
...,...,...,...
25141,HLA:HLA32558,C*18:15,366
25142,HLA:HLA33923,C*18:16,366
25143,HLA:HLA36442,C*18:17,366
25144,HLA:HLA36862,C*18:18,366


In [23]:
hla_features["Protein_sequence"] = hla_features["HLA_id"].map(hla_prots)
hla_features

Unnamed: 0,HLA_id,HLA,Bp,Protein_sequence
0,HLA:HLA00001,A*01:01:01:01,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
1,HLA:HLA02169,A*01:01:01:02N,200,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
2,HLA:HLA14798,A*01:01:01:03,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
3,HLA:HLA15760,A*01:01:01:04,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
4,HLA:HLA16415,A*01:01:01:05,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
...,...,...,...,...
25141,HLA:HLA32558,C*18:15,366,MRVMAPRALLLLLSGGLALTETWACSHSMRYFDTAVSRPGRGEPRF...
25142,HLA:HLA33923,C*18:16,366,MRVMAPRALLLLLSGGLALTETWACSHSMRYFDTAVSRPGRGEPRF...
25143,HLA:HLA36442,C*18:17,366,MRVMAPRALLLLLSGGLALTETWACSHSMRYCDTAVSRPGRGEPRF...
25144,HLA:HLA36862,C*18:18,366,MRVMAPRALLLLLSGGLALTETWACSHSMRYFDTAVSRPGRGEPRF...


In [22]:
# hla_prots

In [19]:
embeddings = np.load("../processed_data/HLA_embeddings/hla_proteins_ProtT5_embeddings.npy")
embeddings.shape

(25146, 1024)

In [24]:
nonunique_seq2idx = {s: i for i, s in enumerate(hla_features["Protein_sequence"])}

In [26]:
hla_protein_sequences = sorted(hla_features["Protein_sequence"].unique())
hla_protein_sequences[:4]

['AHSMRYFYTAVSRPGRGEPHFIAVGYVDDTQFVRFDSDAASPRGEPRAPWVEQEGPEYWDRETQKYKRQAQTDRVSLRNLRGYYNQSEAGSHIIQRMYGCDVGPDGRLLRGYDQYAYDGKDYIALNEDLRSWTAADTAAQITQRKWEAAREAEQLRAYLEGLCVEWLRRYLKNGKETLQRA',
 'ALALTETWAGSHSMRYFYTAMSRPGRGEPRFIAVGYVDDTQFVRFDSDAASPRMAPRAPWIEQEGPEYWDRETQISKTNTQTYRESLRNLRGYYNQSEAGSHTLQRMYGCDVGPDGRLLRGYDQSAYDGKDYIALNEDLSSWTAADTAAQITQRKWEAAREAEQWRAYLEGLCVEWLRRYLENGKETLQRADPPKTHVTHHPISDHEATLRCWALGFYPAEITLTWQRDGEDQTQDTELVETRPAGDRTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWEPSSQSTIPIVGIVAGLAVL',
 'ALALTETWAGSHSMRYFYTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPREEPRAPWIEQEGPEYWDRNTQICKTNTQTDRESLRNLRGYYNQSEAGSHTLQWMYGCDVGPDGRLLRGYNQFAYDGKDYIALNEDLSSWTAADTAAQITQRKWEAAREAEQLGAYLEGTCVEWLRRHLENGKETLQRADPPKTHVTHHPISDHEATLRCWALGFYPAEITLTWQRDGEDQTQDTELVETRPAGDRTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWEPSSQSTVPIVG',
 'APRTLLLLLSGALALTQTWAGSHSMRYFYTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRMEPRAPWIEQEGPEYWDQETRNVKAQSQTDRVDLGTLRGYYNQSEAGSHTIQIMYGCDVGSDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAAHEAEQLRAYLDGTCVEWLRRYLENGKETLQRTDP

In [27]:
len(hla_protein_sequences)

16427

In [28]:
seq2idx = {s: i for i, s in enumerate(hla_protein_sequences)}
idx2seq = {i: s for s, i in seq2idx.items()}

In [29]:
sequence_embeddings = []
for i in range(len(idx2seq)):
    seq = idx2seq[i]
    emb_idx = nonunique_seq2idx[seq]
    sequence_embeddings.append(embeddings[emb_idx])
sequence_embeddings = np.vstack(sequence_embeddings)
sequence_embeddings.shape
    

(16427, 1024)

In [30]:
np.save("../processed_data/HLA_embeddings/HLA_proteins_T5/hla_proteins_T5_embeddings.npy", sequence_embeddings)

In [33]:
with open("../processed_data/HLA_embeddings/HLA_proteins_T5/prot_sequence2idx.json", "w") as f:
    json.dump(seq2idx, f, indent=2)

In [34]:
hla_features

Unnamed: 0,HLA_id,HLA,Bp,Protein_sequence
0,HLA:HLA00001,A*01:01:01:01,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
1,HLA:HLA02169,A*01:01:01:02N,200,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
2,HLA:HLA14798,A*01:01:01:03,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
3,HLA:HLA15760,A*01:01:01:04,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
4,HLA:HLA16415,A*01:01:01:05,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
...,...,...,...,...
25141,HLA:HLA32558,C*18:15,366,MRVMAPRALLLLLSGGLALTETWACSHSMRYFDTAVSRPGRGEPRF...
25142,HLA:HLA33923,C*18:16,366,MRVMAPRALLLLLSGGLALTETWACSHSMRYFDTAVSRPGRGEPRF...
25143,HLA:HLA36442,C*18:17,366,MRVMAPRALLLLLSGGLALTETWACSHSMRYCDTAVSRPGRGEPRF...
25144,HLA:HLA36862,C*18:18,366,MRVMAPRALLLLLSGGLALTETWACSHSMRYFDTAVSRPGRGEPRF...


In [35]:
hla_features["HLA"] = "HLA-" + hla_features["HLA"].str.replace("*","", regex=False)

In [36]:
hla_features

Unnamed: 0,HLA_id,HLA,Bp,Protein_sequence
0,HLA:HLA00001,HLA-A01:01:01:01,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
1,HLA:HLA02169,HLA-A01:01:01:02N,200,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
2,HLA:HLA14798,HLA-A01:01:01:03,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
3,HLA:HLA15760,HLA-A01:01:01:04,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
4,HLA:HLA16415,HLA-A01:01:01:05,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
...,...,...,...,...
25141,HLA:HLA32558,HLA-C18:15,366,MRVMAPRALLLLLSGGLALTETWACSHSMRYFDTAVSRPGRGEPRF...
25142,HLA:HLA33923,HLA-C18:16,366,MRVMAPRALLLLLSGGLALTETWACSHSMRYFDTAVSRPGRGEPRF...
25143,HLA:HLA36442,HLA-C18:17,366,MRVMAPRALLLLLSGGLALTETWACSHSMRYCDTAVSRPGRGEPRF...
25144,HLA:HLA36862,HLA-C18:18,366,MRVMAPRALLLLLSGGLALTETWACSHSMRYFDTAVSRPGRGEPRF...


In [37]:
missing_hlas = {'HLA-A01:01',
 'HLA-A01:03',
 'HLA-A02:01',
 'HLA-A02:02',
 'HLA-A02:03',
 'HLA-A02:04',
 'HLA-A02:05',
 'HLA-A02:06',
 'HLA-A02:07',
 'HLA-A02:08',
 'HLA-A02:09',
 'HLA-A02:11',
 'HLA-A02:17',
 'HLA-A02:20',
 'HLA-A03:01',
 'HLA-A03:02',
 'HLA-A1',
 'HLA-A11',
 'HLA-A11:01',
 'HLA-A11:02',
 'HLA-A23:01',
 'HLA-A24:02',
 'HLA-A24:03',
 'HLA-A24:07',
 'HLA-A24:13',
 'HLA-A25:01',
 'HLA-A26:01',
 'HLA-A26:02',
 'HLA-A26:03',
 'HLA-A26:08',
 'HLA-A29:01',
 'HLA-A29:02',
 'HLA-A3',
 'HLA-A30:01',
 'HLA-A30:02',
 'HLA-A30:04',
 'HLA-A31:01',
 'HLA-A32:01',
 'HLA-A33:01',
 'HLA-A33:03',
 'HLA-A34:01',
 'HLA-A34:02',
 'HLA-A66:01',
 'HLA-A66:02',
 'HLA-A68:01',
 'HLA-A68:02',
 'HLA-A69:01',
 'HLA-A74:01',
 'HLA-A80:01',
 'HLA-B07:02',
 'HLA-B07:06',
 'HLA-B08:01',
 'HLA-B08:011',
 'HLA-B13:01',
 'HLA-B13:02',
 'HLA-B14:01',
 'HLA-B14:02',
 'HLA-B15:01',
 'HLA-B15:02',
 'HLA-B15:03',
 'HLA-B15:08',
 'HLA-B15:09',
 'HLA-B15:10',
 'HLA-B15:11',
 'HLA-B15:13',
 'HLA-B15:16',
 'HLA-B15:17',
 'HLA-B15:18',
 'HLA-B15:25',
 'HLA-B15:27',
 'HLA-B18:01',
 'HLA-B18:03',
 'HLA-B27:02',
 'HLA-B27:04',
 'HLA-B27:05',
 'HLA-B27:06',
 'HLA-B27:07',
 'HLA-B35:01',
 'HLA-B35:02',
 'HLA-B35:03',
 'HLA-B35:04',
 'HLA-B35:08',
 'HLA-B35:14',
 'HLA-B35:43',
 'HLA-B37:01',
 'HLA-B37:04',
 'HLA-B38:01',
 'HLA-B38:02',
 'HLA-B39:01',
 'HLA-B39:05',
 'HLA-B39:06',
 'HLA-B39:09',
 'HLA-B39:10',
 'HLA-B39:24',
 'HLA-B40:01',
 'HLA-B40:02',
 'HLA-B40:03',
 'HLA-B40:06',
 'HLA-B40:10',
 'HLA-B41:01',
 'HLA-B41:02',
 'HLA-B41:03',
 'HLA-B42:01',
 'HLA-B42:02',
 'HLA-B44:01',
 'HLA-B44:02',
 'HLA-B44:03',
 'HLA-B44:05',
 'HLA-B44:27',
 'HLA-B44:28',
 'HLA-B45:01',
 'HLA-B46:01',
 'HLA-B47:01',
 'HLA-B48:01',
 'HLA-B49:01',
 'HLA-B50:01',
 'HLA-B50:02',
 'HLA-B51:01',
 'HLA-B51:02',
 'HLA-B51:08',
 'HLA-B52:01',
 'HLA-B53:01',
 'HLA-B54:01',
 'HLA-B55:01',
 'HLA-B55:02',
 'HLA-B56:01',
 'HLA-B57:01',
 'HLA-B57:02',
 'HLA-B57:03',
 'HLA-B58:01',
 'HLA-B58:02',
 'HLA-B59:01',
 'HLA-B73:01',
 'HLA-B78:01',
 'HLA-B81:01',
 'HLA-C01:02',
 'HLA-C02:02',
 'HLA-C03:02',
 'HLA-C03:03',
 'HLA-C03:04',
 'HLA-C04:01',
 'HLA-C05:01',
 'HLA-C06:02',
 'HLA-C07:01',
 'HLA-C07:02',
 'HLA-C07:04',
 'HLA-C08:01',
 'HLA-C08:02',
 'HLA-C08:03',
 'HLA-C12:02',
 'HLA-C12:03',
 'HLA-C12:04',
 'HLA-C14:02',
 'HLA-C14:03',
 'HLA-C15:02',
 'HLA-C15:05',
 'HLA-C16:01',
 'HLA-C17:01',
 'HLA-C18:01'}

len(missing_hlas)

159

In [79]:
added_hlas = []
for mh in missing_hlas:
    if ":" not in mh:
        continue
    
    row = hla_features[hla_features["HLA"].str.startswith(mh+":")]
    if len(row)<1:
        continue
    if not row.iloc[0]["HLA"].endswith("01"):
        print(mh)
    # if mh=="HLA-B47:01":
    #     break
    new_row = row.iloc[0]

    new_row["HLA"] = mh
    added_hlas.append(new_row)
added_hlas = pd.DataFrame(added_hlas)
added_hlas

HLA-C17:01
HLA-B78:01
HLA-B47:01


Unnamed: 0,HLA_id,HLA,Bp,Protein_sequence
12748,HLA:HLA00284,HLA-B39:10,362,MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...
3455,HLA:HLA00044,HLA-A11:02,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFYTSVSRPGRGKPRF...
7251,HLA:HLA00113,HLA-A66:02,365,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFYTSVSRPGRGEPRF...
9398,HLA:HLA00158,HLA-B14:02,362,MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTAVSRPGRGEPRF...
9825,HLA:HLA00174,HLA-B15:11,362,MRVTAPRTVLLLLSGALALTETWAGSHSMRYFYTAMSRPGRGEPRF...
...,...,...,...,...
15343,HLA:HLA00344,HLA-B51:01,362,MRVTAPRTVLLLLWGAVALTETWAGSHSMRYFYTAMSRPGRGEPRF...
24230,HLA:HLA00467,HLA-C15:02,366,MRVMAPRTLLLLLSGALALTETWACSHSMRYFYTAVSRPGRGEPHF...
13166,HLA:HLA00294,HLA-B40:03,362,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...
18109,HLA:HLA00410,HLA-C03:02,366,MRVMAPRTLILLLSGALALTETWAGSHSMRYFYTAVSRPGRGEPHF...


In [77]:
row["Protein_sequence"].value_counts()

MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFYTAMSRPGRGEPRFITVGYVDDTLFVRFDSDATSPRKEPRAPWIEQEGPEYWDRETQISKTNTQTYREDLRTLLRYYNQSEAGSHTLQRMFGCDVGPDGRLLRGYHQDAYDGKDYIALNEDLSSWTAADTAAQITQRKWEAARVAEQLRAYLEGECVEWLRRYLENGKETLQRADPPKTHVTHHPISDHEATLRCWALGFYPAEITLTWQRDGEDQTQDTELVETRPAGDRTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWEPSSQSTVPIVGIVAGLAVLAVVVIGAVVAAVVCRRKSSGGKGGSYSQAACSDSAQGSDVSLTA    9
SHSMRYFYTAMSRPGRGEPRFITVGYVDDTLFVRFDSDATSPRKEPRAPWIEQEGPEYWDRETQISKTNTQTYREDLRTLLRYYNQSEAGSHTLQRMFGCDVGPDGRLLRGYHQDAYDGKDYIALNEDLSSWTAADTAAQITQRKWEAARVAEQLRAYLEGECVEWLRRYLENGKETLQRADPPKTHVTHHPISDHEATLRCWALGFYPAEITLTWQRDGEDQTQDTELVETRPAGDRTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRW                                                                                             1
SHSMRYFYTAMSRPGRGEPRFITVGYVDDTLFVRFDSDATSPRKEPRAPWIEQEGPEYWDRETQISKTNTQTYREDLRTLLRYYNQSEAGSHTLQRMFGCDVGPDGRLLRGYHQDAYDGKDYIALNEDLSSWTAADTAAQITQRKWEAARVAEQLRAYLEGECVEWLRRYLENGKETLQRA                                                                                   

In [82]:
completed_df = pd.concat([hla_features, added_hlas]).reset_index(drop=True)
completed_df

Unnamed: 0,HLA_id,HLA,Bp,Protein_sequence
0,HLA:HLA00001,HLA-A01:01:01:01,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
1,HLA:HLA02169,HLA-A01:01:01:02N,200,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
2,HLA:HLA14798,HLA-A01:01:01:03,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
3,HLA:HLA15760,HLA-A01:01:01:04,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
4,HLA:HLA16415,HLA-A01:01:01:05,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
...,...,...,...,...
25295,HLA:HLA00344,HLA-B51:01,362,MRVTAPRTVLLLLWGAVALTETWAGSHSMRYFYTAMSRPGRGEPRF...
25296,HLA:HLA00467,HLA-C15:02,366,MRVMAPRTLLLLLSGALALTETWACSHSMRYFYTAVSRPGRGEPHF...
25297,HLA:HLA00294,HLA-B40:03,362,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...
25298,HLA:HLA00410,HLA-C03:02,366,MRVMAPRTLILLLSGALALTETWAGSHSMRYFYTAVSRPGRGEPHF...


In [84]:
completed_df.columns = ["HLA_id", "HLA", "Bp", "sequence"]

In [85]:
completed_df.to_csv("../processed_data/HLA_embeddings/HLA_proteins_T5/hla_proteins_mapping.csv", index=False)

In [86]:
completed_df

Unnamed: 0,HLA_id,HLA,Bp,sequence
0,HLA:HLA00001,HLA-A01:01:01:01,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
1,HLA:HLA02169,HLA-A01:01:01:02N,200,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
2,HLA:HLA14798,HLA-A01:01:01:03,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
3,HLA:HLA15760,HLA-A01:01:01:04,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
4,HLA:HLA16415,HLA-A01:01:01:05,365,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
...,...,...,...,...
25295,HLA:HLA00344,HLA-B51:01,362,MRVTAPRTVLLLLWGAVALTETWAGSHSMRYFYTAMSRPGRGEPRF...
25296,HLA:HLA00467,HLA-C15:02,366,MRVMAPRTLLLLLSGALALTETWACSHSMRYFYTAVSRPGRGEPHF...
25297,HLA:HLA00294,HLA-B40:03,362,MRVTAPRTLLLLLWGAVALTETWAGSHSMRYFHTSVSRPGRGEPRF...
25298,HLA:HLA00410,HLA-C03:02,366,MRVMAPRTLILLLSGALALTETWAGSHSMRYFYTAVSRPGRGEPHF...
