In [1]:
import pandas as pd
import numpy as np
import h5py
from tqdm.notebook import tqdm
import torch
from selfpeptide.utils.processing_utils import get_vocabulary_tokens
from selfpeptide.utils.constants import *
from selfpeptide.model.encoder import AA_Tokenizer

In [2]:
# Row counts used to pre-allocate the two output datasets further down:
# "reference_human_peptides" gets N_REF_PEPTIDES rows and
# "nonself_peptides" gets N_OTHER_PEPTIDES rows.
N_REF_PEPTIDES = 52_208_587
N_OTHER_PEPTIDES = 790_551_832

In [3]:
hdf5_ref_file = "../processed_data/peptide_reference_dataset.hdf5"

# Open the reference file read-only, show its top-level layout and the
# contents of the human proteome group, and collect the member names of the
# viral and bacterial proteome groups.
with h5py.File(hdf5_ref_file, "r") as f:
    human_group = f['human_proteome']
    viral_group = f['viral_proteomes']
    bacterial_group = f['bacterial_proteomes']

    print(f.keys())
    print(human_group.keys())

    # Iterating an h5py group yields its member names, same as `.keys()`.
    viral_proteomes_ids = list(viral_group)
    bacterial_proteomes_ids = list(bacterial_group)

<KeysViewHDF5 ['bacterial_proteomes', 'human_proteome', 'viral_proteomes']>
<KeysViewHDF5 ['human_cancer_peptides', 'reference_human_proteome']>


In [4]:
# Build the tokenizer from the project's vocabulary tokens. Later cells call
# `tokenizer(batch)` on lists of decoded peptide strings and read
# `tokenizer.token2idx` to persist the token -> index mapping.
vocab = get_vocabulary_tokens()
tokenizer = AA_Tokenizer(vocab)

In [5]:
# Display the maximum peptide length, used below as the column count of the
# output datasets. The name is not defined in this notebook; it presumably
# comes from the star import of selfpeptide.utils.constants -- TODO confirm.
MAX_PEPTIDE_LEN

12

In [6]:
# Path of the HDF5 file that receives the tokenized peptide datasets.
output_hdf5_file = "../processed_data/pre_tokenized_peptides_dataset.hdf5"

# Tokenize Reference Human peptides

In [7]:
batch_size = 4096  # number of peptides tokenized per read/write round-trip

# Tokenize every reference human peptide and store the token ids as uint8
# rows of shape (N_REF_PEPTIDES, MAX_PEPTIDE_LEN) in a fresh output file.
# The input is opened and validated first, because opening the output in
# mode "w" truncates any existing file at `output_hdf5_file`.
with h5py.File(hdf5_ref_file, "r") as f_ref:
    # Resolve the source dataset once instead of on every batch.
    ref_peptides = f_ref['human_proteome']['reference_human_proteome']

    # The output is pre-allocated from a hard-coded count. Rows that are never
    # written keep the default fill value 0, which is also a valid token id,
    # so a count mismatch would silently corrupt the dataset.
    assert len(ref_peptides) == N_REF_PEPTIDES, (
        f"N_REF_PEPTIDES={N_REF_PEPTIDES} does not match the "
        f"{len(ref_peptides)} peptides found in {hdf5_ref_file}"
    )

    with h5py.File(output_hdf5_file, "w") as fout:
        dset = fout.create_dataset(
            "reference_human_peptides",
            (N_REF_PEPTIDES, MAX_PEPTIDE_LEN),
            dtype="u1",
            compression='gzip',
            chunks=True,
        )
        for ix in tqdm(range(0, N_REF_PEPTIDES, batch_size)):
            # Peptides are stored as byte strings; the tokenizer takes str.
            batch = [p.decode() for p in ref_peptides[ix:ix+batch_size]]
            # Only the token ids are persisted; the padding mask is unused.
            tokens, padding_mask = tokenizer(batch)
            # The last batch may be shorter than `batch_size`.
            dset[ix:ix+len(batch)] = tokens.numpy()

  0%|          | 0/12747 [00:00<?, ?it/s]

In [8]:
def _append_tokenized(src, dst, offset, starts=None):
    """Tokenize the peptides of `src` in batches and append them to `dst`.

    Parameters
    ----------
    src : sliceable dataset of byte-string peptides
        Each element is `.decode()`d before being passed to `tokenizer`.
    dst : writable dataset
        Receives one row of token ids per peptide.
    offset : int
        Index of the first row of `dst` to write.
    starts : iterable of int, optional
        Batch start indices into `src`. Defaults to
        `range(0, len(src), batch_size)`; pass a `tqdm`-wrapped range to
        get a per-batch progress bar.

    Returns
    -------
    int
        The row index just past the last row written to `dst`.
    """
    if starts is None:
        starts = range(0, len(src), batch_size)
    for start in starts:
        batch = [p.decode() for p in src[start:start+batch_size]]
        # Only the token ids are persisted; the padding mask is unused.
        tokens, padding_mask = tokenizer(batch)
        # Use the real batch length: the last batch may be short.
        dst[offset:offset+len(batch)] = tokens.numpy()
        offset += len(batch)
    return offset


# Tokenize all non-self peptides (human cancer peptides, then every viral
# proteome, then every bacterial proteome) into one contiguous dataset.
# `array_ix` is the running write offset; it is printed after each stage.
array_ix = 0
with h5py.File(output_hdf5_file, "r+") as fout:
    dset = fout.create_dataset(
        "nonself_peptides",
        (N_OTHER_PEPTIDES, MAX_PEPTIDE_LEN),
        dtype="u1",
        compression='gzip',
        chunks=True,
    )

    with h5py.File(hdf5_ref_file, "r") as f_ref:
        # Single large dataset: show progress per batch.
        cancer_peptides = f_ref['human_proteome']['human_cancer_peptides']
        array_ix = _append_tokenized(
            cancer_peptides,
            dset,
            array_ix,
            starts=tqdm(range(0, len(cancer_peptides), batch_size)),
        )

        print(array_ix)
        # Many smaller datasets: show progress per proteome.
        group = f_ref['viral_proteomes']
        for prot_id in tqdm(list(group.keys())):
            array_ix = _append_tokenized(group[prot_id], dset, array_ix)

        print(array_ix)
        group = f_ref['bacterial_proteomes']
        for prot_id in tqdm(list(group.keys())):
            array_ix = _append_tokenized(group[prot_id], dset, array_ix)

print(array_ix)

# The dataset was pre-allocated from a hard-coded count. Rows that were never
# written keep the default fill value 0, which is also a valid token id, so
# make sure the dataset was filled exactly.
assert array_ix == N_OTHER_PEPTIDES, (
    f"wrote {array_ix} rows but 'nonself_peptides' was allocated with "
    f"N_OTHER_PEPTIDES={N_OTHER_PEPTIDES}"
)

  0%|          | 0/9694 [00:00<?, ?it/s]

39702919


  0%|          | 0/195 [00:00<?, ?it/s]

58477888


  0%|          | 0/218 [00:00<?, ?it/s]

790551832


In [13]:
# Sanity-check the written file: list its datasets, show both shapes, and
# print the last 20 token rows of each dataset.
with h5py.File(output_hdf5_file, "r") as fout:
    print(fout.keys())

    nonself_dset = fout['nonself_peptides']
    reference_dset = fout['reference_human_peptides']

    for peptides_dset in (nonself_dset, reference_dset):
        print(peptides_dset.shape)

    for peptides_dset in (reference_dset, nonself_dset):
        print(peptides_dset[-20:])

<KeysViewHDF5 ['nonself_peptides', 'reference_human_peptides']>
(790551832, 12)
(52208587, 12)
[[ 9  6  8  3  8 16 14  8 22 22 22 22]
 [ 2  8 17  3  1  0 11 11  0  9  8 22]
 [12 14  2  3 14 14  4  8  0 22 22 22]
 [13  0 20 13 10 14 12 15  7  7 22 22]
 [ 4 15  9 20 12 16 15 17  6 22 22 22]
 [16  3  3  7  8 11  3  7 22 22 22 22]
 [12 15  0  9  2  0  7 14  3  3 16  5]
 [ 0  2 17 13 16  9 14  3 22 22 22 22]
 [13 13  5 16 17  5 12  9 16 15 22 22]
 [ 0 13 14  1  8  9  3  5  0  7 22 22]
 [ 3 13  3  3  3  7  9  5 15  2 22 22]
 [ 3 14 12 12  3  9 12  9  9 22 22 22]
 [ 8 11 18  5  0 17 17 14 15 12  3  5]
 [17 13  6  7  9 10 12  2  2  3  5  3]
 [15  5  9 11  1  5 16 13 22 22 22 22]
 [15 13 16  3 12  4 16  5  9  2 22 22]
 [14 11 12 17 14 17 15 17  8  3  8  5]
 [11 11 16 14 16 17  2  7 17 22 22 22]
 [14 13 12  9 13  0 16  4 22 22 22 22]
 [ 0  9 11  3 14  5 18  3 22 22 22 22]]
[[13 16 11 10 11  0 11  3 17  7  0 11]
 [17  8  9 15  9 17  0  0 10  0 22 22]
 [17  3 14  9  0 20  3  9 10  2  0 22]
 [ 0  5

In [14]:
# Display the tokenizer's token -> integer index mapping.
tokenizer.token2idx

{'A': 0,
 'C': 1,
 'D': 2,
 'E': 3,
 'F': 4,
 'G': 5,
 'H': 6,
 'I': 7,
 'K': 8,
 'L': 9,
 'M': 10,
 'N': 11,
 'P': 12,
 'Q': 13,
 'R': 14,
 'S': 15,
 'T': 16,
 'V': 17,
 'W': 18,
 'X': 19,
 'Y': 20,
 '-': 21,
 '*': 22}

In [16]:
# Store the token -> index mapping as file-level HDF5 attributes (one
# attribute per token) so the encoding is saved alongside the token data.
with h5py.File(output_hdf5_file, "r+") as fout:
    for token, index in tokenizer.token2idx.items():
        fout.attrs[token] = index

In [18]:
# Read the file-level attributes back and print them as a plain dict.
with h5py.File(output_hdf5_file, "r") as fout:
    stored_attrs = {name: value for name, value in fout.attrs.items()}
    print(stored_attrs)

{'*': 22, '-': 21, 'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'X': 19, 'Y': 20}
