In [1]:
import os

import pandas as pd
import numpy as np
import h5py
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

from selfpeptide.utils.constants import *

In [2]:
# HDF5 file holding the pre-tokenized peptide datasets. Later cells both read
# from this path and modify/rewrite it in place.
data_file = "../processed_data/Self_nonSelf/pre_tokenized_peptides_dataset_noDup.hdf5"

In [3]:
# Inspect the HDF5 file and pull both peptide datasets into memory.
with h5py.File(data_file, "r") as f:
    print(f.keys())

    human_ds = f['reference_human_peptides']
    nonself_ds = f['nonself_peptides']

    # Shape, dtype and repr of the human (self) reference set.
    print(human_ds.shape)
    print(human_ds.dtype)
    print(human_ds)

    # Same facts for the non-self set (repr first, matching the print order
    # of the recorded output).
    print(nonself_ds)
    print(nonself_ds.shape)
    print(nonself_ds.dtype)

    # File-level attributes, kept in memory so a later cell can write them back.
    attrs_dict = dict(f.attrs)
    print(attrs_dict)

    # `[:]` reads each dataset fully into a NumPy array; the arrays stay
    # usable after the file is closed.
    self_peptides = human_ds[:]
    nonself_peptides = nonself_ds[:]

<KeysViewHDF5 ['nonself_peptides', 'reference_human_peptides']>
(52208587, 12)
uint8
<HDF5 dataset "reference_human_peptides": shape (52208587, 12), type "|u1">
<HDF5 dataset "nonself_peptides": shape (539512268, 12), type "|u1">
(539512268, 12)
uint8
{'*': 22, '-': 21, 'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'X': 19, 'Y': 20}


In [4]:
# Show the first non-self peptide row as a tuple of token ids.
tuple(nonself_peptides[0])

(6, 4, 15, 5, 5, 16, 15, 7, 22, 22, 22, 22)

In [5]:
# Number of non-self peptide rows currently loaded (first axis of the array).
nonself_peptides.shape[0]

539512268

In [None]:
# Collect every non-self peptide row as a hashable tuple in a Python set.
# This walks the array one row at a time in pure Python; the recorded progress
# bar for this cell never advanced past 0%. The `np.unique(..., axis=0)` cell
# elsewhere in this notebook performs the deduplication in vectorized form.
nonself_peptides_set = set()
nonself_peptides_set.update(tuple(row) for row in tqdm(nonself_peptides))

  0%|          | 0/790551832 [00:00<?, ?it/s]

In [4]:
# Deduplicate at the row level: with axis=0, np.unique treats each row (one
# peptide) as a single item and returns the distinct rows in sorted order.
unique_nonself_peptides = np.unique(nonself_peptides, axis=0)

In [6]:
# Number of human (self) reference peptide rows (first axis of the array).
self_peptides.shape[0]

52208587

In [None]:
# NOTE(review): `fout` is not defined in any visible cell and this cell has no
# execution count, so it looks like a leftover fragment. Run as-is it raises
# NameError unless `fout` is defined elsewhere. N_OTHER_PEPTIDES and
# MAX_PEPTIDE_LEN presumably come from the star import of
# selfpeptide.utils.constants — confirm, or delete this cell.
fout.create_dataset("nonself_peptides", (N_OTHER_PEPTIDES, MAX_PEPTIDE_LEN), dtype="u1", compression='gzip', chunks=True)

In [8]:
# Fraction of non-self rows that were removed as duplicates by np.unique.
n_total = len(nonself_peptides)
n_duplicates = n_total - len(unique_nonself_peptides)
n_duplicates / n_total

0.3175497846420777

In [9]:
# Seed NumPy's global RNG, then shuffle the unique rows IN PLACE
# (np.random.shuffle permutes along the first axis, i.e. whole peptides).
# NOTE: this cell is not idempotent — re-running it shuffles the already
# shuffled array again, so the final order depends on how many times it ran.
np.random.seed(42)
np.random.shuffle(unique_nonself_peptides)

In [10]:
# Preview the shuffled, deduplicated non-self peptide matrix.
unique_nonself_peptides

array([[ 6,  4, 15, ..., 22, 22, 22],
       [ 7, 20, 11, ...,  0, 22, 22],
       [ 4,  8, 14, ..., 22, 22, 22],
       ...,
       [ 4, 13,  3, ...,  9, 22, 22],
       [ 7,  9, 15, ..., 13, 10, 22],
       [20, 14,  9, ...,  9, 13, 22]], dtype=uint8)

In [16]:
# Unlink the stored non-self dataset so a later cell can re-create it.
# "r+" (rather than "a") requires the file to exist, so a wrong path fails
# loudly instead of silently creating an empty HDF5 file. The membership guard
# makes the cell safe to re-run after the dataset has already been removed.
# NOTE(review): deleting an HDF5 object generally does not shrink the file on
# disk; a full rewrite (or h5repack) is needed to reclaim the space — confirm.
with h5py.File(data_file, "r+") as f:
    if 'nonself_peptides' in f:
        del f['nonself_peptides']
    print(f.keys())

<KeysViewHDF5 ['reference_human_peptides']>


In [17]:

# Re-create the non-self dataset from the deduplicated, shuffled rows.
# The previous version sized the dataset from `nonself_peptides` and filled it
# from `nonself_peptides_u1`, a name only defined in a later cell (NameError on
# a top-to-bottom run); it also stored the array that still contains
# duplicates. `unique_nonself_peptides` is already uint8, so no cast is needed.
# NOTE(review): assumes the dataset was removed first (create_dataset fails if
# the name already exists) and that MAX_PEPTIDE_LEN matches the array's second
# dimension — confirm against selfpeptide.utils.constants.
with h5py.File(data_file, "r+") as f:
    dset = f.create_dataset(
        "nonself_peptides",
        (len(unique_nonself_peptides), MAX_PEPTIDE_LEN),
        dtype="u1",
        compression='gzip',
        chunks=True,
    )
    dset[:] = unique_nonself_peptides

In [16]:
# Preview the human (self) reference peptide token matrix.
self_peptides

array([[ 1, 20,  2, 12, 16,  9,  2, 17, 18, 22, 22, 22],
       [16, 11,  7, 16,  9, 17,  9,  1,  2, 15,  1, 11],
       [ 5,  7,  1,  2,  4,  2,  8,  5,  2,  5, 15,  6],
       [ 5, 14,  7, 14,  2, 12,  7,  9, 22, 22, 22, 22],
       [15, 15, 15,  9,  6,  1,  0, 15, 15,  7,  1, 15],
       [ 6, 16,  7,  1, 20,  4, 16,  3,  4,  9, 22, 22],
       [12, 12, 10,  9, 15, 14,  5,  9,  5,  6, 22, 22],
       [15, 15,  9,  3,  5,  4, 13, 18, 22, 22, 22, 22],
       [17,  2,  0, 11,  9, 13,  8,  9, 16, 13, 22, 22],
       [ 9,  9,  0,  7, 11, 13, 14,  4, 13, 22, 22, 22],
       [10,  2, 20, 17, 17,  5,  0, 14, 16,  4, 22, 22],
       [12, 15, 12, 14, 17,  7,  7,  6, 11, 14,  1, 12],
       [ 2, 20,  5,  8, 13,  6,  4, 11,  2, 17, 22, 22],
       [ 3, 10,  7,  9, 17,  1,  2,  0, 20, 14,  8, 22],
       [ 5,  0, 14,  0, 13,  9,  9,  8, 15,  9, 14,  3],
       [ 5, 13,  9,  9,  5, 16,  0, 14,  3, 12, 11, 12],
       [12,  0,  3,  3, 13,  2, 12, 15, 12,  3, 22, 22],
       [17,  3, 13, 12,  8,  5,

In [19]:
from torch.utils.data import Dataset, DataLoader, TensorDataset

In [21]:
# Wrap the self peptides in a TensorDataset and look at the first sample.
# torch.tensor copies the NumPy array into a new tensor; indexing the dataset
# returns a 1-tuple holding that row.
self_tensor = torch.tensor(self_peptides)
dset = TensorDataset(self_tensor)
dset[0]

(tensor([ 1, 20,  2, 12, 16,  9,  2, 17, 18, 22, 22, 22], dtype=torch.uint8),)

In [7]:
# Cast to unsigned 8-bit ("u1"). astype returns a new array by default.
# NOTE(review): the dtype printed when the data was loaded is already uint8, so
# this presumably only makes a same-dtype copy — confirm it is still needed.
nonself_peptides_u1 = nonself_peptides.astype("u1")

In [8]:
# Preview the uint8 copy of the non-self peptide matrix.
nonself_peptides_u1

array([[ 6,  4, 15, ..., 22, 22, 22],
       [ 7, 20, 11, ...,  0, 22, 22],
       [ 4,  8, 14, ..., 22, 22, 22],
       ...,
       [ 4, 13,  3, ...,  9, 22, 22],
       [ 7,  9, 15, ..., 13, 10, 22],
       [20, 14,  9, ...,  9, 13, 22]], dtype=uint8)

In [9]:
# First row of the uint8 copy, to compare against the original array.
nonself_peptides_u1[0]

array([ 6,  4, 15,  5,  5, 16, 15,  7, 22, 22, 22, 22], dtype=uint8)

In [10]:
# Preview the non-self peptide matrix as currently held in memory.
nonself_peptides

array([[ 6,  4, 15, ..., 22, 22, 22],
       [ 7, 20, 11, ...,  0, 22, 22],
       [ 4,  8, 14, ..., 22, 22, 22],
       ...,
       [ 4, 13,  3, ...,  9, 22, 22],
       [ 7,  9, 15, ..., 13, 10, 22],
       [20, 14,  9, ...,  9, 13, 22]], dtype=uint8)

In [24]:
# Rewrite the whole HDF5 file from the in-memory arrays and saved attributes.
# Opening the target directly with "w" truncates the only on-disk copy before
# the replacement is written, so a failure mid-write (full disk, killed kernel)
# would lose the data. Instead, write a sibling temporary file and swap it into
# place only once it has been written and closed.
# `data_file` is the same path the original cell spelled out as a literal.
tmp_file = data_file + ".tmp"
with h5py.File(tmp_file, "w") as f:
    dset1 = f.create_dataset("reference_human_peptides", (len(self_peptides), MAX_PEPTIDE_LEN), dtype="u1", compression='gzip', chunks=True)
    dset1[:] = self_peptides
    dset2 = f.create_dataset("nonself_peptides", (len(nonself_peptides), MAX_PEPTIDE_LEN), dtype="u1", compression='gzip', chunks=True)
    dset2[:] = nonself_peptides
    # Restore the file-level attributes captured when the file was first read.
    f.attrs.update(attrs_dict)
# Replaces the destination in one step when both paths are on the same filesystem.
os.replace(tmp_file, data_file)