In [7]:
import pandas as pd
import numpy as np
import h5py
from tqdm.notebook import tqdm
import os

from torch.utils.data import Dataset, DataLoader

In [3]:
# Open the reference dataset read-only and print its top-level keys.
with h5py.File('../processed_data/peptide_reference_dataset.hdf5', 'r') as f:
    print(f.keys())

<KeysViewHDF5 ['bacterial_peptides', 'human_cancer_peptides', 'reference_human_proteome', 'viral_peptides']>


In [26]:
class PeptideTripletsDataset(Dataset):
    """Dataset of randomly sampled peptide triplets read from an HDF5 file.

    Every item is a list ``[p1, p2, neg]`` of decoded strings:

    * ``p1`` and ``p2`` are two *different* strings drawn from the
      ``'reference_human_proteome'`` dataset;
    * ``neg`` is drawn from one of ``'bacterial_peptides'``,
      ``'human_cancer_peptides'`` or ``'viral_peptides'`` (the category is
      picked with ``np.random.choice``).

    All triplets are generated once, at construction time, and kept in memory
    in ``self.triplets``.

    Parameters
    ----------
    hdf5_dataset_fname : str
        Path to the HDF5 file holding the datasets named above.
    n_triplets : int, default 1000
        Number of triplets to generate.
    random_state : int or None, default None
        If given, passed to ``np.random.seed`` before sampling. Note that this
        reseeds NumPy's *global* generator.

    Raises
    ------
    FileNotFoundError
        If ``hdf5_dataset_fname`` does not exist.
    """

    def __init__(self, hdf5_dataset_fname, n_triplets=1000, random_state=None):
        self.hdf5_dataset_fname = hdf5_dataset_fname

        # Fail early with a clear message (same check and message as
        # PeptideDataset_forMining) instead of a low-level h5py error.
        if not os.path.exists(self.hdf5_dataset_fname):
            raise FileNotFoundError("Specify a valid HDF5 file for the dataset")

        self._get_n_peptides()
        self._generate_triplets(n_triplets=n_triplets, random_state=random_state)

    def _get_n_peptides(self):
        """Store the length of every top-level entry in ``self.peptides_n_ref``."""
        with h5py.File(self.hdf5_dataset_fname, 'r') as f:
            self.peptides_n_ref = {k: len(f[k]) for k in f.keys()}

    def _generate_triplets(self, n_triplets=1000, random_state=None):
        """Sample ``n_triplets`` triplets and store them in ``self.triplets``.

        Parameters
        ----------
        n_triplets : int, default 1000
            Number of triplets to produce.
        random_state : int or None, default None
            Seed forwarded to ``np.random.seed`` (global side effect).
        """
        if random_state is not None:
            np.random.seed(random_state)

        n_reference = self.peptides_n_ref['reference_human_proteome']
        negative_categories = ['bacterial_peptides', 'human_cancer_peptides', 'viral_peptides']
        triplets = []

        # Both the file handle and the progress bar are context-managed so
        # they are released even if a read fails part-way through.
        with h5py.File(self.hdf5_dataset_fname, 'r') as f, tqdm(total=n_triplets) as pbar:
            reference = f['reference_human_proteome']
            while len(triplets) < n_triplets:
                idx_p1 = np.random.randint(0, n_reference)
                idx_p2 = np.random.randint(0, n_reference)
                p1 = reference[idx_p1].decode()
                p2 = reference[idx_p2].decode()

                # Compare the strings, not the indices: two different rows
                # may hold the same sequence. Rejected draws are retried.
                if p1 == p2:
                    continue

                cat = np.random.choice(negative_categories)
                idx_neg = np.random.randint(0, self.peptides_n_ref[cat])
                neg = f[cat][idx_neg].decode()

                triplets.append([p1, p2, neg])
                pbar.update(1)

        self.triplets = triplets

    def __len__(self):
        """Return the number of stored triplets."""
        return len(self.triplets)

    def __getitem__(self, idx):
        """Return the triplet ``[p1, p2, neg]`` at position ``idx``."""
        return self.triplets[idx]

In [31]:
# Build the triplet dataset; the triplets are sampled inside the constructor.
dset = PeptideTripletsDataset('../processed_data/peptide_reference_dataset.hdf5')

  0%|          | 0/1000 [00:00<?, ?it/s]

In [32]:
# Inspect the first stored triplet: [p1, p2, neg].
dset[0]

['QPVVTLYH', 'EVGWMTSVKD', 'GVTPGHGM']

In [33]:
# Wrap the triplet dataset in a DataLoader with a batch size of 4.
dloader = DataLoader(dset, batch_size=4)

In [34]:
# Pull the first batch. As the output below shows, the batch is a list of three
# tuples (one per triplet position), each holding 4 strings.
next(iter(dloader))

[('QPVVTLYH', 'HATVSRSL', 'LDLAYGDRVW', 'QQCGDFSGFDWS'),
 ('EVGWMTSVKD', 'PCCGTFPFCGCD', 'MDQSYHSAD', 'VECGPKYPEAP'),
 ('GVTPGHGM', 'YDVPKLRAS', 'PRVPRGPRE', 'EPQTIQPEVK')]

In [10]:
class PeptideDataset_forMining(Dataset):
    """Dataset of labelled peptides, sampled from an HDF5 file.

    Each item is a ``(peptide, label)`` pair. Peptides drawn from the
    ``'reference_human_proteome'`` dataset get label ``1``; peptides drawn
    from ``'bacterial_peptides'``, ``'human_cancer_peptides'`` or
    ``'viral_peptides'`` get label ``0``. Items are stored interleaved
    (label 1, label 0, label 1, ...).

    Parameters
    ----------
    hdf5_dataset_fname : str
        Path to the HDF5 file holding the datasets named above.
    gen_size : int, default 1000
        Number of peptides to generate, both initially and on every call to
        ``refresh_peptides_data``.
    init_random_state : int or None, default None
        If given, passed to ``np.random.seed`` before the initial sampling.
        Note that this reseeds NumPy's *global* generator.
    hold_out_set : container of str or None, default None
        Peptides that must not be sampled. A drawn pair is rejected when
        either of its two peptides is found in this container.

    Raises
    ------
    FileNotFoundError
        If ``hdf5_dataset_fname`` does not exist.
    """

    def __init__(self, hdf5_dataset_fname, gen_size=1000, init_random_state=None, hold_out_set=None):
        self.hdf5_dataset_fname = hdf5_dataset_fname
        self.gen_size = gen_size
        self.hold_out_set = hold_out_set

        if not os.path.exists(self.hdf5_dataset_fname):
            raise FileNotFoundError("Specify a valid HDF5 file for the dataset")
        self._get_n_peptides()
        self._generate_peptides(n_peptides=gen_size, random_state=init_random_state)

    def _get_n_peptides(self):
        """Store the length of every top-level entry in ``self.peptides_n_ref``."""
        with h5py.File(self.hdf5_dataset_fname, 'r') as f:
            self.peptides_n_ref = {k: len(f[k]) for k in f.keys()}

    def get_stored_peptides(self):
        """Return the set of distinct peptide strings currently stored.

        The previous implementation iterated ``self.triplets``, an attribute
        this class never defines, and therefore always raised
        ``AttributeError``.
        """
        return set(self.peptides)

    def _generate_peptides(self, n_peptides=10000, random_state=None):
        """Sample peptides and labels into ``self.peptides`` / ``self.labels``.

        Peptides are appended in (label 1, label 0) pairs until at least
        ``n_peptides`` are stored, so an odd ``n_peptides`` yields
        ``n_peptides + 1`` items.

        Parameters
        ----------
        n_peptides : int, default 10000
            Minimum number of peptides to produce.
        random_state : int or None, default None
            Seed forwarded to ``np.random.seed`` (global side effect).
        """
        if random_state is not None:
            np.random.seed(random_state)

        n_reference = self.peptides_n_ref['reference_human_proteome']
        negative_categories = ['bacterial_peptides', 'human_cancer_peptides', 'viral_peptides']
        peptides = []
        labels = []

        # Both the file handle and the progress bar are context-managed so
        # they are released even if a read fails part-way through.
        with h5py.File(self.hdf5_dataset_fname, 'r') as f, tqdm(total=n_peptides) as pbar:
            reference = f['reference_human_proteome']
            while len(peptides) < n_peptides:
                idx_pos = np.random.randint(0, n_reference)
                pos = reference[idx_pos].decode()

                cat = np.random.choice(negative_categories)
                idx_neg = np.random.randint(0, self.peptides_n_ref[cat])
                neg = f[cat][idx_neg].decode()

                # Reject the whole pair if either member is held out; the
                # loop then simply draws a fresh pair.
                if self.hold_out_set is not None and (
                    pos in self.hold_out_set or neg in self.hold_out_set
                ):
                    continue

                peptides.extend((pos, neg))
                labels.extend((1, 0))
                pbar.update(2)

        self.peptides = peptides
        self.labels = labels

    def refresh_peptides_data(self):
        """Replace the stored peptides with a fresh sample of ``gen_size`` items.

        No seed is passed, so the draw continues from the current global
        NumPy random state.
        """
        self._generate_peptides(n_peptides=self.gen_size)

    def __len__(self):
        """Return the number of stored peptides."""
        return len(self.peptides)

    def __getitem__(self, idx):
        """Return ``(peptide, label)`` at position ``idx``."""
        return self.peptides[idx], self.labels[idx]

In [11]:
# Rebinds `dset`: from here on it is the mining dataset (default gen_size of
# 1000), not the triplet dataset created earlier.
dset = PeptideDataset_forMining('../processed_data/peptide_reference_dataset.hdf5')

  0%|          | 0/1000 [00:00<?, ?it/s]

In [12]:
# Rebinds `dloader` to wrap the mining dataset, with a batch size of 32.
dloader = DataLoader(dset, batch_size=32)

In [13]:
# Pull the first batch. As the output below shows, it is a tuple of 32 peptide
# strings followed by a tensor of their labels.
next(iter(dloader))

[('APPPTPRPE',
  'CIGPASKE',
  'NIDKLTECIKN',
  'AVYRSKNTKK',
  'EYEEVRKDQDSV',
  'VVLMDDNVECTM',
  'SWELIIDPNLK',
  'RSVLPVLN',
  'AVGTTEFALL',
  'KTRYAQLA',
  'YGHPCGLEYG',
  'YDLPQLYELNR',
  'DLRLPEGDLGKE',
  'AKMILVIAGLIL',
  'AMMCDIIYA',
  'VEAKPVVVK',
  'DVVGSVLDCF',
  'LSPANVSN',
  'LKASCQESA',
  'YEAANENHYPFI',
  'RIRKVYGDL',
  'VVLLYKIISR',
  'DFDPAVTE',
  'SFLAACARSC',
  'KSTASRERLKRS',
  'EHYDTVDLD',
  'SASSQSGG',
  'NHAVTVVGWD',
  'PEQPFDHNECEK',
  'VIDQIIIGGN',
  'KVTIQVSLEPLQ',
  'VMRVVAEA'),
 tensor([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
         1, 0, 1, 0, 1, 0, 1, 0])]