# Custom Dataloader

Pytorch dataloader for loading the preprocessed data and turning it into torch tensors that can be fed into the deep learning model

In [46]:
import numpy as np
import time
import torch
from torch.utils.data import Dataset, DataLoader

In [47]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [48]:
data = np.load('data/extracted_comb.npy', allow_pickle=True)

### Title has maximum 39 words

In [49]:
maximum = 0
for num, i in enumerate(data[:,0]):
    if maximum < len(i):
        maximum = len(i)

In [50]:
maximum

39

### Text has maximum 129 words

In [51]:
maximum = 0
for num, i in enumerate(data[:,1]):
    if maximum < len(i):
        maximum = len(i)

In [52]:
maximum

129

## For some reason text isn't uniformly 50 words long!!!

In [53]:
for num, i in enumerate(data[:,1]):
    if num == 2:
        break
    print(len(i))

54
55


## Custom dataset

Needs to be done:

* add starting and ending words (DONE)

* mask


In [54]:
padding_value = 0

class CustomDataset(Dataset):
    def __init__(self, root_dir, max_title=39, max_text=129):
        '''
        Args:
        root_dir (string): Path to npy directory
        ids (string): Path to csv file containing list of patient ids for training/testing
        diag_vocab (string): Path to csv file containing all diagnose labels
        max_visits (int): Maximum number of visits
        max_diag (int): Maximum number of diagnoses per visit that are considered
        Outputs:
        inputs (torch.tensor): Concatenated info of diagnoses and times between visits (batch_size, max_visits, vocab_size+1)
        labels (torch.tensor): Labels for visits
        '''
        self.data = np.load(root_dir, allow_pickle=True) 
        self.max_len_title = max_title
        self.max_len_text = max_text
        self.word2idx, self.idx2word = self.indexify_vocab(self.data)
        
    
    def __len__(self):
        return len(self.data) 
    
    def __getitem__(self, idx):
        #load text and title data(these are lists) and add start and stop tokens
        title = self.add_start_stop(self.data[idx, 0])
        text = self.add_start_stop(self.data[idx, 1])
        
        # masking for text (+2 in torch.ones comes from adding start and stop tokens)
        mask = torch.cat((torch.zeros(len(text)), torch.ones(2 + self.max_len_text - len(text)))).bool().to(device)
        
        #indexify title and text
        title = self.indexify_title(title, self.max_len_title)
        text = self.indexify_text(text, self.max_len_text)
        
        #convert to tensors
        title = torch.from_numpy(title).to(device)
        text = torch.from_numpy(text).to(device)
        
        return title, text, mask
    
    def add_start_stop(self, text):
        ret = '<s> ' + ' '.join(text) + ' </s>'
        return ret.split()
    
    def indexify_vocab(self, vocab):
        '''
        function for creating word2idx and idx2word dictionaries of the vocabulary
        Arg:
        vocab (numpy array): data containing titles(on column 0) and text(on column 1)
        Return: 
        word2idx (dictionary): Words linked with unique index
        idx2word (dictionary): Indeces linked with unique words
        '''
        word2idx = dict()
        idx2word = dict()
        word2idx['<s>'] = 1
        idx2word[1] = '<s>'
        word2idx['</s>'] = 2
        idx2word[2] = '</s>'
        index = 3
        for num, i in enumerate(data[:,0]): #loop for titles
            for j in i:
                if j not in word2idx.keys():
                    word2idx[j] = index
                    idx2word[index] = j
                    index += 1
        
        for num, i in enumerate(data[:,1]): #loop for text
            for j in i:
                if j not in word2idx.keys():
                    word2idx[j] = index
                    idx2word[index] = j
                    index += 1
        
        return word2idx, idx2word
    
    def indexify_text(self, text, max_len):
        '''
        Arg:
        text (list): Contains text in list form
        Return:
        ret (numpy.array): Indexes of words in text
        '''
        ret = np.full((max_len+2), padding_value) 
        for i in range(len(text)):
            ret[i] = self.word2idx[text[i]]
            
        return ret
    
    def indexify_title(self, text, max_len):
        '''
        Arg:
        text (list): Contains title in list form
        Return:
        ret (numpy.array): Indexes of words in text
        '''
        ret = np.full((max_len+2), padding_value) 
        for i in range(len(text)):
            ret[i] = self.word2idx[text[i]]
            
        return ret
    

## Testing

In [55]:
start = time.time()
ds = CustomDataset("data/extracted_comb.npy")
print(time.time()-start)

3.511183977127075


In [56]:
test, train = torch.utils.data.random_split(ds, [round(len(data)*0.2), round(len(data)*0.8)])

In [57]:
train

<torch.utils.data.dataset.Subset at 0x18f70856d08>

In [58]:
trainloader = DataLoader(dataset=train, batch_size=32, shuffle=True)

In [59]:
for i, hopo in enumerate(trainloader):
    if i==1:
        break
    print(hopo[0])
    print(hopo[0].shape)
    print(hopo[1])
    print(hopo[1].shape)
    print(hopo[2])
    print(hopo[2].shape)

tensor([[    1,    55,  1262,  ...,     0,     0,     0],
        [    1,   325,  1423,  ...,     0,     0,     0],
        [    1,    31,  2278,  ...,     0,     0,     0],
        ...,
        [    1,  1057,  4651,  ...,     0,     0,     0],
        [    1, 10630,    87,  ...,     0,     0,     0],
        [    1,    44,  1056,  ...,     0,     0,     0]], device='cuda:0',
       dtype=torch.int32)
torch.Size([32, 41])
tensor([[    1,  2375, 18182,  ...,     0,     0,     0],
        [    1,   138,  1705,  ...,     0,     0,     0],
        [    1,  1095,   972,  ...,     0,     0,     0],
        ...,
        [    1,  1031,    55,  ...,     0,     0,     0],
        [    1,  3897,   549,  ...,     0,     0,     0],
        [    1,   388,   459,  ...,     0,     0,     0]], device='cuda:0',
       dtype=torch.int32)
torch.Size([32, 131])
tensor([[False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  

Seems to work!!!

# Embedding

Coding embedding layer for BERT. It needs:

* embedding for input
* positional embedding
* Segment embeddings???

In [None]:
class Embedding(nn.Module):
    def __init__(self, src_vocab_size, n_blocks, n_features, n_heads, n_hidden=64, dropout=0.1):
        """
        Args:
          src_vocab_size: Number of words in the source vocabulary.
          n_blocks: Number of EncoderBlock blocks.
          n_features: Number of features to be used for word embedding and further in all layers of the encoder.
          n_heads: Number of attention heads inside the EncoderBlock.
          n_hidden: Number of hidden units in the Feedforward block of EncoderBlock.
          dropout: Dropout level used in EncoderBlock.
        """
        # YOUR CODE HERE
        super(Encoder, self).__init__()
        self.emb = nn.Embedding(src_vocab_size, n_features)
        self.pos = tr.PositionalEncoding(n_features, dropout, MAX_LENGTH)
        self.blocks = nn.ModuleList([copy.deepcopy(EncoderBlock(n_features, n_heads, n_hidden, dropout)) for _ in range(n_blocks)])

    def forward(self, x, mask):
        """
        Args:
          x of shape (max_seq_length, batch_size): LongTensor with the input sequences.
          mask of shape (batch_size, max_seq_length): BoolTensor indicating which elements should be ignored.
        
        Returns:
          z of shape (max_seq_length, batch_size, n_features): Encoded input sequence.

        Note: All intermediate signals should be of shape (max_seq_length, batch_size, n_features).
        """
        # YOUR CODE HERE
        embedded = self.emb(x)
        embedded = self.pos(embedded)+ embedded
        for layer in self.blocks:
            embedded = layer(embedded, mask)
        return embedded