# Custom Dataloader

Pytorch dataloader for loading the preprocessed data and turning it into torch tensors that can be fed into the deep learning model

In [125]:
import numpy as np
import time
import torch
from torch.utils.data import Dataset, DataLoader

In [126]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [127]:
data = np.load('data/extracted_comb.npy', allow_pickle=True)

### Title has maximum 39 words

In [128]:
maximum = 0
for num, i in enumerate(data[:,0]):
    if maximum < len(i):
        maximum = len(i)

In [129]:
maximum

39

### Title has maximum 129 words

In [130]:
maximum = 0
for num, i in enumerate(data[:,1]):
    if maximum < len(i):
        maximum = len(i)

In [131]:
maximum

129

## For some reason text isn't uniformly 50 words long!!!

In [132]:
for num, i in enumerate(data[:,1]):
    if num == 2:
        break
    print(len(i))

54
55


## Custom dataset

Needs to be done:

* add starting and ending words 


In [133]:
padding_value = 0

class CustomDataset(Dataset):
    def __init__(self, root_dir, max_title=39, max_text=129):
        '''
        Args:
        root_dir (string): Path to npy directory
        ids (string): Path to csv file containing list of patient ids for training/testing
        diag_vocab (string): Path to csv file containing all diagnose labels
        max_visits (int): Maximum number of visits
        max_diag (int): Maximum number of diagnoses per visit that are considered
        Outputs:
        inputs (torch.tensor): Concatenated info of diagnoses and times between visits (batch_size, max_visits, vocab_size+1)
        labels (torch.tensor): Labels for visits
        '''
        self.data = np.load(root_dir, allow_pickle=True) 
        self.max_len_title = max_title
        self.max_len_text = max_text
        self.word2idx, self.idx2word = self.indexify_vocab(self.data)
        
    
    def __len__(self):
        return len(self.data) 
    
    def __getitem__(self, idx):
        #load text and title data(these are lists)
        title = self.data[idx, 0]
        text = self.data[idx, 1]
        
        #indexify title and text
        title = self.indexify_title(title, self.max_len_title)
        text = self.indexify_text(text, self.max_len_text)
        
        #convert to tensors
        title = torch.from_numpy(title).to(device)
        text = torch.from_numpy(text).to(device)
        
        return title, text
    
    def indexify_vocab(self, vocab):
        '''
        function for creating word2idx and idx2word dictionaries of the vocabulary
        Arg:
        vocab (numpy array): data containing titles(on column 0) and text(on column 1)
        Return: 
        word2idx (dictionary): Words linked with unique index
        idx2word (dictionary): Indeces linked with unique words
        '''
        word2idx = dict()
        idx2word = dict()
        word2idx['<s>'] = 1
        idx2word[1] = '<s>'
        word2idx['</s>'] = 2
        idx2word[2] = '</s>'
        index = 3
        for num, i in enumerate(data[:,0]): #loop for titles
            for j in i:
                if j not in word2idx.keys():
                    word2idx[j] = index
                    idx2word[index] = j
                    index += 1
        
        for num, i in enumerate(data[:,1]): #loop for text
            for j in i:
                if j not in word2idx.keys():
                    word2idx[j] = index
                    idx2word[index] = j
                    index += 1
        
        return word2idx, idx2word
    
    def indexify_text(self, text, max_len):
        '''
        Arg:
        text (list): Contains text in list form
        Return:
        ret (numpy.array): Indexes of words in text
        '''
        ret = np.full((max_len), padding_value) 
        for i in range(len(text)):
            ret[i] = self.word2idx[text[i]]
            
        return ret
    
    def indexify_title(self, text, max_len):
        '''
        Arg:
        text (list): Contains title in list form
        Return:
        ret (numpy.array): Indexes of words in text
        '''
        ret = np.full((max_len+1), padding_value) 
        for i in range(len(text)):
            ret[i+1] = self.word2idx[text[i]]
            
        return ret
    

## Testing

In [134]:
start = time.time()
ds = CustomDataset("data/extracted_comb.npy")
print(time.time()-start)

2.3494722843170166


In [135]:
test, train = torch.utils.data.random_split(ds, [round(len(data)*0.2), round(len(data)*0.8)])

In [136]:
train

<torch.utils.data.dataset.Subset at 0x21e384c6fc8>

In [137]:
trainloader = DataLoader(dataset=train, batch_size=32, shuffle=True)

In [138]:
for i, data in enumerate(trainloader):
    if i==1:
        break
    print(data[0])
    print(data[0].shape)
    print(data[1])
    print(data[1].shape)

tensor([[    0,  3075,  3076,  ...,     0,     0,     0],
        [    0,  5641, 22256,  ...,     0,     0,     0],
        [    0,   611,   972,  ...,     0,     0,     0],
        ...,
        [    0,    14,    15,  ...,     0,     0,     0],
        [    0,  1781,  1782,  ...,     0,     0,     0],
        [    0,    13,   292,  ...,     0,     0,     0]], device='cuda:0',
       dtype=torch.int32)
torch.Size([32, 40])
tensor([[  629,  2252,  1205,  ...,     0,     0,     0],
        [  138,  1705,   140,  ...,     0,     0,     0],
        [  809,   388,   134,  ...,     0,     0,     0],
        ...,
        [   13,  5793,    75,  ...,     0,     0,     0],
        [ 5963, 15094,    69,  ...,     0,     0,     0],
        [ 2800,  3860,   445,  ...,     0,     0,     0]], device='cuda:0',
       dtype=torch.int32)
torch.Size([32, 129])


Seems to work!!!