In [1]:
import re
import torch
import random
import numpy as np
from torch.utils.data import Dataset, DataLoader

In [2]:
# 前處理，將文字做tokenize轉為id

def preprocess(corpus):
    
    word_dic = set()
    processed_corpus = []
    
    for sentence in corpus:
        #將所有字詞轉為小寫
        sentence = sentence.lower()
        #只保留英文
        sentence = re.sub('[^a-zA-Z]',' ',sentence)
        
        sentence = sentence.split()
        
        #添加字詞到字典中
        word_dic |= set(sentence)
        processed_corpus.append(sentence)
    
    #建立字詞ID清單
    word2idx = dict()
    for word in word_dic:
        if word not in word2idx:
            idx = len(word2idx) + 1
            word2idx[word] = idx
    
    #將文本轉為ID型式
    id_mapping = lambda x: word2idx[x]
    processed_corpus = [list(map(id_mapping, sentence)) for sentence in processed_corpus]
    
    return processed_corpus, word2idx

In [3]:
corpus = ['i love nlp', 'hi, nice to meet you', 'deep learning is fun', 'enjoy cupoy nlp course']
labels = [0,0,0,0]
corpus, word2idx = preprocess(corpus)
corpus

[[10, 7, 1], [2, 13, 11, 12, 15], [14, 4, 3, 5], [9, 6, 1, 8]]

In [4]:
class dataset(Dataset):
    '''custom dataset to load reviews and labels
    Parameters
    ----------
    data_pairs: list
        directory of all review-label pairs
    vocab: list
        list of vocabularies
    '''
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        
        return torch.tensor(self.data[idx]), torch.tensor(self.labels[idx])

In [5]:
custom_dst = dataset(corpus, labels)
data_loader = DataLoader(custom_dst, batch_size=2, shuffle=False)
next(iter(data_loader))

RuntimeError: stack expects each tensor to be equal size, but got [3] at entry 0 and [5] at entry 1

In [6]:
from torch.utils.data import Sampler, SequentialSampler, RandomSampler

In [7]:
test_data = [1,5,78,9,68]

#SequentialSampler
print("------SequentialSampler--------")
seq_index = SequentialSampler(test_data)
for x in seq_index:
    print(x)
    
#RandomSampler
print("------RandomSampler--------")
random_index = RandomSampler(test_data)
for x in random_index:
    print(x)

------SequentialSampler--------
0
1
2
3
4
------RandomSampler--------
2
3
4
0
1


In [8]:
corpus = ['i love nlp', 'hi, nice to meet you', 'deep learning is fun', 'enjoy cupoy nlp course']
labels = [0,0,0,0]
corpus, word2idx = preprocess(corpus)
corpus = sorted(corpus, key=lambda x: len(x), reverse=True) #將文本由長到短排列
corpus

[[2, 13, 11, 12, 15], [14, 4, 3, 5], [9, 6, 1, 8], [10, 7, 1]]

In [9]:
# 定義隨機連續的客製化sampler
class RandomSequentialSampler(Sampler):
    
    def __init__(self, data_source, batch_size):
        self.data_source = data_source
        self.batch_size = batch_size
        
    def __iter__(self):
        n_batch = len(self) // self.batch_size #calculate number of batches
        leftover = len(self) % self.batch_size #calculate the remaining part
        index = np.zeros(len(self), dtype=int)
        
        #batch
        for i in range(n_batch):
            # 因最後一組要符合 batch_size 的大小 所以扣掉
            start_idx = random.randint(0, len(self) - self.batch_size)
            batch_idx = start_idx + np.arange(0, self.batch_size)
            index[i*self.batch_size:(i+1)*self.batch_size] = batch_idx
            
        #dealing with leftover
        if leftover:
            tail_start = random.randint(0, len(self) - leftover)
            tail_index = tail_start + np.arange(0, leftover)
            index[-leftover:] = tail_index
            
        return iter(index)
    
    def __len__(self):
        
        return len(self.data_source)

In [10]:
# 定義客製化collate_fn，將長度不一的文本pad 0 變成相同長度
def collate_fn(batch):

    corpus, labels = zip(*batch) 
    
    ### create pads for corpus ###
    lengths = [len(x) for x in corpus]
    max_length = max(lengths)
    
    batch_corpus = []
    
    for i in range(len(corpus)):
        # pad corpus
        tmp_pads = torch.zeros(max_length)
        tmp_pads[:lengths[i]] = corpus[i]
        tmp_pads.view(-1, 1)
        batch_corpus.append(tmp_pads.view(1,-1))

    return torch.cat(batch_corpus,dim=0), torch.tensor(labels) , torch.tensor(lengths)

In [11]:
custom_dst = dataset(corpus, labels)
custom_sampler = RandomSequentialSampler(corpus, 2)
data_loader = DataLoader(custom_dst, sampler=custom_sampler, collate_fn=collate_fn, batch_size=2)
next(iter(data_loader))

(tensor([[14.,  4.,  3.,  5.],
         [ 9.,  6.,  1.,  8.]]),
 tensor([0, 0]),
 tensor([4, 4]))