In [22]:
import torch
from torchtext import data

In [23]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# LABEL: a non-sequential field that is used without a vocabulary.
LABEL = data.Field(sequential=False, use_vocab=False)
# TEXT: a sequential field whose text is lower-cased.
TEXT = data.Field(sequential=True, lower=True)

# Load the train/validation CSV files found under 'test_text'; the header row
# is skipped. Each entry of `fields` pairs a column name with the Field that
# handles it; 'PhraseId' and 'SentenceId' are paired with None rather than a Field.
train, val = data.TabularDataset.splits(
    path='test_text',
    train='train.csv',
    validation='val.csv',
    format='csv',
    skip_header=True,
    fields=[
        ('PhraseId', None),
        ('SentenceId', None),
        ('Phrase', TEXT),
        ('Sentiment', LABEL),
    ],
)

# Build the TEXT vocabulary from the training split, requesting the
# 'glove.6B.100d' vectors with 'vector_cache/' as the vectors cache location.
TEXT.build_vocab(
    train,
    vectors='glove.6B.100d',
    vectors_cache='vector_cache/',
)

### 与Iterator的<font color='red'>唯一</font>区别,如下:(其他<font color='red'>完全</font>相同)

In [24]:
# BucketIterator: "Defines an iterator that batches examples of similar lengths together."
# In text processing, each batch usually has to be padded to the length of the
# longest sequence in that batch (fix_length=None). When sample lengths differ a
# lot, using BucketIterator therefore makes padding more efficient.
#
# BucketIterator inherits from Iterator; it can also be constructed via splits.
train_iterator_com = data.BucketIterator(
    dataset=train,
    batch_size=12,
    device=device,
    sort_key=lambda example: len(example.Phrase),
    sort_within_batch=True,
    shuffle=True,  # choose whether to shuffle depending on your needs
)

# Only the first batch is inspected; the loop exits right after printing it.
for batch in train_iterator_com:
    # Printed in order, each followed by a blank line: the batch type, the batch
    # summary, the Phrase tensor (samples within this batch have almost exactly
    # the same length), and the Phrase tensor's shape.
    print(type(batch), batch, batch.Phrase, batch.Phrase.shape, sep='\n\n', end='\n\n')
    print(batch.Sentiment)
    break

<class 'torchtext.data.batch.Batch'>


[torchtext.data.batch.Batch of size 12]
	[.Phrase]:[torch.LongTensor of size 13x12]
	[.Sentiment]:[torch.LongTensor of size 12]

tensor([[   13,   143, 10316,    11,    66,   256,  1087,    29,    60,     5,
           231,    23],
        [    9,   104,  1486,     4,   147,    26,    22,  3839,   248,   130,
            44,    14],
        [  184,   154,    11,    18,   228,   137,   867,  3764,     2,    48,
           150,   406],
        [   33,   241,     4,    12,    11,    27, 14767,   158,    20,  1439,
           188,    14],
        [  365,    29,  2172,   267,   431,  9198,     6,    30, 14711,    16,
            13,    21],
        [   35,   949,  3554,     4,     5,     5, 10678,    48,   107,    13,
            72,    32],
        [10495,   129,     3,   573,     4,    38,    16,    85,    17,     6,
            33,    11],
        [   92,    44,     6,    10,  2705,   130,    17,    11,  1010,   130,
           536,    10],
        

