In [1]:
import torch
from torchtext.datasets import LanguageModelingDataset
import torchtext.data as data

In [2]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Lower-cased sequential text field shared by the dataset and the vocabulary.
TEXT = data.Field(lower=True, sequential=True)

# Load the article file as a language-modeling dataset over TEXT.
article = LanguageModelingDataset(
    path='test_text/english.txt',
    text_field=TEXT,
    encoding='utf-8',
)

# Build the vocabulary from the article and attach the GloVe vectors,
# using 'vector_cache/' as the vectors cache directory.
TEXT.build_vocab(
    article,
    vectors='glove.6B.100d',
    vectors_cache='vector_cache/',
)

In [3]:
# Peek at the first ten tokens of the (single) example held by the dataset.
article.examples[0].text[:10]

['the',
 'positive',
 'meanings',
 'of',
 'love',
 '<eos>',
 "we'd",
 'like',
 'to',
 'share']

In [4]:
# BPTTIterator inherits from Iterator; it can also be built via `splits`
# (easy to follow from the library source).
# Defines an iterator for language modeling tasks that use BPTT.
# Expects a Dataset with a single example and a single field called 'text'.
# Using only the arguments below is recommended (see its __iter__ method).
train_iterator_com = data.BPTTIterator(
    dataset=article,  # LanguageModelingDataset yields a single example with a single field named 'text'
    batch_size=10,
    bptt_len=2,  # Length of sequences for backpropagation through time.
    device=device,
)
train_iterator_com

<torchtext.data.iterator.BPTTIterator at 0x1ee33104fa0>

In [5]:
# Inspect only the first batch; test_list keeps its text tensor for later cells
# (and stays None if the iterator yields nothing).
test_list = None
for batch in train_iterator_com:
    test_list = batch.text
    # Batch type, then the batch summary, each followed by a blank line.
    print(type(batch), batch, sep='\n\n', end='\n\n')
    # Input tokens, then the targets: the word one position after batch.text.
    print(test_list, batch.target, sep='\n')
    break

<class 'torchtext.data.batch.Batch'>


[torchtext.data.batch.Batch of size 10]
	[.text]:[torch.cuda.LongTensor of size 2x10 (GPU 0)]
	[.target]:[torch.cuda.LongTensor of size 2x10 (GPU 0)]

tensor([[  4,   2,   4,  27,  33,  18,  23,  10, 149, 293],
        [ 90,  46, 164,  17, 111,  47,  77,  56,  58,   4]], device='cuda:0')
tensor([[ 90,  46, 164,  17, 111,  47,  77,  56,  58,   4],
        [ 85,  44,   8,   2,   8,   5, 171,   6,  14, 272]], device='cuda:0')


In [6]:
# Take one batch from a fresh iterator and decode column 1 back into words.
it = iter(train_iterator_com)
batch = next(it)
print(" ".join(TEXT.vocab.itos[token_id] for token_id in batch.text[:, 1].data))
# The target column holds the next word for each position of batch.text.
print(" ".join(TEXT.vocab.itos[token_id] for token_id in batch.target[:, 1].data))

i care
care about


In [7]:
def find_index(lst, find):
    """Return the index of the first occurrence of ``find`` inside ``lst``.

    Args:
        lst: Indexable sequence to search in.
        find: Indexable sequence to look for as a contiguous run of elements.

    Returns:
        The smallest ``i`` such that ``lst[i + s] == find[s]`` for every
        position ``s`` of ``find``, or ``None`` when no such run exists
        (including when ``find`` is longer than ``lst``). An empty ``find``
        matches at index 0.
    """
    lst_len = len(lst)
    find_len = len(find)
    # Only try start positions that leave room for the whole pattern. The
    # range is empty when the pattern is longer than the sequence, so no
    # out-of-range element is ever read.
    for i in range(lst_len - find_len + 1):
        # all() stops at the first mismatching element.
        if all(lst[i + s] == find[s] for s in range(find_len)):
            return i
    return None

In [8]:
# Map every token of the article to its vocabulary id.
id_list = [TEXT.vocab.stoi[token] for token in article.examples[0].text]

# For each column of the first batch, the first position in id_list at which
# that column's ids occur. Columns are read from the tensor itself (transpose,
# then one list per column) instead of a hard-coded count of 10, so this
# stays in sync with the iterator's batch_size.
inter = [find_index(id_list, column) for column in test_list.t().tolist()]

#### 通过测试可以看出:
1. ★★★★★bptt_len决定每个句子的长度(即单词个数)
2. ★★★★★每个句子的起始位置是间隔长度的整数倍,其中间隔长度为 $ \mathrm{interval} = \lceil \text{单词数量} / \mathrm{batch\_size} \rceil $:
    * 第一个句子的起始位置: $  \mathrm{interval} \cdot 0    $
    * 第二个句子的起始位置: $  \mathrm{interval} \cdot 1    $
    * 第三个句子的起始位置: $  \mathrm{interval} \cdot 2    $
    * $\cdots$
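3. 输出的数字为对应单词在单词表中的id
4. 注意:`find_index` 返回的是该词序列在全文中**第一次**出现的位置,因此当某一列开头的词组在更早的位置也出现过时(如下方输出中的 190、109、633),得到的索引会小于理论起始位置(分别为 276、460、644)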

In [9]:
# Text length in tokens, followed by the located start index of each batch column.
print(len(id_list), inter, sep='\n')


920
[0, 92, 184, 190, 368, 109, 552, 633, 736, 828]


In [10]:
# Number of batches per pass over the iterator.
# NOTE(review): torchtext appears to compute this as
# ceil((len(text) / batch_size - 1) / bptt_len) — confirm against BPTTIterator.__len__.
len(train_iterator_com) # Equals: interval length / bptt_len (here 92 / 2 = 46)

46