In [1]:
import re
import time

import torchtext

In [2]:
def preprocess(text):
    """Return ``text`` with ASCII punctuation and the digits 0-9 deleted.

    The angle brackets ``<`` and ``>`` are not part of the removed set, so
    tokens such as ``<eos>`` and ``<unk>`` pass through unchanged. Characters
    outside the listed ASCII set (e.g. accented or CJK letters) are kept.
    """
    unwanted = re.compile(r'[!"#$%&\'()*+,\-./:;=?@[\\\]^_`{|}~0-9]')
    return unwanted.sub('', text)

In [3]:
# Field describing how raw text is turned into tokens: the input is treated
# as a sequence, lower-cased, and run through a Pipeline wrapping `preprocess`.
# stop_words=[''] presumably exists to discard tokens that end up empty after
# cleaning -- TODO confirm against the installed torchtext version.
TEXT = torchtext.data.Field(
    sequential=True,
    lower=True,
    preprocessing=torchtext.data.Pipeline(preprocess),
    stop_words=[''],
)

In [4]:
# Load only the WikiText-103 training split (validation and test are turned
# off) and report how long the load took in wall-clock seconds.
start = time.time()
(train_set,) = torchtext.datasets.WikiText103.splits(
    TEXT,
    root='../data/wikitext-103',
    validation=None,
    test=None,
)
print(f'Loaded data in {time.time() - start:.1f}s')

Loaded data in 202.2s


In [5]:
# Token count of the first example, followed by its first 100 tokens.
print(len(train_set[0].text), train_set[0].text[:100], sep='\n')

85620355
['<eos>', 'valkyria', 'chronicles', 'iii', '<eos>', '<eos>', 'senjō', 'no', 'valkyria', '<unk>', 'chronicles', 'japanese', '戦場のヴァルキュリア', 'lit', 'valkyria', 'of', 'the', 'battlefield', 'commonly', 'referred', 'to', 'as', 'valkyria', 'chronicles', 'iii', 'outside', 'japan', 'is', 'a', 'tactical', 'role', 'playing', 'video', 'game', 'developed', 'by', 'sega', 'and', 'mediavision', 'for', 'the', 'playstation', 'portable', 'released', 'in', 'january', 'in', 'japan', 'it', 'is', 'the', 'third', 'game', 'in', 'the', 'valkyria', 'series', 'employing', 'the', 'same', 'fusion', 'of', 'tactical', 'and', 'real', 'time', 'gameplay', 'as', 'its', 'predecessors', 'the', 'story', 'runs', 'parallel', 'to', 'the', 'first', 'game', 'and', 'follows', 'the', 'nameless', 'a', 'penal', 'military', 'unit', 'serving', 'the', 'nation', 'of', 'gallia', 'during', 'the', 'second', 'europan', 'war', 'who', 'perform', 'secret', 'black']


In [6]:
# Build the vocabulary from the training split with min_freq=10
# (an optional max_size=60000 cap was tried and is currently disabled).
TEXT.build_vocab(train_set, min_freq=10)

In [7]:
# Report the vocabulary size, then spot-check lookups in both directions.
print(f'Created vocab with length: {len(TEXT.vocab)}')

# String -> index. '2011' and '戦場のヴァルキュリア3' contain digits, which
# `preprocess` strips, so they act as probes for tokens absent from the vocab.
for word in (
    'queen', 'king',
    '2011', '戦場のヴァルキュリア3',
    '<eos>', 'valkyria', 'chronicles', 'iii',
):
    print(f' - index of "{word}": {TEXT.vocab.stoi[word]}')

# Index -> string for a handful of indices.
for i in (0, 1, 2, 15):
    print(f' - word with index "{i}": {TEXT.vocab.itos[i]}')

Created vocab with length: 111305
 - index of "queen": 822
 - index of "king": 214
 - index of "2011": 0
 - index of "戦場のヴァルキュリア3": 0
 - index of "<eos>": 7
 - index of "valkyria": 43910
 - index of "chronicles": 6770
 - index of "iii": 1098
 - word with index "0": <unk>
 - word with index "1": <pad>
 - word with index "2": the
 - word with index "15": with


In [8]:
# Exploratory: list the attribute and method names available on the Field object.
print(dir(TEXT))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'batch_first', 'build_vocab', 'dtype', 'dtypes', 'eos_token', 'fix_length', 'include_lengths', 'init_token', 'is_target', 'lower', 'numericalize', 'pad', 'pad_first', 'pad_token', 'postprocessing', 'preprocess', 'preprocessing', 'process', 'sequential', 'stop_words', 'tokenize', 'truncate_first', 'unk_token', 'use_vocab', 'vocab', 'vocab_cls']


In [9]:
# Numericalize the first four tokens as a batch containing a single example.
x1 = train_set[0].text[:4]
print(x1)
x2 = TEXT.process([x1])
# Show the resulting tensor's size and then its contents.
print(x2.size(), x2, sep='\n')

['<eos>', 'valkyria', 'chronicles', 'iii']
torch.Size([4, 1])
tensor([[    7],
        [43910],
        [ 6770],
        [ 1098]])
