In [1]:
import re
import time
from collections import Counter

import numpy as np
import torch
import torchtext

In [2]:
# Character class removed from every token: dashes, quotes, symbols,
# ASCII punctuation and the digits 0-9.
_STRIP_CHARS = re.compile(r'[—–’°!"#$£￥%&\'()*+,\-./:;=?@[\\\]^_`{|}~0-9]')


def _strip_symbols(token):
    """Return `token` with every character matched by `_STRIP_CHARS` deleted."""
    return _STRIP_CHARS.sub('', token)


# Sequential, lower-cased text field without a padding token. Each token is
# passed through `_strip_symbols`; the empty string and the '<eos>' / '<unk>'
# markers are registered as stop words.
TEXT = torchtext.data.Field(
    sequential=True,
    lower=True,
    preprocessing=torchtext.data.Pipeline(_strip_symbols),
    pad_token=None,
    stop_words=['', '<eos>', '<unk>'])

In [3]:
# Load WikiText-103 with the validation and test splits disabled, so exactly
# one split comes back (presumably the training split -- the one-element
# unpacking below relies on that). The wall-clock load time is reported.
start = time.time()
(data_set,) = torchtext.datasets.WikiText103.splits(
    TEXT,
    root='../data/wikitext-103',
    validation=None,
    test=None,
)
print(f'Loaded {len(data_set[0].text)} tokens in {time.time() - start:.1f}s')

Loaded 83083869 tokens in 207.0s


In [4]:
# Build the vocabulary from the loaded split with a minimum token frequency
# of 10. An additional max_size=60000 cap is intentionally left disabled.
TEXT.build_vocab(data_set, min_freq=10)

In [5]:
# Sanity-check the vocabulary: report its size, then look up a handful of
# words (word -> index) and a handful of indices (index -> word).
print(f'Created vocab with length: {len(TEXT.vocab)}')

probe_words = ['queen', 'king', '2011', '戦場のヴァルキュリア3', '<eos>', 'valkyria', 'chronicles', 'iii']
for word in probe_words:
    print(f' - index of "{word}": {TEXT.vocab.stoi[word]}')

probe_indices = [0, 1, 2, 15]
for i in probe_indices:
    print(f' - word with index "{i}": {TEXT.vocab.itos[i]}')

Created vocab with length: 111297
 - index of "queen": 815
 - index of "king": 210
 - index of "2011": 0
 - index of "戦場のヴァルキュリア3": 0
 - index of "<eos>": 0
 - index of "valkyria": 43903
 - index of "chronicles": 6763
 - index of "iii": 1091
 - word with index "0": <unk>
 - word with index "1": the
 - word with index "2": of
 - word with index "15": is


In [6]:
# Take the first four tokens of the corpus and run them through the field's
# `process` step to see the resulting index tensor and its shape.
x1 = data_set[0].text[:4]
print(x1)
x2 = TEXT.process([x1])
print(x2.shape)
print(x2)

['valkyria', 'chronicles', 'iii', 'senjō']
torch.Size([4, 1])
tensor([[43903],
        [ 6763],
        [ 1091],
        [    0]])


In [7]:
class SkipGramDataset(torch.utils.data.Dataset):
    """Token stream plus the frequency tables needed for skip-gram training.

    Construction derives two tables from the vocabulary frequencies:

    * ``negatives`` -- a shuffled array of vocabulary indices in which index
      ``i`` occurs ``round(table_size * f_i**power / sum_j f_j**power)``
      times (so its length is only approximately ``table_size``).
    * ``discards`` -- ``{index: p}`` with ``p = sqrt(t / r) + t / r``, where
      ``r`` is the index's frequency divided by ``len(data)`` and ``t`` is
      ``threshold``. Only entries with ``p < 1`` are stored. Presumably ``p``
      is the keep-probability of word2vec-style subsampling; nothing in this
      class consumes it yet.

    Args:
        data: Sequence of tokens; only its length is used at construction.
        vocab: Object exposing ``freqs`` (word -> count mapping) and ``stoi``
            (word -> index mapping).
        table_size: Target length of the negatives table (default 1e8).
        power: Exponent applied to the raw counts before normalising
            (default 0.75).
        threshold: Subsampling threshold ``t`` (default 0.001).
    """

    def __init__(self, data, vocab, table_size=100_000_000, power=0.75,
                 threshold=0.001):
        self.data = data
        self.vocab = vocab

        # Aggregate word counts per vocabulary index. Several words may share
        # an index, in which case their counts are summed.
        # NOTE(review): if `stoi` is a defaultdict, this lookup inserts every
        # missing word into it -- confirm whether that side effect matters.
        freqs = Counter()
        for word, freq in self.vocab.freqs.items():
            freqs[self.vocab.stoi[word]] += freq

        # Sort once and reuse for both tables.
        indices = np.array(sorted(freqs), dtype=np.int64)
        counts = np.array([freqs[i] for i in indices.tolist()],
                          dtype=np.float64)

        # Build negatives table: repeat each index proportionally to its
        # smoothed count. np.repeat creates the array directly instead of
        # going through a Python list with ~table_size elements.
        weights = counts ** power
        slots = (table_size * weights / weights.sum()).round().astype(np.int64)
        self.negatives = np.repeat(indices, slots)
        np.random.shuffle(self.negatives)

        # Build sampling table from frequencies relative to the corpus length.
        rel_freq = counts / len(self.data)
        keep = np.sqrt(threshold / rel_freq) + threshold / rel_freq
        self.discards = {
            i: p for i, p in zip(indices.tolist(), keep) if p < 1
        }

    def __len__(self):
        """Return the number of tokens in ``data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Placeholder: sample retrieval is not implemented; returns None."""
        # TODO: produce the (center, context, negatives) sample for `idx`.
        pass

In [8]:
# Wrap the full token list of the loaded split; the constructor builds the
# negatives and sampling tables immediately.
a = SkipGramDataset(data_set[0].text, TEXT.vocab)
# Displayed value: number of tokens (SkipGramDataset.__len__ returns len(data)).
len(a)

83083869