In [1]:
# text-preprocessing
from matplotlib import pyplot as plt
import torch,random,re,collections,time

In [2]:
# 读取数据集
with open("./data/timemachine.txt", "r") as f:
    lines = f.readlines()
lines = [re.sub("[^A-Za-z]+", " ", line).strip().lower() for line in lines]


# 词元化
def tokenize(strings, token="word"):
    if token == "word":
        return [str.split() for str in strings]
    elif token == "char":
        return [list(str) for str in strings]
    else:
        print("Wrong!")


tokens = tokenize(lines, token="word")
for i in range(11):
    print(tokens[i])

['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
[]
[]
[]
[]
['i']
[]
[]
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']


In [3]:
# 统计词元的频率
def count_corpus(tokens):
    if len(tokens) == 0 or isinstance(tokens[0], list):
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)


# 词表
class Vocabulary:
    def __init__(self, tokens=None, min_freq=0, reversed_tokens=None) -> None:
        if tokens == None:
            tokens = []
        if reversed_tokens == None:
            reversed_tokens = []
        counter = count_corpus(tokens)
        self._token_freqs = sorted(counter.items(), key=self.cmp, reverse=True)
        self.idx_to_token = ["<unk>"] + reversed_tokens
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                break  # 后面的不加入到词表中
            if token not in self.token_to_idx:
                self.token_to_idx[token] = len(self.idx_to_token)
                self.idx_to_token.append(token)

    def cmp(self, x):
        return x[1]

    @property
    def unk(self):
        return 0

    @property
    def token_freqs(self):
        return self._token_freqs

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.to_tokens(indice) for indice in indices]

In [4]:
vocab = Vocabulary(tokens)
print(list(vocab.token_to_idx.items())[:10])
print(vocab['ads'])
for i in [0, 10]:
    print('文本:', tokens[i])
    print('索引:', vocab[tokens[i]])

[('<unk>', 0), ('the', 1), ('i', 2), ('and', 3), ('of', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('that', 9)]
0
文本: ['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
索引: [1, 19, 50, 40, 2183, 2184, 400]
文本: ['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']
索引: [2186, 3, 25, 1044, 362, 113, 7, 1421, 3, 1045, 1]


In [5]:
def load_cropus(max_tokens=-1, file_name="timemachine", pattern="word"):

    with open(f"./data/{file_name}.txt", "r") as f:
        lines = f.readlines()
    lines = [re.sub("[^A-Za-z]+", " ", line).strip().lower() for line in lines]
    tokens = tokenize(lines, 'char')
    vocab = Vocabulary(tokens=tokens, min_freq=0)
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab

In [6]:
# tokens = tokenize(read_time_machine())
# 因为每个文本行不一定是一个句子或一个段落，因此我们把所有文本行拼接到一起
corpus = [token for line in tokens for token in line]
vocab = Vocabulary(corpus)
vocab.token_freqs[:10]

[('the', 2261),
 ('i', 1267),
 ('and', 1245),
 ('of', 1155),
 ('a', 816),
 ('to', 695),
 ('was', 552),
 ('in', 541),
 ('that', 443),
 ('my', 440)]

In [7]:
corpus, vocab = load_cropus(pattern='word')
len(corpus), len(vocab)

(170580, 28)

In [8]:
def seq_data_iter_random(corpus, batch_size, num_steps, random_offset="True"):
    if random_offset:
        a = random.randint(0, num_steps - 1)
        # 如果偏移量大于时间步x，则偏移量中的x可以作为新的时间步
    else:
        a = 0
    print(f'random offset: {a}')
    # update corpus
    corpus = corpus[a:]
    num_subseqs = (len(corpus) - 1) // num_steps
    initial_indices=[i*num_steps for i in range(num_subseqs)]
    random.shuffle(initial_indices)
    num_batches=num_subseqs//batch_size
    for i in range(0,num_batches*batch_size,batch_size):
        initial_indices_per_batch=initial_indices[i:i+batch_size]
        X=[corpus[j:j+num_steps]for j in initial_indices_per_batch]
        Y=[corpus[j+1:j+num_steps+1]for j in initial_indices_per_batch]
        yield torch.tensor(X),torch.tensor(Y)


In [9]:
my_seq = list(range(35))
for X, Y in seq_data_iter_random(my_seq, batch_size=2, num_steps=5):
    print('X: ', X, '\nY:', Y)

random offset: 2
X:  tensor([[12, 13, 14, 15, 16],
        [ 7,  8,  9, 10, 11]]) 
Y: tensor([[13, 14, 15, 16, 17],
        [ 8,  9, 10, 11, 12]])
X:  tensor([[27, 28, 29, 30, 31],
        [ 2,  3,  4,  5,  6]]) 
Y: tensor([[28, 29, 30, 31, 32],
        [ 3,  4,  5,  6,  7]])
X:  tensor([[17, 18, 19, 20, 21],
        [22, 23, 24, 25, 26]]) 
Y: tensor([[18, 19, 20, 21, 22],
        [23, 24, 25, 26, 27]])


In [10]:
def seq_data_iter_sequential(corpus, batch_size, num_steps):  # @save
    offset = random.randint(0, num_steps - 1)
    offset = 0
    corpus = corpus[offset:]
    num_tokens = ((len(corpus) - 1 )// batch_size) * batch_size
    Xs = torch.tensor(corpus[0:num_tokens]).reshape(batch_size, -1)
    Ys = torch.tensor(corpus[1 : num_tokens + 1]).reshape(batch_size, -1)
    num_batches = num_tokens // (batch_size * num_steps)
    for i in range(0, num_batches):
        X = Xs[:, i * num_steps : (i + 1) * num_steps]
        Y = Ys[:, i * num_steps : (i + 1) * num_steps]
        yield X, Y


for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    print("X: ", X, "\nY:", Y)

X:  tensor([[ 0,  1,  2,  3,  4],
        [17, 18, 19, 20, 21]]) 
Y: tensor([[ 1,  2,  3,  4,  5],
        [18, 19, 20, 21, 22]])
X:  tensor([[ 5,  6,  7,  8,  9],
        [22, 23, 24, 25, 26]]) 
Y: tensor([[ 6,  7,  8,  9, 10],
        [23, 24, 25, 26, 27]])
X:  tensor([[10, 11, 12, 13, 14],
        [27, 28, 29, 30, 31]]) 
Y: tensor([[11, 12, 13, 14, 15],
        [28, 29, 30, 31, 32]])


In [11]:
class SeqDataLoader:
    def __init__(
        self, batch_size, num_steps, use_random_iter, max_tokens, pattern="word"
    ) -> None:
        if use_random_iter:
            self.data_iter_fn = seq_data_iter_random
        else:
            self.data_iter_fn = seq_data_iter_sequential
        self.corpus, self.vocab = load_cropus(max_tokens, pattern=pattern)
        self.batch_size, self.num_steps = batch_size, num_steps

    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)

In [12]:
def load_data_time_machine(
    batch_size, num_steps, use_random_iter=False, max_tokens=10000
):
    data_iter = SeqDataLoader(
        batch_size,
        num_steps,
        use_random_iter=use_random_iter,
        max_tokens=max_tokens,
    )
    return data_iter, data_iter.vocab

In [13]:
# RNN
%matplotlib inline
import math,torch
from torch import nn
from torch.nn import functional as F

batch_size,num_steps=32,35
train_iter,vocab=load_data_time_machine(batch_size,num_steps)

In [14]:
print(len(vocab))

28


In [15]:
def get_params(vocab_size, num_hiddens, device):
    num_inputs,num_outputs=vocab_size,vocab_size

    W_xh=torch.randn((num_inputs,num_hiddens),device=device)
    W_hh=torch.randn((num_hiddens,num_hiddens),device=device)
    b_h=torch.zeros(num_hiddens,device=device)
    W_hq=torch.randn((num_hiddens,num_outputs),device=device)
    b_q=torch.zeros(num_outputs,device=device)
    params=[W_xh,W_hh,b_h,W_hq,b_q]
    for param in params:
        param.requires_grad=True
    return params

In [16]:
def init_rnn_state(batch_size,num_hiddens,device):
    return torch.zeros((batch_size,num_hiddens),device=device)
    # 加逗号是使用元组，没有逗号会被当做是(a)=a的表达式
def rnn(inputs, state, params):
    W_xh, W_hh, b_h, W_hq, b_q = params
    H = state
    outputs = []
    for x in inputs:
        h = torch.tanh(torch.matmul(x, W_xh) + torch.matmul(H, W_hh) + b_h)
        o = torch.matmul(h, W_hq) + b_q
        outputs.append(o)
    # o的形状：(num_step*batch_size，词表大小)
    return torch.cat(outputs, dim=0), H

In [17]:
def try_gpu(i=0):
    if torch.cuda.device_count() >= i + 1:
        return torch.device(f"cuda:{i}")
    return torch.device("cpu")

print(try_gpu())

cuda:0


In [18]:
class RNNModelScratch:
    def __init__(
        self, vocab_size, num_hiddens, device, get_params, init_state, forward_fn
    ):
        self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
        self.params = get_params(self.vocab_size, self.num_hiddens, device)
        self.init_state = init_state
        self.forward_fn = forward_fn

    def __call__(self, X, state):
        X = F.one_hot(X.T, self.vocab_size).type(torch.float32)
        return self.forward_fn(X, state, self.params)

    def begin_state(self, batch_size, device):
        return self.init_state(batch_size, self.num_hiddens, device)

In [19]:
X = torch.arange(10).reshape((2, 5))
F.one_hot(X.T, 28).shape

torch.Size([5, 2, 28])

In [20]:
num_hiddens = 512
net = RNNModelScratch(len(vocab), num_hiddens, try_gpu(), get_params,
                      init_rnn_state, rnn)
state = net.begin_state(X.shape[0],try_gpu())
Y, new_state = net(X.to(try_gpu()), state)
Y.shape, len(new_state), new_state[0].shape,net.vocab_size

(torch.Size([10, 28]), 2, torch.Size([512]), 28)

In [21]:
def predict(prefix,num_preds,net,vocab,device):
    outputs=[vocab[prefix[0]]]
    state=net.begin_state(batch_size=1,device=device)
    for y in prefix[1:]:
        _,state=net(torch.tensor([outputs[-1]],device=device).reshape((1,1)),state)
        outputs.append(vocab[y])
    for _ in range(num_preds):
        y,state=net(torch.tensor([outputs[-1]],device=device).reshape((1,1)),state)
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    # return ''.join(*vocab.to_tokens([outputs]))
    return ''.join([vocab.idx_to_token[i] for i in outputs])

In [22]:
print(predict('time traveller ', 30, net, vocab,try_gpu()))

time traveller k<unk>ssssssssssssssssssssssssssss


In [23]:
def grad_clipping(net, theta):
    if isinstance(net, nn.Module):
        params = [p for p in net.parameters() if p.requires_grad]
    else:
        params = net.params
    norm = torch.sqrt(sum(torch.sum((p.grad ** 2)) for p in params))
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm

In [24]:
def sgd(params, lr, batch_size):
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()

In [25]:
def train_epoch(net, train_iter, loss, updater, device, use_random_iter):
    state=None
    time_start=time.time()
    for X,Y in train_iter:
        if state is None or use_random_iter:
            state=net.begin_state(batch_size=X.shape[0],device=device)
        else:
            if isinstance(net,nn.Module()) and not isinstance(state,tuple):
                state.detach_()
            else:
                for s in state:
                    s.detach_()
        y=Y.T.reshape(-1)
        X,y=X.to(device),y.to(device)
        y_hat,state=net(X,state)
        l=loss(y_hat,y.long()).mean()
        if isinstance(updater,torch.optim.Optimizer):
            updater.zero_grad()
            l.backward()
            grad_clipping(net,1)
            updater.step()
        else:
            l.backward()
            grad_clipping(net, 1)
            updater(batch_size=1)
        # total_loss=l*y.numel()
        time_use=time.time()-time_start
        return math.exp(l),y.numel()/time_use

In [35]:
def train(net, train_iter, vocab, lr, num_epochs, device, use_random_iter=False):
    loss = nn.CrossEntropyLoss()
    updater=torch.optim.SGD(net.parameters(),lr)
    # updater = lambda batch_size: sgd(net.params, lr, batch_size)

    for epoch in range(num_epochs):
        ppl, speed = train_epoch(
            net, train_iter, loss, updater, device, use_random_iter
        )
        if (epoch + 1) % 10 == 0:
            print(predict("time traveller", 50, net, vocab, device))
            print(f"epoch {epoch+1}, ppl {ppl}")
    print(f"困惑度 {ppl:.1f}, {speed:.1f} 词元/秒 {str(device)}")
    print(predict("time traveller", 50, net, vocab, device))
    print(predict("traveller", 50, net, vocab, device))

In [27]:
try_gpu()

device(type='cuda', index=0)

In [28]:
len(vocab)

28

In [29]:
# num_epochs, lr = 500, 1
# train(net, train_iter, vocab, lr, num_epochs, try_gpu())

In [30]:
# RNN concise
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l

batch_size, num_steps = 32, 35
train_iter, vocab = load_data_time_machine(batch_size, num_steps)
num_hiddens=256
rnn_layer=nn.RNN(len(vocab),num_hiddens)
state=torch.zeros((1,batch_size,num_hiddens))

In [31]:
class RNNModel(nn.Module):
    def __init__(self, rnn_layer, vocab_size) -> None:
        super(RNNModel, self).__init__()
        self.rnn = rnn_layer
        self.vocab_size = vocab_size
        self.num_hiddens = self.rnn.hidden_size
        if not self.rnn.bidirectional:
            self.num_directions = 1
            self.linear = nn.Linear(self.num_hiddens, self.vocab_size)
        else:
            self.num_directions = 2
            self.linear = nn.Linear(self.num_hiddens * 2, self.vocab_size)

    def forward(self, inputs, state):
        X = F.one_hot(inputs.T.long(), self.vocab_size)
        X = X.to(torch.float32)
        Y, state = self.rnn(X, state)
        output = self.linear(Y.reshape((-1, Y.shape[-1])))
        return output, state

    def begin_state(self, device, batch_size=1):
        if not isinstance(self.rnn, nn.LSTM):
            return torch.zeros(
                (
                    self.num_directions * self.rnn.num_layers,
                    batch_size,
                    self.num_hiddens,
                ),
                device=device,
            )
        else:
            return (
                torch.zeros(
                    (
                        self.num_directions * self.rnn.num_layers,
                        batch_size,
                        self.num_hiddens,
                    ),
                    device=device,
                ),
                torch.zeros(
                    (
                        self.num_directions * self.rnn.num_layers,
                        batch_size,
                        self.num_hiddens,
                    ),
                    device=device,
                ),
            )

In [32]:
net = RNNModel(rnn_layer, vocab_size=len(vocab))
net = net.to(try_gpu())
predict("time traveller", 10, net, vocab, try_gpu())

'time travellereyyyyyyyyy'

In [36]:
num_epochs, lr = 500, 1
train(net, train_iter, vocab, lr, num_epochs, try_gpu())

time traveller                                                  
epoch 10, ppl 18.564595587381607
time traveller                                                  
epoch 20, ppl 16.91033488729284
time traveller th  a   a  a   a  a   a  a   a  a   a  a   a  a  
epoch 30, ppl 14.501900209752568
time traveller ao  an  an  an  an  an  an  an  an  an  an  an  a
epoch 40, ppl 12.559301541730122
time traveller ao  ae the  ao  ae the  ao  ae the  ao  ae the  a
epoch 50, ppl 11.240032706620338
time traveller an in the the the the the the the the the the the
epoch 60, ppl 10.00452175802084
time traveller the the the t the the the the the the the the the
epoch 70, ppl 8.966000684356063
time traveller ar he tant the  ar ant the  ar ane the  ar se tan
epoch 80, ppl 8.55381858946517
time traveller an ho s io s an in s an he he the he the s than s
epoch 90, ppl 7.409951022602628
time traveller ae wer ae sar an in the the and ane the  al ine t
epoch 100, ppl 6.919818847756392
time traveller the shes ne