In [1]:
import math
import torch
from torch import nn
from torch.nn import functional as F
import collections
import random
import matplotlib.pyplot as plt
import re
%matplotlib auto

Using matplotlib backend: Qt5Agg


In [1]:
file_path=r'F:\study\ml\LM\8\timemachine.txt'
def read_time_machine():
    with open(file_path) as f:
        lines=f.readlines()
    return [re.sub('[^A-Za-z]+',' ',line).strip().lower() for line in lines]

In [2]:
def tokenize(lines,token='word'):
    if token=='word':
        return [line.split() for line in lines]
    elif token=='char':
        return [list(line) for line in lines]
    else:
        print("error : unknown token type : ",token)

In [4]:
class Vocab:
    def __init__(self,tokens=None,min_freq=0,reserved_tokens=None):
        if tokens is None:
            tokens=[]
        if reserved_tokens is None:
            reserved_tokens=[]
        counter=count_corpus(tokens)
        self._token_freqs=sorted(counter.items(),key=lambda x:x[1],reverse=True)
        self.idx_to_token=['<unk>']+reserved_tokens
        self.token_to_idx={token:idx for idx,token in enumerate(self.idx_to_token)}
        self.idx_to_token,self.token_to_idx=[],dict()
        for token,freq in self._token_freqs:
            if freq<min_freq:
                break;
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token]=len(self.idx_to_token)-1
    
    def __len__(self):
        return len(self.idx_to_token)
    
    def __getitem__(self,tokens):
        if not isinstance(tokens,(list,tuple)):
            return self.token_to_idx.get(tokens,self.unk)
        return [self.__getitem__(token) for token in tokens]
    
    def to_tokens(self,indices):
        if not isinstance(indices,(list,tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]
    
    @property
    def unk(self):
        return 0;
    
    @property
    def token_freqs(self):
        return self._token_freqs;
    

In [5]:
def count_corpus(tokens):
    if len(tokens)==0 or isinstance(tokens[0],list):
        tokens=[token for line in tokens for token in line]
    return collections.Counter(tokens)

In [6]:
tokens=tokenize(read_time_machine())
corpus=[token for line in tokens for token in line]
vocab=Vocab(corpus)

In [7]:
def load_corpus_time_machine(max_tokens=-1):
    lines=read_time_machine()
    tokens=tokenize(lines,'char')
    vocab=Vocab(tokens)
    corpus=[vocab[token] for line in tokens for token in line]
    if max_tokens>0:
        corpus=corpus[:max_tokens]
    return corpus,vocab

In [8]:
corpus,vocab=load_corpus_time_machine()
len(corpus),len(vocab)

(170580, 27)

In [9]:
def seq_data_iter_random(corpus,batch_size,num_steps):
    corpus=corpus[random.randint(0,num_step-1):]
    num_subseqs=(len(corpus)-1)//num_steps
    initial_indices=list(range(0,num_subseqs*num_steps,num_steps))
    random.shuffle(initial_indices)

    def data(pos):
        return corpus[pos:pos+num_steps]
    
    num_batches=num_subseqs // batch_size
    for i in range(0,batch_size*num_batches,batch_size):
        initial_indices_per_batch=initial_indices[i:i+batch_size]
        X=[data(j) for j in initial_indices_per_batch]
        Y=[data(j+1) for j in initial_indices_per_batch]
        yield torch.tensor(X),torch.tensor(Y)
    

In [10]:
def seq_data_iter_sequential(corpus,batch_size,num_steps):
    offset=random.randint(0,num_steps)
    num_tokens=((len(corpus)-offset -1) //batch_size)*batch_size
    Xs=torch.tensor(corpus[offset:offset+num_tokens])
    Ys=torch.tensor(corpus[offset+1:offset+1+num_tokens])
    Xs,Ys=Xs.reshape(batch_size,-1),Ys.reshape(batch_size,-1)
    num_batches=Xs.shape[1]//num_steps
    for i in range(0,num_steps*num_batches,num_steps):
        X=Xs[:,i:i+num_steps]
        Y=Ys[:,i:i+num_steps]
        yield X,Y

In [11]:
class SeqDataLoader:
    def __init__(self,batch_size,num_steps,use_random_iter,max_tokens):
        if use_random_iter:
            self.data_iter_fn=seq_data_iter_random
        else:
            self.data_iter_fn=seq_data_iter_sequential
        self.corpus,self.vocab=load_corpus_time_machine(max_tokens)
        self.batch_size,self.num_steps=batch_size,num_steps
        
    def __iter__(self):
        return self.data_iter_fn(self.corpus,self.batch_size,self.num_steps)

In [12]:
def load_data_time_machine(batch_size,num_steps,use_random_iter=False,max_tokens=10000):
    data_iter=SeqDataLoader(batch_size,num_steps,use_random_iter,max_tokens)
    return data_iter,data_iter.vocab

In [13]:
batch_size,num_steps=32,35
train_iter,vocab=load_data_time_machine(batch_size,num_steps)

In [14]:
num_hiddens=256
rnn_layer=nn.RNN(len(vocab),num_hiddens)

In [15]:
state=torch.zeros((1,batch_size,num_hiddens))
state.shape

torch.Size([1, 32, 256])

In [16]:
X=torch.rand(size=(num_steps,batch_size,len(vocab)))
Y,state_new=rnn_layer(X,state)
Y.shape,state_new.shape

(torch.Size([35, 32, 256]), torch.Size([1, 32, 256]))

In [17]:
rnn_layer.hidden_size

256

In [18]:
rnn_layer.num_layers

1

In [19]:
class RNNModule(nn.Module):
    def __init__(self,rnn_layer,vocab_size,**kwargs):
        super().__init__(**kwargs)
        self.rnn=rnn_layer
        self.vocab_size=vocab_size
        self.num_hiddens=self.rnn.hidden_size
        if not self.rnn.bidirectional:
            self.num_directions=1
            self.linear=nn.Linear(self.num_hiddens,self.vocab_size)
        else:
            self.num_directions=2
            self.linear=nn.Linear(self.num_hiddens*2,self.vocab_size)
    
    def forward(self,inputs,state):
        X=F.one_hot(inputs.T.long(),self.vocab_size)
        X=X.to(torch.float32)
        Y,state=self.rnn(X,state)
        output=self.linear(Y.reshape((-1,Y.shape[-1])))
        return output,state
    
    def begin_state(self,device,batch_size=1):
        if not isinstance(self.rnn,nn.LSTM):
            return torch.zeros((self.num_directions*self.rnn.num_layers,batch_size,self.num_hiddens),device=device)
        else:
            return (torch.zeros((self.num_directions*self.rnn.num_layers,batch_size,self.num_hiddens),device=device),
                    torch.zeros((self.num_directions*self.rnn.num_layers,batch_size,self.num_hiddens),device=device))

In [20]:
device='cpu'
net=RNNModule(rnn_layer,vocab_size=len(vocab))
net=net.to(device)

In [21]:
def predict_ch8(prefix,num_preds,net,vocab,device):
    state=net.begin_state(batch_size=1,device=device)
    outputs=[vocab[prefix[0]]]
    get_input=lambda : torch.tensor([outputs[-1]],device=device).reshape((1,1))
    for y in prefix[1:]:
        _,state=net(get_input(),state)
        outputs.append(vocab[y])
    for _ in range(num_preds):
        y,state=net(get_input(),state)
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])

In [22]:
predict_ch8('time traveller', 10, net, vocab, device)

'time travelleroooooooooo'

In [23]:
def grad_clipping(net,theta):
    if isinstance(net,nn.Module):
        params=[p for p in net.parameters() if p.requires_grad]
    else:
        params=net.params
    norm=torch.sqrt(sum(torch.sum((p.grad**2)) for p in params))
    if norm>theta:
        for param in params:
            param.grad[:]*=theta /norm
    

In [24]:
def train_epoch_ch8(net,train_iter,loss,updater,device,use_random_iter):
    state=None
    tr_l,tr_num=[],[]
    for X,Y in train_iter:
        if state is None or use_random_iter:
            state=net.begin_state(batch_size=X.shape[0],device=device)
        else:
            if isinstance(net,nn.Module) and not isinstance(state,tuple):
                state.detach_()
            else:
                for s in state:
                    s.detach_()
        y=Y.T.reshape(-1)
        X,y=X.to(device),y.to(device)
        y_hat,state=net(X,state)
        l=loss(y_hat,y.long()).mean()
        if isinstance(updater,torch.optim.Optimizer):
            updater.zero_grad()
            l.backward()
            grad_clipping(net,l)
            updater.step()
        else:
            l.backward()
            grad_clipping(net,l)
            updater(batch_size=1)
        tr_l.append(l*y.numel())
        tr_num.append(y.numel())
    return math.exp(sum(tr_l)/sum(tr_num))
        
            
        

In [25]:
def sgd(params,lr,batch_size):
    with torch.no_grad():
        for p in params:
            p.data -=lr*p.grad.data/batch_size
            p.grad.zero_()
    

In [26]:
def train_ch8(net,train_iter,vocab,lr,num_epochs,device,use_random_iter=False):
    loss=nn.CrossEntropyLoss()
    if isinstance(net,nn.Module):
        updater=torch.optim.SGD(net.parameters(),lr)
    else:
        updater=lambda batch_size:sgd(net.params,lr,batch_size)
    predict=lambda prefix:predict_ch8(prefix,50,net,vocab,device)
    
    for epoch in range(num_epochs):
        ppl=train_epoch_ch8(net,train_iter,loss,updater,device,use_random_iter)
        
        if(epoch+1) %10==0:
            print(predict("time traveller"))
    print(f'困惑度 {ppl:.1f}')
    print(predict('time traveller'))
    print(predict('traveller'))
    

In [42]:
num_epochs,lr=500,1
train_ch8(net,train_iter,vocab,lr,num_epochs,'cpu')

time travellere the the the the the the the the the the the the 
time traveller and he the the the the the the the the the the th
time traveller the that the this the that the that the this the 
time traveller the thing the the the the the the the the the the
time traveller anding sion and have and the the the thas ans and
time traveller simension sion the ghit the the three dimension s
time traveller the fice travely thave and have a dine simett the
time traveller the ond he but se to he wis he thate d male ande 
time traveller the e dimensions ard he the ghe the e we the ong 
time traveller the gea lo ghe thaven th yon in a mone the time t
time traveller bot some anoul thave at ond have able this be and
time travellerit fiughis to teeputhe ald he sumale ingther attee
time travellerit s alaig the firedid now yor ank the this bect a
time travellericentalive the ind in filby becall hand the mime s
time travellerit s ala lle ge at a coul one somot ichthef a dor 
time traveller hat no hea

In [27]:
class RNNModuelScatch:
    def __init__(self,vocab_size,num_hiddens,device,get_params,init_state,forward_fn):
        self.vocab_size,self.num_hiddens=vocab_size,num_hiddens
        self.params=get_params(vocab_size,num_hiddens,device)
        self.init_state,self.forward_fn=init_state,forward_fn
    
    def __call__(self,X,state):
        X=F.one_hot(X.T,self.vocab_size).type(torch.float32)
        return self.forward_fn(X,state,self.params)
    
    def begin_state(self,batch_size,device):
        return self.init_state(batch_size,self.num_hiddens,device)
    

In [28]:
def get_params(vocab_size,num_hiddens,device):
    num_inputs=num_outputs=vocab_size

    def normal(shape):
        return torch.randn(size=shape,device=device)*0.01
    
    def three():
        return (normal((num_inputs,num_hiddens)),
                normal((num_hiddens,num_hiddens)),
                torch.zeros(num_hiddens,device=device))
    
    W_xz,W_hz,b_z=three()
    W_xr,W_hr,b_r=three()
    W_xh,W_hh,b_h=three()

    W_hq=normal((num_hiddens,num_outputs))
    b_q=torch.zeros(num_outputs,device=device)

    params=[W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q]
    for p in params:
        p.requires_grad_(True)
    return params

In [29]:
def init_gru_state(batch_size,num_hiddens,device):
    return (torch.zeros((batch_size,num_hiddens),device=device),)

In [30]:
def gru(inputs,state,params):
    W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q=params
    H,=state
    outputs=[]
    for X in inputs:
        Z=torch.sigmoid((X@W_xz)+(H@W_hz)+b_z)
        R=torch.sigmoid(((X@W_xr)+(H@W_hr)+b_r))
        H_tilda=torch.tanh((X@W_xh)+((R*H)@W_hh)+b_h)
        H=Z*H+(1-Z)*H_tilda
        Y=H@W_hq+b_q
        outputs.append(Y)
    return torch.cat(outputs,dim=0),(H,)

In [37]:
vocab_size, num_hiddens, device = len(vocab), 256, 'cpu'
num_epochs, lr = 500, 1
model = RNNModuelScatch(len(vocab), num_hiddens, device, get_params,init_gru_state, gru)
train_ch8(model, train_iter, vocab, lr, num_epochs, device)

time traveller                                                  
time traveller te te te te te te te te te te te te te te te te t
time traveller the the the the the the the the the the the the t
time traveller the the the the the the the the the the the the t
time travellere the the the the the the the the the the the the 
time travellere the the the the the the the the the the the the 
time travellere the the the the the the the the the the the the 
time travellerererererererererererererererererererererererererer
time travellere the the the the the the the the the the the the 
time traveller and the the the the the the the the the the the t
time traveller and the the the the the the the the the the the t
time travellere the the the the the the the the the the the the 
time travellere the the the the the the the the the the the the 
time traveller and the the the the the the the the the the the t
time traveller the the the the the the the the the the the the t
time travellere the the t

#### lstm

In [31]:
batch_size, num_steps = 32, 35
train_iter, vocab = load_data_time_machine(batch_size, num_steps)

In [32]:
def get_lstm_params(vocab_size,num_hiddens,device):
    num_inputs=num_outputs=vocab_size

    def normal(shape):
        return torch.randn(size=shape,device=device)*0.01
    
    def three():
        return (normal((num_inputs,num_hiddens)),
                normal((num_hiddens,num_hiddens)),
                torch.zeros(num_hiddens,device=device))
    
    W_xi,W_hi,b_i=three()
    W_xf,W_hf,b_f=three()
    W_xo,W_ho,b_o=three()
    W_xc,W_hc,b_c=three()

    W_hq=normal((num_hiddens,num_outputs))
    b_q=torch.zeros(num_outputs,device=device)

    params=[W_xi,W_hi,b_i,W_xf,W_hf,b_f,W_xo,W_ho,b_o,W_xc,W_hc,b_c,W_hq,b_q]

    for p in params:
        p.requires_grad_(True)
    return params

In [33]:
def init_lstm_state(batch_size,num_hiddens,device):
    return (torch.zeros((batch_size,num_hiddens),device=device),
            torch.zeros((batch_size,num_hiddens),device=device))

In [34]:
def lstm(inputs,state,params):
    [W_xi,W_hi,b_i,W_xf,W_hf,b_f,W_xo,W_ho,b_o,W_xc,W_hc,b_c,W_hq,b_q]=params
    (H,C)=state
    outputs=[]
    for X in inputs:
        I=torch.sigmoid((X@W_xi)+(H@W_hi)+b_i)
        F=torch.sigmoid((X@W_xf)+(H@W_hf)+b_f)
        O=torch.sigmoid((X@W_xo)+(H@W_ho)+b_o)
        C_tilda=torch.tanh((X@W_xc)+(H@W_hc)+b_c)
        C=F*C+I*C_tilda
        H=O*torch.tanh(C)
        Y=(H@W_hq)+b_q
        outputs.append(Y)
    return torch.cat(outputs,dim=0),(H,C)


In [35]:
vocab_size, num_hiddens, device = len(vocab), 256, 'cpu'
num_epochs, lr = 500, 1
model = RNNModuelScatch(len(vocab), num_hiddens, device, get_lstm_params,init_lstm_state, lstm)

In [36]:
train_ch8(model, train_iter, vocab, lr, num_epochs, device)

time traveller                                                  
time traveller                                                  
time traveller  t t a t t t a t t t a t t t a t t t a t t t a t 
time traveller at at at at at at at at at at at at at at at at a
time traveller at at at at at at at at at at at at at at at at a
time traveller at an the the the the the the the the the the the
time travellere the the the the the the the the the the the the 
time travellere the the the the the the the the the the the the 
time travellere the the the the the the the the the the the the 
time traveller the the the the the the the the the the the the t
time traveller an the the the the the the the the the the the th
time traveller an the the the the the the the the the the the th
time travellerererererererererererererererererererererererererer
time traveller the the the the the the the the the the the the t
time traveller and the the the the the the the the the the the t
time traveller an a the t