In [5]:
import torchtext
import torch
import torch.nn as nn
import numpy as np
import random

In [6]:
torch.cuda.get_device_name(None)

'Tesla P4'

## 数据处理部分

In [None]:
def set_seed(seed):
    """随机数种子设置"""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed) # CPU上设置随机种子
    torch.cuda.manual_seed(seed) # 当前GPU上设置随机种子

set_seed(0)

In [8]:
MAX_VOCAB_SIZE = 10000
TEXT = torchtext.data.Field(lower=True)
train, val, test = torchtext.datasets.LanguageModelingDataset.\
    splits(path='.', train='text8.train.txt', validation='text8.dev.txt',
           test='text8.test.txt', text_field=TEXT)
TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)

In [9]:
VOCAB_SIZE = len(TEXT.vocab) # 单词数目
print("vocabulary size: {}".format(VOCAB_SIZE))

vocabulary size: 10002


In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 32
train_iter = torchtext.data.BPTTIterator(train, batch_size=BATCH_SIZE, device=device, bptt_len=30, shuffle=True)
val_iter = torchtext.data.BPTTIterator(val, batch_size=BATCH_SIZE, device=device, bptt_len=30, shuffle=False)
test_iter = torchtext.data.BPTTIterator(test, batch_size=BATCH_SIZE, device=device,bptt_len=30, shuffle=False)

In [11]:
for i in train_iter:
    print(i.text.shape, '\n', i.text)
    print(i.target.shape, '\n', i.target)
    break

torch.Size([30, 32]) 
 tensor([[5269, 6271,  417,    9,    6,  375,  317, 2278,    6,   21,   72,   54,
          742,    2, 4434,  283,   23,  531,    0,    5,  463, 5850,   22, 8624,
         1455,   68,   11,   66,    2, 5931,    3,    0],
        [3110,    6,  288,    2, 3047,    2,   25,  109,  261,   50, 6129,  892,
            7,    0,   25,    0,   18,    5,  556,   10,    7, 4664,    5,   43,
          163,    5,    9,    2, 1311,   57,  168,    6],
        [  13, 3593,  458, 1259,   40,  375,   10,  550,    3,    0,   21,    0,
            0,    3,    2,    7, 2316,   10,  427,    5, 1185,  127,   48,  504,
         2461,    0,    9,  277,    3,   12,    0,  314],
        [   7,    4,    0,    0,   55,   19,   11,    4, 3278, 4858,  176,  119,
          340,    0, 8644,  381,    0,    5,  882,   18, 8991,  416,   49,   27,
            8, 1435,   18,    6,    2,   95,  497, 1853],
        [ 196,  105, 3693, 1416,  289,   78,   17,    0, 1180,    3,  130,    5,
            3,  

## 模型部分

In [14]:
from RNNModel import RNNModel
%run RNNModel.py

EMBEDDING_SIZE = 650
HIDDEN_SIZE = 100
model = RNNModel("LSTM", VOCAB_SIZE, EMBEDDING_SIZE, HIDDEN_SIZE, 2).to(device)
model

RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (embed): Embedding(10002, 650)
  (rnn): LSTM(650, 100, num_layers=2, dropout=0.5)
  (linear): Linear(in_features=100, out_features=10002, bias=True)
)

In [16]:
loss_fn = nn.CrossEntropyLoss() # 损失函数
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)

In [17]:
from RNN_train import RNN_train
%run RNN_train.py

CRAD_CLIP = 1
NUM_EPOCHS = 2

net = RNN_train(model=model, optimizer=optimizer, criterion=loss_fn, device=device, batch_size=BATCH_SIZE)
for epoch in range(NUM_EPOCHS):
    train_loss = net.train(train_iter, CRAD_CLIP)
    valid_loss = net.evaluate(val_iter)

torch.save(net.model.state_dict(), 'net.pth')

train loss: tensor(9.2191, device='cuda:0', grad_fn=<NllLossBackward>)
train loss: tensor(6.1607, device='cuda:0', grad_fn=<NllLossBackward>)
train loss: tensor(5.6713, device='cuda:0', grad_fn=<NllLossBackward>)
train loss: tensor(6.0158, device='cuda:0', grad_fn=<NllLossBackward>)
train loss: tensor(5.5789, device='cuda:0', grad_fn=<NllLossBackward>)
train loss: tensor(5.6051, device='cuda:0', grad_fn=<NllLossBackward>)
train loss: tensor(5.5207, device='cuda:0', grad_fn=<NllLossBackward>)
train loss: tensor(5.4126, device='cuda:0', grad_fn=<NllLossBackward>)
train loss: tensor(5.4051, device='cuda:0', grad_fn=<NllLossBackward>)
train loss: tensor(5.3712, device='cuda:0', grad_fn=<NllLossBackward>)
train loss: tensor(5.3398, device='cuda:0', grad_fn=<NllLossBackward>)
train loss: tensor(5.4990, device='cuda:0', grad_fn=<NllLossBackward>)
train loss: tensor(5.3214, device='cuda:0', grad_fn=<NllLossBackward>)
train loss: tensor(5.1796, device='cuda:0', grad_fn=<NllLossBackward>)
train 

In [18]:
best_model = RNNModel("LSTM", VOCAB_SIZE, EMBEDDING_SIZE, HIDDEN_SIZE, 2).to(device) # 读入保存好的模型
best_model.load_state_dict(torch.load("net.pth"))
best_model

RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (embed): Embedding(10002, 650)
  (rnn): LSTM(650, 100, num_layers=2, dropout=0.5)
  (linear): Linear(in_features=100, out_features=10002, bias=True)
)

In [19]:
test_loss = RNN_train(model, optimizer, loss_fn, device, BATCH_SIZE).evaluate(test_iter) # 测试数据集的损失
print("perplexity: ", np.exp(test_loss)) # 测试数据的困惑度

valid loss: tensor(4.6224, device='cuda:0')
perplexity:  137.98903409089112


In [20]:
test_net = RNN_train(model, optimizer, loss_fn, device, 1)
hidden = test_net.init_hidden()
input_int = torch.randint(VOCAB_SIZE, (1, 1), dtype=torch.long).to(device) # 随机生成一个整数

words = []
for i in range(100):
     output, hidden = best_model(input_int, hidden)
     word_weights = output.squeeze().exp() # 函数.exp(是为了防止出现负值)
     word_idx = torch.multinomial(word_weights, 1)[0] # 使用多项式概率(权重为word_weights)随机抽样出一个单词
     input_int.fill_(word_idx) # 用来预测下一个单词
     word = TEXT.vocab.itos[word_idx]
     words.append(word)
print(" ".join(words)) # (LSTM)语言模型生成的文本

disagree <unk> at <unk> it started two zero zero many te attend half one zero eight five martin five in david election incorporated on under <unk> and i of deities used similar to lowered becomes supported by other the homosexual body only an pure encryption manual just of regular sources recent intelligence standard and humanity social channel one one which goes a whole but of one groups them with the lowest <unk> this in <unk> to experience what the c one and the man first vary in the so were sonata that also allow the applications of <unk> and for
