In [1]:
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets

SEED = 1234

torch.manual_seed(SEED)       # seed the CPU random number generator
torch.cuda.manual_seed(SEED)  # seed the GPU random number generator
# Request deterministic cuDNN algorithms. This is a reproducibility switch,
# not a speed-up (deterministic kernels can be slower).
torch.backends.cudnn.deterministic = True

# Two Field objects describe how the raw data is preprocessed.
# TEXT: tokenized with spaCy (without `tokenize` the default is a plain
# whitespace split) and given a fixed length of 380 tokens per example.
TEXT = data.Field(tokenize='spacy',fix_length=380)
# LabelField is the Field subclass dedicated to labels; labels are stored as floats.
LABEL = data.LabelField(dtype=torch.float)

# Load the IMDB movie-review dataset (train / test).
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# Carve a validation set out of the training set (default split_ratio=0.7).
# `random.seed()` returns None, so passing its result as `random_state` only
# worked through the side effect of seeding. Seed explicitly and hand the
# resulting generator state to `split` instead.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

# Build the vocabularies from the training split only.
# TEXT keeps at most 25000 tokens and attaches the pretrained 100-dimensional
# GloVe vectors ("glove.6B.100d") for them; tokens without a pretrained vector
# are initialised with `torch.Tensor.normal_`. The vectors serve as the initial
# embedding weights and are fine-tuned on the (much smaller) review corpus.
TEXT.build_vocab(train_data, max_size=25000, vectors="glove.6B.100d", unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

print(f'Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}')
print(f'Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}')

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000
Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [2]:
BATCH_SIZE = 64

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Split every dataset into batches. BucketIterator additionally tries to put
# examples of similar length into the same batch, padding the shorter ones.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

In [3]:
# Compute the accuracy of a batch of predictions

def binary_accuracy(preds, y):
  """Return the fraction of correct binary predictions in a batch.

  Args:
    preds: raw model outputs (logits); a sigmoid is applied here.
    y: target labels, compared element-wise with the rounded predictions.

  Returns:
    A scalar tensor holding the accuracy as a ratio, i.e. 8 correct out of
    10 gives 0.8, not 8.
  """
  # sigmoid -> probability, round -> hard decision (0 or 1).
  hard_decisions = torch.sigmoid(preds).round()

  # Element-wise equality gives a mask of hits; cast to float so it can be summed.
  hits = hard_decisions.eq(y).float()

  return hits.sum() / len(hits)

In [4]:
# Evaluation pass: no optimizer is needed because no parameters are updated
def evaluate(model, iterator, criterion):
  """Measure the average loss and accuracy of `model` over `iterator`.

  No gradients are computed and no parameters are updated. The model is put
  in eval mode for the pass and switched back to training mode afterwards.

  Args:
    model: callable module mapping `batch.text` to predictions that carry a
      dimension of size 1 at position 1 (removed with `squeeze(1)`).
    iterator: iterable of batches exposing `.text` and `.label`.
    criterion: loss function called as `criterion(predictions, labels)`.

  Returns:
    Tuple `(mean_loss, mean_accuracy)` weighted by batch size.

  Raises:
    ZeroDivisionError: if `iterator` yields no examples.
  """
  loss_sum = 0
  acc_sum = 0
  n_seen = 0

  # Evaluation mode: layers such as Dropout behave deterministically.
  model.eval()

  with torch.no_grad():
    for batch in iterator:
      labels = batch.label
      n_batch = len(labels)

      logits = model(batch.text).squeeze(1)
      batch_loss = criterion(logits, labels)
      batch_acc = binary_accuracy(logits, labels)

      # Per-batch values are scaled by the batch size so that the final
      # division yields a per-example average.
      loss_sum += batch_loss.item() * n_batch
      acc_sum += batch_acc.item() * n_batch
      n_seen += n_batch

  # Hand the model back in training mode.
  model.train()

  return loss_sum / n_seen, acc_sum / n_seen

In [5]:
import time

# Report how long each epoch took
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into minutes and seconds.

    Args:
        start_time: start timestamp in seconds.
        end_time: end timestamp in seconds.

    Returns:
        Tuple `(minutes, seconds)`, both truncated to integers.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    leftover_seconds = int(total_seconds - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

### CNN model

In [30]:
def train(model, iterator, optimizer, criterion):
  """Train `model` for one pass over `iterator`, printing running metrics.

  For every batch: reset gradients, run the forward pass, compute loss and
  accuracy, back-propagate and take one optimizer step. After each batch the
  running average loss and accuracy are printed.

  Args:
    model: module mapping `batch.text` to predictions that carry a dimension
      of size 1 at position 1 (removed with `squeeze(1)` to match the labels).
    iterator: iterable of batches exposing `.text` and `.label`.
    optimizer: optimizer over the model's parameters.
    criterion: loss function called as `criterion(predictions, labels)`.

  Returns:
    Tuple `(mean_loss, mean_accuracy)` over all examples seen in the epoch.
  """
  loss_sum = 0
  acc_sum = 0
  n_seen = 0

  # Training mode, so that layers such as Dropout are active.
  model.train()

  for batch in iterator:
    labels = batch.label
    n_batch = len(labels)

    # Clear gradients accumulated by the previous step.
    optimizer.zero_grad()

    logits = model(batch.text).squeeze(1)
    loss = criterion(logits, labels)
    acc = binary_accuracy(logits, labels)

    loss.backward()   # back-propagation
    optimizer.step()  # parameter update

    # The per-batch loss and accuracy are scaled by the batch size so the
    # running sums divide back to per-example averages.
    loss_sum += loss.item() * n_batch
    acc_sum += acc.item() * n_batch
    n_seen += n_batch
    print('train loss = ',loss_sum / n_seen,'| train acc = ',acc_sum / n_seen)

  return loss_sum / n_seen, acc_sum / n_seen

In [31]:

class CNN(nn.Module):
    """Convolutional text classifier.

    Tokens are embedded, passed through one 2-D convolution per filter size
    (each spanning the full embedding width), max-pooled over the sequence,
    concatenated, regularised with dropout and fed to a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        n_filters: output channels of every convolution.
        filter_sizes: iterable of filter heights (tokens covered per filter).
        output_dim: size of the final linear output.
        dropout: dropout probability applied to the pooled features.
        pad_idx: index of the padding token in the embedding table.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes,
                 output_dim, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        branches = []
        for height in filter_sizes:
            branches.append(
                nn.Conv2d(in_channels=1, out_channels=n_filters,
                          kernel_size=(height, embedding_dim))
            )
        self.convs = nn.ModuleList(branches)

        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map token indices of shape [sent len, batch size] to [batch size, output_dim]."""
        batch_first = text.permute(1, 0)                      # [batch size, sent len]
        embedded = self.embedding(batch_first).unsqueeze(1)   # [batch size, 1, sent len, emb dim]

        pooled_features = []
        for conv in self.convs:
            # The filter spans the whole embedding width, so the last dim is 1.
            feature_map = F.relu(conv(embedded)).squeeze(3)
            # Max over every sequence position -> [batch size, n_filters].
            pooled_features.append(
                F.max_pool1d(feature_map, feature_map.shape[2]).squeeze(2)
            )

        joined = torch.cat(pooled_features, dim=1)
        return self.fc(self.dropout(joined))

In [32]:
# Parameter initialisation: fetch the pretrained vectors attached to the TEXT vocabulary.

pretrained_embeddings = TEXT.vocab.vectors
# Copying these vectors into the model and zeroing the <unk>/<pad> rows is
# done in the model-construction cell that follows; in file order no `model`
# has been created yet at this point.

In [33]:
# Hyperparameters of the CNN classifier.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5

# Vocabulary indices of the special tokens.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model = CNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    n_filters=N_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

# Start from the pretrained vectors, then zero the rows of <unk> and <pad>.
model.embedding.weight.data.copy_(pretrained_embeddings)
model.embedding.weight.data[[UNK_IDX, PAD_IDX]] = torch.zeros(EMBEDDING_DIM)

model = model.to(device)

In [34]:
import time 
# Adam with default settings, and a binary loss that takes raw logits.
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss().to(device)

N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # Wall-clock duration of training plus validation for this epoch.
    epoch_mins, epoch_secs = epoch_time(start_time, time.time())

    # Checkpoint whenever the validation loss reaches a new minimum.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'CNN-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

train loss =  0.7774416208267212 | train acc =  0.5
train loss =  0.780773788690567 | train acc =  0.4765625
train loss =  0.7702415585517883 | train acc =  0.4895833333333333
train loss =  0.7764285951852798 | train acc =  0.4765625
train loss =  0.7555935740470886 | train acc =  0.50625
train loss =  0.7458275655905405 | train acc =  0.5182291666666666
train loss =  0.740338316985539 | train acc =  0.5245535714285714
train loss =  0.7328274101018906 | train acc =  0.533203125
train loss =  0.735551291041904 | train acc =  0.5277777777777778
train loss =  0.7375329256057739 | train acc =  0.521875
train loss =  0.7431170290166681 | train acc =  0.5085227272727273
train loss =  0.7444758812586466 | train acc =  0.5065104166666666
train loss =  0.7439538607230554 | train acc =  0.5084134615384616
train loss =  0.7387319718088422 | train acc =  0.515625
train loss =  0.7342114249865214 | train acc =  0.51875
train loss =  0.7321473248302937 | train acc =  0.525390625
train loss =  0.7320

KeyboardInterrupt: 

### LSTM model

In [19]:
def train(model, iterator, optimizer, criterion):
    """Train `model` for one pass over `iterator` without per-batch logging.

    Args:
        model: module mapping `batch.text` to predictions that carry a
            dimension of size 1 at position 1 (removed with `squeeze(1)`).
        iterator: iterable of batches exposing `.text` and `.label`.
        optimizer: optimizer over the model's parameters.
        criterion: loss function called as `criterion(predictions, labels)`.

    Returns:
        Tuple `(mean_loss, mean_accuracy)` over all examples seen in the epoch.
    """
    running_loss = 0
    running_acc = 0
    n_examples = 0

    # Training mode, so that layers such as Dropout are active.
    model.train()

    for batch in iterator:
        targets = batch.label
        batch_len = len(targets)

        optimizer.zero_grad()
        outputs = model(batch.text).squeeze(1)
        loss = criterion(outputs, targets)
        acc = binary_accuracy(outputs, targets)

        loss.backward()   # back-propagation
        optimizer.step()  # parameter update

        # Weight the per-batch values by batch size before accumulating.
        running_loss += loss.item() * batch_len
        running_acc += acc.item() * batch_len
        n_examples += batch_len

    return running_loss / n_examples, running_acc / n_examples

In [20]:
from torch import nn
class LSTM(nn.Module):
    """LSTM sentence classifier: embedding -> (bi)LSTM -> dropout -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each word vector.
        hidden_dim: size of the LSTM hidden state per direction.
        output_dim: size of the final linear output.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads the sequence backwards.
        dropout: dropout probability, applied to the embeddings, between
            stacked LSTM layers and to the final hidden state.
        pad_idx: index of the padding token in the embedding table.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers, bidirectional, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(
            embedding_dim, hidden_dim, num_layers=n_layers,
            bidirectional=bidirectional,
            # Inter-layer dropout has no effect on a single-layer LSTM and
            # only triggers a warning there, so it is disabled in that case.
            dropout=dropout if n_layers > 1 else 0.0,
        )
        # The classifier input is one hidden state per direction, so its width
        # follows `bidirectional` instead of always assuming two directions.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map token indices [sent len, batch size] to logits [batch size, output_dim]."""
        embedded = self.dropout(self.embedding(text))  # [sent len, batch size, emb dim]
        _, (hidden, _) = self.rnn(embedded)
        # `hidden` stacks the final states as [n_layers * directions, batch, hidden_dim];
        # the last entries belong to the top layer.
        if self.rnn.bidirectional:
            final_state = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            final_state = hidden[-1, :, :]
        # No bare squeeze() here: it would also drop the batch dimension when
        # the batch holds a single example and break the caller's squeeze(1).
        return self.fc(self.dropout(final_state))

In [21]:
# Hyperparameters of the LSTM classifier.
INPUT_DIM = len(TEXT.vocab)  # vocabulary size (25002 in the recorded run)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

# Vocabulary index of the padding token.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = LSTM(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [22]:
# Parameter initialisation: load the pretrained vectors into the embedding layer.
pretrained_embeddings = TEXT.vocab.vectors
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data.copy_(pretrained_embeddings)

# The vocabulary also contains the special <unk> and <pad> tokens; their
# embedding rows are reset to all zeros.
model.embedding.weight.data[[UNK_IDX, PAD_IDX]] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.4765,  0.2254,  0.3035,  ..., -0.2082,  0.1948,  0.8972],
        [-0.2472, -1.1190,  0.3695,  ..., -0.5236, -1.1763,  1.4334],
        [-0.2821,  0.0417,  0.4807,  ..., -0.5425, -0.7024,  1.3024]])


In [23]:
# Move the model to the target device before building the optimizer, so the
# optimizer is constructed over the parameters that will actually be trained
# (the CNN section uses the same order).
model = model.to(device)

# Optimizer: Adam with its default hyperparameters.
optimizer = torch.optim.Adam(model.parameters())

# Loss: BCEWithLogitsLoss, a binary-classification loss that takes raw logits.
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [24]:
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # Wall-clock duration of training plus validation for this epoch.
    epoch_mins, epoch_secs = epoch_time(start_time, time.time())

    # Checkpoint whenever the validation loss reaches a new minimum.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'lstm-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

KeyboardInterrupt: 