In [190]:
import random
import time

import numpy as np
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext import data
from torchtext import datasets

In [191]:
# Load spaCy's small English pipeline; its tokenizer is used by `tokenizer` below.
spacy_en = spacy.load('en_core_web_sm')


def tokenizer(text):
    """Split ``text`` into a list of token strings with the spaCy English tokenizer."""
    pieces = []
    for token in spacy_en.tokenizer(text):
        pieces.append(token.text)
    return pieces


# Text field: tokenized with the spaCy-based `tokenizer`; include_lengths=True makes
# each batch's `.text` a (token_ids, lengths) pair.
TEXT = data.Field(tokenize=tokenizer, include_lengths=True)
# Label field: non-sequential, no unknown-token entry, stored as float32.
LABEL = data.Field(sequential=False, unk_token=None, dtype=torch.float32)

# Build the sentiment-analysis dataset
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)  # load the IMDb dataset (movie reviews)
train_data

<torchtext.datasets.imdb.IMDB at 0x1b91fa47850>

In [192]:
# Size of each split, first via len(dataset) and then via its `.examples` list
for split in (train_data, test_data):
    print(len(split))

for split in (train_data, test_data):
    print(len(split.examples))

25000
25000
25000
25000


In [193]:
print(train_data.examples[0].text)  # token list of the first review

# Token count of each of the first four reviews
for idx in range(4):
    print(len(train_data.examples[idx].text))

['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', '"', 'Teachers', '"', '.', 'My', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'Bromwell', 'High', "'s", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"', 'Teachers', '"', '.', 'The', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', 'teachers', "'", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'I', 'knew', 'and', 'their', 'students', '.', 'When', 'I', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'I', 'immediately', 'recalled', '.........', 'at', '..........', 'High', '.', 'A', 'classic', 'line', ':'

In [194]:
train_data.examples[0].label  # label of the first review; positive sentiment here

'pos'

In [195]:
# Split the training data into training and validation subsets.
# `Dataset.split` expects `random_state` to be a value returned by
# `random.getstate()`. The previous `np.random.seed(1)` returns None, so the
# seed was silently ignored and the split was not reproducible. Seed Python's
# `random` module and pass its state explicitly instead.
random.seed(1)
train_dataset, valid_dataset = train_data.split(random_state=random.getstate())

In [196]:
# Report how many examples ended up in each of the three splits.
print(f'Number of training examples: {len(train_dataset)}')
print(f'Number of validation examples: {len(valid_dataset)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [197]:
# Build the vocabularies from the training split only.
# The text vocabulary is capped at 25000 words and paired with 100-d GloVe vectors.
# NOTE(review): unk_init presumably draws a normal-distributed vector for words
# missing from GloVe — confirm against the torchtext docs.
# NOTE(review): vectors_cache is an absolute, machine-specific path; adjust it
# before running on another machine.
TEXT.build_vocab(train_dataset, max_size=25000, vectors="glove.6B.100d",
                 unk_init=torch.Tensor.normal_,
                 vectors_cache='D:/PythonCode/F_PyTorch/高阶操作及深度学习相关理论/torchtext自然语言处理/vector_cache')
LABEL.build_vocab(train_dataset)

In [198]:
# Show the 20 most frequent tokens and the label distribution of the training split.
print(TEXT.vocab.freqs.most_common(20))
print(LABEL.vocab.freqs)  # classes are roughly 1:1, so no resampling is needed to balance positives and negatives

[('the', 204174), (',', 193571), ('.', 165997), ('a', 110027), ('and', 110009), ('of', 101501), ('to', 94178), ('is', 76668), ('in', 61632), ('I', 54381), ('it', 53573), ('that', 49552), ('"', 45198), ("'s", 43656), ('this', 42324), ('-', 37404), ('/><br', 35836), ('was', 35335), ('as', 30840), ('with', 30243)]
Counter({'pos': 8793, 'neg': 8707})


In [199]:
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, all sharing the same batch size and device.
# NOTE(review): shuffle=False is passed for all three splits, including the
# training iterator — confirm that a fixed batch order during training is intended.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_dataset, valid_dataset, test_data),
    batch_size=BATCH_SIZE,
    device=device,
    shuffle=False,  # shuffling can be skipped here
    sort_within_batch=True)  # IMDB provides a built-in sort_key

In [200]:
# Pull one validation batch and inspect its tensors.
batch = next(iter(valid_iterator))
print(batch)

print(batch.text[0].shape, '\n', batch.text[0])  # sequence length 49, batch size 64; index 1 is padding
print(batch.text[1].shape, '\n', batch.text[1])  # length of each sequence in the batch

print(batch.label.shape, '\n', batch.label)


[torchtext.data.batch.Batch of size 64]
	[.text]:('[torch.cuda.LongTensor of size 49x64 (GPU 0)]', '[torch.cuda.LongTensor of size 64 (GPU 0)]')
	[.label]:[torch.cuda.FloatTensor of size 64 (GPU 0)]
torch.Size([49, 64]) 
 tensor([[   0, 1561,    0,  ...,    0,   66, 3033],
        [ 382, 8684,   45,  ...,   16,    9,    3],
        [  42, 2504, 2497,  ...,   23,    2,  395],
        ...,
        [   8,   83,   40,  ...,    1,    1,    1],
        [ 128,   68,   40,  ...,    1,    1,    1],
        [   4,    4,   40,  ...,    1,    1,    1]], device='cuda:0')
torch.Size([64]) 
 tensor([49, 49, 49, 48, 48, 48, 48, 48, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46,
        46, 46, 46, 46, 46, 45, 45, 44, 44, 43, 43, 43, 42, 42, 42, 42, 42, 42,
        41, 41, 41, 41, 41, 40, 40, 40, 38, 38, 38, 37, 37, 36, 35, 35, 35, 34,
        34, 33, 33, 31, 30, 29, 24, 24, 20, 18], device='cuda:0')
torch.Size([64]) 
 tensor([0., 1., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1.,
        0.

In [201]:
# Model and optimizer hyperparameters.
pretrained_embeddings = TEXT.vocab.vectors  # pretrained word vectors
# Vocabulary size and embedding width are taken from the pretrained matrix.
# ("vocal_size" is the spelling used by the BRNN keyword argument.)
vocal_size, embedding_size = pretrained_embeddings.shape
hidden_size = 256
dropout = 0.5
bidirectional = True
out_size = 1
num_layers = 2
lr = 0.001  # learning rate
weight_decay = 1e-5
print(vocal_size, embedding_size)

25002 100


In [202]:
from BRNNModel import BRNN
% run BRNNModel.py

net = BRNN(vocal_size=vocal_size,
           embedding_size=embedding_size,
           hidden_size=hidden_size,
           num_layers=num_layers,
           dropout=dropout,
           bidirectional=True,
           out_size=out_size)

In [203]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Only parameters whose ``requires_grad`` flag is truthy are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(net):,} trainable parameters')

The model has 4,810,857 trainable parameters


In [204]:
# Overwrite the embedding layer's weights in place with the pretrained vectors.
net.embed.weight.data.copy_(pretrained_embeddings)  # alternatively, use the from_pretrained method

tensor([[ 1.1846, -0.9289,  1.1936,  ...,  0.4853,  1.7517, -0.6086],
        [ 0.1281, -0.0542,  0.6163,  ..., -0.9423,  1.7000, -0.6532],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.0700,  0.6116,  0.2543,  ..., -0.4421,  2.2551, -0.4453],
        [-0.8700, -0.9032,  0.3782,  ...,  0.4581,  1.0631, -0.1945],
        [-0.1995,  0.6562, -0.8827,  ..., -0.0890, -0.6464,  0.0147]])

In [205]:
# Inspect the special tokens, their vocabulary indices and their current vectors.
print(TEXT.unk_token)
print(TEXT.pad_token)
print(TEXT.vocab.stoi['<unk>'])
print(TEXT.vocab.stoi['<pad>'])
print(TEXT.vocab.vectors[TEXT.vocab.stoi[TEXT.unk_token]])  # pretrained vector for unk (not set to zero)
print(TEXT.vocab.vectors[TEXT.vocab.stoi[TEXT.pad_token]])  # pretrained vector for pad

<unk>
<pad>
0
1
tensor([ 1.1846e+00, -9.2890e-01,  1.1936e+00, -1.4279e+00,  9.1318e-01,
         2.0662e+00,  2.3511e+00,  1.1339e+00, -8.3455e-03, -5.0807e-01,
        -1.5593e-01,  4.7428e-01,  4.1383e-01, -1.7115e+00, -2.0870e+00,
         1.2422e+00, -2.4315e-01,  1.9318e+00,  4.5387e-01,  4.9044e-01,
         1.3784e+00, -3.0550e-01, -6.7882e-01,  1.7509e-03,  3.9291e-01,
        -1.8198e+00, -6.2958e-01,  3.9505e-01, -3.0487e-01, -1.5252e+00,
        -1.4177e+00,  4.7480e-01, -1.6798e-01,  4.6515e-01,  5.0571e-01,
        -9.1175e-02,  3.0353e-01,  8.2069e-01,  7.0419e-01,  1.4249e+00,
         4.4996e-02,  9.8268e-01, -8.8870e-01, -1.5350e+00,  5.9949e-01,
        -3.7801e-01, -1.8546e+00, -7.2452e-02,  1.2489e+00, -8.8147e-01,
        -1.1103e+00,  3.2287e+00,  4.3954e-01, -1.8650e-01, -3.2149e-01,
        -2.2880e-01, -6.5669e-02, -1.6626e-01,  8.6614e-01,  4.8681e-01,
        -1.8137e-03, -1.8987e+00,  1.9274e+00,  1.2491e+00,  1.0557e+00,
         1.9068e+00, -1.2057e-01,  

In [206]:
# Zero the embedding rows for the unknown and padding tokens.
UNK_INDEX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_INDEX = TEXT.vocab.stoi[TEXT.pad_token]
net.embed.weight.data[UNK_INDEX] = torch.zeros(embedding_size)
net.embed.weight.data[PAD_INDEX] = torch.zeros(embedding_size)
print(net.embed.weight)  # the first two rows are now zero vectors

Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.0700,  0.6116,  0.2543,  ..., -0.4421,  2.2551, -0.4453],
        [-0.8700, -0.9032,  0.3782,  ...,  0.4581,  1.0631, -0.1945],
        [-0.1995,  0.6562, -0.8827,  ..., -0.0890, -0.6464,  0.0147]],
       requires_grad=True)


In [207]:
# Adam over all model parameters, with the learning rate and weight decay set above.
optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)
criterion = nn.BCEWithLogitsLoss()  # loss function for binary classification

In [208]:
from Senti_train import Train_Evaluate
% run Senti_train.py
trainer = Train_Evaluate(model=net, optimizer=optimizer, criterion=criterion, device=device)

In [209]:
def epoch_time(start, end):
    """Convert a start/end timestamp pair into whole minutes and leftover seconds.

    Both components are truncated with ``int``; returns ``(minutes, seconds)``.
    """
    duration = end - start
    minutes = int(duration / 60)
    seconds = int(duration - minutes * 60)
    return minutes, seconds

In [210]:
# Train for a fixed number of epochs, checkpointing whenever validation loss improves.
NUM_EPOCHS = 5
best_valid_loss = float('inf')  # lowest validation loss seen so far

for epoch in range(NUM_EPOCHS):
    # Time one full pass: training followed by validation.
    start_time = time.time()
    train_loss, train_acc = trainer.train(train_iterator)
    valid_loss, valid_acc = trainer.evaluate(valid_iterator)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Save only on improvement, so the file on disk holds the best epoch's weights.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(trainer.model.state_dict(), 'Senti_model.pth')

    print(f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%')

Epoch: 01 | Epoch Time: 0m 56s
	Train Loss: 0.677 | Train Acc: 57.46%
	 Val. Loss: 0.680 |  Val. Acc: 55.09%
Epoch: 02 | Epoch Time: 0m 55s
	Train Loss: 0.656 | Train Acc: 61.17%
	 Val. Loss: 0.681 |  Val. Acc: 54.22%
Epoch: 03 | Epoch Time: 0m 54s
	Train Loss: 0.589 | Train Acc: 68.69%
	 Val. Loss: 0.507 |  Val. Acc: 76.89%
Epoch: 04 | Epoch Time: 0m 54s
	Train Loss: 0.528 | Train Acc: 73.71%
	 Val. Loss: 0.376 |  Val. Acc: 83.85%
Epoch: 05 | Epoch Time: 0m 54s
	Train Loss: 0.376 | Train Acc: 83.55%
	 Val. Loss: 0.320 |  Val. Acc: 86.21%


In [211]:
# Rebuild the same architecture and restore the checkpoint saved during training.
best_model = BRNN(vocal_size=vocal_size,
                  embedding_size=embedding_size,
                  hidden_size=hidden_size,
                  num_layers=num_layers,
                  dropout=dropout,
                  bidirectional=True,
                  out_size=out_size)
best_model.load_state_dict(torch.load('Senti_model.pth'))  # load the saved weights
best_model

BRNN(
  (embed): Embedding(25002, 100)
  (rnn): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (linear): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [214]:
# Evaluate the restored model on the held-out test split (no optimizer is passed here).
test_loss, test_acc = Train_Evaluate(model=best_model, criterion=criterion, device=device).evaluate(test_iterator)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

Test Loss: 0.330 | Test Acc: 85.58%


In [216]:
# spaCy English pipeline used for tokenizing sentences at prediction time.
# NOTE(review): this loads the same 'en_core_web_sm' pipeline as `spacy_en` above.
nlp_en = spacy.load('en_core_web_sm')
nlp_en

<spacy.lang.en.English at 0x1b8a3217df0>

In [234]:
LABEL.vocab.stoi  # pos is encoded as 0, neg as 1

defaultdict(None, {'pos': 0, 'neg': 1})

In [236]:
def predict_sentiment(model, sentence):
    """Score a raw sentence with a trained sentiment model.

    The sentence is tokenized with the module-level ``nlp_en`` pipeline and
    mapped to vocabulary indices through ``TEXT.vocab.stoi``. It is then fed to
    ``model`` as a single-example batch together with its length.

    Args:
        model: Model called as ``model(token_ids, lengths)``; switched to eval
            mode before the forward pass.
        sentence: Raw text to classify.

    Returns:
        float: Sigmoid of the model output. With the label vocabulary shown in
        this notebook (pos -> 0, neg -> 1), values near 1 indicate a negative
        review and values near 0 a positive one.

    Raises:
        ValueError: If ``sentence`` produces no tokens.
    """
    model.eval()
    tokenized = [tok.text for tok in nlp_en.tokenizer(sentence)]
    if not tokenized:
        # A zero-length sequence cannot be scored; fail with a clear message
        # instead of an opaque error from inside the model.
        raise ValueError("sentence must contain at least one token")
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # Column vector of token ids: shape (seq_len, 1), i.e. a batch of one.
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)
    # The length tensor is created without moving it to `device`.
    length_tensor = torch.LongTensor([len(indexed)])
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [237]:
# Example: a strongly negative phrase should score close to 1.
predict_sentiment(best_model, "fuck, garbage")

0.9640713930130005

In [238]:
predict_sentiment(best_model, "This film is terrible")  # the closer to 1, the more negative the review

0.994071900844574

In [232]:
predict_sentiment(best_model, "This film is great")  # the closer to 0, the more positive the review

tensor([[ 66],
        [ 24],
        [  9],
        [103]], device='cuda:0')


0.053380392491817474