In [1]:
import random
import time

import numpy as np
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data
from torchtext.legacy import datasets

In [2]:
spacy_en = spacy.load('en_core_web_sm')


def tokenizer(text):
    """Split ``text`` into a list of token strings with the spaCy English tokenizer."""
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens


# include_lengths=True: batch.text is later unpacked as a (token ids, lengths) pair
TEXT = data.Field(tokenize=tokenizer, include_lengths=True)
# Non-sequential label field without an <unk> token, stored as float32
LABEL = data.Field(sequential=False, unk_token=None, dtype=torch.float32)

# Build the sentiment-analysis dataset
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)  # load the IMDb dataset (movie reviews)
train_data

<torchtext.legacy.datasets.imdb.IMDB at 0x2b0e1c9af10>

In [3]:
# 数据集的长度
# Sizes of both splits, first via len(dataset), then via len(dataset.examples)
for size in (len(train_data), len(test_data)):
    print(size)

for size in (len(train_data.examples), len(test_data.examples)):
    print(size)

25000
25000
25000
25000


In [4]:

# Token list of the first training example
train_data.examples[0].text

['Bromwell',
 'High',
 'is',
 'a',
 'cartoon',
 'comedy',
 '.',
 'It',
 'ran',
 'at',
 'the',
 'same',
 'time',
 'as',
 'some',
 'other',
 'programs',
 'about',
 'school',
 'life',
 ',',
 'such',
 'as',
 '"',
 'Teachers',
 '"',
 '.',
 'My',
 '35',
 'years',
 'in',
 'the',
 'teaching',
 'profession',
 'lead',
 'me',
 'to',
 'believe',
 'that',
 'Bromwell',
 'High',
 "'s",
 'satire',
 'is',
 'much',
 'closer',
 'to',
 'reality',
 'than',
 'is',
 '"',
 'Teachers',
 '"',
 '.',
 'The',
 'scramble',
 'to',
 'survive',
 'financially',
 ',',
 'the',
 'insightful',
 'students',
 'who',
 'can',
 'see',
 'right',
 'through',
 'their',
 'pathetic',
 'teachers',
 "'",
 'pomp',
 ',',
 'the',
 'pettiness',
 'of',
 'the',
 'whole',
 'situation',
 ',',
 'all',
 'remind',
 'me',
 'of',
 'the',
 'schools',
 'I',
 'knew',
 'and',
 'their',
 'students',
 '.',
 'When',
 'I',
 'saw',
 'the',
 'episode',
 'in',
 'which',
 'a',
 'student',
 'repeatedly',
 'tried',
 'to',
 'burn',
 'down',
 'the',
 'school',
 '

In [5]:
# Raw label of the first training example (a string, e.g. 'pos')
train_data.examples[0].label

'pos'

In [6]:
# Display the dataset object (same repr as shown when the dataset was loaded)
train_data

<torchtext.legacy.datasets.imdb.IMDB at 0x2b0e1c9af10>

In [7]:
# Tokens of the first example
print(train_data.examples[0].text)

# Token count of each of the first four examples
for example in train_data.examples[:4]:
    print(len(example.text))

['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', '"', 'Teachers', '"', '.', 'My', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'Bromwell', 'High', "'s", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"', 'Teachers', '"', '.', 'The', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', 'teachers', "'", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'I', 'knew', 'and', 'their', 'students', '.', 'When', 'I', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'I', 'immediately', 'recalled', '.........', 'at', '..........', 'High', '.', 'A', 'classic', 'line', ':'

In [8]:
train_data.examples[0].label  # label of an example; this one is a positive sentiment

'pos'

In [9]:
# Split the training data into train / validation subsets.
# np.random.seed() returns None, so passing its result as random_state left the
# split unseeded. torchtext expects a state from random.getstate(), so seed
# Python's `random` and pass that state. NumPy is still seeded as before.
np.random.seed(1)
random.seed(1)
train_dataset, valid_dataset = train_data.split(random_state=random.getstate())

In [10]:
# Report the size of each split, one line per split
print(f'Number of training examples: {len(train_dataset)}',
      f'Number of validation examples: {len(valid_dataset)}',
      f'Number of testing examples: {len(test_data)}',
      sep='\n')

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [11]:
# 建立词表
TEXT.build_vocab(train_dataset, max_size=25000, vectors="glove.6B.100d",
                 unk_init=torch.Tensor.normal_,
                 vectors_cache=r'C:\Users\duanm\Music\GitHubProjects\MLNote\E_PyTorch\高阶操作及深度学习相关理论\torchtext自然语言处理\.vector_cache')
LABEL.build_vocab(train_dataset)

In [12]:
# Token -> index mapping of the text vocabulary
TEXT.vocab.stoi


defaultdict(<bound method Vocab._default_unk_index of <torchtext.legacy.vocab.Vocab object at 0x000002B0F3935DF0>>,
            {'<unk>': 0,
             '<pad>': 1,
             'the': 2,
             ',': 3,
             '.': 4,
             'and': 5,
             'a': 6,
             'of': 7,
             'to': 8,
             'is': 9,
             'in': 10,
             'I': 11,
             'it': 12,
             'that': 13,
             '"': 14,
             "'s": 15,
             'this': 16,
             '-': 17,
             '/><br': 18,
             'was': 19,
             'as': 20,
             'movie': 21,
             'with': 22,
             'for': 23,
             'film': 24,
             'The': 25,
             'but': 26,
             '(': 27,
             'on': 28,
             "n't": 29,
             ')': 30,
             'you': 31,
             'are': 32,
             'not': 33,
             'have': 34,
             'his': 35,
             'be': 36,
             'he':

In [13]:
# Shape of the pretrained vector matrix: (vocabulary size, embedding size)
TEXT.vocab.vectors.shape

torch.Size([25002, 100])

In [14]:
# 20 most frequent tokens in the training split
print(TEXT.vocab.freqs.most_common(20))
print(LABEL.vocab.freqs)  # the classes are roughly 1:1, so no resampling is needed to balance positive and negative examples

[('the', 202571), (',', 193209), ('.', 165915), ('and', 109443), ('a', 109043), ('of', 100243), ('to', 93794), ('is', 76547), ('in', 61246), ('I', 54168), ('it', 53524), ('that', 49024), ('"', 44294), ("'s", 43138), ('this', 42199), ('-', 37350), ('/><br', 35732), ('was', 34985), ('as', 30292), ('movie', 29876)]
Counter({'neg': 8767, 'pos': 8733})


In [15]:
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, all sharing the same batch size and device
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_dataset, valid_dataset, test_data),
    batch_size=BATCH_SIZE,
    device=device,
    shuffle=False,  # shuffling can be skipped here
    sort_within_batch=True)  # IMDB ships with a built-in sort_key

In [16]:
# Take the first validation batch for inspection
batch = next(iter(valid_iterator))
# print(batch)

print(batch.text[0].shape, '\n', batch.text[0])  # sequence length 49, batch size 64; index 1 is padding
print(batch.text[1].shape, '\n', batch.text[1])  # length of each example in the batch

# print(batch.label.shape, '\n', batch.label)


torch.Size([49, 64]) 
 tensor([[   11,  6147,    11,  ...,  3800,    25, 11897],
        [  213,     2,   239,  ...,   435,   120,    25],
        [   16,  6902,    16,  ...,    23,    32,     0],
        ...,
        [19372,   626,     9,  ...,     1,     1,     1],
        [   14,   693,   103,  ...,     1,     1,     1],
        [    4,     4,    41,  ...,     1,     1,     1]], device='cuda:0')
torch.Size([64]) 
 tensor([49, 49, 49, 49, 49, 48, 48, 48, 48, 48, 48, 48, 48, 48, 47, 47, 47, 47,
        47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 45, 44, 44, 44, 43, 43, 43,
        42, 42, 42, 42, 41, 40, 40, 40, 40, 40, 39, 39, 38, 38, 38, 38, 38, 36,
        35, 34, 33, 33, 33, 31, 29, 24, 22, 13], device='cuda:0')


In [17]:
# Show the padded token-tensor shape of every validation batch
for valid_batch in valid_iterator:
    token_ids = valid_batch.text[0]
    print(token_ids.shape)


torch.Size([49, 64])
torch.Size([56, 64])
torch.Size([61, 64])
torch.Size([67, 64])
torch.Size([72, 64])
torch.Size([77, 64])
torch.Size([82, 64])
torch.Size([86, 64])
torch.Size([91, 64])
torch.Size([96, 64])
torch.Size([102, 64])
torch.Size([108, 64])
torch.Size([113, 64])
torch.Size([118, 64])
torch.Size([120, 64])
torch.Size([124, 64])
torch.Size([127, 64])
torch.Size([129, 64])
torch.Size([131, 64])
torch.Size([133, 64])
torch.Size([135, 64])
torch.Size([137, 64])
torch.Size([138, 64])
torch.Size([140, 64])
torch.Size([141, 64])
torch.Size([143, 64])
torch.Size([144, 64])
torch.Size([146, 64])
torch.Size([147, 64])
torch.Size([148, 64])
torch.Size([150, 64])
torch.Size([152, 64])
torch.Size([153, 64])
torch.Size([155, 64])
torch.Size([156, 64])
torch.Size([158, 64])
torch.Size([160, 64])
torch.Size([161, 64])
torch.Size([163, 64])
torch.Size([164, 64])
torch.Size([166, 64])
torch.Size([168, 64])
torch.Size([170, 64])
torch.Size([171, 64])
torch.Size([173, 64])
torch.Size([175, 64]

In [18]:
# Model and optimisation hyperparameters
pretrained_embeddings = TEXT.vocab.vectors  # pretrained word vectors
vocal_size, embedding_size = pretrained_embeddings.shape  # rows = vocabulary entries, columns = vector size
hidden_size = 256
dropout = 0.5
bidirectional = True
out_size = 1
num_layers = 2
lr = 0.001  # learning rate
weight_decay = 1e-5
print(vocal_size, embedding_size)

25002 100


In [19]:
# Peek at the first training batch only, then stop iterating
for batch in train_iterator:
    text, text_lengths = batch.text  # token ids and per-example lengths
    label = batch.label
    print(text[-4:, :])  # last four rows of the token-id tensor
    print(text_lengths)
    print(label)
    break

tensor([[ 5730,    22,     7,    40,    40,   783,   309,    16,  2338,     7,
          4777,     0,  3725,  1752, 12687, 22044,     0,     4,     4,    41,
             4,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1],
        [    0,    16,    16,     6,  1380,    21,   179,    21,   254,   314,
             4,     4,     4,    41,     4,     4,    41,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1

In [20]:
from BRNNModel import BRNN
%run BRNNModel.py

net = BRNN(vocal_size=vocal_size,
           embedding_size=embedding_size,
           hidden_size=hidden_size,
           num_layers=num_layers,
           dropout=dropout,
           bidirectional=True,
           out_size=out_size)

In [21]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


# Report the trainable-parameter count with thousands separators
print(f'The model has {count_parameters(net):,} trainable parameters')

The model has 4,810,857 trainable parameters


In [22]:
# Copy the pretrained vectors into the embedding layer's weights in place
net.embed.weight.data.copy_(pretrained_embeddings)  # the from_pretrained method could be used instead

tensor([[-0.4704, -0.4168,  0.0804,  ...,  0.5253, -1.4153,  0.6527],
        [ 0.5952, -1.7709,  1.3419,  ..., -1.2439, -0.9224,  0.4309],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1775,  0.6091,  0.3570,  ..., -0.0507,  0.1808,  0.4038],
        [ 0.2581,  0.4548, -0.6745,  ...,  0.1374, -0.2201,  0.7585],
        [-0.1438, -0.3202,  0.4899,  ..., -0.5105, -0.1756,  0.2217]])

In [23]:
# Special tokens, their vocabulary indices and their pretrained vectors
print(TEXT.unk_token)
print(TEXT.pad_token)
print(TEXT.vocab.stoi['<unk>'])
print(TEXT.vocab.stoi['<pad>'])
print(TEXT.vocab.vectors[TEXT.vocab.stoi[TEXT.unk_token]])  # pretrained vector of <unk> (not set to zero in the pretrained matrix)
print(TEXT.vocab.vectors[TEXT.vocab.stoi[TEXT.pad_token]])  # pretrained vector of <pad>

<unk>
<pad>
0
1
tensor([-0.4704, -0.4168,  0.0804, -0.2398, -0.9571,  1.3535,  0.7417,  0.2002,
         0.9520, -1.7906,  1.7478, -0.3042,  0.4464,  0.1124,  1.6594,  0.1385,
         0.8991,  0.2505, -0.1612, -1.0242,  0.5758,  0.8801, -0.6415, -0.7938,
         0.5940,  1.0979, -0.2262, -0.1885, -0.9740,  2.3827,  1.1368, -0.7608,
         0.5174,  0.2106, -0.9644,  0.5810, -0.3412, -0.5013, -1.0705, -0.4449,
        -0.9910, -0.9872, -0.0850, -2.4506, -0.1743,  1.2091, -1.2388,  0.3289,
         0.5226,  1.4376, -2.3316, -0.4426, -0.3382, -0.0477,  1.3652,  0.6709,
         0.2234, -0.2513, -1.6245, -2.0295, -0.1263,  0.5826,  1.1050,  1.1442,
        -0.7573, -0.3753, -0.3928,  0.5462, -2.3406,  0.0702, -1.2683, -0.4450,
         2.1960, -0.9264, -1.0451,  0.6124, -1.1049, -0.5299, -0.6392, -1.3246,
        -0.0599,  1.8333,  0.3815, -1.6780, -0.7285,  0.2331,  1.3732, -0.7015,
        -1.2674,  0.6737,  0.7955, -0.2906, -1.5995, -0.5399,  0.5840,  1.1428,
         0.5917,  0.5253

In [24]:
UNK_INDEX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_INDEX = TEXT.vocab.stoi[TEXT.pad_token]
# Overwrite the <unk> and <pad> rows of the embedding matrix with zero vectors
net.embed.weight.data[UNK_INDEX] = torch.zeros(embedding_size)
net.embed.weight.data[PAD_INDEX] = torch.zeros(embedding_size)
print(net.embed.weight)  # the first two rows are now zero vectors

Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1775,  0.6091,  0.3570,  ..., -0.0507,  0.1808,  0.4038],
        [ 0.2581,  0.4548, -0.6745,  ...,  0.1374, -0.2201,  0.7585],
        [-0.1438, -0.3202,  0.4899,  ..., -0.5105, -0.1756,  0.2217]],
       requires_grad=True)


In [25]:
# Adam over all model parameters, using the learning rate and weight decay set above
optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)
criterion = nn.BCEWithLogitsLoss()  # loss function for binary classification

In [26]:
from Senti_train import Train_Evaluate
% run Senti_train.py
trainer = Train_Evaluate(model=net, optimizer=optimizer, criterion=criterion, device=device)

UsageError: Line magic function `%` not found.


In [None]:
def epoch_time(start, end):
    """Convert the interval between two timestamps (in seconds) to whole minutes and seconds.

    Returns a ``(minutes, seconds)`` tuple of ints; fractional seconds are truncated.
    """
    total_seconds = end - start
    minutes = int(total_seconds / 60)
    seconds = int(total_seconds - minutes * 60)
    return minutes, seconds

In [None]:
NUM_EPOCHS = 5
best_valid_loss = float('inf')  # lowest validation loss seen so far

for epoch in range(NUM_EPOCHS):
    # Time one full pass: training followed by validation
    start_time = time.time()
    train_loss, train_acc = trainer.train(train_iterator)
    valid_loss, valid_acc = trainer.evaluate(valid_iterator)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(trainer.model.state_dict(), 'Senti_model.pth')

    print(f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%')

In [None]:
# Rebuild the same architecture, then restore the best checkpoint saved during training.
# `bidirectional` comes from the config variable so it always matches the trained network.
best_model = BRNN(vocal_size=vocal_size,
                  embedding_size=embedding_size,
                  hidden_size=hidden_size,
                  num_layers=num_layers,
                  dropout=dropout,
                  bidirectional=bidirectional,
                  out_size=out_size)
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine as well
best_model.load_state_dict(torch.load('Senti_model.pth', map_location=device))  # load the model
# Make sure the restored model lives on the same device as the input batches
best_model.to(device)
best_model

In [None]:
# Evaluate the restored model on the held-out test split.
# NOTE(review): `optimizer` is not passed here, unlike when `trainer` was created —
# confirm that Train_Evaluate accepts a missing optimizer.
test_loss, test_acc = Train_Evaluate(model=best_model, criterion=criterion, device=device).evaluate(test_iterator)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

In [None]:
# Reuse the spaCy pipeline already loaded as `spacy_en` rather than loading the
# same 'en_core_web_sm' model a second time. `nlp_en` is kept as an alias.
nlp_en = spacy_en
nlp_en

In [None]:
# Original note: pos is encoded as 0 and neg as 1.
# NOTE(review): confirm against this cell's output — the label counts printed
# earlier show 'neg' slightly more frequent than 'pos', and the index order may
# depend on label frequency in the training split.
LABEL.vocab.stoi

In [None]:
def predict_sentiment(model, sentence):
    """Score the sentiment of a single sentence.

    The sentence is tokenized with the same ``tokenizer`` used to build the
    dataset, mapped to vocabulary indices and fed to ``model`` as a batch of one.

    Args:
        model: network called as ``model(token_ids, lengths)``.
        sentence: raw text to classify.

    Returns:
        float: sigmoid of the model output, a value in [0, 1]. Presumably the
        probability of the class encoded as 1 in ``LABEL.vocab.stoi`` — verify
        against the label mapping.

    Raises:
        ValueError: if ``sentence`` yields no tokens.
    """
    model.eval()
    tokens = tokenizer(sentence)
    if not tokens:
        # Fail early with a clear message instead of passing an empty sequence to the model
        raise ValueError("sentence must contain at least one token")

    # Put the input on the model's own device; fall back to the global device
    # for a model without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    token_ids = [TEXT.vocab.stoi[tok] for tok in tokens]
    # unsqueeze(1) adds a batch dimension of size 1: shape (seq_len, 1)
    tensor = torch.LongTensor(token_ids).unsqueeze(1).to(target_device)
    # The length tensor is deliberately left on the CPU, as in the original code
    length_tensor = torch.LongTensor([len(token_ids)])

    # Inference only: no autograd graph is needed
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [None]:
# Score a short, strongly worded sample input
predict_sentiment(best_model, "fuck, garbage")

In [None]:
predict_sentiment(best_model, "This film is terrible")  # original note: the closer to 1, the more negative the review — NOTE(review): holds only if 'neg' maps to 1 in LABEL.vocab.stoi; confirm

In [None]:
predict_sentiment(best_model, "This film is great")  # original note: the closer to 0, the more positive the review — NOTE(review): holds only if 'pos' maps to 0 in LABEL.vocab.stoi; confirm