<a href="https://colab.research.google.com/github/douzujun/NLP-Project/blob/master/LSTM-Emotional%20analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch
!pip install torchtext
!python -m spacy download en
!pip install torchvision

# K80 gpu for 12 hours
import torch
from torch import nn, optim
from torchtext import data, datasets

# Report whether a CUDA device is visible to PyTorch in this runtime.
cuda_ok = torch.cuda.is_available()
print('GPU:', cuda_ok)

# Fix PyTorch's global RNG seed so repeated runs start from the same state.
torch.manual_seed(123)

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
GPU: True


<torch._C.Generator at 0x7f45002e3130>

In [3]:
# Re-seed PyTorch's RNG before any data handling.
torch.manual_seed(123)

# Two Field objects describe how each column is processed:
# TEXT tokenizes the review with spaCy, LABEL yields a float label.
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

# IMDB has 50,000 reviews in two classes (positive / negative).
# splits() runs them through the fields above and returns train and test sets.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

print('len of train data:', len(train_data))        # 25000
print('len of test data:', len(test_data))          # 25000

# Each element of .examples is one sample: a token list plus its label.
sample = train_data.examples[15]
print(sample.text)                                  # list of tokens of the review
print(sample.label)                                 # label string, here 'pos'

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:08<00:00, 10.0MB/s]


len of train data: 25000
len of test data: 25000
['The', 'movie', 'is', 'a', 'bit', '"', 'thin', '"', 'after', 'reading', 'the', 'book', ',', 'but', 'it', "'s", 'still', 'one', 'of', 'the', 'greatest', 'movies', 'ever', 'made', '.', 'Sheryl', 'Lee', 'is', 'beautiful', 'and', 'Nick', 'Nolte', 'is', 'really', '"', 'vonneguty', '"', '.', 'He', 'makes', 'great', 'job', 'expressing', 'the', 'feelings', 'from', 'the', 'book', 'to', 'the', 'film', '.', 'Not', 'many', 'films', 'engage', 'the', 'feeling', 'of', 'the', 'book', 'as', 'well', 'as', 'Mother', 'Night', 'does', '.']
pos


In [4]:
# Build the text vocabulary from the training split, capped at 10,000 tokens,
# and attach pretrained 100-d GloVe vectors; build the label vocabulary too.
TEXT.build_vocab(train_data, vectors='glove.6B.100d', max_size=10000)
LABEL.build_vocab(train_data)

text_vocab = TEXT.vocab
print(len(text_vocab))            # 10002 (10000 tokens + '<unk>' + '<pad>')
print(text_vocab.itos[:12])       # index -> token, most frequent first
print(text_vocab.stoi['and'])     # token -> index, 5 for 'and'
print(LABEL.vocab.stoi)           # {'neg': 0, 'pos': 1}

.vector_cache/glove.6B.zip: 862MB [06:30, 2.21MB/s]                           
100%|█████████▉| 398894/400000 [00:16<00:00, 23519.32it/s]

10002
['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is', 'in', 'I']
5
defaultdict(<function _default_unk_index at 0x7f44ffd3ed90>, {'neg': 0, 'pos': 1})


In [18]:
batchsz = 30
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# BucketIterator yields batches whose tensors are already placed on `device`.
train_iterator, test_iterator = data.BucketIterator.splits(
                                (train_data, test_data),
                                batch_size = batchsz,
                                device=device
                               )

In [6]:
class RNN(nn.Module):
  """Bidirectional stacked-LSTM classifier producing one logit per sequence.

  Token ids are embedded, fed through a bidirectional LSTM, and the final
  hidden states of the top layer's two directions are concatenated and
  projected to a single raw logit (no sigmoid is applied here).
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim,
               num_layers=2, dropout=0.5):
    """
    Args:
      vocab_size: number of rows in the embedding table (vocabulary size).
      embedding_dim: size of each embedding vector.
      hidden_dim: LSTM hidden size per direction.
      num_layers: number of stacked LSTM layers (default 2).
      dropout: dropout probability used on the embeddings, between LSTM
        layers, and on the final hidden state (default 0.5).
    """
    super().__init__()

    # token id => embedding vector of size embedding_dim
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    # embedding_dim => hidden_dim per direction.
    # nn.LSTM only applies dropout between stacked layers and warns when
    # dropout > 0 with a single layer, so pass 0 in that case.
    self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                       bidirectional=True,
                       dropout=dropout if num_layers > 1 else 0.0)
    # Bidirectional, so the classifier sees hidden_dim*2 features => 1 logit
    self.fc = nn.Linear(hidden_dim * 2, 1)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Compute one logit per sequence.

    Args:
      x: integer token ids of shape [seq_len, b] (the LSTM is not
        batch_first, so the sequence dimension comes first).

    Returns:
      Tensor of shape [b, 1] holding raw logits.
    """
    # [seq_len, b] => [seq_len, b, embedding_dim]
    embedding = self.dropout(self.embedding(x))

    # output: [seq_len, b, hidden_dim*2]
    # hidden / cell: [num_layers*2, b, hidden_dim]
    output, (hidden, cell) = self.rnn(embedding)
    # The last two entries of `hidden` are the top layer's forward and
    # backward final states: 2 x [b, hidden_dim] => [b, hidden_dim*2]
    hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)
    # [b, hidden_dim*2] => [b, 1]
    hidden = self.dropout(hidden)
    out = self.fc(hidden)

    return out

In [19]:
# Build the classifier: vocabulary size, embedding width, LSTM hidden size.
rnn = RNN(vocab_size=len(TEXT.vocab), embedding_dim=100, hidden_dim=256)

# Vectors attached to the vocabulary when it was built.
pretrained_embedding = TEXT.vocab.vectors
print('pretrained_embedding:', pretrained_embedding.shape)    # torch.Size([10002, 100])

# Overwrite the randomly initialised embedding table with the pretrained one.
embedding_weights = rnn.embedding.weight.data
embedding_weights.copy_(pretrained_embedding)
print('embedding layer inited.')

pretrained_embedding: torch.Size([10002, 100])
embedding layer inited.


In [20]:
# Move the model to the target device first: the torch.optim docs recommend
# doing this before constructing an optimizer over its parameters.
rnn = rnn.to(device)
optimizer = optim.Adam(rnn.parameters(), lr=1e-3)
# BCEWithLogitsLoss is binary cross-entropy that takes raw logits,
# so the model does not apply a sigmoid itself.
criteon = nn.BCEWithLogitsLoss().to(device)
# Last expression: display the model architecture.
rnn

RNN(
  (embedding): Embedding(10002, 100)
  (rnn): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [27]:
import numpy as np 
def binary_acc(preds, y):
  """Fraction of logits whose rounded sigmoid equals the target.

  Args:
    preds: tensor of raw logits.
    y: tensor of 0./1. targets, compared element-wise with the predictions.

  Returns:
    Tensor holding (number of matches) / len(matches).
  """
  predicted_labels = torch.sigmoid(preds).round()
  hits = (predicted_labels == y).float()
  return hits.sum() / len(hits)

In [22]:
def train(rnn, iterator, optimizer, criteon):
  """Run one training pass over `iterator` and report the epoch accuracy.

  Accuracy and loss are averaged per sample (each batch weighted by its
  size), so a shorter final batch does not skew the epoch figures.

  Args:
    rnn: model called as `rnn(batch.text)`; its output is squeezed on dim 1.
    iterator: iterable of batches exposing `.text` and `.label`.
    optimizer: optimizer stepped once per batch.
    criteon: loss called as `criteon(pred, batch.label)`.

  Returns:
    Tuple `(avg_acc, avg_loss)` as Python floats; both are NaN when the
    iterator yields no samples. `avg_loss` is a true per-sample mean only if
    `criteon` uses mean reduction (assumption: verify for custom losses).
  """
  rnn.train()   # switch to training mode

  total_correct = 0.0
  total_loss = 0.0
  total_seen = 0

  for i, batch in enumerate(iterator):
    # [seq, b] => [b, 1] => [b], squeezed to match batch.label
    pred = rnn(batch.text).squeeze(1)

    loss = criteon(pred, batch.label)
    # accuracy of this batch
    acc = binary_acc(pred, batch.label).item()

    n = len(batch.label)
    total_correct += acc * n
    total_loss += loss.item() * n
    total_seen += n

    optimizer.zero_grad() # clear gradients from the previous step
    loss.backward()       # back-propagate
    optimizer.step()      # update parameters

    if i % 10 == 0:
      print(i, acc)

  avg_acc = total_correct / total_seen if total_seen else float('nan')
  avg_loss = total_loss / total_seen if total_seen else float('nan')
  print('avg acc:', avg_acc)
  return avg_acc, avg_loss



In [23]:
def evaluate(rnn, iterator, criteon):
  """Score the model on `iterator` without updating it.

  Accuracy and loss are averaged per sample (each batch weighted by its
  size), so a shorter final batch does not skew the result.

  Args:
    rnn: model called as `rnn(batch.text)`; its output is squeezed on dim 1.
    iterator: iterable of batches exposing `.text` and `.label`.
    criteon: loss called as `criteon(pred, batch.label)`.

  Returns:
    Tuple `(avg_acc, avg_loss)` as Python floats; both are NaN when the
    iterator yields no samples. `avg_loss` is a true per-sample mean only if
    `criteon` uses mean reduction (assumption: verify for custom losses).
  """
  rnn.eval()         # switch to evaluation mode

  total_correct = 0.0
  total_loss = 0.0
  total_seen = 0

  with torch.no_grad():
    for batch in iterator:
      pred = rnn(batch.text).squeeze(1)      # [b, 1] => [b]
      n = len(batch.label)
      total_loss += criteon(pred, batch.label).item() * n
      total_correct += binary_acc(pred, batch.label).item() * n
      total_seen += n

  avg_acc = total_correct / total_seen if total_seen else float('nan')
  avg_loss = total_loss / total_seen if total_seen else float('nan')

  print('test acc:', avg_acc)
  return avg_acc, avg_loss

In [28]:
n_epochs = 10
for epoch in range(n_epochs):
  # One pass over the training iterator, then a pass over the test iterator.
  train(rnn, train_iterator, optimizer, criteon)
  evaluate(rnn, test_iterator, criteon)

0 0.8666667342185974
10 0.9666666984558105
20 0.8000000715255737
30 0.8666667342185974
40 0.8666667342185974
50 0.8000000715255737
60 0.9333333969116211
70 0.7666667103767395
80 0.9000000357627869
90 0.8666667342185974
100 0.9000000357627869
110 0.7666667103767395
120 0.8000000715255737
130 0.9666666984558105
140 0.8666667342185974
150 0.9000000357627869
160 0.9000000357627869
170 0.9000000357627869
180 0.8000000715255737
190 0.8000000715255737
200 0.9333333969116211
210 0.9000000357627869
220 0.9333333969116211
230 0.8666667342185974
240 0.9000000357627869
250 0.7666667103767395
260 0.9333333969116211
270 0.9000000357627869
280 0.8000000715255737
290 0.8666667342185974
300 0.9333333969116211
310 0.7666667103767395
320 0.9000000357627869
330 0.9666666984558105
340 0.9666666984558105
350 0.8333333730697632
360 0.9000000357627869
370 0.8000000715255737
380 0.9000000357627869
390 0.8666667342185974
400 0.8333333730697632
410 0.9000000357627869
420 0.9333333969116211
430 0.8333333730697632

In [38]:
def predice_test(x):
  """Turn raw logits into hard 0./1. predictions.

  Args:
    x: tensor of raw logits.

  Returns:
    Tensor of the same shape holding the rounded sigmoid of each logit.
  """
  return torch.sigmoid(x).round()

In [39]:
# Inference only: set evaluation mode explicitly and disable autograd so no
# graph is built (without no_grad the predictions carry a grad_fn).
rnn.eval()
with torch.no_grad():
  for batch in test_iterator:
    pred = rnn(batch.text).squeeze(1)   # [b, 1] => [b]
    pred = predice_test(pred)
    print(pred)          # predicted labels for the first test batch
    print(batch.label)   # ground-truth labels for the same batch
    break                # a single batch is enough for a spot check



tensor([1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 0.], device='cuda:0',
       grad_fn=<RoundBackward>)
tensor([1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 0., 0.], device='cuda:0')
