In [1]:
!/opt/bin/nvidia-smi

Sat Mar  5 04:44:05 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P8    35W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
! ls "/content/drive/My Drive/nlp-beginner"

acc1.png  data	img  loss1.png	task2  task2-CNN.ipynb


In [5]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [6]:
import torch
from torch import nn
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import time
import torchtext.vocab as Vocab
import os
from sklearn.model_selection import train_test_split
import  torch.nn.functional as F

""" 设置随机种子 """
torch.manual_seed(33)
torch.cuda.manual_seed(33)
np.random.seed(33)

In [7]:
import re
import collections
import matplotlib.pyplot as plt
import numpy as np
import torch
from torchtext.vocab import vocab
from collections import OrderedDict
import pandas as pd
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

nltk.download('punkt')
nltk.download('stopwords')


def save_img(loss, acc, test_acc):
    num_epochs = len(loss)
    epochs = range(1, num_epochs + 1)
    plt.plot(epochs, acc, 'b', label='Training accuracy')
    plt.plot(epochs, test_acc, 'r', label='validation accuracy')
    plt.title('Training and validation accuracy')
    plt.legend(loc='lower right')
    plt.figure()
    plt.savefig("/content/drive/My Drive/nlp-beginner/acc1.png")

    plt.plot(epochs, loss, 'r', label='Training loss')
    # plt.plot(epochs, val_loss, 'b', label='validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.savefig("/content/drive/My Drive/nlp-beginner/loss1.png")


def load_pretrained_embedding(words, pretrained_vocab):
    """从预训练好的vocab中提取出words对应的词向量"""
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0]) # 初始化为0
    oov_count = 0 # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count += 1
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed


def load_data(data_path):
    data = pd.read_csv(data_path, sep="\t")
    print("data.shape: ", data.shape)  # (156060, 4)
    # 提取句子与标签的列
    x = data["Phrase"]
    y = data["Sentiment"]
    return x, torch.tensor(y)


# 预处理文本：全部转小写、去除标点符号
def pre_process(text):
    text = text.lower()  # 转小写
    # 去除标点符号
    punctuation = '!,;:?."\'、，；`'
    text = re.sub(r'[{}]+'.format(punctuation), ' ', text)
    return text.strip()


# 分词
def get_tokenized_sent(sents):
    stopWords = set(stopwords.words('english'))
    def tokenizer(text):
        words = word_tokenize(text)
        words = [word.lower() for word in words
                 if word.isalpha() and word not in stopWords]
        return words
    return [tokenizer(review) for review in sents]


# 获得数据集的词典
def get_vocab(sents):
    tokenized_data = get_tokenized_sent(sents)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    min_feq = 3
    idx = len(sorted_by_freq_tuples) - 1
    while sorted_by_freq_tuples[idx][1] < min_feq:
        sorted_by_freq_tuples.pop(idx)
        idx -= 1
    # 用<NOF>表示未找到的词
    sorted_by_freq_tuples.append(('<NOF>', 1))
    ordered_dict = OrderedDict(sorted_by_freq_tuples)
    vocab_obj = vocab(ordered_dict)
    return vocab_obj


def words2id(vocab_dic, words):
    def pad(x):
        return x[:MAX_LEN] if len(x) > MAX_LEN else x + [0] * (MAX_LEN - len(x))

    vec = []
    not_found_id = len(vocab_dic) - 1
    for word in words:
        try:
            vec.append(vocab_dic[word])
        except KeyError:
            vec.append(not_found_id)
    return pad(vec)


# 将句子转成长度一致的 词序号向量
def preprocess_data(sents, vocab_dic, file_name):
    tokenized_data = get_tokenized_sent(sents)
    list = []
    for words in tqdm(tokenized_data):
        list.append(words2id(vocab_dic, words))
    # np.save(file_name, np.array(list))
    return torch.tensor(np.array(list))


def get_sents_ids(file_path):
    sents_ids = np.load(file_path).tolist()
    return torch.tensor(sents_ids)


def analysis_len(sents):
    sents_len = [len(sent) for sent in sents]
    plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
    plt.figure(figsize=(30, 12), dpi=100)
    plt.subplot(2, 3, 2)
    plt.title("句子长度分布")
    plt.hist(sents_len, bins=list(range(0, max(sents_len) + 1, 1)))
    plt.xlabel('句子长度')
    plt.ylabel('句子数量')
    """ title 累计分布"""
    plt.subplot(2, 3, 5)
    plt.title('累计分布图')
    plt.hist(sents_len, bins=list(range(0, max(sents_len) + 1, 1)), cumulative=True)
    plt.xlabel('句子长度')
    plt.ylabel('累计比例(%)')
    plt.savefig("sent_len.png")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
def corr1d(X, K):
    w = K.shape[0]
    Y = torch.zeros((X.shape[0] - w + 1))
    for i in range(Y.shape[0]):
        Y[i] = (X[i: i + w] * K).sum()
    return Y

X, K = torch.tensor([0, 1, 2, 3, 4, 5, 6]), torch.tensor([1, 2])
corr1d(X, K)

tensor([ 2.,  5.,  8., 11., 14., 17.])

In [9]:
def corr1d_multi_in(X, K):
    # 首先沿着X和K的第0维（通道维）遍历并计算一维互相关结果。然后将所有结果堆叠起来沿第0维累加
    return torch.stack([corr1d(x, k) for x, k in zip(X, K)]).sum(dim=0)

X = torch.tensor([[0, 1, 2, 3, 4, 5, 6],
              [1, 2, 3, 4, 5, 6, 7],
              [2, 3, 4, 5, 6, 7, 8]])
K = torch.tensor([[1, 2], [3, 4], [-1, -3]])
corr1d_multi_in(X, K)

tensor([ 2.,  8., 14., 20., 26., 32.])

In [10]:
class GlobalMaxPool1d(nn.Module):
    def __init__(self):
        super(GlobalMaxPool1d, self).__init__()
    def forward(self, x):
         # x shape: (batch_size, channel, seq_len)
         # return shape: (batch_size, channel, 1)
        return F.max_pool1d(x, kernel_size=x.shape[2])

class TextCNN(nn.Module):
    def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # 不参与训练的嵌入层
        self.constant_embedding = nn.Embedding(len(vocab), embed_size)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 5)
        # 时序最大池化层没有权重，所以可以共用一个实例
        # self.pool = GlobalMaxPool1d()
        self.convs = nn.ModuleList()  # 创建多个一维卷积层
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels = 2*embed_size, 
                                        out_channels = c, 
                                        kernel_size = k))

    def forward(self, inputs):
        # 将两个形状是(批量大小, 词数, 词向量维度)的嵌入层的输出按词向量连结
        embeddings = torch.cat((
            self.embedding(inputs), 
            self.constant_embedding(inputs)), dim=2) # (batch, seq_len, 2*embed_size)
        # 根据Conv1D要求的输入格式，将词向量维，即一维卷积层的通道维(即词向量那一维)，变换到前一维
        embeddings = embeddings.permute(0, 2, 1)
        # 对于每个一维卷积层，在时序最大池化后会得到一个形状为(批量大小, 通道大小, 1)的
        # Tensor。使用flatten函数去掉最后一维，然后在通道维上连结
        # encoding = torch.cat([self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
        encoding = torch.cat([F.relu(conv(embeddings)).squeeze(-1) for conv in self.convs], dim=1)
        # 应用丢弃法后使用全连接层得到输出
        outputs = self.decoder(self.dropout(encoding))
        return outputs


In [11]:
data_path = "/content/drive/My Drive/nlp-beginner"

num_hiddens = 100
num_layers = 1
lr = 0.005
num_epochs = 10
batch_size = 256
MAX_LEN = 30  # 将每条评论通过截断或者补0，使得长度变成500

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")


embed_size, kernel_sizes, nums_channels = 100, [3, 4, 5], [100, 100, 100]

DATA_ROOT = data_path + "/data"
glove_vocab = Vocab.GloVe(name='6B', dim=embed_size, cache=os.path.join(DATA_ROOT, "glove"))

In [12]:
train_path = data_path + "/data/train2.tsv"
# train_path = data_path + "/data/data.tsv"

test_path = data_path + "/data/test2.tsv"
train_sents, train_labels = load_data(train_path)
test_sents, test_labels = load_data(test_path)

# from data_process import analysis_len
# analysis_len([sent.split() for sent in train_sents])


x = pd.concat([train_sents, test_sents])
y = torch.cat((train_labels, test_labels), -1)
train_sents, test_sents, train_labels, test_labels = train_test_split(x, y, test_size=0.2)
vocab = get_vocab(x)
vocab_dic = vocab.get_stoi()


data.shape:  (126874, 4)
data.shape:  (29186, 4)


In [13]:
def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    loss_epochs = []
    acc_epochs = []
    test_acc_epochs = []
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        process_bar = tqdm(train_iter)
        for (X, y) in process_bar:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), 0.5)   # 梯度裁剪
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        loss_epochs.append(train_l_sum / batch_count)
        acc_epochs.append(train_acc_sum / n)
        test_acc_epochs.append(test_acc)
        # test_loss = compute_loss(test_iter, net, loss, device)
        print('epoch %d, train loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))
    save_img(loss_epochs, acc_epochs, test_acc_epochs)


def compute_loss(data_iter, net, loss, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        # 如果没指定device就使用net的device
        device = list(net.parameters())[0].device
    loss_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            X = X.to(device)
            y = y.to(device)
            # net.eval() # 评估模式, 这会关闭dropout
            y_hat = net(X)
            loss_sum += loss(y_hat, y).cpu().item()
            # net.train() # 改回训练模式
            n += y.shape[0]
    return loss_sum / n


def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        # 如果没指定device就使用net的device
        device = list(net.parameters())[0].device
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval() # 评估模式, 这会关闭dropout
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train() # 改回训练模式
            else: # 自定义的模型, 3.13节之后不会用到, 不考虑GPU
                if('is_training' in net.__code__.co_varnames): # 如果有is_training这个参数
                    # 将is_training设置成False
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item()
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
            n += y.shape[0]
    return acc_sum / n


def main():
    print('# words in vocab:', len(vocab))
    train_input_file = data_path + "/train_input3.npy"
    test_input_file = data_path + "/test_input3.npy"

    train_input = preprocess_data(train_sents, vocab_dic, train_input_file)
    test_input = preprocess_data(test_sents, vocab_dic, test_input_file)

    # train_input = get_sents_ids(train_input_file)
    train_set = TensorDataset(train_input, train_labels)
    train_loader = DataLoader(train_set, batch_size, shuffle=True)
    # test_input = get_sents_ids(test_input_file)
    test_set = TensorDataset(test_input, test_labels)
    test_loader = DataLoader(test_set, batch_size)

    # 创建网络
    model = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)
    # 加载Glove词向量
    vocab_list = vocab.get_itos()
    model.embedding.weight.data.copy_(
        load_pretrained_embedding(vocab_list, glove_vocab))
    model.constant_embedding.weight.data.copy_(
        load_pretrained_embedding(vocab_list, glove_vocab))
    model.constant_embedding.weight.requires_grad = False
    
    # 要过滤掉不计算梯度的embedding参数
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        model.parameters()), lr=lr)
    loss = nn.CrossEntropyLoss()
    train(train_loader, test_loader, model, loss, optimizer, device, num_epochs)


if __name__ == "__main__":
    main()

# words in vocab: 14533


100%|██████████| 124848/124848 [00:00<00:00, 223422.58it/s]
100%|██████████| 31212/31212 [00:00<00:00, 265861.54it/s]


There are 300 oov words.
There are 300 oov words.
training on  cuda:0


  0%|          | 0/488 [00:00<?, ?it/s]


RuntimeError: ignored