<a href="https://colab.research.google.com/github/hshuai97/Colab20210803/blob/main/SHua_Text_Level_GNN%E7%89%88%E6%9C%AC2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

改进：增加词的2个语义和相应的权重，在训练82个周期的基础上，最好结果为0.9689

In [1]:
%%writefile parsing.py
# 标准库
import pandas as pd
from numpy import asarray
import numpy as np
from time import time
import argparse

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

# 操作文件数据
import pickle


#兼容原始GloVe和添加OPTED中'非常用词'后的Full_GloVe
class Full_GloveTokenizer:
    def __init__(self, filename, unk='<unk>', pad='<pad>'): 
        self.filename = filename  # Full_GloVe文件的路径
        self.unk = unk  # 对于未知词汇的处理
        self.pad = pad  # 按最大长度对齐每个文本
        self.stoi = dict()
        self.itos = dict()
        self.embedding_matrix = list()  # [[embedded_word1], [embedded_word2],...]

        Full_GloVe = pickle.load(open(self.filename, 'rb'))  # 读取Full_GloVe字典

        for i, k in enumerate(Full_GloVe.keys()):
          self.stoi[k] = i
          self.itos[i] = k
          self.embedding_matrix.append(Full_GloVe[k])


        if self.unk is not None: # 未知词一律用一个随机embedded向量表示
            i += 1
            self.stoi[self.unk] = i
            self.itos[i] = self.unk
            self.embedding_matrix.append(asarray(np.random.rand(len(self.embedding_matrix[0])), dtype='float32'))
        if self.pad is not None: # Add pad token into the tokenizer # padded词一律用全0的embedded向量表示
            i += 1
            self.stoi[self.pad] = i
            self.itos[i] = self.pad
            self.embedding_matrix.append(asarray(np.zeros(len(self.embedding_matrix[0])), dtype='float32'))
        self.embedding_matrix = np.array(self.embedding_matrix).astype(np.float32) # Convert if from double to float for efficiency

    def encode(self, sentence):
        if type(sentence) == str:
            sentence = sentence.split(' ')
        elif len(sentence): # Convertible to list
            sentence = list(sentence)
        else:
            raise TypeError('sentence should be either a str or a list of str!')
        encoded_sentence = []
        for word in sentence:
            encoded_sentence.append(self.stoi.get(word, self.stoi[self.unk]))
        return encoded_sentence

    def decode(self, encoded_sentence):
        try:
            encoded_sentence = list(encoded_sentence)
        except Exception as e:
            print(e)
            raise TypeError('encoded_sentence should be either a str or a data type that is convertible to list type!')
        sentence = []
        for encoded_word in encoded_sentence:
            sentence.append(self.itos[encoded_word])
        return sentence

    def embedding(self, encoded_sentence):
        # Full_GloVe中有464885个嵌入词向量
        return self.embedding_matrix[np.array(encoded_sentence)]  # 获得一个句子的每个词的embedded向量(max_sentence_length, d)


class TextLevelGNNDataset(Dataset): # For instantiating train, validation and test dataset
    def __init__(self, node_sets, neighbor_sets, public_edge_mask, labels):
        super(TextLevelGNNDataset).__init__()
        self.node_sets = node_sets  # 同batch_size张图的节点集合(batch_size, l)
        self.neighbor_sets = neighbor_sets  # batch_size张图的节点邻居集合
        self.public_edge_mask = public_edge_mask  # batch_size张图的布尔型张量边标记
        self.labels = labels  # batch_size张图的标签

    def __getitem__(self, i):  # 将batch_size张图中，每张图对应的四个属性封装为一个长张量单元，这样可以灵活调整batch_size的大小
        return torch.LongTensor(self.node_sets[i]), \
               torch.nn.utils.rnn.pad_sequence([torch.LongTensor(neighbor) for neighbor in self.neighbor_sets[i]], batch_first=True, padding_value=1), \
               self.public_edge_mask[torch.LongTensor(self.node_sets[i]).unsqueeze(-1).repeat(1, torch.nn.utils.rnn.pad_sequence([torch.LongTensor(neighbor) for neighbor in self.neighbor_sets[i]], batch_first=True, padding_value=1).shape[-1]), torch.nn.utils.rnn.pad_sequence([torch.LongTensor(neighbor) for neighbor in self.neighbor_sets[i]], batch_first=True, padding_value=1)], \
               torch.FloatTensor(self.labels[i])

    def __len__(self):
        return len(self.labels)


class TextLevelGNNDatasetClass: # This class is used to achieve parameters sharing among datasets (train/validation/test)
    def __init__(self, train_filename, test_filename, opted_filename, full_glove_pth, tokenizer, MAX_LENGTH=50, p=2, min_freq=2, train_validation_split=0.8):
        self.train_filename = train_filename
        self.test_filename = test_filename
        # OPTED词典路径
        self.opted_filename = opted_filename
        self.full_glove_pth = full_glove_pth 

        self.tokenizer = tokenizer
        self.MAX_LENGTH = MAX_LENGTH
        self.p = p
        self.min_freq = min_freq
        self.train_validation_split = train_validation_split

        self.train_data = pd.read_csv(self.train_filename, sep='\t', header=None)
        self.test_data = pd.read_csv(self.test_filename, sep='\t', header=None)

        self.stoi = {'<unk>': 0, '<pad>': 1} # Re-index
        self.itos = {0: '<unk>', 1: '<pad>'} # Re-index
        self.vocab_count = len(self.stoi)
        self.embedding_matrix = None

        # 通过OPTED词典，返回训练集上每个词对应的1个语义，没有语义的用0向量表示,对于没有语义的词，后续可以考虑抓取维基百科的词条解释，类似于不断学习增加知识库的过程
        self.meaning_matrix = list()  # [meaning1, meaning2, ...] 
        self.meaning1 = list()  # 节点语义1
        self.meaning2 = list()  # 节点语义2

        self.meaning_mask = list()  # 训练集上的词的语义标记矩阵，标记每个词是否存在相应的语义
        self.mask1 = list()
        self.mask2 = list()

        self.label_dict = dict(zip(self.train_data[0].unique(), pd.get_dummies(self.train_data[0].unique()).values.tolist()))

        # 将读取的数据转成数组
        self.train_dataset, self.validation_dataset = random_split(self.train_data.to_numpy(), [int(len(self.train_data) * train_validation_split), len(self.train_data) - int(len(self.train_data) * train_validation_split)])
        self.test_dataset = self.test_data.to_numpy()

        self.build_vocab() # Based on train_dataset only. Updates self.stoi, self.itos, self.vocab_count and self.embedding_matrix

        self.build_meaning_matrix()  # 获得训练集上每个词对应的2个meaning
        
        self.train_dataset, self.validation_dataset, self.test_dataset, self.edge_stat, self.public_edge_mask = self.prepare_dataset()

    def build_vocab(self):
        vocab_list = [sentence.split(' ') for _, sentence in self.train_dataset]
        unique_vocab = []
        for vocab in vocab_list:
            unique_vocab.extend(vocab)
        unique_vocab = list(set(unique_vocab))

        for vocab in unique_vocab:
            if vocab in self.tokenizer.stoi.keys():
                self.stoi[vocab] = self.vocab_count
                self.itos[self.vocab_count] = vocab
                self.vocab_count += 1
        self.embedding_matrix = self.tokenizer.embedding(self.tokenizer.encode(list(self.stoi.keys())))  # (|V|, d) 仅根据训练集构建的全局词嵌入向量
       
    def build_meaning_matrix(self):  # 构建训练集上每个节点对应的2个语义关系
        OPTED = pickle.load(open(self.opted_filename, 'rb'))
        Full_GloVe = pickle.load(open(self.full_glove_pth, 'rb'))

        # meaning1
        for vocab in self.stoi.keys():  # 遍历训练集字典下的每个词
          m = asarray(np.zeros(300), dtype='float32')   # 初始化一个300维的meaning
          j = 0
          if vocab in OPTED.keys():  # 词在OPTED中, 对于'unk'、'pad'和其他不在OPTED中的词均用一个全0的300维向量语义表示
            for s in OPTED[vocab][0].split():  # 取第一个meaning, 后续可考虑取多个meaning(2, 3, 4...)
              if s in Full_GloVe.keys():
                j += 1
                m += Full_GloVe[s]
            if j>0:
              m = m/j
            self.meaning1.append(m)  # 添加到meaning_matrix中为了和embedded_neighbor_node结构对齐（三维）张量
            self.mask1.append(True)  # 有第一个语义
          else: # 词不在OPTED中
            self.meaning1.append(m)  # 没有解释的词，其meaning用0向量代替
            self.mask1.append(False)  # 没有第一个语义
        self.meaning1 = np.array(self.meaning1).astype(np.float32)
        
        # meaning2
        for vocab in self.stoi.keys():
          m = asarray(np.zeros(300), dtype='float32')
          j = 0
          if vocab in OPTED.keys():
            if len(OPTED[vocab]) > 1:  # 如果词存在第二个语义
              for s in OPTED[vocab][1].split():
                if s in Full_GloVe.keys():
                  j += 1
                  m += Full_GloVe[s]
              if j > 0:
                m = m/j
              self.meaning2.append(m)
              self.mask2.append(True)  # 有第二个语义
            else:  #没有第二个语义
              self.meaning2.append(m)
              self.mask2.append(False)  # 没有第二个语义
          else:  # 词不在OPTED中
            self.meaning2.append(m)
            self.mask2.append(False)  # 没有第二个语义
        
        self.meaning2 = np.array(self.meaning2).astype(np.float32)
        
        # 将meaning1和meaning2合并到meaning_matrix中
        for i in range(len(self.meaning1)):
          self.meaning_matrix.append([self.meaning1[i], self.meaning2[i]])
        self.meaning_matrix = np.array(self.meaning_matrix).astype(np.float32)
        
        # 将mask1和mask2合并到meaning_mask中
        for i in range(len(self.mask1)):
          self.meaning_mask.append([self.mask1[i], self.mask2[i]])
        
        # print(f'self.meaning_matrix:\n{self.meaning_matrix[0]}')
        # print(f'self.meaning_mask:\n{self.meaning_mask[0]}')
        #print(f'len self.meaning_matrix[0]:{len(self.meaning_matrix[0])}')
        #print(f'len self.meaning_matrix[1]:{len(self.meaning_matrix[0][1])}')
        
          
    # 本类中的主方法，负责返回结果
    def prepare_dataset(self): # will also build self.edge_stat and self.public_edge_mask
        # preparing self.train_dataset
        node_sets = [[self.stoi.get(vocab, 0) for vocab in sentence.strip().split(' ')][:self.MAX_LENGTH] for _, sentence in self.train_dataset] # Only retrieve the first MAX_LENGTH words in each document
        neighbor_sets = [create_neighbor_set(node_set, p=self.p) for node_set in node_sets]
        labels = [self.label_dict[label] for label, _ in self.train_dataset]

        # 调用类中的其他方法获取边的统计和标记信息
        edge_stat, public_edge_mask = self.build_public_edge_mask(node_sets, neighbor_sets, min_freq=self.min_freq)
        
        train_dataset = TextLevelGNNDataset(node_sets, neighbor_sets, public_edge_mask, labels)  # 传给TextLevelGNNDataset类，返回每张图对应的torch.LongTensor(tensor0, tensor1, tensor2, tensor3)
        
        # preparing self.validation_dataset
        node_sets = [[self.stoi.get(vocab, 0) for vocab in sentence.strip().split(' ')][:self.MAX_LENGTH] for _, sentence in self.validation_dataset] # Only retrieve the first MAX_LENGTH words in each document
        neighbor_sets = [create_neighbor_set(node_set, p=self.p) for node_set in node_sets]
        labels = [self.label_dict[label] for label, _ in self.validation_dataset]
        validation_dataset = TextLevelGNNDataset(node_sets, neighbor_sets, public_edge_mask, labels)  # 传给TextLevelGNNDataset类，返回每张图对应的torch.LongTensor(tensor0, tensor1, tensor2, tensor3)

        # preparing self.test_dataset
        node_sets = [[self.stoi.get(vocab, 0) for vocab in sentence.strip().split(' ')][:self.MAX_LENGTH] for _, sentence in self.test_dataset] # Only retrieve the first MAX_LENGTH words in each document
        neighbor_sets = [create_neighbor_set(node_set, p=self.p) for node_set in node_sets]
        labels = [self.label_dict[label] for label, _ in self.test_dataset]
        test_dataset = TextLevelGNNDataset(node_sets, neighbor_sets, public_edge_mask, labels)  # 传给TextLevelGNNDataset类，返回每张图对应的torch.LongTensor(tensor0, tensor1, tensor2, tensor3)

        return train_dataset, validation_dataset, test_dataset, edge_stat, public_edge_mask  # 将训练集、验证集和测试集对应的每张图的信息封装后的长张量返回，以及边的统计和标记信息

    def build_public_edge_mask(self, node_sets, neighbor_sets, min_freq=2):
        edge_stat = torch.zeros(self.vocab_count, self.vocab_count)
        for node_set, neighbor_set in zip(node_sets, neighbor_sets):
            for neighbor in neighbor_set:
                for to_node in neighbor:
                    edge_stat[node_set, to_node] += 1
        public_edge_mask = edge_stat < min_freq # mark True at uncommon edges
        return edge_stat, public_edge_mask


def create_neighbor_set(node_set, p=3):  # node_sets: 单个数据集(train/ val/ test)上的(batch_size，l), node_set = (max_length,)
    if type(node_set[0]) != int:
        raise ValueError('node_set should be a 1D list!')
    if p < 0:
        raise ValueError('p should be an integer >= 0!')
    sequence_length = len(node_set)
    neighbor_set = []
    for i in range(sequence_length):
        neighbor = []
        for j in range(-p, p+1):
            if 0 <= i + j < sequence_length:
                neighbor.append(node_set[i+j])
        neighbor_set.append(neighbor)
    return neighbor_set



def pad_custom_sequence(sequences):
    '''
    To pad different sequences into a padded tensor for training. The main purpose of this function is to separate different sequence, pad them in different ways and return padded sequences.
    Input:
        sequences <list>: A sequence with a length of 4, representing the node sets sequence in index 0, neighbor sets sequence in index 1, public edge mask sequence in index 2 and label sequence in index 3.
                          And the length of each sequences are same as the batch size.
                          sequences: [node_sets_sequence, neighbor_sets_sequence, public_edge_mask_sequence, label_sequence]
    Return:
        node_sets_sequence <torch.LongTensor>: The padded node sets sequence (works with batch_size >= 1).
        neighbor_sets_sequence <torch.LongTensor>: The padded neighbor sets sequence (works with batch_size >= 1).
        public_edge_mask_sequence <torch.BoolTensor>: The padded public edge mask sequence (works with batch_size >= 1).
        label_sequence <torch.FloatTensor>: The padded label sequence (works with batch_size >= 1).
    '''
    node_sets_sequence = []  # list[tensor0, tensor1, tensor2, tensor3]
    neighbor_sets_sequence = []
    public_edge_mask_sequence = []
    label_sequence = []
    for node_sets, neighbor_sets, public_edge_mask, label in sequences:  # 依次从batch_size张图中取每张图的4个数据
        node_sets_sequence.append(node_sets)
        neighbor_sets_sequence.append(neighbor_sets)
        public_edge_mask_sequence.append(public_edge_mask)
        label_sequence.append(label) 
    node_sets_sequence = torch.nn.utils.rnn.pad_sequence(node_sets_sequence, batch_first=True, padding_value=1)  # 统一对齐batch_size下的句子节点为(max_sentence_length, d)（主要对句子长度小于max_sentence_length的句子补丁）
    neighbor_sets_sequence, _ = padding_tensor(neighbor_sets_sequence)  # 统一对齐batch_size下的句子的每个节点对应的邻居节点，按max_sentence_length对齐
    public_edge_mask_sequence, _ = padding_tensor(public_edge_mask_sequence)  # 统一对齐布尔型的标记张量
    label_sequence = torch.nn.utils.rnn.pad_sequence(label_sequence, batch_first=True, padding_value=1)
    return node_sets_sequence, neighbor_sets_sequence, public_edge_mask_sequence, label_sequence  # 返回对齐后的4个属性


def padding_tensor(sequences, padding_idx=1):
    '''
    To pad tensor of different shape to be of the same shape, i.e. padding [tensor.rand(2, 3), tensor.rand(3, 5)] to a shape (2, 3, 5), where 0th dimension is batch_size, 1st and 2nd dimensions are padded.
    Input:
        sequences <list>: A list of tensors  # [tensor0, tensor1, tensor2, tensor3]
        padding_idx <int>: The index that corresponds to the padding index
    Return:
        out_tensor <torch.tensor>: The padded tensor
        mask <torch.tensor>: A boolean torch tensor where 1 (represents '<pad>') are marked as true
    '''
    num = len(sequences)
    max_len_0 = max([s.shape[0] for s in sequences])
    max_len_1 = max([s.shape[1] for s in sequences])
    out_dims = (num, max_len_0, max_len_1)
    out_tensor = sequences[0].data.new(*out_dims).fill_(padding_idx)
    for i, tensor in enumerate(sequences):
        len_0 = tensor.size(0)
        len_1 = tensor.size(1)
        out_tensor[i, :len_0, :len_1] = tensor
    mask = out_tensor == padding_idx # Marking all places with padding_idx as mask
    return out_tensor, mask


class MessagePassing(nn.Module):  # 一张图上的处理
    def __init__(self, vertice_count, input_size, out_size, dropout_rate=0, padding_idx=1):
        super(MessagePassing, self).__init__()
        self.vertice_count = vertice_count # |V|  r8语料库中有一万多个unique词
        self.input_size = input_size # d  300维
        self.out_size = out_size # c, r8, c=8
        self.dropout_rate = dropout_rate
        self.padding_idx = padding_idx  # 1
        self.information_rate = nn.Parameter(torch.rand(self.vertice_count, 1)) # (|V|, 1), which means it is a column vector
        self.linear = nn.Linear(self.input_size, self.out_size) # (d, c)  # 将300维映射到8维
        self.dropout = nn.Dropout(self.dropout_rate)  # 0

    def forward(self, node_sets, embedded_node, meaning_node1, meaning_node2, edge_weight, meaning_weight, embedded_neighbor_node):
        # node_sets: (batch_size, l)  # 对应论文的N_i
        # embedded_node: (batch_size, l, d)  # 对应论文的r_n
        # edge_weight: (batch_size, max_sentence_length, max_neighbor_count)  # 对应论文e_an
        # embedded_neighbor_node: (batch_size, max_sentence_length, max_neighbor_count, d)  对应论文的N_n^p
        

        # 对词的邻居节点的词向量求权重后，加到M中
        tmp_tensor = (edge_weight.view(-1, 1) * embedded_neighbor_node.view(-1, self.input_size)).view(embedded_neighbor_node.shape) # (batch_size, l, max_neighbor_count, d)
        tmp_tensor = tmp_tensor.masked_fill(tmp_tensor == 0, -1e18) # (batch_size, max_sentence_length, max_neighbor_count, d), mask for M such that masked places are marked as -1e18 # M不接收来自pad节点的邻居信息，如果是来自pad节点的则将其设置为无穷小
        tmp_tensor = self.dropout(tmp_tensor)

        # 更新meaning_node1, meaning_node2的词嵌入向量  (batch_size, l, d)
        # meaning_weight: (batch_size, l, 2)
        meaning = torch.cat([meaning_node1.unsqueeze(dim=2), meaning_node2.unsqueeze(dim=2)], dim =2)  # (batch_size, l, 2, d)
        weight = meaning_weight.unsqueeze(dim=3)  # (batch_size, l, 2, 1)
        tmp = (weight.view(-1, 1) * meaning.view(-1, self.input_size)).view(meaning.shape)  # (batch_size, l, 2, d)

        tmp_tensor = torch.cat([tmp_tensor, tmp], dim=2)  # 将tmp_tensor在dim=2上拉长 (batch_size, l, max_neighbor_count+2, d)

        M = tmp_tensor.max(dim=2)[0] # (batch_size, max_sentence_length, d), which is same shape as embedded_node (batch_size, l, d)
        
        information_rate = self.information_rate[node_sets] # (batch_size, l, 1)
        information_rate = information_rate.masked_fill((node_sets == self.padding_idx).unsqueeze(-1), 1) # (batch_size, l, 1), Fill the information rate of the padding index as 1, such that new e_n = (1-i_r) * M + i_r * e_n = (1-1) * 0 + 1 * e_n = e_n (no update)
        embedded_node = (1 - information_rate) * M + information_rate * embedded_node # (batch_size, l, d) 
        sum_embedded_node = embedded_node.sum(dim=1) # (batch_size, d)
        x = F.relu(self.linear(sum_embedded_node)) # (batch_size, c)
#         x = self.dropout(x) # if putting dropout with p=0.5 here, it is equivalent to wiping 4 choices out of 8 choices on the question sheet, which does not make sense. If a dropout layer is placed at here, it works the best when p=0 (disabled), followed by p=0.05, ..., p=0.5 (worst and does not even converge).
        y = F.softmax(x, dim=1) # (batch_size, c) along the c dimension
        return y


class TextLevelGNN(nn.Module):
    def __init__(self, pretrained_embeddings, meaning_embeddings, meaning_mask, out_size=8, dropout_rate=0, padding_idx=1):
        super(TextLevelGNN, self).__init__()
        self.out_size = out_size # c
        self.padding_idx = padding_idx
        # nn开头的模型参数
        self.weight_matrix = nn.Parameter(torch.randn(pretrained_embeddings.shape[0], pretrained_embeddings.shape[0])) # (|V|, |V|) 邻居节点边的权值
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False, padding_idx=self.padding_idx) # (|V|, d), # freeze=False（不冻结）,预训练的词向量值可以调整

        m = meaning_embeddings.chunk(chunks=2, dim=1)  # 将(|V|, 2, d)切分为两个张量(|V|, 1, d)并压缩为两个(|V|, d)张量，三维张量不支持embedding，所以需要压缩一下
        self.meaning_embedding1 = nn.Embedding.from_pretrained(m[0].squeeze(dim=1), freeze=False, padding_idx=self.padding_idx)  # (|V|, d), # freeze=True（冻结）,预训练的向量值不可以调整
        self.meaning_embedding2 = nn.Embedding.from_pretrained(m[1].squeeze(dim=1), freeze=False, padding_idx=self.padding_idx)  # (|V|, d) 词的第二个语义

        self.message_passing = MessagePassing(vertice_count=pretrained_embeddings.shape[0], input_size=pretrained_embeddings.shape[1], out_size=self.out_size, dropout_rate=dropout_rate, padding_idx=self.padding_idx) # input_size: (d,); out_size: (c,)
        self.public_edge_weight = nn.Parameter(torch.randn(1, 1)) # (1, 1)  二维张量

        self.weight_matrix2 = nn.Parameter(torch.randn(pretrained_embeddings.shape[0], 2))  # (|V|, 2)，词的语义邻居边权值, 只要语义词向量不发生变化，仅权重变化，则不需要单独设置mask标记

        # meaning_mask默认在cpu上，模型参数在gpu上（若使用GPU），则使用register_buffer(name, tensor, persistent=False)将其临时注册为模型的参数（本质上不属于模型的参数），最后不保存为模型的参数（persistent=False）, 且optim.step之后不更新buffer的变量，只更新模型的参数
        self.register_buffer(name='rb_mask', tensor=meaning_mask, persistent=False)  # (|V|, 2)


    def forward(self, node_sets, neighbor_sets, public_edge_mask):
        # node_sets: (batch_size, l)
        # neighbor_sets: (batch_size, max_sentence_length, max_neighbor_count)
        # neighbor_sets_mask: (batch_size, max_sentence_length, max_neighbor_count) (no need)
        # public_edge_mask: (batch_size, max_sentence_length, max_neighbor_count)

        
        embedded_node = self.embedding(node_sets) # (batch_size, l, d)  #将句子[12, 78,..., 987]转成各个词对应的Embedding向量

        meaning_node1 = self.meaning_embedding1(node_sets)  # (batch_size, l, d)  # 一个词两个语义
        meaning_node2 = self.meaning_embedding2(node_sets)  # (batch_size, l, d)

        # 对应论文的e_an边权重
        edge_weight = model.weight_matrix[node_sets.unsqueeze(2).repeat(1, 1, neighbor_sets.shape[-1]), neighbor_sets]  # (batch_size, max_sentence_length, max_neighbor_count), neighbor_sets.shape[-1]: eg p=2, this expression=5; p=3, this expression=7. This is to first make node_sets to have same shape with neighbor_sets, then just do 1 query instead of 32*100 queries to speed up performance
        a = edge_weight * ~public_edge_mask # (batch_size, max_sentence_length, max_neighbor_count)
        b = self.public_edge_weight.unsqueeze(2).expand(1, public_edge_mask.shape[-2], public_edge_mask.shape[-1]) * public_edge_mask # (batch_size, max_sentence_length, max_neighbor_count)
        edge_weight = a + b # (batch_size, max_sentence_length, max_neighbor_count)
        # 根据训练集上的embedding以及节点集合直接得出节点对应的邻居节点的词嵌入向量表示
        embedded_neighbor_node = self.embedding(neighbor_sets) # (batch_size, max_sentece_length, max_neighbor_count, d)

        # Apply mask to edge_weight, to mask and cut-off any relationships to the padding nodes  # 与'pad'节点相连的边不更新边权重e_an
        edge_weight = edge_weight.masked_fill((node_sets.unsqueeze(2).repeat(1, 1, neighbor_sets.shape[-1]) == self.padding_idx) | (neighbor_sets == self.padding_idx), 0) # (batch_size, max_sentence_length, max_neighbor_count)

        # 对应添加的词的语义权重
        m_mask = self.rb_mask  # 读取mask  # (|V|, 2)
        m_weight = model.weight_matrix2[node_sets]  # (batch_size, l, 2)
        meaning_mask = m_mask[node_sets]  # (batch_size, l, 2)]
        
        # 被mask更新的权重
        meaning_weight = (m_weight.view(-1, 1) * meaning_mask.view(-1, 1)).view(m_weight.shape)  # (batch_size, l, 2)
        meaning_weight = meaning_weight.masked_fill((node_sets.unsqueeze(2).repeat(1, 1, meaning_mask.shape[2]) == self.padding_idx), 0)  # (batch_size, l, 2)

        x = self.message_passing(node_sets, embedded_node, meaning_node1, meaning_node2, edge_weight, meaning_weight, embedded_neighbor_node) # x: (batch_size, c)  # 每张图的预测标签
        return x


parser = argparse.ArgumentParser()
parser.add_argument('--cuda', default='0', type=str, required=False,
                    help='Choosing which cuda to use')
parser.add_argument('--embedding_size', default=300, type=int, required=False,
                    help='Number of hidden units in each layer of the graph embedding part')
parser.add_argument('--p', default=3, type=int, required=False,
                    help='The window size')
parser.add_argument('--min_freq', default=2, type=int, required=False,
                    help='The minimum no. of occurrence for a word to be considered as a meaningful word. Words with less than this occurrence will be mapped to a globally shared embedding weight (to the <unk> token). It corresponds to the parameter k in the original paper.')
parser.add_argument('--max_length', default=70, type=int, required=False,
                    help='The max length of each document to be processed')
parser.add_argument('--dropout', default=0, type=float, required=False,
                    help='Dropout rate')
parser.add_argument('--lr', default=1e-3, type=float, required=False,
                    help='Initial learning rate')
parser.add_argument('--lr_decay_factor', default=0.9, type=float, required=False,
                    help='Multiplicative factor of learning rate decays')  # 学习率衰减的乘子
parser.add_argument('--lr_decay_every', default=5, type=int, required=False,
                    help='Decaying learning rate every ? epochs')
parser.add_argument('--weight_decay', default=1e-4, type=float, required=False,
                    help='Weight decay (L2 penalty)')
parser.add_argument('--warm_up_epoch', default=0, type=int, required=False,
                    help='Pretraining for ? epochs before early stopping to be in effect')
parser.add_argument('--early_stopping_patience', default=10, type=int, required=False,
                    help='Waiting for ? more epochs after the best epoch to see any further improvements')
parser.add_argument('--early_stopping_criteria', default='loss', type=str, required=False,
                    choices=['accuracy', 'loss'],
                    help='Early stopping according to validation accuracy or validation loss')
parser.add_argument("--epoch", default=100, type=int, required=False,
                    help='Number of epochs to train')
args = parser.parse_args()


tokenizer = Full_GloveTokenizer(filename='/content/drive/MyDrive/Colab_Notebooks/DATA/pretrained_model/Full_GloVe_dic.pkl')

dataset = TextLevelGNNDatasetClass(train_filename='/content/drive/MyDrive/Colab_Notebooks/DATA/r8-train-all-terms.txt',
                                   test_filename='/content/drive/MyDrive/Colab_Notebooks/DATA/r8-test-all-terms.txt',              
                                   opted_filename='/content/drive/MyDrive/Colab_Notebooks/DATA/pretrained_model/OPTED_dic.pkl',
                                   full_glove_pth='/content/drive/MyDrive/Colab_Notebooks/DATA/pretrained_model/Full_GloVe_dic.pkl',
                                   train_validation_split=0.9,  # train = train*0.x
                                   tokenizer=tokenizer,
                                   p=args.p,
                                   min_freq=args.min_freq,
                                   MAX_LENGTH=args.max_length)

train_loader = DataLoader(dataset.train_dataset, batch_size=128, shuffle=True, collate_fn=pad_custom_sequence)
validation_loader = DataLoader(dataset.validation_dataset, batch_size=128, shuffle=True, collate_fn=pad_custom_sequence)
test_loader = DataLoader(dataset.test_dataset, batch_size=128, shuffle=True, collate_fn=pad_custom_sequence)

device = torch.device(f'cuda:0') if torch.cuda.is_available() else torch.device('cpu')
# device = torch.device('cpu')  # GPU内存太小，直接用CPU计算
if torch.cuda.is_available():
  print(f'device: {device}')
  print(f'name:{torch.cuda.get_device_name(0)}')
  print(f'memory:{torch.cuda.get_device_properties(0).total_memory/1e9}')

model = TextLevelGNN(pretrained_embeddings=torch.tensor(dataset.embedding_matrix), meaning_embeddings=torch.tensor(dataset.meaning_matrix), meaning_mask=torch.tensor(dataset.meaning_mask), dropout_rate=0).to(device)
criterion = nn.BCELoss()

lr = args.lr
lr_decay_factor = args.lr_decay_factor
lr_decay_every = args.lr_decay_every
weight_decay = args.weight_decay

warm_up_epoch = args.warm_up_epoch
early_stopping_patience = args.early_stopping_patience
early_stopping_criteria = args.early_stopping_criteria
best_epoch = 0 # Initialize

training = {}  # {'accuracy':[], 'loss': []}
validation = {}
testing = {}
training['accuracy'] = []
training['loss'] = []
validation['accuracy'] = []
validation['loss'] = []
testing['accuracy'] = []
testing['loss'] = []

for epoch in range(args.epoch):
    model.train()  # 训练模式
    train_loss = 0
    train_correct_items = 0
    previous_epoch_timestamp = time()

    if epoch % lr_decay_every == 0: # Update optimizer for every lr_decay_every epochs
        if epoch != 0: # When it is the first epoch, disable the lr_decay_factor
            lr *= lr_decay_factor  # 乘以0.9的速度衰减
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)  # 更新优化器参数

    for i, (node_sets, neighbor_sets, public_edge_masks, labels) in enumerate(train_loader):  # 批量取图数据
        #print('Finished batch:', i)
        node_sets = node_sets.to(device)
        neighbor_sets = neighbor_sets.to(device)
        public_edge_masks = public_edge_masks.to(device)
        labels = labels.to(device)
        prediction = model(node_sets, neighbor_sets, public_edge_masks)  # 调用TextLevelGNN类中的forward方法返回预测预测值，batch_size张图各自对应的预测结果
        loss = criterion(prediction, labels).to(device)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        train_correct_items += (prediction.argmax(dim=1) == labels.argmax(dim=1)).sum().item()  # 获得训练集上总的预测正确的数量
    train_accuracy = train_correct_items / len(dataset.train_dataset)  # 计算训练集上的准确率

    model.eval()  # 评价模式
    validation_loss = 0
    validation_correct_items = 0
    for i, (node_sets, neighbor_sets, public_edge_masks, labels) in enumerate(validation_loader):
        node_sets = node_sets.to(device)
        neighbor_sets = neighbor_sets.to(device)
        public_edge_masks = public_edge_masks.to(device)
        labels = labels.to(device)
        prediction = model(node_sets, neighbor_sets, public_edge_masks)
        loss = criterion(prediction, labels).to(device)
        validation_loss += loss.item()
        validation_correct_items += (prediction.argmax(dim=1) == labels.argmax(dim=1)).sum().item()
    validation_accuracy = validation_correct_items / len(dataset.validation_dataset)

#     model.eval()
    test_loss = 0
    test_correct_items = 0
    for i, (node_sets, neighbor_sets, public_edge_masks, labels) in enumerate(test_loader):
        node_sets = node_sets.to(device)
        neighbor_sets = neighbor_sets.to(device)
        public_edge_masks = public_edge_masks.to(device)
        labels = labels.to(device)
        prediction = model(node_sets, neighbor_sets, public_edge_masks)
        loss = criterion(prediction, labels).to(device)
        test_loss += loss.item()
        test_correct_items += (prediction.argmax(dim=1) == labels.argmax(dim=1)).sum().item()
    test_accuracy = test_correct_items / len(dataset.test_dataset)
    
    print(f'Epoch: {epoch+1}, Training Loss: {train_loss:.4f}, Validation Loss: {validation_loss:.4f}, Testing Loss: {test_loss:.4f}, Training Accuracy: {train_accuracy:.4f}, Validation Accuracy: {validation_accuracy:.4f}, Testing Accuracy: {test_accuracy:.4f}, Time Used: {time()-previous_epoch_timestamp:.2f}s')
    training['accuracy'].append(train_accuracy)
    training['loss'].append(train_loss)
    validation['accuracy'].append(validation_accuracy)
    validation['loss'].append(validation_loss)
    testing['accuracy'].append(test_accuracy)
    testing['loss'].append(test_loss)

    # add warmup mechanism for warm_up_epoch epochs
    if epoch >= warm_up_epoch:
        best_epoch = warm_up_epoch
        # early stopping
        if early_stopping_criteria == 'accuracy':
            if validation['accuracy'][epoch] > validation['accuracy'][best_epoch]:
                best_epoch = epoch
            elif epoch >= best_epoch + early_stopping_patience:
                print(f'Early stopping... (No further increase in validation accuracy) for consecutive {early_stopping_patience} epochs.')
                break
        if early_stopping_criteria == 'loss':
            if validation['loss'][epoch] < validation['loss'][best_epoch]:
                best_epoch = epoch
            elif epoch >= best_epoch + early_stopping_patience:
                print(f'Early stopping... (No further decrease in validation loss) for consecutive {early_stopping_patience} epochs.')
                break
    elif epoch + 1 == warm_up_epoch:
        print('--- Warm up finished ---')

# 保存结果至表格
df = pd.concat([pd.DataFrame(training), pd.DataFrame(validation), pd.DataFrame(testing)], axis=1)
df.columns = ['Training Accuracy', 'Training Loss', 'Validation Accuracy', 'Validation Loss', 'Testing Accuracy', 'Testing Loss']
df.to_csv(f'/content/drive/MyDrive/Colab_Notebooks/DATA/F_r8-eb{args.embedding_size}-p{args.p}-k{args.min_freq}-max_l{args.max_length}-epoch{args.epoch}.csv') # Logging


# 保存结果图像
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(8,4))

ax1 = fig.add_subplot(121)
ax1.plot(training['loss'], label='Training Loss')
ax1.plot(validation['loss'], label='Validation Loss')
ax1.plot(testing['loss'], label='Testing Loss')
ax1.legend()
plt.xlabel('Epoch')
plt.ylabel('Loss')

ax2 = fig.add_subplot(122)
ax2.plot(training['accuracy'], label='Training Accuracy')
ax2.plot(validation['accuracy'], label='Validation Accuracy')
ax2.plot(testing['accuracy'], label='Testing Accuracy')
ax2.legend()
#plt.xlabel('Epoch')
#plt.ylabel('Accuracy')
plt.savefig(f'/content/drive/MyDrive/Colab_Notebooks/DATA/F_r8-eb{args.embedding_size}-p{args.p}-k{args.min_freq}-max_l{args.max_length}-epoch{args.epoch}.png', dpi=400, bbox_inches='tight')

Writing parsing.py


调参

In [2]:
!CUDA_LAUNCH_BLOCKING=1 python parsing.py --cuda=0 --embedding_size=300 --p=3 --min_freq=2 --max_length=100 --dropout=0.1 --epoch=85

tcmalloc: large alloc 1115734016 bytes == 0x55d878704000 @  0x7fcd4326e1e7 0x7fcd40cae46e 0x7fcd40cfec7b 0x7fcd40d01e83 0x7fcd40d0207b 0x7fcd40da3761 0x55d8407544b0 0x55d840754240 0x55d8407c80f3 0x55d8407c29ee 0x55d84075648c 0x55d840797159 0x55d8407940a4 0x55d840754d49 0x55d8407c894f 0x55d8407c29ee 0x55d8407c26f3 0x55d84088c4c2 0x55d84088c83d 0x55d84088c6e6 0x55d840864163 0x55d840863e0c 0x7fcd42058bf7 0x55d840863cea
device: cuda:0
name:Tesla T4
memory:15.843721216
Epoch: 1, Training Loss: 29.5012, Validation Loss: 4.3510, Testing Loss: 12.3716, Training Accuracy: 0.6220, Validation Accuracy: 0.6776, Testing Accuracy: 0.7474, Time Used: 13.77s
Epoch: 2, Training Loss: 21.1510, Validation Loss: 2.7833, Testing Loss: 8.1083, Training Accuracy: 0.7887, Validation Accuracy: 0.6976, Testing Accuracy: 0.7332, Time Used: 13.67s
Epoch: 3, Training Loss: 7.0615, Validation Loss: 0.7224, Testing Loss: 2.3312, Training Accuracy: 0.8112, Validation Accuracy: 0.7395, Testing Accuracy: 0.7903, Time U