# DeepLearning Assignment 4 实验报告
# SA22221042 汪泱泱

## 一、实验环境

GPU TITAN Xp  
CUDA 10.1  
python 3.7.13  
torch 1.8.1  
torchtext 0.6.0  
spacy 3.4.3  
transformers-4.25.1

## 二、实验过程

In [171]:
import torch
from torch import nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
import os
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.datasets import PPI
from torch_geometric.nn import GCNConv
import time
import matplotlib.pyplot as plt

In [172]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [170]:
device

device(type='cuda')

In [140]:
def read_data(dataset_name):
    assert(dataset_name in ['cora', 'citeseer'])
    cites_path = os.path.join('./dataset', dataset_name, dataset_name+'.cites')
    content_path = os.path.join('./dataset', dataset_name, dataset_name+'.content')
    labels = []
    features = []
    ids = []
    edges = []
    paper_reindex = dict()
    label_reindex = dict()
                
    with open(content_path ,"r") as f:
        node_lines = f.readlines()
        index_cnt = 0
        label_cnt = 0
        for line in node_lines:
            node = line.strip('\n').split('\t')
            if node[0] not in paper_reindex:
                paper_reindex[node[0]] = index_cnt
                index_cnt += 1
            if node[-1] not in label_reindex:
                label_reindex[node[-1]] = label_cnt
                label_cnt += 1
        for line in node_lines:
            node = line.strip('\n').split('\t')
            ids.append(paper_reindex[node[0]])
            features.append(node[1: -1])
            labels.append(node[-1])
        x = np.zeros(shape=(index_cnt, len(features[0])), dtype=int)
        y = np.zeros(shape=(index_cnt, 1), dtype=int)
        for i in range(len(ids)):
            for j in range(len(features[i])):
                x[i][j] = int(features[i][j])
            y[i][0] = label_reindex[labels[i]]
            
    with open(cites_path,"r") as f:
        edge_lines = f.readlines()
        edge_index = np.zeros(shape=(2, len(edge_lines)*2), dtype=int)
        edge_num = 0
        for line in edge_lines:
            edge = line.strip('\n').split('\t')
            if edge[0] not in paper_reindex or edge[1] not in paper_reindex:
                continue
            edge_index[0][edge_num] = paper_reindex[edge[0]]
            edge_index[1][edge_num] = paper_reindex[edge[1]]
            edge_num += 1
            edge_index[0][edge_num] = paper_reindex[edge[1]]
            edge_index[1][edge_num] = paper_reindex[edge[0]]
            edge_num += 1
        edge_index = edge_index[:, :edge_num]
    return x, edge_index, y, len(paper_reindex), len(label_reindex)

In [141]:
x, edge_index, y, paper_num, label_num = read_data('citeseer')

KeyboardInterrupt: 

In [None]:
x = torch.FloatTensor(x).to(device)
x = F.normalize(x, p=1, dim=1)
edge_index=torch.LongTensor(edge_index).to(device)
y=torch.LongTensor(y).to(device)

In [194]:
data = Data(x=x, edge_index=edge_index, y=y)

In [206]:
x.shape

torch.Size([3312, 3703])

In [None]:
samples_num = len(y) 
train_num = 300
val_num = 500
test_num = samples_num - train_num - val_num
ids = np.random.permutation(samples_num)
train_id = torch.LongTensor(ids[:train_num]).to(device)
val_id = torch.LongTensor(ids[train_num:train_num+val_num]).to(device)
test_id = torch.LongTensor(ids[train_num+train_num:]).to(device)

In [None]:

print(edge_index)

tensor([[   0,    0,    0,  ...,  455, 2175, 2122],
        [   0,    0,   99,  ..., 1008, 2122, 2175]], device='cuda:0')


In [None]:
class Model(nn.Module):
    def __init__(self, hidden_layer_num, input_size, output_size, layer_size, dropout_rate, activation):
        super().__init__()
        assert hidden_layer_num>0
        self.dropout_rate = dropout_rate
        self.gcn_list = nn.ModuleList()
        self.gcn_list.append(GCNConv(input_size, layer_size))
        self.activation = activation
        for i in range(hidden_layer_num-1):
            self.gcn_list.append(GCNConv(layer_size, layer_size))
        self.gcn_list.append(GCNConv(layer_size, output_size))

    def forward(self, x, edge_index):
        for i in range(len(self.gcn_list)):
            x = self.gcn_list[i](x, edge_index)
            if i != len(self.gcn_list)-1:
                if self.activation == 'relu':
                    x = F.relu(x)
                elif self.activation == 'tanh':
                    x = F.tanh(x)
                else:
                    x = F.sigmoid(x)
                x = F.dropout(x, self.dropout_rate)
        return x

In [None]:
hidden_layer_num = 1
input_size = x.shape[1]
output_size = label_num
layer_size = 32
dropout_rate = 0.5

In [None]:
model = Model(hidden_layer_num, input_size, output_size, layer_size, dropout_rate)

In [None]:
def cal_acc(y_pred, y_true):
    return (y_pred.argmax(1) == y_true).type(torch.float32).mean().item()

In [None]:
model = model.to(device)
learning_rate = 2e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=5e-4)
Loss = nn.CrossEntropyLoss()
epochs = 2000
start_time = time.time()
for epoch in range(epochs):
    optimizer.zero_grad()
    output = model(data.x, data.edge_index)
    loss = Loss(output[train_id], data.y[train_id].squeeze())
    train_acc = cal_acc(output[train_id], data.y[train_id].squeeze())
    val_acc = cal_acc(output[val_id], data.y[val_id].squeeze())
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print('[Epoch {}/{}] train_loss = {:.5f}, train_acc = {:.5f}, val_acc = {:.5f}'.format(epoch + 1, epochs, loss, train_acc, val_acc))

[Epoch 10/2000] train_loss = 1.76283, train_acc = 0.44000, val_acc = 0.27600
[Epoch 20/2000] train_loss = 1.72186, train_acc = 0.46667, val_acc = 0.28600
[Epoch 30/2000] train_loss = 1.68229, train_acc = 0.46667, val_acc = 0.27600
[Epoch 40/2000] train_loss = 1.64005, train_acc = 0.47667, val_acc = 0.29000
[Epoch 50/2000] train_loss = 1.60367, train_acc = 0.53667, val_acc = 0.28400
[Epoch 60/2000] train_loss = 1.56527, train_acc = 0.57333, val_acc = 0.31000
[Epoch 70/2000] train_loss = 1.52310, train_acc = 0.62333, val_acc = 0.35600
[Epoch 80/2000] train_loss = 1.47716, train_acc = 0.66667, val_acc = 0.38200
[Epoch 90/2000] train_loss = 1.44128, train_acc = 0.65667, val_acc = 0.43400
[Epoch 100/2000] train_loss = 1.38129, train_acc = 0.71667, val_acc = 0.43000
[Epoch 110/2000] train_loss = 1.32964, train_acc = 0.72333, val_acc = 0.48400
[Epoch 120/2000] train_loss = 1.27705, train_acc = 0.78333, val_acc = 0.48000
[Epoch 130/2000] train_loss = 1.22878, train_acc = 0.78667, val_acc = 0.5

PPI(path, split = 'train')[0]

In [173]:
path = './ppi/'
train_dataset = PPI(path, split = 'train')
val_dataset = PPI(path, split = 'val')
test_dataset = PPI(path, split = 'test')

In [214]:
PPI(path)[0]

Data(x=[1767, 50], edge_index=[2, 32318], y=[1767, 121])

In [174]:
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

In [175]:
hidden_layer_num = 2
input_size = x.shape[1]
output_size = label_num
layer_size = 256
dropout_rate = 0.5

In [176]:
class GCN(torch.nn.Module):
    def __init__(self, dataset, num_layers, hidden):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(dataset.num_features, hidden)
        self.convs = torch.nn.ModuleList()
        for i in range(num_layers - 1):
            self.convs.append(GCNConv(hidden, hidden))
        self.lin1 = nn.Linear(hidden, hidden)
        self.lin2 = nn.Linear(hidden, dataset.num_classes)
        # print('GCN')

    # def reset_parameters(self):
    #     self.conv1.reset_parameters()
    #     for conv in self.convs:
    #         conv.reset_parameters()
    #     self.lin1.reset_parameters()
    #     self.lin2.reset_parameters()

    def forward(self, dataX, dataY):
        activation = F.relu  # torch.sigmoid
        x, edge_index = dataX, dataY
        x = activation(self.conv1(x, edge_index))
        for conv in self.convs:
            x = activation(conv(x, edge_index))
        #x = global_mean_pool(x, batch)
        x = activation(self.lin1(x))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin2(x)
        x = F.sigmoid(x)
        return x

In [177]:
model = GCN(train_dataset, hidden_layer_num, layer_size)

In [178]:
def cal_acc_ppi(y_pred, y_true):
    correct = torch.eq(torch.round(y_pred), y_true).float()
    acc = correct.sum(dim=1) / len(correct[0])
    return ((torch.round(y_pred)) == y_true).float().mean().item()

In [179]:
def cal_loss_and_acc(loader):
    total_l1 = 0
    total_loss = 0
    total_examples = 0
    for i, data in enumerate(loader, 0):
        data = data.to(device)
        print(len(data.x))
        output = model(data.x, data.edge_index)
        acc = cal_acc_ppi(output, data.y)
        total_l1 += acc * data.num_nodes
        total_loss += loss.item() * data.num_nodes
        total_examples += data.num_nodes
    return total_loss / total_examples, total_l1 / total_examples

In [180]:
model = model.to(device)
learning_rate = 1e-5
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=5e-4)
Loss = torch.nn.BCELoss()
epochs = 2000
start_time = time.time()
for epoch in range(epochs):
    for i, data in enumerate(train_loader, 0):
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data.x, data.edge_index)
        loss = Loss(output, data.y)
        loss.backward()
        optimizer.step()
    model.eval()
    train_loss, train_acc = cal_loss_and_acc(train_loader)
    val_loss, val_acc = cal_loss_and_acc(val_loader)
    model.train()
    if (epoch+1) % 1 == 0:
        print('[Epoch {}/{}] train_loss = {:.5f} train_acc = {:.5f} val_loss = {:.5f} val_acc = {:.5f}'
              .format(epoch + 1, epochs,  train_loss, train_acc, val_loss, val_acc))



1767
1021
2401
1823
3312
591
2815
1878
2263
3021
3480
2326
2650
1819
3163
2339
1377
1578
2488
2794
6514
[Epoch 1/2000] train_loss = 0.69346 train_acc = 0.50736 val_loss = 0.69346 val_acc = 0.50749
1767
1578
3163
2488
2815
3480
1878
591
1377
1021
1823
1819
3021
2794
3312
2263
2339
2326
2401
2650
6514
[Epoch 2/2000] train_loss = 0.69299 train_acc = 0.51536 val_loss = 0.69299 val_acc = 0.51616
1377
2326
2815
3021
3480
1819
1578
1021
2488
591
2401
1823
3163
1767
2650
1878
2263
2339
2794
3312
6514
[Epoch 3/2000] train_loss = 0.69251 train_acc = 0.52366 val_loss = 0.69251 val_acc = 0.52509
591
3163
2650
1819
2401
2488
2263
3312
3480
3021
2326
1578
2339
1878
1767
2815
1021
2794
1377
1823
6514
[Epoch 4/2000] train_loss = 0.69216 train_acc = 0.53154 val_loss = 0.69216 val_acc = 0.53376
1021
1767
3163
1819
2401
1377
1823
2339
3480
1878
2650
2263
2488
3312
591
2326
2794
3021
2815
1578
6514
[Epoch 5/2000] train_loss = 0.69186 train_acc = 0.53865 val_loss = 0.69186 val_acc = 0.54119
2263
2401
1823


KeyboardInterrupt: 

In [76]:
t = np.array([[0.1,0,0.5],[0.1,0,0.1],[0.6,0,0.5]])

True

图有2708个节点，每个点有1433个属性（词汇是否在图上出现过），5429条边

In [195]:
data

Data(x=[3312, 3703], edge_index=[2, 9430], y=[3312, 1])

In [200]:
from torch_geometric.transforms import RandomLinkSplit
transform = RandomLinkSplit(num_val=0.1, num_test=0.1)
train_data, val_data, test_data = transform(data)

In [201]:
train_data, val_data, test_data

(Data(x=[3312, 3703], edge_index=[2, 7544], y=[3312, 1], edge_label=[15088], edge_label_index=[2, 15088]),
 Data(x=[3312, 3703], edge_index=[2, 7544], y=[3312, 1], edge_label=[1886], edge_label_index=[2, 1886]),
 Data(x=[3312, 3703], edge_index=[2, 8487], y=[3312, 1], edge_label=[1886], edge_label_index=[2, 1886]))

In [205]:
train_data.edge_label

tensor([1., 1., 1.,  ..., 0., 0., 0.], device='cuda:0')

### 四、测试结果

: 

: 

读取最佳模型参数

In [None]:
model.load_state_dict(torch.load('model.pt'))

: 

选择验证集上表现最好的模型参数在测试集上测试

In [None]:
test_loss, test_acc = evaluate(model, test_iterator, Loss)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc:.5f}')

: 

ACC为0.93119

### 五、和RNN模型的比较

在同为BCELoss的情况下，两者在测试集上的表现为：

| Model | BCELoss | ACC     |
| ----- | ------- | ------- |
| RNN   | 0.284   | 89.143% |
| BERT  | 0.181   | 93.119% |

从模型表现和性能上来说，显然BERT要更优。

当然BERT的参数更多，bert-base-uncased有104w参数，bert-large-uncased有335w参数，所以无论是消耗的显存、以及训练单轮所需要的时间，BERT模型都需要更多。  
训练时，bert-base-uncased需要37GB显存和5min单轮训练时间(BATCH_SIZE=32),bert-large-uncased需要29GB显存和36min单轮训练时间(BATCH_SIZE=8)，而RNN模型在BATCH_SIZE=256时也仅需要11GB内存和约40s单轮训练时间。