#### Conv3D 主要包括三个类：
- 参数设置 Config()
- 模型 Conv3D(nn.Module)
- 运行 Runner()

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd

import os
import sys
import time
import datetime
import json
import ctypes
from tqdm import tqdm
import numpy as np
from numpy.random import RandomState

In [None]:
# Device selection.
# Prefer GPU index 2 (the index hard-coded by the original notebook), fall back
# to GPU 0 when the machine exposes fewer than three GPUs, and to the CPU when
# CUDA is not available at all.
use_gpu = torch.cuda.is_available()
if not use_gpu:
    device = torch.device("cpu")
elif torch.cuda.device_count() > 2:
    device = torch.device("cuda:2")
else:
    device = torch.device("cuda:0")

class Config():
    """Hyper-parameters, run settings and the ctypes bridge to ``Base.so``.

    Instantiating this class has side effects: it loads ``./Base.so``, points
    the library at the dataset directory, asks it to import the train / test /
    type files and reads the dataset statistics back into attributes
    (``ent_num``, ``rel_num``, ``train_num``, ``vali_num``, ``test_num``).
    """

    @staticmethod
    def _c_path_buffer(path):
        """Return a NUL-terminated ctypes char buffer holding ``path``.

        The buffer is at least ``2 * len(path)`` bytes long (the size the
        original code requested) but never shorter than the encoded bytes plus
        the terminating NUL.  Without that lower bound an empty path produced
        a zero-length, unterminated buffer, and a non-ASCII path whose UTF-8
        form is longer than ``2 * len(path)`` raised ``ValueError``.
        """
        encoded = path.encode()
        size = max(2 * len(path), len(encoded) + 1)
        return ctypes.create_string_buffer(encoded, size)

    def __init__(self):

        # Embedding size and the (dim1 x dim2) frame each embedding is reshaped to.
        self.dim = 30
        self.dim1 = 5
        self.dim2 = self.dim // self.dim1

        # config of CNN
        # Number of convolution kernels (= output channels).  Author's note:
        # setting this to 1 may give better results for Conv3D.
        self.out_channels = 1
        # (3, 2, 3) for Conv3D, (3,4) for ConvE, 1 for ConvKB.
        # For Conv3D the kernel covers 3 frames at once (the reshaped h, r, t)
        # and is 2*3 within each frame.
        self.kernel_size = (3, 2, 3)
        # Author's note: with 0.5 the loss gets stuck at 69.
        self.drop_prob = 0.1

        # config of training
        # ConvKB uses 0.01.  Author's note: with 0.015 Conv3D stalls at a loss
        # of 69, presumably stuck in a local optimum.
        self.learning_rate = 0.01
        self.batch_num = 100
        self.epoch_num = 300
        self.lmbda = 0.1    # ConvKB uses 0.2
        self.opt_method = "Adagrad"

        # config of model storage
        self.vali_epoch = 1
        self.save_epoch = 1000
        self.mode = "train"        # "train" or "test"
        self.checkpoint_path = "./checkpoints"

        # Load the C++ helper library (Base.so).
        self.clib = ctypes.cdll.LoadLibrary("./Base.so")
        self.dataset = "WN18"    # "WN18" or "WN18RR" or "FB15K" or "FB15K237"
        self.in_path = "./" + self.dataset + "/"    # dataset directory handed to the library

        # Declare the argument types of the library functions.  Author's note:
        # without these declarations the notebook service crashes.
        ptr = ctypes.c_void_p
        i64 = ctypes.c_int64

        # negative sampling: four buffer addresses followed by three integers
        self.clib.sampling.argtypes = [ptr, ptr, ptr, ptr, i64, i64, i64]

        # validation and link-prediction batch getters: three buffer addresses
        for batch_getter in ("getValidHeadBatch", "getValidTailBatch",
                             "getHeadBatch", "getTailBatch"):
            getattr(self.clib, batch_getter).argtypes = [ptr, ptr, ptr]

        # score consumers: one buffer address
        for score_sink in ("validHead", "validTail", "testHead", "testTail"):
            getattr(self.clib, score_sink).argtypes = [ptr]

        self.test_file = ""
        self.clib.setInPath(self._c_path_buffer(self.in_path))
        self.clib.setTestFilePath(self._c_path_buffer(self.test_file))

        self.clib.setBern(0)
        self.clib.setWorkThreads(8)
        self.clib.randReset()

        self.clib.importTrainFiles()
        self.clib.importTestFiles()
        self.clib.importTypeFiles()

        # Dataset statistics reported by the library.
        self.ent_num = self.clib.getEntityTotal()
        self.rel_num = self.clib.getRelationTotal()
        self.train_num = self.clib.getTrainTotal()
        self.vali_num = self.clib.getValidTotal()
        self.test_num = self.clib.getTestTotal()

        # Number of positive triples per batch (remainder triples are dropped).
        self.batch_size = self.train_num // self.batch_num
        
# Build the shared configuration (this loads the dataset through Base.so) and
# print the dataset statistics and the derived batch size, one value per line.
con = Config()
print(
    con.ent_num,
    con.rel_num,
    con.train_num,
    con.vali_num,
    con.test_num,
    con.batch_size,
    sep="\n",
)

In [1]:
# Fix the random seeds so that results are deterministic.
_SEED = 123
torch.manual_seed(_SEED)    # seeds the CPU generator
if torch.cuda.is_available():
    # manual_seed_all seeds every GPU; torch.cuda.manual_seed() would only
    # seed the current one.
    torch.cuda.manual_seed_all(_SEED)

class Conv3D(nn.Module):
    """Score knowledge-graph triples with a single 3-D convolution.

    The embeddings of head, relation and tail are each reshaped into a
    ``dim1 x dim2`` frame; the three frames are stacked into a 3-frame "video"
    that is passed through BatchNorm3d -> Conv3d -> BatchNorm3d -> ReLU ->
    Dropout -> Linear, giving one scalar per triple.

    The batch is not passed as call arguments: the caller assigns the index
    tensors ``batch_h``, ``batch_t``, ``batch_r`` (and ``batch_y`` for
    training) on the module before calling it.

    Author's experiment notes (condensed from the original comments):

    * Variants tried before this one: a ConvKB-style 1-D convolution (Conv2d
      with kernel ``(k, 3)`` over the ``dim x 3`` matrix [h, r, t]), a
      ConvE-style 2-D convolution (the reshaped h, r, t frames concatenated
      along the height axis) and a two-layer Conv3d (8 then 16 channels,
      kernels ``(3, 2, 3)`` and ``(1, 4, 4)``).
    * ConvKB variant: removing the batch norms made the loss stall at 69 with
      near-zero validation accuracy; batch norm before and after the
      convolution was the decisive factor, dropout came second.
    * ConvE variant: validation accuracy stayed very low; going from 64 to 16
      filters with drop_prob 0.5 also stalled the loss at 69; setting
      dim1 = 1, dim2 = 30 immediately produced accuracy, so the 2-D stacking
      did not seem to help.
    * Conv3D: the loss stalling at 69 with a validation score of 0.5 (the
      average of l_filter = 0 and r_filter = 1, i.e. nothing learnt) was
      caused by a copy-paste bug where r and t were reshaped from h.  The
      two-layer variant only made results worse (loss and accuracy both fell).
    * A loss plateau is not necessarily a problem: validation accuracy can
      keep rising after the loss has stabilised.
    """

    def __init__(self, config):
        """Create embeddings and network layers from ``config``.

        ``config`` must provide ``ent_num``, ``rel_num``, ``dim``, ``dim1``,
        ``dim2``, ``out_channels``, ``kernel_size`` (3 values) and
        ``drop_prob``; ``lmbda`` is read later by ``forward``.
        """
        super().__init__()

        self.config = config
        cfg = self.config

        # Batch tensors, injected by the caller before each call.
        self.batch_h = None
        self.batch_t = None
        self.batch_r = None
        self.batch_y = None

        self.ent_embeddings = nn.Embedding(cfg.ent_num, cfg.dim)
        self.rel_embeddings = nn.Embedding(cfg.rel_num, cfg.dim)

        # Network components: batch norm before and after the convolution.
        self.bn1 = nn.BatchNorm3d(num_features=1)
        self.conv_layer = nn.Conv3d(
            in_channels=1,
            out_channels=cfg.out_channels,
            kernel_size=cfg.kernel_size,
        )
        self.bn2 = nn.BatchNorm3d(num_features=cfg.out_channels)

        self.dropout = nn.Dropout(cfg.drop_prob)
        self.nonlinear = nn.ReLU()    # non-linear activation after the convolution

        # Size of the flattened convolution output: the input "video" has
        # 3 frames of dim1 x dim2 and the convolution is unpadded, stride 1.
        k_frames, k_rows, k_cols = cfg.kernel_size
        frames_out = 3 - k_frames + 1
        rows_out = cfg.dim1 - k_rows + 1
        cols_out = cfg.dim2 - k_cols + 1
        self.conv_size = cfg.out_channels * frames_out * rows_out * cols_out
        self.fc_layer = nn.Linear(in_features=self.conv_size, out_features=1)

        self.criterion = nn.Softplus()    # softplus: y = log(1 + exp(x))

        # Xavier-initialise the embeddings, then the convolution and the
        # fully connected weights (biases keep their default initialisation).
        for layer in (self.ent_embeddings, self.rel_embeddings,
                      self.conv_layer, self.fc_layer):
            nn.init.xavier_uniform_(layer.weight.data)

    def cal_score(self):
        """Return the negated network output for the current batch.

        Reads ``batch_h``, ``batch_r`` and ``batch_t`` and returns a 1-D
        tensor with one value per triple.
        """
        cfg = self.config
        frame_shape = (-1, 1, cfg.dim1, cfg.dim2)    # batch size * 1 * dim1 * dim2

        # Look up the embeddings and reshape each one into a frame.
        head = self.ent_embeddings(self.batch_h).view(*frame_shape)
        rel = self.rel_embeddings(self.batch_r).view(*frame_shape)
        tail = self.ent_embeddings(self.batch_t).view(*frame_shape)

        # Stack into a cube (batch size * 3 * dim1 * dim2), then add the
        # channel axis because Conv3d expects a 5-D input.
        video = torch.cat([head, rel, tail], 1).unsqueeze(1)

        features = self.bn1(video)
        features = self.conv_layer(features)
        features = self.bn2(features)
        features = self.nonlinear(features)

        # Flatten everything except the batch axis, then drop out and project
        # down to a single value per triple.
        flat = self.dropout(features.view(-1, self.conv_size))
        return -self.fc_layer(flat).view(-1)

    def forward(self):
        '''
        Forward pass: compute and return the loss of the current batch.

        The loss is the mean softplus of ``batch_y * score`` plus ``lmbda``
        times a regulariser made of the mean squared embeddings of the batch
        and the L2 norms of the convolution and fully connected parameters.
        '''
        batch_score = self.cal_score()

        # Author's note: an earlier version looked r up in ent_embeddings by
        # mistake; the loss only started dropping quickly after that was fixed.
        h = self.ent_embeddings(self.batch_h)
        r = self.rel_embeddings(self.batch_r)
        t = self.ent_embeddings(self.batch_t)

        penalty = torch.mean(h ** 2) + torch.mean(t ** 2) + torch.mean(r ** 2)
        for layer in (self.conv_layer, self.fc_layer):
            for param in layer.parameters():
                penalty = penalty + param.norm(2)

        data_term = torch.mean(self.criterion(self.batch_y * batch_score))
        return data_term + self.config.lmbda * penalty

# Smoke test: push a hand-made batch of six triples (labels +1 for the first
# three, -1 for the last three) through a freshly initialised model.
conv3d = Conv3D(con)
conv3d.batch_h = torch.tensor([1, 40, 5, 4, 6, 79], dtype=torch.long)
conv3d.batch_r = torch.tensor([5, 4, 2, 4, 1, 9], dtype=torch.long)
conv3d.batch_t = torch.tensor([3, 60, 80, 3, 56, 7], dtype=torch.long)
conv3d.batch_y = torch.tensor([1, 1, 1, -1, -1, -1], dtype=torch.long)
conv3d()    # runs forward(); the returned loss is discarded
print(conv3d.rel_embeddings)

NameError: name 'torch' is not defined

In [None]:
class Runner():
    """Train, validate and test a model using the batches served by ``config.clib``.

    The model is expected to expose ``batch_h`` / ``batch_t`` / ``batch_r`` /
    ``batch_y`` attributes, a ``cal_score()`` method and to return the batch
    loss when called.  Tensors are moved to the module-level ``device``.
    """

    def __init__(self, config, model):
        self.config = config
        self.model = model
        self.clib = self.config.clib

    def set_model(self, mode = 'train'):
        """Prepare ``self.model`` for training or testing.

        mode == 'train': create ``self.optimizer`` according to
        ``config.opt_method`` ("Adagrad", "Adadelta", "Adam", anything else
        means SGD).
        any other mode: load the best checkpoint written by ``train_model``
        from ``config.checkpoint_path`` and switch the model to eval mode.
        """
        self.model.to(device)

        if mode == 'train':
            print("Initializing training model...")
            opt_method = self.config.opt_method
            lr = self.config.learning_rate
            if opt_method == "Adagrad":
                self.optimizer = optim.Adagrad(
                    params = self.model.parameters(),
                    lr = lr,
                    lr_decay = 0,
                    weight_decay = 0
                )
            elif opt_method == "Adadelta":
                self.optimizer = optim.Adadelta(
                    params = self.model.parameters(),
                    lr = lr,
                    weight_decay = 0
                )
            elif opt_method == "Adam":
                self.optimizer = optim.Adam(
                    params = self.model.parameters(),
                    lr = lr,
                    weight_decay = 0
                )
            else:    # none of the three above: fall back to SGD
                self.optimizer = optim.SGD(
                    params = self.model.parameters(),
                    lr = lr,
                    weight_decay = 0
                )
            print("Training model has been initialized.")
        else:    # mode == 'test': load the model from the checkpoint directory
            print("Fetching model for test...")
            # Use the configured directory (train_model writes there) rather
            # than a second hard-coded copy of the path.
            ckpt_path = os.path.join(self.config.checkpoint_path,
                                     self.config.dataset + "-netparam_best" + ".ckpt")
            # map_location lets a checkpoint written on one device be loaded
            # on another (e.g. trained on a GPU, tested on the CPU).
            self.model.load_state_dict(torch.load(ckpt_path, map_location=device))
            self.model.to(device)
            self.model.eval()
            print("Test model has been loaded.")

    def get_parameters(self, param_dict, mode = 'numpy'):
        '''
        Convert the values of a parameter dict.

        param_dict: mapping from parameter name to tensor (e.g. a state dict)
        mode: 'numpy' -> numpy arrays, 'list' -> nested Python lists,
              anything else -> values are returned unchanged
        return: a new dict with the same keys
        '''
        res = dict()
        for name, value in param_dict.items():
            if mode == 'numpy':
                res[name] = value.cpu().numpy()
            elif mode == 'list':
                res[name] = value.cpu().numpy().tolist()
            else:
                res[name] = value
        return res

    def _snapshot_state(self):
        """Return an independent copy of the model's current state dict."""
        import copy    # local import: only needed here
        return copy.deepcopy(self.model.state_dict())

    def neg_sample(self):
        '''
        Ask the C library for one training batch including negative samples.

        Fills self.batch_h / batch_t / batch_r / batch_y in place; no return value.
        '''
        self.negative_ent = 1    # one corrupted-entity negative per positive triple
        self.negative_rel = 0
        self.batch_seq_size = self.config.batch_size * (1 + self.negative_ent + self.negative_rel)

        # The buffers hold batch_size * 2 entries: positives plus their negatives.
        self.batch_h = np.zeros(self.batch_seq_size, dtype = np.int64)
        self.batch_t = np.zeros(self.batch_seq_size, dtype = np.int64)
        self.batch_r = np.zeros(self.batch_seq_size, dtype = np.int64)
        self.batch_y = np.zeros(self.batch_seq_size, dtype = np.float32)

        # Raw memory addresses of the buffers, passed to the library as pointers.
        self.batch_h_addr = self.batch_h.__array_interface__["data"][0]
        self.batch_t_addr = self.batch_t.__array_interface__["data"][0]
        self.batch_r_addr = self.batch_r.__array_interface__["data"][0]
        self.batch_y_addr = self.batch_y.__array_interface__["data"][0]

        # The library writes the entity / relation ids of the batch into the buffers.
        self.clib.sampling(
            self.batch_h_addr,
            self.batch_t_addr,
            self.batch_r_addr,
            self.batch_y_addr,
            self.config.batch_size,
            self.negative_ent,
            self.negative_rel
        )

    def train_batch(self):
        '''
        Train self.model on the batch prepared by neg_sample().
        return: the loss of this batch as a Python float
        '''
        self.model.train()
        # Feed the batch to the model (numpy arrays -> tensors on the device).
        self.model.batch_h = torch.from_numpy(self.batch_h).to(device)
        self.model.batch_t = torch.from_numpy(self.batch_t).to(device)
        self.model.batch_r = torch.from_numpy(self.batch_r).to(device)
        self.model.batch_y = torch.from_numpy(self.batch_y).to(device)

        self.optimizer.zero_grad()
        loss = self.model()    # calls forward()
        loss.backward()
        # Clip the gradient norm to guard against exploding gradients.
        nn.utils.clip_grad_norm_(parameters = self.model.parameters(), max_norm = 0.5, norm_type = 2)
        self.optimizer.step()

        return loss.item()

    def test_batch(self, model, batch_h, batch_t, batch_r):
        '''
        Score one batch of triples without tracking gradients.

        batch_h: numpy array
        batch_t: numpy array
        batch_r: numpy array
        return: numpy array with the score of every triple in the batch
        '''
        # eval mode: batch norm uses its running statistics and dropout is off.
        model.eval()
        # The scoring itself runs under no_grad so that no autograd graph is
        # built for these (entity-count sized) batches.
        with torch.no_grad():
            model.batch_h = torch.from_numpy(batch_h).to(device)
            model.batch_t = torch.from_numpy(batch_t).to(device)
            model.batch_r = torch.from_numpy(batch_r).to(device)
            res = model.cal_score().detach().cpu().numpy()
        return res

    def validation(self, model):
        '''
        Run the validation set through the model.
        return: the value reported by clib.getValidHit10() (hits@10)
        '''
        model.eval()
        # All three buffers are ent_num long.  Author's note: sizing vali_r by
        # rel_num instead crashed the service.
        self.vali_h = np.zeros(self.config.ent_num, dtype=np.int64)
        self.vali_t = np.zeros(self.config.ent_num, dtype=np.int64)
        self.vali_r = np.zeros(self.config.ent_num, dtype=np.int64)
        self.vali_h_addr = self.vali_h.__array_interface__["data"][0]    # memory address of the array
        self.vali_t_addr = self.vali_t.__array_interface__["data"][0]
        self.vali_r_addr = self.vali_r.__array_interface__["data"][0]

        self.clib.validInit()
        self.clib.getValidHit10.restype = ctypes.c_float

        print("The total number of validation triplets is %d" % self.config.vali_num)
        for i in range(self.config.vali_num):
            sys.stdout.write("%d \r" % i)    # in-place progress output
            sys.stdout.flush()

            self.clib.getValidHeadBatch(self.vali_h_addr, self.vali_t_addr, self.vali_r_addr)
            res = self.test_batch(model, self.vali_h, self.vali_t, self.vali_r)
            self.clib.validHead(res.__array_interface__["data"][0])

            self.clib.getValidTailBatch(self.vali_h_addr, self.vali_t_addr, self.vali_r_addr)
            res = self.test_batch(model, self.vali_h, self.vali_t, self.vali_r)
            self.clib.validTail(res.__array_interface__["data"][0])

        return self.clib.getValidHit10()

    def test(self, model):
        """Run the link-prediction test.

        Loads the best checkpoint into ``self.model`` (via ``set_model``),
        scores every test triple with ``model`` and lets the C library print
        the result through ``test_link_prediction``.
        """
        self.set_model(mode = 'test')
        # Only the link-prediction experiment is run.

        print("The total number of test triplets is %d" % self.config.test_num)

        self.test_h = np.zeros(self.config.ent_num, dtype=np.int64)
        self.test_t = np.zeros(self.config.ent_num, dtype=np.int64)
        self.test_r = np.zeros(self.config.ent_num, dtype=np.int64)
        self.test_h_addr = self.test_h.__array_interface__["data"][0]
        self.test_t_addr = self.test_t.__array_interface__["data"][0]
        self.test_r_addr = self.test_r.__array_interface__["data"][0]

        print("Testing...")
        for i in range(self.config.test_num):
            sys.stdout.write("%d \r" % i)    # in-place progress output
            sys.stdout.flush()

            self.clib.getHeadBatch(self.test_h_addr, self.test_t_addr, self.test_r_addr)
            res = self.test_batch(model, self.test_h, self.test_t, self.test_r)
            self.clib.testHead(res.__array_interface__["data"][0])

            self.clib.getTailBatch(self.test_h_addr, self.test_t_addr, self.test_r_addr)
            res = self.test_batch(model, self.test_h, self.test_t, self.test_r)
            self.clib.testTail(res.__array_interface__["data"][0])

        self.clib.test_link_prediction()
        print("Finished testing.")

    def train_model(self):
        """Train for ``config.epoch_num`` epochs, then save and test the best model.

        Every ``config.vali_epoch`` epochs the model is validated; the weights
        of the epoch with the highest hits@10 are snapshotted.  After training
        that snapshot is written to ``<dataset>-netparam_best.ckpt`` and
        ``<dataset>-embed_best.json`` inside ``config.checkpoint_path`` and the
        link-prediction test is run on it.
        """
        # makedirs(exist_ok=True) also copes with nested checkpoint paths.
        os.makedirs(self.config.checkpoint_path, exist_ok=True)

        self.set_model(mode = self.config.mode)

        best_epoch = 0
        best_hits10 = 0.0
        # A copy of the weights at the best epoch.  Keeping a reference to
        # self.model instead would always end up holding the final weights,
        # because training keeps updating the same object.
        best_state = None

        epochs = tqdm(range(self.config.epoch_num))

        for epoch in epochs:
            res = 0.0    # accumulated loss over the batches of this epoch
            for _ in range(self.config.batch_num):
                self.neg_sample()
                res += self.train_batch()    # one batch is one optimisation step

            epochs.set_description("Epoch %d | loss: %f" % (epoch, res))
            # Author's note: the loss stops decreasing around 69, yet after 50
            # epochs the validation score still exceeded 50 (then stopped
            # growing); suspected over-fitting, next idea was heavier dropout
            # as in ConvE.

            if (epoch + 1) % self.config.save_epoch == 0:
                epochs.set_description("Epoch %d has finished, loss is %f, saving checkpoint ..." % (epoch, res))
                save_path = os.path.join(self.config.checkpoint_path, self.config.dataset + "-" + str(epoch) + ".ckpt")
                torch.save(self.model.state_dict(), save_path)

            if (epoch + 1) % self.config.vali_epoch == 0:
                epochs.set_description("Epoch %d has finished, loss is %f, validating ..." % (epoch, res))
                hits10 = self.validation(self.model)
                print("hits@10 of this validation epoch is: %.8f" % hits10)

                if hits10 > best_hits10:
                    best_hits10 = hits10
                    best_epoch = epoch
                    best_state = self._snapshot_state()

        if best_state is None:
            # Validation never ran or never beat 0.0: keep the final weights.
            best_state = self._snapshot_state()

        # After all epochs: store the network parameters and embeddings of the
        # model that did best on the validation set.
        print("Best epoch is %d, best hit@10 of validation set is %f" % (best_epoch, best_hits10))
        print("Storing checkpoint of best result at epoch %d ..." % (best_epoch))
        netparam_save_path = os.path.join(self.config.checkpoint_path, self.config.dataset + "-netparam_best" + ".ckpt")
        torch.save(best_state, netparam_save_path)

        embed_save_path = os.path.join(self.config.checkpoint_path, self.config.dataset + "-embed_best" + ".json")
        with open(embed_save_path, 'w') as f:
            f.write(json.dumps(self.get_parameters(best_state, 'list')))
        print("Finished Storing best model and embeddings.")

        # test() reloads the checkpoint written above into self.model; the
        # results are printed by the C library on the terminal.
        self.test(model = self.model)

In [None]:
# Entry point: train (which also validates, saves and tests) when the config
# says "train", otherwise only run the link-prediction test.
runner = Runner(con, conv3d)
if runner.config.mode != 'train':
    runner.test(runner.model)
else:
    runner.train_model()

In [None]:
# Conv3D scratch pad
# Assume dim = 30, dim1 = 5, dim2 = 6.
# One input channel, 16 output channels (i.e. 16 kernels); each kernel spans
# 2 frames and is 3*4 within a frame.
# The layer and its input get their own names so that this cell neither
# overwrites the `conv3d` model used by the runner nor shadows the builtin
# `input`.
draft_conv = nn.Conv3d(in_channels=1, out_channels=16, kernel_size=(2,3,4))
# The input is 5-D: batch size 5, 1 channel per frame, 3 frames per "video"
# (h, r, t), each frame 5*6.
draft_input = torch.randn(5, 1, 3, 5, 6)
output = draft_conv(draft_input)
print(output.shape)

In [None]:
# Experiment with hand-made sample data: build a fresh config and model, show
# both embedding tables and score a batch of five triples.
con = Config()
conv3d = Conv3D(con)
print(conv3d.ent_embeddings)
print(conv3d.rel_embeddings)
conv3d.batch_h = torch.tensor([9, 2, 5, 6, 8], dtype=torch.long)    # batch size is 5
conv3d.batch_t = torch.tensor([6, 7, 3, 4, 6], dtype=torch.long)
conv3d.batch_r = torch.tensor([4, 3, 1, 5, 2], dtype=torch.long)
conv3d.cal_score()