## 数据集的处理
1. 首先标注数据(待定)
2. 拿到对应的稀疏矩阵和label后，通过原始数据构建一个adj_list(GAT源码中用的是一个字典)
3. 标准化
4. 根据拿到的字典对edge进行统计,并构造edge矩阵shape=(2, num_edges)
5. 划分数据集通过indices来构建(我们用的是自然语言)

## 一些知识图谱的定义
1. rdfs:range 用来表示属性的取值类型
2. rdfs:domain 用来表示属性的域，即属于哪个类别
3. OWL（Web Ontology Language）：一种声明性语言（不是编程语言），用逻辑的方式描述事物的状态。它可以为属性声明性质，例如 owl:TransitiveProperty（传递性）：（腾讯总部，位于，深圳）、（深圳，位于，广东）=>（腾讯总部，位于，广东）
4. 如果不使用这些数据（schema、类型等），在生成负样本时就无法得到足够好的负样本（比如把“里约热内卢”替换为“狗”，两者根本不属于同一个大类）

In [1]:
import torch 
from d2l import torch as d2l
from torch import nn
import numpy as np
import pandas as pd

In [7]:
# Scratch check of broadcasting: a is (2, 2, 3) and b is (1, 2, 3), so the
# element-wise product broadcasts b along the first axis; summing over the last
# axis leaves a (2, 2) result. This is the same pattern the GAT layer uses to
# score projected node features against a (1, NH, FOUT) scoring vector.
a = torch.rand(size = (2,2,3))
b = torch.rand(size = (1,2,3))
(a*b).sum(dim=-1).shape

torch.Size([2, 2])

In [5]:
# Load the YAGO sample TSV files (tab-separated, no header row). For every
# table except `schema`, column 0 is dropped right after loading.
date_fact = pd.read_csv("D:/jupyter/kg-bert/sample/yagoDateFacts.tsv", sep='\t',header=None).drop(0,axis=1)
fact = pd.read_csv("D:/jupyter/kg-bert/sample/yagoFacts.tsv", sep='\t', header=None).drop(0,axis=1)
literal_fact =  pd.read_csv("D:/jupyter/kg-bert/sample/yagoLiteralFacts.tsv", sep='\t', header=None).drop(0,axis=1)
# schema is a set of constraints, e.g. restricting the value range of the head/tail
# (note: here column 4 is dropped instead of column 0)
schema =  pd.read_csv("D:/jupyter/kg-bert/sample/yagoSchema.tsv", sep='\t', header=None).drop(4,axis=1)
# taxonomy describes how the hierarchy is built and the corresponding properties
taxonomy =  pd.read_csv("D:/jupyter/kg-bert/sample/yagoTaxonomy.tsv", sep='\t', header=None).drop(0,axis=1)
# top-level categories (types)
types =  pd.read_csv("D:/jupyter/kg-bert/sample/yagoTypes.tsv", sep='\t', header=None).drop(0,axis=1)
# instance-of facts; these are also treated as properties
wiki_ins =  pd.read_csv("D:/jupyter/kg-bert/sample/yagoWikidataInstances.tsv", sep='\t', header=None).drop(0,axis=1)
wiki_info_en =  pd.read_csv("D:/jupyter/kg-bert/sample/yagoWikipediaInfo_en.tsv", sep='\t', header=None).drop(0,axis=1)

### 所以我们只需要: fact, wiki_info_en

In [168]:
def preprocessing(data):
    """Strip YAGO markup from every cell of a triple table, in place.

    Each cell is reduced to the text between its last ``<`` and the following
    ``>`` (``<Kerala>`` -> ``Kerala``). In the last column a typed-literal
    suffix is removed first (``"1990"^^xsd:date`` -> ``"1990"``), so that a
    bracketed datatype such as ``"1.85"^^<m>`` keeps the value rather than the
    datatype.

    Args:
        data: DataFrame of triples whose cells are strings. Any column labels
            are accepted; cells that are not strings (e.g. NaN) are left
            untouched.

    Returns:
        The same DataFrame object that was passed in (it is modified in place).
    """

    def _strip_brackets(value):
        # Non-string cells (NaN, numbers) have no markup to remove.
        if not isinstance(value, str):
            return value
        return value.split("<")[-1].split(">")[0]

    def _strip_literal(value):
        if not isinstance(value, str):
            return value
        # Drop the ``^^datatype`` suffix BEFORE removing brackets; otherwise a
        # bracketed datatype would replace the literal value itself.
        return _strip_brackets(value.split("^^")[0])

    last_position = data.shape[1] - 1
    for position, column in enumerate(data.columns):
        cleaner = _strip_literal if position == last_position else _strip_brackets
        data[column] = data[column].apply(cleaner)
    return data
# Clean only the tables that are actually used later (fact and wiki_info_en).
# preprocessing() modifies the frame it is given and returns that same frame,
# so `fc` / `wiki_info_en_` refer to the cleaned `fact` / `wiki_info_en`.
# date_fc = preprocessing(date_fact)
fc = preprocessing(fact)
# literal_fc = preprocessing(literal_fact)
wiki_info_en_ = preprocessing(wiki_info_en)

In [8]:
# a = pd.read_csv("F:/yago-3.0.2-native/yagoFacts.tsv",header=None, sep = "\t").drop(4,axis=1)[1:].reset_index().drop("index",axis=1)
# # b = pd.read_csv("F:/yago-3.0.2-native/yagoMetaFacts.tsv",header =None, sep="\t")[1:].reset_index().drop("index",axis=1)

In [36]:
a

Unnamed: 0,0,1,2,3
0,<id_156a81d_z7a_1lfplq>,<Onchiam>,<isLocatedIn>,<Kerala>
1,<id_u0gksp_ab2_zz7558>,<Gregory_S._Martin>,<hasWonPrize>,<Order_of_the_Sword_(United_States)>
2,<id_9taxo5_1ul_1oe1uea>,<Wouter_Vrancken>,<playsFor>,<K.V._Kortrijk>
3,<id_gfqocs_dhj_27wsag>,<Anthony_Gilbert_(author)>,<diedIn>,<London>
4,<id_z13ez0_13l_1v6yd3x>,<Johan_Jacobsen>,<directed>,<The_Invisible_Army>
...,...,...,...,...
5628161,<id_2f9lh8_z7a_vipgwv>,<Sambaregou>,<isLocatedIn>,<Zabré_Department>
5628162,<id_u02dc9_14h_1hqlz5i>,<Miguel_Estanislao_Soler>,<isPoliticianOf>,<Buenos_Aires_Province>
5628163,<id_1m1a5xw_ice_2garf6>,"<Orting,_Washington>",<hasWebsite>,<http://www.cityoforting.org>
5628164,<id_81fluf_1ul_1jmf8sk>,<Harvey_Esajas>,<playsFor>,<A.S.D._Legnano_Calcio_1913>


## 构建adj_list
即拿到dic的数据

In [150]:
# Stack the cleaned fact triples and the Wikipedia-info triples into one table
# with a fresh 0..n-1 row index.
total_ = pd.concat([fc,wiki_info_en_],ignore_index=True)

In [170]:
def build_adj_ls(total_):
    """Build an index-based adjacency list from a table of triples.

    Every distinct entity appearing in column ``1`` (head) or column ``3``
    (tail) is assigned an integer id in order of first appearance (all heads
    are scanned before all tails). Each row then contributes one directed
    edge head -> tail.

    Args:
        total_: DataFrame with the head entity in column ``1`` and the tail
            entity in column ``3``.

    Returns:
        A tuple ``(adj_ls, num_nodes)`` where ``adj_ls`` maps a head id to the
        list of its tail ids (in row order, duplicates kept) and ``num_nodes``
        is the number of distinct entities.
    """
    heads = total_[1]
    tails = total_[3]

    # Entity -> integer id lookup.
    head_tail = pd.unique(pd.concat([heads, tails], ignore_index=True))
    head2index = {entity: index for index, entity in enumerate(head_tail)}

    # Iterating over the two columns directly avoids building a Series per row
    # (as iterrows() does), which matters for tables with millions of triples.
    adj_ls = {}
    for head, tail in zip(heads, tails):
        adj_ls.setdefault(head2index[head], []).append(head2index[tail])
    return adj_ls, len(head_tail)

In [171]:
# adj_ls: head id -> list of tail ids; num_nodes: number of distinct entities.
adj_ls, num_nodes = build_adj_ls(total_)

## 创建edge index

从adj_ls 里拿到edge index 的矩阵表示（shape为(2,num_edges)）:

In [156]:
def build_edge_index(adj_ls, num_nodes,add_self_edges = True):
    """Convert an adjacency list into a de-duplicated ``(2, num_edges)`` edge index.

    Args:
        adj_ls: Mapping from a head node id to an iterable of tail node ids.
        num_nodes: Total number of nodes; only used to generate self-loops.
        add_self_edges: When true, make sure every node ``0..num_nodes-1`` has
            exactly one ``(node, node)`` edge.

    Returns:
        An ``int64`` NumPy array of shape ``(2, num_edges)``; row 0 holds head
        ids and row 1 holds tail ids. Edges from ``adj_ls`` come first in
        first-seen order, followed by the self-loops that were not already
        present. Empty input yields shape ``(2, 0)``.
    """
    # A dict keeps insertion order, so it doubles as an ordered set of edges.
    edges = dict.fromkeys(
        (head, tail) for head, tails in adj_ls.items() for tail in tails
    )
    if add_self_edges:
        # update() leaves existing keys in place, so a self-loop that already
        # appears in the data is not duplicated (a duplicated edge would be
        # counted twice when attention is normalised over a neighbourhood).
        edges.update(dict.fromkeys((node, node) for node in range(num_nodes)))

    # reshape(-1, 2) keeps the result 2-D even when there are no edges.
    edge_pairs = np.array(list(edges), dtype=np.int64).reshape(-1, 2)
    # shape (2, num_edges)
    edge_idx = np.ascontiguousarray(edge_pairs.T)
    return edge_idx

In [158]:
# Build the (2, num_edges) edge index and convert it to a long tensor, the
# dtype required by the index_select / scatter_add_ calls in the GAT layer.
edge_idx=build_edge_index(adj_ls,num_nodes)
edge_idx = torch.tensor(edge_idx,dtype = torch.long)

In [159]:
edge_idx

tensor([[    0,     0,     0,  ..., 84574, 84575, 84576],
        [  421,   422,   423,  ..., 84574, 84575, 84576]])

## 初始化的embedding（待定，需要查阅维度(我这里就随机设了个5维)等）

In [160]:
class GAT_layer(nn.Module):
    """Multi-head graph attention (GAT) layer working on an edge index.

    Shapes used in the comments: N = number of nodes, E = number of edges,
    NH = number of heads, FIN / FOUT = input / per-head output feature size.

    The layer consumes and returns a ``(node_features, edge_index)`` tuple so
    that several layers can be chained in an ``nn.Sequential``.
    """

    # Row of the (2, E) edge index holding the head (source) node of each edge.
    head_e_dim = 0
    # Row of the (2, E) edge index holding the tail (target) node of each edge.
    tail_dim = 1

    # Axis of the attention heads in (N, NH, FOUT) tensors.
    head_num_dim = 1
    # Axis that enumerates nodes (or edges, after lifting).
    nodes_dim = 0

    def __init__(self, feature_in, feature_out, num_heads, concat=True,dropout_prob = 0.6, activation = nn.ELU(),add_skip_connection =True, bias =True):
        """Create the layer.

        Args:
            feature_in: Size of each input node feature vector (FIN).
            feature_out: Size of each head's output feature vector (FOUT).
            num_heads: Number of attention heads (NH).
            concat: If true the head outputs are concatenated to NH*FOUT
                features, otherwise they are averaged to FOUT features.
            dropout_prob: Dropout probability applied to the input features,
                the projected features and the attention coefficients.
            activation: Module applied to the final output, or ``None`` for no
                activation. The default ELU instance holds no state, so sharing
                it between layers is harmless.
            add_skip_connection: Whether to add a residual connection.
            bias: Whether to add a learnable bias to the output.
        """
        super().__init__()
        self.num_heads = num_heads
        self.feature_out = feature_out
        self.concat = concat
        self.act = activation
        self.add_skip_connection = add_skip_connection
        # One linear map that can be viewed as NH independent W matrices.
        self.ln1 = nn.Linear(feature_in, feature_out*num_heads,bias = False)
        # Additive attention: the score of the concatenation [x, y] is split
        # into one scoring vector for the head node and one for the tail node,
        # whose results are summed afterwards.
        self.scoring_fn_head = nn.Parameter(torch.empty(1, num_heads, feature_out))
        self.scoring_fn_tail = nn.Parameter(torch.empty(1, num_heads, feature_out))

        if bias:
            bias_size = num_heads*feature_out if concat else feature_out
            self.bias = nn.Parameter(torch.empty(bias_size))
        else:
            # Always define the attribute so `self.bias is not None` is safe.
            self.register_parameter("bias", None)
        if add_skip_connection:
            self.ln2 = nn.Linear(feature_in, feature_out*num_heads, bias=False)
        self.leaky_relu = nn.LeakyReLU(0.2)
        self.dropout = nn.Dropout(dropout_prob)
        self.init_params()

    def init_params(self):
        """Xavier-initialise the weights and zero the bias (if any)."""
        nn.init.xavier_uniform_(self.ln1.weight)
        nn.init.xavier_uniform_(self.scoring_fn_head)
        nn.init.xavier_uniform_(self.scoring_fn_tail)
        if self.bias is not None:
            torch.nn.init.zeros_(self.bias)
        if self.add_skip_connection:
            nn.init.xavier_uniform_(self.ln2.weight)

    def lift(self, score_head, score_tail, features, edge_idx):
        """Expand per-node tensors to per-edge tensors (N -> E on axis 0).

        Args:
            score_head: Per-node head scores, indexed by node on axis 0.
            score_tail: Per-node tail scores, indexed by node on axis 0.
            features: Projected node features, indexed by node on axis 0.
            edge_idx: Long tensor of shape (2, E).

        Returns:
            ``(score_head, score_tail, features)`` gathered per edge: head
            scores and features are taken at each edge's head node, tail
            scores at each edge's tail node.
        """
        head_node_index = edge_idx[self.head_e_dim]
        tail_node_index = edge_idx[self.tail_dim]
        score_head = score_head.index_select(self.nodes_dim,head_node_index)
        score_tail = score_tail.index_select(self.nodes_dim, tail_node_index)
        node_features_proj_lifted = features.index_select(self.nodes_dim,head_node_index)
        return score_head, score_tail, node_features_proj_lifted

    def broadcasting(self, entity_1, entity_2):
        """Append trailing singleton axes to ``entity_1`` and expand it to the shape of ``entity_2``."""
        for _ in range(entity_1.dim(),entity_2.dim()):
            entity_1 = entity_1.unsqueeze(-1)
        return entity_1.expand_as(entity_2)

    def sum_edge_scores_neighbourhood(self, exp_scores_per_edge, tail_index,num_nodes):
        """Return, for every edge, the sum of exp-scores over all edges sharing its tail node.

        The per-edge values are accumulated per tail node with ``scatter_add_``
        and then gathered back per edge, so the result has the same shape as
        ``exp_scores_per_edge``.
        """
        trg_index_broadcast = self.broadcasting(tail_index, exp_scores_per_edge)
        # Same shape as the input except that axis 0 enumerates nodes.
        size = list(exp_scores_per_edge.shape)
        size[self.nodes_dim] = num_nodes
        neighbour_sums = torch.zeros(size,dtype = exp_scores_per_edge.dtype,device = exp_scores_per_edge.device)
        # scatter_add_ adds each source row into the target row named by the index.
        neighbour_sums.scatter_add_(self.nodes_dim, trg_index_broadcast,exp_scores_per_edge)
        return neighbour_sums.index_select(self.nodes_dim, tail_index)

    def aggregate_neighbour_softmax(self, score_per_edge, tail_index,num_nodes):
        """Softmax the edge scores over each tail node's incoming edges.

        Returns:
            Attention coefficients with a trailing singleton axis appended
            (``score_per_edge.shape + (1,)``).
        """
        # Subtracting the global maximum keeps exp() numerically stable and
        # does not change the softmax result.
        scores_per_edge = score_per_edge - score_per_edge.max()
        exp_scores_per_eg = scores_per_edge.exp()
        neighbour_deno = self.sum_edge_scores_neighbourhood(exp_scores_per_eg,tail_index,num_nodes)
        # The small epsilon guards against division by zero.
        attention_per_edge = exp_scores_per_eg/(neighbour_deno+1e-16)
        return attention_per_edge.unsqueeze(-1)

    def aggregate_neighbours(self, node_feature_weighted, edge_index,in_features, num_nodes):
        """Sum the attention-weighted edge features into their tail nodes.

        ``in_features`` is only consulted for the dtype and device of the
        result. The output has the shape of ``node_feature_weighted`` with
        axis 0 resized to ``num_nodes``.
        """
        size = list(node_feature_weighted.shape)
        size[self.nodes_dim] = num_nodes
        out_node_features = torch.zeros(size, dtype = in_features.dtype, device = in_features.device)

        tail_index_broadcast = self.broadcasting(edge_index[self.tail_dim],node_feature_weighted)
        out_node_features.scatter_add_(self.nodes_dim, tail_index_broadcast, node_feature_weighted)
        return out_node_features

    def residual_part(self, attention_coef, in_feature, out_feature):
        """Apply the skip connection, merge the heads, add the bias and activate.

        Args:
            attention_coef: Unused; kept for interface compatibility.
            in_feature: Layer input of shape (N, FIN).
            out_feature: Aggregated features of shape (N, NH, FOUT).

        Returns:
            (N, NH*FOUT) if ``concat`` else (N, FOUT).
        """
        if self.add_skip_connection:
            if out_feature.shape[-1] == in_feature.shape[-1]:
                # FIN == FOUT: (N, FIN) -> (N, 1, FIN), broadcast over the NH
                # heads so the input is added to every head's output.
                out_feature = out_feature + in_feature.unsqueeze(self.head_num_dim)
            else:
                # FIN != FOUT: project the input to (N, NH, FOUT) first.
                out_feature = out_feature + self.ln2(in_feature).view(-1,self.num_heads, self.feature_out)
        if self.concat:
            # (N, NH, FOUT) -> (N, NH*FOUT)
            out_feature = out_feature.reshape(-1,self.num_heads*self.feature_out)
        else:
            # (N, NH, FOUT) -> (N, FOUT): average over the heads.
            out_feature = out_feature.mean(dim = self.head_num_dim)
        if self.bias is not None:
            out_feature = out_feature + self.bias
        if self.act is None:
            return out_feature
        return self.act(out_feature)

    def forward(self, input_):
        """Run the layer.

        Args:
            input_: Tuple ``(node_features, edge_index)`` with node features of
                shape (N, FIN) and a long edge index of shape (2, E).

        Returns:
            Tuple ``(out_node_features, edge_index)``; the edge index is passed
            through unchanged.
        """
        node_features, edge_idx = input_
        num_nodes = node_features.shape[self.nodes_dim]
        assert edge_idx.shape[0] ==2, "edge index must have shape (2, E)"

        # Linear projection with dropout before and after.
        node_features = self.dropout(node_features)
        # (N, FIN) x (FIN, NH*FOUT) -> (N, NH, FOUT)
        node_features_proj = self.ln1(node_features).view(-1,self.num_heads, self.feature_out)
        node_features_proj = self.dropout(node_features_proj)

        # Per-node scores: element-wise product with the scoring vectors summed
        # over the feature axis, (N, NH, FOUT) * (1, NH, FOUT) -> (N, NH).
        score_head = (node_features_proj*self.scoring_fn_head).sum(dim=-1)
        score_tail = (node_features_proj*self.scoring_fn_tail).sum(dim=-1)
        # Gather the scores and features per edge.
        score_head_lift, score_tail_lift, node_features_proj_lifted = self.lift(score_head, score_tail,node_features_proj, edge_idx)
        scores_per_edge = self.leaky_relu(score_head_lift + score_tail_lift)
        attention_per_edge = self.aggregate_neighbour_softmax(scores_per_edge,edge_idx[self.tail_dim], num_nodes)
        attention_per_edge = self.dropout(attention_per_edge)

        # Neighbourhood aggregation: weight each edge's head features by its
        # attention coefficient and sum them into the tail node.
        nodes_features_weighted = node_features_proj_lifted*attention_per_edge
        out_node_feature = self.aggregate_neighbours(nodes_features_weighted,edge_idx,node_features,num_nodes)

        # Skip connection, head merge, bias and activation.
        out_node_feature = self.residual_part(attention_per_edge,node_features,out_node_feature)
        return (out_node_feature, edge_idx)

In [161]:
# a test for lift function
# Random 5-dimensional features for 142421 nodes, projected to 2 heads x 20
# features, then lifted from per-node to per-edge tensors with the global
# edge_idx.
# NOTE(review): the row count of node_f must exceed the largest node id in
# edge_idx, otherwise index_select fails — confirm against num_nodes above.
node_f = torch.rand(size = (142421,5))
num_nodes = node_f.shape[0]
assert edge_idx.shape[0] ==2
ln = nn.Linear(5,20*2)
node_f_proj = ln(node_f).view(-1, 2, 20)
# NOTE(review): torch.Tensor(...) allocates without initialising the values,
# so these scoring vectors hold arbitrary numbers; only the shapes matter here.
score_fn_head = nn.Parameter(torch.Tensor(1,2, 20))
score_fn_tail = nn.Parameter(torch.Tensor(1,2,20))
print(node_f_proj.shape,score_fn_head.shape)
# (N, 2, 20) * (1, 2, 20) summed over the last axis -> (N, 2)
score_head = (node_f_proj*score_fn_head).sum(-1)
score_tail = (node_f_proj*score_fn_tail).sum(-1)
def lift(score_head, score_tail, features, edge_idx):
        # Stand-alone copy of GAT_layer.lift with the axis constants inlined.
        # Row 0 of edge_idx holds the head node ids, row 1 the tail node ids.
        #edge_idx = torch.tensor(edge_idx,dtype=torch.long)
        head_node_index = edge_idx[0]
        tail_node_index = edge_idx[1]
        # Use the edge index (i.e. the node ids of each edge) to gather the
        # matching scores and features per edge.
        score_head = score_head.index_select(0,head_node_index)
        score_tail = score_tail.index_select(0, tail_node_index)
        node_features_proj_lifted = features.index_select(0,head_node_index)
        return score_head, score_tail, node_features_proj_lifted
head,tail,feature= lift(score_head,score_tail,node_f_proj,edge_idx)
head.shape, tail.shape, feature.shape

torch.Size([142421, 2, 20]) torch.Size([1, 2, 20])


(torch.Size([219341, 2]), torch.Size([219341, 2]), torch.Size([219341, 2, 20]))

In [162]:
# a test for GAT layer
# Smoke test: 5 input features, 2 heads of 20 features each, default options;
# the layer is called with a (node_features, edge_index) tuple and returns a
# tuple of the same form.
feature_in, feature_out,num_heads = 5,20,2
net = GAT_layer(feature_in,feature_out,num_heads)
net((node_f,edge_idx))

(tensor([[-0.3858, -0.1784, -0.3770,  ..., -0.0693,  0.8868, -0.1783],
         [-0.5703,  0.0376, -0.5930,  ..., -0.4321, -0.3722,  0.6794],
         [-0.6646,  0.1865,  0.1280,  ..., -0.2703, -0.3344,  0.7680],
         ...,
         [-0.0789, -0.1870,  0.3681,  ..., -0.1255, -0.0142,  0.1889],
         [-0.0570,  0.1854, -0.1375,  ..., -0.0420, -0.1740,  0.1811],
         [ 0.4217, -0.1616,  0.1986,  ...,  0.0410, -0.0907, -0.1615]],
        grad_fn=<EluBackward0>),
 tensor([[    0,     0,     0,  ..., 84574, 84575, 84576],
         [  421,   422,   423,  ..., 84574, 84575, 84576]]))