In [None]:
import torch 
#from d2l import torch as d2l
from torch import nn
import numpy as np
import pandas as pd
from transformers import BertModel
from transformers import BertTokenizer
import os
from tqdm import tqdm
import gc
import matplotlib.pyplot as plt 
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

In [None]:
# train_data = pd.read_csv("../input/train-valid-test-dataset/train.csv").drop("Unnamed: 0", axis = 1)
# train_data.head()

## Labeling

In [None]:
# Location of the YAGO dumps. labeling_Dataset concatenates this string
# directly with the file names, so it has to end with a path separator.
path = "../input/yago-dataset/"
def labeling_Dataset(path, times):
    '''
    Build one labelled snapshot of the YAGO fact graph per requested year.

    path: prefix of the tsv files; it is concatenated as-is with
          "yagoFacts.tsv" / "yagoMetaFacts.tsv", so it must end with a
          path separator.
    times: list of years given by the user, [2012, 2013] for example. The
           last entry is treated as the final ("objective") graph.

    Returns a list with one DataFrame per entry of `times` (columns: ids,
    head, relation, tail, labels; index reset). Every returned frame has the
    rows and labels of the final graph: labels is 0 for facts whose
    <occursUntil> date lies before the final year (out-dated) and 1
    otherwise. In the frames for the earlier years, facts that were not yet
    part of that year's graph have head/relation/tail replaced by "<sp>".
    An empty `times` returns an empty list without reading any file.
    '''
    times = list(times)
    if not times:
        # Nothing was requested: skip loading the dumps instead of failing
        # later on the lookup of the final snapshot.
        return []

    # Read in the whole dataset. The first row of each file is discarded and
    # the fifth column of the fact file is not used.
    fact = pd.read_csv(path + "yagoFacts.tsv", sep="\t", header=None).drop(0, axis=0).drop(4, axis=1)
    fact.columns = ["ids", "head", "relation", "tail"]
    meta_fact = pd.read_csv(path + "yagoMetaFacts.tsv", sep="\t", header=None).drop(0)
    meta_fact.columns = ["ids1", "ids2", "relation", "date/others", "date"]

    # Keep the meta facts that carry a date and attach the fact they describe
    # (inner merge, so only facts that actually exist survive). notna() is
    # used because np.isnan raises on a column that is not float-typed.
    time_info = meta_fact[meta_fact["date"].notna()]
    merge_info = pd.merge(time_info, fact, how="inner", left_on="ids2", right_on="ids")
    is_since = merge_info["relation_x"] == "<occursSince>"
    is_until = merge_info["relation_x"] == "<occursUntil>"
    occur_since = merge_info.loc[is_since, ["ids", "date"]].reset_index(drop=True)
    occur_until = merge_info.loc[is_until, ["ids", "head", "relation_y", "tail", "date"]].reset_index(drop=True)
    # One row per fact with an end date, plus its start date when one exists.
    care_result = pd.merge(occur_until, occur_since, how="left", on="ids")
    care_result.columns = ["ids", "head", "relation", "tail", "occur_until_date", "occur_since_date"]

    snapshots = []
    for year in tqdm(times):
        started = care_result["occur_since_date"] <= year
        # Facts that only start after this year are not part of its graph.
        ids_later = care_result.loc[care_result["occur_since_date"] > year, "ids"]
        # Facts that started by this year and ended before it are out-dated.
        ids_ended = care_result.loc[started & (care_result["occur_until_date"] < year), "ids"]
        # Filter first and copy once: this avoids duplicating the full fact
        # table and makes the column assignment below act on an owned frame.
        snapshot = fact[~fact["ids"].isin(ids_later)].copy()
        snapshot["labels"] = (~snapshot["ids"].isin(ids_ended)).astype("int64")
        snapshots.append(snapshot)

    del fact
    del meta_fact
    gc.collect()

    objective = snapshots[-1]
    final_graphs = []
    for earlier in tqdm(snapshots[:-1]):
        masked = objective.copy()
        # Rows of the final graph whose fact is absent from the earlier graph
        # are blanked out with the placeholder token.
        absent = ~masked["ids"].isin(earlier["ids"])
        masked.loc[absent, ["head", "relation", "tail"]] = "<sp>"
        final_graphs.append(masked.reset_index(drop=True))
    final_graphs.append(objective.reset_index(drop=True))
    return final_graphs
# One labelled graph per year; the last year (2012) is the final graph.
graphs = labeling_Dataset(path,[2010,2011,2012])

In [None]:
# Show the graph built for the last requested year.
graphs[-1]

## Change to natural language

In [None]:
def preprocess(input_):
    '''
    Strip the YAGO markup from a labelled graph and return a cleaned copy.

    input_: DataFrame produced by the labeling step. Every column located
            before "labels" must hold strings; the frame needs a "tail"
            column. The input itself is not modified.

    For each column before "labels" (the ids column included) a cell keeps
    only the text between its last "<" and the following ">"; cells without
    angle brackets are left as they are. Afterwards anything from "^^" on is
    removed from "tail" (e.g. a datatype suffix such as ^^xsd:date).
    '''
    data = input_.copy()
    # The column list is read from the frame itself; no filtered copy of the
    # data is needed just to enumerate its columns.
    for column in tqdm(data.columns):
        if column == "labels":
            # Columns from "labels" onwards are not text and stay untouched.
            break
        data[column] = data[column].apply(lambda cell: cell.split("<")[-1].split(">")[0])
    data["tail"] = data["tail"].apply(lambda cell: cell.split("^^")[0])
    return data
# Cleaned copy of every labelled graph, in the same order as `graphs`.
graphs_ = [preprocess(i) for i in graphs]

## Construct adj list

In [None]:
# only use one dictionary for relation and head 
# use the last graph
def build_dic(total_):
    '''
    Build the entity and relation vocabularies of a graph.

    total_: DataFrame with "head", "relation" and "tail" columns.

    Returns (head2index, rel2id):
      * head2index maps every distinct head/tail value to 1..N in order of
        first appearance (heads first, then tails).
      * rel2id maps every distinct relation to 1..R and the inverse edge
        type, keyed as relation + "reverse", to R+1..2R, so that
        rel2id[r + "reverse"] == rel2id[r] + R.
      * "sp" is the placeholder token and always maps to 0 in both
        dictionaries; it never receives a regular index, even if it occurs
        in the data.
    '''
    placeholder = "sp"

    head_tail = pd.unique(pd.concat([total_["head"], total_["tail"]], ignore_index=True))
    entities = [entity for entity in head_tail if entity != placeholder]
    head2index = {entity: index for index, entity in enumerate(tqdm(entities), start=1)}
    head2index[placeholder] = 0

    relations = [rel for rel in pd.unique(total_["relation"]) if rel != placeholder]
    num_relation = len(relations)
    rel2id = {rel: idx for idx, rel in enumerate(relations, start=1)}
    # head->tail is the normal direction, tail->head the reversed one. The
    # reverse ids have to start at R+1: starting at R would give the first
    # reverse relation the same id as the last forward relation.
    rel2id.update({rel + "reverse": idx + num_relation for idx, rel in enumerate(relations, start=1)})
    rel2id[placeholder] = 0
    return head2index, rel2id
# Vocabularies are built from the last (final) graph only.
head2idx, rel2id = build_dic(graphs_[-1])

In [None]:
# The rows are walked through plain column values instead of
# DataFrame.iterrows(), which avoids building one Series object per fact.
def build_adj_ls(total_, head2index, rel2id):
    '''
    Turn a graph into indexed edge arrays that include reverse edges.

    total_: DataFrame whose first four columns are, in this order, the fact
            id, head, relation and tail (they are read by position); it must
            also expose "head" and "tail" by name.
    head2index: dict mapping every head/tail value of total_ to its index.
    rel2id: dict mapping every relation of total_ to its id; half of its
            size (integer division) is used as the number of relations.

    Returns an 8-tuple:
      0. edge index array of shape (2, 2 * len(total_)): all (head, tail)
         pairs followed by all (tail, head) pairs
      1. edge type array: the relation ids followed by the reverse types,
         i.e. id + number of relations (id 0 stays 0)
      2. 0-d array with the number of distinct head/tail values
      3. 0-d array with the number of relations
      4. head2index, returned unchanged
      5. rel2id, returned unchanged
      6. copy of total_ with an extra "index_where" column holding the
         (head index, relation id, tail index) tuple of each row
      7. dict mapping each entity to the id of the first fact it occurs in
    '''
    head_tail = pd.unique(pd.concat([total_["head"], total_["tail"]], ignore_index=True))
    num_relation = len(rel2id) // 2

    # Positional column access keeps the original column-order contract
    # without relying on integer keys of a string-labelled row Series.
    fact_ids, heads, relations, tails = (total_.iloc[:, position] for position in range(4))

    triples = []
    # Temporary dictionary for sampling: entity -> id of its first fact.
    first_fact = {}
    for fact_id, head, relation, tail in tqdm(zip(fact_ids, heads, relations, tails), total=len(total_)):
        triples.append((head2index[head], rel2id[relation], head2index[tail]))
        first_fact.setdefault(head, fact_id)
        first_fact.setdefault(tail, fact_id)

    tmp_graph = total_.copy()
    # Reuse the frame's own index so the tuples stay attached to their rows
    # even when the index is not 0..n-1.
    tmp_graph["index_where"] = pd.Series(triples, index=tmp_graph.index, dtype=object)

    forward_pairs = [(sub, obj) for sub, _, obj in triples]
    reverse_pairs = [(obj, sub) for sub, _, obj in triples]
    forward_types = [rel for _, rel, _ in triples]
    # The placeholder relation (id 0) has no separate reverse type.
    reverse_types = [rel + num_relation if rel != 0 else rel for rel in forward_types]

    edge_pairs = forward_pairs + reverse_pairs
    # The explicit reshape keeps the (2, num_edges) layout for an empty graph.
    edge_idx = np.array(edge_pairs, dtype=int).reshape(len(edge_pairs), 2).T
    edge_type = np.array(forward_types + reverse_types, dtype=int)
    return edge_idx, edge_type, np.array(head_tail.shape[0]), np.array(num_relation), head2index, rel2id, tmp_graph, first_fact
#edge_idx = [build_adj_ls(i,head2idx, rel2id) for i in graphs_]

In [None]:
def split_dataset(final_graph, dic, per, random_state=None):
    '''
    Split the latest graph into disjoint train and validation sets.

    Every entity has to be updated during training, so every fact listed in
    `dic` always goes to the training set; only the remaining facts are
    divided randomly.

    params:
        1. final_graph: latest graph, a DataFrame with an "ids" column
        2. dic: init train_ids, a dict whose values are the ids of the
           facts that must be part of the training set
        3. per: fraction (between 0 and 1) of the remaining facts that is
           added to the training set; the rest becomes the validation set
        4. random_state: optional seed forwarded to DataFrame.sample to make
           the split reproducible

    Returns (train, valid). Each remaining fact ends up in exactly one of
    the two frames.
    Raises ValueError if `per` is outside [0, 1].
    '''
    if not 0 <= per <= 1:
        raise ValueError("per must be between 0 and 1, got {!r}".format(per))

    required = final_graph["ids"].isin(set(dic.values()))
    init_train = final_graph[required]
    # Use the frame that was passed in rather than a notebook-level global.
    remainder = final_graph[~required]

    # Shuffle once and cut by position. Drawing two independent samples
    # would let the same fact land in both sets and leave others in neither.
    shuffled = remainder.sample(frac=1, random_state=random_state)
    n_train = int(round(len(shuffled) * per))
    train_else = shuffled.iloc[:n_train]
    valid = shuffled.iloc[n_train:]
    return pd.concat([init_train, train_else]), valid

In [None]:
# 8-tuple returned by build_adj_ls for the final graph; object_[-2] is the
# graph with the extra "index_where" column and object_[-1] the dictionary
# mapping each entity to a fact id.
object_ = build_adj_ls(graphs_[-1], head2idx, rel2id)

In [None]:
# Split the final graph (per = 0.6) and write both parts to the working
# directory. The DataFrame index is written as the first csv column.
train, valid = split_dataset(object_[-2], object_[-1], 0.6)
train.to_csv("./train.csv")
valid.to_csv("./valid.csv")

In [None]:
## (For the original GAT preprocessing) Create edge index

In [None]:
# # cost 24 second 
# def build_edge_index(adj_ls, num_nodes,add_self_edges = True):
#     '''
#     Build the edge matrix, representing each edge by the indices of its nodes.
#     '''
#     head_ids, tail_ids = [],[]
#     exist_edges = set()
#     for head, tails in tqdm(adj_ls.items()):
#         for tail in tails:
#             if (head, tail) not in exist_edges:
#                 head_ids.append(head)
#                 tail_ids.append(tail)
#                 exist_edges.add((head,tail))
#     if add_self_edges:
#         head_ids.extend(np.arange(num_nodes))
#         tail_ids.extend(np.arange(num_nodes))
    
#     # shape is (2, num_edges)
#     edge_idx = np.row_stack((head_ids, tail_ids))
#     return edge_idx
# # edge_idx = [build_edge_index(i[0],i[1]) for i in adj_list]

## save as npz

In [None]:
# for i,idx in enumerate(edge_idx):
#     np.savez("edge_idx"+str(i)+".npz",idx[0])
#     np.savez("edge_type"+str(i)+".npz",idx[1])
#     np.savez("graph_"+str(i)+"_num_nodes.npz",idx[2])
#     np.savez("graph_"+str(i)+"_num_edges.npz",idx[3])
#     np.save("graph_"+str(i)+"entity2index.npy", idx[4])
#     np.save("graph_"+str(i)+"rel2index.npy", idx[5])

In [None]:
# import os
# os.chdir('/kaggle/working')
# print(os.getcwd())
# print(os.listdir("/kaggle/working"))
# from IPython.display import FileLink
# FileLink('train.csv')