# MAXP 2021初赛数据探索和处理-3

使用步骤1里处理好的节点的ID，来构建DGL的graph所需要的边列表。

In [1]:
import pandas as pd
import numpy as np
import os

import dgl

Using backend: pytorch


In [2]:
# Paths.
# The dataset root can be overridden with the MAXP_DATA_DIR environment variable so the
# notebook also runs on other machines; when it is unset, the original local path is used.
base_path = os.environ.get(
    'MAXP_DATA_DIR',
    '/Users/jamezhan/PycharmProjects/MAXP/final_dataset',
)
publish_path = 'publish'

# Input CSV files, both located under <base_path>/<publish_path>.
link_p1_path = os.path.join(base_path, publish_path, 'link_phase1.csv')
nodes_path = os.path.join(base_path, publish_path, 'IDandLabels.csv')

### 读取节点列表

In [3]:
# Load the node table from nodes_path, reading the Label column as strings.
nodes_df = pd.read_csv(nodes_path, dtype=dict(Label=str))
print(nodes_df.shape)
# Show the last four rows as the cell output.
nodes_df.iloc[-4:]

(3655452, 4)


Unnamed: 0,node_idx,paper_id,Label,Split_ID
3655448,3655448,caed47d55d1e193ecb1fa97a415c13dd,,1
3655449,3655449,c82eb6be79a245392fb626b9a7e1f246,,1
3655450,3655450,926a31f6b378575204aae30b5dfa6dd3,,1
3655451,3655451,bbace2419c3f827158ea4602f3eb35fa,,1


### 读取边列表

In [4]:
# Load the edge table from link_p1_path.
edges_df = pd.read_csv(link_p1_path)
print(edges_df.shape)
# Show the first five rows as the cell output.
edges_df.head(5)

(29168650, 3)


Unnamed: 0,paper_id,reference_paper_id,phase
0,f10da75ad1eaf16eb2ffe0d85b76b332,711ef25bdb2c2421c0131af77b3ede1d,phase1
1,9ac5a4327bd4f3dcb424c93ca9b84087,2d91c73304c5e8a94a0e5b4956093f71,phase1
2,9d91bfd4703e55dd814dfffb3d63fc33,33d4fdfe3967a1ffde9311bfe6827ef9,phase1
3,e1bdbce05528952ed6579795373782d4,4bda690abec912b3b7b228b01fb6819a,phase1
4,eb623ac4b10df96835921edabbde2951,c1a05bdfc88a73bf2830e705b2f39dbb,phase1


## Join点列表和边列表以生成从0开始的边列表

DGL的节点ID默认从0开始，并且在未指定节点数时，以边列表中最大的节点ID加1作为节点数来构建Graph。因此这里我们先把边列表中的paper_id映射为从0开始的节点ID（node_idx）。

In [5]:
# Map the source side: attach the node columns (including node_idx) to each edge by paper_id.
# validate='many_to_one' makes pandas raise MergeError if a paper_id occurs more than once
# in nodes_df, which would otherwise silently duplicate edges.
edges = edges_df.merge(nodes_df, on='paper_id', how='left', validate='many_to_one')
# Map the destination side the same way, matching reference_paper_id against paper_id.
edges = edges.merge(
    nodes_df,
    left_on='reference_paper_id',
    right_on='paper_id',
    how='left',
    validate='many_to_one',
)

# A left join leaves NaN where an id has no row in nodes_df; fail fast here instead of
# passing invalid node indices on to graph construction.
n_unmatched = int(edges[['node_idx_x', 'node_idx_y']].isna().any(axis=1).sum())
if n_unmatched:
    raise ValueError(
        f'{n_unmatched} edges reference a paper id that is missing from nodes_df'
    )
print(edges.shape)
edges.head(4)

(29168650, 10)


Unnamed: 0,paper_id_x,reference_paper_id,phase,node_idx_x,Label_x,Split_ID_x,node_idx_y,paper_id_y,Label_y,Split_ID_y
0,f10da75ad1eaf16eb2ffe0d85b76b332,711ef25bdb2c2421c0131af77b3ede1d,phase1,529879,,0,2364950,711ef25bdb2c2421c0131af77b3ede1d,,0
1,9ac5a4327bd4f3dcb424c93ca9b84087,2d91c73304c5e8a94a0e5b4956093f71,phase1,410481,D,0,384023,2d91c73304c5e8a94a0e5b4956093f71,K,0
2,9d91bfd4703e55dd814dfffb3d63fc33,33d4fdfe3967a1ffde9311bfe6827ef9,phase1,2196044,D,0,1895619,33d4fdfe3967a1ffde9311bfe6827ef9,N,0
3,e1bdbce05528952ed6579795373782d4,4bda690abec912b3b7b228b01fb6819a,phase1,2545623,,0,2175977,4bda690abec912b3b7b228b01fb6819a,,0


#### 修改node_idx_* 列的名称作为新的node id，并只保留需要的列

In [6]:
# Give the merged columns their final names (node_idx_x/node_idx_y become the source and
# destination node ids), then keep only the four columns needed downstream.
edges = edges.rename(
    columns={
        'paper_id_x': 'paper_id',
        'node_idx_x': 'src_nid',
        'node_idx_y': 'dst_nid',
    }
)[['src_nid', 'dst_nid', 'paper_id', 'reference_paper_id']]
edges.head(4)

Unnamed: 0,src_nid,dst_nid,paper_id,reference_paper_id
0,529879,2364950,f10da75ad1eaf16eb2ffe0d85b76b332,711ef25bdb2c2421c0131af77b3ede1d
1,410481,384023,9ac5a4327bd4f3dcb424c93ca9b84087,2d91c73304c5e8a94a0e5b4956093f71
2,2196044,1895619,9d91bfd4703e55dd814dfffb3d63fc33,33d4fdfe3967a1ffde9311bfe6827ef9
3,2545623,2175977,e1bdbce05528952ed6579795373782d4,4bda690abec912b3b7b228b01fb6819a


## 构建DGL的Graph

In [7]:
# Convert the source and destination node id columns to NumPy ndarrays.
src_nid, dst_nid = edges['src_nid'].to_numpy(), edges['dst_nid'].to_numpy()

In [8]:
# Build the DGL graph from the (source, destination) node id arrays.
# num_nodes is passed explicitly: without it DGL sizes the graph from the largest id that
# appears in an edge, so nodes with higher ids and no edges would be left out and the graph
# would no longer line up with nodes_df.
num_nodes = int(nodes_df['node_idx'].max()) + 1
graph = dgl.graph((src_nid, dst_nid), num_nodes=num_nodes)
print(graph)

Graph(num_nodes=3655452, num_edges=29168650,
      ndata_schemes={}
      edata_schemes={})


In [10]:
# Write the graph to <base_path>/<publish_path>/graph.bin in DGL's binary format so that
# later modelling steps can load it quickly.
graph_path = os.path.join(base_path, publish_path, 'graph.bin')
dgl.data.utils.save_graphs(graph_path, g_list=[graph])