In [None]:
''' 
Feature_Engineer:
    1. 提取 train.txt 中所有的 Nodes 和 Edges 关系
    2. 利用上述关系生成 pos_data 和 neg_data （已经生成，直接读取)
    3. 利用（1）中生成的 Nodes 和 Edges 生成图

'''

In [1]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import scipy.sparse as sp
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
import pickle
import math


# Pickle read/write helpers; every file lives under `dirname`.
dirname = 'data/'
def save_obj(obj, name):
    """Serialize ``obj`` with pickle to ``dirname + name + '.pkl'``.

    The file is opened in ``'wb'`` mode, so an existing file at that path is
    overwritten. ``pickle.HIGHEST_PROTOCOL`` is used for the dump.
    """
    target_path = dirname + name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Load and return the object pickled at ``dirname + name + '.pkl'``.

    Only use this on files written by this project: unpickling data from an
    untrusted source can execute arbitrary code.
    """
    source_path = dirname + name + '.pkl'
    with open(source_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [8]:
from tqdm import tqdm
import random

# Read train.txt. Each non-empty line is parsed as whitespace-separated integer
# node ids: the first id is the source node, every following id is a sink of
# that source.
with open("data/train.txt", "r") as f:
    train_data = f.readlines()

# SourceNodes: the first node of every line (one entry per parsed line).
# Nodes:       every node id in order of appearance (duplicates are kept).
# Edges:       one directed (source, sink) tuple per sink on each line.
SourceNodes = []
Nodes = []
Edges = []
for line in tqdm(train_data):
    nodes_list = [int(n) for n in line.split()]
    if not nodes_list:
        # A blank line (e.g. a trailing newline at the end of the file) has no
        # source node; indexing nodes_list[0] would raise IndexError.
        continue
    source = nodes_list[0]
    SourceNodes.append(source)
    Nodes.extend(nodes_list)
    Edges.extend((source, sink) for sink in nodes_list[1:])
# Nodes and Edges come straight from train.txt; nothing else is needed to build them.

100%|██████████| 20000/20000 [00:10<00:00, 1907.57it/s]


In [7]:
# Pickle the directed edge list under the name 'Edges' via save_obj.
save_obj(Edges,'Edges')

In [3]:
# Quick sanity check of the parsed data: number of sources, first source, first edge.
print(len(SourceNodes))
print(SourceNodes[0]) # the length of SourceNodes should be 20,000
print(Edges[0])

20000
540762
(540762, 1912140)


In [5]:
#random.sample(Nodes, 20000)
# Sources used for negative sampling: the source node of every parsed line
# (this is an alias of SourceNodes, not a copy).
ng_line = SourceNodes
# De-duplicated views of the edge and node lists; sE is used for membership
# tests when generating negative samples.
sE = set(Edges)
sN = set(Nodes)
# Number of distinct edges and distinct nodes.
print(len(sE),len(sN))

23946602 4867136


In [6]:
# Generate negative (non-edge) samples.
#
# For every source node, draw a batch of candidate sinks at random from the
# node list and keep each (source, sink) pair that is not a self-pair and is
# not a known edge. Every kept pair is labelled 0.
#
# The default batch size is 1200 candidates per source, so the result is much
# larger than the 200,000 negatives used later; a later cell down-samples
# neg_data with random.sample(neg_data, 200000).
#
# NOTE(review): at this point Nodes still contains one entry per appearance in
# train.txt, so nodes that appear more often are more likely to be drawn. A
# later cell rebinds Nodes to a de-duplicated list; re-running this cell after
# that changes the sampling distribution.
from tqdm import tqdm

def generate_neg_data(sources=None, candidates=None, known_edges=None,
                      samples_per_source=1200):
    """Build labelled negative samples ``[(source, sink), 0]``.

    Parameters
    ----------
    sources : iterable of node ids, optional
        Source nodes to generate negatives for. Defaults to the notebook
        global ``ng_line``.
    candidates : sequence of node ids, optional
        Population the candidate sinks are drawn from. Defaults to the
        notebook global ``Nodes``.
    known_edges : container of (source, sink) tuples, optional
        Existing edges; a candidate pair found here is rejected. Defaults to
        the notebook global ``sE``.
    samples_per_source : int, optional
        Number of candidate sinks drawn (without replacement) per source.
        Clamped to ``len(candidates)`` so a small node list does not make
        ``random.sample`` raise ``ValueError``.

    Returns
    -------
    list
        One ``[(source, sink), 0]`` entry per accepted candidate. The number
        of entries per source varies because rejected candidates are dropped.
    """
    # Resolve defaults at call time so the function follows the current
    # notebook state, exactly like the original global lookups did.
    if sources is None:
        sources = ng_line
    if candidates is None:
        candidates = Nodes
    if known_edges is None:
        known_edges = sE

    batch_size = min(samples_per_source, len(candidates))
    neg_data = []
    for source in tqdm(sources):
        for sink in random.sample(candidates, batch_size):
            if source != sink and (source, sink) not in known_edges:
                neg_data.append([(source, sink), 0])
    return neg_data

neg_data = generate_neg_data()

100%|██████████| 20000/20000 [01:38<00:00, 202.30it/s]


In [8]:
# Build the positive samples.
#
# Draw 200,000 edges at random from Edges and label each of them with 1,
# giving entries of the form [(source, sink), 1].
sampled_edges = random.sample(Edges, 200000)
pos_data = [[(edge[0], edge[1]), 1] for edge in tqdm(sampled_edges)]
print(pos_data[:10])
save_obj(pos_data,'pos_data')

100%|██████████| 200000/200000 [00:01<00:00, 160549.61it/s]

[[(3438576, 3311313), 1], [(3509903, 4441306), 1], [(4011264, 381769), 1], [(354904, 2677468), 1], [(802229, 842813), 1], [(3684652, 3233426), 1], [(1022123, 2020646), 1], [(2818749, 2191915), 1], [(781211, 1735082), 1], [(4313572, 1848485), 1]]





In [10]:
import random
# Down-sample the generated negatives to 200,000 entries (same count as the positives).
# NOTE(review): neg_data is overwritten with its own sample, so the full list
# is gone after this cell; random.sample raises ValueError if neg_data holds
# fewer than 200,000 entries.
neg_data = random.sample(neg_data, 200000)

In [11]:
# Pickle both labelled sample lists, then spot-check their sizes and one entry of each.
# NOTE(review): pos_data is also saved by the cell that builds it; this call
# overwrites that file with whatever pos_data holds now.
# NOTE(review): the recorded output of this cell shows pos_data[1] holding a
# pair of pairs instead of a single (source, sink) edge -- pos_data appears to
# have been rebuilt from different kernel state. Re-run from a fresh kernel to confirm.
save_obj(pos_data, 'pos_data')
save_obj(neg_data, 'neg_data')
print(len(pos_data), len(neg_data))
print(len(pos_data[1]), len(neg_data[1]))
print(pos_data[1], neg_data[1])


200000 200000
2 2
[((1633022, 2484234), (2281709, 4781581)), 1] [(1986786, 553041), 0]


In [12]:
# train.txt has been parsed and pos_data / neg_data are in memory at this point.
# Print the sizes of the node/edge lists and of both sample lists.
print('Nodes number:\t','Edges number:')
print(len(Nodes), '\t',len(Edges), '\n')
print('posdata_length: ', 'negdata_length:')
print(len(pos_data),'\t', len(neg_data), '\n')

# Show the first ten entries of each sample list for a visual check.
print('first 10 pieces of pos_data and neg_data:')
print(pos_data[:10],'\n')
print(neg_data[:10])



Nodes number:	 Edges number:
24024361 	 24004361 

posdata_length:  negdata_length:
200000 	 200000 

first 10 pieces of pos_data and neg_data:
[[((1633022, 2484234), (2281709, 4781581)), 1], [((1633022, 2484234), (2281709, 4781581)), 1], [((1633022, 2484234), (2281709, 4781581)), 1], [((1633022, 2484234), (2281709, 4781581)), 1], [((1633022, 2484234), (2281709, 4781581)), 1], [((1633022, 2484234), (2281709, 4781581)), 1], [((1633022, 2484234), (2281709, 4781581)), 1], [((1633022, 2484234), (2281709, 4781581)), 1], [((1633022, 2484234), (2281709, 4781581)), 1], [((1633022, 2484234), (2281709, 4781581)), 1]] 

[[(4736404, 1656132), 0], [(1986786, 553041), 0], [(856564, 878623), 0], [(4636168, 1463235), 0], [(4106313, 1734432), 0], [(3216427, 2222991), 0], [(3133399, 750901), 0], [(3168312, 4289964), 0], [(2078186, 2350155), 0], [(2834992, 2031424), 0]]


In [11]:
# Build an undirected networkx graph from the training nodes and edges.
# nx.Graph does not keep edge direction; the cells below recover direction by
# looking up (a, b) tuples in hash_edges.
G = nx.Graph()
G.add_nodes_from(Nodes)
G.add_edges_from(Edges)
# Edges and Nodes are all from train.txt (pos_data)
print('Graph has been built!')

Graph has been built!


In [9]:
# Save the graph object G to disk under the name 'graph'.
save_obj(G, 'graph')
print('graph has been saved')

graph has been saved


In [13]:
# Hash set of the directed (source, sink) edge tuples, used for fast membership tests.
# NOTE(review): an earlier cell builds sE = set(Edges) the same way; if Edges
# is unchanged since then, the two sets are equal.
hash_edges = set(Edges)

In [14]:
# Sanity check on a single node: count its neighbours in G, then count how
# many of them are connected by an inbound edge (neighbour -> node) and how
# many by an outbound edge (node -> neighbour) according to hash_edges.
# A neighbour connected in both directions is counted in both totals.
test_node = 3043
test_neig = sorted(nx.all_neighbors(G, test_node))
print(len(test_neig))

num_in = 0
num_out = 0
for one_neig in test_neig:
    # Use test_node rather than a repeated literal so that changing the node
    # above changes the whole check.
    if (test_node, one_neig) in hash_edges:
        num_out += 1
    if (one_neig, test_node) in hash_edges:
        num_in += 1
print(num_in, num_out)
    

218
209 55


In [21]:
# Compare the raw node list (one entry per appearance in train.txt) with the
# set of distinct nodes.
print('Nodes list length containing duplicates:', len(Nodes))
print('Nodes set length: ', len(sN))
# Rebind Nodes to the de-duplicated list.
# NOTE(review): earlier cells (negative sampling, the size printout) read the
# list with duplicates; re-running them after this cell gives different results.
Nodes = list(sN)

Nodes list length containing duplicates: 24024361
Nodes set length:  4867136


In [22]:
from tqdm import tqdm

# Debugging note kept from the original author: at one point every edge ended
# up classified as inbound. The author traced it to how Edges (read from
# train.txt) was turned into a hash set and then probed with (e, node).
# NOTE(review): the exact failure mode is not visible in this notebook.
#
# Build pre_features: one list per node, indexed as follows
#   [0] number of neighbours in G
#   [1] 1 / log(number of neighbours + 1), or 0 for a node without neighbours
#   [2] sorted list of all neighbours
#   [3] inbound neighbours  (an edge neighbour -> node exists in hash_edges)
#   [4] number of inbound neighbours
#   [5] outbound neighbours (an edge node -> neighbour exists in hash_edges)
#   [6] number of outbound neighbours
pre_features = {}
for node in tqdm(Nodes):
    neig = sorted(nx.all_neighbors(G, node))
    num_neig = len(neig)
    if num_neig == 0:
        log_neig = 0
    else:
        log_neig = 1. / math.log(num_neig + 1)

    # Direction is looked up in hash_edges; a neighbour linked in both
    # directions appears in both lists.
    inbound = [nb for nb in neig if (nb, node) in hash_edges]
    outbound = [nb for nb in neig if (node, nb) in hash_edges]

    pre_features[node] = [num_neig, log_neig, neig, inbound, len(inbound), outbound, len(outbound)]

100%|██████████| 4867136/4867136 [06:53<00:00, 11784.37it/s]


In [None]:
# Reload the cached feature table instead of recomputing it.
# The loaded object must be bound to pre_features: a bare load_obj(...) call
# discards the result (and, as the last expression of a cell, would try to
# display the whole dictionary). This replaces any pre_features in memory.
#save_obj(pre_features, 'pre_features_v2')
pre_features = load_obj('pre_features_v2')

In [19]:
'''
Next we combine the pos_data and neg_data to one file:
SBdata
'''
print(len(pre_features))

790558


In [None]:
import pickle
import numpy
import pandas

# Directory used by the pickle helpers defined in this cell.
# NOTE(review): this cell redefines save_obj / load_obj from the top of the
# notebook with a different base directory ('data/' there, '../../data/'
# here); every call made after this cell runs uses this location.
obj_data_dir = "../../data/"
def save_obj(obj, name):
    """Serialize ``obj`` with pickle to ``obj_data_dir + name + '.pkl'``.

    The file is opened in ``'wb'`` mode, so an existing file at that path is
    overwritten. ``pickle.HIGHEST_PROTOCOL`` is used for the dump.
    """
    target_path = obj_data_dir + name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Load and return the object pickled at ``obj_data_dir + name + '.pkl'``.

    Only use this on files written by this project: unpickling data from an
    untrusted source can execute arbitrary code.
    """
    source_path = obj_data_dir + name + '.pkl'
    with open(source_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded
# load data
#training_edges=load_obj('SBdata')

# Reuse pos_data / neg_data when they are already in the kernel; otherwise
# (name undefined, or explicitly None) fall back to the pickles on disk.
# globals().get(...) avoids the NameError that a bare `pos_data is None`
# raises on a fresh kernel, which is exactly when the fallback is needed.
# NOTE(review): load_obj in this cell reads from obj_data_dir ('../../data/'),
# while the cells that saved pos_data / neg_data used the helper defined at
# the top of the notebook ('data/'). Confirm both point at the same files.
if globals().get("pos_data") is None:
    pos_data = load_obj("pos_data")
if globals().get("neg_data") is None:
    neg_data = load_obj("neg_data")

pos = pos_data
neg = neg_data

print(len(pos))
print(neg[1])

filename = "SBdata"
# Combined sample list: every positive entry followed by every negative entry.
SBdata = list(pos) + list(neg)

In [None]:
# Pickle the combined positive + negative sample list under the name 'SBdata'.
save_obj(SBdata, 'SBdata')