In [1]:
import os
import pandas as pd
import json
import tools
import numpy as np
# from sklearn import svm
# import sklearn.model_selection as sm
from settings import *
from unionfind import UnionFind
from collections import defaultdict, Counter


本程序使用 Python 3, 依赖的包列在 `requirements.txt` 中. 代码中用到了论文 *Name Disambiguation in AMiner: Clustering, Maintenance, and Human in the Loop* 的[示例代码](https://github.com/zhysora/BUAALAB_IN_WDQ), 半监督聚类的代码改自[这里](https://github.com/datamole-ai/active-semi-supervised-clustering).  


首先需要创建一个 `data` 目录, 并将所有数据文件放到里面. 数据可以在[这里](https://biendata.com/competition/scholar2018/data/)下载.     

## 准备中间数据文件 

In [2]:
# For the validation and test publication files, write a JSON map from author
# name to the list of paper ids that appear under that name.
pubs = [pubs_validate_path, pubs_test_path]
awaits = [VAL_NAME2PUB, TEST_NAME2PUB]
for src_path, dst_path in zip(pubs, awaits):
    # Context managers close each file promptly instead of leaking the handle.
    with open(src_path) as src:
        pubdata = json.load(src)
    # defaultdict: a name only gets an entry once it has at least one paper,
    # so names with an empty paper list are absent from the output.
    name2pub = defaultdict(list)
    for name, papers in pubdata.items():
        for paper in papers:
            name2pub[name].append(paper['id'])
    with open(dst_path, 'w') as dst:
        json.dump(name2pub, dst)



# 利用强规则生成Pair-wise constraint  



In [9]:
# Run the rule-based pair generator from the project module `rules.pos`. The
# call returns two values, bound here as `correct` and `wrong`; neither name is
# read again in the visible cells.
# NOTE(review): presumably this call also writes the file that is later loaded
# from `pos_pair_path` — confirm in rules/pos.py.
import rules.pos
correct, wrong = rules.pos.generate_positive_pair() 

# 全局度量学习 Global metric learning  

请直接执行`python global_embedding.py`来生成全局论文向量表示.  

# 建立论文图  prepare local data  
请到`local`目录下执行`prepare_localdata.py`文件:    

```sh
cd local 
python prepare_localdata.py 

```  

# 并查集算法 Union-Find

In [5]:
# Load the training assignments, the name -> paper-id maps for the validation
# and test sets, and the positive pair constraints.
with open(assignments_train_path) as f:
    ass_train = json.load(f)
with open(VAL_NAME2PUB) as f:
    ass_val = json.load(f)
with open(TEST_NAME2PUB) as f:
    testname2ids = json.load(f)
valname2ids = ass_val

# Flatten each training name's clusters into one list of paper ids.
# The loop variable is deliberately not called `pubs`, so it does not overwrite
# the `pubs` path list created by the data-preparation cell.
trainname2ids = defaultdict(list)
for name, clusters in ass_train.items():
    trainname2ids[name] = [paper_id for cluster in clusters for paper_id in cluster]

with open(pos_pair_path) as f:
    pos_pair = json.load(f)

In [7]:
# Append every edge of each author's paper graph to that author's positive
# pairs. Each non-empty line of the graph file must hold exactly two
# whitespace-separated tokens (the two paper ids of an edge).
# Caveat: this mutates `pos_pair` in place, so re-running the cell without
# reloading `pos_pair` appends the same edges again.
for name in pos_pair.keys():
    file_path = './output/graph-%s/%s_pubs_network.txt'%(IDF_THRESHOLD, name)
    with open(file_path) as edge_file:
        for line in edge_file:
            tokens = line.split()
            if not tokens:
                # Blank line (e.g. trailing newline at end of file): nothing to add.
                continue
            # A line with a token count other than two is malformed and still
            # raises ValueError here rather than being silently dropped.
            a, b = tokens
            pos_pair[name].append([a, b])

## 训练集结果

In [8]:
# Cluster each training author's papers by merging every positive pair with
# union-find, then score the predicted clusters against the ground truth.
train_gen = {}
for author, paper_ids in trainname2ids.items():
    forest = UnionFind(paper_ids)
    for constraint in pos_pair[author]:
        forest.union(*constraint)
    train_gen[author] = list(map(list, forest.components()))

# One line per author: name, number of pairs, the score tuple returned by
# tools.pairwise_precision_recall_f1, true cluster count, predicted cluster count.
# NOTE(review): the two label sequences are compared as returned by
# tools.assign2label; this assumes both calls order papers consistently even
# though the cluster order differs — confirm in tools.py.
metric = []
for author, true_clusters in ass_train.items():
    _, true_labels = tools.assign2label(true_clusters)
    _, predicted_labels = tools.assign2label(train_gen[author])

    scores = tools.pairwise_precision_recall_f1(predicted_labels, true_labels)

    print(author, len(pos_pair[author]), scores, len(true_clusters), len(train_gen[author]))
    # The last element of the score tuple is what gets averaged.
    metric.append(scores[-1])
print(np.mean(metric))

li_ma 52493 (0.10872694531106625, 0.8502086930338489, 0.19279832842594774) 526 220
min_chen 63928 (0.49958197755879524, 0.7039837050802275, 0.5844260543919646) 300 307
gang_zhang 17338 (0.12374652574015438, 0.662967057644803, 0.20856350213435426) 255 138
j_lin 44543 (0.03481354115181934, 0.2497877228909645, 0.06111002492790458) 266 204
xiang_gao 30146 (0.044005678830581554, 0.3220848861150844, 0.07743200951747248) 232 192
xi_zhang 35787 (0.027731618052245652, 0.550809512302597, 0.05280467788124466) 407 177
jie_zhou 34703 (0.1549374380510649, 0.5016964543321651, 0.2367576947068776) 372 230
feng_zhu 23883 (0.08619152506817705, 0.7286632675681667, 0.15414917813679743) 218 124
bing_liu 32631 (0.11831675931241287, 0.920032605802502, 0.20966984723539472) 420 260
di_wu 62729 (0.08747583495281863, 0.6287938373578822, 0.15358535496448616) 478 255
liu_yang 24355 (0.5385984403896588, 0.9001567258984818, 0.6739478961190927) 333 211
lei_wu 9719 (0.017736461734140636, 0.2404258700143016, 0.033035836

f_liu 213174 (0.11356153432725269, 0.8871117592407475, 0.2013479786992673) 379 265
rui_zhang 95868 (0.23518653923004032, 0.9360567206078182, 0.37592180581384904) 371 310
0.21421668508321123


## 验证集结果

In [10]:
# Cluster the validation papers with union-find over the positive pairs and
# write the result to res_union_find_val.json.
from scipy.special import comb  # not used in this cell; kept so the name stays defined

val = {}
# Rebinds `ass_val` and `valname2ids`: from here on they hold the validation
# assignments file and its flattened paper ids.
with open(assignments_val_path) as f:
    ass_val = json.load(f)
valname2ids = {k: [j for i in v for j in i] for k, v in ass_val.items()}
for name, ids in valname2ids.items():
    if len(ids) == 0:
        # No papers for this name: emit an empty clustering.
        val[name] = []
        continue
    uf = UnionFind(ids)
    # Set lookup keeps the per-pair membership test O(1); the original scanned
    # the id list twice for every pair.
    known_ids = set(ids)
    for pair in pos_pair[name]:
        # Only merge pairs whose two papers both belong to this name's id list.
        if pair[0] in known_ids and pair[1] in known_ids:
            uf.union(*pair)
    val[name] = [list(i) for i in uf.components()]
with open('res_union_find_val.json', 'w') as f:
    json.dump(val, f)

## 生成测试集答案

In [11]:
# Cluster every test-set author's papers with union-find over the positive
# pairs and write the answer file res_union_find_test.json.
# NOTE(review): unlike the validation cell, pairs are not filtered to ids that
# belong to the name; this assumes every pair endpoint is in `ids` — confirm.
test = {}

for name, ids in testname2ids.items():
    uf = UnionFind(ids)
    for pair in pos_pair[name]:
        uf.union(*pair)
    test[name] = [list(i) for i in uf.components()]

# Context manager guarantees the answer file is flushed and closed.
with open('res_union_find_test.json', 'w') as f:
    json.dump(test, f)

# 聚类  

另一种解法是聚类, 这里我们以半监督聚类演示一下使用方法. 半监督聚类和聚类的调用入口均在`opendac.py`中.  

在使用聚类算法前, 需要先生成精调后的paper的向量表征, 即需要:   

```sh
cd local/gae
python train.py 
```    

## Semi-supervised clustering

In [None]:
cluster_num = 15  # cluster count handed to the clustering call for every author name

# NOTE(review): `pkl`, `mkl`, `CPU_COUNT`, `local_output_path`, `pubs_train_path`
# and `clustering_with_const` are not imported in the visible cells; presumably
# they come from `from settings import *` and opendac.py — confirm before running.


def cluster_split(pubs_path, result_path, method='PCKMeans', n_clusters=None):
    """Cluster every author name in one publications file and save the result.

    Args:
        pubs_path: JSON file whose top-level keys are author names.
        result_path: file that receives the ``{name: result}`` mapping as JSON.
        method: algorithm name passed through to ``clustering_with_const``.
        n_clusters: cluster count passed through; defaults to ``cluster_num``.

    Returns:
        ``(pubs, assignments)``: the loaded publications dict and the mapping
        that was written to ``result_path``.
    """
    if n_clusters is None:
        n_clusters = cluster_num
    with open(pubs_path) as f:
        pubs = json.load(f)
    names = list(pubs.keys())
    jobs = [(name, method, n_clusters) for name in names]
    pool = mkl.Pool(CPU_COUNT)
    try:
        # starmap blocks until every name has been processed.
        res = pool.starmap(clustering_with_const, jobs)
    finally:
        # Release the worker processes; the original never closed its pools.
        pool.close()
        pool.join()
    assignments = dict(zip(names, res))
    with open(result_path, 'w') as f:
        json.dump(assignments, f)
    return pubs, assignments


# Loaded once instead of before every pool. It is not referenced in this cell;
# kept in case the worker code relies on it — TODO confirm.
# pickle.load can execute arbitrary code: only load files produced by this project.
with open(local_output_path, 'rb') as f:
    local_output = pkl.load(f)

#Generate train
pubs_train, J = cluster_split(pubs_train_path, 'assignment_train_result_for_val_PCK.json')

#Generate validate
# The original bound this data to `pubs_train` and then read the undefined
# name `pubs_validate`.
pubs_validate, J = cluster_split(pubs_validate_path, 'assignment_validate_result_PCK.json')

#Generate test
# Uses `pubs_test_path`, the test publications file the data-preparation cell
# reads; the original referenced `TEST_PATH` here — confirm they are the same file.
pubs_test, J = cluster_split(pubs_test_path, 'assignment_test_PC_2.json')