In [1]:
import os
import pandas as pd
import json
import tools
import numpy as np
# from sklearn import svm
# import sklearn.model_selection as sm
from settings import *
from unionfind import UnionFind
from collections import defaultdict, Counter


本代码使用 Python 3 (版本需 >= 3.6), 所需的依赖包列在 `requirements.txt` 中.

首先需要创建一个 `data` 目录, 并将数据都放到里面; 数据可以在[这里](https://biendata.com/competition/scholar2018/data/)下载.  

# 全局度量学习 Global metric learning  

请直接执行`python global_embedding.py`来生成全局论文向量表示.  

# 建立论文图  prepare local data  
请到`local`目录下执行`prepare_localdata.py`文件:    

```sh
cd local 
python prepare_localdata.py 

```  

# 利用强规则生成Pair-wise constraint  



In [None]:
# Run the positive-pair generator from the project-local `rules.pos` module;
# this is the "strong rules -> pair-wise constraint" step of the notebook.
# NOTE(review): presumably this writes the file that is later loaded through
# `pos_pair_path` — confirm in rules/pos.py.
import rules.pos
rules.pos.generate_positive_pair() 

# 并查集算法 Union-Find

In [37]:

# Load the cluster assignments and the per-name id lists used by the cells below.
# Every file is opened in a `with` block so its handle is closed deterministically.
with open(assignments_train_path) as f:
    ass_train = json.load(f)
with open(VAL_NAME2PUB) as f:  # the original call was missing its closing parenthesis
    ass_val = json.load(f)
with open('./data/await_test.json') as f:
    testname2ids = json.load(f)

# The validation mapping is used as loaded (same object as `ass_val`).
valname2ids = ass_val

# `ass_train` maps name -> list of clusters (each a list of ids);
# flatten it to name -> flat list of ids.
trainname2ids = defaultdict(list)
for name, pubs in ass_train.items():
    trainname2ids[name] = [i for pub in pubs for i in pub]

# Pair-wise constraints keyed by name; later cells index it as `pos_pair[name]`.
with open(pos_pair_path) as f:
    pos_pair = json.load(f)

In [40]:
# Extend the loaded pairs with the edges listed in each name's network file.
# Every non-empty line is expected to hold exactly two whitespace-separated ids.
# NOTE: this cell appends to `pos_pair` in place, so running it more than once
# duplicates the network edges; re-run the cell that loads `pos_pair` first.
for name in pos_pair.keys():
    file_path = './output/graph-%s/%s_pubs_network.txt'%(IDF_THRESHOLD, name)
    with open(file_path) as network_file:  # close the handle once the file is consumed
        for line in network_file:
            fields = line.split()
            if not fields:  # tolerate blank / trailing empty lines
                continue
            a, b = fields
            pos_pair[name].append([a,b])

## 训练集结果

In [48]:
# Cluster each training name: start from singleton ids and merge along every
# pair stored for that name.
train_gen = {}
for author, paper_ids in trainname2ids.items():
    forest = UnionFind(paper_ids)
    for pair in pos_pair[author]:
        forest.union(*pair)
    train_gen[author] = list(map(list, forest.components()))

# Score the generated clusters against the ground-truth assignment, one name at
# a time, and collect the last element of each score tuple.
# NOTE(review): judging by the helper's name the tuple is (precision, recall, f1)
# — confirm in tools.py.
metric = []
for author, true_clusters in ass_train.items():
    _, true_labels = tools.assign2label(true_clusters)
    _, predicted_labels = tools.assign2label(train_gen[author])

    scores = tools.pairwise_precision_recall_f1(predicted_labels, true_labels)

    # name, number of pairs, scores, true cluster count, generated cluster count
    print(author, len(pos_pair[author]), scores, len(true_clusters), len(train_gen[author]))
    metric.append(scores[-1])
print(np.mean(metric))

## 验证集结果

In [85]:
# Build union-find clusters for the validation names and dump them to JSON.
val = {}
# NOTE(review): `comb` is not used in this cell; the import is kept in case
# another cell relies on it.
from scipy.special import comb
with open(assignments_val_path) as f:
    ass_val = json.load(f)
# Flatten name -> list of clusters into name -> flat list of ids.
valname2ids = {k: [j for i in v for j in i] for k, v in ass_val.items()}
for name, ids in valname2ids.items():
    if len(ids) == 0:
        val[name] = []
        continue
    uf = UnionFind(ids)
    # Set membership keeps the filter below O(1) per lookup instead of scanning
    # the id list for every pair.
    known_ids = set(ids)
    for pair in pos_pair[name]:
        # Only merge pairs whose two ends both belong to this name's id list.
        if (pair[0] in known_ids) and (pair[1] in known_ids):
            uf.union(*pair)
    val[name] = [list(i) for i in uf.components()]
# Write through a context manager so the result is flushed and closed.
with open('res_union_find_val.json', 'w') as f:
    json.dump(val, f)

ke_xu 5876 (0.41746817538896747, 0.18163076923076923, 0.2531303602058319) 14 137
jing_wu 5548 (0.5370762711864406, 0.3998422712933754, 0.4584086799276672) 34 87
d_li 10131 (0.4781599781599782, 0.5782436447672499, 0.5234608487746564) 10 41
liang_li 14015 (0.444317923024368, 0.39873288371142446, 0.42029297716501507) 21 78
dan_wang 8423 (0.3782051282051282, 0.5115606936416185, 0.4348894348894349) 24 77
juan_du 4382 (0.587012987012987, 0.2100371747211896, 0.30937713894592744) 20 76
bo_xu 16414 (0.5038889099669916, 0.5068503606457276, 0.5053652968036529) 16 111
l_sun 22219 (0.5939342881213142, 0.7777641578818337, 0.6735311289209701) 19 33
long_wang 10327 (0.48562680115273776, 0.6465397343053091, 0.5546481249099998) 18 53
bing_chen 9341 (0.5120850789558492, 0.46434833430742256, 0.48704980842911877) 50 140
c_c_lin 1601 (0.7175989085948158, 0.5723612622415669, 0.6368038740920098) 9 31
liang_zhou 3336 (0.5474613686534217, 0.3873486919172198, 0.4536931168534187) 15 49
jie_yang 15057 (0.430604982

In [66]:
# Keep, for every name in `ass_val`, only the publications whose id appears in
# one of that name's clusters.
with open('./data/pubs_validate.json') as f:  # close the handle after loading
    pubs_ori = json.load(f)
pubs_val = {}
c = []  # sizes of all clusters, accumulated across names
for k, clus in ass_val.items():
    eids = {j for i in clus for j in i}
    c.extend(len(i) for i in clus)
    pubs_val[k] = [pub for pub in pubs_ori[k] if pub['id'] in eids]
    
    

## 生成测试集答案

In [47]:
# Build union-find clusters for the test names and write the answer file.
test = {}

for name, ids in testname2ids.items():
    uf = UnionFind(ids)
    # NOTE(review): unlike the validation cell, pairs are not filtered by
    # membership in `ids` here — confirm every pair end is a known id.
    for pair in pos_pair[name]:
        uf.union(*pair)
    test[name] = [list(i) for i in uf.components()]

# Write through a context manager so the answer file is flushed and closed.
with open('res_union_find_test.json', 'w') as f:
    json.dump(test, f)

# 聚类  

另一种解法是聚类, 这里我们以半监督聚类演示一下使用方法. 半监督聚类和聚类的调用入口均在`opendac.py`中.  

在使用聚类算法前, 需要先生成精调后的paper的向量表征, 即需要:   

```sh
cd local/gae
python train.py 
```    

## Semi-supervised clustering

In [None]:
cluster_num = 15

# NOTE(review): `pkl`, `mkl` and `clustering_with_const` are not imported in the
# visible cells. Presumably they are `pickle`, `multiprocessing` and the entry
# point in `opendac.py`, supplied via `from settings import *` or a missing
# import — confirm before running.


def _cluster_and_dump(pubs, out_path, n_clusters, method='PCKMeans'):
    """Cluster every name in `pubs` in parallel and dump the result as JSON.

    Args:
        pubs: mapping whose keys are the names to cluster.
        out_path: path of the JSON file to write.
        n_clusters: cluster count passed to `clustering_with_const` for each name.
        method: algorithm label passed to `clustering_with_const` for each name.

    Returns:
        dict mapping each name to the value `clustering_with_const` returned for it.
    """
    names = list(pubs.keys())
    length = len(names)
    # The pool is released when the block exits; `starmap` has already
    # collected every result by then.
    with mkl.Pool(CPU_COUNT) as pool:
        res = pool.starmap(clustering_with_const, zip(names, [method]*length, [n_clusters]*length))
    assignment = dict(zip(names, res))
    with open(out_path, 'w') as f:
        json.dump(assignment, f)
    return assignment


# Loaded once instead of once per split; the file is the same for all three.
# Unpickling can execute arbitrary code, so only load a locally generated file.
with open(local_output_path, 'rb') as f:
    local_output = pkl.load(f)

#Generate train
with open(pubs_train_path) as f:
    pubs_train = json.load(f)
J = _cluster_and_dump(pubs_train, 'assignment_train_result_for_val_PCK.json', cluster_num)

#Generate validate
# The original bound this data to `pubs_train` and then read the undefined
# name `pubs_validate`.
with open(pubs_validate_path) as f:
    pubs_validate = json.load(f)
J = _cluster_and_dump(pubs_validate, 'assignment_validate_result_PCK.json', cluster_num)

#Generate test
with open(TEST_PATH) as f:
    pubs_test = json.load(f)
J = _cluster_and_dump(pubs_test, 'assignment_test_PC_2.json', cluster_num)