In [1]:
# -*- coding: UTF-8 -*-
import os
import re
import csv
import sys
import json
import datetime
import requests
import difflib
RAW_DIR = "../raw/"
LIST_RAW_CSV = "../list_raw/all.csv"
LIST_RAW_JSON = "../transformed/all_json.json"
TRANSFORMED_DIR = "../transformed/"
OUTPUT = "output.txt"
GBTOUTPUT = "gbt.txt"
TITLEOUTPUT = "title.txt"

In [2]:
#
full_keys = ["author","title","journal","year","DOI","month","citations(google scholar)","abstract","keywords","reference_count","ccfClass","is_important","references"]
full_keys_default = {"author":[],"title":"","journal":"","year":0,"DOI":"","month":0,"citations(google scholar)":-1,"abstract":"","keywords":[],"reference_count":0,"ccfClass":"","is_important":None,"references":[]}
# 常用函数定义
# 调用crossref接口
def string_similar(s1, s2):
    return difflib.SequenceMatcher(None, s1, s2).ratio()
def get_crossref_info(ref):
    url = "https://doi.crossref.org/servlet/query"

    querystring = {"usr":"halloweenwx@163.com","pwd":"halloween","format":"json","qdata":"""<?xml version = "1.0" encoding="UTF-8"?>
    <query_batch version="2.0" xmlns = "http://www.crossref.org/qschema/2.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
      <head>
          <email_address>hisham@atypon.com</email_address>
          <doi_batch_id>Sample multi resolve</doi_batch_id>
      </head>
      <body>
           <query key="mykey" enable-multiple-hits="true">
      <unstructured_citation>"""+ref+"""</unstructured_citation>
    </query>
         </body>
    </query_batch>"""}

    headers = {
        'User-Agent': "PostmanRuntime/7.20.1",
        'Accept': "*/*",
        'Cache-Control': "no-cache",
        'Postman-Token': "123bedd6-8fb9-43e5-980a-54336e7aa684,6312f073-8e24-4222-abb2-789899bbc01f",
        'Host': "doi.crossref.org",
        'Accept-Encoding': "gzip, deflate",
        'Connection': "keep-alive",
        'cache-control': "no-cache"
        }
    response = requests.request("GET", url, headers=headers, params=querystring)
#     print(response.text)
    return response.text

# 保存至transform
def save_tran_json(addr,jsonfile):
    with open(TRANSFORMED_DIR + addr, 'w') as o:
        o.write(json.dumps(jsonfile,ensure_ascii=False))
        
def save_tran_plain(addr,txtfile):
    with open(TRANSFORMED_DIR + addr, 'w') as o:
        o.write(txtfile)
# 从transformd中读取
def read_tran_json(addr):
    with open(TRANSFORMED_DIR + addr,encoding = 'utf-8') as f:
        data = json.load(f)
    return data
    
#  查询ccf等级
def search_ccf(s):
    ccf_all_addr = "/Users/Halloween/Desktop/Study/复杂网络/lrcns/raw-data/ccf/ccf_all.csv"

    ccf_all = []
    ccf_search_res = {}
    with open(ccf_all_addr,encoding = 'utf-8') as f:
        reader = csv.reader(f)
        for line in reader:
            ccf_all.append(line)
    level = ""
    field = ""
    cORj = ""
    col = -1
#     print(ccf_all)
    for line in reversed(ccf_all):
        if col != 1 and col != 2 and col != -1:
            col = -1
            continue
        if col == -1:
            for word in line[1:3]:
                if word == "":
                    continue
                if(string_similar(word,s)>0.9):
                    col = line.index(word)
                    ccf_search_res['ccf_search_res'] = word
#                     print('"'+word.strip('"')+'"'+'"'+s.strip('"')+'"')
#                     print(col)
#                     print(line)
                    break
        else:
            if len(line[0])>1 and line[0][1]=='、' and level == "":
                level = line[0][2]
#                 print(level)
            if len(line[0])>10 and line[0][:2]=="中国" and field == "":
                field = line[0][line[0].find("（")+1:line[0].find("）")]
                cORj = line[0][line[0].find("术")+1:line[0].find("（")]
#                 print(field)
    ccf_search_res.update({"level":level,"field":field,"type":cORj})
    return {"level":level,"field":field,"type":cORj}
# 数据审查函数
def inspect(to_inspect):
    papers_arr = to_inspect
    all_error = 0
    for paper in papers_arr:
        inspect = []
        for key in full_keys:
            if(key not in paper.keys() or type(paper[key]) != type(full_keys_default[key]) or paper[key] == full_keys_default[key]):
                inspect.append(key + " lost")
                continue
            if(key == "author" or key == "keywords" or key == "references"):
                if(type(paper[key]) != list or len(paper[key])==0):
                    inspect.append(key + " lost")
                    continue
            if(key == "year"):
                if(paper[key]<1900):
                    inspect.append(key + " error")
            if(key == "month"):
                if(paper[key]<0 or paper[key] >12):
                    inspect.append(key + " error")
            if(key == "citations(google scholar)" or key == "reference_count"):
                if(paper[key]<0):
                    inspect.append(key + " error")
            if(key == 'ccfClass'):
                if(paper[key] not in ['A','B','C']):
                    inspect.append(key + " error")
            if(type(paper[key])==int):
                if(paper[key]<0):
                    inspect.append(key + " less than 0")
                continue
            if(type(paper[key])==float):
                inspect.append(key + " not int")
                continue
        paper['inspect'] = list(inspect)
        print(paper['title']+str(paper['inspect']))
        all_error += len(paper['inspect'])
    print(">>>>SUMMARY ERRORs:"+str(all_error)+"<<<<<")
    return papers_arr

In [3]:
# 路径解析
def init_from_raw():
    addr_list = os.listdir(RAW_DIR)
    for fname in list(addr_list):
        addr = RAW_DIR + fname
        if(fname[0]=='.' or addr == OUTPUT or os.path.isdir(addr)):
            addr_list.remove(fname)
            print("skip " + fname )
            continue
    print(len(addr_list))

    # 数据读入内存 all_arr
    all_arr = []
    for fname in addr_list:
        addr = RAW_DIR + fname
        with open(addr,'r') as f:
            farr = json.load(f)
            print(fname)
            en_topic,cn_topic = fname.replace(".txt","").split("_")
            for paper in farr:
                paper["topic"] = {"en":en_topic,"cn":cn_topic}
    #         max_title = 0
    #         max_title_self = ""
    #         for article in farr:
    #             if max_title < len(article['title']):
    #                 max_title = len(article['title'])
    #                 max_title_self = article['title']
    #         print("max_title: " + str(max_title))
    #         print(max_title_self)
            all_arr.extend(list(farr))
    print(len(all_arr))
    return all_arr

In [4]:
def read_list_raw():
    res = []
    with open(LIST_RAW_CSV,'r') as f: 
        res = [x.strip('\n') for x in f.readlines()]
    return res

In [5]:
def read_list_raw_json():
    with open(LIST_RAW_JSON,'r') as f: 
        raw_json = json.load(f)
        return raw_json

In [6]:
all_arr = init_from_raw()
list_arr = read_list_raw()
raw_json = read_list_raw_json()

skip .DS_Store
skip 动态嵌入（已合并至上层）
10
Multimedia Social Relationship Network_多媒体社交关系网络研究.txt
Social Influence Prediction_社会影响预测.txt
Dynamic Embedding (Temporal)_动态嵌入.txt
Knowledge Graph Embedding_知识图谱.txt
network embedding_网络嵌入.txt
InformationDiffusionDetection_信息扩散预测.txt
Network  Fusion_网络融合.txt
Community Detection_社团检测.txt
Graph based Recommendation_基图推荐.txt
GraphVite_GV系统介绍.txt
402


In [7]:
list_tran_arr = []
obj = {}
print (list_arr[0])
print (list_arr[0].split(','))
for record in list_arr:
    record_list = record.split(',')
    obj['title'] = record_list[0]
    obj['author'] = [x.strip('"') for x in record_list[1:-3]]
    obj['journal'] = record_list[-3]
    obj['year'] = int(record_list[-2])
    obj['importance'] = bool(record_list[-1])
    list_tran_arr.append(dict(obj))


A Comparative Study between "Prediction-response" and "Scenario-response" in Unconventional Emergency Decision-making Management,"Pang Jiaju,Li Shiming,Liu Liang",IEEE,2011,TRUE
['A Comparative Study between "Prediction-response" and "Scenario-response" in Unconventional Emergency Decision-making Management', '"Pang Jiaju', 'Li Shiming', 'Liu Liang"', 'IEEE', '2011', 'TRUE']


In [8]:
# union_arr = []
# def do_union():
#     union_arr.extend(all_arr)
#     for list_record in list_tran_arr:
# #         print("<<<"+temp_record['title'])
#         sim = 0;
#         for union_record in list(union_arr):
# #             print(string_similar(temp_record['title'],union_record['title']))            
#             sim = max(sim,string_similar(list_record['title'],union_record['title']))
#         if(sim<0.92):
#             union_arr.append(dict(list_record))
#     for record in union_arr:
#         for key in full_keys:
#             if(key not in record.keys()):
#                 record[key] = full_keys_default[key]
                
#     return union_arr

# union_arr = do_union()
# print(len(union_arr))

# save_tran_json('union.txt',union_arr)
union_arr = read_tran_json('union.txt')
print(len(union_arr))

489


In [9]:
for x in union_arr:
    print(x['author'])

['Scott A.Golder']
['Peng Wu', 'Weimin Ding', 'Zhidong Mao', 'Dan Tretter']
['Peng Wu', 'Dan Tretter']
['Siyu Xia', 'Ming Shao', 'Jiebo Luo', 'Yun Fu']
['Yan-Ying Chen', 'Winston H. Hsu', 'Hong-Yuan Mark Liao']
['Afshin Dehghan', 'Enrique G. Ortiz', 'Ruben Villegas', 'Mubarak Shah']
['Yuanhao Guo， Hamdi Dibeklio˘glu， Laurens van der Maaten']
['Xiaoqian Qin', 'Xiaoyang Tan', 'and Songcan Chen']
['Qieyun Dai', 'Peter Carr', 'Leonid Sigal', 'Derek Hoiem']
['Joseph P. Robinson', 'Ming Shao', 'Yue Wu', 'Hongfu Liu', 'Timothy Gillis', 'Yun Fu']
['Xiuzhuang Zhou', 'Kai Jin', 'Min Xu', 'Guodong Guo']
['Haibin Yan', 'Shiwei Wang']
['Zhanpeng Zhang', 'Ping Luo', 'Chen Change Loy', 'Xiaoou Tang']
['Qianru Sun', 'Bernt Schiele', 'Mario Fritz']
['Junnan Li', 'Yongkang Wong', 'Qi Zhao', 'Mohan S. Kankanhalli']
['Zhanpeng Zhang', 'Ping Luo', 'Chen Change Loy', 'Xiaoou Tang']
['ZhouxiaWang', 'Tianshui Chen', 'Jimmy Ren', 'Weihao Yu', 'Hui Cheng', 'Liang Lin']
['Arushi Goel', 'Keng Teck Ma', 'Cheston T

In [10]:
inspect_union_arr = inspect(union_arr)

Measuring Social Networks with Digital Photograph Collections['ccfClass lost', 'is_important lost']
CLOSE & CLOSER: DISCOVER SOCIAL RELATIONSHIP FROM PHOTO COLLECTIONS['is_important lost']
Close & Closer: Social Cluster and Closeness from Photo Collections['is_important lost']
Understanding Kin Relationships in a Photo['is_important lost']
Discovering Informative Social Subgraphs and Predicting Pairwise Relationships from Group Photos['is_important lost']
Who Do I Look Like? Determining Parent-Offspring Resemblance via Gated Autoencoders['is_important lost']
Graph-based Kinship Recognition['is_important lost']
Tri-Subject Kinship Verification: Understanding the Core of A Family['is_important lost']
Family Member Identification from Photo Collections['ccfClass lost', 'is_important lost']
Visual Kinship Recognition of Families in the Wild['is_important lost']
Learning deep compact similarity metric for kinship verification from face images['ccfClass lost', 'is_important lost']
Learning p

In [11]:
for paper in inspect_union_arr:
    for ins in paper['inspect']:
        if("ccfClass" in ins):
            print(paper['title'] + " " + str(paper['inspect']))

Measuring Social Networks with Digital Photograph Collections ['ccfClass lost', 'is_important lost']
Family Member Identification from Photo Collections ['ccfClass lost', 'is_important lost']
Learning deep compact similarity metric for kinship verification from face images ['ccfClass lost', 'is_important lost']
Automatic Social Network Construction from Movies Using Film-Editing Cues ['ccfClass lost', 'is_important lost']
Character Relationship Analysis in Movies Using Face Tracks ['DOI lost', 'ccfClass lost', 'is_important lost']
Active Clustering with Ensembles for Social Structure Extraction ['ccfClass lost', 'is_important lost']
Social Network Analysis of TV Drama Characters via Deep Concept Hierarchies ['ccfClass lost', 'is_important lost']
Social Network Construction of the Role Relation in Unstructured Data Based on Multi-view ['ccfClass lost', 'is_important lost']
StoryRoleNet: Social Network Construction of Role Relationship in Video ['ccfClass lost', 'is_important lost']
SRE-

In [12]:
def add_data():
    temp_arr = inspect_union_arr
    for paper in temp_arr:
        paper['search_ccf'] = search_ccf(paper['journal'])
        for ins in paper['inspect']:
            if('ccfClass' in ins):
                paper['ccfClass'] = paper['search_ccf']['level']                
    return temp_arr

In [13]:
added_inspect_union_arr = add_data()
for x in added_inspect_union_arr:
#     if(x['ccfClass']!=x['search_ccf']['level']):
    print(x['journal']+"-->"+x['ccfClass']+str(x['search_ccf']))

ACM Conference on Hypertext and Hypermedia-->{'level': '', 'field': '', 'type': ''}
ICME-->B{'level': 'B', 'field': '计算机图形学与多媒体', 'type': '会议'}
MM-->A{'level': '', 'field': '', 'type': ''}
IEEE MM-->B{'level': '', 'field': '', 'type': ''}
MM-->A{'level': '', 'field': '', 'type': ''}
CVPR-->A{'level': 'A', 'field': '人工智能', 'type': '会议'}
ICPR-->C{'level': 'C', 'field': '人工智能', 'type': '会议'}
IEEE MM-->B{'level': '', 'field': '', 'type': ''}
WACV-->{'level': '', 'field': '', 'type': ''}
IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACHINE INTELLIGENCE-->A{'level': '', 'field': '', 'type': ''}
Information Fusion-->{'level': '', 'field': '', 'type': ''}
Pattern Recognition Letters-->C{'level': 'C', 'field': '人工智能', 'type': '期刊'}
ICCV-->A{'level': 'A', 'field': '人工智能', 'type': '会议'}
CVPR-->A{'level': 'A', 'field': '人工智能', 'type': '会议'}
ICCV-->A{'level': 'A', 'field': '人工智能', 'type': '会议'}
IJCV-->A{'level': 'A', 'field': '人工智能', 'type': '期刊'}
IJCAI-->A{'level': 'A', 'field': '人工智能', 'type': '会议'}

In [33]:
with open(TRANSFORMED_DIR + "c copy.txt",'r') as f:
    arrt = f.read()
    yarr = []
    print((arrt[949631:])[0])
    arr = json.loads(arrt[949631:])
    print(len(arr))
    xt = "Measuring Social Networks with Digital Photograph Collections"
    xf = {}
    for x in arr:
        print(x['title']+str(len(str(x['references']))))
        if(xt != x['title']):
            xt = x['title']
            yarr.append(dict(xf))
            xf = x
        else:
            xf = x
    yarr.append(dict(xf))
    for y in yarr:
        print(y['title']+str(len(str(y['references']))))
    save_tran_json('y.txt',y)

[
1980
Measuring Social Networks with Digital Photograph Collections2905
Measuring Social Networks with Digital Photograph Collections2937
Measuring Social Networks with Digital Photograph Collections4051
Measuring Social Networks with Digital Photograph Collections5652
Measuring Social Networks with Digital Photograph Collections7237
Measuring Social Networks with Digital Photograph Collections7269
Measuring Social Networks with Digital Photograph Collections8919
Measuring Social Networks with Digital Photograph Collections8951
Measuring Social Networks with Digital Photograph Collections10270
Measuring Social Networks with Digital Photograph Collections13464
Measuring Social Networks with Digital Photograph Collections15276
CLOSE & CLOSER: DISCOVER SOCIAL RELATIONSHIP FROM PHOTO COLLECTIONS1178
CLOSE & CLOSER: DISCOVER SOCIAL RELATIONSHIP FROM PHOTO COLLECTIONS2392
CLOSE & CLOSER: DISCOVER SOCIAL RELATIONSHIP FROM PHOTO COLLECTIONS2424
CLOSE & CLOSER: DISCOVER SOCIAL RELATIONSHIP FRO

Deep Reasoning with Knowledge Graph for Social Relationship Understanding24875
Deep Reasoning with Knowledge Graph for Social Relationship Understanding24907
Deep Reasoning with Knowledge Graph for Social Relationship Understanding24939
Deep Reasoning with Knowledge Graph for Social Relationship Understanding24971
Deep Reasoning with Knowledge Graph for Social Relationship Understanding25003
Deep Reasoning with Knowledge Graph for Social Relationship Understanding25035
Deep Reasoning with Knowledge Graph for Social Relationship Understanding26380
Deep Reasoning with Knowledge Graph for Social Relationship Understanding27775
Deep Reasoning with Knowledge Graph for Social Relationship Understanding29201
Deep Reasoning with Knowledge Graph for Social Relationship Understanding30527
Deep Reasoning with Knowledge Graph for Social Relationship Understanding31979
Deep Reasoning with Knowledge Graph for Social Relationship Understanding33314
An End-to-End Network for Generating Social Relation

Video-based kinship verification using distance metric learning17931
Video-based kinship verification using distance metric learning17963
Video-based kinship verification using distance metric learning17995
Video-based kinship verification using distance metric learning18027
Video-based kinship verification using distance metric learning18059
Video-based kinship verification using distance metric learning18091
Video-based kinship verification using distance metric learning18123
Video-based kinship verification using distance metric learning18155
Video-based kinship verification using distance metric learning18187
Video-based kinship verification using distance metric learning18219
Video-based kinship verification using distance metric learning18251
Supervised Mixed Norm Autoencoder for Kinship Verification in Unconstrained Videos9081
Supervised Mixed Norm Autoencoder for Kinship Verification in Unconstrained Videos9113
Supervised Mixed Norm Autoencoder for Kinship Verification in Uncon

Towards social pattern characterization in egocentric photo-streams27886
Towards social pattern characterization in egocentric photo-streams27918
Towards social pattern characterization in egocentric photo-streams27950
Towards social pattern characterization in egocentric photo-streams27982
Towards social pattern characterization in egocentric photo-streams28014
Towards social pattern characterization in egocentric photo-streams28046
Towards social pattern characterization in egocentric photo-streams28078
Towards social pattern characterization in egocentric photo-streams28110
Towards social pattern characterization in egocentric photo-streams29072
Towards social pattern characterization in egocentric photo-streams29104
Towards social pattern characterization in egocentric photo-streams29136
Visual Social Relationship Recognition11616
Visual Social Relationship Recognition13112
Visual Social Relationship Recognition14945
Visual Social Relationship Recognition16408
Visual Social Relatio

Influence Factorization for identifying authorities in Twitter54434
Influence Factorization for identifying authorities in Twitter55769
Influence Factorization for identifying authorities in Twitter57629
Measuring Social Networks with Digital Photograph Collections15276
CLOSE & CLOSER: DISCOVER SOCIAL RELATIONSHIP FROM PHOTO COLLECTIONS5275
Close & Closer: Social Cluster and Closeness from Photo Collections5042
Understanding Kin Relationships in a Photo58688
Discovering Informative Social Subgraphs and Predicting Pairwise Relationships from Group Photos22018
Who Do I Look Like? Determining Parent-Offspring Resemblance via Gated Autoencoders27542
Graph-based Kinship Recognition19726
Tri-Subject Kinship Verification: Understanding the Core of A Family59374
Family Member Identification from Photo Collections26938
Visual Kinship Recognition of Families in the Wild95965
Learning deep compact similarity metric for kinship verification from face images30945
Learning part-aware attention netwo

In [14]:
with open(TRANSFORMED_DIR + "c2.txt",'a+') as o:
    o.write("[")

In [None]:
doii = 0
cnt = 0
fail = 0
with open(TRANSFORMED_DIR + "c2.txt",'a+') as o:
    for x in added_inspect_union_arr:
        print(".")
        for p in x['references']:
            if p['ref'].lower().find('doi') != -1:
                doii += 1
            else:
                try:
                    p['cross'] = get_crossref_info(p['ref'])
                    cnt += 1
                    print(cnt)
                except:
                    print("ERROR:"+str(fail))
                    fail += 1
    o.write(json.dumps(x,ensure_ascii=False))                    
    o.write(',')
    print(doii)
#         print(get_crossref_info(p['ref']))
#         if p['ref'].lower().find('doi'):
#             print(p['ref']) 

.
1
2
3
4
5
6
7
8
9
10
11
.
12
13
14
15
16
17
18
.
19
20
ERROR:0
21
.
ERROR:1
22
ERROR:2
ERROR:3
ERROR:4
ERROR:5
ERROR:6
ERROR:7
ERROR:8
ERROR:9
23
ERROR:10
24


In [None]:
author_title = ""
for paper in all_arr:
    author_title = author_title +paper['title'] +"|"+str(paper['author']) +"\n"
save_tran_plain('author.txt',author_title)

In [None]:
def data_clean(ori):
    for paper in ori:
        for key in full_keys:
            if key in paper.keys():
                if key == "author":
                    paper[key].replace('，',',').replace('\ufeff','')
                    
            else:
#                 缺失，填入默认值
                paper[key] = full_keys_default[key]
        if(paper['author'])

In [26]:
to_update = all_arr
for paper in to_update:
    for key in full_keys:
#         对全体可能的参数
        for ins in paper['inspect']:
            if(key in ins):
#                 对于每个错误
                if(key in paper.keys()):
#                 如果存在
                    print(paper['title'])
                    paper['journal']
                    print(paper['journal']+ str(search_ccf(paper['journal'])))

ccfClass
Measuring Social Networks with Digital Photograph Collections
ACM Conference on Hypertext and Hypermedia{'level': '', 'field': '', 'type': ''}
is_important
is_important
is_important
is_important
is_important
is_important
is_important
is_important
ccfClass
Family Member Identification from Photo Collections
WACV{'level': '', 'field': '', 'type': ''}
is_important
is_important
ccfClass
Learning deep compact similarity metric for kinship verification from face images
Information Fusion{'level': '', 'field': '', 'type': ''}
is_important
is_important
is_important
is_important
is_important
is_important
is_important
DOI
An End-to-End Network for Generating Social Relationship Graphs
CVPR{'level': 'A', 'field': '人工智能', 'type': '会议'}
is_important
is_important
is_important
is_important
is_important
is_important
ccfClass
Automatic Social Network Construction from Movies Using Film-Editing Cues
ICMEW{'level': 'B', 'field': '计算机图形学与多媒体', 'type': '会议'}
is_important
DOI
Character Relationship

ArXiv{'level': '', 'field': '', 'type': ''}
is_important
is_important
is_important
DOI
Community Preserving Network Embedding
The 31st AAAI Conference on Artificial Intelligence. 2017.{'level': 'A', 'field': '人工智能', 'type': '会议'}
is_important
ccfClass
DEEPEYE: Link Prediction in Dynamic Networks Based on Non-negative Matrix Factorization
Big data minging and analytics{'level': '', 'field': '', 'type': ''}
is_important
is_important
is_important
is_important
is_important
is_important
is_important
is_important
is_important
is_important
is_important
ccfClass
DHNE: Network Representation Learning Method for Dynamic Heterogeneous Networks
IEEE Access{'level': '', 'field': '', 'type': ''}
is_important
is_important
ccfClass
EpiRep: Learning Node Representations through Epidemic Dynamics on Networks
ACM International Conference on Web Intelligence{'level': 'C', 'field': '人工智能', 'type': '期刊'}
is_important
is_important
is_important
is_important
is_important
is_important
ccfClass
Deep Dynamic Netw

EMNLP{'level': 'B', 'field': '人工智能', 'type': '会议'}
citations(google scholar)
ccfClass
is_important
Knowledge graph and text jointly embedding
EMNLP{'level': 'B', 'field': '人工智能', 'type': '会议'}
DOI
Representation learning of knowledge graphs with entity descriptions
AAAI{'level': 'A', 'field': '人工智能', 'type': '会议'}
month
Representation learning of knowledge graphs with entity descriptions
AAAI{'level': 'A', 'field': '人工智能', 'type': '会议'}
citations(google scholar)
ccfClass
is_important
Representation learning of knowledge graphs with entity descriptions
AAAI{'level': 'A', 'field': '人工智能', 'type': '会议'}
DOI
Text-enhanced representation learning for knowledge graph
IJCAI{'level': 'A', 'field': '人工智能', 'type': '会议'}
month
Text-enhanced representation learning for knowledge graph
IJCAI{'level': 'A', 'field': '人工智能', 'type': '会议'}
citations(google scholar)
ccfClass
is_important
Text-enhanced representation learning for knowledge graph
IJCAI{'level': 'A', 'field': '人工智能', 'type': '会议'}
month
J

SIGKDD{'level': 'A', 'field': '数据库／数据挖掘／内容检索', 'type': '会议'}
references
Knowledge vault: A web-scale approach to probabilistic knowledge fusion
SIGKDD{'level': 'A', 'field': '数据库／数据挖掘／内容检索', 'type': '会议'}
DOI
Probabilistic reasoning via deep learning: Neural association models
arXiv preprint{'level': '', 'field': '', 'type': ''}
month
Probabilistic reasoning via deep learning: Neural association models
arXiv preprint{'level': '', 'field': '', 'type': ''}
citations(google scholar)
ccfClass
is_important
Probabilistic reasoning via deep learning: Neural association models
arXiv preprint{'level': '', 'field': '', 'type': ''}
references
Probabilistic reasoning via deep learning: Neural association models
arXiv preprint{'level': '', 'field': '', 'type': ''}
month
Encoding temporal information for time-aware link prediction
EMNLP{'level': 'B', 'field': '人工智能', 'type': '会议'}
citations(google scholar)
ccfClass
is_important
Encoding temporal information for time-aware link prediction
EMNLP{'leve

Complexity{'level': '', 'field': '', 'type': ''}
is_important
references
A local random walk model for complex networks based on discriminative feature combinations
Expert Systems with Applications{'level': 'C', 'field': '人工智能', 'type': '期刊'}
is_important
is_important
is_important
is_important
references
Random Walk Decay Centrality
AAAI Conference on Artificial Intelligence. 2019{'level': 'A', 'field': '人工智能', 'type': '会议'}
is_important
is_important
is_important
DOI
Graph Representation Ensemble Learning
arXiv preprint arXiv:1909.02811{'level': '', 'field': '', 'type': ''}
ccfClass
Graph Representation Ensemble Learning
arXiv preprint arXiv:1909.02811{'level': '', 'field': '', 'type': ''}
is_important
ccfClass
Quantification of network structural dissimilarities
Nature communications{'level': '', 'field': '', 'type': ''}
is_important
DOI
Fast graph representation learning with PyTorch Geometric
International Conference on Learning Representations{'level': '', 'field': '', 'type': ''}


ACM International Conference on Information and Knowledge Management{'level': 'B', 'field': '数据库／数据挖掘／内容检索', 'type': '会议'}
is_important
references
HeteroMed: Heterogeneous Information Network for Medical Diagnosis
ACM International Conference on Information and Knowledge Management{'level': 'B', 'field': '数据库／数据挖掘／内容检索', 'type': '会议'}
is_important
references
Abnormal Event Detection via Heterogeneous Information Network Embedding
ACM International Conference on Information and Knowledge Management{'level': 'B', 'field': '数据库／数据挖掘／内容检索', 'type': '会议'}
is_important
is_important
is_important
is_important
is_important
is_important
is_important
is_important
is_important
is_important
ccfClass
is_important
ccfClass
is_important
ccfClass
is_important
ccfClass
is_important
ccfClass
is_important
ccfClass
is_important
ccfClass
is_important
ccfClass
is_important
ccfClass
is_important
ccfClass
is_important
ccfClass
is_important
ccfClass
is_important
ccfClass
is_important
ccfClass
is_important
ccfCl

WSDM{'level': 'B', 'field': '数据库／数据挖掘／内容检索', 'type': '会议'}
DOI
Neural Graph Collaborative Filtering
SIGIR{'level': 'C', 'field': '人机交互与普适计算', 'type': '会议'}
ccfClass
Neural Graph Collaborative Filtering
SIGIR{'level': 'C', 'field': '人机交互与普适计算', 'type': '会议'}
is_important
references
Neural Graph Collaborative Filtering
SIGIR{'level': 'C', 'field': '人机交互与普适计算', 'type': '会议'}
DOI
Meta-Graph Based Recommendation Fusion over Heterogeneous Information Networks
Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining{'level': 'A', 'field': '数据库／数据挖掘／内容检索', 'type': '会议'}
ccfClass
Meta-Graph Based Recommendation Fusion over Heterogeneous Information Networks
Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining{'level': 'A', 'field': '数据库／数据挖掘／内容检索', 'type': '会议'}
is_important
references
Meta-Graph Based Recommendation Fusion over Heterogeneous Information Networks
Proceedings of the 23rd ACM SIGKDD Internatio

In [41]:
# 数据处理
authors = set()
doi = set()
# for article in all_arr:
#     authors.update(article['author'])
for refs in all_arr[0]['references']:
    print(refs['ref'])
    print(get_crossref_info(refs['ref']))
    
#     doi.add(article['DOI'])
# authors_list = list(authors)
# authors_list.sort()
# print(authors_list)
print(doi)

[1] Adamic, Lada A. and Bernardo A. Huberman. (2002). Zipf's Law and the Internet. Glottometrics. 3. 143--150.
Resource not found.
Resource not found.
[2] Ames, Morgan. (2006). The Social Life of Snapshots. Thesis. UC Berkeley School of Information Mgt. and Systems.
Resource not found.
Resource not found.
[3] Marko Balabanović , Lonny L. Chu , Gregory J. Wolff, Storytelling with digital photographs, Proceedings of the SIGCHI conference on Human Factors in Computing Systems, p.564-571, April 01-06, 2000, The Hague, The Netherlands  [doi>10.1145/332040.332505]
{
 "issued": {
 "date-parts": [
 2000
]
},
 "link": [
 {
 "intended-application": "crawler-based",
 "content-version": "vor",
 "content-type": "unspecified",
 "URL": "http://dl.acm.org/ft_gateway.cfm?id=332505&amp;ftid=528&amp;dwn=1"
}
],
 "score": 1,
 "prefix": "http://id.crossref.org/10.1145",
 "author": [
 {
 "family": "Balabanovi\u0107",
 "given": "Marko"
},
 {
 "family": "Chu",
 "given": "Lonny L."
},
 {
 "family": "Wolff",
 "

{
 "license": [
 {
 "content-version": "tdm",
 "URL": "https://www.elsevier.com/tdm/userlicense/1.0/",
 "delay-in-days": 0,
 "date-parts": [
 2005,
 7,
 1
],
 "timestamp": 1120190400000
}
],
 "issued": {
 "date-parts": [
 2005,
 7
]
},
 "link": [
 {
 "intended-application": "text-mining",
 "content-version": "vor",
 "content-type": "text/xml",
 "URL": "https://api.elsevier.com/content/article/PII:S0378873305000092?httpAccept=text/xml"
},
 {
 "intended-application": "text-mining",
 "content-version": "vor",
 "content-type": "text/plain",
 "URL": "https://api.elsevier.com/content/article/PII:S0378873305000092?httpAccept=text/plain"
}
],
 "score": 1,
 "prefix": "http://id.crossref.org/10.1016",
 "author": [
 {
 "family": "Fu",
 "given": "Yang-chih"
}
],
 "container-title": "Social Networks",
 "citedby-count": 53,
 "reference-count": 30,
 "page": "169-186",
 "deposited": {
 "date-parts": [
 2019,
 1,
 28
],
 "timestamp": 1548674716000
},
 "created": {
 "date-parts": [
 2005,
 3,
 1
],
 "is

{
 "issued": {
 "date-parts": [
 2005,
 4
]
},
 "link": [
 {
 "intended-application": "crawler-based",
 "content-version": "vor",
 "content-type": "unspecified",
 "URL": "http://xplorestaging.ieee.org/ielx5/7756/30819/01427648.pdf?arnumber=1427648"
}
],
 "score": 1,
 "prefix": "http://id.crossref.org/10.1109",
 "author": [
 {
 "family": "Kindberg",
 "given": "T."
},
 {
 "family": "Spasojevic",
 "given": "M."
},
 {
 "family": "Fleck",
 "given": "R."
},
 {
 "family": "Sellen",
 "given": "A."
}
],
 "container-title": "IEEE Pervasive Computing",
 "citedby-count": 109,
 "reference-count": 12,
 "page": "42-50",
 "deposited": {
 "date-parts": [
 2017,
 3,
 15
],
 "timestamp": 1489548332000
},
 "created": {
 "date-parts": [
 2005,
 5,
 11
],
 "issue": "2",
 "title": "The Ubiquitous Camera: An In-Depth Study of Camera Phone Use",
 "subtitle": "",
 "type": "journal_article",
 "DOI": "10.1109/MPRV.2005.42",
 "ISSN": [
 "15361268"
],
 "URL": "https://doi.org/10.1109/MPRV.2005.42",
 "source": "Cros

KeyboardInterrupt: 

In [5]:
# 数据写入
with open( TRANSFORMED_DIR + OUTPUT,'w') as o:
    o.write(json.dumps(all_arr,ensure_ascii=False ))

In [12]:
# 数据处理并写入 group by topic
gbts_arr = []
for fname in addr_list:
    gbt_obj = {}
    addr = RAW_DIR + fname
#     添加话题
    en_topic,cn_topic = fname.replace(".txt","").split("_")
    topic = {"en":en_topic,"cn":cn_topic}
    gbt_obj["topic"] = topic
#     话题下论文
    with open(addr,'r') as f:
        farr = json.load(f)
        gbt_obj["papers"] = farr
        

    gbts_arr.append(dict(gbt_obj))
    
with open(TRANSFORMED_DIR + GBTOUTPUT,'w') as o:
    o.write(json.dumps(gbts_arr[2],ensure_ascii=False))


In [17]:
# 数据处理并写入
title_arr = []
for fname in addr_list:
    addr = RAW_DIR + fname
    with open(addr,'r') as f:
        farr = json.load(f)
        for fobj in farr:
            title_arr.append(str(fobj["title"]))

save_tran_json(TITLEOUTPUT,title_arr)
print(len(title_arr))


361
