In [1]:
# Imports
from core.config import *
from core.search.mag_interface import *

# Base address of the Microsoft Academic Search (MAS) API
MAS_URL_PREFIX = "https://api.labs.cognitive.microsoft.com"
# Endpoint used by every query in this notebook. The URL is joined with an
# explicit "/" rather than os.path.join, because os.path.join uses the OS path
# separator (a backslash on Windows) and would produce an invalid URL there.
url = "{}/{}".format(MAS_URL_PREFIX.rstrip("/"), "academic/v1.0/evaluate")

In [2]:
# Look up the title ('Ti' attribute) of one paper, selected by its Id
paper_id = 2747135882

# Expression that matches exactly the entity whose Id equals paper_id
title_expr = 'Id={}'.format(paper_id)
query = {'expr': title_expr, 'count': 1, 'offset': 0, 'attributes': 'Ti'}

# Send the query as a GET request to the evaluate endpoint and show the raw reply
data = query_academic_search('get', url, query)
print(data)

{'expr': 'Id=2747135882', 'entities': [{'logprob': -21.015, 'Id': 2747135882, 'Ti': 'high flux optical systems for solar thermochemistry'}]}


In [3]:
# Find papers that cite paper_id: the expression 'RId=<id>' selects entities
# whose reference list (RId) contains that id, as the sample output shows.
# Only the first 100 matches are requested.
citing_expr = 'RId={}'.format(paper_id)
query = {'expr': citing_expr, 'count': 100, 'offset': 0, 'attributes': 'Id,RId'}

# Send the query as a GET request to the evaluate endpoint and show the raw reply
data = query_academic_search('get', url, query)
print(data)

{'expr': 'RId=2747135882', 'entities': [{'logprob': -21.64, 'Id': 2788750595, 'RId': [1516460738, 2052989605, 2008624509, 2126524277, 1986229972, 1986859383, 2121082943, 2074057609, 1969689829, 2060018992, 2073244242, 1986963983, 2015682556, 1971332796, 2084744394, 2044914162, 1998277481, 2169189912, 171260444, 2015827483, 2089124982, 1120801388, 2135006005, 1972591051, 1812729199, 2068015815, 2013008835, 2014484129, 2277555655, 2618550242, 2555990107, 1970594223, 2751893488, 2206597992, 1121021787, 2082425299, 2030114442, 829055464, 2530528476, 2513401070, 2081246734, 2747135882, 2551464536, 2738978407, 2080812797, 1911735896, 1977728070, 1965941474, 2612546432, 2774997576, 2777510846]}, {'logprob': -22.056, 'Id': 2792905515, 'RId': [1516460738, 2008624509, 1967712992, 1988933206, 1986229972, 2040229875, 1986859383, 2121082943, 2074057609, 2038310959, 2062117447, 1970920367, 2259119442, 2066617099, 2015682556, 2001300652, 2159388053, 2001924000, 2084744394, 2015827483, 1120801388, 213

In [19]:
# Update ES to remove the repeated citation
from graph.config              import conf
from core.search.query_utility import field_del
from elasticsearch             import Elasticsearch
from elasticsearch_dsl         import Search
from core.search.cache_data    import cache_paper_info

client = Elasticsearch(conf.get("elasticsearch.hostname"))

search = Search(index = 'paper_info', using = client)


def _dedupe_citations(citations):
    """Collapse citation entries that share a 'PaperId'.

    Args:
        citations: iterable of dicts, each carrying a 'PaperId' key.

    Returns:
        A new list with one entry per distinct 'PaperId'. When an id occurs
        several times the last entry seen is kept. On Python 3.7+ the result
        follows the order in which each id first appeared (dict insertion
        order), so the output is deterministic.
    """
    unique = dict()
    for cite_info in citations:
        unique[cite_info['PaperId']] = cite_info
    return list(unique.values())


# Papers whose citation list contained duplicates and must be re-cached
to_cache = list()
# Number of documents visited by the scan
processed = 0

for res in search.scan():
    paper_info = res.to_dict()
    processed += 1

    # NOTE(review): presumably strips the 'CreatedDate' field before the
    # document is re-cached -- confirm in core.search.query_utility.
    field_del(paper_info, 'CreatedDate')

    # A document without a 'Citations' field has nothing to de-duplicate;
    # treat it as empty instead of aborting the whole scan with a KeyError.
    citations = paper_info.get('Citations', [])
    new_cites = _dedupe_citations(citations)

    # A shorter list means at least one PaperId was repeated
    if len(new_cites) != len(citations):
        print(processed, paper_info['PaperId'], "---")
        paper_info['Citations'] = new_cites
        to_cache.append(paper_info)


In [20]:
# Report how many documents the scan visited and how many were collected in to_cache
summary_lines = [
    ("Total processed:", processed),
    ("To cache:", len(to_cache)),
]
for label, value in summary_lines:
    print(label, value)

Total processed: 20609
To cache: 0


In [18]:
import multiprocess

# Params for paper information
THREADS    = 8    # size of the worker pool
BATCH_SIZE = 40   # number of papers passed to each cache_paper_info call

# Split to_cache into consecutive slices of at most BATCH_SIZE papers
batches = (to_cache[i:i+BATCH_SIZE]
           for i in range(0, len(to_cache), BATCH_SIZE))

# Run the pool as a context manager so its worker processes are always
# released, even if a batch raises. map() blocks until every batch has been
# handled, so all results are collected before the pool is torn down.
with multiprocess.Pool(THREADS) as p:
    batch_res = p.map(cache_paper_info, batches)

Total processed: 20609
To cache: 5137
