In [1]:
import asyncio
import requests
import numpy as np
import pandas as pd
from pprint import pprint
from collections import defaultdict
from copy import deepcopy
import pickle

In [2]:
def request_work_list(query=None, filters={}, params={}, max_results=0):
    
    default_params = {}
    default_params["mailto"] = "nils.bock@tu-bs.de"
    default_params["sort"] = "score"
    #default_params["sort"] = "issued"
    default_params["rows"] = 1000
    if max_results > 0 and max_results < default_params["rows"]:
        default_params["rows"] = max_results
    default_params["cursor"] = "*"
    
    req_params = {**default_params, **params}
    
    if query:
        if type(query) == str:
            req_params["query"] = query
        elif type(query) == dict:
            for k, v in query.items():
                if k.startswith("query."):
                    req_params[k] = v
                else:
                    req_params["query."+k] = v
    
    if filters:
        req_params["filter"] = ",".join(["{}:{}".format(k, v) for k, v in filters.items()])
        
    work_list = []
    
    while True:
        r = requests.get("https://api.crossref.org/works", params=req_params)

        r.raise_for_status()

        print("requesting", r.url)

        rJson = r.json()
        if rJson["status"] == "ok" and rJson["message-type"] == "work-list":
            msg = rJson["message"]
            total_results = msg["total-results"]
            new_items = msg["items"]
            work_list += new_items
            next_cursor = msg["next-cursor"]
            req_params["cursor"] = next_cursor
        else:
            print("error requesting", r.url)
            print(r.json)
        
        print("got {} of {} items".format(len(work_list), total_results))
        
        if max_results > 0:
            if len(work_list) >= max_results:
                work_list = work_list[:max_results]
                break
        
        if len(work_list) >= total_results:
            break
            
    
    return work_list

In [3]:
def request_work(doi, params={}):
    default_params = {}
    default_params["mailto"] = "nils.bock@tu-bs.de"
    
    req_params = {**default_params, **params}
    
    r = requests.get("https://api.crossref.org/works/"+doi, params=req_params)
    r.raise_for_status()

    #print("requesting", r.url)
    
    rJson = r.json()
    
    if rJson["status"] == "ok" and rJson["message-type"] == "work":
        msg = rJson["message"]
        #print(msg)
        return msg
    else:
        raise KeyError(doi)

#test_doi = wl[0]["DOI"]
#request_work(test_doi)

In [4]:
# test request_work_list
#wl = request_work_list("deep+learning", filters={"has-references": 1, "from-pub-date" : 2010}, max_results=1000)
#wl = request_work_list("schmidhuber", filters={"has-references": 1, "from-pub-date" : 1950}, max_results=0)
#wl = request_work_list("heuristic+heuristics", filters={"has-references": 1, "from-pub-date" : 2010}, max_results=5000)
#wl = request_work_list({"author": "vietor"}, max_results=1000)

query_list = ["topology optimization", "shape optimization", "structural optimization", "robust optimization", "truss optimization", "crashworthiness"]
query_list = [
    "additive manufacturing", 
    "laser cladding", 
    "electron beam melting", 
    "selective laser melting", 
    "selective laser sintering",
    "rapid prototyping",
    "rapid tooling",
    "fused deposition modeling"
]

wl = []
for q in query_list:
    wl += request_work_list(q, filters={"has-references": 1, "from-pub-date" : 2010}, max_results=5000)

requesting https://api.crossref.org/works?mailto=nils.bock%40tu-bs.de&sort=score&rows=1000&cursor=%2A&query=additive+manufacturing&filter=has-references%3A1%2Cfrom-pub-date%3A2010
got 1000 of 78570 items
requesting https://api.crossref.org/works?mailto=nils.bock%40tu-bs.de&sort=score&rows=1000&cursor=AoMIQWzjEX7Hp5GX4QI%2FDWh0dHA6Ly9keC5kb2kub3JnLzEwLjEwMzgvczQxNTk4LTAxNy0wNzQ0Ni04&query=additive+manufacturing&filter=has-references%3A1%2Cfrom-pub-date%3A2010
got 2000 of 78570 items
requesting https://api.crossref.org/works?mailto=nils.bock%40tu-bs.de&sort=score&rows=1000&cursor=AoMIQUwhjHWhpPTr4gI%2FD2h0dHA6Ly9keC5kb2kub3JnLzEwLjEwMTYvai5pZmFjb2wuMjAxNi4xMi4xNjM%3D&query=additive+manufacturing&filter=has-references%3A1%2Cfrom-pub-date%3A2010
got 3000 of 78570 items
requesting https://api.crossref.org/works?mailto=nils.bock%40tu-bs.de&sort=score&rows=1000&cursor=AoMIQSK2RXLc%2FrDq4gI%2FDmh0dHA6Ly9keC5kb2kub3JnLzEwLjEwMTYvai5tYXRwci4yMDE3LjExLjY0OQ%3D%3D&query=additive+manufacturing&filt

requesting https://api.crossref.org/works?mailto=nils.bock%40tu-bs.de&sort=score&rows=1000&cursor=AoMIQPysfnfg8KL13wI%2FCWh0dHA6Ly9keC5kb2kub3JnLzEwLjEwMDIvbWFyYy4yMDE0MDAwNjE%3D&query=rapid+tooling&filter=has-references%3A1%2Cfrom-pub-date%3A2010
got 2000 of 58240 items
requesting https://api.crossref.org/works?mailto=nils.bock%40tu-bs.de&sort=score&rows=1000&cursor=AoMIQPysfnnK88vK4gI%2FCWh0dHA6Ly9keC5kb2kub3JnLzEwLjEwMDIvbWFyYy4yMDExMDA4NzM%3D&query=rapid+tooling&filter=has-references%3A1%2Cfrom-pub-date%3A2010
got 3000 of 58240 items
requesting https://api.crossref.org/works?mailto=nils.bock%40tu-bs.de&sort=score&rows=1000&cursor=AoMIQNZKLXLY4I2t3wI%2FA2h0dHA6Ly9keC5kb2kub3JnLzEwLjEwMDIvcmNtLjQ5NjI%3D&query=rapid+tooling&filter=has-references%3A1%2Cfrom-pub-date%3A2010
got 4000 of 58240 items
requesting https://api.crossref.org/works?mailto=nils.bock%40tu-bs.de&sort=score&rows=1000&cursor=AoMIQNBZw3vPgOqv3wI%2FDWh0dHA6Ly9keC5kb2kub3JnLzEwLjExMDkvZXBlcHMuMjAxNS43MzQ3MTMw&query=rapid

In [5]:

def update_work_dict(work_dict, wl):
    for w in wl:
        doi = w["DOI"]
        work_dict[doi] = deepcopy(w)
    return work_dict

work_dict = {}
work_dict = update_work_dict(work_dict, wl)

In [6]:
def get_reference_dois(work):
    result = []
    if "reference" in work:
        for ref in work["reference"]:
            if "DOI" in ref:
                result.append(ref["DOI"])
    return result

In [7]:
def get_ref_counts(work_dict):
    ref_counts = defaultdict(int)
    for work in work_dict.values():
        #print(work)
        work_refs = get_reference_dois(work)
        #print(work_refs)
        #break
        for doi in work_refs:
            ref_counts[doi] += 1
    return ref_counts

In [None]:
ref_dois = set()
for w in wl:
    ref_dois.update(get_reference_dois(w))
len(ref_dois), len(ref_dois - work_dict.keys()), len(work_dict.keys())

In [None]:
len(work_dict)

In [None]:
dois_to_request = list(ref_dois - work_dict.keys())
len(dois_to_request)

In [8]:
import asyncio
import concurrent.futures
import requests
import time

def request_work_async(i, doi, N, params={}):
    
    max_retries = 10
    if (i % 100) == 0:
        print(i, "/", N)
        #print(i, "start")
    for k in range(max_retries):
        
        time.sleep(1)
        try:
            msg = request_work(doi)
            break
        except requests.HTTPError as e:
            print(e)
            status_code = e.response.status_code
            if status_code == 429:
                #asyncio.sleep(5)
                time.sleep(5)
            elif status_code == 404:
                return doi
            else:
                return doi
            
        #asyncio.sleep(1)
        
    #print(i, "done")
    return msg


async def main_request_loop(dois):

    items = []
    bad_dois = []
    N = len(dois)

    with concurrent.futures.ThreadPoolExecutor(max_workers=40) as executor:

        loop = asyncio.get_event_loop()
        futures = [
            loop.run_in_executor(
                executor, 
                request_work_async, 
                i,
                doi,
                N
            )
            for i, doi in enumerate(dois)
        ]    

        #print("loop done")

        i = 0

        for response in await asyncio.gather(*futures):
            #pprint (response)
            #print(i, "done")
            if response:
                if type(response) == str:
                    bad_dois.append(response)
                else:
                    items.append(response)
            i += 1

    #print ("main done")
    return items, bad_dois

def request_work_list_from_dois(dois_to_request):
    loop = asyncio.get_event_loop()
    items, bad_dois = loop.run_until_complete(main_request_loop(dois_to_request))

    #print(len(items), "/", len(dois_to_request))
    #print("done")
    return items, bad_dois
    #break

In [11]:
ref_counts = get_ref_counts(work_dict)

400 / 1000
500 / 1000
600 / 1000
700 / 1000
800 / 1000
900 / 1000


In [14]:
# update work_dict with referenced works

ref_dois = set()

for w in work_dict.values():
    ref_dois.update(get_reference_dois(w))
print(len(ref_dois), len(ref_dois - work_dict.keys()), len(work_dict.keys()))

dois_to_request = list(ref_dois - work_dict.keys())
print(len(dois_to_request))

min_ref_count = 3
dois_to_request = [doi for doi in dois_to_request if ref_counts[doi] >= min_ref_count]
print(len(dois_to_request))

time.sleep(1)
i = 0
k = 1000
work_list = []
while i < len(dois_to_request):
    j = i + k
    new_items, bad_dois = request_work_list_from_dois(dois_to_request[i:j])
    i = j
    
    work_list += new_items
    #work_dict = update_work_dict(work_dict, new_items)
    print(j, "/", len(dois_to_request))

work_dict = update_work_dict(work_dict, work_list)
ref_counts = get_ref_counts(work_dict)

len(work_dict), len(work_list)

291570 277700 42664
277700
15308
0 / 1000
100 / 1000
200 / 1000
300 / 1000
400 / 1000
500 / 1000
600 / 1000
700 / 1000
800 / 1000
900 / 1000
1000 / 15308
0 / 1000
100 / 1000
200 / 1000
300 / 1000
400 / 1000
500 / 1000
600 / 1000
700 / 1000
800 / 1000
900 / 1000
2000 / 15308
0 / 1000
100 / 1000
200 / 1000
300 / 1000
400 / 1000
500 / 1000
600 / 1000
700 / 1000
800 / 1000
900 / 1000
404 Client Error: Not Found for url: https://api.crossref.org/works/10.1002/1521-3773(20010417)40:8%3C1340::AID-ANIE1340%3E3.0.CO;2-?mailto=nils.bock%40tu-bs.de
3000 /0  15308/
 1000
100 / 1000
200 / 1000
300 / 1000
400 / 1000
500 / 1000
600 / 1000
700 / 1000
404 Client Error: Not Found for url: https://api.crossref.org/works/10.1002/1096-987X(20010130)22:2%3C178::AID-JCC5%3E3.0.CO;2-?mailto=nils.bock%40tu-bs.de
800 / 1000
900 / 1000
4000 /0  15308/
 1000
100 / 1000
200 / 1000
300 / 1000
400 / 1000
500 / 1000
600 / 1000
700 / 1000
800 / 1000
900 / 1000
5000 / 15308
0 / 1000
100 / 1000
404 Client Error: Not Fou

SSLError: HTTPSConnectionPool(host='api.crossref.org', port=443): Max retries exceeded with url: /works/10.1002/1521-4095(20020116)14:2%3C137::AID-ADMA137%3E3.0.CO;2-7?mailto=nils.bock%40tu-bs.de (Caused by SSLError(SSLError("bad handshake: SysCallError(-1, 'Unexpected EOF')",),))

In [16]:
pickle.dump(work_dict, open("work_dict.p", "wb"))

In [15]:
len(work_list)

8994

In [None]:
new_items, bad_dois = request_work_list_from_dois(dois_to_request[:])

In [None]:
#new_items = new_items[0]
len(new_items)


In [None]:
work_dict = update_work_dict(work_dict, new_items)

In [None]:
len(work_dict)

In [None]:
ref_counts = get_ref_counts(work_dict)

In [None]:
len(new_items), bad_dois

In [None]:
import pickle

In [None]:
pickle.dump(work_dict, open("work_dict.p", "wb"))

In [None]:
[1,2,3][1:5]