Skip to content

Commit

Permalink
Discard some Dissemin suggestions, use more from Unpaywall
Browse files Browse the repository at this point in the history
* Discard URL suggestions from Dissemin papers which have more than
  2 DOIs, which are often the result of overmerging (such as many
  different works with the same or very similar title merged into
  a single record).
* Prefer the DOI from the wikitext reference when searching Unpaywall,
  rather than the DOI from the Dissemin record. (It is unclear whether the
  latter even works: the Dissemin API did not return such a field for a
  test DOI.)
* Use all URL suggestions from Unpaywall rather than just the best one:
  this makes it possible to get repository and PMC suggestions even where
  the publisher version is considered OA and "best".

Bug: T228666
  • Loading branch information
nemobis committed Jul 22, 2019
1 parent 77c0433 commit e3c74bf
Showing 1 changed file with 26 additions and 17 deletions.
43 changes: 26 additions & 17 deletions src/oabot/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,8 @@ def propose_change(self):
dissemin_paper_object = get_dissemin_paper(reference)

# Otherwise, try to get a free link
link = get_oa_link(dissemin_paper_object)
doi = reference.get('ID_list', {}).get('DOI')
link = get_oa_link(paper=dissemin_paper_object, doi=doi)
if not link:
self.classification = 'not_found'
return
Expand Down Expand Up @@ -202,7 +203,7 @@ def get_dissemin_paper(reference):
headers={'User-Agent':OABOT_USER_AGENT},
timeout=10)

resp = req.json()
resp = req.json()
paper_object = resp.get('paper', {})

return paper_object
Expand All @@ -219,20 +220,28 @@ def get_paper_values(paper, attribute):

return None

def get_oa_link(paper):
def get_oa_link(paper, doi=None):

doi = paper.get('doi')
if doi is not None:
doi = "/".join(doi.split("/")[-2:])
if not doi:
doi = paper.get('doi')
if doi is not None:
doi = "/".join(doi.split("/")[-2:])

# Get all the URLs which Dissemin considers to be full-text links
candidate_urls = ([
record.get('pdf_url') for record in
paper.get('records',[]) if record.get('pdf_url')
])

# then, try OAdoi
# (OAdoi finds full texts that dissemin does not, so it's always good to have!)
dissemin_dois = set([ record.get('doi') for record in
paper.get('records',[]) if record.get('doi') ])
if len(dissemin_dois) > 2:
# Do not use Dissemin suggestions: many DOIs suggest a risk of overmerged
# records. https://github.com/dissemin/dissemin/issues/512
candidate_urls = []
else:
# Get all the URLs which Dissemin considers to be full-text links
candidate_urls = ([
record.get('pdf_url') for record in
paper.get('records',[]) if record.get('pdf_url')
])

# Then, try OAdoi/Unpaywall
# (It finds full texts that Dissemin does not, so it's always good to have!)
if doi:
resp = None
attempts = 0
Expand All @@ -250,9 +259,9 @@ def get_oa_link(paper):
else:
continue

best_oa = (resp.get('best_oa_location') or {})
if best_oa.get('url') and best_oa.get('host_type') != 'publisher':
candidate_urls += best_oa['url']
for oa_location in resp.get('oa_locations') or []:
if oa_location.get('url') and oa_location.get('host_type') != 'publisher':
candidate_urls += oa_location['url']

# Full text detection is not always accurate, so we try to pick
# the URL which is most useful for citation templates and we
Expand Down

0 comments on commit e3c74bf

Please sign in to comment.