Skip to content

Commit

Permalink
support getting corresponding authors from DOIs
Browse files Browse the repository at this point in the history
  • Loading branch information
jaanisoe committed Jan 17, 2020
1 parent 9b6fd4e commit e690645
Show file tree
Hide file tree
Showing 8 changed files with 613 additions and 447 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ public class Fetcher {

// Case-insensitive match, anchored at the start of the input, of a "keyword"/"keywords" label:
// optional Unicode separator (\p{Z}) or control (\p{Cc}) characters, the label, then any number of colons and more such characters.
private static final Pattern KEYWORDS_BEGIN = Pattern.compile("(?i)^[\\p{Z}\\p{Cc}]*keywords?[\\p{Z}\\p{Cc}]*:*[\\p{Z}\\p{Cc}]*");
// Exactly one of the characters ',', ';' or '|'.
private static final Pattern SEPARATOR = Pattern.compile("[,;|]");
// Same shape as KEYWORDS_BEGIN but for a leading "mailto" label; setCorrespAuthor uses it to strip that prefix from e-mail link targets.
private static final Pattern MAILTO_BEGIN = Pattern.compile("(?i)^[\\p{Z}\\p{Cc}]*mailto[\\p{Z}\\p{Cc}]*:*[\\p{Z}\\p{Cc}]*");

// Case-insensitive "pmid" label, optional separator/control characters and colons, then capture group 1 holding whatever PubFetcher.PMID matches.
// Not anchored, so the label may be found anywhere in the input.
private static final Pattern PMID_EXTRACT = Pattern.compile("(?i)pmid[\\p{Z}\\p{Cc}]*:*[\\p{Z}\\p{Cc}]*(" + PubFetcher.PMID.pattern() + ")");
// Same as PMID_EXTRACT, but for a "pmcid" label and the PubFetcher.PMCID pattern.
private static final Pattern PMCID_EXTRACT = Pattern.compile("(?i)pmcid[\\p{Z}\\p{Cc}]*:*[\\p{Z}\\p{Cc}]*(" + PubFetcher.PMCID.pattern() + ")");
Expand All @@ -126,7 +127,7 @@ public class Fetcher {
// Whole-string match of a ScienceDirect URL: http or https, optional "www." prefix, host sciencedirect.com and a non-empty path.
private static final Pattern SCIENCEDIRECT = Pattern.compile("^https?://(www\\.)?sciencedirect\\.com/.+$");
// Base of a ScienceDirect article link; presumably a PII is appended to it — confirm at the use site.
private static final String SCIENCEDIRECT_LINK = "https://www.sciencedirect.com/science/article/pii/";

// NOTE(review): the two F1000_DOI declarations below are presumably the removed and the added side of this diff hunk;
// only one of them can exist in the actual class.
// NOTE(review): in both patterns the '.' in "10.12688" is unescaped, so it matches any character rather than only a literal dot.
// First form: DOIs whose suffix starts with "F1000RESEARCH.".
private static final Pattern F1000_DOI = Pattern.compile("^10.12688/F1000RESEARCH\\..+$");
// Second form: any suffix under 10.12688/ that contains at least two literal dots separating non-empty parts.
private static final Pattern F1000_DOI = Pattern.compile("^10.12688/.+\\..+\\..+$");

// Static, non-final, mutable set shared by all Fetcher instances; presumably the hosts currently being accessed — confirm against ActiveHost usage.
private static Set<ActiveHost> activeHosts = new HashSet<>();

Expand Down Expand Up @@ -1307,6 +1308,83 @@ private void setCorrespAuthor(Publication publication, Element element, String l
}
}

/**
 * Fills in the corresponding authors of {@code publication} from scraped elements, unless the
 * publication already has corresponding authors or neither selector is usable.
 * <p>
 * Names are the text of the elements returned by {@code getAll} for {@code names}; e-mails are the
 * {@code abs:href} attribute of the elements returned for {@code emails}, with a leading
 * "mailto:" label removed. Repeated e-mail blocks are collapsed first, then names and e-mails are
 * paired by position. If the counts still differ, the names are discarded and only e-mails are stored.
 *
 * @param publication the publication whose corresponding authors are set
 * @param element the element handed to {@code getAll} together with each selector
 * @param names selector for the author name elements; may be {@code null} or blank
 * @param emails selector for the author e-mail link elements; may be {@code null} or blank
 * @param location passed on to {@code getAll} and used in the log messages
 */
private void setCorrespAuthor(Publication publication, Element element, String names, String emails, String location) {
    if (!publication.getCorrespAuthor().isEmpty()) {
        return;
    }
    boolean namesGiven = names != null && !names.trim().isEmpty();
    boolean emailsGiven = emails != null && !emails.trim().isEmpty();
    if (!namesGiven && !emailsGiven) {
        return;
    }

    List<String> authorNames = null;
    if (namesGiven) {
        authorNames = getAll(element, names, location, true).stream()
            .map(nameElement -> nameElement.text())
            .collect(Collectors.toList());
    }
    List<String> authorEmails = null;
    if (emailsGiven) {
        authorEmails = getAll(element, emails, location, true).stream()
            .map(link -> link.attr("abs:href").trim())
            .map(href -> MAILTO_BEGIN.matcher(href).replaceFirst(""))
            .collect(Collectors.toList());
    }

    int nameCount = authorNames == null ? 0 : authorNames.size();
    int emailCount = authorEmails == null ? 0 : authorEmails.size();

    // The e-mail list may be one block repeated: n names with n*n e-mails, or a single name with several e-mails.
    boolean squareOfNames = nameCount > 1 && emailCount == nameCount * nameCount;
    boolean severalForOneName = nameCount == 1 && emailCount > 1;
    if (nameCount > 0 && emailCount > 0 && (squareOfNames || severalForOneName)) {
        boolean repeated = true;
        for (int j = nameCount; j < emailCount; ++j) {
            // every later position must repeat the e-mail at the same offset within the first block
            if (!authorEmails.get(j % nameCount).equals(authorEmails.get(j))) {
                repeated = false;
                break;
            }
        }
        if (repeated) {
            authorEmails = authorEmails.subList(0, nameCount);
        }
    }

    // Several identical e-mails collapse to one, unless they line up with an equally long list of identical names.
    // NOTE(review): if distinct names share one e-mail, the e-mails collapse here and the names are
    // then discarded by the count check below — confirm this is intended.
    if (authorEmails != null && authorEmails.size() > 1) {
        String firstEmail = authorEmails.get(0);
        boolean allEmailsSame = authorEmails.stream().allMatch(firstEmail::equals);
        boolean allNamesSame = false;
        if (authorNames != null && authorNames.size() == authorEmails.size()) {
            String firstName = authorNames.get(0);
            allNamesSame = authorNames.stream().allMatch(firstName::equals);
        }
        if (allEmailsSame && !allNamesSame) {
            authorEmails = new ArrayList<>();
            authorEmails.add(firstEmail);
        }
    }

    boolean anyNames = authorNames != null && !authorNames.isEmpty();
    boolean anyEmails = authorEmails != null && !authorEmails.isEmpty();
    if (!anyNames && !anyEmails) {
        logger.warn("No corresponding authors found in {}", location);
        return;
    }
    if (anyNames && anyEmails && authorNames.size() != authorEmails.size()) {
        logger.warn("Discarding corresponding author names as number of names ({}) is not equal to number of e-mails ({}) in {}", authorNames.size(), authorEmails.size(), location);
        authorNames = null;
        anyNames = false;
    }

    // At this point at least one list is non-empty, and if both are, they have the same length.
    int authorCount = anyNames ? authorNames.size() : authorEmails.size();
    List<CorrespAuthor> correspAuthor = new ArrayList<>();
    for (int i = 0; i < authorCount; ++i) {
        CorrespAuthor ca = new CorrespAuthor();
        if (anyNames) {
            ca.setName(authorNames.get(i));
        }
        if (anyEmails) {
            ca.setEmail(authorEmails.get(i));
        }
        correspAuthor.add(ca);
    }
    publication.setCorrespAuthor(correspAuthor);
}

private boolean isFinal(Publication publication, PublicationPartName[] names, EnumMap<PublicationPartName, Boolean> parts, boolean oa, FetcherArgs fetcherArgs) {
for (PublicationPartName name : names) {
if (!publication.getPart(name).isFinal(fetcherArgs) && (parts == null || (parts.get(name) != null && parts.get(name)))) {
Expand Down Expand Up @@ -2238,6 +2316,8 @@ void fetchSite(Publication publication, String url, PublicationPartType type, St
for (String pdfHrefA : pdfHrefsA) {
links.add(pdfHrefA, type.toPdf(), finalUrl, publication, fetcherArgs, false);
}

setCorrespAuthor(publication, doc, scrape.getSelector(site, ScrapeSiteKey.corresp_author_names), scrape.getSelector(site, ScrapeSiteKey.corresp_author_emails), doc.location());
} else {
logger.warn("No scrape rules for {}", finalUrl);
if (parts == null || (parts.get(PublicationPartName.title) != null && parts.get(PublicationPartName.title))) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,7 @@ private static int testSite(String[] test, Publication publication, EnumMap<Publ
mismatch += equal(test[6], publication.getFulltext().getContent().length(), "fulltext length");
}
mismatch += equal(test[7], publication.getVisitedSites().size(), "visited sites size");
mismatch += equal(test[8], CorrespAuthor.toString(publication.getCorrespAuthor()).length(), "corresponding author length");
return mismatch;
}

Expand Down Expand Up @@ -426,14 +427,14 @@ private static void filterTests(List<String[]> tests, String regex, int column)

private static void testSite(Fetcher fetcher, EnumMap<PublicationPartName, Boolean> parts, FetcherArgs fetcherArgs, String regex) throws IOException, ReflectiveOperationException {
int mismatch = 0;
List<String[]> tests = getTest("journals.csv", "scrape", 9);
filterTests(tests, regex, 8);
List<String[]> tests = getTest("journals.csv", "scrape", 10);
filterTests(tests, regex, 9);
int i = 0;
long start = System.currentTimeMillis();
for (String[] test : tests) {
++i;
logger.info("Test {} {}", test[8], PubFetcher.progress(i, tests.size(), start));
Publication publication = fetchSite(test[8], fetcher, parts, fetcherArgs);
logger.info("Test {} {}", test[9], PubFetcher.progress(i, tests.size(), start));
Publication publication = fetchSite(test[9], fetcher, parts, fetcherArgs);
mismatch += testSite(test, publication, parts);
}
if (mismatch == 0) logger.info("OK");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

import java.util.ArrayList;
import java.util.EnumMap;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
Expand All @@ -33,6 +32,7 @@
import org.jsoup.select.Elements;

import org.edamontology.pubfetcher.core.common.FetcherArgs;
import org.edamontology.pubfetcher.core.db.publication.CorrespAuthor;
import org.edamontology.pubfetcher.core.db.publication.Publication;
import org.edamontology.pubfetcher.core.db.publication.PublicationPartName;
import org.edamontology.pubfetcher.core.db.publication.PublicationPartType;
Expand All @@ -49,6 +49,7 @@ public final class HtmlMeta {
// Selector strings for meta names; each is produced by selectorCombinations, which is defined outside this view.
// Selector for the "citation_abstract" name.
private static final String CITATION_ABSTRACT_SELECTOR = selectorCombinations("citation_abstract");
// Either "citation_fulltext_html_url" or "citation_full_html_url": two selector groups joined with ", ".
private static final String CITATION_FULLTEXT_SELECTOR = selectorCombinations("citation_fulltext_html_url") + ", " + selectorCombinations("citation_full_html_url");
// Selector for the "citation_pdf_url" name.
private static final String CITATION_FULLTEXT_PDF_SELECTOR = selectorCombinations("citation_pdf_url");
// Selector for the "citation_author_email" name; fillWith passes it to setCorrespAuthor.
private static final String CITATION_AUTHOR_EMAIL_SELECTOR = selectorCombinations("citation_author_email");

// Selectors for the "eprints.*" names.
private static final String EPRINTS_PMID_SELECTOR = selectorCombinations("eprints.pubmed_id");
private static final String EPRINTS_TITLE_SELECTOR = selectorCombinations("eprints.title");
Expand Down Expand Up @@ -82,7 +83,6 @@ public final class HtmlMeta {

// Whole-string URL patterns for specific publisher sites; all accept http or https and require a non-empty remainder after the host.
// Any host ending in "biomedcentral.com" (optional subdomain characters before it).
private static final Pattern BIOMEDCENTRAL = Pattern.compile("^https?://[a-zA-Z0-9.-]*biomedcentral\\.com/.+$");
// Host starting with "citeseerx." (optional "www." prefix); anything may follow the dot.
private static final Pattern CITESEERX = Pattern.compile("^https?://(www\\.)?citeseerx\\..+$");
// f1000research.com with optional "www." prefix.
// NOTE(review): presumably the removed side of this diff hunk (the hunk header shrinks by one line) — confirm.
private static final Pattern F1000 = Pattern.compile("^https?://(www\\.)?f1000research\\.com/.+$");
// nature.com with optional "www." prefix.
private static final Pattern NATURE = Pattern.compile("^https?://(www\\.)?nature\\.com/.+$");
// Any host ending in "onlinelibrary.wiley.com".
private static final Pattern WILEY = Pattern.compile("^https?://[a-zA-Z0-9.-]*onlinelibrary\\.wiley\\.com/.+$");
// Any host ending in "sciencemag.org".
private static final Pattern SCIENCEMAG = Pattern.compile("^https?://[a-zA-Z0-9.-]*sciencemag\\.org/.+$");
Expand Down Expand Up @@ -148,11 +148,6 @@ private static void setKeywords(Publication publication, Document doc, Publicati
if (type.isBetterThan(publication.getKeywords().getType())) {
Elements metaKeywords = doc.select(keywordsSelector);
if (!metaKeywords.isEmpty()) {
if (F1000.matcher(doc.location()).matches()) {
for (Iterator<Element> it = metaKeywords.iterator(); it.hasNext(); ) {
if (it.next().attr("content").indexOf('|') < 0) it.remove();
}
}
List<String> keywords = metaKeywords.stream()
.flatMap(k -> SEPARATOR.splitAsStream(k.attr("content")))
.collect(Collectors.toList());
Expand Down Expand Up @@ -189,6 +184,21 @@ private static void addLinks(Publication publication, Document doc, PublicationP
}
}

/**
 * Sets the corresponding authors of {@code publication} from the e-mails found in meta elements,
 * unless the publication already has corresponding authors.
 * <p>
 * Each element selected by {@code authorEmailSelector} contributes the trimmed value of its
 * {@code content} attribute; blank values are skipped. Every remaining e-mail becomes one
 * {@code CorrespAuthor} with only the e-mail set. Nothing is logged or stored if no non-blank
 * e-mail is found.
 *
 * @param publication the publication whose corresponding authors are set
 * @param doc the document the selector is applied to; its location is used in the log message
 * @param type the type reported in the log message
 * @param authorEmailSelector selector for the meta elements carrying author e-mails
 */
private static void setCorrespAuthor(Publication publication, Document doc, PublicationPartType type, String authorEmailSelector) {
    if (!publication.getCorrespAuthor().isEmpty()) {
        return;
    }
    List<CorrespAuthor> correspAuthor = doc.select(authorEmailSelector).stream()
        .map(meta -> meta.attr("content").trim())
        .filter(email -> !email.isEmpty())
        .map(email -> {
            CorrespAuthor ca = new CorrespAuthor();
            ca.setEmail(email);
            return ca;
        })
        .collect(Collectors.toList());
    // Matched elements may all have blank content; only report and store a result if an e-mail survived the filter.
    if (!correspAuthor.isEmpty()) {
        logger.info(" Found corresp. authors from meta {} in {}", type, doc.location());
        publication.setCorrespAuthor(correspAuthor);
    }
}

/**
 * Picks between the type of a meta source and the type of the enclosing fetch.
 *
 * @param metaType the type associated with the meta tag family
 * @param type the type to compare against
 * @return {@code type} if {@code metaType.isBetterThan(type)}, otherwise {@code metaType}
 */
private static PublicationPartType chooseType(PublicationPartType metaType, PublicationPartType type) {
    if (metaType.isBetterThan(type)) {
        return type;
    }
    return metaType;
}
Expand All @@ -208,6 +218,7 @@ static void fillWith(Publication publication, Document doc, PublicationPartType
setAbstract(publication, doc, citationType, CITATION_ABSTRACT_SELECTOR, fetcherArgs, parts);
addLinks(publication, doc, citationLinkType, CITATION_FULLTEXT_SELECTOR, links, fetcherArgs);
addLinks(publication, doc, citationPdfType, CITATION_FULLTEXT_PDF_SELECTOR, links, fetcherArgs);
setCorrespAuthor(publication, doc, type, CITATION_AUTHOR_EMAIL_SELECTOR);

// eprints
PublicationPartType eprintsType = chooseType(PublicationPartType.eprints, type);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ public enum ScrapeSiteKey {
fulltext_a,
pdf_src,
pdf_dst,
pdf_a;
pdf_a,
corresp_author_names,
corresp_author_emails;

private String key;

Expand Down

0 comments on commit e690645

Please sign in to comment.