Skip to content

Commit

Permalink
update scraping rules
Browse files Browse the repository at this point in the history
  • Loading branch information
jaanisoe committed Jan 14, 2020
1 parent 771f738 commit 9b6fd4e
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 25 deletions.
13 changes: 6 additions & 7 deletions core/src/main/resources/scrape/journals.csv
Original file line number Diff line number Diff line change
Expand Up @@ -334,10 +334,9 @@ pmid,pmcid,doi,title,keywords,abstract,fulltext,links,site
0,0,25,85,3,1342,0,2,https://doi.org/10.1142/S0219720014420050

# f1000research_articles
# can't extract title
0,0,31,0,0,1283,15079,2,https://doi.org/10.12688/f1000research.2-192.v1
0,0,29,0,0,1216,20714,2,https://doi.org/10.12688/f1000research.5165.2
0,0,30,0,0,837,8968,2,https://doi.org/10.12688/f1000research.11223.1
0,0,31,86,0,1283,15167,2,https://doi.org/10.12688/f1000research.2-192.v1
0,0,29,100,0,1216,20816,2,https://doi.org/10.12688/f1000research.5165.2
0,0,30,71,0,837,9041,2,https://doi.org/10.12688/f1000research.11223.1

# f1000research_posters
# pdf_a not working (because href="#")
Expand Down Expand Up @@ -472,8 +471,8 @@ pmid,pmcid,doi,title,keywords,abstract,fulltext,links,site
0,0,29,86,0,496,0,1,https://doi.org/10.3233/978-1-61499-769-6-182

# researchgate
0,0,0,89,0,1608,0,1,https://doi.org/10.13140/RG.2.1.2763.4807
0,0,0,73,0,1435,0,1,https://doi.org/10.13140/RG.2.1.3547.6561
0,0,0,89,0,1608,0,2,https://doi.org/10.13140/RG.2.1.2763.4807
0,0,0,73,0,1435,0,2,https://doi.org/10.13140/RG.2.1.3547.6561

# frontiersin
0,0,24,215,0,0,7809,2,https://doi.org/10.3389/FGENE.2014.00130
Expand Down Expand Up @@ -508,7 +507,7 @@ pmid,pmcid,doi,title,keywords,abstract,fulltext,links,site
0,0,22,112,0,1600,76501,2,https://doi.org/10.1105/TPC.113.121913

# bloodjournal, now ashpublications.org
0,0,28,67,2,0,5035,2,https://doi.org/10.1182/BLOOD-2010-04-282616
0,0,28,67,2,0,5034,2,https://doi.org/10.1182/BLOOD-2010-04-282616

# bloodadvances, now ashpublications.org
0,0,32,107,12,1950,34205,2,https://doi.org/10.1182/BLOODADVANCES.2016000794
Expand Down
13 changes: 7 additions & 6 deletions core/src/main/resources/scrape/journals.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -164,15 +164,15 @@ citeseerx:
wiley:
doi: .article-citation .epub-doi
title: .article-citation .citation__title
#keywords: .keywords li # not working, even if javascript enabled
keywords: .keywords li # not working, even if javascript enabled
abstract: .article-section__abstract > .article-section__content > *
fulltext_a: .coolBar .go-to-full
pdf_a: .coolBar .PdfLink > .pdf-download

wiley_full:
doi: .article-citation .epub-doi
title: .article-citation .citation__title
#keywords: .keywords li # not working, even if javascript enabled
keywords: .keywords li # not working, even if javascript enabled
abstract: .article-section__abstract > .article-section__content > *
fulltext: .article-section__full > [id].article-section__content > :not(.article-section__sub-content), .article-section__full > .article-section__supporting, .article-section__full .article-section__sub-content > :not(.article-section__sub-content)
pdf_a: .coolBar .PdfLink > .pdf-download
Expand All @@ -182,8 +182,8 @@ sciencedirect:
title: article h1 > .title-text
keywords: article .Keywords > div > .keyword
abstract: article .Abstracts > div:not(.graphical) > div
fulltext: article .Body > div:not(.Appendices) > *
pdf_a: .PdfDropDownMenu a:matchesOwn(^Article$)
fulltext: article .Body > div:not(.Appendices) > * # not working
pdf_a: .PdfDropDownMenu a:matchesOwn(^Article$) # not working

biomedcentral:
doi: '#article-info-content [data-track-action=view doi]'
Expand Down Expand Up @@ -363,7 +363,7 @@ iucr:

f1000research_articles:
doi: .article-information > [data-test-id=box-how-to-cite] > a[href^=https://doi.org]
#title: '#anchor-title' # has non-excludable information about approval as part of the title
title: '#anchor-title > h1'
abstract: .abstract-text
fulltext: '#article-context > .generated-article-body > *, #article-context > .generated-article-footer > :has(h2:containsOwn(Supplementary materials))'
pdf_src: /*$
Expand All @@ -373,6 +373,7 @@ f1000research_posters:
title: .asset-title > h1
keywords_split: .asset-subcontainer__title:containsOwn(Keywords) + *
abstract: .abstract__content
# pdf_a not working (because href="#")

peerj:
pmid: '#article-identifier-pmid'
Expand Down Expand Up @@ -524,7 +525,7 @@ researchgate:
#doi: .publication-details__section > .publication-meta > div:first-child + div
title: .publication-header > h1, .publication-details__section > h1
abstract: .publication-abstract > :not(:first-child), .publication-details__section > div > div:matches(^Abstract$) + div
#pdf_a: .publication-resources-summary--buttons > a.publication-header-full-text, a.gtm-fulltext-download-btn-section
pdf_a: a:matches(^Download full-text PDF$)

frontiersin:
doi: .abstract-container > .article-header-container > .header-bar-three > a
Expand Down
14 changes: 8 additions & 6 deletions core/src/main/resources/scrape/webpages.csv
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,16 @@ title,content,license,language,webpage
4,493,0,0,https://github.com/chapmanb/bcbb/blob/master/nextgen/README.md

# galaxy.pasteur.fr
# java.lang.NullPointerException: JavascriptThread has not created a Document!
0,0,0,0,https://galaxy.pasteur.fr/tool_runner?tool_id=toolshed.pasteur.fr/repos/odoppelt/taxonomy_analysis/taxoptimizer/0.0.2
0,0,0,0,https://galaxy.pasteur.fr/root?tool_id=aggregate_scores_in_intervals2
0,0,0,0,https://galaxy.pasteur.fr/root?tool_id=clustalw
0,0,0,0,https://galaxy.pasteur.fr/root?tool_id=toolshed.g2.bx.psu.edu%2Frepos%2Fdevteam%2Fbam_to_sam%2Fbam_to_sam%2F1.0.3
0,0,0,0,https://galaxy.pasteur.fr/root?tool_id=toolshed.pasteur.fr%2Frepos%2Fafelten%2Fmicrobiome_analyses%2FCD-HIT%2F4.6.1
100,1845,0,0,https://galaxy.pasteur.fr/root?tool_id=aggregate_scores_in_intervals2
85,1425,0,0,https://galaxy.pasteur.fr/root?tool_id=clustalw
50,187,0,0,https://galaxy.pasteur.fr/root?tool_id=toolshed.g2.bx.psu.edu%2Frepos%2Fdevteam%2Fbam_to_sam%2Fbam_to_sam%2F1.0.3
88,2368,0,0,https://galaxy.pasteur.fr/root?tool_id=toolshed.pasteur.fr%2Frepos%2Fafelten%2Fmicrobiome_analyses%2FCD-HIT%2F4.6.1
# java.lang.NullPointerException: JavascriptThread has not created a Document!
0,0,0,0,https://galaxy.pasteur.fr/root?tool_id=toolshed.pasteur.fr/repos/khillion/salmonella_crispr_typing/salmonella_crispr_typing/1.0.0
0,0,0,0,https://galaxy.pasteur.fr/root?tool_id=vcf_intersect
0,0,0,0,https://galaxy.pasteur.fr/tool_runner?tool_id=CONVERTER_genbank_to_fasta
75,774,0,0,https://galaxy.pasteur.fr/root?tool_id=vcf_intersect
101,73,0,0,https://galaxy.pasteur.fr/tool_runner?tool_id=CONVERTER_genbank_to_fasta

# emboss.open-bio.org
# begins wrongly with "The master copies of EMBOSS documentation are available at on the EMBOSS Wiki."
Expand Down
10 changes: 4 additions & 6 deletions core/src/main/resources/scrape/webpages.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,10 @@
title: h1 > [itemprop=name]
content: .repository-content .blob-wrapper > :not([aria-hidden=true]), .repository-content .blob

# HtmlUnit currently can't execute the JavaScript
'galaxy\.pasteur\.fr/+(root|tool_runner)':
# title: '#center .portlet-title-text'
# content: '#center .portlet-body .ui-form-title-text, #center .ui-form-help'
title: ''
content: ''
# javascript: 'true'
title: '#center .portlet-title-text'
content: '#center .portlet-body .ui-form-title-text, #center .ui-form-help'
javascript: 'true'

'emboss\.open-bio\.org/+rel/':
title: body > :first-child
Expand Down Expand Up @@ -130,6 +127,7 @@
# content: div[class^=Page__PageContent] header + div > div > div > p, div[class^=Page__PageContent] article
title: ''
content: ''
# javascript: 'true'

'sanger\.ac\.uk':
title: '#main-content > .pagetitle'
Expand Down

0 comments on commit 9b6fd4e

Please sign in to comment.