
Commit

updated HTML downloads
eddotman committed Aug 29, 2016
1 parent 5f395f1 commit b14ab02
Showing 4 changed files with 90 additions and 9 deletions.
89 changes: 85 additions & 4 deletions articledownloader/articledownloader.py
@@ -158,7 +158,7 @@ def get_html_from_doi(self, doi, writefile, mode):
:param writefile: file object to write to
:type writefile: file
:param mode: either 'elsevier', depending on how we wish to access the file
:param mode: either 'elsevier' | 'springer' | 'acs' | 'rsc' | 'nature', depending on how we wish to access the file
:type mode: str
:returns: True on successful write, False otherwise
@@ -184,6 +184,89 @@ def get_html_from_doi(self, doi, writefile, mode):
return False
return False

if mode == 'springer':
  base_url = 'http://link.springer.com/'
  api_url = base_url + doi + '.html'

  try:
    headers = {
      'Accept': 'text/html',
      'User-agent': 'Mozilla/5.0'
    }
    r = requests.get(api_url, stream=True, headers=headers)
    if r.status_code == 200:
      for chunk in r.iter_content(2048):
        writefile.write(chunk)
      return True
  except:
    return False
  return False

if mode == 'acs':
  base_url = 'http://pubs.acs.org/doi/full/'
  api_url = base_url + doi

  try:
    headers = {
      'Accept': 'text/html',
      'User-agent': 'Mozilla/5.0'
    }
    r = requests.get(api_url, stream=True, headers=headers)
    if r.status_code == 200:
      for chunk in r.iter_content(2048):
        writefile.write(chunk)
      return True
  except:
    return False
  return False

if mode == 'rsc':
  scraper = scrapers.RSC()
  scrape_url = 'http://dx.doi.org/' + doi
  download_url = None

  r = requests.get(scrape_url)
  if r.status_code == 200:
    scraper.feed(r.content)

    if scraper.download_link is not None:
      download_url = scraper.download_link
      download_url = download_url.replace('articlepdf', 'articlehtml') # Override for HTML mode

  if download_url is not None:
    headers = {
      'Accept': 'text/html',
      'User-agent': 'Mozilla/5.0'
    }
    r = requests.get(download_url, stream=True, headers=headers)
    if r.status_code == 200:
      try:
        for chunk in r.iter_content(2048):
          writefile.write(chunk)
        return True
      except:
        return False

  return False

if mode == 'nature':
  download_url = 'http://dx.doi.org/' + doi

  headers = {
    'Accept': 'text/html'
  }
  r = requests.get(download_url, stream=True, headers=headers)
  if r.status_code == 200:
    try:
      for chunk in r.iter_content(2048):
        writefile.write(chunk)
      return True
    except:
      return False
  return False

return False

@traced
def get_pdf_from_doi(self, doi, writefile, mode):
'''
@@ -195,7 +278,7 @@ def get_pdf_from_doi(self, doi, writefile, mode):
:param writefile: file object to write to
:type writefile: file
:param mode: either 'crossref' | 'elsevier' | 'rsc' | 'springer', depending on how we wish to access the file
:param mode: either 'crossref' | 'elsevier' | 'rsc' | 'springer' | 'ecs' | 'nature' | 'acs', depending on how we wish to access the file
:type mode: str
:returns: True on successful write, False otherwise
@@ -345,8 +428,6 @@ def get_pdf_from_doi(self, doi, writefile, mode):
return False
return False

return False

if mode == 'springer':
  base_url = 'http://link.springer.com/content/pdf/'
  api_url = base_url + doi
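Taken together, the new branches above give get_html_from_doi direct HTML downloads for Springer, ACS, RSC, and Nature alongside Elsevier, and get_pdf_from_doi's docstring picks up additional modes. A minimal usage sketch, assuming the ArticleDownloader class this module exposes, its default constructor, and a purely hypothetical DOI:

from articledownloader.articledownloader import ArticleDownloader

downloader = ArticleDownloader()  # default construction; the new modes fetch public pages with plain requests

# Stream the full-text HTML for a (hypothetical) Springer DOI into a local file
with open('article.html', 'wb') as html_file:
    got_html = downloader.get_html_from_doi('10.1007/s00000-016-0000-0', html_file, 'springer')

# The PDF path works the same way, just with a different destination file
with open('article.pdf', 'wb') as pdf_file:
    got_pdf = downloader.get_pdf_from_doi('10.1007/s00000-016-0000-0', pdf_file, 'springer')

print(got_html, got_pdf)  # each is True on a successful write, False otherwise

Both methods write raw response chunks, so the target files are opened in binary mode here.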
6 changes: 3 additions & 3 deletions articledownloader/scrapers.py
@@ -10,7 +10,7 @@ class RSC(HTMLParser):
#RSC scraping implementation
def handle_starttag(self, tag, attrs):
'''
PDF link handler; never gets explicity called by user
PDF link handler; never gets explicitly called by user
'''
for attr in attrs:
if attr[0] == 'content':
@@ -26,7 +26,7 @@ class ECS(HTMLParser):
#ECS scraping implementation
def handle_starttag(self, tag, attrs):
'''
PDF link handler; never gets explicity called by user
PDF link handler; never gets explicitly called by user
'''
if tag == 'a' and ('rel', 'view-full-text.pdf') in attrs:
for attr in attrs:
@@ -43,7 +43,7 @@ class Nature(HTMLParser):
#Nature scraping implementation
def handle_starttag(self, tag, attrs):
'''
PDF link handler; never gets explicity called by user
PDF link handler; never gets explicitly called by user
'''
if tag == 'a' and ( ('class', 'download-pdf') in attrs or ('id', 'download-pdf') in attrs ):
for attr in attrs:
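The scrapers touched above are thin HTMLParser subclasses: the caller feeds them a page's raw HTML and then reads the download_link attribute that handle_starttag populates, which is exactly what the new 'rsc' HTML branch does before swapping the scraped PDF URL for its HTML counterpart. A rough sketch of that flow, using the scrapers module laid out above and a hypothetical RSC DOI:

import requests
from articledownloader import scrapers

scraper = scrapers.RSC()
response = requests.get('http://dx.doi.org/10.1039/xxxxxxx')  # hypothetical DOI; resolves to the article landing page

if response.status_code == 200:
    scraper.feed(response.content)  # handle_starttag fires per tag and may set scraper.download_link
    if scraper.download_link is not None:
        # get_html_from_doi rewrites the scraped PDF link into its HTML equivalent
        html_url = scraper.download_link.replace('articlepdf', 'articlehtml')
        print(html_url)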
Binary file added dist/articledownloader-6.0.tar.gz
4 changes: 2 additions & 2 deletions setup.py
@@ -2,11 +2,11 @@
setup(
name = 'articledownloader',
packages = ['articledownloader'], # this must be the same as the name above
version = '5.9',
version = '6.0',
description = 'A class for downloading scientific journal articles',
author = 'Edward Kim',
author_email = 'eddotman@gmail.com',
url = 'https://github.com/eddotman/article-downloader', # use the URL to the github repo
download_url = 'https://www.github.com/eddotman/article-downloader/tarball/5.9',
download_url = 'https://www.github.com/eddotman/article-downloader/tarball/6.0',
keywords = ['journal', 'paper', 'article', 'downloader'], # arbitrary keywords
)
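setup.py now advertises version 6.0 and points the download URL at the matching tarball. Assuming the package is published under the same name, a quick post-upgrade sanity check from Python:

import pkg_resources

# Expect '6.0' once the new release is installed
print(pkg_resources.get_distribution('articledownloader').version)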
