
Commit

updated HTML downloads
eddotman committed Aug 29, 2016
1 parent 5f395f1 commit b14ab02
Showing 4 changed files with 90 additions and 9 deletions.
89 changes: 85 additions & 4 deletions articledownloader/articledownloader.py
@@ -158,7 +158,7 @@ def get_html_from_doi(self, doi, writefile, mode):
:param writefile: file object to write to
:type writefile: file
:param mode: either 'elsevier', depending on how we wish to access the file
:param mode: either 'elsevier' | 'springer' | 'acs' | 'rsc' | 'nature', depending on how we wish to access the file
:type mode: str
:returns: True on successful write, False otherwise
@@ -184,6 +184,89 @@ def get_html_from_doi(self, doi, writefile, mode):
return False
return False

if mode == 'springer':
  base_url = 'http://link.springer.com/'
  api_url = base_url + doi + '.html'

  try:
    headers = {
      'Accept': 'text/html',
      'User-agent': 'Mozilla/5.0'
    }
    r = requests.get(api_url, stream=True, headers=headers)
    if r.status_code == 200:
      for chunk in r.iter_content(2048):
        writefile.write(chunk)
      return True
  except:
    return False
  return False

if mode == 'acs':
  base_url = 'http://pubs.acs.org/doi/full/'
  api_url = base_url + doi

  try:
    headers = {
      'Accept': 'text/html',
      'User-agent': 'Mozilla/5.0'
    }
    r = requests.get(api_url, stream=True, headers=headers)
    if r.status_code == 200:
      for chunk in r.iter_content(2048):
        writefile.write(chunk)
      return True
  except:
    return False
  return False

if mode == 'rsc':
  scraper = scrapers.RSC()
  scrape_url = 'http://dx.doi.org/' + doi
  download_url = None

  r = requests.get(scrape_url)
  if r.status_code == 200:
    scraper.feed(r.content)

    if scraper.download_link is not None:
      download_url = scraper.download_link
      download_url = download_url.replace('articlepdf', 'articlehtml') # Override for HTML mode

  if download_url is not None:
    headers = {
      'Accept': 'text/html',
      'User-agent': 'Mozilla/5.0'
    }
    r = requests.get(download_url, stream=True, headers=headers)
    if r.status_code == 200:
      try:
        for chunk in r.iter_content(2048):
          writefile.write(chunk)
        return True
      except:
        return False

  return False

if mode == 'nature':
  download_url = 'http://dx.doi.org/' + doi

  headers = {
    'Accept': 'text/html'
  }
  r = requests.get(download_url, stream=True, headers=headers)
  if r.status_code == 200:
    try:
      for chunk in r.iter_content(2048):
        writefile.write(chunk)
      return True
    except:
      return False
  return False

return False

@traced
def get_pdf_from_doi(self, doi, writefile, mode):
'''
@@ -195,7 +278,7 @@ def get_pdf_from_doi(self, doi, writefile, mode):
:param writefile: file object to write to
:type writefile: file
:param mode: either 'crossref' | 'elsevier' | 'rsc' | 'springer', depending on how we wish to access the file
:param mode: either 'crossref' | 'elsevier' | 'rsc' | 'springer' | 'ecs' | 'nature' | 'acs', depending on how we wish to access the file
:type mode: str
:returns: True on successful write, False otherwise
@@ -345,8 +428,6 @@ def get_pdf_from_doi(self, doi, writefile, mode):
return False
return False

return False

if mode == 'springer':
  base_url = 'http://link.springer.com/content/pdf/'
  api_url = base_url + doi
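Taken together, the new branches above give get_html_from_doi direct HTML downloads for Springer, ACS, RSC, and Nature alongside Elsevier, and get_pdf_from_doi's docstring picks up additional modes. A minimal usage sketch, assuming the ArticleDownloader class this module exposes, its default constructor, and a purely hypothetical DOI:

from articledownloader.articledownloader import ArticleDownloader

downloader = ArticleDownloader()  # default construction; the new modes fetch public pages with plain requests

# Stream the full-text HTML for a (hypothetical) Springer DOI into a local file
with open('article.html', 'wb') as html_file:
    got_html = downloader.get_html_from_doi('10.1007/s00000-016-0000-0', html_file, 'springer')

# The PDF path works the same way, just with a different destination file
with open('article.pdf', 'wb') as pdf_file:
    got_pdf = downloader.get_pdf_from_doi('10.1007/s00000-016-0000-0', pdf_file, 'springer')

print(got_html, got_pdf)  # each is True on a successful write, False otherwise

Both methods write raw response chunks, so the target files are opened in binary mode here.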
6 changes: 3 additions & 3 deletions articledownloader/scrapers.py
@@ -10,7 +10,7 @@ class RSC(HTMLParser):
#RSC scraping implementation
def handle_starttag(self, tag, attrs):
'''
PDF link handler; never gets explicity called by user
PDF link handler; never gets explicitly called by user
'''
for attr in attrs:
if attr[0] == 'content':
@@ -26,7 +26,7 @@ class ECS(HTMLParser):
#ECS scraping implementation
def handle_starttag(self, tag, attrs):
'''
PDF link handler; never gets explicity called by user
PDF link handler; never gets explicitly called by user
'''
if tag == 'a' and ('rel', 'view-full-text.pdf') in attrs:
for attr in attrs:
@@ -43,7 +43,7 @@ class Nature(HTMLParser):
#Nature scraping implementation
def handle_starttag(self, tag, attrs):
'''
PDF link handler; never gets explicity called by user
PDF link handler; never gets explicitly called by user
'''
if tag == 'a' and ( ('class', 'download-pdf') in attrs or ('id', 'download-pdf') in attrs ):
for attr in attrs:
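The scrapers touched above are thin HTMLParser subclasses: the caller feeds them a page's raw HTML and then reads the download_link attribute that handle_starttag populates, which is exactly what the new 'rsc' HTML branch does before swapping the scraped PDF URL for its HTML counterpart. A rough sketch of that flow, using the scrapers module laid out above and a hypothetical RSC DOI:

import requests
from articledownloader import scrapers

scraper = scrapers.RSC()
response = requests.get('http://dx.doi.org/10.1039/xxxxxxx')  # hypothetical DOI; resolves to the article landing page

if response.status_code == 200:
    scraper.feed(response.content)  # handle_starttag fires per tag and may set scraper.download_link
    if scraper.download_link is not None:
        # get_html_from_doi rewrites the scraped PDF link into its HTML equivalent
        html_url = scraper.download_link.replace('articlepdf', 'articlehtml')
        print(html_url)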
Binary file added dist/articledownloader-6.0.tar.gz
4 changes: 2 additions & 2 deletions setup.py
@@ -2,11 +2,11 @@
setup(
name = 'articledownloader',
packages = ['articledownloader'], # this must be the same as the name above
version = '5.9',
version = '6.0',
description = 'A class for downloading scientific journal articles',
author = 'Edward Kim',
author_email = 'eddotman@gmail.com',
url = 'https://github.com/eddotman/article-downloader', # use the URL to the github repo
download_url = 'https://www.github.com/eddotman/article-downloader/tarball/5.9',
download_url = 'https://www.github.com/eddotman/article-downloader/tarball/6.0',
keywords = ['journal', 'paper', 'article', 'downloader'], # arbitrary keywords
)
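setup.py now advertises version 6.0 and points the download URL at the matching tarball. Assuming the package is published under the same name, a quick post-upgrade sanity check from Python:

import pkg_resources

# Expect '6.0' once the new release is installed
print(pkg_resources.get_distribution('articledownloader').version)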
