In [1]:
import re
from types import SimpleNamespace

import requests
from bs4 import BeautifulSoup as bs
from requests import get


The Journal of Neurophysiology site embeds links to every volume and issue on each volume-listing page, so we only need to load the page for the first volume (1940) and build the list of all volumes from the links found there.

In [None]:
# Build the list of volume numbers advertised on the 1940 listing page.
start_url = 'https://journals.physiology.org/loi/jn/group/d1940.y1940'
url = start_url
# Fail fast on network problems instead of hanging or silently parsing an error page.
year = get(url, timeout=30)
year.raise_for_status()
year_page = bs(year.text, 'html.parser')

# Every anchor whose href contains /toc/jn/ is a table-of-contents link.
all_vol = year_page.find_all('a', {'href': re.compile(r'/toc/jn/')})

# One entry is appended per matching link, so a volume number can appear several
# times. NOTE(review): the issue-counting loop in the scraping cell appears to rely
# on those repeats (one per issue) — confirm before de-duplicating.
vol_list = []
for v in all_vol:
    vol_match = re.search(r'Volume\s(\d{1,3})', str(v))
    if vol_match is None:
        # The link's markup has no "Volume N" label; say which link was skipped
        # rather than printing a generic error.
        print('Skipping link without a volume label:', v.get('href'))
        continue
    vol_list.append(int(vol_match.group(1)))

vol_list.sort()


Default digital open access starts with Volume 77, Issue 1, so we start the loop there instead of going through the earlier volumes, which are all paywalled and available only as PDFs.

In [None]:
# Keep only the entries of vol_list that are greater than 76, in their existing order.
open_access = list(filter(lambda vol: vol > 76, vol_list))
print(open_access)

In [None]:
# Walk every open-access issue, record each non-paywalled article, its reference
# list and its section texts.
# NOTE: db_name, table_name, get_next_id, new_article_check, add_new_row and
# add_citation_to_DB are not defined in this notebook yet (still to be written).
last_v = 0
base_url = 'https://journals.physiology.org/toc/jn/'
for v in open_access:
    # count the issues in each volume: open_access is sorted, and it is assumed to
    # hold one entry per issue link, so a repeated volume number means "next issue"
    if v != last_v:
        issue = 1
    else:
        issue += 1
    # remember the current volume; without this every iteration looks like a new
    # volume and only issue 1 is ever requested
    last_v = v

    # construct the url for the issue and load it
    issue_url = base_url + str(v) + '/' + str(issue)
    print(issue_url)
    get_issue = get(issue_url, timeout=30)
    get_issue.raise_for_status()
    issue_page = bs(get_issue.text, 'html.parser')
    # cover date of the issue (read here but not stored anywhere yet)
    issue_date = (
        issue_page.find('div', {'class': 'col-sm-4 gray-bg toc-right-side'})
        .find('span', {'class': 'coverDate'})
        .get_text()
    )

    # scrape all of the articles in this issue
    toc = issue_page.find_all('div', {'class': 'table-of-content'})[0].find_all('div', {'class': 'issue-item'})

    # loop through the articles
    for c in toc:
        section = dict()

        # check if article is behind a paywall. If so, skip it (for now).
        # An item with no badges block at all is skipped the same way instead of
        # raising AttributeError on None.
        badges = c.find('div', {'class': 'badges'})
        if badges is None or not badges.get_text():
            continue

        # get article metadata; a fresh attribute container per article
        art = SimpleNamespace()
        art.url = 'https://journals.physiology.org' + c.find('a').get('href')
        art.title = c.find('h4').text
        auth = c.find('ul', {'class': 'rlist--inline loa'}).find_all('li')
        art.authors = [a.get_text().replace(', and ', '') for a in auth]
        # drop the first 16 characters of the link — presumably the
        # 'https://doi.org/' prefix; TODO confirm against a live page
        art.doi = c.select(".epub-section__item")[0].find('a').get('href')[16:]
        art.id = get_next_id(db_name, table_name)  # still needs to be written

        # check if this article is already in database
        # (currently, check the title for a match)
        # if not, add metadata to table and load article. If so, continue loop.
        if not new_article_check(art.title):  # still needs to be written
            continue
        add_new_row(db_name, table_name, art)

        # load article (the parser needs the response body, not the Response object)
        get_art = get(art.url, timeout=30)
        get_art.raise_for_status()
        art_page = bs(get_art.text, 'html.parser')

        # get reference list
        rlist = art_page.find('ul', {'class': 'rlist separator'}).find_all('li')
        # loop through references and add them to database
        for r in rlist:
            title_tag = r.find('span', {'class': 'references__article-title'})
            authors_tag = r.find('span', {'class': 'references__authors'})
            if title_tag is None or authors_tag is None:
                # reference entry without the expected title/author spans: skip it
                continue

            # keep the reference metadata in its own container instead of
            # attaching attributes to the parsed tag
            ref = SimpleNamespace()
            ref.url = [i.get('href') for i in r.find_all('a')]
            ref.title = title_tag.get_text()
            ref.authors = authors_tag.get_text().split(', ')
            ref.id = get_next_id(db_name, table_name)  # still needs to be written

            # check if the *referenced* article is in database (by its own title)
            # NOTE(review): when it already exists the citation below is skipped
            # too, because the existing row's ID cannot be looked up yet.
            if not new_article_check(ref.title):  # still needs to be written
                continue
            add_new_row(db_name, table_name, ref)

            # add the citation
            # This function should add the citing ID to a list of citing IDs in the entry for the cited ID
            # It should also add the cited ID to a list of cited IDs in the entry for the citing ID
            add_citation_to_DB(citing_ID=art.id, cited_ID=ref.id)

        # get article sections
        # get abstract
        section['Abstract'] = art_page.select('div.hlFld-Abstract div.abstractSection')[0].get_text()
        # get all other sections; text before the first titled section is
        # collected under 'Introduction'
        section['Introduction'] = ''
        fulltext = art_page.find('div', {'class': 'hlFld-Fulltext'}).findChildren(recursive=False)
        for f in fulltext:
            heading = f.find('h1', {'class': 'article-section__title section__title'})
            if heading is None:
                section['Introduction'] = section['Introduction'] + ' ' + f.get_text()
            else:
                # concatenate the text of every div under the heading's parent
                section_title = heading.get_text()
                section[section_title] = ''.join(
                    text.get_text() for text in heading.find_parent().findChildren('div')
                )
        
        
            
        
    
    

## Brainstorming:
* Have a function that cycles through the DB, checks for entries with no text, and tries to find the text via PubMed (with the PubMed parser) or with Google Scholar
* 

# Testing stuff out

In [None]:
# Fetch one article page to experiment with selectors.
# Alternative test article:
# page = get('https://journals.physiology.org/doi/full/10.1152/jn.00104.2016')
test_article_url = 'https://journals.physiology.org/doi/full/10.1152/jn.00399.2013'
page = get(test_article_url)
test_page = bs(page.text, 'html.parser')


In [None]:
# Collect the author tabs from the author accordion, then show the text of the
# first link in each tab.
author_accordion = test_page.find('div', {'class': 'accordion-tabbed loa-accordion'})
loa = author_accordion.find_all('div', {'class': 'accordion-tabbed__tab-mobile '})
[tab.find('a').get_text() for tab in loa]

In [None]:
# Text of the 'volume' span inside the first cover-image details block.
cover_details = test_page.select('div.cover-image__details ')[0]
cover_details.find('span', {'class': 'volume'}).get_text()

In [None]:
# Text of the 'issue' span inside the first cover-image details block.
cover_details = test_page.select('div.cover-image__details ')[0]
cover_details.find('span', {'class': 'issue'}).get_text()

In [None]:
# Direct children (non-recursive) of the full-text container of the test page.
fulltext_container = test_page.find('div', {'class': 'hlFld-Fulltext'})
fulltext = fulltext_container.findChildren(recursive=False)

In [None]:
# Look for a section-title heading inside the child at index 7.
probe_block = fulltext[7]
probe_block.find('h1', {'class': 'article-section__title section__title'})

In [None]:
# Empty mapping used below to try out a nested-dict layout for references.
refs = {}

In [None]:
# Two dummy entries keyed by integer index, each with a 'url' and a 'title'.
refs[0] = {'url': 'hello', 'title': 'goodbye'}
refs[1] = {'url': 'bonjour', 'title': 'au revoir'}


In [None]:
# Display the nested dict to inspect its structure.
refs

In [None]:
# Empty mapping from section heading text to section body text.
section = {}

In [None]:
# Use the heading text of the child at index 7 as a key, with a placeholder value.
probe_heading = fulltext[7].find('h1', {'class': 'article-section__title section__title'})
section[probe_heading.get_text()] = 'hi'

In [None]:
# All div descendants of the element that contains the section heading.
probe_heading = fulltext[7].find('h1', {'class': 'article-section__title section__title'})
heading_parent = probe_heading.find_parent()
sec_text = heading_parent.findChildren('div')

In [None]:
# Store, under the heading's text, the concatenated text of every div found
# beneath the heading's parent element.
heading = fulltext[7].find('h1', {'class': 'article-section__title section__title'})
section_key = heading.get_text()
collected = []
for text in heading.find_parent().findChildren('div'):
    collected.append(text.get_text())
section[section_key] = ''.join(collected)

## Testing out pymysql with Amazon RDS

In [2]:
import journal_scrape as js

In [3]:
# import sys
# !{sys.executable} -m pip install -U PyMySQL
import pymysql

In [4]:
# Instantiate the Database class from the local journal_scrape module
# (its constructor defaults and connection settings are not visible in this notebook).
db = js.Database()

In [5]:
# Call the wrapper's connect(); presumably this opens the MySQL/RDS connection — confirm in journal_scrape.
db.connect()

In [None]:


# dbname = "findingssm"
# host = "findingssm.c9zjgwsivgee.us-east-2.rds.amazonaws.com"
# port = 3306
# user = "danielkentwood"
# password = os.environ["FINDINGS_DB_PASSWORD"]  # redacted: never commit credentials; rotate the previously exposed password

# conn = pymysql.connect(host, user=user, port=port, passwd=password, db=dbname, connect_timeout=5)