In [1]:
from requests_html import HTMLSession
import json
import os

# Issue index used by the scraper; it is looked up as
# RFS_ISSUELIST[volume][issue]['page'], with volume and issue given as strings.
with open('Issuelist/rfs_issuelist.json', 'r') as f:
    RFS_ISSUELIST = json.load(f)

### TEST

#### url_list

In [None]:
# Sample issue page (the URL path addresses issue 32/1) used to try out the selectors.
url = 'https://academic.oup.com/rfs/issue/32/1?browseBy=volume'

# `s` (session) and `r` (response) are kernel globals reused by the cells below.
s = HTMLSession()
r = s.get(url)

In [None]:
# Select the anchors inside the article-title headings of the issue page ...
url_list = r.html.find('h5.item-title>a')
# ... then replace each anchor with one link taken from its absolute_links.
# NOTE(review): pop() raises if an anchor has no link at all.
url_list = [i.absolute_links.pop() for i in url_list]

In [None]:
# Volume and issue: the text of the issue header is split on ',' and the
# value after 'Volume ' / 'Issue ' is printed.
# NOTE(review): presumably the header reads like 'Volume 32, Issue 1' — confirm
# against the live page; any other layout raises IndexError here.
vol_iss = r.html.find('div.issue-info-pub',first=True).text.split(',')
print(vol_iss[0].split('Volume ')[1])
print(vol_iss[1].split('Issue ')[1])

#### page

In [None]:
# Sample article page for trying out the per-article selectors.
test_url = 'https://academic.oup.com/rfs/article/32/1/1/5058062'
# `r` is rebound here: from this cell on it holds the article page, not the issue page.
r = s.get(test_url)

In [None]:
# Title: text of the first h1.article-title-main element on the article page.
r.html.find('h1.article-title-main', first=True).text

In [None]:
# Authors: every a.linked-name element inside the div.al-authors-list block
# (kept as elements; the next cell extracts their text).
authors = r.html.find('div.al-authors-list a.linked-name')

In [None]:
# Display the text of each author element found in the previous cell.
[i.text for i in authors]

In [None]:
# Date: text of the first div.citation-date element on the article page.
r.html.find('div.citation-date', first=True).text

In [None]:
# Abstract: text of the first <p> directly under section.abstract.
# NOTE(review): find(..., first=True) gives no element when nothing matches,
# so .text fails on pages without an abstract.
r.html.find('section.abstract>p', first=True).text

In [None]:
# Link: one absolute link taken from the first anchor directly under
# div.ww-citation-primary.
r.html.find('div.ww-citation-primary>a', first=True).absolute_links.pop()

### MAIN

In [11]:
RFS_URL = 'https://academic.oup.com/rfs/issue/{}/{}?browseBy=volume'
MAX_RETRY = 5  # number of retries allowed after the first failed attempt


def _fetch(session, url):
    """Return the response for `url`, retrying up to MAX_RETRY times.

    An attempt counts as failed when the request raises or when the response
    status is not 200. 'Retry#n ' is printed before each retry.

    Raises:
        RuntimeError: if every attempt fails; the last failure is chained.
    """
    failure = None
    for attempt in range(MAX_RETRY + 1):
        if attempt:
            print('Retry#{} '.format(attempt), end='')
        try:
            response = session.get(url)
        except Exception as exc:  # not a bare except: Ctrl-C must still interrupt
            failure = exc
            continue
        if response.status_code == 200:
            return response
        failure = RuntimeError('HTTP status {}'.format(response.status_code))
    raise RuntimeError(
        'Could not fetch {} after {} attempts'.format(url, MAX_RETRY + 1)
    ) from failure


def _text(page, selector):
    """Return the text of the first element matching `selector`, or '' if none."""
    element = page.html.find(selector, first=True)
    return element.text if element is not None else ''


def _link(element):
    """Return one absolute link of `element`, or '' if it is None or has no link."""
    if element is None:
        return ''
    links = element.absolute_links
    return links.pop() if links else ''


def get_rfs(vol, iss):
    """Scrape one RFS issue and write its metadata to a JSON file.

    The issue page is fetched from RFS_URL, every article linked from it is
    fetched in turn, and the result is written to
    'JSON/RFS/Volume {vol}/Volume {vol} - Issue {iss}.json' (relative to the
    current working directory; the folder is created when missing).

    Args:
        vol: volume identifier; used in the URL, as a key of RFS_ISSUELIST
            and in the output path (callers in this notebook pass strings).
        iss: issue identifier, used the same way as `vol`.

    Raises:
        KeyError: if RFS_ISSUELIST has no entry for `vol`/`iss`.
        RuntimeError: if a page cannot be fetched after all retries.
    """
    # Look this up first so an unknown volume/issue fails before any request.
    issue_page = RFS_ISSUELIST[vol][iss]['page']

    s = HTMLSession()
    try:
        r = _fetch(s, RFS_URL.format(vol, iss))

        # get url_list; title anchors that carry no link are skipped
        url_list = [_link(anchor) for anchor in r.html.find('h5.item-title>a')]
        url_list = [url for url in url_list if url]
        print('Volume \033[32m{}\033[m, Issue \033[32m{}\033[m: Total {} '.format(vol, iss, len(url_list)))

        # issue info; fall back to the requested vol/iss when the header text
        # does not have the 'Volume X, Issue Y' layout the parsing relies on
        header = _text(r, 'div.issue-info-pub').split(',')
        has_volume = 'Volume ' in header[0]
        has_issue = len(header) > 1 and 'Issue ' in header[1]

        article = {}
        article['journal'] = 'RFS'
        article['volume'] = header[0].split('Volume ')[1] if has_volume else vol
        article['issue'] = header[1].split('Issue ')[1] if has_issue else iss
        article['date'] = _text(r, 'div.issue-info-date')
        article['page'] = issue_page

        # article info
        article['article'] = []
        for i, url in enumerate(url_list):
            print('#{} '.format(i + 1), end='')
            page = _fetch(s, url)
            authors = page.html.find('div.al-authors-list a.linked-name')
            # Missing elements are stored as '' / [] instead of aborting the issue.
            article['article'].append({
                'no': str(i + 1),
                'title': _text(page, 'h1.article-title-main'),
                'date': _text(page, 'div.citation-date'),
                'author': [author.text for author in authors],
                'abstract': _text(page, 'section.abstract>p'),
                'link': _link(page.html.find('div.ww-citation-primary>a', first=True)),
            })
    finally:
        # Release the session's connections even when scraping fails.
        s.close()

    # write json file
    path_name = 'JSON/RFS/Volume {}/Volume {} - Issue {}.json'.format(vol, vol, iss)
    os.makedirs(os.path.dirname(path_name), exist_ok=True)
    with open(path_name, 'w') as f:
        json.dump(article, f, indent=4)
    print('\033[32m{}\033[m'.format('DONE!'))


            
def rfs_main(vol,iss):
    """Run get_rfs(vol, iss) only when the notebook runs from the project folder.

    The check compares the last component of the current working directory
    with 'light-speed-engine', so it does not depend on how long the parent
    path is. When the folder does not match, a message is printed and nothing
    is scraped.

    Args:
        vol: volume identifier, passed through to get_rfs.
        iss: issue identifier, passed through to get_rfs.
    """
    if os.path.basename(os.getcwd()) != 'light-speed-engine':
        print('Not in the right folder')
    else:
        # write json
        get_rfs(vol,iss)

In [12]:
# Scrape a single issue; both arguments are passed as strings.
rfs_main('22','12')

Volume [32m22[m, Issue [32m12[m: Total 15 
#1 #2 #3 #4 #5 #6 #7 #8 #9 #10 #11 #12 #13 #14 #15 [32mDONE![m


In [None]:
# NOTE(review): rfs_main is defined as rfs_main(vol, iss) with no defaults, so
# this call raises TypeError as written — supply a volume and issue or drop the cell.
rfs_main()

In [18]:
# Scrape every issue listed for one volume. RFS_ISSUELIST is indexed with the
# volume as a string, and iterating the entry yields the issue keys.
vol = 16
for item in RFS_ISSUELIST[str(vol)]:
    rfs_main(str(vol),item)

Volume [32m16[m, Issue [32m1[m: Total 9 
#1 #2 #3 #4 #5 #6 #7 #8 #9 [32mDONE![m
Volume [32m16[m, Issue [32m2[m: Total 9 
#1 #2 #3 #4 #5 #6 #7 #8 #9 [32mDONE![m
Volume [32m16[m, Issue [32m3[m: Total 10 
#1 #2 #3 #4 #5 #6 #7 #8 #9 #10 [32mDONE![m
Volume [32m16[m, Issue [32m4[m: Total 13 
#1 #2 #3 #4 #5 #6 #7 #8 #9 #10 #11 #12 #13 [32mDONE![m


In [19]:
%%time
# Scrape every listed issue of volumes 1 through 15 (range(1,16) excludes 16);
# the %%time cell magic above reports how long the whole cell takes.
for i in range(1,16):
    vol = i
    for item in RFS_ISSUELIST[str(vol)]:
        rfs_main(str(vol),item)

Volume [32m1[m, Issue [32m1[m: Total 5 
#1 #2 #3 #4 #5 [32mDONE![m
Volume [32m1[m, Issue [32m2[m: Total 4 
#1 #2 #3 #4 [32mDONE![m
Retry#1 Volume [32m1[m, Issue [32m3[m: Total 6 
#1 #2 #3 #4 #5 #6 [32mDONE![m
Volume [32m1[m, Issue [32m4[m: Total 8 
#1 #2 #3 #4 #5 #6 #7 #8 [32mDONE![m
Volume [32m2[m, Issue [32m1[m: Total 6 
#1 #2 #3 #4 #5 #6 [32mDONE![m
Volume [32m2[m, Issue [32m2[m: Total 7 
#1 #2 #3 #4 #5 #6 #7 [32mDONE![m
Volume [32m2[m, Issue [32m3[m: Total 7 
#1 #2 #3 #4 #5 #6 #7 [32mDONE![m
Volume [32m2[m, Issue [32m4[m: Total 7 
#1 #2 #3 #4 #5 #6 #7 [32mDONE![m
Volume [32m3[m, Issue [32m1[m: Total 11 
#1 #2 #3 #4 #5 #6 #7 #8 #9 #10 #11 [32mDONE![m
Volume [32m3[m, Issue [32m2[m: Total 7 
#1 #2 #3 #4 #5 #6 #7 [32mDONE![m
Volume [32m3[m, Issue [32m3[m: Total 6 
#1 #2 #3 #4 #5 #6 [32mDONE![m
Volume [32m3[m, Issue [32m4[m: Total 9 
#1 #2 #3 #4 #5 #6 #7 #8 #9 [32mDONE![m
Volume [32m4[m, Issue [32m1[m: Total 10 