### IMPORT

In [None]:
from requests_html import HTMLSession
import os
import time
from tqdm import tqdm

### TEST

In [None]:
# Main page: fetch a single issue's table-of-contents page to explore its markup.
# The URL path ends in /19/7, the same '<vol>/<iss>' shape ms_main appends to MS_URL.
session = HTMLSession()
url = 'https://pubsonline.informs.org/toc/mnsc/19/7'
r = session.get(url)  # response whose parsed HTML (r.html) is queried in the next cells

In [None]:
# All elements matching div.issue-item on the issue page
# (presumably one per article, judging by the sample markup pasted below).
content = r.html.find('div.issue-item')

In [None]:
<div class="issue-item">
<div class="badges">
    <span class="access__icon icon-Icon_Permissions-Locked"/>
</div><h5 class="issue-item__title">
<a href="/doi/abs/10.1287/mnsc.19.7.717">Semi-Markov Decision Processes with Unbounded Rewards</a></h5><ul aria-label="author" class="rlist--inline loa"><a class="entryAuthor linkable hlFld-ContribAuthor" href="/author/Lippman%2C+Steven+A">Steven A. Lippman</a></ul><div class="rlist--inline separator toc-item__detail"><p><span>Pages:</span>717–731</p><p><span>Published Online:</span>March 1, 1973</p></div><p><a href="https://doi.org/10.1287/mnsc.19.7.717">https://doi.org/10.1287/mnsc.19.7.717</a></p><div class="toc-item__footer"><ul class="rlist--inline separator toc-item__detail"><li><a href="/doi/fpi/10.1287/mnsc.19.7.717" title="Abstract"><span>First Page</span></a></li><li><a href="/doi/pdf/10.1287/mnsc.19.7.717" title="PDF"><span>PDF&#13;\n                             (582 KB)</span></a></li><li><a class="rightslink" href="/servlet/linkout?type=rightslinkBasic&amp;url=issn%3D1526-5501%26WT.mc.id%3DINFORMS">Permissions</a></li></ul><div class="accordion"><a class="accordion__control" href="#" title="Preview Abstract"><span>Preview Abstract<i class="icon-section_arrow_d"/></span></a><div class="accordion__content toc-item__abstract" style="display: none;"><h5 class="article-section__title section__title">&#13;\n            Abstract&#13;\n        </h5><span class="hlFld-Abstract">We consider a semi-Markov decision process with arbitrary action space; the state space is the nonnegative integers. As in queueing systems, we assume that {0, 1, 2, …, <i>n</i> + <i>N</i>} is the set of states accessible from state <i>n</i> in one transition, where <i>N</i> is ...<i/><i/></span></div></div></div></div>        

In [None]:
# Inspect the raw HTML of the first matched issue item.
content[0].html

In [None]:
# First anchor inside the first issue item; absolute_links is a set, so pop() yields one URL from it.
content[0].find('a',first=True).absolute_links.pop()

In [None]:
# Sub page: fetch one article page through its DOI link to explore its markup.
# NOTE: this rebinds the session, url and r names used by the main-page cell.
session = HTMLSession()
url = 'https://doi.org/10.1287/mnsc.19.7.751'
r = session.get(url)

In [None]:
# Narrow r from the whole response down to the first div.article__content element;
# the cells below call r.find(...) on that element, so this cell must run exactly once per fetch.
r = r.html.find('div.article__content',first=True)

In [None]:
# Anchors that are direct children of a <p> inside div.table-of-content.
r.find('div.table-of-content p>a')

In [None]:
# Title: text of the first h1.citation__title inside the article content.
r.find('h1.citation__title',first=True).text

In [None]:
# Author(s): print the text of every a.entryAuthor link under the author accordion.
temp = r.find('div.accordion-tabbed.loa-accordion a.entryAuthor')
# Use a real loop name: `_` conventionally marks an unused value and, in IPython,
# also holds the last cell result, so it should not be rebound here.
for author in temp:
    print(author.text)

In [None]:
# Link: text of the first a.epub-section__doi__text inside div.epub-section.
r.find('div.epub-section a.epub-section__doi__text',first=True).text

In [None]:
# Date: text of the first span.epub-section__date inside div.epub-section.
r.find('div.epub-section span.epub-section__date',first=True).text

In [None]:
# Abstract: full text of the first div.abstractInFull element.
r.find('div.abstractInFull',first=True).text

### MAIN

In [None]:
# Base URL of the issue table-of-contents pages; ms_main appends '<vol>/<iss>' to it.
MS_URL = 'https://pubsonline.informs.org/toc/mnsc/'
# Upper bound of the retry loop ms_main runs after an article request raises.
MAX_RETRY = 3

def get_ms_issue_list():
    """Scrape the list of available volumes and issues from the journal site.

    Loads the "current issue" page, reads every ``div.loi__issue`` entry and
    splits its link text, expected to look like ``Volume <vol> Issue <iss>``,
    into the two identifiers.

    Returns:
        dict[str, list[str]]: volume identifier -> issue identifiers, in the
        order the entries appear on the page. Both are kept as strings because
        some identifiers are not numeric (the notebook uses ``'MT-1'``).

    Entries without the expected link, or whose label lacks the ``Volume`` /
    ``Issue`` keywords, are skipped instead of aborting the whole scan.
    """
    url = 'https://pubsonline.informs.org/toc/mnsc/current'
    # Close the session (and its connection pool) as soon as the page is parsed.
    with HTMLSession() as s:
        r = s.get(url)
        table = r.html.find('div.loi__issue')

    ms_dict = {}
    for item in table:
        anchor = item.find('div.loi__issue>div.parent-item>a', first=True)
        if anchor is None:
            continue
        label = anchor.text
        issue_pos = label.find('Issue')
        volume_part = label[:issue_pos]
        # str.find returns -1 when 'Issue' is absent, which would silently
        # slice off the last character instead of failing, so check explicitly.
        if issue_pos == -1 or 'Volume' not in volume_part:
            continue
        vol = volume_part.split('Volume')[1].strip()
        iss = label[issue_pos:].split('Issue')[1].strip()
        ms_dict.setdefault(vol, []).append(iss)
    return ms_dict


def _first_text(element, selector):
    """Return the text of the first node under *element* matching *selector*, or None."""
    if element is None:
        return None
    node = element.find(selector, first=True)
    return None if node is None else node.text


def _first_link(element, selector):
    """Return one absolute URL of the first anchor matching *selector*, or None."""
    node = element.find(selector, first=True)
    if node is None:
        return None
    # absolute_links is a set and may be empty (e.g. an href that is only '#').
    return next(iter(node.absolute_links), None)


def _fetch_article(session, url, fallback_url):
    """Fetch an article page, retrying up to MAX_RETRY times; return None if all fail."""
    for attempt in range(MAX_RETRY + 1):
        if attempt:
            time.sleep(1)  # wait 1s before trying again
            print('RETRY#{} '.format(attempt))
        try:
            response = session.get(url)
            # The DOI resolver sometimes answers 404; fall back to the long address.
            if response.status_code == 404 and fallback_url is not None:
                response = session.get(fallback_url)
        except OSError:
            # requests' RequestException (proxy, connection and timeout errors)
            # derives from IOError/OSError.
            continue
        return response
    return None


def ms_main(vol, iss):
    """Scrape one journal issue and write it to a Markdown file.

    The file is written to ``MS/Volume <vol>/Volume <vol> - Issue <iss>.md``
    relative to the current working directory, which must be the
    ``light-speed-engine`` folder. Numeric identifiers are zero-padded to two
    digits in the file name and heading only; the directory name uses ``vol``
    as given.

    Args:
        vol (str): volume identifier as used in the site URL, e.g. ``'19'``
            or ``'MT-1'``.
        iss (str): issue identifier as used in the site URL.

    Returns:
        None. Progress is printed; a message is printed and nothing is written
        when the working directory is wrong.

    Notes:
        * The working directory is never changed, so an error part-way through
          can no longer leave the process inside ``MS/Volume <vol>``, and the
          wrong-folder case no longer moves two levels up.
        * A request that raises is retried up to ``MAX_RETRY`` times and the
          retry loop stops at the first success. If every attempt fails, or a
          field is missing from the page, ``None`` is written as a placeholder
          (the same text already used for a missing abstract) so the article
          keeps its number and link in the output.
        * The file is written as UTF-8 rather than the platform default
          encoding, so characters are not silently dropped.
    """
    # Compare the directory name itself rather than slicing at a fixed offset,
    # which only worked for one particular parent path length.
    if os.path.basename(os.getcwd()) != 'light-speed-engine':
        print('Not in the right folder')
        return

    path = os.path.join('MS', 'Volume ' + vol)
    os.makedirs(path, exist_ok=True)

    with HTMLSession() as session:
        # Issue page: collect the links of all articles.
        main_request = session.get(MS_URL + vol + '/' + iss)
        issue_page = main_request.html
        articles = []  # (doi address, pubsonline long address)
        for item in issue_page.find('div.issue-item'):
            backup_url = _first_link(item, 'a')
            doi_url = _first_link(item, 'div.issue-item>p>a') or backup_url
            if doi_url is None:
                continue  # nothing to fetch for this entry
            articles.append((doi_url, backup_url))

        # Zero-pad numeric identifiers for the file name and heading.
        if vol.isdigit():
            vol = '{:02d}'.format(int(vol))
        if iss.isdigit():
            iss = '{:02d}'.format(int(iss))

        md_path = os.path.join(path, 'Volume {} - Issue {}.md'.format(vol, iss))
        with open(md_path, 'w', encoding='utf-8', errors='ignore') as md_file:
            # Issue header: title, date, page range, editor.
            md_file.write('# Volume {}, Issue {}\n'.format(vol, iss))
            md_file.write('- {}\n'.format(_first_text(issue_page, 'div.volume--date')))
            md_file.write('- {}\n'.format(_first_text(issue_page, 'div.volume--pages')))
            md_file.write('- {}\n\n'.format(_first_text(issue_page, 'div.editor-in-chief>span')))

            print('Volume {}, Issue {}: {}'.format(vol, iss, len(articles)))
            for num, (url, backup_url) in enumerate(articles, start=1):
                print('#{} '.format(num), end='')

                response = _fetch_article(session, url, backup_url)
                # Only keep the main body; stays None when the fetch failed or
                # the page has no article content.
                body = None
                if response is not None:
                    body = response.html.find('div.article__content', first=True)

                # title
                md_file.write('## {}. {}\n'.format(num, _first_text(body, 'h1.citation__title')))
                # author(s)
                md_file.write('### Author(s):\n')
                authors = []
                if body is not None:
                    authors = body.find('div.accordion-tabbed.loa-accordion a.entryAuthor')
                for author in authors:
                    md_file.write('- {}\n'.format(author.text))
                # date
                md_file.write('### Published:\n- {}\n'.format(
                    _first_text(body, 'div.epub-section span.epub-section__date')))
                # abstract
                md_file.write('### Abstract:\n{}\n'.format(
                    _first_text(body, 'div.abstractInFull>p')))
                # link (always the DOI address, even when the fallback page was used)
                md_file.write('### Link:\n- {}\n\n'.format(url))
        print('DONE!')

In [None]:
# Volume -> list of issue identifiers, scraped once and reused by the driver cells below.
MS_ISSUE_LIST = get_ms_issue_list()

In [None]:
# Display the scraped mapping to check the volume keys before running the loops below.
MS_ISSUE_LIST

In [None]:
%%time
# Single-issue run (volume '1', issue '1'); both arguments are passed as strings.
ms_main('1','1')

In [None]:
# Scrape every issue listed under the non-numeric volume key 'MT-1'.
vol = 'MT-1'
for item in MS_ISSUE_LIST[vol]:
    ms_main(vol,item)

In [None]:
%%time
# Scrape volumes 1 through 5 (range(1,6) stops before 6), issue by issue.
for i in range(1,6):
    vol = str(i)  # MS_ISSUE_LIST keys are strings, so convert before the lookup
    for item in MS_ISSUE_LIST[vol]:
        ms_main(vol,item)