### IMPORT

In [4]:
from requests_html import HTMLSession
import os
from tqdm import tqdm

### TEST

In [41]:
# Scratch cell: fetch a single article page so the selectors used below can be tried out.
session = HTMLSession()
url = 'https://pubsonline.informs.org/doi/abs/10.1287/mnsc.1060.0589'
r = session.get(url)
# Rebind r from the response to the first 'div.article__content' element;
# the following cells query this element rather than the whole page.
r = r.html.find('div.article__content',first=True)

In [42]:
# title
r.find('h1.citation__title',first=True).text

'Impact of Licensing on Investment and Financing of Technology Development'

In [43]:
# author
# Collect every author link inside the author accordion (no first=True, so all matches are returned).
temp = r.find('div.accordion-tabbed.loa-accordion a.entryAuthor')
# Print the text of each match, one per line.
for _ in temp:
    print(_.text)

Nalin Kulatilaka
Lihui Lin


In [44]:
# link
r.find('div.epub-section a.epub-section__doi__text',first=True).text

'https://doi.org/10.1287/mnsc.1060.0589'

In [45]:
# date
r.find('div.epub-section span.epub-section__date',first=True).text

'1 Dec 2006'

In [46]:
# abstract
r.find('div.abstractInFull',first=True).text

'Technology innovations continue to be one of the greatest drivers of economic growth. Realizing the value of such innovations, however, requires substantial follow-on investments in development and commercialization. The value of these investments is difficult to capture because of uncertain demand and potential competition. This often leads to difficulties in obtaining outside financing for these investments. In this paper, we explore how licensing contracts can both dissuade other firms from developing alternative technologies and alleviate the financing problem. We develop a model in which a firm that invests in the development efforts of an innovation can license its technology to a potential competitor. A variety of licensing possibilities is considered, including fixed fees, royalty schedules, and two-part licenses consisting of an up-front payment and a capped royalty schedule. When the firm has no financial constraint, a royalty schedule that depends on realized demand dominat

### MAIN

In [7]:
os.getcwd()

'/Users/dyang/light-speed-engine/MS/Volume 52'

In [65]:
sub_request = None

In [204]:
os.chdir('light-speed-engine')

In [117]:
# Base URL of the journal's table-of-contents pages; ms_main appends '<vol>/<iss>' to it.
MS_URL = 'https://pubsonline.informs.org/toc/mnsc/'
# NOTE(review): RETRY is not referenced by any code visible in this file —
# presumably an intended retry count for failed requests; confirm or remove.
RETRY = 3

def get_subrequest(url):
    """Fetch *url* with a fresh HTMLSession.

    Returns a two-item list ``[status_code, response]`` so callers can
    check the HTTP status (index 0) before touching the response (index 1).
    """
    response = HTMLSession().get(url)
    return [response.status_code, response]


def _ms_article_urls(toc_html):
    """Return the absolute URL of the first title link of every 'div.issue-item' on a TOC page."""
    return [
        item.find('div.issue-item>p>a', first=True).absolute_links.pop()
        for item in toc_html.find('div.issue-item')
    ]


def _ms_write_issue_header(md_file, toc_html, vol, iss):
    """Write the issue heading plus the date, pages and editor lines taken from the TOC page."""
    md_file.write('# Volume {}, Issue {}\n'.format(vol, iss))
    # date
    md_file.write('- {}\n'.format(toc_html.find('div.volume--date', first=True).text))
    # page
    md_file.write('- {}\n'.format(toc_html.find('div.volume--pages', first=True).text))
    # editor
    md_file.write('- {}\n\n'.format(toc_html.find('div.editor-in-chief>span', first=True).text))


def _ms_write_article(md_file, num, url, article):
    """Write one article section (title, authors, date, abstract, link) to the markdown file."""
    # title
    md_file.write('## {}. {}\n'.format(num, article.find('h1.citation__title', first=True).text))
    # author(s)
    md_file.write('### Author(s):\n')
    for author in article.find('div.accordion-tabbed.loa-accordion a.entryAuthor'):
        md_file.write('- {}\n'.format(author.text))
    # date
    md_file.write('### Published:\n- {}\n'.format(
        article.find('div.epub-section span.epub-section__date', first=True).text))
    # Abstract: find(..., first=True) yields None when nothing matches, so test
    # for that explicitly instead of hiding every possible error behind a bare except.
    abstract = article.find('div.abstractInFull>p', first=True)
    md_file.write('### Abstract:\n{}\n'.format(abstract.text if abstract is not None else 'None'))
    # Link
    md_file.write('### Link:\n- {}\n\n'.format(url))


def ms_main(mag, vol, iss):
    """Scrape one journal issue and save it as a markdown file.

    The table-of-contents page ``MS_URL + vol + '/' + iss`` is fetched, every
    article linked from it is fetched in turn, and the result is written to
    ``<mag>/Volume <vol>/Volume <vol> - Issue <iss>.md`` relative to the
    current working directory (numeric ``vol``/``iss`` are zero-padded to two
    digits in the file name and heading only, not in the folder name).

    Args:
        mag: Name of the top-level output folder (e.g. ``'MS'``).
        vol: Volume identifier as a string.
        iss: Issue identifier as a string (may be non-numeric, e.g. ``'12-part-1'``).

    Returns:
        None. Progress is printed to stdout. Scraping stops at the first
        article whose HTTP status is not 200; articles written before that
        point are kept.

    The function must be run from a directory named ``light-speed-engine``;
    otherwise it prints an error and does nothing. The working directory is
    never changed, so it is the same after the call whether the scrape
    succeeds, stops early, or raises.
    """
    # Guard clause: compare the directory name itself rather than slicing the
    # path at a fixed offset, which only worked for one particular home path.
    if os.path.basename(os.getcwd()) != 'light-speed-engine':
        print('ERORR: Not in the right folder')
        return

    path = mag + '/' + 'Volume ' + vol
    os.makedirs(path, exist_ok=True)

    # get item link list
    main_session = HTMLSession()
    main_request = main_session.get(MS_URL + vol + '/' + iss)
    url_list = _ms_article_urls(main_request.html)

    # check vol, iss format (padding happens after the URL and folder are built)
    if vol.isdigit():
        vol = '{:02d}'.format(int(vol))
    if iss.isdigit():
        iss = '{:02d}'.format(int(iss))

    md_path = os.path.join(path, 'Volume {} - Issue {}.md'.format(vol, iss))
    # The context manager closes the file even if a request or selector fails.
    with open(md_path, 'w', errors='ignore') as md_file:
        _ms_write_issue_header(md_file, main_request.html, vol, iss)

        # get issue
        print('Volume {}, Issue {}: {} articles'.format(vol, iss, len(url_list)))
        for num, url in enumerate(url_list, start=1):
            print('Retriving article #{:02d}'.format(num))

            status, response = get_subrequest(url)
            print('Code: ', status)
            if status != 200:
                print('Error code {}'.format(status))
                break

            # only keep main body
            article = response.html.find('div.article__content', first=True)
            _ms_write_article(md_file, num, url, article)

In [128]:
ms_main('MS',str(44),'12-part-1')

Volume 44, Issue 12-part-1: 9 articles
Retriving article #01
Code:  200
Retriving article #02
Code:  200
Retriving article #03
Code:  200
Retriving article #04
Code:  200
Retriving article #05
Code:  200
Retriving article #06
Code:  200
Retriving article #07
Code:  200
Retriving article #08
Code:  200
Retriving article #09
Code:  200
