In [1]:
from urllib.request import urlopen
from html.parser import HTMLParser
import os
import wget

In [2]:
class MyHTMLParser(HTMLParser):
    """Collect the text of ``<a>`` elements that satisfies a predicate.

    Parameters
    ----------
    predicate : callable
        Called with the whitespace-stripped text found inside an anchor; the
        text is kept when the call returns a truthy value.
    """

    def __init__(self, predicate):
        super().__init__()
        # Accepted anchor texts (stripped), in document order.
        self.journals = []
        # True only while the most recently opened, still unclosed tag is <a>.
        self.state = False
        self.predicate = predicate

    def handle_starttag(self, tag, attrs):
        """Enter anchor mode on ``<a>``; any other opening tag leaves it."""
        self.state = tag == 'a'

    def handle_endtag(self, tag):
        """Leave anchor mode when the anchor is closed.

        Without this reset, text that follows ``</a>`` would still be treated
        as link text until the next opening tag is seen.
        """
        if tag == 'a':
            self.state = False

    def handle_data(self, data):
        """Store the stripped anchor text when the predicate accepts it."""
        text = data.strip()
        if self.state and text and self.predicate(text):
            # Keep the stripped form: it is the value the predicate approved,
            # and it carries no stray whitespace when joined onto a base URL.
            self.journals.append(text)

class PDFHTMLParser(HTMLParser):
    """HTML parser that keeps every text node accepted by ``predicate``.

    The predicate receives the whitespace-stripped text of the node; an
    accepted node is stored in ``journals`` exactly as the parser delivered
    it, i.e. without stripping.
    """

    def __init__(self, predicate):
        super().__init__()
        self.predicate = predicate
        # Accepted text nodes, in document order.
        self.journals = []

    def handle_data(self, data):
        """Record ``data`` when its stripped form passes the predicate."""
        accepted = self.predicate(data.strip())
        if not accepted:
            return
        self.journals += [data]

# Выгрузка всех журналов

In [3]:
# Index page whose link texts are scanned for XML file names.
url_root = "https://socionet.ru/~cyrcitec/newr-nbr/nberwo/"
# Use the response as a context manager so the connection is closed as soon
# as the body has been read instead of being left to the garbage collector.
with urlopen(url_root) as response:
    html_root = response.read().decode()

# Keep the text of every link whose text ends in '.xml'.
journals_catalogue = MyHTMLParser(lambda x: x.endswith('.xml'))
journals_catalogue.feed(html_root)

In [4]:
# Base URL that a collected XML file name is appended to when an individual
# record is fetched (note: a different host and scheme than url_root).
xml_root = "http://no-xml.socionet.ru/~cyrcitec/newr-nbr/nberwo/"

In [5]:
# Number of link texts ending in '.xml' that were collected from the index page.
len(journals_catalogue.journals)

23173

In [6]:
from lxml import etree

In [7]:
def parseXML(xml, stub_max_newlines=17):
    """
    Extract the handle, PDF link and JSON link from a record's XML.

    Parameters
    ----------
    xml : bytes
        Raw XML document, as read from the HTTP response.
    stub_max_newlines : int, optional
        A document containing at most this many newline characters is treated
        as empty and is not parsed. Defaults to 17, the cut-off this notebook
        has always used.

    Returns
    -------
    tuple
        ``(handle, pdf_link, json_link)``. An item is ``None`` when the
        corresponding attribute (``handle``, ``data``, ``rich``) was not found.

    Raises
    ------
    Whatever ``etree.fromstring`` raises when a document above the stub
    threshold is not well-formed XML.
    """
    handle, pdf_link, json_link = None, None, None

    # Count newline bytes directly: for UTF-8 input this equals the count on
    # the decoded text, and it cannot fail on bytes that are not valid UTF-8.
    if xml.count(b'\n') <= stub_max_newlines:
        return handle, pdf_link, json_link
    root = etree.fromstring(xml)

    # Iterate elements directly; getchildren() is deprecated in lxml.
    for appt in root:
        found_handle = appt.attrib.get('handle')
        # Only overwrite with real values, so a later element that lacks the
        # attribute cannot reset an already-found result to None.
        if found_handle is not None:
            handle = found_handle
        for elem in appt:
            if not elem.text:
                continue
            rich = elem.attrib.get('rich')
            if rich is not None:
                json_link = rich
            for t in elem:
                data = t.attrib.get('data')
                if data is not None:
                    pdf_link = data
    return handle, pdf_link, json_link

In [8]:
# Try the parser on the fourth collected file name; the response is closed as
# soon as its body has been read.
with urlopen(''.join([xml_root, journals_catalogue.journals[3]])) as response:
    sample_record = parseXML(response.read())
sample_record

('repec:nbr:nberwo:0005',
 'http://no-xml.socionet.ru/~cyrcitec/pdf-cache/www.nber.org/papers/w0005.pdf',
 'http://no-xml.socionet.ru/~cyrcitec/j-nbr/nberwo/0005.json')

# Из каждого журнала выбираем статью (например, первую) и ищем для нее PDF

In [9]:
# Create the output directory. An already existing directory is fine, but any
# other failure (e.g. missing permissions) is reported instead of being
# silently swallowed, since every later download would otherwise fail quietly.
os.makedirs('experiment1', exist_ok=True)

In [10]:
# Leading part of the record links. Its length is used to cut this prefix off
# a JSON link so that the remainder can be passed to jfmt.cgi as `file=`.
url_prefix = "http://no-xml.socionet.ru/~cyrcitec/"
url_prefix_len = len(url_prefix)

In [26]:
import numpy as np
# Seed NumPy's global random generator so that random draws are repeatable.
# NOTE(review): no NumPy call is visible in this part of the notebook, and this
# cell's execution count is out of sequence with its neighbours — confirm the
# cell is still needed and that the notebook runs cleanly from top to bottom.
np.random.seed(42)

In [12]:
# XML file names processed by the download experiment, built from their numbers.
to_test = [f'{paper_number}.xml' for paper_number in (
    22986, 20867, 15174, 21592, 9413, 7798, 8918,
    7131, 8269, 19862, 19035, 14176, 19979,
)]

In [13]:
# For every selected XML record: parse it, then download the PDF and the
# formatted JSON text into experiment1/<handle>/.
for journal in to_test:
    print(journal)
    # Fetch and parse the record. A failure skips only this entry; catching
    # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
    try:
        with urlopen(''.join([xml_root, journal])) as response:
            journal_id, journal_link_pdf, journal_link_json = parseXML(response.read())
    except Exception as error:
        print(f'  skipped: could not load or parse the XML ({error!r})')
        continue
    print(journal_id, journal_link_pdf, journal_link_json)
    if (journal_id is None) or (journal_link_pdf is None) or (journal_link_json is None):
        continue

    target_dir = f'experiment1/{journal_id}'
    try:
        # Creates missing parents too and tolerates an existing directory.
        os.makedirs(target_dir, exist_ok=True)
    except OSError as error:
        print(f'  skipped: could not create {target_dir} ({error!r})')
        continue

    downloads = (
        (journal_link_pdf, f'{target_dir}/{journal_id}:orig.pdf'),
        # http://no-xml.socionet.ru/~cyrcitec/jfmt.cgi?file=j-nbr/nberwo/0011.json
        (f"{url_prefix}jfmt.cgi?file={journal_link_json[url_prefix_len:]}",
         f'{target_dir}/{journal_id}:orig.txt'),
    )
    for link, destination in downloads:
        # Downloads stay best-effort, but a failure is now reported.
        try:
            wget.download(link, out=destination)
        except Exception as error:
            print(f'  download failed: {link} ({error!r})')
    

22986.xml
repec:nbr:nberwo:22986 http://no-xml.socionet.ru/~cyrcitec/pdf-cache/www.nber.org/papers/w22986.pdf http://no-xml.socionet.ru/~cyrcitec/j-nbr/nberwo/22986.json
20867.xml
repec:nbr:nberwo:20867 http://no-xml.socionet.ru/~cyrcitec/pdf-cache/www.nber.org/papers/w20867.pdf http://no-xml.socionet.ru/~cyrcitec/j-nbr/nberwo/20867.json
15174.xml
repec:nbr:nberwo:15174 http://no-xml.socionet.ru/~cyrcitec/pdf-cache/www.nber.org/papers/w15174.pdf http://no-xml.socionet.ru/~cyrcitec/j-nbr/nberwo/15174.json
21592.xml
repec:nbr:nberwo:21592 http://no-xml.socionet.ru/~cyrcitec/pdf-cache/www.nber.org/papers/w21592.pdf http://no-xml.socionet.ru/~cyrcitec/j-nbr/nberwo/21592.json
9413.xml
repec:nbr:nberwo:9413 http://no-xml.socionet.ru/~cyrcitec/pdf-cache/www.nber.org/papers/w9413.pdf http://no-xml.socionet.ru/~cyrcitec/j-nbr/nberwo/9413.json
7798.xml
None None None
8918.xml
repec:nbr:nberwo:8918 http://no-xml.socionet.ru/~cyrcitec/pdf-cache/www.nber.org/papers/w8918.pdf http://no-xml.socionet.