In [37]:
import sys, os
sys.path.append('..')

import datetime as dt
from capture_dou.inlabs_driver import InLabsDriver

#tipo_dou="DO1 DO2 DO3 DO1E DO2E DO3E" # Seções separadas por espaço
secoes = "DO2"

In [2]:
def brasilia_day():
    """
    No matter where the code is ran, return UTC-3 day
    (Brasilia local day, no daylight savings)
    """
    return (dt.datetime.utcnow() + dt.timedelta(hours=-3)).replace(hour=0, minute=0, second=0, microsecond=0)

In [3]:
import requests
import io
import zipfile
from lxml import etree

def parse_zipped_response(response):
    """
    Download a ZIP file and extract its contents in memory
    yields (filename, file-like object) pairs
    """

    # Extract the contents of the .zip file in memory
    zip_file = io.BytesIO(response.content)
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        for xml_file in zip_ref.namelist():
            with zip_ref.open(xml_file) as f:
                # Read the xml file from memory
                xml_data = f.read()
            # Parse the XML data
            root = etree.fromstring(xml_data)
            # Extract and parse the information you need from the XML data

            return root

## PARSE_DOU_ARTICLE

In [11]:
from datetime import datetime
from collections import defaultdict
import re

def branch_text(branch):
    """
    Takes and lxml tree element 'branch' and returns its text, 
    joined to its children's tails (i.e. the content that follows 
    childrens of 'branch').
    """
    texts = list(filter(lambda s: s != None, [branch.text] + [child.tail for child in branch]))
    if len(texts) == 0:
        return None
    text  = ' | '.join(texts)
    return text


def add_to_data(branch, data, key):
    """
    Given a dict 'data' and a key, add the text (see branch_text function) 
    found in a branch to its value.
    """
    if key in data:
        if data[key] is None:
            data[key] = branch_text(branch)
        else:
            data[key] = data[key] + ' | %s' % branch_text(branch)            
    else:
        data[key] = branch_text(branch)        
    return data


def recurse_over_nodes(tree, parent_key, data):
    """
    Recursevely gets the text of the xml leafs and saves
    its classes and keys and text as values
    
    input: 
        tree: lxml.etree._Element
        parent_key: lxml.etree._Element
        data: dict
    return: dict
    """            
    for branch in tree:
        key = branch.attrib.get('class') or branch.attrib.get('id')
        
        if list(branch):
            if parent_key:
                key = '%s_%s' % (parent_key, key)    
            add_to_data(branch, data, key) 
            data = recurse_over_nodes(branch, key, data)
        
        else:            
            if parent_key:
                key = '%s_%s' % (parent_key, key)            
            add_to_data(branch, data, key)
    
    return data

def filter_keys(data):
    """
    Filter keys paths to get only last class from html
    
    input:
        data: dict
    return: dict
    """

    final = defaultdict(lambda: '')

    for k, v in data.items():
        if v is not None:            
            k_new = k.split('_')[-1]
            final[k_new] =  ' | '.join([final[k_new], v]) if len(final[k_new]) > 0 else v
            
    return final


def filter_values(data):
    """
    Filter values that do not have letters or numbers
    
    input:
        data: dict
    return: dict
    """    
    final = {}
    
    for k, v in data.items():        
        if re.search('[a-zA-Z0-9]', v):        
            final[k] = v
            
    return final    


def decode(data, encoding= 'iso-8859-1', decoding='utf8'):
    """
    Change enconding from string with secure error handling
    
    input:
        data: dict
        encoding: string
        decoding: string
    return: dict
    """    
    final = {}
    
    for k, v in data.items():        
        try:
            final[k] = v.encode('iso-8859-1').decode('utf8')
        except Exception as e:
            print("Error", e)
            final[k] = v
    
    return final

In [65]:
def data_schema(key, value, url, url_certificado):
    """
    Final data schema
    
    input:
        key: string
        value: string
        url: string
        url_certificado: string
    return: dict
    """    
    return {
        "key": key,
        "value": value,
        "url": url,
        "capture_date": datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S'),
        "url_certificado": url_certificado
    }


def get_url_certificado(article):
    """
    Gets the certified url in the xml
    
    input: 
        article: lxml.etree._Element
    return: string
    """
    return article.xpath('//article/@pdfPage')[0]

def get_data(article):
    """
    Get relevant data from xml. It recursevely gets leaf text from xml
    and saves theirs classes as keys. 
    It also creates an item in dict's key 'full-text' with all text 
    in the xml, without tags.
    
    input: 
        article: lxml.etree._Element
    return: dict
    """
    data = recurse_over_nodes(article, None, {})

    # filtra None e melhora keys
    data = filter_keys(data)
    data = filter_values(data)
    data = {k: v for k,v in data.items() if len(k) != 0}

    # Include full-text:
    data['fulltext'] = article.text

    return data


def structure_data(data, url, article):
    """
    Structures html parsed data to list of dicts 
    ready to be processed by http-request Lambda
    Function.
    It adds the capture date, url, and 
    certified url
    
    input: 
        data: dict
        url: string
        artigo: lxml.html.HtmlElement
    return: list of dict
    """    
    url_certificado = get_url_certificado(article)
    
    final = []
    for key, value in data.items():        
        final.append(data_schema(key, value, url, url_certificado))
        
    return final
        

def parse_dou_article(response, url=''):
    """
    Gets an HTTP request response for a DOU article's URL and that url 
    and parse the relevant fields to a list of dicts. Each dict has the 
    keys: 
    * key             -- an html tag class identifying the field;
    * value           -- the respective value (text) in that field;
    * url             -- The original article URL;
    * capture_date    -- The date when capture occured;
    * url_certificado -- The link to the certified version of the article.
    """
    data    = get_data(response)    
    data    = structure_data(data, url, response)
    
    return data

In [63]:
etree.tostring(data, pretty_print=True, encoding='unicode', with_tail=True)

'<xml>\n  <article id="30079829" name="Portaria 256.2022 dispensa FlAvi" idOficio="9288474" pubName="DO2" artType="Portaria" pubDate="13/01/2023" artClass="00043:00012:00013:00000:00000:00000:00000:00000:00000:00000:00017:00000" artCategory="Ministério Público da União/Ministério Público do Trabalho/Procuradoria Regional do Trabalho da 12ª Região" artSize="12" artNotes="" numberPage="46" pdfPage="http://pesquisa.in.gov.br/imprensa/jsp/visualiza/index.jsp?data=13/01/2023&amp;jornal=529&amp;pagina=46" editionNumber="10" highlightType="" highlightPriority="" highlight="" highlightimage="" highlightimagename="" idMateria="20165078">\n  <body>\n    <Identifica> Portaria Nº 256, de 13 de dezembro de 2022</Identifica>\n    <Data></Data>\n    <Ementa/>\n    <Titulo/>\n    <SubTitulo/>\n    <Texto>&lt;p class="identifica"&gt;Portaria Nº 256, de 13 de dezembro de 2022&lt;/p&gt;&lt;p&gt;PGEA 20.02.1200.0001047/2022-18&lt;/p&gt;&lt;p&gt;O Procurador-Chefe da Procuradoria Regional do Trabalho da 12

In [82]:
tree = data
parent_key = None
data2 = {}
for branch in tree:
    print(branch)
    key = branch.attrib.get('class') or branch.attrib.get('id')
    if list(branch):
        print(key)
        if parent_key:
            key = '%s_%s' % (parent_key, key)    
        add_to_data(branch, data2, key) 
        data2 = recurse_over_nodes(branch, key, data2)
    
    else:            
        if parent_key:
            key = '%s_%s' % (parent_key, key)            
        add_to_data(branch, data2, key)


<Element article at 0x7f0b1bf1f900>
30079829


In [76]:
data2

{'30079829': '\n   | \n   | \n',
 '30079829_None': '\n     | \n     | \n     | \n     | \n     | \n     | \n   | None',
 '30079829_None_None': ' Portaria Nº 256, de 13 de dezembro de 2022 |  | None | None | None | <p class="identifica">Portaria Nº 256, de 13 de dezembro de 2022</p><p>PGEA 20.02.1200.0001047/2022-18</p><p>O Procurador-Chefe da Procuradoria Regional do Trabalho da 12ª Região, no uso de suas atribuições institucionais e considerando a Portaria PGT nº 1728, de 02 de outubro de 2017, resolve:</p><p>Art. 1º Dispensar, a servidora Flávia Carolina Postalli Rodrigues Alloy, matrícula 6007731-X, do encargo de substituta eventual da Chefe da Assessoria Jurídica do 14º Ofício Geral da PRT 12ª Região, código CC-02.</p><p>Art. 2º Designar a servidora Rosmari Rudolf Mezzomo, matrícula 6002253-1 para o encargo de substituta eventual do Chefe da Assessoria Jurídica do 14º Ofício Geral da PRT 12ª Região, código CC-02.</p><p class="assina">MARCELO GOSS NEVES</p><p></p>'}

## FINAL CODE

In [38]:
# Inicialização do driver:
driver = InLabsDriver()
driver.login()

# Montagem da URL:
do_date_format = '%Y-%m-%d'
# Transforms date to DOU format:
date_string    = brasilia_day().strftime(do_date_format)

for dou_secao in secoes.split(' '):
    file_url = driver.url_download + date_string + "&dl=" + date_string + "-" + dou_secao + ".zip"
    file_header = {'Cookie': 'inlabs_session_cookie=' + driver.cookie, 'origem': '736372697074'}
    file_response = driver.session.request("GET", file_url, headers = file_header)
    if file_response.status_code == 200:
        data = parse_zipped_response(file_response)
        del file_response

    elif file_response.status_code == 404:
        print("File not found: %s" % (date_string + "-" + dou_secao + ".zip"))

