In [1]:
import json
import pandas as pd

In [2]:
def get_file_local(file_path, encoding='utf-8'):
    """
    Retrieve content of file stored locally.

    Input
    -----

    file_path : str
        The full path to the file

    encoding : str (default 'utf-8')
        Text encoding used to decode the file. It is set explicitly so
        that accented characters are decoded the same way on every
        platform instead of depending on the locale's default encoding.

    Returns
    -------

    A string with the file's content.
    """

    with open(file_path, 'r', encoding=encoding) as file:
        content = file.read()
    return content



def get_field(key_value_pair_list, key):
    """
    Given a list of key-value pairs (dicts with keys 'key' and 'value'), 
    find the entry that has the provided `key` and return its value.
    
    If no `key` is found, return None.
    
    It assumes that each `key` only appears once in `key_value_pair_list`,
    so the first appearance is returned. Entries that have no 'key' field
    are skipped instead of raising a KeyError.
    """
    for entry in key_value_pair_list:
        # Stop at the first match: later entries are never needed.
        if 'key' in entry and entry['key'] == key:
            return entry['value']

    return None

def keyvalue_to_structure(raw_data, key_list):
    """
    Build a plain dict out of a list of key-value pair dicts.

    For every key in `key_list`, the value paired with that key in
    `raw_data` is looked up with `get_field` and stored under the same
    key in the returned dict.
    """
    return {wanted_key: get_field(raw_data, wanted_key) for wanted_key in key_list}


def load_njson(newline_delimited_json):
    """
    Given a string `newline_delimited_json` in newline-delimited 
    JSON format, load it and return a list with one parsed object
    per non-blank line.

    Blank lines (including the one produced by a trailing newline) and
    Windows line endings are ignored, so a file that ends with a newline
    parses correctly. An empty string yields an empty list.
    """
    # Split on '\n' only: other Unicode line separators may legally appear
    # unescaped inside JSON strings.
    lines = newline_delimited_json.split('\n')
    return [json.loads(line) for line in lines if line.strip()]

def rename_dict_keys(dict_list, old_keys, new_keys):
    """
    Given a list of key-value pair dicts `dict_list`, replace the content
    of each dict's 'key' field: a 'key' equal to an entry of `old_keys`
    is replaced by the entry of `new_keys` at the same position.

    The replacements are applied in order, one pair at a time. Dicts
    without a 'key' field are left unchanged.

    Returns a new list of new dicts; neither `dict_list` nor the dicts
    it contains are modified.
    """
    renamed = []

    for original in dict_list:
        # Copy each dict: copying only the list would still share (and
        # therefore mutate) the caller's dicts.
        entry = original.copy()
        for old_key, new_key in zip(old_keys, new_keys):
            if entry.get('key') == old_key:
                entry['key'] = new_key
        renamed.append(entry)

    return renamed

    
def s3_file_to_dict_list(file, key_list=None, honorary_keys=None):
    """
    Load one newline-delimited JSON file as a list of dicts.

    Despite the function's name, the file is read from the local
    filesystem (through `get_file_local`).
    
    Input
    -----
    
    file : str
        Path to the newline-delimited JSON (nJSON) file to load.
        
    key_list : list of str or ints (default None)
        If a list is provided here, the function assumes the data 
        in the file is in the form of key-value pairs (each nJSON line 
        is a dict containing a field called 'key' and another called 
        'value'). Each pair is then converted into an entry in a dict.
        If `None`, does not process the data.
    
    honorary_keys : list of str or ints (default None)
        If a list is provided here, the function looks for extra 
        keys in the nJSON lines other than 'key' and 'value', 
        and copies them to the dict to return. It assumes such 
        keys have the same values in all nJSON lines, so they are
        taken from the first line; a key missing there (or an empty
        file) yields None. It is only used if `key_list` is not None.
        
    Returns
    -------
    
    structured_data : list of dicts
        The data in the nJSON file parsed into a list of dicts.
        If key_list is provided, the list contains only one entry:
        that with the data from the file.
    """
    # Transform the content into a list of dicts:
    content = get_file_local(file)
    raw_data = load_njson(content)
    data = rename_dict_keys(raw_data, old_keys = [ 'article-body-Identifica'], new_keys = ['identifica'])

    # If it is not a key-value pair, return list of dicts:
    if key_list is None:
        return data
    
    # Else, parse into a dict with selected keys:
    data_dict = keyvalue_to_structure(data, key_list)

    # If there are other entries in the data other than 'key' and 'value', extract them here:
    if honorary_keys is not None:
        # An empty file has no first line to read from; fall back to None values,
        # the same result `get_field` gives for a missing key.
        first_entry = data[0] if data else {}
        for honorary in honorary_keys:
            data_dict[honorary] = first_entry.get(honorary)
        
    return [data_dict]

In [3]:
# Relative paths of the local sample files.
# `novo` and `antigo` are the two files loaded with `s3_file_to_dict_list`;
# `novo_formatado` is not referenced by the cells visible in this notebook.
novo = 'local_files/novo.json'
antigo = 'local_files/antigo.json'
novo_formatado = 'local_files/novo_formatado.json'

In [4]:
# Values of the 'key' field to extract from the key-value lines of a file.
antigo_key_list = ['identifica', 'orgao-dou-data', 'ementa', 'fulltext', 'secao-dou', 'edicao-dou-data', 'secao-dou-data', 'publicado-dou-data', 'assina', 'cargo']
# Extra fields copied from the first line of a file (not key-value pairs).
antigo_honorary_keys = ['url', 'url_certificado', 'capture_date']
# NOTE(review): these two lists are empty and not used by the visible cells — confirm whether they are still needed.
novo_key_list = []
novo_honorary_keys = []
# Final column names, paired BY POSITION with `antigo_key_list + antigo_honorary_keys`
# when the rename mapper is built; both sequences must keep the same length and order.
cols_names = ['identifica', 'orgao', 'ementa', 'fulltext', 'secao', 'edicao', 'pagina', 'data_pub', 'assina', 'cargo', 'url', 'url_certificado', 'capture_date']

In [13]:
# Parse each local file into a one-row DataFrame (with `key_list` given, a single dict is returned per file).
# NOTE(review): the `novo` file is parsed with the `antigo_*` key lists — confirm this is intended.
input_data = pd.DataFrame(s3_file_to_dict_list(novo, key_list=antigo_key_list, honorary_keys=antigo_honorary_keys))
input_antigo = pd.DataFrame(s3_file_to_dict_list(antigo, key_list=antigo_key_list, honorary_keys=antigo_honorary_keys))

In [14]:
# Pair each source key with its final column name by position; keys that
# already carry the final name need no renaming and are left out.
mapper = {
    source_key: column_name
    for source_key, column_name in zip(antigo_key_list + antigo_honorary_keys, cols_names)
    if source_key != column_name
}
input_data = input_data.rename(columns=mapper)
input_antigo = input_antigo.rename(columns=mapper)

In [15]:
# Display the renamed frame built from the `novo` file.
input_data

Unnamed: 0,identifica,orgao,ementa,fulltext,secao,edicao,pagina,data_pub,assina,cargo,url,url_certificado,capture_date
0,"PORTARIA Nº 1, DE 18 DE JANEIRO DE 2023",Ministério da Economia/Banco do Brasil S.A./Di...,,UNIDADE DE SEGURANÇA INSTITUCIONALPORTARIA Nº ...,DO1,16,81,23/01/2023,Luiz Fernando Ferreira Martins,,,http://pesquisa.in.gov.br/imprensa/jsp/visuali...,2023-01-24 00:05:15


In [16]:
# Display the renamed frame built from the `antigo` file.
input_antigo

Unnamed: 0,identifica,orgao,ementa,fulltext,secao,edicao,pagina,data_pub,assina,cargo,url,url_certificado,capture_date
0,ACÓRDÃO DE 11 DE JANEIRO DE 2022,Entidades de Fiscalização do Exercício das Pro...,,Brasão do Brasil Diário Oficial da União Publi...,Seção: 1 | Página:,8,114,12/01/2022,José Albertino Souza,Corregedor,http://www.in.gov.br/web/dou/-/acordao-de-11-d...,http://pesquisa.in.gov.br/imprensa/jsp/visuali...,2022-01-12 06:34:27


# Processamento de dados

In [8]:
# These functions take raw DOU data and process it the same way it was processed in BigQuery SQL.
# The data used to train the model was preprocessed with SQL, so the same steps have to be
# reproduced here. Not the best approach.

import re
import pandas as pd

def clean_text(identifica, ementa, fulltext):
    """
    Given a DOU article titles `identifica` and an abstract `ementa`, 
    remove the first title in `identifica`, `ementa` and the hard-coded
    footnote from the full text of the DOU article `fulltext`.

    `identifica` may hold several titles separated by ' | '; only the
    first one is used. Everything in `fulltext` up to and including that
    title is dropped.

    Returns `fulltext` untouched if `identifica` is None, and None if
    `fulltext` is None.
    """
    # ATTENTION: this code should reproduce the cleaning performed in BigQuery,
    # since its resulting text was used to train the model.
    
    if identifica is None:
        return fulltext

    if fulltext is None:
        return None
    
    # Remove the first title that appears in the article (and everything before it):
    first_identifica = identifica.split(' | ')[0]
    # NOTE(review): when the title is not found, `find` returns -1 and the first
    # len(first_identifica) - 1 characters are dropped anyway. This is kept as is
    # because the output must match the BigQuery preprocessing — confirm that the
    # SQL behaves the same way before changing it.
    text_pos = fulltext.find(first_identifica) + len(first_identifica)
    cleaned  = fulltext[text_pos:]
    
    # Remove the footer:
    cleaned = cleaned.replace('Este conteúdo não substitui o publicado na versão certificada.', '')
    
    if ementa is None:
        return cleaned
    
    # Remove the abstract (ementa):
    return cleaned.replace(ementa, '')


def create_resumo(fulltext):
    """
    Get the first `resumo_length` (hard-coded) characters from `fulltext` 
    that appear after `beginning_marker` (hard-coded) or small variations 
    of it ('resolveu', and an optional ':' right after). If 
    `beginning_marker` is not found, the first `resumo_length` characters 
    of `fulltext` are used instead.

    The result is stripped of leading/trailing whitespace. Returns None
    if `fulltext` is None, and an empty string if nothing follows the
    marker.
    """
    beginning_marker = 'resolve'
    resumo_length    = 500

    if fulltext is None:
        return None
    
    marker_pos = fulltext.find(beginning_marker)
    if marker_pos == -1:
        return fulltext[:resumo_length].strip()

    start = marker_pos + len(beginning_marker)

    # One-character slices are used instead of indexing so that a marker
    # sitting at the very end of the text does not raise an IndexError.
    if fulltext[start:start + 1] == 'u':
        start = start + 1
    if fulltext[start:start + 1] == ':':
        start = start + 1

    return fulltext[start:start + resumo_length].strip()


def get_secao(secao_dou):
    """
    Extract the first single digit in `secao_dou` and return it as 
    an int. The purpose is to parse the DOU section.

    `secao_dou` is normally a str; other values are converted with
    `str()` before searching, so an already-parsed int section is
    returned unchanged. Returns None if `secao_dou` is None or contains
    no digit.
    """
    if secao_dou is None:
        return None

    match = re.search(r'[0-9]', str(secao_dou))
    
    if match is None:
        return None
    
    return int(match[0])


def tipo_edicao_Q(edicao):
    """
    Tell whether the article's edition is ordinary or extra: an edition
    string containing a '-' is labelled 'Extra', any other 'Ordinária'.
    """
    pieces = edicao.split('-')
    if len(pieces) > 1:
        return 'Extra'
    return 'Ordinária'


def prepare_dou_df(input_data):
    """
    Clean and enrich a Pandas DataFrame of DOU articles, modifying it
    in place (nothing is returned).

    Columns 'fulltext', 'secao' and 'data_pub' are overwritten with their
    processed versions; columns 'resumo' and 'tipo_edicao' are added.
    """
    def _clean_row(row):
        # Strip title, abstract and footer from one article's text.
        return clean_text(row['identifica'], row['ementa'], row['fulltext'])

    input_data['fulltext'] = input_data.apply(_clean_row, axis=1)

    # The summary is built from the already cleaned text.
    input_data['resumo'] = input_data['fulltext'].apply(create_resumo)

    input_data['secao'] = input_data['secao'].apply(get_secao)

    # Dates arrive as day/month/year and are stored as ISO year-month-day strings.
    publication_dates = pd.to_datetime(input_data['data_pub'], format='%d/%m/%Y')
    input_data['data_pub'] = publication_dates.dt.strftime('%Y-%m-%d')

    edicao_as_text = input_data['edicao'].astype(str)
    input_data['tipo_edicao'] = edicao_as_text.apply(tipo_edicao_Q)

In [17]:
# Process the `novo` frame in place.
# NOTE(review): this call modifies `input_data` itself, so re-running the cell
# re-applies the cleaning to already processed columns.
prepare_dou_df(input_data)

In [18]:
# Display the processed `novo` frame (now with 'resumo' and 'tipo_edicao').
input_data

Unnamed: 0,identifica,orgao,ementa,fulltext,secao,edicao,pagina,data_pub,assina,cargo,url,url_certificado,capture_date,resumo,tipo_edicao
0,"PORTARIA Nº 1, DE 18 DE JANEIRO DE 2023",Ministério da Economia/Banco do Brasil S.A./Di...,,"RIA Nº 1, DE 18 DE JANEIRO DE 2023O BANCO DO B...",1,16,81,2023-01-23,Luiz Fernando Ferreira Martins,,,http://pesquisa.in.gov.br/imprensa/jsp/visuali...,2023-01-24 00:05:15,"RIA Nº 1, DE 18 DE JANEIRO DE 2023O BANCO DO B...",Ordinária


In [19]:
# Process the `antigo` frame in place, then display it.
prepare_dou_df(input_antigo)
input_antigo

Unnamed: 0,identifica,orgao,ementa,fulltext,secao,edicao,pagina,data_pub,assina,cargo,url,url_certificado,capture_date,resumo,tipo_edicao
0,ACÓRDÃO DE 11 DE JANEIRO DE 2022,Entidades de Fiscalização do Exercício das Pro...,,RECURSO EM PROCESSO ÉTICO-PROFISSIONAL PROCES...,1,8,114,2022-01-12,José Albertino Souza,Corregedor,http://www.in.gov.br/web/dou/-/acordao-de-11-d...,http://pesquisa.in.gov.br/imprensa/jsp/visuali...,2022-01-12 06:34:27,RECURSO EM PROCESSO ÉTICO-PROFISSIONAL PROCESS...,Ordinária


In [20]:
def sel_dou_1(input_data):
    """
    Use hard-coded criteria to pre-select articles from DOU section 1.
    Input and output are Pandas DataFrames.

    A row is kept when all of the following hold:
      * its 'secao' is 1;
      * its lower-cased 'identifica' matches `identifica_regex`
        (rows with a missing 'identifica' are dropped);
      * its lower-cased 'orgao' does not match `veto_orgao_regex`;
      * the first '/'-separated part of its 'orgao' is not listed in
        `veto_orgao_root`.

    Rows with a missing 'orgao' match no veto and are therefore kept.
    The input DataFrame is not modified.
    """
    identifica_regex = '(?:portaria|decreto|resolu|medida provisória|lei )'
    veto_orgao_regex = '(?:universidade|instituto federal|superintendência regional|superintendência estadual|colégio|coordenação de processos migratórios|secretaria de fomento e incentivo à cultura|departamento de radiodifusão comercial)'
    veto_orgao_root  = ['Conselho Nacional do Ministério Público',
                        'Entidades de Fiscalização do Exercício das Profissões Liberais', 
                        'Governo do Estado', 'Ineditoriais', 'Defensoria Pública da União', 
                        'Ministério Público da União', 'Poder Judiciário', 'Prefeituras', 
                        'Tribunal de Contas da União', 'Atos do Poder Judiciário']

    # Get secao 1:
    sel_data = input_data.loc[input_data['secao'] == 1]
    
    # Keep only the selected kinds of acts. `na=False` makes missing titles
    # count as "no match" instead of propagating NaN into the boolean mask.
    title_matches = sel_data['identifica'].str.lower().str.contains(identifica_regex, na=False)
    sel_data = sel_data.loc[title_matches]

    # Drop vetoed bodies (match anywhere in the 'orgao' string):
    orgao_vetoed = sel_data['orgao'].str.lower().str.contains(veto_orgao_regex, na=False)
    sel_data = sel_data.loc[~orgao_vetoed]

    # Drop vetoed top-level bodies (text before the first '/'):
    orgao_root = sel_data['orgao'].str.split('/').str[0]
    sel_data = sel_data.loc[~orgao_root.isin(veto_orgao_root)]
    
    return sel_data

In [None]:


# NOTE(review): this cell has no execution count, and `code` is not defined or
# imported in any cell visible in this notebook — as written it cannot run on a
# fresh kernel. Confirm where the model object is supposed to come from.

# Select relevant data:
# (overwrites `input_data` with the filtered selection)
input_data = sel_dou_1(input_data)

# Predict:
predicted_class = code.predict(input_data)

# Join prediction to data:
# The Series is aligned on the filtered frame's index, one prediction per selected row.
input_data['predicted_rank'] = pd.Series(predicted_class, index=input_data.index)

In [22]:
# Apply the section-1 pre-selection to the `antigo` frame. The stored output is
# empty: the 'ACÓRDÃO ...' title does not match the title filter.
sel_dou_1(input_antigo)

Unnamed: 0,identifica,orgao,ementa,fulltext,secao,edicao,pagina,data_pub,assina,cargo,url,url_certificado,capture_date,resumo,tipo_edicao


In [21]:
# Apply the section-1 pre-selection to the `novo` frame; its single 'PORTARIA' row passes.
sel_dou_1(input_data)

Unnamed: 0,identifica,orgao,ementa,fulltext,secao,edicao,pagina,data_pub,assina,cargo,url,url_certificado,capture_date,resumo,tipo_edicao
0,"PORTARIA Nº 1, DE 18 DE JANEIRO DE 2023",Ministério da Economia/Banco do Brasil S.A./Di...,,"RIA Nº 1, DE 18 DE JANEIRO DE 2023O BANCO DO B...",1,16,81,2023-01-23,Luiz Fernando Ferreira Martins,,,http://pesquisa.in.gov.br/imprensa/jsp/visuali...,2023-01-24 00:05:15,"RIA Nº 1, DE 18 DE JANEIRO DE 2023O BANCO DO B...",Ordinária
