In [1]:
import json
import pandas as pd

In [95]:
def get_file_local(file_path):
    """
    Retrieve content of file stored locally.

    The file is always decoded as UTF-8 instead of the platform's
    default locale encoding, so JSON files containing accented
    characters are read the same way on every machine.

    Input
    -----

    file_path : str
        The full path to the file

    Returns
    -------

    A string with the file's content.

    Raises
    ------

    OSError (e.g. FileNotFoundError) if the file cannot be opened.
    UnicodeDecodeError if the file is not valid UTF-8.
    """

    # Explicit encoding: without it `open` falls back to the locale's
    # preferred encoding, which is not UTF-8 on every platform.
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()



def get_field(key_value_pair_list, key):
    """
    Given a list of key-value pairs (dicts with keys 'key' and 'value'),
    find the entry that has the provided `key` and return its value.

    If no `key` is found, return None.

    It assumes that each `key` only appears once in `key_value_pair_list`,
    so the value of the first appearance is returned.

    Entries that have no 'key' field are skipped rather than raising
    a KeyError. A matching entry that lacks a 'value' field still
    raises KeyError.
    """
    for pair in key_value_pair_list:
        # Membership test first, so an entry without 'key' can never
        # accidentally match (not even when `key` is None).
        if 'key' in pair and pair['key'] == key:
            return pair['value']

    return None

def keyvalue_to_structure(raw_data, key_list):
    """
    Build a flat dict out of a list of key-value pair dicts.

    For every key in `key_list`, look it up in `raw_data` with
    `get_field` and store the result under that key. Keys that
    `get_field` does not find are mapped to None.
    """
    return {wanted: get_field(raw_data, wanted) for wanted in key_list}


def load_njson(newline_delimited_json):
    """
    Given a string `newline_delimited_json` in newline-delimited
    JSON format, load it and return a list with one parsed value
    per non-empty line (dicts, for the usual nJSON records).

    Each line is parsed on its own, so a trailing newline, blank
    lines and Windows ('\\r\\n') line endings are all accepted.
    An empty string yields an empty list.

    Raises json.JSONDecodeError if a non-empty line is not valid JSON.
    """
    records = []
    # Split on '\n' only: raw newlines cannot occur inside a JSON
    # string, whereas `str.splitlines` would also break on characters
    # (e.g. U+2028) that may legally appear unescaped inside one.
    for line in newline_delimited_json.split('\n'):
        line = line.strip()  # also drops the '\r' of CRLF endings
        if line:
            records.append(json.loads(line))
    return records

def rename_dict_keys(dict_list, old_keys, new_keys):
    """
    Given a list of key-value pair dicts `dict_list`, replace the
    name stored in each dict's 'key' field: whenever it equals an
    entry of `old_keys`, it is set to the entry of `new_keys` at the
    same position. The dicts' own keys are not renamed.

    The (old, new) pairs are applied in order, and `zip` stops at the
    shorter of the two lists. Dicts without a 'key' field, or whose
    'key' matches none of `old_keys`, are returned unchanged.

    Returns a new list of new dicts; neither `dict_list` nor the
    dicts inside it are modified.
    """
    renamed = []

    for entry in dict_list:
        # Copy each dict: copying only the outer list would still
        # share (and therefore mutate) the caller's dicts.
        entry = dict(entry)
        for old_key, new_key in zip(old_keys, new_keys):
            if entry.get('key') == old_key:
                entry['key'] = new_key
        renamed.append(entry)
    return renamed

    
def s3_file_to_dict_list(file, key_list=None, honorary_keys=None):
    """
    Load one newline-delimited JSON file as a list of dicts.

    Despite the function's name, the file is read from the local
    file system (through `get_file_local`), not from S3.

    Input
    -----

    file : str
        Path to the local nJSON file.

    key_list : list of str or ints (default None)
        If a list is provided here, the function assumes the data
        in the file is in the form of key-value pairs (each nJSON
        line is a dict containing a field called 'key' and another
        called 'value'). Each pair is then converted into an entry
        in a dict. If `None`, does not process the data.

    honorary_keys : list of str or ints (default None)
        If a list is provided here, the function looks for extra
        keys in the nJSON lines other than 'key' and 'value',
        and copies them to the dict to return. The values are taken
        from the first nJSON line, i.e. it assumes such keys have
        the same values in all lines. It is only used if
        `key_list` is not None.

    Returns
    -------

    structured_data : list of dicts
        The data in the nJSON file parsed into a list of dicts.
        If key_list is provided, the list contains only one entry:
        that with the data from the file. Keys from `key_list` that
        are absent from the file map to None; if the file has no
        lines at all, the honorary keys map to None as well.

    Raises
    ------

    KeyError if the first nJSON line lacks one of `honorary_keys`.
    """
    # Transform the content into a list of dicts:
    content = get_file_local(file)
    raw_data = load_njson(content)
    # The 'article-body-Identifica' key name is normalised to 'identifica':
    data = rename_dict_keys(
        raw_data,
        old_keys=['article-body-Identifica'],
        new_keys=['identifica'],
    )

    # If it is not a key-value pair, return list of dicts:
    if key_list is None:
        return data

    # Else, parse into a dict with selected keys:
    data_dict = keyvalue_to_structure(data, key_list)

    # If there are other entries in the lines other than 'key' and
    # 'value', extract them here (from the first line only):
    if honorary_keys is not None:
        for honorary in honorary_keys:
            # An empty file has no first line to read from; use None,
            # in line with how missing `key_list` entries are reported.
            data_dict[honorary] = data[0][honorary] if data else None

    return [data_dict]

In [82]:
# Paths to the local sample files used in this notebook
# ("novo" = new, "antigo" = old, "novo_formatado" = new, formatted).
novo, antigo, novo_formatado = (
    'local_files/novo.json',
    'local_files/antigo.json',
    'local_files/novo_formatado.json',
)

In [105]:
# Values of the 'key' field to extract from the "antigo" (old) layout.
antigo_key_list = [
    'identifica',
    'orgao-dou-data',
    'ementa',
    'fulltext',
    'secao-dou',
    'edicao-dou-data',
    'secao-dou-data',
    'publicado-dou-data',
    'assina',
    'cargo',
]

# Extra fields copied next to the key-value pairs.
antigo_honorary_keys = [
    'url',
    'url_certificado',
    'capture_date',
]

# Counterparts for the "novo" (new) layout; still empty.
novo_key_list = []
novo_honorary_keys = []

# Output column names, listed in the same order as
# antigo_key_list followed by antigo_honorary_keys.
cols_names = [
    'identifica',
    'orgao',
    'ementa',
    'fulltext',
    'secao',
    'edicao',
    'pagina',
    'data_pub',
    'assina',
    'cargo',
    'url',
    'url_certificado',
    'capture_date',
]

In [112]:
# Parse the `novo` file with the "antigo" key lists and wrap the
# resulting single-record list in a DataFrame.
input_data = pd.DataFrame(
    s3_file_to_dict_list(
        novo,
        key_list=antigo_key_list,
        honorary_keys=antigo_honorary_keys,
    )
)

In [113]:
# Map each source field name to its output column name, position by
# position; names that are already identical are left out of the mapping.
mapper = {
    source_name: column_name
    for source_name, column_name in zip(antigo_key_list + antigo_honorary_keys, cols_names)
    if source_name != column_name
}
input_data = input_data.rename(columns=mapper)

In [114]:
# Bare expression ending the notebook cell: displays the renamed DataFrame.
input_data

Unnamed: 0,identifica,orgao,ementa,fulltext,secao,edicao,pagina,data_pub,assina,cargo,url,url_certificado,capture_date
0,"PORTARIA Nº 1, DE 18 DE JANEIRO DE 2023",Ministério da Economia/Banco do Brasil S.A./Di...,,UNIDADE DE SEGURANÇA INSTITUCIONALPORTARIA Nº ...,DO1,16,81,23/01/2023,Luiz Fernando Ferreira Martins,,,http://pesquisa.in.gov.br/imprensa/jsp/visuali...,2023-01-24 00:05:15


# Processamento de dados