# Prototyping for local captura_dou version
---

## Prototyping

In [None]:
import time
import capture_driver as cd
import global_settings as gs

if not gs.local:
    import boto3                                  
    from dynamodb_json import json_util as dyjson

In [None]:
def lambda_handler(event, context):
    """
    AWS Lambda entry point: run one DOU capture using the config stored in DynamoDB.

    Reads the item named 'capture_DOU' from the "configs" table, hands it to
    cd.capture_DOU_driver, and writes the config returned by the driver back
    to the same table.

    `event` and `context` are accepted because Lambda passes them, but they
    are not used. Returns None.
    """
    dynamo = boto3.client('dynamodb')

    # Fetch the stored config and convert it from DynamoDB JSON to a plain object:
    stored_item = dynamo.get_item(TableName="configs",  Key={'name': {'S': 'capture_DOU'}})
    current_config = dyjson.loads(stored_item)['Item']

    # Capture the DOU articles; the driver hands back the config for the next run:
    next_config = cd.capture_DOU_driver(current_config)

    # Persist that config, converted back to DynamoDB JSON:
    dynamo.put_item(TableName="configs", Item=dyjson.dumps(next_config, as_dict=True))

In [None]:
def local_scheduler(config):
    """
    Run the DOU capture driver in an endless loop, sleeping between runs.

    Each iteration prints the current local time, calls
    cd.capture_DOU_driver(config) and uses the config it returns both for
    the next iteration and to read 'sched_interval', the pause between
    runs in minutes. The function never returns.
    """
    while True:
        # Timestamp the start of this run:
        started_at = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        print(started_at)

        # The driver's return value replaces the config for the next round:
        config = cd.capture_DOU_driver(config)

        # 'sched_interval' is in minutes; time.sleep expects seconds:
        wait_minutes = config['sched_interval']
        time.sleep(60 * wait_minutes)

In [None]:
# Start the endless local capture loop (local_scheduler never returns, so this
# cell blocks until the kernel is interrupted).
# NOTE(review): a file path is passed here, while local_scheduler indexes the
# value returned by cd.capture_DOU_driver like a dict -- presumably the driver
# accepts a path to a JSON config; confirm.
local_scheduler('../configs/capture_DOU_test.json')

## Work bench

# Prototype new excerto

In [None]:
def get_excert(paragraph):
    """
    Build a short excerpt from the string `paragraph`, whose parts are
    delimited by the '|' character.

    With two or more parts, returns the first 200 characters of the first
    part and the first 200 characters of the second part, joined by
    '... | ...'. With a single part, returns its first 300 characters.
    Whitespace around the delimiter is kept as part of each piece.
    """
    parts = paragraph.split('|')

    # Only one paragraph available: take a longer slice of it.
    if len(parts) <= 1:
        return parts[0][:300]

    # Otherwise combine the beginnings of the first two paragraphs.
    head, following = parts[0], parts[1]
    return '... | ...'.join((head[:200], following[:200]))
    

def make_resumo(paragraph):
    """
    Create an excerpt that acts as an abstract of a DOU article.

    `paragraph` is a string with all paragraphs of the article, separated
    by ' | '. If the text contains the marker 'resolve: | ', the excerpt is
    taken from what follows the first occurrence of the marker; otherwise
    it is taken from the start of the text. The excerpt itself is produced
    by get_excert.

    '...' is appended unless the excerpt already ends with '.' or '. '.
    An empty excerpt (empty input, or nothing after the marker) is returned
    as '' instead of raising IndexError.

    Returns None when `paragraph` is not a string.
    """
    if not isinstance(paragraph, str):
        return None

    # Look for a marker of the start of a possible resumo:
    marker = 'resolve: | '
    marker_start = paragraph.find(marker)
    # If the marker is present, keep only the paragraphs that follow it:
    if marker_start != -1:
        paragraph = paragraph[marker_start + len(marker):]
    resumo = get_excert(paragraph)

    # Nothing to summarize: do not decorate an empty string with '...'.
    if not resumo:
        return resumo

    # Add ... to the end of the resumo if it is not the end of a phrase:
    if not resumo.endswith(('.', '. ')):
        resumo += '...'
    return resumo
    

### Testing

In [1]:
# Test configuration for a single capture run (passed to gu.get_articles_url below).
config = {
    # Scheduling:
    'sched_interval': 60,

    # Date range and DOU sections to capture:
    'date_format': '%Y-%m-%d',
    'end_date': '2019-07-11',
    'timedelta': 0,
    'secao': ['e'],
    'secao_all': [1, 2, 'e'],
    'last_extra': 0,

    # Local storage of captured articles:
    'storage_path': '../temp/',
    'save_articles': False,

    # Filtering and posting:
    'filter_file': '../filters/all_DOU_filters_2019-07-01.json',
    'post_articles': True,
    'slack_token': '../keys-configs/slack_token.pass',
}

In [2]:
import requests
# This project's functions:
import global_settings as gs
import get_articles_url as gu
import parse_dou_article as pa
import write_article as wa
import filter_articles as fa
import structure_article as sa
import post_to_slack as ps


In [3]:
    # Get list of URLs and filenames (in case one wants to save the articles).
    # gu.get_articles_url also returns a second value, kept here as next_config.
    if gs.debug:
        print("Getting articles' URLs...")
    url_file_list, next_config = gu.get_articles_url(config)
    Nurls = len(url_file_list)
    # 'True or' forces the URL count to be printed regardless of gs.debug
    # (workbench override).
    if True or gs.debug:
        print('# URLs:', Nurls)

Getting articles' URLs...
Starting get_articles_url with config:
{'sched_interval': 60, 'date_format': '%Y-%m-%d', 'end_date': '2019-07-11', 'timedelta': 0, 'secao': ['e'], 'secao_all': [1, 2, 'e'], 'last_extra': 0, 'storage_path': '../temp/', 'save_articles': False, 'filter_file': '../filters/all_DOU_filters_2019-07-01.json', 'post_articles': True, 'slack_token': '../keys-configs/slack_token.pass'}
Reading date range...
Reading selected sections...
Will enter loop over config date and section range:
-- 2019-07-11
   -- se
      Looping over URLs...
# URLs: 9


In [4]:
# Shared HTTP session for downloading articles. Requests to URLs starting with
# 'http://www.in.gov.br' go through an adapter configured with max_retries=3.
session = requests.Session()
session.mount('http://www.in.gov.br', requests.adapters.HTTPAdapter(max_retries=3))


In [8]:
# Take one captured entry and point it at a hand-picked article to test against.
# NOTE: url_file is the same object stored in url_file_list[3], so overriding
# its 'url' below also changes that list entry.
url_file = url_file_list[3]
# Alternative test articles (uncomment one and comment out the active line to switch):
#url_file['url'] = 'http://www.in.gov.br/web/dou/-/decreto-de-19-de-junho-de-2019-167085561'
#url_file['url'] = 'http://www.in.gov.br/web/dou/-/portaria-n-70-de-12-de-agosto-de-2019-210735799'
#url_file['url'] = 'http://www.in.gov.br/web/dou/-/portaria-n-1.480-de-9-de-agosto-de-2019-210283760'
#url_file['url'] = 'http://www.in.gov.br/web/dou/-/despacho-209287540'
#url_file['url'] = 'http://www.in.gov.br/web/dou/-/portaria-n-1.303-de-1-de-julho-de-2019-209010426'
url_file['url'] = 'http://www.in.gov.br/web/dou/-/decreto-n-9.970-de-14-de-agosto-de-2019-210740817'
# Download the article page with `session`, using timeout=5:
response = session.get(url_file['url'], timeout=5)

# Parse the downloaded page into the raw article representation:
raw_article = pa.parse_dou_article(response, url_file['url'])

In [9]:
# Structure the parsed article and show the 'resumo' field it produced:
article = sa.structure_article(raw_article)

print(article['resumo'])

, no uso da atribuição que lhe confere o art. 84, ... | ... , inciso VI, alínea "a", da Constituição, ...


In [10]:
# Display the whole structured article for inspection:
article

{'secao': '1',
 'orgao': 'Atos do Poder Executivo',
 'assina': 'JAIR MESSIAS BOLSONARO | Onyx Lorenzoni',
 'identifica': 'DECRETO Nº 9.970, DE 14 DE AGOSTO DE 2019',
 'cargo': None,
 'pagina': '1',
 'edicao': '157',
 'italico': None,
 'ementa': 'Dispõe sobre o Comitê Federal de Assistência Emergencial.',
 'strong': 'O PRESIDENTE DA REPÚBLICA | caput | D E C R E T A | caput',
 'ato_orgao': None,
 'subtitulo': None,
 'paragraph': ', no uso da atribuição que lhe confere o art. 84, | , inciso VI, alínea "a", da Constituição, | : | Art. 1º Este Decreto dispõe sobre o Comitê Federal de Assistência Emergencial para acolhimento a pessoas em situação de vulnerabilidade decorrente de fluxo migratório provocado por crise humanitária. | Art. 2º O Comitê Federal de Assistência Emergencial é órgão deliberativo, instituído pelo art. 6º da Lei nº 13.684, de 21 junho de 2018, ao qual compete: | I - articular ações, projetos e atividades desenvolvidos com apoio dos Governos federal, estadual, distrital 

## Prototyping assinaPr

In [None]:
# Prototyping the use of 'assinaPr':
def structure_article(article_raw):
    """
    Select, rename and format the relevant fields of a raw DOU article.

    `article_raw` is a list of dicts with the keywords key, value,
    capture_date, url and url_certificado. The relevant keys are hard-coded
    below; each is looked up with sa.get_key_value and stored under its new
    name. Looked-up values may be None, and None values are handled for the
    signature and text fields.

    Returns a dict with the renamed fields plus:
      - 'capture_date', 'url', 'url_certificado': copied from the first
        entry of `article_raw`;
      - 'secao': the text between the first ':' and the first '|' of the
        raw section field, stripped;
      - 'assina': 'assinaPr' and 'assina' joined by ' | ', skipping
        whichever is None (None when both are None);
      - 'alltext': all non-None text fields joined by ' | ';
      - 'resumo': the excerpt produced by sa.make_resumo from 'paragraph'.

    Raises IndexError if `article_raw` is empty. The section parsing raises
    if the raw section field is None or has no ':'.
    """
    relevant_keys = ['secao-dou', 'orgao-dou-data', 'assina', 'identifica', 'cargo', 'secao-dou-data',
                     'edicao-dou-data', 'dou-em', 'ementa', 'dou-strong', 'titulo', 'subtitulo',
                     'dou-paragraph', 'publicado-dou-data', 'assinaPr']
    new_keys      = ['secao', 'orgao', 'assina', 'identifica', 'cargo', 'pagina',
                     'edicao', 'italico', 'ementa', 'strong', 'ato_orgao', 'subtitulo',
                     'paragraph', 'pub_date', 'assinaPr']

    relevant_values = [sa.get_key_value(key, article_raw) for key in relevant_keys]
    struct = dict(zip(new_keys, relevant_values))

    # Join with identifying fields:
    struct['capture_date']    = article_raw[0]['capture_date']
    struct['url']             = article_raw[0]['url']
    struct['url_certificado'] = article_raw[0]['url_certificado']

    # Format selected fields:
    struct['secao'] = struct['secao'].split('|')[0].split(':')[1].strip()
    # Put the 'assinaPr' signature first, then the other signatures. Either
    # one may be missing, so only the ones that are present get joined.
    signatures = [s for s in (struct['assinaPr'], struct['assina']) if s is not None]
    struct['assina'] = ' | '.join(signatures) if signatures else None

    # Create new field (all the text):
    text_fields = [struct['ato_orgao'], struct['subtitulo'], struct['ementa'],
                   struct['strong'], struct['italico'], struct['paragraph']]
    struct['alltext'] = ' | '.join(field for field in text_fields if field is not None)
    # Another new field (a clipping):
    struct['resumo'] = sa.make_resumo(struct['paragraph'])

    return struct