# Prototyping for local captura_dou version
---

## Prototyping

In [None]:
import time
import capture_driver as cd
import global_settings as gs

if not gs.local:
    import boto3                                  
    from dynamodb_json import json_util as dyjson

In [None]:
def lambda_handler(event, context):
    """
    AWS Lambda entry point: run one DOU capture with the config kept in DynamoDB.

    Reads the item whose 'name' is 'capture_DOU' from the "configs" table,
    passes its 'Item' content (decoded by `dyjson.loads`) to
    `cd.capture_DOU_driver`, and writes the config returned by the driver
    back to the same table.

    `event` and `context` are not used, and nothing is returned.
    Relies on `boto3` and `dyjson`, which the import cell only loads when
    `gs.local` is false.
    """
    dynamodb = boto3.client('dynamodb')

    # Fetch the stored config item and decode it:
    stored_item = dynamodb.get_item(TableName="configs", Key={'name': {'S': 'capture_DOU'}})
    current_config = dyjson.loads(stored_item)['Item']

    # Capture the DOU articles; the driver returns an updated config:
    new_config = cd.capture_DOU_driver(current_config)

    # Write the updated config back to the table:
    dynamodb.put_item(
        TableName="configs",
        Item=dyjson.dumps(new_config, as_dict=True),
    )

In [None]:
def local_scheduler(config):
    """
    Run the DOU capture driver repeatedly in an endless loop.

    Each iteration prints the current local time, calls
    `cd.capture_DOU_driver` with the current config, and then sleeps for
    `sched_interval` minutes, read from the config the driver returned.
    That returned config is the input of the next iteration.

    The loop has no exit condition, so this function never returns.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    current_config = config
    while True:
        print(time.strftime(timestamp_format, time.localtime()))
        current_config = cd.capture_DOU_driver(current_config)
        # sched_interval is in minutes; time.sleep expects seconds.
        pause_minutes = current_config['sched_interval']
        time.sleep(60 * pause_minutes)

In [None]:
# Start the scheduler with the test config. local_scheduler loops forever,
# so this cell does not finish on its own.
# NOTE(review): a string is passed here, while local_scheduler indexes the
# value returned by cd.capture_DOU_driver; presumably the driver accepts a
# file path and returns the loaded config dict. TODO confirm.
local_scheduler('../configs/capture_DOU_test.json')

## Work bench

### Prototype new excerto (excerpt)

In [None]:
def get_excert(paragraph):
    """
    Build a short excerpt from the string `paragraph`.

    The text is split on '|' (multiple paragraphs are separated by ' | ').
    With two or more pieces, the first 200 characters of the first piece and
    the first 200 characters of the second piece are joined by '... | ...'.
    With a single piece, its first 300 characters are returned.
    """
    # Only the first two pieces are ever used, so two splits are enough.
    pieces = paragraph.split('|', 2)
    if len(pieces) == 1:
        return pieces[0][:300]
    first, second = pieces[0], pieces[1]
    return '... | ...'.join((first[:200], second[:200]))
    

def make_resumo(paragraph):
    """
    Create an excerpt that acts as an abstract ("resumo") of a DOU article.

    `paragraph` is a string with all paragraphs of the article, separated
    by ' | '. If it contains the marker 'resolve: | ', the excerpt is taken
    from the text right after the marker's first occurrence; otherwise it is
    taken from the start of the text. The excerpt itself is built by
    `get_excert`.

    '...' is appended unless the excerpt already ends with '.' or '. '.
    An empty excerpt (empty input, or nothing after the marker) is returned
    unchanged instead of raising IndexError.

    Returns the excerpt string, or None when `paragraph` is not a string.
    """
    if not isinstance(paragraph, str):
        return None

    # Look for a marker of start of a possible resumo; if present, keep only
    # the text that follows it:
    marker = 'resolve: | '
    marker_start = paragraph.find(marker)
    if marker_start != -1:
        paragraph = paragraph[marker_start + len(marker):]
    resumo = get_excert(paragraph)

    # Add ... to end of resumo if not the end of a phrase. The truthiness
    # check guards the empty excerpt, which has no last character to inspect.
    if resumo and not resumo.endswith(('.', '. ')):
        resumo = resumo + '...'
    return resumo
    

### Testing

In [1]:
# Config used by the test cells of this workbench.
config = dict(
    sched_interval=60,
    date_format="%Y-%m-%d",
    end_date="2019-07-11",
    timedelta=0,
    secao=["e"],
    secao_all=[1, 2, "e"],
    last_extra=0,
    storage_path="../temp/",
    save_articles=False,
    filter_file="../filters/all_DOU_filters_2019-07-01.json",
    post_articles=True,
    slack_token="../keys-configs/slack_token.pass",
)

In [2]:
import requests
# This project's functions:
import global_settings as gs
import get_articles_url as gu
import parse_dou_article as pa
import write_article as wa
import filter_articles as fa
import structure_article as sa
import post_to_slack as ps


In [3]:
    # Collect the URL/filename entries for the articles (the filenames are
    # there in case one wants to save the articles):
    if gs.debug:
        print("Getting articles' URLs...")
    url_file_list, next_config = gu.get_articles_url(config)
    Nurls = len(url_file_list)
    # The count is reported regardless of gs.debug:
    print('# URLs:', Nurls)

Getting articles' URLs...
Starting get_articles_url with config:
{'sched_interval': 60, 'date_format': '%Y-%m-%d', 'end_date': '2019-07-11', 'timedelta': 0, 'secao': ['e'], 'secao_all': [1, 2, 'e'], 'last_extra': 0, 'storage_path': '../temp/', 'save_articles': False, 'filter_file': '../filters/all_DOU_filters_2019-07-01.json', 'post_articles': True, 'slack_token': '../keys-configs/slack_token.pass'}
Reading date range...
Reading selected sections...
Will enter loop over config date and section range:
-- 2019-07-11
   -- se
      Looping over URLs...
# URLs: 9


In [4]:
# One shared session; requests to URLs starting with http://www.in.gov.br
# go through an adapter configured with max_retries=3:
session = requests.Session()
session.mount(
    prefix='http://www.in.gov.br',
    adapter=requests.adapters.HTTPAdapter(max_retries=3),
)


In [19]:
# Pick one captured entry to inspect (index 6 looks like an arbitrary test
# choice; TODO confirm):
url_file = url_file_list[6]
# Uncomment one of the lines below to test a specific article instead.
# Note that this overwrites the 'url' of the entry inside url_file_list.
#url_file['url'] = 'http://www.in.gov.br/web/dou/-/decreto-de-19-de-junho-de-2019-167085561'
#url_file['url'] = 'http://www.in.gov.br/web/dou/-/portaria-n-70-de-12-de-agosto-de-2019-210735799'
#url_file['url'] = 'http://www.in.gov.br/web/dou/-/portaria-n-1.480-de-9-de-agosto-de-2019-210283760'
#url_file['url'] = 'http://www.in.gov.br/web/dou/-/despacho-209287540'
#url_file['url'] = 'http://www.in.gov.br/web/dou/-/portaria-n-1.303-de-1-de-julho-de-2019-209010426'
# Download the article page through the shared session (5-second timeout):
response = session.get(url_file['url'], timeout=5)

# Parse the HTTP response into the raw article:
raw_article = pa.parse_dou_article(response, url_file['url'])

# Structure the raw article; the result is indexed by 'resumo' below:
article = sa.structure_article(raw_article)

# Show the generated abstract for visual inspection:
print(article['resumo'])

Art. 1º Fica habilitado o Município descrito no anexo a esta Portaria a receber recursos referentes ao incremento temporário do Piso da Atenção Básica (PAB). ... | ... Art. 2º Os recursos tratados nesta Portaria referem-se à aplicação das emendas parlamentares para incremento temporário do Piso da Atenção Básica (PAB), observando o disposto no Capítulo II da Portar...
