In [10]:
from lambda_function import *

In [11]:
event = {
  "table_name": "capture_urls",
  "key": {
    "name": {
      "S": "senado-proposicoes-lista"
    },
    "capture_type": {
      "S": "historical"
    }
  }
}

In [12]:
# Create the DynamoDB client:
client = boto3.client('dynamodb')

# Select one item from DynamoDB (the capture configuration addressed by `event`):
response = client.get_item(TableName=event['table_name'], 
                            Key=event['key'])

# Read the DynamoDB item (returns a list of dictionaries or a dictionary):
response = dyjson.loads(response)
if debug == True:
    print("dict of dynamo Table:") 
    print(response)


# Generate the URLs and the (destination) filenames:
body = generate_body(response, event)
if debug:
    print('# items to capture (body length):', len(body))
# Rename 'url' key if it is not an url:
body = [adapt_url_key(b) for b in body]


dict of dynamo Table:
{'Item': {'parameters': [{'name': 'year', 'body': {'from': 2019, 'to': 2021}, 'type': 'from_to'}], 'data_path': ['PesquisaBasicaMateria', 'Materias', 'Materia'], 'capture_type': 'historical', 'bucket': 'brutos-publicos', 'exclude_keys': ['AtualizacoesRecentes'], 'key': 'legislativo/senado/v1/proposicoes-lista/', 'url': 'http://legis.senado.leg.br/dadosabertos/materia/pesquisa/lista?ano=%(year)s&dataInicioApresentacao=%(year)s0101', 'headers': {'Accept': 'application/json'}, 'name': 'senado-proposicoes-lista', 'data_type': 'json'}, 'ResponseMetadata': {'RequestId': '6JQ6B28VSTO4JA5MVUE8QJTIHFVV4KQNSO5AEMVJF66Q9ASUAAJG', 'HTTPStatusCode': 200, 'HTTPHeaders': {'server': 'Server', 'date': 'Thu, 11 Feb 2021 20:02:50 GMT', 'content-type': 'application/x-amz-json-1.0', 'content-length': '635', 'connection': 'keep-alive', 'x-amzn-requestid': '6JQ6B28VSTO4JA5MVUE8QJTIHFVV4KQNSO5AEMVJF66Q9ASUAAJG', 'x-amz-crc32': '1376242741'}, 'RetryAttempts': 0}}
{'name': 'year', 'body': 

In [13]:
# Split requests in parallel batches according to config:
n_batches = read_parallel_batches(response)
body_batches = split_parallel_batches(body, n_batches)

# Create the Lambda client:
lambd = boto3.client('lambda')


for body in body_batches:  # If body_batches == [], it skips everything in the loop.
    print('Create dynamo temp table with', len(body), 'entries')

    # Save the information generated above to DynamoDB as a temp table:
    params = create_and_populate_dynamodb_table(body, event)
    if debug == True:
        print('URLs to capture listed in:')
        print(params)

    # Actually perform the capture, with the parameters created by generate_body and 
    # saved by create_and_populate_dynamodb_table.
    # NOTE: the `if False:` guard below disables the invocation in this cell.
    if False:
        if debug:
            print('Invoking http-request...')
        lambd.invoke(
            FunctionName='arn:aws:lambda:us-east-1:085250262607:function:http-request:JustLambda',
            #FunctionName='arn:aws:lambda:us-east-1:085250262607:function:http-request:DEV',
            InvocationType='Event',
            Payload=json.dumps(params))

Create dynamo temp table with 3 entries
URLs to capture listed in:
{'dynamo_table_name': 'temp-capture-senado-proposicoes-lista-historical-2021-02-11-17-01-47', 'order': 2}


# Parametrize API requests

In [17]:
import boto3
from dynamodb_json import json_util as dyjson 
from pyathena import connect
from datetime import timedelta, date, datetime
from collections import defaultdict
import time
import json
import random
import sys
import google.auth
from google.cloud import bigquery
import os
sys.path.insert(0, "external_modules")
import importlib

# Switch for printing messages to log:
debug = True
# Whether this code is run locally or on AWS:
local = True

def query_bigquery(query):
    """
    Run a `query` (str) on Google BigQuery and return the results as
    a list of dictionaries (one dictionary per returned row).

    On every call the access key is downloaded from the 'config-lambda'
    S3 bucket and written to '/tmp/key.json'. When the module-level switch
    `local` is true, GOOGLE_APPLICATION_CREDENTIALS is pointed at that file.
    """
    key_path = '/tmp/key.json'

    if local:
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = key_path

    # Get key for accessing BigQuery:
    s3 = boto3.client('s3')
    key_object = s3.get_object(
        Bucket='config-lambda',
        Key='layers/google-cloud-storage/gabinete-compartilhado.json')
    # Write through a context manager so the file is flushed and closed
    # before the credentials are loaded from it below.
    with open(key_path, 'w') as key_file:
        key_file.write(key_object['Body'].read().decode('utf-8'))

    # Create credentials with Drive & BigQuery API scopes
    # Both APIs must be enabled for your project before running this code
    credentials, project = google.auth.default(scopes=[
        'https://www.googleapis.com/auth/drive',
        'https://www.googleapis.com/auth/bigquery',
    ])
    bq = bigquery.Client(credentials=credentials, project=project)

    rows = bq.query(
        query,
        # Location must match that of the dataset(s) referenced in the query.
        location="US",
    )  # API request - starts the query

    # Convert every returned row into a plain dict:
    return [dict(row.items()) for row in rows]


def forms_bigquery(par, item, forms):
    """
    Run a query on Google BigQuery and use the results to build a list
    of URLs and (destination) filenames.

    Parameters:
        par:   dict with keys 'query' (a %-template), 'query_config' (the
               values substituted into the template) and 'url_params'
               (names paired, in order, with the values of each result row).
        item:  dict with the 'url' %-template and the capture 'name'.
        forms: list to which the generated entries are appended.

    Returns `forms`. Each appended entry is the result row itself (a dict)
    with the extra keys 'url' and 'filename'.

    Raises Exception if a result row already has a 'url' or 'filename' key.
    """

    # Substitute the input parameters into the query:
    query = par['query'] % par['query_config']

    # Run the query:
    data = query_bigquery(query)

    # LOOP over the rows returned by the query:
    for d in data:

        # Snapshot the row values before 'url' and 'filename' are added to it.
        # A list is needed because dict views cannot be indexed in Python 3.
        values = list(d.values())

        # Create data destination filename:
        if len(d) > 1:
            end_filename = '&'.join('='.join(map(str, pair))
                                    for pair in zip(par['url_params'], values))
        else:
            end_filename = values[0]
        filename = '_'.join(map(str, [item['name'], end_filename])) + '.json'

        # Create source url:
        url = item['url'] % dict(zip(par['url_params'], values))

        if 'url' in d:
            raise Exception("'url' key already exists in data; avoiding its redefinition.")
        if 'filename' in d:
            raise Exception("'filename' key already exists in data; avoiding its redefinition.")
        d['url']      = url
        d['filename'] = filename

        forms.append(d)

    return forms


def forms_athena_query(par, item, forms):
    """
    Run a query on Athena (Amazon's SQL service) and use the results to
    build a list of URLs and (destination) filenames, appended to `forms`.
    """

    # Fetch the AWS security credentials stored on S3:
    s3_client = boto3.client('s3')
    key_object = s3_client.get_object(Bucket='config-lambda', Key='aws_accessKeys.json')
    aws_key = json.loads(key_object['Body'].read().decode('utf-8'))

    # Open a connection to Athena and get a cursor from it:
    connection = connect(aws_access_key_id=aws_key['aws_access_key_id'],
                         aws_secret_access_key=aws_key['aws_secret_access_key'],
                         s3_staging_dir='s3://stagging-random/',
                         region_name='us-east-1')
    cursor = connection.cursor()

    # Substitute the input parameters into the query and run it:
    query = par['query'] % par['query_config']
    rows = cursor.execute(query).fetchall()

    # One form per returned row:
    for row in rows:
        values = list(row)

        # Multi-column rows become 'name=value&name=value'; single-column rows
        # use the bare value:
        if len(row) > 1:
            named_values = zip(par['url_params'], values)
            end_filename = '&'.join('='.join(map(str, pair)) for pair in named_values)
        else:
            end_filename = row[0]

        url = item['url'] % dict(zip(par['url_params'], values))
        filename = '_'.join(map(str, [item['name'], end_filename])) + '.json'
        forms.append({'url': url, 'filename': filename})

    return forms


def forms_from_to(par, item, forms):
    """
    From a URL template and a capture name, create concrete URLs and
    filenames by substituting every integer in the inclusive range
    par['body']['from'] .. par['body']['to'] (e.g. years).

    Dynamodb data structure:
    {
      "body": {
        "from": 1993,
        "to": 2019
      },
      "name": "id",
      "type": "from_to"
    }

    The generated entries are appended to `forms`, which is returned.
    """

    first = par['body']['from']
    last = par['body']['to']

    # 'to' is inclusive, hence the + 1:
    for value in range(first, last + 1):
        url = item['url'] % {par['name']: value}
        filename = '_'.join(map(str, [item['name'], value])) + '.json'
        forms.append({'url': url, 'filename': filename})

    return forms
   
    
def forms_from_external_list(par, item, forms, event):
    """
    Create one URL/filename entry for every element of
    event['external_params']['list'], substituting the element into the
    URL template under the name par['url_param'].

    The entries are appended to `forms`, which is returned.
    """

    external_values = event['external_params']['list']

    for value in external_values:
        url = item['url'] % {par['url_param']: value}
        filename = '_'.join(map(str, [item['name'], value])) + '.json'
        forms.append({'url': url, 'filename': filename})

    return forms


def daterange(start_date, end_date):
    """
    Generator over the dates between 'start_date' and 'end_date', one day
    apart, behaving like 'range': the later of the two dates is never
    yielded.

    NOTE: the inputs may be given in any order. If 'start_date' is after
    'end_date' the two are swapped, so the dates run from 'end_date' up to
    (but excluding) 'start_date'.
    """
    # Put the two limits in chronological order:
    if end_date - start_date < timedelta(0):
        start_date, end_date = end_date, start_date

    n_days = int((end_date - start_date).days)
    day_offset = 0
    while day_offset < n_days:
        yield start_date + timedelta(day_offset)
        day_offset += 1


def forms_date_start_end(par, item, forms):
    """
    From a URL template and a capture name, create concrete URLs and
    filenames, one per day in the requested date range. Each day provides
    a 'start_date' (the day itself) and an 'end_date' (the following day)
    for substitution into the URL, formatted with par['date_format'].

    par['end_date'] may be 'yesterday', 'now' or a date string in
    par['date_format']; par['timedelta'] is the number of days added to it
    to obtain the other limit of the range.

    The entries are appended to `forms`, which is returned.
    """

    # Parse relative or specified capture's end date:
    end_spec = par['end_date']
    if end_spec == 'yesterday':
        end_date = date.today() - timedelta(1)
    elif end_spec == 'now':
        end_date = date.today()
    else:
        end_date = datetime.strptime(end_spec, par['date_format'])

    start_date = end_date + timedelta(par['timedelta'])

    for single_date in daterange(start_date, end_date):
        dates = {'start_date': single_date, 'end_date': single_date + timedelta(1)}

        # The filename carries only the dates that the URL template mentions
        # (start first, then end); with none it is just the capture name:
        url_template = item['url']
        name_parts = [item['name']]
        if 'start_date' in url_template:
            name_parts.append(datetime.strftime(dates['start_date'], '%Y-%m-%d'))
        if 'end_date' in url_template:
            name_parts.append(datetime.strftime(dates['end_date'], '%Y-%m-%d'))
        filename = '_'.join(name_parts) + '.json'

        formatted_dates = {key: datetime.strftime(value, par['date_format'])
                           for key, value in dates.items()}
        forms.append({'url': item['url'] % formatted_dates,
                      'filename': filename})

    return forms


def forms_external_module(par, item, forms):
    """
    Import the module named after the capture (item['name'] with '-'
    replaced by '_') and return the result of its `entrypoint(par)`.

    NOTE: `forms` is not used; the module's return value is returned as is.
    """
    module_name = item['name'].replace('-', '_')
    external_module = importlib.import_module(module_name)

    return external_module.entrypoint(par)


def generate_forms(item, event):
    """
    Create URLs from the capture configuration stored in DynamoDB.

    Parameters:
        item:  the configuration dict; its 'parameters' entry is a list of
               dicts, each with a 'type' that selects how URLs are built.
        event: the triggering event (only used by the 'external_list' type).

    Returns `forms`, a list of dictionaries, each holding a 'url' and a
    (destination) 'filename'.

    NOTE: the 'empty' and 'external_module' types replace whatever was
    accumulated so far instead of appending to it.

    Raises ValueError if a parameter has an unknown 'type'.
    """

    # Get the 'parameters' entry of the DynamoDB item:
    parameters = item['parameters']

    forms = []
    for par in parameters:
        print(par)

        # Check the kind of task and run the appropriate code:
        if par['type'] == 'from_to':
            forms = forms_from_to(par, item, forms)

        elif par['type'] == 'date_start_end':
            forms = forms_date_start_end(par, item, forms)

        elif par['type'] == 'athena_query':
            forms = forms_athena_query(par, item, forms)

        elif par['type'] == 'bigquery':
            forms = forms_bigquery(par, item, forms)

        elif par['type'] == 'external_list':
            forms = forms_from_external_list(par, item, forms, event)

        elif par['type'] == 'empty':
            forms = [{'url': item['url'],
                      'filename': item['name'] + '.json'
                     }]

        elif par['type'] == 'external_module':
            forms = forms_external_module(par['params'], item, forms)

        else:
            # Strings cannot be raised in Python 3; raise a real exception.
            raise ValueError('Parameter type not identified: %r' % (par['type'],))

    return forms
    
    
def generate_body(response, event):
    """
    Generate the list of requests to make: the URLs built from the
    DynamoDB 'response', each combined with the metadata needed to perform
    the request and store its result.

    NOTE: response['Item'] is replaced by a defaultdict that returns None
    for missing keys, and the 'url' and 'filename' keys are popped from
    every generated form.
    """

    # Generate the URLs:
    forms = generate_forms(response['Item'], event)

    # Missing configuration keys (e.g. records_keys) must read as None
    # instead of raising:
    response['Item'] = defaultdict(lambda: None, response['Item'])
    config = response['Item']

    def build_request(form):
        # 'url' and 'filename' come from the form; everything else comes
        # from the DynamoDB configuration (mostly where the data lives).
        request_pars = {
            'url': form.pop('url'),
            'params': {},                  # HTTP GET parameters.
            'headers': config['headers'],  # HTTP GET headers.
            'bucket': config['bucket'],
            'key': config['key'] + form.pop('filename'),
            'data_type': config['data_type'],
            'data_path': config['data_path'],
            'exclude_keys': config['exclude_keys'],
            'records_keys': config['records_keys'],
            'name': config['name'],
            'requests_pars': config['requests_pars'],
        }
        # Whatever is left in the form travels along as auxiliary data:
        request_pars['aux_data'] = form
        return request_pars

    return [build_request(form) for form in forms]
    

def create_dynamo_temp_table(table_name, dynamodb):
    """
    Create the DynamoDB table `table_name` if it does not exist yet.

    Parameters:
        table_name: name of the table (str).
        dynamodb:   a boto3 DynamoDB service resource.

    The table has a single numeric hash key called 'order' and uses
    on-demand (PAY_PER_REQUEST) billing. Returns None.

    Only a "table not found" error triggers the creation; any other error
    raised while checking the table is propagated to the caller.
    """

    try:
        # Reading 'table_status' fails if the table does not exist:
        table = dynamodb.Table(table_name)
        table.table_status

    except dynamodb.meta.client.exceptions.ResourceNotFoundException:
        dynamodb.create_table(
            TableName=table_name,
            AttributeDefinitions=[{
                'AttributeName': 'order',
                'AttributeType': 'N'
            }],
            KeySchema=[{
                'AttributeName': 'order',
                'KeyType': 'HASH'
            }],
            BillingMode='PAY_PER_REQUEST'
        )
    
    
def create_and_populate_dynamodb_table(urls, event):
    """
    Create a temporary DynamoDB table and write to it every dictionary in
    the list `urls` (the request descriptions built by generate_body).

    The table name is built from the capture name and type found in
    `event` plus the current timestamp. Each dictionary in `urls` gets an
    extra 'order' key holding its position in the list.

    Returns a dict with the table name ('dynamo_table_name') and the
    highest 'order' written ('order', i.e. len(urls) - 1).
    """

    dynamodb = boto3.resource('dynamodb')

    # Name of the output table, derived from the capture information:
    capture_name = event['key']['name']['S']
    capture_type = event['key']['capture_type']['S']
    timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    table_name = '-'.join(['temp-capture', capture_name, capture_type, timestamp])

    # Create an empty table, then wait a fixed 60 seconds before using it:
    create_dynamo_temp_table(table_name, dynamodb)
    time.sleep(60)

    # Get a reference to the table:
    table = dynamodb.Table(table_name)

    # Write the dictionaries to the table in batches:
    with table.batch_writer() as batch:
        for order, url in enumerate(urls):
            # Record the position of this dictionary in the list:
            url['order'] = order
            batch.put_item(Item=url)

    # Table name and the number of rows minus one:
    last_order = len(urls) - 1
    return {'dynamo_table_name': table_name, 'order': last_order}


def adapt_url_key(body_entry):
    """
    If the 'url' value of the dict `body_entry` does not start with
    'http' or 'ftp', move it to the key 'identifier' instead.

    PS: `body_entry` is modified in place, and is also returned.
    """
    url_value = body_entry['url']
    looks_like_url = url_value[:4] == 'http' or url_value[:3] == 'ftp'

    if not looks_like_url:
        body_entry['identifier'] = body_entry.pop('url')

    return body_entry
    

def read_parallel_batches(response):
    """
    Return the number of parallel batches in which to split the requests.

    `response` is the result of dynamoDB's get_item after translation from
    dyJSON: a dict whose 'Item' key holds the content of the table's item.
    The number is read from the item's 'parallel_batches' entry; if that
    entry is absent, None, or not greater than 1, the result is 1.
    """

    requested = response['Item'].get('parallel_batches')

    # Absent and None are treated alike; anything up to 1 means "no split":
    if requested is None or requested <= 1:
        return 1

    return requested


def split_parallel_batches(body, n_batches):
    """
    Split the list `body` into consecutive sub-lists, aiming at
    `n_batches` of them.

    Every batch but the first gets round(len(body) / n_batches) items; the
    first gets what is left (never less than zero). Empty batches are
    dropped, so the number of returned sub-lists may differ from
    `n_batches` -- measure the length of the result if it matters.
    """
    n_requests = len(body)

    # Nominal size for every batch; the first one absorbs the remainder:
    sizes = [round(n_requests / n_batches) for _ in range(n_batches)]
    sizes[0] = max(n_requests - sum(sizes[1:]), 0)

    # Walk through `body`, cutting one slice per batch and keeping the
    # non-empty ones:
    batches = []
    start = 0
    for size in sizes:
        stop = start + size
        chunk = body[start:stop]
        if len(chunk) > 0:
            batches.append(chunk)
        start = stop

    return batches
    

def lambda_handler(event, context):
    """
    Build the list of URLs to download, store it in temporary DynamoDB 
    tables (one per parallel batch), and then call lambd.invoke, which 
    actually downloads the content of the URLs.
    
    NOTE: in this version the lambd.invoke call sits under `if False:` and 
    is therefore never executed.
    
    # Example of input in `event`:
    {
      "table_name": "capture_urls",
      "key": {
        "name": {
          "S": "camara-deputados-detalhes"
        },
        "capture_type": {
          "S": "historical"
        }
      }
    }"""
    
    
    print("Starting parametrize-API-requests with event:")
    print(event)

    # Create the DynamoDB client:
    client = boto3.client('dynamodb')

    # Select one item from DynamoDB (the capture configuration):
    response = client.get_item(TableName=event['table_name'], 
                                Key=event['key'])

    # Read the DynamoDB item (returns a list of dictionaries or a dictionary):
    response = dyjson.loads(response)
    if debug == True:
        print("dict of dynamo Table:") 
        print(response)

    # Generate the URLs and the (destination) filenames:
    body = generate_body(response, event)
    # Rename 'url' key if it is not an url:
    body = [adapt_url_key(b) for b in body]

    # Split requests in parallel batches according to config:
    n_batches = read_parallel_batches(response)
    body_batches = split_parallel_batches(body, n_batches)

    # Create the Lambda client:
    lambd = boto3.client('lambda')

    
    for body in body_batches:  # If body_batches == [], it skips everything in the loop.
        print('Create dynamo temp table with', len(body), 'entries')

        # Save the information generated above to DynamoDB as a temp table:
        params = create_and_populate_dynamodb_table(body, event)
        print(params)
        if debug == True:
            print('URLs to capture listed in:')
            print(params)

        # Actually perform the capture, with the parameters created by generate_body and 
        # saved by create_and_populate_dynamodb_table (currently disabled by `if False:`):    
        if False:
            if debug:
                print('Invoking http-request...')
            lambd.invoke(
                FunctionName='arn:aws:lambda:us-east-1:085250262607:function:http-request:JustLambda',
                #FunctionName='arn:aws:lambda:us-east-1:085250262607:function:http-request:DEV',
                InvocationType='Event',
                Payload=json.dumps(params))

In [18]:
item = {
  "bucket": "brutos-publicos",
  "capture_type": "monthly",
  "data_path": [],
  "data_type": "csv",
  "headers": {},
  "key": "legislativo/senado/v1/pessoas/comissionados/",
  "name": "senado-pessoas-comissionados",
  "parameters": [
    {
      "query": "SELECT DATE_FORMAT(CURRENT_DATE, '%(date_format)s') AS date",
      "query_config": {"date_format": "%Y-%m-%d"},
      "type": "athena_query",
      "url_params": ["date"]
    }
  ],
  "requests_pars": {"skip_rows": 1, "sep_row": "\r\n", "sep_col": ";"},
  "url": "http://www.senado.gov.br/transparencia/LAI/secrh/servidores_comissionados_csv.csv"
}

In [19]:
response = dict()
response['Item'] = item

In [20]:
generate_body(response, {})

{'query': "SELECT DATE_FORMAT(CURRENT_DATE, '%(date_format)s') AS date", 'query_config': {'date_format': '%Y-%m-%d'}, 'type': 'athena_query', 'url_params': ['date']}


[{'url': 'http://www.senado.gov.br/transparencia/LAI/secrh/servidores_comissionados_csv.csv',
  'params': {},
  'headers': {},
  'bucket': 'brutos-publicos',
  'key': 'legislativo/camara/scrapping/comissionados/camara-deputados-comissionados_2020-06-29.json',
  'data_type': 'csv',
  'data_path': [],
  'exclude_keys': None,
  'records_keys': None,
  'name': 'camara-deputados-comissionados',
  'requests_pars': {'skip_rows': 1, 'sep_row': '\r\n', 'sep_col': ';'},
  'aux_data': {}}]

### Roda lambda_handler

In [4]:
event = {
  "table_name": "capture_urls",
  "key": {
    "name": {
      "S": "camara-deputados-comissionados"
    },
    "capture_type": {
      "S": "monthly"
    }
  }
}

In [5]:
lambda_handler(event, {})

Starting parametrize-API-requests with event:
{'table_name': 'capture_urls', 'key': {'name': {'S': 'camara-deputados-comissionados'}, 'capture_type': {'S': 'monthly'}}}
dict of dynamo Table:
{'Item': {'parameters': [{'query_config': {}, 'url_params': ['id', 'ano', 'mes'], 'type': 'athena_query', 'query': "SELECT DISTINCT split(uri, '/')[7] as id, YEAR(CURRENT_DATE) AS ano, MONTH(CURRENT_DATE) AS mes FROM camara_v2.deputados WHERE idlegislaturafinal = 56"}], 'data_path': ['dados'], 'capture_type': 'monthly', 'bucket': 'brutos-publicos', 'key': 'legislativo/camara/scrapping/comissionados/', 'url': 'https://www.camara.leg.br/deputados/%(id)s/pessoal-gabinete?ano=%(ano)s', 'headers': {}, 'name': 'camara-deputados-comissionados', 'data_type': 'external_module'}, 'ResponseMetadata': {'RequestId': '9EUFUR5TH3KJ6SDS259TQPQ2PVVV4KQNSO5AEMVJF66Q9ASUAAJG', 'HTTPStatusCode': 200, 'HTTPHeaders': {'server': 'Server', 'date': 'Mon, 15 Jun 2020 15:00:58 GMT', 'content-type': 'application/x-amz-json-1.

### Roda a função lambda até certo ponto

In [2]:
event = {
  "table_name": "capture_urls",
  "key": {
    "name": {
      "S": "camara-deputados-comissionados"
    },
    "capture_type": {
      "S": "monthly"
    }
  }
}

In [3]:
print("Starting parametrize-API-requests with event:")
print(event)

# Create the DynamoDB client:
client = boto3.client('dynamodb')

# Select one item from DynamoDB (the capture configuration addressed by `event`):
response = client.get_item(TableName=event['table_name'], 
                            Key=event['key'])

# Read the DynamoDB item (returns a list of dictionaries or a dictionary):
response = dyjson.loads(response)
if debug == True:
    print("dict of dynamo Table:") 
    print(response)

Starting parametrize-API-requests with event:
{'table_name': 'capture_urls', 'key': {'name': {'S': 'executivo-federal-dou'}, 'capture_type': {'S': 'now'}}}


In [39]:
# Gera as URLs e os filenames (destino):
body = generate_body(response, event)

{'params': {'end_date': '2020-05-07', 'debug': False, 'timedelta': 0, 'url_list': 'douDB_captured_urls', 'date_format': '%Y-%m-%d', 'save_articles': True, 'secao': [1, 2, 3, 'e'], 'secao_all': [1, 2, 3, 'e'], 'use_config': False, 'post_articles': False, 'update_config': False}, 'type': 'external_module'}


In [40]:
# O que será passado ao http-request via item da tabela temp do dynamo:
body[0]

{'url': 'http://www.in.gov.br/web/dou/-/portaria-n-1.278-de-4-de-maio-de-2020-255609815',
 'params': {},
 'headers': {},
 'bucket': 'brutos-publicos',
 'key': 'executivo/federal/dou/2020-05-07_s1_portaria-n-1.278-de-4-de-maio-de-2020-255609815.json',
 'data_type': 'external_module',
 'data_path': [],
 'exclude_keys': None,
 'records_keys': None,
 'name': 'executivo-federal-dou',
 'aux_data': {'url_list': 'douDB_captured_urls'}}

### Temos que passar info da response['Item'] para o t[0]

In [None]:
t = generate_forms(response['Item'], event)

In [None]:
t[0]

#### t[0] é um elemento do return de entrypoint

## Entry point

In [12]:
import requests
from lxml import html
import json
import datetime as dt
import boto3
from dynamodb_json import json_util as dyjson

debug = False

def daterange(start_date, end_date):
    """
    Like python's 'range', but over dates: yields 'start_date' and each
    following day, stopping before 'end_date'.
    NOTE: a step argument is not supported.
    """
    n_days = int((end_date - start_date).days)
    for day_offset in range(n_days):
        yield start_date + dt.timedelta(day_offset)
        
def get_artigos_do(data, secao):
    """
    For a date `data` (datetime) and a DOU section `secao` (str), return
    a list of jsons with the links and other metadata of all the articles
    of that day and section.
    """
    # Hard-coded date format used by the site:
    do_date_format = '%d-%m-%Y'

    # URL example: 'http://www.in.gov.br/leiturajornal?data=13-05-2019&secao=do1'
    url = ''.join(['http://www.in.gov.br/leiturajornal?data=',
                   data.strftime(do_date_format),
                   '&secao=do',
                   str(secao)])

    # Session whose GETs to the site are retried up to 3 times:
    session = requests.Session()
    session.mount('http://www.in.gov.br', requests.adapters.HTTPAdapter(max_retries=3))

    # Download the page listing the articles of that day and section:
    page = session.get(url)
    tree = html.fromstring(page.content)

    # The article list is the JSON text of the element with id "params":
    params_text = tree.xpath('//*[@id="params"]/text()')[0]
    return json.loads(params_text)['jsonArray']

def fix_filename(urlTitle):
    """
    Turn the 'urlTitle' substring used to access a DOU article into
    something that can be used as part of a filename, by replacing each
    '//' with a single '/'.
    """
    return urlTitle.replace('//', '/')

def load_remote_config(table_name, key):
    """
    Load the configuration for the DOU articles' capture from the
    dynamoDB (AWS) table `table_name`, reading the item whose 'name'
    is `key`.
    """

    # Express the key in dynamoDB json:
    dynamo_key = {"name": {"S": key}}

    if debug:
        print('load_remote_config key:', dynamo_key)

    # Read the item from dynamoDB and translate it to plain python objects:
    raw_response = boto3.client('dynamodb').get_item(TableName=table_name, Key=dynamo_key)
    item_response = dyjson.loads(raw_response)

    # The configuration is the content of the item:
    return item_response['Item']


def brasilia_day():
    """
    Return the current day at UTC-3 (Brasilia local day, no daylight
    savings) as a datetime at midnight, no matter where the code is run.
    """
    utc_now = dt.datetime.utcnow()
    brasilia_now = utc_now + dt.timedelta(hours=-3)
    # Drop the time of the day, keeping only the date part:
    return brasilia_now.replace(hour=0, minute=0, second=0, microsecond=0)

def update_config(config, Narticles_in_section):
    """
    Given a config for capturing DOU articles' URLs and a dict
    'Narticles_in_section' stating how many articles were found in each
    requested section, return an updated copy of the config for the next
    request attempt. The input `config` is not modified.

    Required config keys:
    * end_date    > The articles' date to request the URLs;
    * date_format > The format of the date above (e.g. %Y-%m-%d);
    * secao       > Current list of sections to request URLs;
    * secao_all   > All sections one may want to request (does not update);
    * timedelta   > Current implementation requires this to be 0.
    * last_extra  > The extra edition number of the last capture.

    Raises Exception if config['timedelta'] is not 0.
    """

    if config['timedelta'] != 0:
        raise Exception('current implementation only allows timedelta=0.')

    # Work on a copy of the config:
    config2  = dict(config)
    end_date = dt.datetime.strptime(config['end_date'], config['date_format'])

    def next_day_string():
        # The day after end_date, formatted like end_date.
        return (end_date + dt.timedelta(days=1)).strftime(config['date_format'])

    # FUTURE: keep the same config.
    if end_date > brasilia_day():
        return config2

    # PAST: move on to the next day, requesting all sections.
    if end_date < brasilia_day():
        config2['secao'] = config['secao_all']
        config2['end_date'] = next_day_string()
        config2['last_extra'] = 0
        return config2

    # PRESENT DAY: request only the sections that are still missing.
    # PS: Extra ('e') is always kept because it can appear at any time.
    section_keys = [k for k in Narticles_in_section.keys()
                    if Narticles_in_section[k] == 0 or k == 'e']
    config2['secao'] = section_keys

    # Nothing missing: reset the sections list and go to the next day.
    if not section_keys:
        config2['end_date'] = next_day_string()
        config2['secao'] = config['secao_all']

    return config2

def get_articles_url(config):
    """
    Get as input a dict 'config' with keys:
    
    * 'date_format':   format of 'end_date' below, e.g. '%Y-%m-%d';
    * 'end_date':      last date to search for URLs (one can set to 'now' to get the current 
                       day, or to 'yesterday'); 
    * 'secao':         DOU section or list of sections to scan (1, 2, 3, e and/or 1a), or 
                       'all' for [1, 2, 3, 'e', '1a'];
    * 'timedelta':     number of days from end_date to start URL search (is a negative number);
    * 'update_config': whether to compute an updated config for the next request;
    
    and creates a list of DOU articles' URLs to download. 
    
    Returns a tuple (url_file_list, next_config), where url_file_list is a list of dicts 
    with keys 'url' and 'filename', and next_config is either the output of update_config 
    or `config` itself.
    
    NOTE: config['end_date'] is overwritten in place with the resolved date string.
    """
    
    # Hard-coded stuff:
    url_prefix = 'http://www.in.gov.br/web/dou/-/'
    
    # Debug message:
    if debug:
        print("Starting get_articles_url with config:")
        print(config)
    
    # Translate string representing date to datetime:
    if debug:
        print('Reading date range...')
    if config['end_date'] == 'now':
        end_date = brasilia_day()
    elif config['end_date'] == 'yesterday':
        end_date = brasilia_day() + dt.timedelta(days=-1)
    else:
        end_date = dt.datetime.strptime(config['end_date'], config['date_format'])
    # Save it back to config dict:
    config['end_date'] = end_date.strftime(config['date_format'])
    
    timedelta = dt.timedelta(days=config['timedelta'])
    
    # If end_date is in the future, return empty list and same config
    # (wait for the next day):
    # PS: this will skip request URLs even for negative timedelta.
    if end_date > brasilia_day():
        return [], config
        
    # Translate secao config to a list of strings:
    if debug:
        print('Reading selected sections...')    
    secoes = config['secao']
    secoes = [1, 2, 3, 'e', '1a'] if secoes == 'all' else secoes
    secoes = secoes if type(secoes) == list else [secoes]
    secoes = [str(s) for s in secoes]
    
    # LOOP over dates:
    url_file_list = []
    # Article counter per section, starting at zero:
    Narticles_in_section = dict(zip(secoes, [0]*len(secoes)))
    start_date = end_date + timedelta
    if debug:
        print('Will enter loop over config date and section range:')    
    # One extra day is added so that end_date itself is included in the loop:
    for date in daterange(start_date, end_date + dt.timedelta(days=1)):
        if debug:
            print('-- '+date.strftime('%Y-%m-%d'))
        # LOOP over DOU sections:
        for s in secoes:
            if debug:
                print('   -- s'+str(s))
            jsons = get_artigos_do(date, s)
            # The count is overwritten (not accumulated) at each date:
            Narticles_in_section[s] = len(jsons)
            # LOOP over downloaded URL list:
            if debug:
                print('      Looping over URLs...')            
            for j in jsons:
                url      = url_prefix + j['urlTitle']
                filename = date.strftime('%Y-%m-%d') + '_s' + str(s) + '_' + fix_filename(j['urlTitle']) + '.json'
                url_file_list.append({'url':url, 'filename':filename})
        
    if debug:
        print('Narticles_in_section:', Narticles_in_section)
    
    # Only update config if it is going to be saved in Dynamo later on.
    if config['update_config']:
        next_config = update_config(config, Narticles_in_section)
    else:
        next_config = config
        
    return url_file_list, next_config


def entrypoint(params):
    """
    Build the list of articles to download and, when a remote config is
    in use, save the updated config back to DynamoDB.

    Input:   params (dict)
             Must contain the key 'use_config'. When it is truthy, the keys
             'dynamo_table' and 'config_key' are also required: the config
             is loaded from that DynamoDB table and the config returned by
             `get_articles_url` is written back to it. Otherwise `params`
             itself supplies the config and nothing is written.
    Returns: list of dicts with the keys 'url' and 'filename', as built by
             `get_articles_url`.

    The caller's `params` dict is never modified.
    """
    use_remote_config = params['use_config']

    if use_remote_config:
        # Load config from DynamoDB:
        config = load_remote_config(params['dynamo_table'], params['config_key'])
        config['update_config'] = True
    else:
        # Use the directly supplied parameters. Work on a shallow copy so
        # that the top-level keys set here and inside `get_articles_url`
        # do not leak into the caller's dict.
        config = dict(params)
        config['update_config'] = False

    # Get list of articles to download and the config for the next run:
    url_file_list, next_config = get_articles_url(config)

    # Save the updated config to AWS DynamoDB:
    if use_remote_config:
        client = boto3.client('dynamodb')
        client.put_item(TableName=params['dynamo_table'],
                        Item=dyjson.dumps(next_config, as_dict=True))

    return url_file_list

#### É elemento do url_file_list que precisamos adicionar como aux_data

### Fix get_artigos_do

In [53]:
import time

In [85]:
def get_artigos_do(data, secao):
    """
    For one date and one section of the DOU, return the list of article
    metadata (links and other fields) published on that day and section.

    Input:   data  (datetime) - publication date to look up.
             secao (str)      - section identifier; it is appended to 'do'
                                in the URL (e.g. '1' gives 'secao=do1').
    Returns: the 'jsonArray' entry of the JSON stored in the page element
             with id="params" (one item per article).
    Raises:  Exception when that element is not found after all attempts.
    """
    # Hard-coded:
    do_date_format = '%d-%m-%Y'
    n_tries        = 5
    retry_wait     = 2  # seconds to wait between attempts
    xpath          = '//*[@id="params"]/text()'

    # Format the date the way the site expects it:
    data_string = data.strftime(do_date_format)

    # Example URL: 'http://www.in.gov.br/leiturajornal?data=13-05-2019&secao=do1'
    url = 'http://www.in.gov.br/leiturajornal?data=' + data_string + '&secao=do' + str(secao)

    # One session for all attempts; the `with` block closes it on every exit path.
    with requests.Session() as session:
        # Specifies number of retries for GET:
        session.mount('http://www.in.gov.br', requests.adapters.HTTPAdapter(max_retries=3))

        for attempt in range(1, n_tries + 1):
            # NOTE(review): no timeout is passed to GET, so a stalled
            # connection can block indefinitely - confirm whether one is wanted.
            res = session.get(url)

            # Look for the element that holds the articles' JSON:
            # NOTE(review): the recorded sample output shows mojibake
            # ('MinistÃ©rio'), which suggests the bytes are decoded with the
            # wrong charset here - confirm before changing, since callers
            # may already compensate for it.
            tree = html.fromstring(res.content)
            html_matches = tree.xpath(xpath)

            # If found, return it:
            if html_matches:
                return json.loads(html_matches[0])['jsonArray']

            print('Retry scraping article list', attempt)
            # No point in waiting after the last attempt: we are about to give up.
            if attempt < n_tries:
                time.sleep(retry_wait)

    raise Exception("Failed to find article's jsonArray (likely a in.gov.br connection problem).")

In [86]:
# Manual check: list the articles of section '1' published on 2020-05-07.
secao = '1'
data = dt.datetime(2020, 5, 7)

get_artigos_do(data, secao)

Retry scraping article list 1


[{'artType': 'Portaria',
  'urlTitle': 'portaria-n-1.278-de-4-de-maio-de-2020-255609815',
  'numberPage': '178',
  'pubOrder': 'DO100012:00031:00000:00000:00000:00000:00000:00000:00000:00000:00054:00003',
  'hierarchyStr': 'MinistÃ©rio do Desenvolvimento Regional/Secretaria Nacional de ProteÃ§Ã£o e Defesa Civil',
  'hierarchyList': ['MinistÃ©rio do Desenvolvimento Regional',
   'Secretaria Nacional de ProteÃ§Ã£o e Defesa Civil'],
  'title': 'PORTARIA NÂº 1.278, DE 4 DE MAIO DE 2020',
  'content': 'autoriza o empenho e a transferÃªncia de recursos ao estado de minas gerais - mg, para execuÃ§Ã£o de aÃ§Ãµes de defesa civil.',
  'hierarchyLevelSize': 2},
 {'artType': 'DeliberaÃ§Ã£o',
  'urlTitle': 'deliberacao-n-253-de-5-de-maio-de-2020-255610018',
  'numberPage': '200',
  'pubOrder': 'DO100017:00031:00009:00000:00000:00000:00000:00000:00000:00000:00022:00010',
  'hierarchyStr': 'MinistÃ©rio da Infraestrutura/AgÃªncia Nacional de Transportes Terrestres/Diretoria Colegiada',
  'hierarchyLis

<h2>teste</h2>


In [28]:
# Scratch cell, presumably left over from debugging get_artigos_do: `tree` and
# `xpath` are local names inside that function and are not defined at notebook
# level by the cells visible here. Indexing [0] raises IndexError when the
# XPath matches nothing, which is what the recorded output below shows.
json.loads(tree.xpath(xpath)[0])

IndexError: list index out of range