In [None]:
import pandas as pd

import re

In [None]:
# Path prefix that is prepended to data file names when they are loaded.
DATA_PATH = "../"

In [None]:
# URL template for a federal law addressed by its LexML URN:
#
#    https://normas.leg.br/?urn=urn:lex:br:federal:lei:{<year>}-{<month>}-{<day>};{<law-number>}
#
# Positional fields, in order: year, month, day, law number.
# Month and day are padded to a minimum width of two by the `:02` format spec.

LEXML_FEDERAL_LAW_URN_FORMAT = "https://normas.leg.br/?urn=urn:lex:br:federal:lei:{}-{:02}-{:02};{}"

In [None]:
# Lower-case Portuguese month name -> month number (1 = janeiro ... 12 = dezembro).

MONTHS = {
    month_name: month_number
    for month_number, month_name in enumerate(
        [
            "janeiro", "fevereiro", "março", "abril",
            "maio", "junho", "julho", "agosto",
            "setembro", "outubro", "novembro", "dezembro",
        ],
        start=1,
    )
}

In [None]:
# Read reference_filename_map.tsv (tab-separated) from DATA_PATH,
# using the file's first column as the DataFrame index.
references_filename_map_df = pd.read_csv(
    DATA_PATH + "reference_filename_map.tsv",
    sep="\t",
    index_col=0,
)

In [None]:
# Materialise the index labels as a plain list, then order them ascending.
all_references = list(references_filename_map_df.index)
all_references.sort()

In [None]:
# Keep only the references whose text begins with "Lei" or "lei".
legal_rules_references = [
    reference
    for reference in all_references
    if reference.startswith(("Lei", "lei"))
]

In [None]:
# Show the filtered references so they can be inspected by eye.
legal_rules_references

## Some manual fixes

In [None]:
# Discard the first four entries of the (sorted) list.
# NOTE(review): this is positional and not idempotent — running the cell a
# second time drops four more entries. Presumably the first four references
# are ones that should not be turned into federal-law URNs; confirm against
# the data before relying on the magic number 4.
legal_rules_references = legal_rules_references[4:]

In [None]:
# Locate the entry that carries only a year (no day or month) in its date.
# Raises ValueError if the list does not contain this exact string.
legal_rules_references.index('Lei nº 4.242, de 1963')

In [None]:
# Give Lei nº 4.242 its full date (day, month and year).
# The entry is matched by value rather than by a hard-coded list position, so
# the fix keeps targeting the right element if the input data (and with it the
# sort order) changes, and re-running this cell is a harmless no-op.
legal_rules_references = [
    'Lei nº 4.242, de 17 de julho de 1963' if reference == 'Lei nº 4.242, de 1963' else reference
    for reference in legal_rules_references
]

In [None]:
# Locate the entry whose day is written as the ordinal "1º".
# Raises ValueError if the list does not contain this exact string.
legal_rules_references.index('Lei nº 10.741, de 1º de outubro de 2003')

In [None]:
# Rewrite the ordinal day "1º" of Lei nº 10.741 as a plain "1".
# The entry is matched by value rather than by a hard-coded list position, so
# the fix keeps targeting the right element if the input data (and with it the
# sort order) changes, and re-running this cell is a harmless no-op.
legal_rules_references = [
    'Lei nº 10.741, de 1 de outubro de 2003'
    if reference == 'Lei nº 10.741, de 1º de outubro de 2003'
    else reference
    for reference in legal_rules_references
]

In [None]:
# Show the reference list again, now that the manual fixes have been applied.
legal_rules_references

In [None]:
# Pattern for titles such as "Lei nº 10.741, de 1 de outubro de 2003".
# Captured groups: (1) law number, possibly with "." thousands separators,
# (2) day, (3) month name, (4) year.
# Written as a raw string so the regex escapes (\s, \w) reach `re` unchanged;
# in a normal string literal they are invalid escape sequences.
# An ordinal marker right after the day ("1º") is tolerated and ignored.
LAW_TITLE_PATTERN = re.compile(
    r"Lei[^0-9]+([0-9.]+),\sde\s([0-9]+)[º°]?\sde\s(\w+)\sde\s([0-9]+)"
)

# One LexML URL per parsed reference, in the order of legal_rules_references.
all_legal_norms = []

for title in legal_rules_references:
    match = LAW_TITLE_PATTERN.match(title)
    if match is None:
        # Fail with the offending title instead of an AttributeError on None.
        raise ValueError(f"Cannot parse law reference: {title!r}")

    number, day, month_name, year = match.groups()
    number = number.replace(".", "")  # "10.741" -> "10741"

    month = MONTHS.get(month_name.lower())
    if month is None:
        raise ValueError(f"Unknown month {month_name!r} in law reference: {title!r}")

    # Progress trace: the title followed by the parsed date parts and law number.
    print("{}: {}-{}-{};{}".format(title, year, month_name, day, number))

    all_legal_norms.append(
        LEXML_FEDERAL_LAW_URN_FORMAT.format(year, month, int(day), number)
    )

In [None]:
# Show the generated LexML URLs.
all_legal_norms

## Using the Legal Document Fetcher Module

Now we'll use the object-oriented `legal_document_fetcher` module to fetch all legal documents and save them as Word files.

In [None]:
# Import the module
from legal_document_fetcher import LegalDocumentFetcher, FetcherConfig

In [None]:
# Build the fetcher settings.
# NOTE(review): the meaning of each field is defined by FetcherConfig in
# legal_document_fetcher; 'div.texto' is presumably a CSS selector for the
# law text on the page — confirm against that module.
config = FetcherConfig(
    output_dir='./legal_documents',
    request_timeout=30,
    retry_attempts=3,
    delay_between_requests=2.0,
    content_selector='div.texto',
)

# Echo the effective settings, one per line.
print(
    f"Output directory: {config.output_dir}",
    f"Timeout: {config.request_timeout}s",
    f"Retry attempts: {config.retry_attempts}",
    f"Delay between requests: {config.delay_between_requests}s",
    sep="\n",
)

In [None]:
# Create the fetcher instance from the configuration built above.
fetcher = LegalDocumentFetcher(config)
# Report how many URLs were generated; nothing is fetched by this cell's visible code.
print(f"Fetcher initialized with {len(all_legal_norms)} URLs to process")

In [None]:
# Test with a single URL first (the first generated URL).
# Raises IndexError if all_legal_norms is empty.
test_url = all_legal_norms[0]
print(f"Testing with: {test_url}")

# NOTE(review): presumably this performs a real network request and writes a
# file under config.output_dir — confirm in legal_document_fetcher.
test_result = fetcher.process_single_url(test_url)
print(f"\nResult: {test_result}")

In [None]:
# Process all URLs
# WARNING: This will take a while (at least 3-4 minutes for 108 URLs: the 2s delay alone adds up to ~3.6 minutes, plus fetch time)
# Uncomment the line below to run:

# results = fetcher.process_url_list(all_legal_norms, show_progress=True)

In [None]:
# Collect the fetcher's statistics and print them as a small report.
summary = fetcher.get_summary()

# Banner: rule, title, rule.
print("=" * 60, "FETCH SUMMARY", "=" * 60, sep="\n")

# Counters and timings, one per line.
print(f"Total URLs processed:    {summary['total']}")
print(f"Successful:              {summary['success']}")
print(f"Failed:                  {summary['failed']}")
print(f"Success rate:            {summary['success_rate']:.2f}%")
print(f"Average fetch time:      {summary['avg_fetch_time']:.2f}s")
print("=" * 60)

# List the failed URLs only when there are any.
if summary['failed_urls']:
    print(f"\nFailed URLs ({len(summary['failed_urls'])}):")
    print(*(f"  - {url}" for url in summary['failed_urls']), sep="\n")

In [None]:
# Export results to CSV for analysis.
# The file name is relative, so it is resolved against the current working directory.
fetcher.export_results_to_csv('fetch_results.csv')
print("Results exported to fetch_results.csv")