<a href="https://colab.research.google.com/github/cyndinha99dugrau/Agrinho/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# =====================================================
# 📘 EXTRAÇÃO DE DADOS DE FATURAS HIDROSUGA (Colab)
# =====================================================

!pip install pdfplumber pytesseract pdf2image openpyxl

import pdfplumber, re, pandas as pd
from dateutil import parser as dateparser
from google.colab import files
from pdf2image import convert_from_path
import pytesseract, io, os

# =====================================================
# --- Expressões Regulares ---
# =====================================================

# Brazilian company tax ID (CNPJ) in punctuated form, e.g. 12.345.678/0001-90.
CNPJ_RE = re.compile(r"\b(\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2})\b")
# Invoice heading, then "N" with an optional ordinal sign (º or °), then a
# 1-5 digit invoice number (captured).
FATURA_NUM_RE = re.compile(r"FATURA\s+DE\s+LOCAÇÃO\s*N[º°]?\s*(\d{1,5})", re.IGNORECASE)
# Issue date label ("Emissão"/"Emissao") followed by d/m/yyyy with "/", "." or "-".
EMISSAO_RE = re.compile(r"Emiss[aã]o[:\s]*([0-9]{1,2}[\/\.\-][0-9]{1,2}[\/\.\-][0-9]{4})", re.IGNORECASE)
# Due date label followed by a date in the same format as above.
VENC_RE = re.compile(r"Vencimento[:\s]*([0-9]{1,2}[\/\.\-][0-9]{1,2}[\/\.\-][0-9]{4})", re.IGNORECASE)
# Monetary amount after "R$"; captures the raw digits with their separators.
VALOR_RE = re.compile(r"R\$[\s]*([\d\.\,]+)")
# Recipient label followed by the recipient name. The capture is greedy over
# ASCII letters, digits, "&", ".", "-" and whitespace, so it stops at the first
# character outside that set (including accented letters).
DESTINATARIO_RE = re.compile(r"DESTINAT[ÁA]RIO\s+([A-Z0-9\&\.\-\s]+)", re.IGNORECASE)

# =====================================================
# --- Funções auxiliares ---
# =====================================================

def parse_date(text):
    """Parse a day-first date string and return it as an ISO 8601 date.

    Args:
        text: Date text such as "05/03/2024" (interpreted as 5 March 2024).

    Returns:
        The date formatted as "YYYY-MM-DD", or None when ``text`` is empty
        or cannot be parsed as a date.
    """
    if not text:
        return None
    try:
        return dateparser.parse(text, dayfirst=True).date().isoformat()
    except (ValueError, OverflowError, TypeError):
        # Best-effort parsing: unparseable or out-of-range input yields None.
        # Only parser failures are caught, so interrupts and genuine
        # programming errors still propagate.
        return None

def parse_valor(text):
    """Convert a Brazilian-formatted amount ("1.234,56") to a float.

    Dots are treated as thousands separators and the comma as the decimal
    separator. The first numeric run found in the text is used.

    Args:
        text: Raw amount text, e.g. the capture of a "R$ ..." match.

    Returns:
        The amount as a float, or None when ``text`` is not a string or
        contains no parseable number.
    """
    if not isinstance(text, str):
        return None

    normalized = text.replace('.', '').replace(',', '.')
    match = re.search(r"[\d.]+", normalized)
    if match is None:
        return None

    # A separator captured right after the amount (e.g. "1.234,56,") would
    # leave a trailing dot that float() rejects; drop it before converting.
    candidate = match.group().rstrip('.')
    try:
        return float(candidate)
    except ValueError:
        return None

def _find_invoice_number(lines):
    """Return the invoice number near the first "FATURA DE LOCAÇÃO" heading, or None."""
    for i, ln in enumerate(lines):
        if "FATURA DE LOCAÇÃO" not in ln.upper():
            continue
        # Inspect the heading line itself and the three lines after it.
        for candidate in lines[i:i + 4]:
            # Preferred form: "Nº 123", "N° 1.234", "N: 123", "N. 123".
            # The capture must start with a digit so a lone "." is never
            # mistaken for a number.
            m = re.search(r"N[º°]?\s*[:.\-]?\s*(\d[\d.]*)", candidate)
            if not m:
                # No "Nº" marker: fall back to an isolated number (e.g. 1.234).
                m = re.search(r"\b(\d{1,5}(?:\.\d{3})?)\b", candidate)
            if m:
                return m.group(1).replace('.', '')
        # Only the first heading on the page is considered.
        return None
    return None


def extract_from_page(text: str, page_number: int,
                      issuer_cnpj: str = "19.686.283/0001-03") -> dict:
    """Extract the invoice fields from the text of one PDF page.

    Args:
        text: Raw text of the page (text layer or OCR output).
        page_number: Zero-based page index; reported one-based in the result.
        issuer_cnpj: CNPJ of the invoice issuer, skipped when looking for the
            recipient's CNPJ. Defaults to the Hidrosuga CNPJ.

    Returns:
        A dict with the page number, invoice number, recipient name,
        recipient CNPJ, issue date, due date and total amount. Any field
        that is not found is None.
    """
    text = text or ""
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    # Collapse all whitespace so labels and values split across line breaks
    # can be matched by the single-line patterns.
    flat = re.sub(r"\s+", " ", text)

    result = {
        'Página': page_number + 1,
        'Número da Fatura': _find_invoice_number(lines),
        'Destinatário': None,
        'CNPJ Destinatário': None,
        'Emissão': None,
        'Vencimento': None,
        'Valor Total (R$)': None,
    }

    # NOTE(review): the recipient capture is greedy and may run into the
    # fields that follow the name on the same page; confirm against real
    # invoices before tightening.
    m = DESTINATARIO_RE.search(flat)
    if m:
        result['Destinatário'] = m.group(1).strip()

    # First CNPJ on the page that is not the issuer's own.
    result['CNPJ Destinatário'] = next(
        (cnpj for cnpj in CNPJ_RE.findall(flat) if cnpj != issuer_cnpj),
        None,
    )

    m = EMISSAO_RE.search(flat)
    if m:
        result['Emissão'] = parse_date(m.group(1))

    m = VENC_RE.search(flat)
    if m:
        result['Vencimento'] = parse_date(m.group(1))

    # The last "R$" amount on the page is taken as the invoice total.
    valores = VALOR_RE.findall(flat)
    if valores:
        result['Valor Total (R$)'] = parse_valor(valores[-1])

    return result

def ocr_pdf_to_text(pdf_path, lang='por'):
    """Run OCR on every page of a PDF and return the recognized text per page.

    Args:
        pdf_path: Path of the PDF file to rasterize and OCR.
        lang: Tesseract language code passed to the OCR engine. Defaults to
            Portuguese ('por').

    Returns:
        A list of strings, one per page, in page order.
    """
    print("🔍 PDF parece ser imagem — executando OCR...")
    pages = convert_from_path(pdf_path)
    total = len(pages)
    all_text = []
    for number, image in enumerate(pages, start=1):
        all_text.append(pytesseract.image_to_string(image, lang=lang))
        print(f"✅ OCR extraído da página {number}/{total}")
    return all_text

# =====================================================
# --- Processar PDF ---
# =====================================================

def process_pdf(path_pdf: str):
    """Extract invoice data from every page of a PDF into a DataFrame.

    Each page's text layer is used when present; pages without usable text
    fall back to OCR. Pages that still have no text are reported and skipped.

    Args:
        path_pdf: Path of the PDF file to process.

    Returns:
        A pandas DataFrame with one row per page that yielded text.
    """
    results = []
    ocr_texts = None  # OCR runs at most once, and only if some page needs it.
    with pdfplumber.open(path_pdf) as pdf:
        for i, page in enumerate(pdf.pages):
            text = page.extract_text() or ""
            # Whitespace-only text counts as missing, so such pages reach
            # the OCR fallback instead of producing an all-empty row.
            if not text.strip():
                if ocr_texts is None:
                    ocr_texts = ocr_pdf_to_text(path_pdf)
                text = ocr_texts[i] if i < len(ocr_texts) else ""
            if not text.strip():
                print(f"⚠️ Página {i+1} sem texto extraído.")
                continue
            results.append(extract_from_page(text, i))
    return pd.DataFrame(results)

# =====================================================
# --- Execução no Colab ---
# =====================================================

print("📤 Envie o arquivo PDF (Faturas)...")
uploaded = files.upload()

# The upload result maps each saved file name to its contents and is empty
# when the dialog is cancelled; guard so next(iter(...)) never runs on an
# empty mapping.
if not uploaded:
    print("⚠️ Nenhum arquivo foi enviado; nada a processar.")
else:
    # Only the first uploaded file is processed.
    pdf_path = next(iter(uploaded))
    print(f"📄 Arquivo carregado: {pdf_path}")

    df = process_pdf(pdf_path)
    df.to_excel("faturas_extraidas.xlsx", index=False)

    print("✅ Extração concluída com sucesso!")
    files.download("faturas_extraidas.xlsx")


📤 Envie o arquivo PDF (Faturas)...


Saving Faturas.pdf to Faturas (2).pdf
📄 Arquivo carregado: Faturas (2).pdf
✅ Extração concluída com sucesso!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>