In [13]:
# =========================
# pdf_to_csv_aceite_colab.py
# =========================
!pip install pdfplumber pandas

import pdfplumber
import pandas as pd
import re
from datetime import datetime

# Input PDF and output CSV locations
pdf_path = "/content/Aceite.pdf"  # <-- change this if you upload a different PDF
out_csv = "/content/precios_aceite_clean_long.csv"

# A line that contains nothing but a DD-MM-YYYY date (surrounding blanks allowed).
DATE_RX = re.compile(r'^\s*(\d{2}-\d{2}-\d{4})\s*$')

# A product line: "Aceite de oliva <kind> Picual", optionally followed by a
# price in euros and/or the "Sin cierre de operaciones" marker.
#   group 1 -> kind, group 2 -> raw price token, group 3 -> marker text
PRODUCT_RX = re.compile(
    r'^Aceite de oliva\s+(virgen extra|virgen|lampante)\s+Picual'
    r'(?:\s+([0-9][0-9.,]*)\s*€)?'
    r'(?:\s+(Sin cierre de operaciones))?$',
    flags=re.IGNORECASE,
)

# Lower-cased kind captured by PRODUCT_RX -> value written to the "tipo" column.
TYPE_MAP = {
    kind: kind.replace(" ", "_") + "_picual"
    for kind in ("virgen extra", "virgen", "lampante")
}

def _parse_price(price_str):
    """Convert a raw price token (digits plus '.'/',' separators) to a float.

    A single separator is read as the decimal mark ("4,056" -> 4.056,
    "3.675" -> 3.675). When both separators appear, the rightmost one is the
    decimal mark and the other is dropped as a grouping separator
    ("1.234,56" -> 1234.56). A separator repeated on its own is treated as
    grouping only ("1.234.567" -> 1234567.0).

    Returns:
        The parsed float, or None when the token cannot be read as a number.
    """
    token = re.sub(r'[^0-9.,]', '', price_str).rstrip('.,')
    last_dot = token.rfind('.')
    last_comma = token.rfind(',')

    if last_dot != -1 and last_comma != -1:
        # Mixed separators: whichever comes last is the decimal mark.
        if last_comma > last_dot:
            token = token.replace('.', '').replace(',', '.')
        else:
            token = token.replace(',', '')
    elif token.count('.') > 1 or token.count(',') > 1:
        # The same separator several times can only be digit grouping.
        token = token.replace('.', '').replace(',', '')
    else:
        token = token.replace(',', '.')

    try:
        return float(token)
    except ValueError:
        return None


def parse_pdf_to_csv(pdf_path, out_csv, include_sin_cierre=False):
    """Extract dated price lines from a PDF and write them to a long-format CSV.

    The text of every page is scanned line by line. A line matching DATE_RX
    sets the date applied to the product lines that follow it (the date
    carries over page breaks). A line matching PRODUCT_RX produces one row,
    provided a date has already been seen.

    Args:
        pdf_path: Path of the PDF to read.
        out_csv: Path of the CSV file to write.
        include_sin_cierre: When True, product lines carrying the
            "Sin cierre de operaciones" marker are kept with an empty price;
            when False (default) they are dropped.

    Returns:
        pandas.DataFrame with columns ``fecha`` (datetime), ``tipo`` and
        ``precio_eur_kg``, sorted by ``fecha`` then ``tipo``. The same data is
        written to ``out_csv``.

    Raises:
        ValueError: If a line looks like DD-MM-YYYY but is not a valid
            calendar date.
    """
    rows = []
    current_date = None

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Guard against pages for which no text comes back (None or empty),
            # e.g. pages without a text layer; treat them as having no lines.
            text = page.extract_text() or ""
            for raw_line in text.splitlines():
                ln = raw_line.strip()
                if not ln:
                    continue

                # 1) A date line updates the date used for following products
                m_date = DATE_RX.match(ln)
                if m_date:
                    d = datetime.strptime(m_date.group(1), "%d-%m-%Y").date()
                    current_date = d.isoformat()
                    continue

                # 2) A product line (ignored until a date has been seen)
                m_prod = PRODUCT_RX.match(ln)
                if not m_prod or current_date is None:
                    continue

                kind_raw = m_prod.group(1).lower()
                price_str = m_prod.group(2)
                sin_cierre = m_prod.group(3) is not None
                tipo = TYPE_MAP.get(kind_raw, kind_raw.replace(" ", "_") + "_picual")

                if sin_cierre:
                    # The marker takes precedence over any price on the line
                    if include_sin_cierre:
                        rows.append({"fecha": current_date, "tipo": tipo, "precio_eur_kg": None})
                elif price_str:
                    price = _parse_price(price_str)
                    # Skip tokens that are not a readable number instead of aborting the run
                    if price is not None:
                        rows.append({"fecha": current_date, "tipo": tipo, "precio_eur_kg": price})

    # Build the sorted DataFrame (explicit columns keep the schema when there are no rows)
    df = pd.DataFrame(rows, columns=["fecha","tipo","precio_eur_kg"])
    df["fecha"] = pd.to_datetime(df["fecha"])
    df = df.sort_values(["fecha", "tipo"]).reset_index(drop=True)

    # Write the CSV
    df.to_csv(out_csv, index=False)
    print(f"CSV generado: {out_csv} ({len(df)} filas)")
    return df

# Run the conversion with the configured input and output paths
df = parse_pdf_to_csv(pdf_path=pdf_path, out_csv=out_csv)

# Preview the first ten rows
df.head(10)


Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling 

Unnamed: 0,fecha,tipo,precio_eur_kg
0,2025-01-02,lampante_picual,3.675
1,2025-01-02,virgen_extra_picual,4.056
2,2025-01-02,virgen_picual,3.881
3,2025-01-03,lampante_picual,3.688
4,2025-01-03,virgen_extra_picual,4.088
5,2025-01-03,virgen_picual,3.913
6,2025-01-07,lampante_picual,3.767
7,2025-01-07,virgen_extra_picual,4.133
8,2025-01-07,virgen_picual,3.933
9,2025-01-08,lampante_picual,3.764


In [2]:
!pip install pdfminer

Collecting pdfminer
  Downloading pdfminer-20191125.tar.gz (4.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/4.2 MB[0m [31m5.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/4.2 MB[0m [31m20.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m4.2/4.2 MB[0m [31m46.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycryptodome (from pdfminer)
  Downloading pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2

In [4]:
!pip install pdfminer

[31mERROR: Could not find a version that satisfies the requirement pdfminer.high_level (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pdfminer.high_level[0m[31m
[0m

In [11]:
import pandas as pd

# Load the example CSV to inspect the desired format
example_df = pd.read_csv("/content/precios_aceite_clean_long.csv")
print("--- Example CSV Data ---")
print(example_df.head())
print("--- Data Types ---")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
example_df.info()
print("--- End Example CSV Data ---")

--- Example CSV Data ---
        fecha                 tipo  precio_eur_kg
0  2025-01-02      lampante_picual         4083.0
1  2025-01-23  virgen_extra_picual         3500.0
2  2025-02-05        virgen_picual         3913.0
3  2025-03-03  virgen_extra_picual         3750.0
4  2025-03-14        virgen_picual         3450.0
--- Data Types ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   fecha          13 non-null     object 
 1   tipo           13 non-null     object 
 2   precio_eur_kg  13 non-null     float64
dtypes: float64(1), object(2)
memory usage: 444.0+ bytes
None
--- End Example CSV Data ---
