# 13 - DuckDB: Procesamiento de Múltiples Archivos Parquet

## 🎯 Objetivos
- Trabajar con múltiples archivos Parquet
- Particionamiento de datos
- Queries eficientes sobre datasets particionados
- Unión y combinación de archivos
- Optimizaciones y mejores prácticas
- Análisis de datos distribuidos

## 📚 Tecnologías
- **DuckDB**: SQL analytics engine
- **Parquet**: Formato columnar
- **Pandas**: Manipulación de datos
- **PyArrow**: Backend de Parquet

## ⭐ Complejidad: Intermedio

## 1. Instalación y Setup

In [None]:
# Install the notebook's dependencies (-q keeps pip's output quiet).
# NOTE(review): `!pip` runs whichever pip is first on the shell PATH; if that is
# not the kernel's environment, `%pip install ...` is the safer spelling — confirm.
!pip install duckdb pandas numpy pyarrow matplotlib seaborn plotly faker -q

In [None]:
import duckdb
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime, timedelta
from pathlib import Path
import shutil
import warnings
warnings.filterwarnings('ignore')

# Report the versions of the core libraries so a run can be reproduced later.
print(f"‚úÖ DuckDB version: {duckdb.__version__}")
print(f"‚úÖ PyArrow version: {pa.__version__}")
print(f"‚úÖ Pandas version: {pd.__version__}")

## 2. Generar Datos de Prueba

Crearemos datos sintéticos de e-commerce distribuidos en múltiples archivos.

In [None]:
from faker import Faker
import random

# Seed every random source used below so the synthetic dataset is reproducible.
# Faker keeps its own generator, so seeding `random` and NumPy alone would still
# produce different names, e-mails and dates on every run.
Faker.seed(42)
fake = Faker()
np.random.seed(42)
random.seed(42)

# Dataset size parameters
NUM_CUSTOMERS = 1000
NUM_PRODUCTS = 100
NUM_TRANSACTIONS = 50000

# Build the product catalogue (ids are 1-based)
categories = ['Electr√≥nica', 'Ropa', 'Hogar', 'Deportes', 'Libros', 'Juguetes', 'Alimentos']
products = [
    {
        'product_id': product_id,
        'product_name': fake.catch_phrase(),
        'category': random.choice(categories),
        'price': round(random.uniform(10, 1000), 2),
        # Cost is drawn independently of price, so some products sell at a loss.
        'cost': round(random.uniform(5, 500), 2),
    }
    for product_id in range(1, NUM_PRODUCTS + 1)
]

products_df = pd.DataFrame(products)
print("üì¶ Productos generados:")
print(products_df.head(10))

# Build the customer table (ids are 1-based)
customers = [
    {
        'customer_id': customer_id,
        'name': fake.name(),
        'email': fake.email(),
        'country': fake.country(),
        'city': fake.city(),
        'signup_date': fake.date_between(start_date='-2y', end_date='today'),
    }
    for customer_id in range(1, NUM_CUSTOMERS + 1)
]

customers_df = pd.DataFrame(customers)
print("\nüë• Clientes generados:")
print(customers_df.head(10))

## 3. Generar Transacciones Particionadas por Fecha

In [None]:
# Generate transactions spread over the last two years.
# "Now" is taken once so both window bounds derive from the same instant.
end_date = datetime.now()
start_date = end_date - timedelta(days=730)

# Extract the lookup values once. Calling DataFrame.sample(1).iloc[0] twice per
# transaction builds two pandas objects per row and dominates the run time;
# random.choice over plain Python lists draws uniformly as well.
product_records = products_df[['product_id', 'price']].to_dict('records')
customer_ids = customers_df['customer_id'].tolist()

payment_methods = ['Credit Card', 'Debit Card', 'PayPal', 'Cash']
# 'completed' appears three times on purpose: it weights that status at 3/5.
statuses = ['completed', 'completed', 'completed', 'pending', 'cancelled']

transactions = []
for i in range(NUM_TRANSACTIONS):
    transaction_date = fake.date_time_between(start_date=start_date, end_date=end_date)
    product = random.choice(product_records)
    customer_id = random.choice(customer_ids)
    quantity = random.randint(1, 5)

    transactions.append({
        'transaction_id': i + 1,
        'transaction_date': transaction_date,
        'customer_id': customer_id,
        'product_id': product['product_id'],
        'quantity': quantity,
        'unit_price': product['price'],
        'total_amount': round(quantity * product['price'], 2),
        'payment_method': random.choice(payment_methods),
        'status': random.choice(statuses),
    })

transactions_df = pd.DataFrame(transactions)

# Derive the partition columns from the transaction timestamp
transactions_df['year'] = transactions_df['transaction_date'].dt.year
transactions_df['month'] = transactions_df['transaction_date'].dt.month
transactions_df['day'] = transactions_df['transaction_date'].dt.day

print(f"üí≥ Transacciones generadas: {len(transactions_df):,}")
print(f"üìä Rango: {transactions_df['transaction_date'].min()} a {transactions_df['transaction_date'].max()}")
print(f"\nüìä Muestra de transacciones:")
print(transactions_df.head(10))

## 4. Guardar Datos en Múltiples Archivos Parquet

Crearemos una estructura particionada por año y mes.

In [None]:
# Start from a clean output directory so partitions left over from a previous
# run cannot leak into the glob-based queries used later in the notebook.
base_path = Path('data_parquet')
if base_path.exists():
    shutil.rmtree(base_path)
base_path.mkdir(parents=True, exist_ok=True)

# Products and customers are small lookup tables: store them unpartitioned
products_df.to_parquet(base_path / 'products.parquet', index=False)
customers_df.to_parquet(base_path / 'customers.parquet', index=False)

print("‚úÖ Guardados:")
print("   - products.parquet")
print("   - customers.parquet")

# Transactions are written Hive-style: transactions/year=YYYY/month=MM/data.parquet
transactions_path = base_path / 'transactions'
transactions_path.mkdir(exist_ok=True)

file_count = 0
# A single groupby pass yields each (year, month) slice exactly once, in
# chronological order, instead of re-filtering the whole frame per partition.
for (year, month), partition_data in transactions_df.groupby(['year', 'month']):
    month_path = transactions_path / f'year={year}' / f'month={month:02d}'
    month_path.mkdir(parents=True, exist_ok=True)

    # The partition values live in the directory names, so drop them from the file
    partition_file = month_path / 'data.parquet'
    partition_data.drop(columns=['year', 'month']).to_parquet(partition_file, index=False)
    file_count += 1
    print(f"‚úÖ {partition_file} ({len(partition_data)} registros)")

# "+ 2" accounts for products.parquet and customers.parquet
print(f"\nüìä Total de archivos Parquet creados: {file_count + 2}")
print("üìä Estructura:")
print("   data_parquet/")
print("   ‚îú‚îÄ‚îÄ products.parquet")
print("   ‚îú‚îÄ‚îÄ customers.parquet")
print("   ‚îî‚îÄ‚îÄ transactions/")
print("       ‚îú‚îÄ‚îÄ year=2023/month=01/data.parquet")
print("       ‚îú‚îÄ‚îÄ year=2023/month=02/data.parquet")
print("       ‚îî‚îÄ‚îÄ ...")

## 5. Leer Múltiples Archivos Parquet con DuckDB

In [None]:
# Open an in-memory DuckDB database that the rest of the notebook reuses
con = duckdb.connect(':memory:')

print("‚úÖ DuckDB conectado")

# A single Parquet file can be queried directly by quoting its path
single_file_sql = """
    SELECT * FROM 'data_parquet/products.parquet' LIMIT 5
"""
result = con.execute(single_file_sql).df()

print("\nüì¶ Productos (archivo √∫nico):")
print(result)

# A recursive glob pattern makes DuckDB treat every partition file as one table
count_all_sql = """
    SELECT COUNT(*) as total_transactions
    FROM 'data_parquet/transactions/**/*.parquet'
"""
result = con.execute(count_all_sql).df()
total_transactions = result['total_transactions'].iloc[0]

print(f"\nüí≥ Total transacciones (m√∫ltiples archivos): {total_transactions:,}")

# Peek at a few rows coming from the combined partition files
sample_sql = """
    SELECT * 
    FROM 'data_parquet/transactions/**/*.parquet'
    LIMIT 10
"""
result = con.execute(sample_sql).df()

print("\nüìä Muestra de transacciones:")
print(result)

## 6. Queries con Partición Pushdown

DuckDB puede leer solo las particiones necesarias.

In [None]:
# Partition pruning demo: filter on the Hive partition columns (year, month)
# and let DuckDB skip every file whose directory name does not match.
#
# The dataset covers a rolling two-year window, so a hard-coded partition such
# as year=2024/month=01 may not exist; pick a partition that is really on disk.
partition_dirs = sorted(Path('data_parquet/transactions').glob('year=*/month=*'))
if not partition_dirs:
    raise FileNotFoundError("No partitions found under data_parquet/transactions")

# Take one from the middle: the first and last partitions can be partial months.
sample_partition = partition_dirs[len(partition_dirs) // 2]
target_year = int(sample_partition.parent.name.split('=')[1])
target_month = int(sample_partition.name.split('=')[1])

# The values are integers parsed from our own directory names, so inlining them
# keeps the filter a plain constant comparison on the partition columns.
result = con.execute(f"""
    SELECT
        COUNT(*) as transactions,
        SUM(total_amount) as total_revenue,
        AVG(total_amount) as avg_transaction
    FROM read_parquet(
        'data_parquet/transactions/**/*.parquet',
        hive_partitioning = true
    )
    WHERE year = {target_year} AND month = {target_month}
""").df()

print(f"üìä Estad√≠sticas {target_year}-{target_month:02d} (solo 1 archivo):")
print(result)

# Monthly roll-up across all partitions.
# The alias is deliberately not called "month": with Hive partitioning the scan
# already exposes a partition column named month, and GROUP BY resolves input
# columns before SELECT aliases, so reusing that name would group by the wrong thing.
result = con.execute("""
    SELECT
        DATE_TRUNC('month', transaction_date) as sales_month,
        COUNT(*) as transactions,
        ROUND(SUM(total_amount), 2) as revenue,
        ROUND(AVG(total_amount), 2) as avg_transaction,
        COUNT(DISTINCT customer_id) as unique_customers
    FROM 'data_parquet/transactions/**/*.parquet'
    GROUP BY sales_month
    ORDER BY sales_month DESC
    LIMIT 12
""").df()

print("\nüìä √öltimos 12 meses:")
print(result)

## 7. Joins entre Múltiples Archivos Parquet

In [None]:
# Enrich completed transactions with customer and product attributes by joining
# the partitioned transaction files against the two lookup files.
latest_completed_sql = """
    SELECT 
        t.transaction_id,
        t.transaction_date,
        c.name as customer_name,
        c.country,
        p.product_name,
        p.category,
        t.quantity,
        t.total_amount,
        t.status
    FROM 'data_parquet/transactions/**/*.parquet' t
    JOIN 'data_parquet/customers.parquet' c ON t.customer_id = c.customer_id
    JOIN 'data_parquet/products.parquet' p ON t.product_id = p.product_id
    WHERE t.status = 'completed'
    ORDER BY t.transaction_date DESC
    LIMIT 10
"""
result = con.execute(latest_completed_sql).df()

print("üìä Transacciones completas (con joins):")
print(result)

# Revenue and units for completed sales, broken down by category and country
category_country_sql = """
    SELECT 
        p.category,
        c.country,
        COUNT(*) as transactions,
        SUM(t.quantity) as units_sold,
        ROUND(SUM(t.total_amount), 2) as revenue
    FROM 'data_parquet/transactions/**/*.parquet' t
    JOIN 'data_parquet/customers.parquet' c ON t.customer_id = c.customer_id
    JOIN 'data_parquet/products.parquet' p ON t.product_id = p.product_id
    WHERE t.status = 'completed'
    GROUP BY p.category, c.country
    ORDER BY revenue DESC
    LIMIT 20
"""
result = con.execute(category_country_sql).df()

print("\nüìä Ventas por Categor√≠a y Pa√≠s (Top 20):")
print(result)

## 8. Análisis de Performance: Single vs Multiple Files

In [None]:
import time


def _timed_query(sql):
    """Run *sql* on the notebook connection and return (DataFrame, elapsed seconds)."""
    # perf_counter is monotonic and high resolution; time.time() can report a
    # zero interval for fast queries on platforms with a coarse clock.
    started = time.perf_counter()
    frame = con.execute(sql).df()
    return frame, time.perf_counter() - started


# Test 1: scan every partition file
result1, time1 = _timed_query("""
    SELECT COUNT(*), SUM(total_amount)
    FROM 'data_parquet/transactions/**/*.parquet'
""")

print(f"‚è±Ô∏è Query sobre archivos particionados: {time1*1000:.2f}ms")
print(f"üìä Resultado: {result1}")

# Test 2: scan a single partition.
# The data covers a rolling two-year window, so choose a partition that exists
# on disk instead of hard-coding year=2024/month=01.
partition_dirs = sorted((base_path / 'transactions').glob('year=*/month=*'))
if not partition_dirs:
    raise FileNotFoundError(f"No partitions found under {base_path / 'transactions'}")
sample_partition = partition_dirs[len(partition_dirs) // 2]
# as_posix() keeps forward slashes in the SQL literal on every platform
partition_glob = (sample_partition / '*.parquet').as_posix()

result2, time2 = _timed_query(f"""
    SELECT COUNT(*), SUM(total_amount)
    FROM '{partition_glob}'
""")

print(f"\n‚è±Ô∏è Query sobre 1 partici√≥n: {time2*1000:.2f}ms")
print(f"üìä Resultado: {result2}")

# Write the same data as one unpartitioned file for comparison
single_file_path = base_path / 'transactions_single.parquet'
transactions_df.to_parquet(single_file_path, index=False)

# Test 3: scan the single large file
result3, time3 = _timed_query("""
    SELECT COUNT(*), SUM(total_amount)
    FROM 'data_parquet/transactions_single.parquet'
""")

print(f"\n‚è±Ô∏è Query sobre archivo √∫nico: {time3*1000:.2f}ms")
print(f"üìä Resultado: {result3}")

# time1 / time2 is how many times faster the single-partition scan is than the
# full scan; guard against a zero-length measurement.
speedup = time1 / time2 if time2 > 0 else float('inf')

print(f"\nüìä Comparaci√≥n de Performance:")
print(f"   Archivos particionados: {time1*1000:.2f}ms")
print(f"   Una partici√≥n: {time2*1000:.2f}ms ({speedup:.2f}x m√°s r√°pido)")
print(f"   Archivo √∫nico: {time3*1000:.2f}ms")
print(f"\nüí° Particionamiento permite leer solo datos necesarios!")

## 9. Crear Vistas sobre Múltiples Parquet Files

In [None]:
# Wrap each Parquet source in a view so later queries can use short names
transactions_view_sql = """
    CREATE OR REPLACE VIEW transactions_view AS
    SELECT * FROM 'data_parquet/transactions/**/*.parquet'
"""

products_view_sql = """
    CREATE OR REPLACE VIEW products_view AS
    SELECT * FROM 'data_parquet/products.parquet'
"""

customers_view_sql = """
    CREATE OR REPLACE VIEW customers_view AS
    SELECT * FROM 'data_parquet/customers.parquet'
"""

for base_view_sql in (transactions_view_sql, products_view_sql, customers_view_sql):
    con.execute(base_view_sql)

# Denormalised sales view joining the three base views and adding a profit
# column. This is a regular view (CREATE VIEW): nothing is materialised, the
# joins are evaluated each time it is queried.
sales_detailed_sql = """
    CREATE OR REPLACE VIEW sales_detailed AS
    SELECT 
        t.transaction_id,
        t.transaction_date,
        t.customer_id,
        c.name as customer_name,
        c.country,
        c.city,
        t.product_id,
        p.product_name,
        p.category,
        p.price as product_price,
        p.cost as product_cost,
        t.quantity,
        t.total_amount,
        t.payment_method,
        t.status,
        (t.total_amount - (p.cost * t.quantity)) as profit
    FROM transactions_view t
    JOIN customers_view c ON t.customer_id = c.customer_id
    JOIN products_view p ON t.product_id = p.product_id
"""
con.execute(sales_detailed_sql)

print("‚úÖ Vistas creadas")

# With the view in place, the profitability report needs no explicit joins
profitability_sql = """
    SELECT 
        category,
        COUNT(*) as transactions,
        ROUND(SUM(total_amount), 2) as revenue,
        ROUND(SUM(profit), 2) as profit,
        ROUND(SUM(profit) / SUM(total_amount) * 100, 2) as profit_margin_pct
    FROM sales_detailed
    WHERE status = 'completed'
    GROUP BY category
    ORDER BY revenue DESC
"""
result = con.execute(profitability_sql).df()

print("\nüìä An√°lisis de Rentabilidad por Categor√≠a:")
print(result)

## 10. Agregaciones Complejas con Window Functions

In [None]:
# Month-over-month trend per category: previous month's revenue (LAG), the
# percentage growth against it, and a moving average over the current row and
# the two preceding ones.
monthly_trend_sql = """
    WITH monthly_sales AS (
        SELECT 
            DATE_TRUNC('month', transaction_date) as month,
            category,
            SUM(total_amount) as revenue
        FROM sales_detailed
        WHERE status = 'completed'
        GROUP BY month, category
    )
    SELECT 
        month,
        category,
        ROUND(revenue, 2) as revenue,
        ROUND(LAG(revenue) OVER (PARTITION BY category ORDER BY month), 2) as prev_month_revenue,
        ROUND(
            (revenue - LAG(revenue) OVER (PARTITION BY category ORDER BY month)) / 
            LAG(revenue) OVER (PARTITION BY category ORDER BY month) * 100, 
            2
        ) as growth_pct,
        ROUND(AVG(revenue) OVER (PARTITION BY category ORDER BY month ROWS BETWEEN 2 PRECEDING AND CURRENT ROW), 2) as moving_avg_3m
    FROM monthly_sales
    ORDER BY category, month DESC
"""
result = con.execute(monthly_trend_sql).df()

print("üìä Tendencias Mensuales por Categor√≠a:")
trend_preview = result.head(20)
print(trend_preview)

## 11. Exportar Resultados a Nuevos Parquet Files

In [None]:
# Output folder for the derived analytics files
analytics_path = base_path / 'analytics'
analytics_path.mkdir(exist_ok=True)

# Per-category totals for completed sales
category_export_sql = """
    COPY (
        SELECT 
            category,
            COUNT(*) as transactions,
            SUM(total_amount) as revenue,
            SUM(profit) as profit,
            AVG(total_amount) as avg_transaction
        FROM sales_detailed
        WHERE status = 'completed'
        GROUP BY category
    ) TO 'data_parquet/analytics/category_summary.parquet' (FORMAT PARQUET)
"""

# The 100 customers with the highest completed spend
top_customers_export_sql = """
    COPY (
        SELECT 
            customer_id,
            customer_name,
            country,
            COUNT(*) as total_orders,
            SUM(total_amount) as total_spent,
            AVG(total_amount) as avg_order_value,
            MAX(transaction_date) as last_purchase
        FROM sales_detailed
        WHERE status = 'completed'
        GROUP BY customer_id, customer_name, country
        ORDER BY total_spent DESC
        LIMIT 100
    ) TO 'data_parquet/analytics/top_customers.parquet' (FORMAT PARQUET)
"""

# Daily totals for completed sales
daily_export_sql = """
    COPY (
        SELECT 
            DATE_TRUNC('day', transaction_date) as date,
            COUNT(*) as transactions,
            SUM(total_amount) as revenue,
            COUNT(DISTINCT customer_id) as unique_customers,
            COUNT(DISTINCT product_id) as unique_products
        FROM sales_detailed
        WHERE status = 'completed'
        GROUP BY date
        ORDER BY date
    ) TO 'data_parquet/analytics/daily_summary.parquet' (FORMAT PARQUET)
"""

# Run each export and confirm it right away, in the same order as listed above
export_jobs = [
    (category_export_sql, "‚úÖ Exportado: category_summary.parquet"),
    (top_customers_export_sql, "‚úÖ Exportado: top_customers.parquet"),
    (daily_export_sql, "‚úÖ Exportado: daily_summary.parquet"),
]
for export_sql, done_message in export_jobs:
    con.execute(export_sql)
    print(done_message)
    # Blank separator line intentionally omitted: output matches one line per export

# Show what ended up on disk, with sizes in MiB
print("\nüìÅ Archivos analytics creados:")
bytes_per_mb = 1024 * 1024
for f in analytics_path.glob('*.parquet'):
    size_mb = f.stat().st_size / bytes_per_mb
    print(f"   {f.name} ({size_mb:.2f} MB)")

## 12. Visualizaciones con Plotly

In [None]:
# Load the exported category summary, highest revenue first
category_df = con.execute("""
    SELECT * FROM 'data_parquet/analytics/category_summary.parquet'
    ORDER BY revenue DESC
""").df()

# Bar chart: revenue per category, coloured by profit
fig = px.bar(
    category_df, 
    x='category', 
    y='revenue',
    title='Revenue por Categor√≠a',
    labels={'category': 'Categor√≠a', 'revenue': 'Revenue ($)'},
    color='profit',
    color_continuous_scale='Viridis'
)
fig.show()

# Load the exported daily summary in chronological order
daily_df = con.execute("""
    SELECT * FROM 'data_parquet/analytics/daily_summary.parquet'
    ORDER BY date
""").df()

# Line chart: daily revenue over time
fig = px.line(
    daily_df,
    x='date',
    y='revenue',
    title='Revenue Diario',
    labels={'date': 'Fecha', 'revenue': 'Revenue ($)'}
)
fig.show()

# Top 10 customers by spend. The ORDER BY is explicit so the result does not
# depend on the row order in which the file happens to be read back.
top_customers_df = con.execute("""
    SELECT * FROM 'data_parquet/analytics/top_customers.parquet'
    ORDER BY total_spent DESC
    LIMIT 10
""").df()

fig = px.bar(
    top_customers_df,
    x='customer_name',
    y='total_spent',
    title='Top 10 Clientes por Gasto Total',
    labels={'customer_name': 'Cliente', 'total_spent': 'Gasto Total ($)'},
    color='total_orders',
    color_continuous_scale='Blues'
)
# The Figure method is update_xaxes (plural); update_xaxis does not exist and
# raises AttributeError.
fig.update_xaxes(tickangle=45)
fig.show()

print("‚úÖ Visualizaciones generadas")

## 13. Unión de Múltiples Parquet con Diferentes Schemas

In [None]:
# Simulate files with different schemas (e.g. data coming from different sources)
legacy_path = base_path / 'legacy'
legacy_path.mkdir(exist_ok=True)

# Old schema: only a subset of the columns
legacy_transactions = transactions_df[[
    'transaction_id', 'transaction_date', 'customer_id', 
    'product_id', 'total_amount'
]].head(100).copy()

legacy_transactions.to_parquet(legacy_path / 'old_transactions.parquet', index=False)

# New schema: all the business columns
new_transactions = transactions_df[[
    'transaction_id', 'transaction_date', 'customer_id', 
    'product_id', 'quantity', 'unit_price', 'total_amount', 
    'payment_method', 'status'
]].tail(100).copy()

new_transactions.to_parquet(legacy_path / 'new_transactions.parquet', index=False)

print("‚úÖ Archivos con diferentes schemas creados")

# Combine both files with automatic schema matching.
# The old file has no payment_method/status columns, so selecting them from it
# directly fails to bind. union_by_name aligns the files by column name and
# fills the columns a file lacks with NULL, which COALESCE then replaces.
# filename=true exposes the originating file so each row can be tagged.
result = con.execute("""
    SELECT
        transaction_id,
        transaction_date,
        customer_id,
        product_id,
        total_amount,
        COALESCE(payment_method, 'Unknown') as payment_method,
        COALESCE(status, 'Unknown') as status,
        CASE
            WHEN filename LIKE '%old_transactions.parquet' THEN 'legacy'
            ELSE 'new'
        END as source
    FROM read_parquet(
        [
            'data_parquet/legacy/old_transactions.parquet',
            'data_parquet/legacy/new_transactions.parquet'
        ],
        union_by_name = true,
        filename = true
    )
    ORDER BY transaction_date
""").df()

print("\nüìä Union de schemas diferentes:")
print(result.head(10))
print(f"\nüìä Total registros: {len(result)}")
print(f"üìä Fuentes: {result['source'].value_counts().to_dict()}")

## 14. Metadata y Schema Inspection

In [None]:
# Column names and types of a Parquet file, as seen by DuckDB
schema_info = con.execute("""
    DESCRIBE SELECT * FROM 'data_parquet/products.parquet'
""").df()

print("üìä Schema de products.parquet:")
print(schema_info)

# File-level Parquet metadata via PyArrow
parquet_file = pq.ParquetFile('data_parquet/products.parquet')
print("\nüìä Metadata del archivo:")
print(f"   N√∫mero de row groups: {parquet_file.num_row_groups}")
print(f"   N√∫mero de filas: {parquet_file.metadata.num_rows}")
print(f"   N√∫mero de columnas: {parquet_file.metadata.num_columns}")
print(f"   Tama√±o serializado: {parquet_file.metadata.serialized_size} bytes")

# Per-column statistics of the first row group (skipped for a file without any)
print("\nüìä Estad√≠sticas de columnas:")
if parquet_file.num_row_groups > 0:
    first_row_group = parquet_file.metadata.row_group(0)
    for i in range(parquet_file.metadata.num_columns):
        col = first_row_group.column(i)
        print(f"   {col.path_in_schema}: {col.statistics}")

# List every Parquet file with its size and modification time.
# This comes from the filesystem rather than read_parquet(): that function
# exposes no file_size / file_modified_time columns, and a single scan over
# 'data_parquet/**/*.parquet' would mix files with unrelated schemas.
file_rows = []
for parquet_path in Path('data_parquet').rglob('*.parquet'):
    file_stat = parquet_path.stat()
    file_rows.append({
        'filename': parquet_path.as_posix(),
        'file_size': file_stat.st_size,
        'file_modified_time': datetime.fromtimestamp(file_stat.st_mtime),
    })

result = (
    pd.DataFrame(file_rows, columns=['filename', 'file_size', 'file_modified_time'])
    .sort_values('file_size', ascending=False)
    .reset_index(drop=True)
)

print("\nüìÅ Todos los archivos Parquet:")
print(result)

## 15. Best Practices y Optimizaciones

In [None]:
import time

print("üí° MEJORES PR√ÅCTICAS PARA PARQUET + DUCKDB")
print("=" * 60)

print("\n1Ô∏è‚É£ PARTICIONAMIENTO:")
print("   ‚úÖ Particiona por columnas frecuentemente filtradas (fecha, regi√≥n, etc.)")
print("   ‚úÖ Evita demasiadas particiones peque√±as (<100MB cada una)")
print("   ‚úÖ Usa Hive-style partitioning (year=2024/month=01/)")

print("\n2Ô∏è‚É£ COMPRESI√ìN:")
print("   ‚úÖ Usa SNAPPY para balance velocidad/compresi√≥n")
print("   ‚úÖ Usa GZIP para m√°xima compresi√≥n (m√°s lento)")
print("   ‚úÖ Usa ZSTD para mejor compresi√≥n moderna")

print("\n3Ô∏è‚É£ SCHEMA:")
print("   ‚úÖ Usa tipos de datos apropiados (INT32 vs INT64)")
print("   ‚úÖ Considera diccionarios para strings repetitivos")
print("   ‚úÖ Mant√©n schemas consistentes entre archivos")

print("\n4Ô∏è‚É£ QUERIES:")
print("   ‚úÖ Usa projection pushdown (SELECT solo columnas necesarias)")
print("   ‚úÖ Usa predicate pushdown (WHERE en particiones)")
print("   ‚úÖ Evita SELECT * en datasets grandes")

print("\n5Ô∏è‚É£ ESCRITURA:")
print("   ‚úÖ Escribe batches grandes (no muchos archivos peque√±os)")
print("   ‚úÖ Usa row groups de ~128MB")
print("   ‚úÖ Considera ordenar datos antes de escribir")

# Projection pushdown demo
print("\n" + "=" * 60)
print("üìä DEMO: Projection Pushdown\n")

# Baseline: SELECT * really returns every column, so every column is decoded.
# Wrapping SELECT * in a subquery and picking two columns outside would NOT be
# a valid baseline: DuckDB prunes the unused columns through the subquery, so
# both sides would run the same plan.
# No LIMIT is used, otherwise the scan stops early and only noise is measured.
start = time.perf_counter()
result = con.execute("""
    SELECT *
    FROM 'data_parquet/transactions/**/*.parquet'
""").df()
time_without = time.perf_counter() - start

# With projection: only the two requested columns are read from the files
start = time.perf_counter()
result = con.execute("""
    SELECT transaction_id, total_amount
    FROM 'data_parquet/transactions/**/*.parquet'
""").df()
time_with = time.perf_counter() - start

# Guard against a zero-length measurement on very fast runs
improvement = time_without / time_with if time_with > 0 else float('inf')

print(f"Sin projection pushdown: {time_without*1000:.2f}ms")
print(f"Con projection pushdown: {time_with*1000:.2f}ms")
print(f"Mejora: {improvement:.2f}x m√°s r√°pido")

## 16. Resumen

In [None]:
# Closing statistics for the tutorial
print("üéâ RESUMEN DEL TUTORIAL")
print("=" * 60)

# Collect every Parquet file under the output folder and add up their sizes
parquet_files = list(base_path.rglob('*.parquet'))
total_size = 0
for parquet_path in parquet_files:
    total_size += parquet_path.stat().st_size

print(f"\nüìÅ Archivos creados: {len(parquet_files)}")
print(f"üíæ Tama√±o total: {total_size / (1024*1024):.2f} MB")

# Row counts per dataset plus the revenue of completed transactions, in one query
summary_sql = """
    SELECT 
        (SELECT COUNT(*) FROM 'data_parquet/products.parquet') as productos,
        (SELECT COUNT(*) FROM 'data_parquet/customers.parquet') as clientes,
        (SELECT COUNT(*) FROM 'data_parquet/transactions/**/*.parquet') as transacciones,
        (SELECT SUM(total_amount) FROM 'data_parquet/transactions/**/*.parquet' WHERE status = 'completed') as revenue_total
"""
stats = con.execute(summary_sql).df()

# Read each value from its own column so the integer counts keep their dtype
product_count = stats['productos'].iloc[0]
customer_count = stats['clientes'].iloc[0]
transaction_count = stats['transacciones'].iloc[0]
completed_revenue = stats['revenue_total'].iloc[0]

print(f"\nüìä Datos procesados:")
print(f"   Productos: {product_count:,}")
print(f"   Clientes: {customer_count:,}")
print(f"   Transacciones: {transaction_count:,}")
print(f"   Revenue total: ${completed_revenue:,.2f}")

print(f"\n‚úÖ Conceptos aprendidos:")
learned_concepts = (
    "Particionamiento de datos",
    "Lectura de m√∫ltiples archivos Parquet",
    "Queries distribuidos con DuckDB",
    "Joins entre archivos",
    "Window functions",
    "Optimizaciones de performance",
    "Schema evolution",
)
for concept in learned_concepts:
    print(f"   - {concept}")

# Release the DuckDB connection
con.close()
print("\n‚úÖ Conexi√≥n DuckDB cerrada")