In [None]:
import psycopg2
import pandas as pd
from sqlalchemy import create_engine

# Open a SQLAlchemy engine against the Postgres database and load the three
# tables from the "silver" schema into DataFrames, in the same order as the
# queries are listed.
engine = create_engine(
    "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"
)

orders, customers, order_items = (
    pd.read_sql(sql=query, con=engine)
    for query in (
        'SELECT * FROM silver."olist_orders_dataset";',
        'SELECT * FROM silver."olist_customers_dataset";',
        'SELECT * FROM silver."olist_order_items_dataset";',
    )
)

import numpy as np

# 1. Check for nulls in primary-key columns.
print("Nulos em chaves primárias:")
print("orders:", orders['order_id'].isnull().sum())
print("customers:", customers['customer_id'].isnull().sum())
# The duplicate check in this script treats (order_id, order_item_id) as the
# key of order_items, so a row is counted here when either component is null,
# not only when order_item_id is missing.
print(
    "order_items:",
    order_items[['order_id', 'order_item_id']].isnull().any(axis=1).sum(),
)

# 2. Count rows whose key columns repeat those of an earlier row.
print("\nDuplicados:")
duplicate_checks = (
    ("orders:", orders, ['order_id']),
    ("customers:", customers, ['customer_id']),
    ("order_items:", order_items, ['order_id', 'order_item_id']),
)
for label, frame, key_columns in duplicate_checks:
    print(label, frame.duplicated(subset=key_columns).sum())

# 3. Date consistency: count orders whose delivery date is earlier than the
# purchase timestamp. Skipped unless both columns are present.
date_columns = {'order_purchase_timestamp', 'order_delivered_customer_date'}
if date_columns.issubset(orders.columns):
    print("\nDatas inconsistentes (entrega antes da compra):")
    purchased_at = orders['order_purchase_timestamp']
    delivered_at = orders['order_delivered_customer_date']
    print((delivered_at < purchased_at).sum())

# 4. Invalid zip-code prefixes (a valid prefix is exactly 5 digits).
if 'customer_zip_code_prefix' in customers.columns:
    print("\nCEPs inválidos (customer_zip_code_prefix):")
    # The digit class must be written `\d` inside a raw string; `\\d` would
    # match a literal backslash followed by "d" and flag every row as invalid.
    # NOTE(review): if the column is stored as an integer, astype(str) drops
    # leading zeros and those rows are counted as invalid — confirm the dtype.
    zip_prefix = customers['customer_zip_code_prefix'].astype(str)
    print(customers[~zip_prefix.str.match(r'^\d{5}$')].shape[0])

# 5. Negative values: count rows where price / freight_value is below zero.
# Values that cannot be parsed as numbers become NaN (errors='coerce') and
# therefore never count as negative. Each check is skipped when its column
# is absent.
item_columns = order_items.columns
if 'price' in item_columns:
    print("\nValores negativos em price:")
    price_numeric = pd.to_numeric(order_items['price'], errors='coerce')
    print(price_numeric.lt(0).sum())
if 'freight_value' in item_columns:
    print("Valores negativos em freight_value:")
    freight_numeric = pd.to_numeric(order_items['freight_value'], errors='coerce')
    print(freight_numeric.lt(0).sum())

# 6. Foreign keys: count child rows whose reference has no matching parent row.
print("\nChaves estrangeiras inconsistentes:")
# order_items rows whose order_id does not appear in orders.
items_without_order = ~order_items['order_id'].isin(orders['order_id'])
print("order_items sem order_id correspondente:", items_without_order.sum())
# orders rows whose customer_id does not appear in customers.
orders_without_customer = ~orders['customer_id'].isin(customers['customer_id'])
print("orders sem customer_id correspondente:", orders_without_customer.sum())

# 7. Basic statistics: print describe(include='all') for each table, which
# summarizes every column regardless of dtype.
print("\nEstatísticas básicas:")
for label, frame in (
    ("orders:", orders),
    ("customers:", customers),
    ("order_items:", order_items),
):
    print(label, frame.describe(include='all'))

Nulos em chaves primárias:
orders: 0
customers: 0
order_items: 0

Duplicados:
orders: 0
customers: 0
order_items: 0

Datas inconsistentes (entrega antes da compra):
0

CEPs inválidos (customer_zip_code_prefix):
99441

Valores negativos em price:
0
Valores negativos em freight_value:
0

Chaves estrangeiras inconsistentes:
order_items sem order_id correspondente: 0
orders sem customer_id correspondente: 0

Estatísticas básicas:
orders:                                 order_id                       customer_id  \
count                              99441                             99441   
unique                             99441                             99441   
top     e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
freq                                   1                                 1   
mean                                 NaN                               NaN   
min                                  NaN                               NaN   
25%             