# 01 — Data Exploration
Quick overview of the Olist Brazilian E-Commerce dataset.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Directory holding the raw Olist CSV exports, relative to this notebook.
RAW = '../data/raw/'


def _read_raw(filename, **read_csv_options):
    """Read one CSV file from the RAW directory into a DataFrame.

    Any keyword arguments are forwarded unchanged to ``pd.read_csv``.
    """
    return pd.read_csv(RAW + filename, **read_csv_options)


customers = _read_raw('olist_customers_dataset.csv')
# These orders columns are parsed as datetimes at load time; the date-range
# summary further down calls .date() on the purchase timestamp.
orders = _read_raw(
    'olist_orders_dataset.csv',
    parse_dates=[
        'order_purchase_timestamp',
        'order_delivered_customer_date',
        'order_estimated_delivery_date',
    ],
)
items = _read_raw('olist_order_items_dataset.csv')
payments = _read_raw('olist_order_payments_dataset.csv')
reviews = _read_raw('olist_order_reviews_dataset.csv')
products = _read_raw('olist_products_dataset.csv')
sellers = _read_raw('olist_sellers_dataset.csv')
categories = _read_raw('product_category_name_translation.csv')

print('All files loaded.')

## Table Shapes & Missing Values

In [None]:
# Name -> DataFrame lookup so every loaded table can be summarised in one loop.
tables = dict(
    customers=customers,
    orders=orders,
    items=items,
    payments=payments,
    reviews=reviews,
    products=products,
    sellers=sellers,
    categories=categories,
)

for name, df in tables.items():
    n_rows, n_cols = df.shape
    # Null cells pooled over all columns, as a percentage of all cells in the table.
    null_cells = df.isna().sum().sum()
    pct_missing = (null_cells / (n_rows * n_cols)) * 100
    print(f'{name:12s}  {n_rows:>7,} rows x {n_cols:>2} cols  |  {pct_missing:.1f}% missing')

## Key Counts & Date Range

In [None]:
# Distinct-ID counts for the main entities. Customers are counted on
# `customer_unique_id` (not `customer_id`).
n_customers = customers['customer_unique_id'].nunique()
n_orders = orders['order_id'].nunique()
n_products = products['product_id'].nunique()
n_sellers = sellers['seller_id'].nunique()

# Earliest and latest purchase, reduced to calendar dates for display.
purchase_ts = orders['order_purchase_timestamp']
first_purchase = purchase_ts.min().date()
last_purchase = purchase_ts.max().date()

print(f"Unique customers:  {n_customers:,}")
print(f"Unique orders:     {n_orders:,}")
print(f"Unique products:   {n_products:,}")
print(f"Unique sellers:    {n_sellers:,}")
print(f"Order date range:  {first_purchase} to {last_purchase}")

## How Tables Connect
```
customers --(customer_id)--> orders --(order_id)--> items --(product_id)--> products
                                |                     |
                                |                     +--(seller_id)--> sellers
                                |
                                +--(order_id)--> payments
                                +--(order_id)--> reviews
```

## Review Score Distribution

In [None]:
# Two side-by-side panels: score histogram (left) and comment presence (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
ax_scores, ax_text = axes

# Left panel: number of reviews per score, ordered by score value.
score_counts = reviews['review_score'].value_counts().sort_index()
score_counts.plot(kind='bar', ax=ax_scores, color='steelblue')
ax_scores.set(title='Review Score Distribution', xlabel='Score', ylabel='Count')

# Right panel: reviews with vs. without a free-text comment.
comment_present = reviews['review_comment_message'].notna()
has_text = comment_present.sum()
no_text = (~comment_present).sum()
ax_text.bar(['Has text', 'No text'], [has_text, no_text], color=['steelblue', 'lightcoral'])
ax_text.set_title(f'Reviews with Text: {has_text:,} / {len(reviews):,} ({has_text/len(reviews)*100:.0f}%)')

plt.tight_layout()
plt.show()

## Sample Review Texts

In [None]:
# Reviews that include a comment, narrowed to the score and the comment text.
commented_reviews = reviews.loc[
    reviews['review_comment_message'].notna(),
    ['review_score', 'review_comment_message'],
]
# Fixed random_state keeps the five displayed examples the same on every run.
commented_reviews.sample(5, random_state=42)

## Top Product Categories & Payment Methods

In [None]:
# Attach the English category name to each product. The left join keeps every
# product row, including rows whose category has no match in `categories`.
product_cats = products.merge(categories, on='product_category_name', how='left')

# Counts are product rows per English category name. value_counts() skips NaN
# by default, so products without an English name do not appear in the ranking.
print('Top 10 Product Categories:')
print(product_cats['product_category_name_english'].value_counts().head(10).to_string())

# The newline must be written as the `\n` escape inside the literal: a
# single-quoted string cannot span physical lines. It leaves a blank line
# between the two listings.
print('\nPayment Methods:')
print(payments['payment_type'].value_counts().to_string())

## Order Value Distribution

In [None]:
# Per-order total of item prices (sum of `price` over the order's item rows).
order_values = items.groupby('order_id')['price'].sum()
print(order_values.describe())

# Only values strictly below the 99th percentile are plotted; the title
# describes this as trimming.
p99_cutoff = order_values.quantile(0.99)
trimmed_values = order_values[order_values < p99_cutoff]

fig, ax = plt.subplots(figsize=(8, 3))
trimmed_values.hist(bins=50, ax=ax, color='steelblue', edgecolor='white')
ax.set(
    title='Order Value Distribution (trimmed at 99th percentile)',
    xlabel='Order Value (BRL)',
)
plt.tight_layout()
plt.show()