# Exploratory Data Analysis - Olist E-Commerce Dataset

## Objectives
1. Understand the relationships among the 8 tables
2. Identify data quality issues
3. Document business metrics
4. Explore key patterns and insights

In [None]:
import duckdb
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pathlib import Path

# Configure visualization
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline

In [None]:
# Connect to DuckDB
# The path is relative to the current working directory, so the notebook must be
# started from its own folder for '..' to point at the database file.
db_path = Path('..') / 'ecommerce_raw.duckdb'

# duckdb.connect() creates a brand-new, empty database when the file is missing,
# which would only surface later as "schema/table does not exist" errors.
# Fail fast with an actionable message instead.
if not db_path.is_file():
    raise FileNotFoundError(
        f"DuckDB database not found at {db_path.resolve()} - "
        "load the raw data first, or start the notebook from its own directory."
    )

con = duckdb.connect(str(db_path))
print(f"Connected to: {db_path}")

## 1. Data Overview

In [None]:
# List all tables in the `raw` schema together with their actual row counts.
# information_schema.tables holds one row per table, so it can only supply the
# names; the counts have to be queried from each table individually.
table_names = [
    row[0]
    for row in con.execute("""
        SELECT DISTINCT table_name
        FROM information_schema.tables
        WHERE table_schema = 'raw'
        ORDER BY table_name
    """).fetchall()
]


def _quote_identifier(name):
    """Return `name` as a double-quoted SQL identifier (embedded quotes doubled)."""
    return '"' + name.replace('"', '""') + '"'


# Build the summary frame in one go instead of patching a placeholder column
# row by row with .loc assignments.
tables = pd.DataFrame({
    'table_name': table_names,
    'row_count': [
        con.execute(f"SELECT COUNT(*) FROM raw.{_quote_identifier(name)}").fetchone()[0]
        for name in table_names
    ],
}).astype({'row_count': 'int64'})  # keeps an integer dtype even when no tables are found

print("\nAvailable Tables:")
tables

## 2. Explore Each Table

In [None]:
# Orders: show a five-row preview, then compare total rows with distinct order_ids
orders_preview_sql = "SELECT * FROM raw.olist_orders_dataset LIMIT 5"
orders = con.execute(orders_preview_sql).df()
print("Orders Table:")
display(orders)

# The tuple printed below is (COUNT(*), COUNT(DISTINCT order_id)); equal values
# mean no order_id is repeated or NULL.
orders_row_stats = con.execute(
    'SELECT COUNT(*), COUNT(DISTINCT order_id) FROM raw.olist_orders_dataset'
).fetchone()
print(f"\nShape: {orders_row_stats}")

In [None]:
# Order Items: five-row preview of the raw table
order_items_preview_sql = "SELECT * FROM raw.olist_order_items_dataset LIMIT 5"
order_items = con.execute(order_items_preview_sql).df()
print("Order Items Table:")
display(order_items)

In [None]:
# Customers: five-row preview of the raw table
customers_preview_sql = "SELECT * FROM raw.olist_customers_dataset LIMIT 5"
customers = con.execute(customers_preview_sql).df()
print("Customers Table:")
display(customers)

## 3. Data Quality Checks

In [None]:
# Null audit for the orders table: one output row holding the total row count
# and, for each of four key columns, how many rows have a NULL in that column.
null_check_sql = """
    SELECT 
        COUNT(*) as total_rows,
        SUM(CASE WHEN order_id IS NULL THEN 1 ELSE 0 END) as null_order_id,
        SUM(CASE WHEN customer_id IS NULL THEN 1 ELSE 0 END) as null_customer_id,
        SUM(CASE WHEN order_status IS NULL THEN 1 ELSE 0 END) as null_status,
        SUM(CASE WHEN order_purchase_timestamp IS NULL THEN 1 ELSE 0 END) as null_timestamp
    FROM raw.olist_orders_dataset
"""
null_check = con.execute(null_check_sql).df()

print("Null Checks - Orders:")
null_check

## 4. Business Metrics Exploration

In [None]:
# Monthly order volume: aggregate in DuckDB, show the latest rows, chart the trend
orders_by_month_sql = """
    SELECT 
        DATE_TRUNC('month', order_purchase_timestamp) as month,
        COUNT(*) as order_count
    FROM raw.olist_orders_dataset
    WHERE order_purchase_timestamp IS NOT NULL
    GROUP BY month
    ORDER BY month
"""
orders_by_month = con.execute(orders_by_month_sql).df()

print("Orders by Month:")
display(orders_by_month.tail(10))

# Line chart drawn through the explicit Figure/Axes interface
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(orders_by_month['month'], orders_by_month['order_count'], marker='o')
ax.set(title='Orders Over Time', xlabel='Month', ylabel='Order Count')
ax.tick_params(axis='x', labelrotation=45)  # tilt the month labels
fig.tight_layout()
plt.show()

In [None]:
# Order counts per status, most frequent first, as a table and a bar chart
status_dist_sql = """
    SELECT order_status, COUNT(*) as count
    FROM raw.olist_orders_dataset
    GROUP BY order_status
    ORDER BY count DESC
"""
status_dist = con.execute(status_dist_sql).df()

print("Order Status Distribution:")
display(status_dist)

# Bar chart drawn through the explicit Figure/Axes interface
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(status_dist['order_status'], status_dist['count'])
ax.set(title='Order Status Distribution', xlabel='Status', ylabel='Count')
ax.tick_params(axis='x', labelrotation=45)  # tilt the status labels
fig.tight_layout()
plt.show()

## 5. Relationship Analysis

In [None]:
# Join orders with items and payments (sample of delivered orders).
#
# Items and payments are both "many" sides of order_id. Joining them to orders
# in a single pass multiplies the rows (n items x m payments), which inflates
# SUM(price) by m and SUM(payment_value) by n. Each child table is therefore
# aggregated to one row per order_id first and only then joined.
order_summary = con.execute("""
    WITH delivered AS (
        -- DISTINCT mirrors the de-duplication a GROUP BY on these keys would give
        SELECT DISTINCT order_id, customer_id, order_status
        FROM raw.olist_orders_dataset
        WHERE order_status = 'delivered'
    ),
    items AS (
        SELECT
            order_id,
            COUNT(DISTINCT order_item_id) AS item_count,
            SUM(price) AS total_price
        FROM raw.olist_order_items_dataset
        GROUP BY order_id
    ),
    payments AS (
        SELECT
            order_id,
            SUM(payment_value) AS total_payment
        FROM raw.olist_order_payments_dataset
        GROUP BY order_id
    )
    SELECT
        d.order_id,
        d.customer_id,
        d.order_status,
        COALESCE(i.item_count, 0) AS item_count,  -- 0 when the order has no item rows
        i.total_price,                            -- NULL when the order has no item rows
        p.total_payment                           -- NULL when the order has no payment rows
    FROM delivered d
    LEFT JOIN items i ON d.order_id = i.order_id
    LEFT JOIN payments p ON d.order_id = p.order_id
    ORDER BY d.order_id  -- stable sample: LIMIT without ORDER BY returns arbitrary rows
    LIMIT 10
""").df()

print("Order Summary (Sample):")
order_summary

## 6. Key Insights & Next Steps

**Data Quality Observations:**
- TBD after running analysis

**Business Insights:**
- TBD after running analysis

**Next Steps:**
1. Build dbt staging models to clean data
2. Create dimensional model (star schema)
3. Develop business metrics layer

In [None]:
# Close connection
# Closes the DuckDB connection created by duckdb.connect() earlier in the notebook.
# Cells that use `con` cannot be re-run after this point until the connect cell
# has been executed again.
con.close()