PART 1: REVENUE ANALYSIS

Query 1: Monthly Revenue Trend

In [6]:
import pandas as pd
import sqlite3

conn = sqlite3.connect('../data/processed/ecommerce.db')

def run_query(query, title):
    print(f"\n{'=' * 70}")
    print(title)
    print('=' * 70)
    result = pd.read_sql_query(query, conn)
    print(result.to_string(index=False))
    return result

In [11]:
query1 = """
SELECT 
    order_year,
    order_month,
    COUNT(DISTINCT o.order_id) as total_orders,
    COUNT(DISTINCT o.customer_id) as unique_customers,
    ROUND(SUM(p.total_payment_value), 2) as total_revenue,
    ROUND(AVG(p.total_payment_value), 2) as avg_order_value
FROM orders o
JOIN order_payments p ON o.order_id = p.order_id
GROUP BY order_year, order_month
ORDER BY order_year, order_month
"""

result1 = run_query(query1, "QUERY 1: Monthly Revenue Trend")


QUERY 1: Monthly Revenue Trend
 order_year  order_month  total_orders  unique_customers  total_revenue  avg_order_value
       2016           10           265               265       46566.71           175.72
       2016           12             1                 1          19.62            19.62
       2017            1           750               750      127545.67           170.06
       2017            2          1653              1653      271298.65           164.13
       2017            3          2546              2546      414369.39           162.75
       2017            4          2303              2303      390952.18           169.76
       2017            5          3546              3546      567066.73           159.92
       2017            6          3135              3135      490225.60           156.37
       2017            7          3872              3872      566403.93           146.28
       2017            8          4193              4193      646000.61       

Query 2: Top 10 Product Categories

In [18]:
querry2 = """
SELECT
    pr.product_category_name_english as category,
    COUNT(DISTINCT oi.order_id) as total_orders,
    COUNT(oi.product_id) as items_sold,
    ROUND(SUM(oi.price), 2) as total_revenue,
    ROUND(AVG(oi.price), 2) as avg_item_price
FROM order_items oi
JOIN products pr ON oi.product_id = pr.product_id
GROUP BY pr.product_category_name_english
ORDER BY total_revenue DESC
LIMIT 10
"""

result2 = run_query(query2, "QUERY 2: Top 10 Product Categories by Revenue")


QUERY 2: Top 10 Product Categories by Revenue
             category  total_orders  items_sold  total_revenue  avg_item_price
        health_beauty          8836        9670     1258681.34          130.16
        watches_gifts          5624        5991     1205005.68          201.14
       bed_bath_table          9417       11115     1036988.68           93.30
       sports_leisure          7720        8641      988048.97          114.34
computers_accessories          6689        7827      911954.32          116.51
      furniture_decor          6449        8334      729762.49           87.56
           cool_stuff          3632        3796      635290.85          167.36
           housewares          5884        6964      632248.66           90.79
                 auto          3897        4235      592720.11          139.96
         garden_tools          3518        4347      485256.46          111.63


In [19]:
top10_revenue = result2['total_revenue'].sum()
total_revenue = pd.read_sql_query(
    "SELECT SUM(price) as total FROM order_items",
    conn
)['total'][0]

concentration = (top10_revenue / total_revenue) * 100
print(f"\nTop 10 categories represent {concentration:.1f}% of toal revenue")


Top 10 categories represent 62.4% of toal revenue


Query 3: Revenue by State

In [41]:
query3 = """
SELECT
    c.customer_state,
    COUNT(DISTINCT o.order_id) as total_orders,
    COUNT(DISTINCT c.customer_unique_id) as unique_customers,
    ROUND(SUM(p.total_payment_value), 2) as total_revenue,
    ROUND(AVG(p.total_payment_value), 2) as avg_order_value,
    ROUND(SUM(p.total_payment_value) / COUNT(DISTINCT c.customer_unique_id), 2) as revenue_per_customer
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
JOIN order_payments p ON o.order_id = p.order_id
GROUP BY c.customer_state
ORDER BY total_revenue DESC
LIMIT 10
"""

result3 = run_query(query3, "QUERY 3: Top 10 States by Revenue")


QUERY 3: Top 10 States by Revenue
customer_state  total_orders  unique_customers  total_revenue  avg_order_value  revenue_per_customer
            SP         40500             39155     5770266.19           142.48                147.37
            RJ         12350             11917     2055690.45           166.45                172.50
            MG         11354             11001     1819277.61           160.23                165.37
            RS          5345              5168      861802.40           161.24                166.76
            PR          4923              4769      781919.55           158.83                163.96
            SC          3546              3449      595208.40           167.85                172.57
            BA          3256              3158      591270.60           181.59                187.23
            DF          2080              2019      346146.17           166.42                171.44
            GO          1957              1895      3342

PART 2: OPERATIONAL ANALYSIS

Query 4: Delivery Performance by State

In [47]:
query4 = """
SELECT
    c.customer_state,
    COUNT(o.order_id) as total_orders,
    ROUND(AVG(o.delivery_time_days), 1) as avg_delivery_days,
    ROUND(AVG(o.delivery_delay_days), 1) as acg_delay_days,
    ROUND(AVG(CAST(o.on_time_delivery AS FLOAT)) *100, 1) as on_time_pct
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
WHERE o.delivery_time_days IS NOT NULL
GROUP BY c.customer_state
ORDER BY avg_delivery_days DESC
LIMIT 15
"""

result4 = run_query(query4, "QUERY 4: Delivery Performance by State")


QUERY 4: Delivery Performance by State
customer_state  total_orders  avg_delivery_days  acg_delay_days  on_time_pct
            RR            41               29.0           -17.3         87.8
            AP            67               26.7           -19.7         97.0
            AM           145               26.0           -19.6         97.2
            AL           397               24.0            -8.7         78.6
            PA           946               23.3           -14.1         88.8
            MA           717               21.1            -9.6         82.6
            SE           335               21.0           -10.0         84.8
            CE          1279               20.8           -10.8         86.2
            AC            80               20.6           -20.7         96.3
            PB           517               20.0           -13.3         89.6
            PI           476               19.0           -11.3         86.1
            RO           243        

Query 5: Late Delivery Hotspots

In [50]:
query5 = """
SELECT
    c.customer_state,
    COUNT(o.order_id) as total_orders,
    SUM(CASE WHEN o.delivery_delay_days > 0 THEN 1 ELSE 0 END) as late_deliveries,
    ROUND(
        CAST(SUM(CASE WHEN o.delivery_delay_days > 0 THEN 1 ELSE 0 END) AS FLOAT) /
        COUNT(o.order_id) * 100,
        1
    ) as late_pct,
    ROUND(AVG(CASE WHEN o.delivery_delay_days > 0 THEN o.delivery_delay_days END), 1) as avg_days_late
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
WHERE o.delivery_delay_days IS NOT NULL
GROUP BY c.customer_state
HAVING late_pct > 20
ORDER BY late_pct DESC
"""

result5 = run_query(query5, "QUERY 5: States with >20% Late Delivery Rate")


QUERY 5: States with >20% Late Delivery Rate


customer_state  total_orders  late_deliveries  late_pct  avg_days_late
            AL           397               85      21.4            9.5


Query 6: Freight Cost Analysis

In [52]:
query6 = """
SELECT
    pr.product_category_name_english as category,
    COUNT(oi.order_id)as total_orders,
    ROUND(AVG(oi.price), 2) as avg_product_price,
    ROUND(AVG(oi.freight_value), 2) as avg_freight,
    ROUND(AVG(oi.freight_pct_of_price), 1) as avg_freight_pct
FROM order_items oi
JOIN products pr ON oi.product_id = pr.product_id
GROUP BY pr.product_category_name_english
HAVING COUNT(oi.order_id) > 50
ORDER BY avg_freight_pct DESC
LIMIT 10"""

result6 = run_query(query6, "QUERY 6: Categories with Highest Freight Percentage")


QUERY 6: Categories with Highest Freight Percentage
                category  total_orders  avg_product_price  avg_freight  avg_freight_pct
            dvds_blu_ray            64              93.74        20.14             83.3
             electronics          2767              57.91        16.83             68.4
      christmas_supplies           153              57.52        21.11             67.6
 fashion_underwear_beach           131              72.84        14.63             56.6
  signaling_and_security           199             108.09        32.70             54.6
               telephony          4545              71.21        15.67             50.6
              food_drink           278              54.60        16.22             49.8
                  drinks           379              59.18        15.15             42.8
costruction_tools_garden           238             108.05        22.32             42.5
              housewares          6964              90.79        20

PART 3: CUSTOMER ANALYSIS (2 QUERIES)

Query 7: Customer Segmentation

In [56]:
query7 = """
SELECT
    CASE
        WHEN order_count = 1 THEN 'One-time'
        WHEN order_count BETWEEN 2 AND 3 THEN 'Repeat'
        ELSE 'Loyal'
    END as customer_segment,
    COUNT(*) as customer_count,
    ROUND(AVG(lifetime_value), 2) as avg_lifetime_value,
    ROUND(SUM(lifetime_value), 2) as total_segment_revenue
FROM (
    SELECT
        c.customer_unique_id,
        COUNT(DISTINCT o.order_id) as order_count,
        SUM(p.total_payment_value) as lifetime_value
    FROM customers c
    JOIN orders o ON c.customer_id = o.customer_id
    JOIN order_payments p ON o.order_id = p.order_id
    GROUP BY c.customer_unique_id
) customer_stats
GROUP BY customer_segment
ORDER BY avg_lifetime_value DESC
"""

result7 = run_query(query7, "QUERY 7: Customer Segmentation by Purchase Frequency")


QUERY 7: Customer Segmentation by Purchase Frequency
customer_segment  customer_count  avg_lifetime_value  total_segment_revenue
           Loyal              47              789.42               37102.97
          Repeat            2754              300.38              827254.24
        One-time           90556              160.76            14558104.56


Query 8: Top Customers

In [61]:
query8 = """
SELECT 
    c.customer_unique_id,
    c.customer_state,
    COUNT(DISTINCT o.order_id) as total_orders,
    ROUND(SUM(p.total_payment_value), 2) as lifetime_value,
    ROUND(AVG(p.total_payment_value), 2) as avg_order_value,
    MIN(o.order_purchase_timestamp) as first_order,
    MAX(o.order_purchase_timestamp) as last_order
FROM customers c
JOIN orders o ON c.customer_id = o.customer_id
JOIN order_payments p ON o.order_id = p.order_id
GROUP BY c.customer_unique_id, c.customer_state
ORDER BY lifetime_value DESC
LIMIT 20
"""

result8 = run_query(query8, "QUERY 8: Top 20 Customers by Lifetime Value")


QUERY 8: Top 20 Customers by Lifetime Value
              customer_unique_id customer_state  total_orders  lifetime_value  avg_order_value         first_order          last_order
0a0a92112bd4c708ca5fde585afaa872             RJ             1        13664.08         13664.08 2017-09-29 15:24:52 2017-09-29 15:24:52
da122df9eeddfedc1dc1f5349a1a690c             RJ             2         7571.63          3785.82 2017-04-01 15:58:40 2017-04-01 15:58:41
763c8b1c9c68a0229c42c9fc6f662b93             ES             1         7274.88          7274.88 2018-07-15 14:49:44 2018-07-15 14:49:44
dc4802a71eae9be1dd28f5d788ceb526             MS             1         6929.31          6929.31 2017-02-12 20:37:36 2017-02-12 20:37:36
459bef486812aa25204be022145caa62             ES             1         6922.21          6922.21 2018-07-25 18:10:17 2018-07-25 18:10:17
ff4159b92c40ebe40454e3e6a7c35ed6             SP             1         6726.66          6726.66 2017-05-24 18:14:34 2017-05-24 18:14:34
4007669dec

PART 4: BEHAVIORAL ANALYSIS (2 QUERIES)

Query 9: Order Patterns by Day of Week

In [63]:
query9 = """
SELECT 
    order_day_of_week,
    CASE order_day_of_week
        WHEN 0 THEN 'Monday'
        WHEN 1 THEN 'Tuesday'
        WHEN 2 THEN 'Wednesday'
        WHEN 3 THEN 'Thursday'
        WHEN 4 THEN 'Friday'
        WHEN 5 THEN 'Saturday'
        WHEN 6 THEN 'Sunday'
    END as day_name,
    COUNT(DISTINCT o.order_id) as total_orders,
    ROUND(SUM(p.total_payment_value), 2) as total_revenue,
    ROUND(AVG(p.total_payment_value), 2) as avg_order_value
FROM orders o
JOIN order_payments p ON o.order_id = p.order_id
GROUP BY order_day_of_week
ORDER BY order_day_of_week
"""

result9 = run_query(query9, "QUERY 9: Order Patterns by Day of Week")


QUERY 9: Order Patterns by Day of Week
 order_day_of_week  day_name  total_orders  total_revenue  avg_order_value
                 0    Monday         15701     2530591.86           161.17
                 1   Tuesday         15503     2474065.60           159.59
                 2 Wednesday         15076     2396624.55           158.97
                 3  Thursday         14322     2284158.44           159.49
                 4    Friday         13685     2222878.71           162.43
                 5  Saturday         10555     1706107.53           161.64
                 6    Sunday         11635     1808035.08           155.40


Query 10: Payment Method Preferences

In [64]:
query10 = """
SELECT 
    payment_methods,
    COUNT(order_id) as total_orders,
    ROUND(SUM(total_payment_value), 2) as total_revenue,
    ROUND(AVG(total_payment_value), 2) as avg_order_value,
    ROUND(AVG(max_installments), 1) as avg_installments
FROM order_payments
GROUP BY payment_methods
ORDER BY total_orders DESC
LIMIT 10
"""

result10 = run_query(query10, "QUERY 10: Payment Method Analysis")


QUERY 10: Payment Method Analysis
        payment_methods  total_orders  total_revenue  avg_order_value  avg_installments
            credit_card         74259    12397278.59           166.95               3.5
                 boleto         19784     2869361.27           145.03               1.0
                voucher          1621      185422.28           114.39               1.0
             debit_card          1527      217939.79           142.72               1.0
   credit_card, voucher          1127      164566.17           146.02               2.2
   voucher, credit_card          1118      174151.20           155.77               2.1
            not_defined             3           0.00             0.00               1.0
credit_card, debit_card             1         152.82           152.82               3.0


SAVE QUERIES TO FILE

In [66]:
queries_sql = """-- E-COMMERCE ANALYSIS SQL QUERIES
-- Database: ecommerce.db
-- Date: January 5, 2026

-- QUERY 1: Monthly Revenue Trend
{q1}

-- QUERY 2: Top 10 Product Categories
{q2}

-- QUERY 3: Revenue by State
{q3}

-- QUERY 4: Delivery Performance by State
{q4}

-- QUERY 5: Late Delivery Hotspots
{q5}

-- QUERY 6: Freight Cost Analysis
{q6}

-- QUERY 7: Customer Segmentation
{q7}

-- QUERY 8: Top 20 Customers
{q8}

-- QUERY 9: Order Patterns by Day
{q9}

-- QUERY 10: Payment Methods
{q10}
""".format(
    q1=query1, q2=query2, q3=query3, q4=query4, q5=query5,
    q6=query6, q7=query7, q8=query8, q9=query9, q10=query10
)

with open('../sql/business_queries.sql', 'w') as f:
    f.write(queries_sql)

print("\nAll queries saved to: sql/business_queries.sql")

conn.close()


All queries saved to: sql/business_queries.sql


FINAL SUMMARY

In [65]:
print("\n" + "=" * 70)
print("DAY 5 COMPLETE")
print("=" * 70)

print("\nAccomplishments:")
print("  1. Wrote 10 business SQL queries")
print("  2. Covered revenue, operations, customers, behavior")
print("  3. Each query answers specific business question")
print("  4. Saved to sql/business_queries.sql")

print("\nSQL Skills Demonstrated:")
print("  - JOINs (2-3 table joins)")
print("  - Aggregations (SUM, COUNT, AVG)")
print("  - GROUP BY with multiple columns")
print("  - CASE statements for categorization")
print("  - Subqueries (Query 7)")
print("  - HAVING clause for post-aggregation filtering")

print("\nNext Steps (Day 6):")
print("  - Advanced SQL (CTEs, window functions)")
print("  - Running totals, rankings")
print("  - Cohort analysis")
print("  - 5 more complex queries")


DAY 5 COMPLETE

Accomplishments:
  1. Wrote 10 business SQL queries
  2. Covered revenue, operations, customers, behavior
  3. Each query answers specific business question
  4. Saved to sql/business_queries.sql

SQL Skills Demonstrated:
  - JOINs (2-3 table joins)
  - Aggregations (SUM, COUNT, AVG)
  - GROUP BY with multiple columns
  - CASE statements for categorization
  - Subqueries (Query 7)
  - HAVING clause for post-aggregation filtering

Next Steps (Day 6):
  - Advanced SQL (CTEs, window functions)
  - Running totals, rankings
  - Cohort analysis
  - 5 more complex queries
