# 📊 Amazon Sales Data Analysis & Insights

## Business Intelligence Dashboard
**Objective:** Analyze processed data from the Dagster pipeline to extract business insights and create visualizations.

**Key Questions to Answer:**
- Which month was most profitable?
- What are the daily order patterns by status?
- Which product categories perform best?
- What geographical trends exist in sales?

**Data Source:** DuckDB tables created by Dagster pipeline:
- `raw_amazon_sales` - Clean sales data
- `monthly_revenue` - Monthly revenue by category  
- `daily_orders` - Daily order counts by status

**Note:** This notebook focuses on data analysis and visualization. The data pipeline is handled separately by Dagster Python modules.

## 📦 Import Required Libraries

In [1]:
# Data analysis and visualization libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Database connection
import duckdb

# Utility libraries
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: apply the matplotlib style sheet first, then pick the
# seaborn palette (order kept as in the original cell).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirmation banner printed once the imports and styling have run.
for banner_line in (
    "‚úÖ Analysis libraries imported successfully",
    "üìä Ready for data analysis and visualization",
):
    print(banner_line)

ModuleNotFoundError: No module named 'plotly'

## 🔌 Connect to DuckDB Database

In [None]:
# Connect to the DuckDB database created by the Dagster pipeline and list
# every table together with its row count.
db_path = "../data/sales.duckdb"

try:
    # Open read-only: this notebook only runs SELECTs. A default (read-write)
    # connect would create a brand-new empty database file when the pipeline
    # has not been executed yet, which hides the hint printed below, and it
    # would also take the write lock the pipeline itself needs.
    conn = duckdb.connect(db_path, read_only=True)
    print(f"‚úÖ Connected to DuckDB: {db_path}")
    
    # Check available tables
    tables = conn.execute("SHOW TABLES").fetchall()
    print(f"üìä Available tables: {len(tables)}")
    for table in tables:
        table_name = table[0]
        # Quote the identifier so table names containing spaces, quotes or
        # reserved words do not break the COUNT query.
        quoted_name = '"' + table_name.replace('"', '""') + '"'
        count = conn.execute(f"SELECT COUNT(*) FROM {quoted_name}").fetchone()[0]
        print(f"  ‚Ä¢ {table_name}: {count:,} records")
        
except Exception as e:
    # Best-effort notebook cell: report the problem and the most likely cause
    # instead of raising, exactly as before.
    print(f"‚ùå Error connecting to database: {e}")
    print("üí° Make sure the Dagster pipeline has been executed first!")

## 📈 Monthly Revenue Analysis

In [None]:
# Pull every (month, category) row from the monthly_revenue table.
monthly_revenue_query = """
SELECT 
    year_month,
    category,
    total_revenue,
    order_count,
    avg_order_value
FROM monthly_revenue
ORDER BY year_month, total_revenue DESC
"""

df_monthly = pd.read_sql(monthly_revenue_query, conn)
print(f"üìä Loaded {len(df_monthly)} monthly revenue records")

# Preview the first ten rows.
print("\nüîç Sample Monthly Revenue Data:")
print(df_monthly.head(10))

# Collapse the category dimension: one revenue sum and one order sum per month.
monthly_totals = df_monthly.groupby('year_month', as_index=False)[
    ['total_revenue', 'order_count']
].sum()

print("\nüí∞ Monthly Revenue Totals:")
for month_row in monthly_totals.itertuples(index=False):
    print(f"  ‚Ä¢ {month_row.year_month}: ${month_row.total_revenue:,.0f} ({month_row.order_count:,} orders)")

In [None]:
# Identify the month with the highest summed revenue in monthly_totals.
peak_row_label = monthly_totals['total_revenue'].idxmax()
most_profitable_month = monthly_totals.loc[peak_row_label]

# Pull the fields out once; they are reused by the report and the chart.
peak_month = most_profitable_month['year_month']
peak_revenue = most_profitable_month['total_revenue']
peak_orders = most_profitable_month['order_count']

print("üèÜ MOST PROFITABLE MONTH ANALYSIS:")
print("=" * 50)
print(f"üìÖ Month: {peak_month}")
print(f"üí∞ Total Revenue: ${peak_revenue:,.0f}")
print(f"üì¶ Total Orders: {peak_orders:,}")
print(f"üíµ Average Order Value: ${peak_revenue/peak_orders:,.0f}")

# Bar chart of revenue per month, shaded by revenue.
fig = px.bar(
    monthly_totals,
    x='year_month',
    y='total_revenue',
    title='üìà Monthly Revenue Comparison',
    labels={'total_revenue': 'Total Revenue ($)', 'year_month': 'Month'},
    color='total_revenue',
    color_continuous_scale='Blues',
)

layout_options = {
    'showlegend': False,
    'xaxis_title': "Month",
    'yaxis_title': "Total Revenue ($)",
    'height': 400,
}
fig.update_layout(**layout_options)

# Call out the peak month directly on its bar.
peak_callout = {
    'x': peak_month,
    'y': peak_revenue,
    'text': f"Most Profitable<br>${peak_revenue:,.0f}",
    'showarrow': True,
    'arrowhead': 2,
    'arrowcolor': "red",
    'bgcolor': "yellow",
    'bordercolor': "red",
}
fig.add_annotation(**peak_callout)

fig.show()

## 📦 Daily Orders Analysis

In [None]:
# Pull every (date, status) row from the daily_orders table.
daily_orders_query = """
SELECT 
    order_date,
    status,
    order_count,
    total_quantity,
    total_amount
FROM daily_orders
ORDER BY order_date, status
"""

df_daily = pd.read_sql(daily_orders_query, conn)
# Normalise the date column to pandas datetimes for min/max and plotting.
df_daily['order_date'] = pd.to_datetime(df_daily['order_date'])

first_day = df_daily['order_date'].min()
last_day = df_daily['order_date'].max()
print(f"üìä Loaded {len(df_daily)} daily order records")
print(f"üìÖ Date range: {first_day} to {last_day}")

# Totals per order status across the whole date range.
status_summary = df_daily.groupby('status', as_index=False)[
    ['order_count', 'total_quantity', 'total_amount']
].sum()

print("\nüìã Orders Summary by Status:")
for status_row in status_summary.itertuples(index=False):
    print(f"  ‚Ä¢ {status_row.status}: {status_row.order_count:,} orders, ${status_row.total_amount:,.0f} total")

In [None]:
# Two side-by-side panels: per-status daily trend (left) and the overall
# share of orders per status (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: one line per status value present in df_daily.
for status in df_daily['status'].unique():
    status_data = df_daily.loc[df_daily['status'] == status]
    ax1.plot(
        status_data['order_date'],
        status_data['order_count'],
        marker='o',
        label=status,
        linewidth=2,
        markersize=4,
    )

ax1.set_title('üì¶ Daily Orders by Status Over Time', fontsize=14, fontweight='bold')
ax1.set_xlabel('Date')
ax1.set_ylabel('Number of Orders')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Right panel: pie of total order_count per status from status_summary.
pie_data = status_summary.set_index('status')['order_count']
colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99', '#ff99cc']
ax2.pie(
    pie_data.values,
    labels=pie_data.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=colors[:len(pie_data)],
)
ax2.set_title('üìä Order Distribution by Status', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Headline statistics; the per-day order total (all statuses combined) is
# computed once and reused for both the mean and the peak day.
orders_per_day = df_daily.groupby('order_date')['order_count'].sum()
print("\nüìà DAILY ORDERS INSIGHTS:")
print(f"‚Ä¢ Average orders per day: {orders_per_day.mean():.0f}")
print(f"‚Ä¢ Peak order day: {orders_per_day.idxmax()}")
print(f"‚Ä¢ Total unique days: {df_daily['order_date'].nunique()}")

## 🎯 Product Category Performance

In [None]:
# Analyze category performance: revenue and order totals per category across
# all months in df_monthly, ranked by total revenue (highest first).
category_performance = df_monthly.groupby('category')[
    ['total_revenue', 'order_count']
].sum()

# Average order value is derived from the totals (revenue / orders). Taking the
# plain mean of the per-month averages would weight a month with a handful of
# orders the same as a month with thousands and misstate the true average.
# Categories with zero orders get NaN instead of a division error.
category_performance['avg_order_value'] = (
    category_performance['total_revenue']
    / category_performance['order_count'].where(category_performance['order_count'] > 0)
)
category_performance = category_performance.round(2).sort_values(
    'total_revenue', ascending=False
)

print("üèÜ TOP PERFORMING CATEGORIES:")
print("=" * 40)
for category, row in category_performance.head(10).iterrows():
    print(f"üì¶ {category}:")
    print(f"   üí∞ Revenue: ${row['total_revenue']:,.0f}")
    # iterrows() upcasts an all-numeric row to float, so cast the count back
    # to int to avoid printing e.g. "1,234.0".
    print(f"   üìä Orders: {int(row['order_count']):,}")
    print(f"   üíµ Avg Order: ${row['avg_order_value']:.0f}")
    print()

# Category revenue visualization (top 10 by revenue)
fig = px.bar(category_performance.head(10).reset_index(), 
             x='category', 
             y='total_revenue',
             title='üéØ Top 10 Categories by Revenue',
             labels={'total_revenue': 'Total Revenue ($)', 'category': 'Product Category'},
             color='total_revenue',
             color_continuous_scale='Viridis')

fig.update_xaxes(tickangle=45)
fig.update_layout(height=500, showlegend=False)
fig.show()

# Monthly category trends for the five highest-revenue categories
top_categories = category_performance.head(5).index.tolist()
monthly_category_trends = df_monthly[df_monthly['category'].isin(top_categories)]

fig = px.line(monthly_category_trends, 
              x='year_month', 
              y='total_revenue', 
              color='category',
              title='üìà Monthly Revenue Trends for Top 5 Categories',
              labels={'total_revenue': 'Total Revenue ($)', 'year_month': 'Month'})

fig.update_layout(height=400)
fig.show()

## 🗺️ Geographical Analysis

In [None]:
# Geographical analysis from raw data: one row per (state, city) with order
# count, revenue and average order value. Cancelled orders and rows with a
# non-positive amount are excluded by the query itself.
geo_query = """
SELECT 
    ship_state,
    ship_city,
    COUNT(*) as order_count,
    SUM(amount) as total_revenue,
    AVG(amount) as avg_order_value
FROM raw_amazon_sales 
WHERE status != 'Cancelled' AND amount > 0
GROUP BY ship_state, ship_city
ORDER BY total_revenue DESC
"""

df_geo = pd.read_sql(geo_query, conn)

# State-level analysis: sum the city rows per state.
state_analysis = df_geo.groupby('ship_state')[
    ['order_count', 'total_revenue']
].sum()

# State average order value = state revenue / state orders. Averaging the
# per-city averages would give a one-order town the same weight as a large
# city and distort the state figure.
state_analysis['avg_order_value'] = (
    state_analysis['total_revenue']
    / state_analysis['order_count'].where(state_analysis['order_count'] > 0)
)
state_analysis = state_analysis.round(2).sort_values('total_revenue', ascending=False)

print("üó∫Ô∏è TOP STATES BY REVENUE:")
print("=" * 35)
for state, row in state_analysis.head(10).iterrows():
    # Skip missing or blank state names.
    if pd.notna(state) and state.strip():
        print(f"üìç {state}:")
        print(f"   üí∞ Revenue: ${row['total_revenue']:,.0f}")
        # iterrows() upcasts an all-numeric row to float, so cast the count
        # back to int to avoid printing e.g. "1,234.0".
        print(f"   üìä Orders: {int(row['order_count']):,}")
        print(f"   üíµ Avg Order: ${row['avg_order_value']:.0f}")
        print()

# Top cities analysis: df_geo is already ordered by revenue, so the first
# 15 rows are the highest-revenue (state, city) pairs.
top_cities = df_geo.head(15)
print("üèôÔ∏è TOP CITIES BY REVENUE:")
print("=" * 30)
for _, row in top_cities.iterrows():
    if pd.notna(row['ship_city']) and row['ship_city'].strip():
        city_state = f"{row['ship_city']}, {row['ship_state']}" if pd.notna(row['ship_state']) else row['ship_city']
        print(f"üè¢ {city_state}: ${row['total_revenue']:,.0f} ({row['order_count']} orders)")

# Visualize top states
top_states = state_analysis.head(10).reset_index()
fig = px.bar(top_states, 
             x='ship_state', 
             y='total_revenue',
             title='üó∫Ô∏è Top 10 States by Revenue',
             labels={'total_revenue': 'Total Revenue ($)', 'ship_state': 'State'},
             color='total_revenue',
             color_continuous_scale='Blues')

fig.update_xaxes(tickangle=45)
fig.update_layout(height=500, showlegend=False)
fig.show()

## 📊 Business Intelligence Summary

In [None]:
# Generate comprehensive business intelligence summary from the frames built
# in the earlier cells (monthly_totals, df_daily, status_summary,
# category_performance, state_analysis, df_geo), then close the connection.
print("üìä AMAZON SALES BUSINESS INTELLIGENCE DASHBOARD")
print("=" * 55)

# Key Performance Indicators (from the monthly revenue totals)
total_revenue = monthly_totals['total_revenue'].sum()
total_orders = monthly_totals['order_count'].sum()
avg_order_value = total_revenue / total_orders if total_orders > 0 else 0

print(f"\nüí∞ FINANCIAL OVERVIEW:")
print(f"‚Ä¢ Total Revenue: ${total_revenue:,.0f}")
print(f"‚Ä¢ Total Orders: {total_orders:,}")
print(f"‚Ä¢ Average Order Value: ${avg_order_value:.0f}")
print(f"‚Ä¢ Revenue Growth Period: {df_daily['order_date'].min().strftime('%B %Y')} - {df_daily['order_date'].max().strftime('%B %Y')}")

print(f"\nüèÜ KEY FINDINGS:")
print(f"‚Ä¢ Most Profitable Month: {most_profitable_month['year_month']} (${most_profitable_month['total_revenue']:,.0f})")
print(f"‚Ä¢ Top Product Category: {category_performance.index[0]} (${category_performance.iloc[0]['total_revenue']:,.0f})")
print(f"‚Ä¢ Top State: {state_analysis.index[0]} (${state_analysis.iloc[0]['total_revenue']:,.0f})")
print(f"‚Ä¢ Average Daily Orders: {df_daily.groupby('order_date')['order_count'].sum().mean():.0f}")

# The status breakdown is taken relative to the status counts' own total.
# total_orders above comes from a different table (monthly_revenue), so using
# it as the denominator can yield shares that do not add up to 100%.
status_order_total = status_summary['order_count'].sum()
print(f"\nüìà ORDER STATUS BREAKDOWN:")
for _, row in status_summary.iterrows():
    percentage = (row['order_count'] / status_order_total) * 100 if status_order_total > 0 else 0
    print(f"‚Ä¢ {row['status']}: {row['order_count']:,} orders ({percentage:.1f}%)")

print(f"\nüéØ BUSINESS RECOMMENDATIONS:")
print(f"‚Ä¢ Focus marketing efforts in {most_profitable_month['year_month']} - highest conversion period")
print(f"‚Ä¢ Expand {category_performance.index[0]} product line - top revenue generator")
print(f"‚Ä¢ Prioritize logistics in {state_analysis.index[0]} - highest revenue state")
# Only mention cancellations when a 'Cancelled' status row exists; indexing
# the filtered Series with .iloc[0] raises IndexError when it is empty.
cancelled_counts = status_summary.loc[status_summary['status'] == 'Cancelled', 'order_count']
if not cancelled_counts.empty:
    print(f"‚Ä¢ Investigate cancelled orders ({cancelled_counts.iloc[0]:,} orders) for improvement opportunities")

print(f"\n‚úÖ DATA QUALITY METRICS:")
print(f"‚Ä¢ Data Coverage: {df_daily['order_date'].nunique()} unique days")
print(f"‚Ä¢ Total Records Analyzed: {total_orders:,}")
# df_geo has one row per (state, city) pair, which is what is counted as "cities".
print(f"‚Ä¢ Geographic Coverage: {state_analysis.shape[0]} states, {df_geo.shape[0]} cities")

# Close database connection
conn.close()
print(f"\nüîí Database connection closed")
print(f"üìã Analysis completed successfully!")