# QuickBooks Commerce - Sales Forecasting Data Exploration

This notebook explores the sales data and prepares it for model training.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# Silence all warnings for the rest of the notebook session
warnings.simplefilter('ignore')

# Plot defaults: seaborn white-grid theme, 12x6 inch figures
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Data

For this demo, we'll use synthetic data that simulates realistic sales patterns.
In production, you would load from:
- Kaggle dataset (e.g., Retail Sales Dataset)
- QuickBooks Commerce API
- Data warehouse

In [None]:
# Generate synthetic sales data
def generate_sales_data(days=730, seed=None, end_date=None):
    """Generate synthetic daily sales records for a fixed set of categories.

    Every (day, category) row is the sum of a linear upward trend, a yearly
    sine-wave seasonality, a flat weekend boost and Gaussian noise, floored
    at zero. Revenue is the (untruncated) unit count multiplied by a random
    unit price drawn uniformly from [30, 80).

    Note that all categories are drawn from the same distribution; they
    differ only through random noise.

    Args:
        days: Length of the history in days. The date range is inclusive on
            both ends, so ``days + 1`` calendar days are generated.
        seed: Optional integer seed. When given, a private
            ``np.random.RandomState(seed)`` is used so the output is
            reproducible. When ``None`` (default) the global ``np.random``
            state is used, as in earlier versions of this function.
        end_date: Optional last day of the range (anything accepted by
            ``pd.Timestamp``). Defaults to the current day.

    Returns:
        pd.DataFrame with columns ``date``, ``category``, ``sales_units``
        (int) and ``revenue`` (rounded to 2 decimals), ordered by date and
        then by category. The columns are present even if the range is empty.
    """
    categories = [
        'Electronics', 'Clothing & Apparel', 'Home & Garden',
        'Sports & Outdoors', 'Books & Media', 'Food & Beverages',
        'Health & Beauty', 'Toys & Games'
    ]
    columns = ['date', 'category', 'sales_units', 'revenue']

    # The np.random module and RandomState expose the same normal()/uniform()
    # API, so the draw sequence is the same whichever one is used.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Snap to midnight: without this, datetime.now() leaks the time of day
    # (down to microseconds) into every value of the 'date' column.
    last_day = pd.Timestamp(datetime.now() if end_date is None else end_date).normalize()
    first_day = last_day - timedelta(days=days)

    records = []
    for date in pd.date_range(start=first_day, end=last_day, freq='D'):
        # Everything except the noise depends only on the date, so compute
        # it once per day instead of once per (day, category) pair.
        days_elapsed = (date - first_day).days
        base_sales = 800 + days_elapsed * 0.5
        seasonality = 200 * np.sin(2 * np.pi * date.dayofyear / 365)
        weekend_boost = 150 if date.weekday() >= 5 else 0
        expected_sales = base_sales + seasonality + weekend_boost

        for category in categories:
            noise = rng.normal(0, 80)
            sales = max(0, expected_sales + noise)

            records.append({
                'date': date,
                'category': category,
                'sales_units': int(sales),
                'revenue': round(sales * rng.uniform(30, 80), 2)
            })

    # Explicit columns keep the schema stable even when no rows were generated
    return pd.DataFrame(records, columns=columns)

# Seed NumPy's global RNG so the random draws behind the synthetic dataset
# are the same on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

df = generate_sales_data()
print(f"Dataset shape: {df.shape}")
df.head()

## 2. Exploratory Data Analysis

In [None]:
# Dataset overview: covered time span, number of categories, row count
print(
    "\nDataset Info:",
    f"Date range: {df['date'].min()} to {df['date'].max()}",
    f"Categories: {df['category'].nunique()}",
    f"Total records: {len(df)}",
    sep="\n",
)

# Per-category summary: distribution of unit sales plus total revenue
print("\nSales Statistics by Category:")
summary_spec = {
    'sales_units': ['mean', 'std', 'min', 'max'],
    'revenue': 'sum'
}
df.groupby('category').agg(summary_spec).round(2)

In [None]:
# Sales trend over time: one line per category showing daily unit totals
fig, ax = plt.subplots(figsize=(14, 6))

for name in df['category'].unique():
    is_current = df['category'] == name
    daily_units = df.loc[is_current].groupby('date')['sales_units'].sum()
    ax.plot(daily_units.index, daily_units.values, label=name, alpha=0.7)

ax.set_title('Sales Trend by Category Over Time', fontsize=14, fontweight='bold')
ax.set_xlabel('Date')
ax.set_ylabel('Sales Units')
ax.legend(loc='best')
fig.tight_layout()
plt.show()

In [None]:
# Category performance comparison: total units (left) vs. total revenue (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Aggregate both measures per category, then rank each one ascending so the
# largest bar ends up at the top of its horizontal bar chart
category_sales = df.groupby('category')['sales_units'].sum().sort_values(ascending=True)
category_revenue = df.groupby('category')['revenue'].sum().sort_values(ascending=True)

panels = [
    (category_sales, 'steelblue', 'Total Sales by Category', 'Sales Units'),
    (category_revenue, 'coral', 'Total Revenue by Category', 'Revenue ($)'),
]
for ax, (totals, color, title, xlabel) in zip(axes, panels):
    totals.plot(kind='barh', ax=ax, color=color)
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel(xlabel)

fig.tight_layout()
plt.show()

In [None]:
# Seasonality analysis
# NOTE: these two helper columns are added to `df` in place; re-running the
# cell simply overwrites them with the same values.
df['month'] = df['date'].dt.month
df['day_of_week'] = df['date'].dt.dayofweek

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Monthly pattern: mean units per (day, category) row, grouped by calendar month
monthly_sales = df.groupby('month')['sales_units'].mean()
axes[0].bar(monthly_sales.index, monthly_sales.values, color='teal')
axes[0].set_title('Average Daily Sales by Month', fontweight='bold')
axes[0].set_xlabel('Month')
axes[0].set_ylabel('Average Sales Units')
axes[0].set_xticks(range(1, 13))

# Weekly pattern: mean units grouped by weekday (0 = Monday ... 6 = Sunday)
weekly_sales = df.groupby('day_of_week')['sales_units'].mean()
day_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
# Position bars by the weekday index rather than a hard-coded range(7):
# each bar then sits under its own label, and the call no longer fails with
# a shape mismatch if some weekday is absent from the data.
axes[1].bar(weekly_sales.index, weekly_sales.values, color='purple')
axes[1].set_title('Average Sales by Day of Week', fontweight='bold')
axes[1].set_xlabel('Day of Week')
axes[1].set_ylabel('Average Sales Units')
axes[1].set_xticks(range(7))
axes[1].set_xticklabels(day_labels)

plt.tight_layout()
plt.show()

## 3. Feature Engineering

In [None]:
def create_features(df, rolling_shift=1):
    """Add calendar, lag and rolling-window features for sales forecasting.

    The input frame is not modified; a copy with extra columns is returned
    in the same row order and with the same index as the input.

    Lag and rolling features are computed per category in chronological
    order, regardless of how the input rows are ordered.

    Args:
        df: DataFrame with a datetime ``date`` column, a ``category`` column
            and a numeric ``sales_units`` column.
        rolling_shift: Number of rows (days, for daily data) the series is
            shifted back before the rolling statistics are taken. The
            default of 1 means the window ends at the previous day, so the
            current row's ``sales_units`` (the forecasting target) does not
            leak into its own features. Pass 0 to include the current row.

    Returns:
        Copy of ``df`` with these additional columns:
        ``year``, ``quarter``, ``month``, ``day``, ``day_of_week``,
        ``week_of_year``, ``is_weekend``, ``sales_lag_{7,14,30}``,
        ``sales_rolling_mean_{7,30}`` and ``sales_rolling_std_{7,30}``.
        Lag/rolling values are NaN where not enough history exists.
    """
    df = df.copy()

    # Work on a unique positional index so results computed on the sorted
    # view can be aligned back by label; the caller's index is restored below.
    original_index = df.index
    df.index = pd.RangeIndex(len(df))

    # Time features
    when = df['date'].dt
    df['year'] = when.year
    df['quarter'] = when.quarter
    df['month'] = when.month
    df['day'] = when.day
    df['day_of_week'] = when.dayofweek
    df['week_of_year'] = when.isocalendar().week
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

    # shift()/rolling() operate on row position within each group, so the
    # rows must be in date order per category for "lag 7" to mean "7 rows
    # (days) earlier". Sort a view instead of trusting the input order.
    ordered = df.sort_values(['category', 'date'])
    sales_by_category = ordered.groupby('category')['sales_units']

    # Lag features (assignment aligns on the index, restoring row order)
    for lag in [7, 14, 30]:
        df[f'sales_lag_{lag}'] = sales_by_category.shift(lag)

    # Rolling features over past values only (see `rolling_shift`)
    for window in [7, 30]:
        df[f'sales_rolling_mean_{window}'] = sales_by_category.transform(
            lambda s: s.shift(rolling_shift).rolling(window=window, min_periods=1).mean()
        )
        df[f'sales_rolling_std_{window}'] = sales_by_category.transform(
            lambda s: s.shift(rolling_shift).rolling(window=window, min_periods=1).std()
        )

    df.index = original_index
    return df

# Build the model-ready frame and list the columns it now contains
df_features = create_features(df)

print("\nFeatures created:")
print(list(df_features.columns))
df_features.head()

## 4. Save Processed Data

In [None]:
# Persist the engineered features for the model-training step
import os

features_path = '../data/processed/sales_features.csv'

# Create the target directory (and any missing parents) if needed
os.makedirs(os.path.dirname(features_path), exist_ok=True)

df_features.to_csv(features_path, index=False)
print(f"Data saved to: {features_path}")

## 5. Key Insights

1. **Seasonality**: Clear weekly and yearly patterns in sales
2. **Weekend Effect**: Sales increase on weekends
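3. **Category Differences**: None by construction — the synthetic generator draws every category from the same distribution, so any apparent ranking (e.g., Electronics or Clothing on top) is random noise; real category differences will only show up with production data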
4. **Growth Trend**: Overall positive trend in sales over time

Next steps:
- Train forecasting models (XGBoost, Prophet)
- Incorporate external data (economic indicators)
- Validate model performance