# Sales Data Processing

This notebook processes large sales CSV files and generates summary reports.

In [None]:
import pandas as pd
import numpy as np
import os
import sys

In [None]:
def process_sales_data(input_file, output_dir, *, profit_margin_rate=0.3, tax_rate=0.1):
    """Process a sales CSV file and write summary reports to ``output_dir``.

    The input must provide ``date``, ``category``, ``quantity`` and ``price``
    columns.  Processing steps:

    1. Every text (object-dtype) column, as loaded, is upper-cased in place
       and a lower-cased ``<column>_lower`` companion column is appended.
       This runs before date parsing, so it also applies to ``date`` when
       that column was read as text.
    2. ``date`` is parsed and ``year``, ``month``, ``day``, ``weekday`` and
       ``quarter`` columns are derived from it.
    3. Rows with a missing category are dropped; the remaining rows are
       grouped by category (categories in order of first appearance, original
       row order kept inside each category).
    4. ``revenue`` (quantity * price), ``profit_margin`` and ``tax`` are
       computed per row.

    Files written: ``daily_summary.csv``, ``monthly_summary.csv``,
    ``category_summary.csv``, ``pivot_daily.csv``, ``pivot_monthly.csv`` and
    one ``data_<year>.csv`` per distinct year.

    Args:
        input_file: Path of the CSV file to read.
        output_dir: Directory for the report files; created if missing.
        profit_margin_rate: Fraction of revenue stored as ``profit_margin``.
        tax_rate: Fraction of revenue stored as ``tax``.

    Returns:
        dict with ``total_rows``, ``total_revenue``, ``unique_categories``
        and ``date_range`` (a ``"<min> to <max>"`` string).

    Raises:
        KeyError: If a required column is missing from the input.
        ValueError: If the input contains no data rows.
    """
    print(f"Processing {input_file}...")

    df = pd.read_csv(input_file)
    print(f"Loaded {len(df)} rows")

    # Fail fast with a clear message instead of part-way through processing.
    required_columns = ('date', 'category', 'quantity', 'price')
    missing = [name for name in required_columns if name not in df.columns]
    if missing:
        raise KeyError(f"Input file is missing required column(s): {', '.join(missing)}")
    if df.empty:
        raise ValueError(f"No data rows found in {input_file}")

    # An empty output_dir means "current directory"; nothing to create then.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Lower-case variants are computed from the untouched columns up front, so
    # no full copy of the frame is needed; they end up in the frame anyway.
    text_columns = list(df.select_dtypes(include=['object']).columns)
    lowered = {col: df[col].str.lower() for col in text_columns}
    for col in text_columns:
        df[col] = df[col].str.upper()
        df[f'{col}_lower'] = lowered[col]
    del lowered

    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['weekday'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter

    # Group rows by category in a single pass.  factorize() numbers the
    # categories in order of first appearance and marks missing values with
    # -1; a stable sort on those codes keeps the original order inside each
    # category.
    codes, _ = pd.factorize(df['category'])
    labelled = np.flatnonzero(codes >= 0)
    row_order = labelled[np.argsort(codes[labelled], kind='stable')]
    all_categories = df.iloc[row_order].reset_index(drop=True)
    del df  # release the unordered frame before building the summaries

    all_categories['revenue'] = all_categories['quantity'] * all_categories['price']
    all_categories['profit_margin'] = all_categories['revenue'] * profit_margin_rate
    all_categories['tax'] = all_categories['revenue'] * tax_rate

    totals_spec = {
        'revenue': 'sum',
        'quantity': 'sum',
        'profit_margin': 'mean'
    }
    daily_summary = all_categories.groupby(['date', 'category']).agg(totals_spec).reset_index()
    monthly_summary = (
        all_categories.groupby(['year', 'month', 'category']).agg(totals_spec).reset_index()
    )

    category_summary = all_categories.groupby('category').agg({
        'revenue': ['sum', 'mean', 'std'],
        'quantity': ['sum', 'mean', 'std'],
        'profit_margin': ['mean', 'std']
    }).reset_index()

    pivot_daily = all_categories.pivot_table(
        values='revenue',
        index='date',
        columns='category',
        aggfunc='sum',
        fill_value=0
    )

    pivot_monthly = all_categories.pivot_table(
        values='revenue',
        index=['year', 'month'],
        columns='category',
        aggfunc='sum',
        fill_value=0
    )

    print("Writing output files...")

    # (file name, frame, write index?) - the pivots carry their keys in the index.
    reports = [
        ('daily_summary.csv', daily_summary, False),
        ('monthly_summary.csv', monthly_summary, False),
        ('category_summary.csv', category_summary, False),
        ('pivot_daily.csv', pivot_daily, True),
        ('pivot_monthly.csv', pivot_monthly, True),
    ]
    for file_name, frame, keep_index in reports:
        frame.to_csv(os.path.join(output_dir, file_name), index=keep_index)

    for year in all_categories['year'].unique():
        year_data = all_categories[all_categories['year'] == year]
        year_data.to_csv(os.path.join(output_dir, f'data_{year}.csv'), index=False)

    print(f"Processing complete. Output files saved to {output_dir}")

    return {
        'total_rows': len(all_categories),
        'total_revenue': all_categories['revenue'].sum(),
        'unique_categories': all_categories['category'].nunique(),
        'date_range': f"{all_categories['date'].min()} to {all_categories['date'].max()}"
    }

In [None]:
def generate_sample_data(output_file, num_rows=1000000, *, seed=None):
    """Generate random sample sales data for testing and save it as CSV.

    Each row has ``order_id`` (1..num_rows), ``date`` (drawn from evenly
    spaced timestamps between 2020-01-01 and 2023-12-31), ``category``,
    ``product_name``, ``quantity`` (1-99), ``price`` (10-1000, two decimals),
    ``customer_id`` (1000-49999), ``region`` and a long ``description``.

    Args:
        output_file: Destination CSV path; a missing parent directory is
            created.
        num_rows: Number of rows to generate.
        seed: Optional seed for reproducible output.  When omitted, NumPy's
            global random generator is used.
    """
    print(f"Generating {num_rows} rows of sample data...")

    # A private RandomState makes seeded runs reproducible without touching
    # NumPy's global random state; unseeded runs draw from the global one.
    rng = np.random if seed is None else np.random.RandomState(seed)

    categories = ['Electronics', 'Clothing', 'Food', 'Books', 'Sports', 'Home', 'Toys', 'Beauty']
    regions = ['North', 'South', 'East', 'West']

    dates = pd.date_range(start='2020-01-01', end='2023-12-31', periods=num_rows)

    # Only the trailing phrase is repeated five times ('*' binds tighter than
    # '+'); it is the same for every row, so build it once.
    description_suffix = ' with lots of text that takes up memory' * 5

    data = {
        'order_id': range(1, num_rows + 1),
        'date': rng.choice(dates, num_rows),
        'category': rng.choice(categories, num_rows),
        'product_name': [f'Product_{i}' for i in range(num_rows)],
        'quantity': rng.randint(1, 100, num_rows),
        'price': rng.uniform(10, 1000, num_rows).round(2),
        'customer_id': rng.randint(1000, 50000, num_rows),
        'region': rng.choice(regions, num_rows),
        'description': [
            f'This is a long product description for item {i}{description_suffix}'
            for i in range(num_rows)
        ],
    }

    df = pd.DataFrame(data)

    # Only real paths have a parent directory to create; buffers are left alone.
    if isinstance(output_file, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(output_file))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(output_file, index=False)
    print(f"Sample data saved to {output_file}")

In [None]:
# Main execution
# Make sure the working directories exist.  exist_ok=True avoids the race
# between checking for a directory and creating it.
for directory in ('data', 'output'):
    os.makedirs(directory, exist_ok=True)

# You can run process_sales_data here
# results = process_sales_data('data/sales_data.csv', 'output')
# print(f"Results: {results}")