In [9]:
import pandas as pd

df_final = pd.read_csv('../data/processed/cleaned_sales.csv')

def integrity_audit(df):
    print("--- Data Integrity Audit ---")
    # 1. Check for Nulls
    null_counts = df.isnull().sum()
    print(f"Nulls found:\n{null_counts[null_counts > 0] if null_counts.sum() > 0 else 'None'}\n")
    
    # 2. Check for Duplicates
    dupes = df.duplicated().sum()
    print(f"Duplicate rows: {dupes}")
    
    # 3. Check for Negative Values (Should be 0 after cleaning)
    neg_sales = (df['sales'] < 0).sum()
    print(f"Negative sales records: {neg_sales}")
    
    # 4. Logical Range Check
    print(f"Date Range: {df['order_date'].min()} to {df['order_date'].max()}")
    print(f"Total Records: {len(df)}")

integrity_audit(df_final)

  df_final = pd.read_csv('../data/processed/cleaned_sales.csv')


--- Data Integrity Audit ---
Nulls found:
None

Duplicate rows: 5195
Negative sales records: 0
Date Range: 2010-12-01 08:26:00 to 2018-12-30
Total Records: 407684


In [10]:
import pandas as pd

df = pd.read_csv('../data/processed/cleaned_sales.csv')
df['order_date'] = pd.to_datetime(df['order_date'], format='mixed')

df_pbix = df[df['order_date'] >= '2015-01-01'].copy()

df_pbix.to_csv('../data/processed/cleaned_sales_final.csv', index=False)

monthly_final = df_pbix.set_index('order_date').resample('MS')['sales'].sum().reset_index()
monthly_final.to_csv('../data/processed/monthly_sales_final.csv', index=False)

print(f"Final Export Successful.")
print(f"New Date Range: {df_pbix['order_date'].min()} to {df_pbix['order_date'].max()}")
print(f"Total Records for Power BI: {len(df_pbix)}")

  df = pd.read_csv('../data/processed/cleaned_sales.csv')


Final Export Successful.
New Date Range: 2015-01-03 00:00:00 to 2018-12-30 00:00:00
Total Records for Power BI: 9800


In [11]:
import pandas as pd

df_m = pd.read_csv('../data/processed/monthly_sales.csv')

def forecast_logic_check(df):
    print("--- Forecasting Logic Check ---")
    
    # Standardize column names to lowercase to avoid KeyErrors
    df.columns = [col.lower() for col in df.columns]
    
    # If the aggregation created a column named '0' or something else, 
    # and 'sales' is missing, find the numeric column.
    if 'sales' not in df.columns:
        numeric_cols = df.select_dtypes(include=['number']).columns
        if len(numeric_cols) > 0:
            df = df.rename(columns={numeric_cols[0]: 'sales'})
        else:
            print("❌ Error: No numeric sales data found in the file.")
            return

    # Calculate MoM Growth
    df['mom_growth'] = df['sales'].pct_change() * 100
    avg_growth = df['mom_growth'].mean()
    
    print(f"Average Month-over-Month Growth: {avg_growth:.2f}%")
    
    if abs(avg_growth) > 50:
        print("⚠️ Risk: Growth rate seems extreme. This often happens if the first/last month is partial.")
    else:
        print("✅ Growth rate is within realistic bounds.")

forecast_logic_check(df_m)

--- Forecasting Logic Check ---
Average Month-over-Month Growth: 30.84%
✅ Growth rate is within realistic bounds.


In [12]:
import os
required_files = [
    '../data/processed/cleaned_sales.csv',
    '../data/processed/daily_sales.csv',
    '../data/processed/monthly_sales.csv'
]

for f in required_files:
    status = "EXISTS" if os.path.exists(f) else "MISSING"
    print(f"{f}: {status}")

../data/processed/cleaned_sales.csv: EXISTS
../data/processed/daily_sales.csv: EXISTS
../data/processed/monthly_sales.csv: EXISTS
