In [3]:
# 1. Import Libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import warnings
from sklearn.preprocessing import MinMaxScaler
import os

# Silence all warnings raised from here on
warnings.filterwarnings(action='ignore')

# Use Seaborn's white-grid look for every chart
sns.set(style="whitegrid")

# Make sure the chart and report folders exist before anything is written to them
for _out_dir in ("../outputs/charts", "../outputs/reports"):
    os.makedirs(_out_dir, exist_ok=True)

# Read the raw sales CSV into the working DataFrame
_csv_path = '../data/AusApparalSales4thQrt2020.csv'
df = pd.read_csv(_csv_path)

# Preview (a notebook only renders this when it is the last expression of a cell)
df.head()

# 2. Data Wrangling

# Report the number of missing values per column (headers shown as they appear in the CSV)
print(df.isna().sum())

# Standardize column names FIRST (trim, lower-case, spaces -> underscores) so that
# every later lookup uses the cleaned names and does not depend on the exact
# spelling/padding of the CSV headers.
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_', regex=False)

# Fill missing values in the numeric columns with that column's median
for _col in ('sales', 'unit'):
    df[_col] = df[_col].fillna(df[_col].median())

# Min-max normalize sales & unit into two new columns; the raw columns are kept
scaler = MinMaxScaler()
df[['sales_normalized', 'unit_normalized']] = scaler.fit_transform(df[['sales', 'unit']])

# Preview (a notebook only renders this when it is the last expression of a cell)
df.head()

# 3. GroupBy and Recommendations

# Total sales per state, largest first; printed and saved as a CSV report
state_sales = (
    df['sales']
    .groupby(df['state'])
    .sum()
    .sort_values(ascending=False)
)
print(state_sales)
state_sales.to_csv("../outputs/reports/state_sales.csv")

# Total sales per demographic group, largest first; printed and saved as a CSV report
group_sales = (
    df['sales']
    .groupby(df['group'])
    .sum()
    .sort_values(ascending=False)
)
print(group_sales)
group_sales.to_csv("../outputs/reports/group_sales.csv")

# 4. Descriptive Statistical Analysis

# Summary table (count, mean, std, min, quartiles, max) for sales and unit
desc_stats = df[['sales', 'unit']].describe()
desc_stats.to_csv("../outputs/reports/descriptive_stats.csv")

# Plain-text summary: median, first mode and standard deviation of both columns
with open("../outputs/reports/stat_summary.txt", "w") as f:
    _numeric = df[['sales', 'unit']]
    f.writelines([
        "Median:\n",
        str(_numeric.median()),
        "\n\nMode:\n",
        # mode() can return several rows; only the first one is reported
        str(_numeric.mode().iloc[0]),
        "\n\nStandard Deviation:\n",
        str(_numeric.std()),
    ])

# 5. Sales Rankings

# Labels of the best and worst performers in the per-state and per-group totals
max_state, min_state = state_sales.idxmax(), state_sales.idxmin()
max_group, min_group = group_sales.idxmax(), group_sales.idxmin()

# Totals belonging to the best and worst state (the group lines report labels only)
_top_state_total = state_sales[max_state]
_bottom_state_total = state_sales[min_state]

with open("../outputs/reports/rankings.txt", "w") as f:
    f.writelines([
        f"Highest sales: {max_state} | {_top_state_total}\n",
        f"Lowest sales: {min_state} | {_bottom_state_total}\n",
        f"Highest group sales: {max_group}\n",
        f"Lowest group sales: {min_group}\n",
    ])

# 6. Time-Based Reports

# Parse the date column; values that cannot be parsed become NaT (errors='coerce')
# and the corresponding rows are dropped once the column has become the index.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df.set_index('date', inplace=True)
df = df[~df.index.isna()]

# Weekly totals
weekly_sales = df['sales'].resample('W').sum()
weekly_sales.to_csv("../outputs/reports/weekly_sales.csv")

# Monthly totals (month-end bins).
# An offset object is used instead of the string alias 'M', which is deprecated
# since pandas 2.2; the object form yields the same bins on old and new pandas.
monthly_sales = df['sales'].resample(pd.offsets.MonthEnd()).sum()
monthly_sales.to_csv("../outputs/reports/monthly_sales.csv")

# Quarterly totals (quarters ending in Mar/Jun/Sep/Dec, i.e. the old 'Q' alias).
# Same reasoning as above: avoids the deprecated string alias.
quarterly_sales = df['sales'].resample(pd.offsets.QuarterEnd(startingMonth=12)).sum()
quarterly_sales.to_csv("../outputs/reports/quarterly_sales.csv")

# 7. Visualization Dashboard (Seaborn/Matplotlib)

# State-wise Demographic Sales: bars per state, split by demographic group
plt.figure(figsize=(12,6))
sns.barplot(data=df, x='state', y='sales', hue='group')
plt.title("State-wise Sales by Demographic")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("../outputs/charts/state_demographic_sales.png")
plt.close()

# Group-wise Sales across States: distribution of sales per demographic group
plt.figure(figsize=(12,6))
sns.boxplot(data=df, x='group', y='sales')
plt.title("Sales Distribution by Group")
plt.tight_layout()
plt.savefig("../outputs/charts/group_sales_boxplot.png")
plt.close()

# Time-of-Day Analysis
# The index is built from the `date` column. If those values carry no clock time,
# every row gets hour 0 and an hourly histogram says nothing. In that case fall
# back to counting rows per value of the separate `time` column.
# NOTE(review): assumes `time` holds a time-of-day label per row - confirm.
df['hour'] = df.index.hour
plt.figure(figsize=(12,6))
if df['hour'].nunique() > 1 or 'time' not in df.columns:
    # The index has real hour-of-day variation (or there is nothing to fall back to)
    sns.histplot(df, x='hour', bins=24, kde=True)
    plt.title("Sales Frequency by Hour of Day")
    plt.xlabel("Hour of Day")
else:
    # Strip padding so that e.g. " Morning" and "Morning" land in the same bar;
    # a plain array is passed so the duplicated datetime index plays no role.
    _time_of_day = df['time'].astype(str).str.strip().to_numpy()
    sns.countplot(x=_time_of_day)
    plt.title("Sales Frequency by Time of Day")
    plt.xlabel("Time of Day")
plt.ylabel("Number of Sales")
plt.tight_layout()
plt.savefig("../outputs/charts/sales_by_hour.png")
plt.close()


Date     0
Time     0
State    0
Group    0
Unit     0
Sales    0
dtype: int64
state
VIC    105565000
NSW     74970000
SA      58857500
QLD     33417500
TAS     22760000
NT      22580000
WA      22152500
Name: sales, dtype: int64
group
Men        85750000
Women      85442500
Kids       85072500
Seniors    84037500
Name: sales, dtype: int64
