# Brewed Insights: Coffee Sales Analysis

### 1. Install and import the relevant libraries

In [None]:
import sqlite3
import pandas as pd
import os
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import numpy as np

### 2. Load data

In [None]:
# Locate the dataset relative to the parent of the current working directory
# (the notebook is expected to run from the notebooks folder)
base_path = os.path.split(os.getcwd())[0]
data_path = os.path.join(base_path, 'data', 'coffee_sales.csv')

# Read the raw transactions into a Pandas DataFrame
df = pd.read_csv(filepath_or_buffer=data_path)

### 3. Handle Cross-Year Data (Mar 2024 - Mar 2025)
- Handle the non-standard fiscal year (starting in March) and treat the data as a single continuous time series

In [None]:
# Parse the Date and Time columns
df['Date'] = pd.to_datetime(df['Date'])
# Time values are inconsistent (some have milliseconds, some don't), so let
# pandas infer the format per element instead of enforcing a single one
df['DateTime'] = pd.to_datetime(
    df['Date'].astype(str) + ' ' + df['Time'].astype(str),
    format='mixed',
)

# Calendar components shared by the derived columns below
date_year = df['Date'].dt.year
date_month = df['Date'].dt.month

# Extract Year for calculations
df['Year'] = date_year

# Sequential month number that keeps increasing across the year boundary,
# normalized so that the earliest month in the data is 1
month_index = (date_year - date_year.min()) * 12 + date_month
df['month_sequence'] = month_index - month_index.min() + 1

# Display labels for months (e.g., "Mar 2024", "Apr 2024")
df['year_month_display'] = df['Date'].dt.strftime('%b %Y')

# Numeric YYYYMM key for chronological ordering (the display labels themselves
# only sort alphabetically)
df['year_month_sort'] = date_year * 100 + date_month

# Fiscal year fields (March = fiscal month 1)
FISCAL_START_MONTH = 3  # March
df['fiscal_month'] = ((date_month - FISCAL_START_MONTH) % 12) + 1
# Months before the fiscal start (Jan/Feb) belong to the fiscal year that
# began in the previous calendar year
df['fiscal_year'] = date_year.where(date_month >= FISCAL_START_MONTH, date_year - 1)

# Fiscal period label for display, e.g. "FY2024-M01"
df['fiscal_period'] = 'FY' + df['fiscal_year'].astype(str) + '-M' + df['fiscal_month'].astype(str).str.zfill(2)

# Report the span from the actual dates: min()/max() on the "Mon YYYY" labels
# compares strings alphabetically and would print e.g. "Apr 2024 to Sep 2024"
span_start = df['Date'].min()
span_end = df['Date'].max()

print("✓ Cross-year data handling complete")
print(f"Data spans: {span_start.strftime('%b %Y')} to {span_end.strftime('%b %Y')}")
print(f"Total unique months: {df['month_sequence'].nunique()}")
print(f"Date range: {span_start.date()} to {span_end.date()}")

### 4. Create a temporary database using SQLite and insert the table

In [None]:
# Connect to the SQLite database file used for the SQL-based analysis
conn = sqlite3.connect('coffee.db')

# Load the DataFrame into the coffee_sales table, replacing any existing copy
df.to_sql(name='coffee_sales', con=conn, if_exists='replace', index=False)

### 5. Create a helper function that will be reused later

In [None]:
def run_query(query, conn):
    """Execute a SQL query on ``conn`` and return the result set as a DataFrame.

    Args:
        query: SQL text to execute.
        conn: Open database connection to run the query against.

    Returns:
        pandas.DataFrame with one column per selected field.
    """
    result_frame = pd.read_sql_query(sql=query, con=conn)
    return result_frame

### 6. Exploratory Data Analysis (EDA)

#### 6.1 Top-Selling Coffee Products

In [None]:
# Ten products with the highest revenue, plus sale counts and average price
top_sellers_sql = """
SELECT coffee_name, 
       COUNT(*) AS total_sales, 
       ROUND(SUM(money), 2) AS total_revenue,
       ROUND(AVG(money), 2) AS avg_price
FROM coffee_sales
GROUP BY coffee_name
ORDER BY total_revenue DESC
LIMIT 10;
"""
top_sellers = run_query(top_sellers_sql, conn)

# to_string keeps pandas from truncating the table
print(top_sellers.to_string(index=False))

#### 6.2 Peak Hours

In [None]:
# Revenue and transaction count for each hour of the day
peak_hours_sql = """
SELECT hour_of_day, 
       ROUND(SUM(money), 2) AS total_revenue,
       COUNT(*) AS transactions
FROM coffee_sales
GROUP BY hour_of_day
ORDER BY hour_of_day;
"""
peak_hours = run_query(peak_hours_sql, conn)

print(peak_hours.to_string(index=False))

# Scalar lookups (row label + column) keep hour_of_day at its own dtype
peak_idx = peak_hours['total_revenue'].idxmax()
peak_hour = peak_hours.loc[peak_idx, 'hour_of_day']
peak_revenue = peak_hours['total_revenue'].max()
print(f"\nPeak Hour: {peak_hour}:00")
print(f"Peak Revenue: ${peak_revenue:,.2f}")

#### 6.3 Revenue by Day of the Week

In [None]:
# Revenue, volume and average ticket per weekday, ordered by Weekdaysort
revenue_by_day_sql = """
SELECT Weekday, 
       ROUND(SUM(money), 2) AS total_revenue,
       COUNT(*) AS transactions,
       ROUND(AVG(money), 2) AS avg_transaction
FROM coffee_sales
GROUP BY Weekday
ORDER BY Weekdaysort;
"""
revenue_by_day = run_query(revenue_by_day_sql, conn)

print(revenue_by_day.to_string(index=False))

#### 6.4 Average Sale per Hour

In [None]:
# Average sale amount and transaction count for each hour of the day
avg_sale_hour_sql = """
SELECT hour_of_day,
       ROUND(AVG(money), 2) AS avg_sale_per_hour,
       COUNT(*) AS transactions
FROM coffee_sales
GROUP BY hour_of_day
ORDER BY hour_of_day;
"""
avg_sale_hour = run_query(avg_sale_hour_sql, conn)

print(avg_sale_hour.to_string(index=False))

#### 6.5 Monthly Sales Performance: Growth Rate by Month

In [None]:
# Aggregate per calendar month; month_sequence and year_month_sort are carried
# along as grouping keys so the rows can be put in chronological order
month_keys = ['year_month_display', 'month_sequence', 'year_month_sort']
monthly_sales = (
    df.groupby(month_keys)['money']
    .agg(total_revenue='sum', transactions='count', avg_transaction='mean')
    .reset_index()
    .sort_values('month_sequence')
)

# Month-over-month revenue change in percent (first month has no predecessor -> 0)
monthly_sales['growth_rate'] = (
    monthly_sales['total_revenue'].pct_change().fillna(0).mul(100).round(2)
)

# Human-readable report table
monthly_display = pd.DataFrame({
    'Period': monthly_sales['year_month_display'],
    'Revenue': monthly_sales['total_revenue'].map(lambda x: f"${x:,.2f}"),
    'Transactions': monthly_sales['transactions'],
    'Growth Rate': monthly_sales['growth_rate'].map(lambda x: f"{x:.2f}%"),
})
print(monthly_display.to_string(index=False))

#### 6.6 Sales by Time of Day

In [None]:
# Volume, revenue and average ticket per part of the day, listed
# Morning -> Afternoon -> Night via the CASE expression
time_of_day_sql = """
SELECT Time_of_Day,
       COUNT(*) AS transactions,
       ROUND(SUM(money), 2) AS total_revenue,
       ROUND(AVG(money), 2) AS avg_transaction
FROM coffee_sales
GROUP BY Time_of_Day
ORDER BY 
    CASE Time_of_Day
        WHEN 'Morning' THEN 1
        WHEN 'Afternoon' THEN 2
        WHEN 'Night' THEN 3
    END;
"""
time_of_day_analysis = run_query(time_of_day_sql, conn)

print(time_of_day_analysis.to_string(index=False))

### 7. Outliers: Extreme Sales

#### 7.1 Overview

In [None]:
# Transactions at or above the 75th percentile of `money` count as high-value
threshold = df['money'].quantile(0.75)
high_value_sales = df.loc[df['money'] >= threshold]

# Headline figures for the high-value subset
num_outliers = high_value_sales.shape[0]
total_outliers = high_value_sales['money'].sum()
avg_outliers = high_value_sales['money'].mean()
pct_of_total = total_outliers / df['money'].sum() * 100

# One (metric, formatted value) pair per report row
summary_rows = [
    ("High-value transactions", f"{num_outliers:,}"),
    ("Total revenue", f"${total_outliers:,.2f}"),
    ("Average value", f"${avg_outliers:.2f}"),
    ("% of total revenue", f"{pct_of_total:.2f}%"),
    ("Threshold value", f"${threshold:.2f}"),
]
summary = pd.DataFrame(summary_rows, columns=["Metric", "Value"])
print(summary.to_string(index=False))

#### 7.2 High-Value Coffee Transactions: Item Contribution to Top Sales


In [None]:
# Per-product revenue, transaction count and average value within the
# high-value subset, largest revenue first
coffee_high_value = (
    high_value_sales.groupby('coffee_name')['money']
    .agg(total_revenue='sum', transaction_count='count', avg_value='mean')
    .sort_values(by='total_revenue', ascending=False)
)

# Share of the high-value revenue contributed by each product
high_value_total = high_value_sales['money'].sum()
coffee_high_value['pct_of_high_value_revenue'] = (
    coffee_high_value['total_revenue'] / high_value_total * 100
).round(2)

# Turn the numeric columns into display strings
coffee_high_value['total_revenue'] = coffee_high_value['total_revenue'].map(lambda x: f"${x:,.2f}")
coffee_high_value['avg_value'] = coffee_high_value['avg_value'].map(lambda x: f"${x:.2f}")
coffee_high_value['pct_of_high_value_revenue'] = coffee_high_value['pct_of_high_value_revenue'].map(lambda x: f"{x:.2f}%")

print(coffee_high_value)

### 8. Predictive Analysis

#### 8.1 Hourly Sales Forecast (6:00 - 22:00)

In [None]:
# Mean transaction value for each hour present in the data
hourly_sales = df.groupby('hour_of_day')['money'].mean().reset_index()
X = hourly_sales[['hour_of_day']]
y = hourly_sales['money']

# Degree-2 polynomial regression to capture the peak/off-peak shape
poly_hour_model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
poly_hour_model.fit(X, y)

# Predict the average transaction for every hour from 6:00 to 22:00
future_hours = pd.DataFrame({'hour_of_day': range(6, 23)})
predicted_hourly_sales = poly_hour_model.predict(future_hours)

# Attach formatted predictions and an "H:00" label, then print the report
predicted_hourly_sales_df = future_hours.assign(
    predicted_avg_transaction=[f"${x:.2f}" for x in predicted_hourly_sales],
    hour=[f"{x}:00" for x in future_hours['hour_of_day']],
)
print(predicted_hourly_sales_df[['hour', 'predicted_avg_transaction']].to_string(index=False))

#### 8.2 Monthly Sales Forecast (Next 3 Months)

In [None]:
# Total revenue per sequential month, in chronological order
monthly_sales_seq = df.groupby('month_sequence')['money'].sum().reset_index()
monthly_sales_seq = monthly_sales_seq.sort_values('month_sequence')

# Polynomial regression model (degree 2) for monthly sales trend, capturing non-linear growth
X_month = monthly_sales_seq[['month_sequence']]
y_month = monthly_sales_seq['money']

poly_month_model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
poly_month_model.fit(X_month, y_month)

# Predict the 3 months that follow the last month in the data
max_month = df['month_sequence'].max()
future_months = pd.DataFrame({'month_sequence': [max_month + 1, max_month + 2, max_month + 3]})
predicted_monthly_sales = poly_month_model.predict(future_months)

# Build readable labels for the forecast months.
# Anchor on the first day of the last observed month before stepping forward:
# date_range with freq='MS' rolls a mid-month start forward to the NEXT month
# start, so starting from `last_date + 1 month` would skip a month whenever the
# last transaction is not on the 1st and the labels would no longer match the
# predicted month_sequence values.
last_date = df['Date'].max()
last_month_start = last_date.to_period('M').to_timestamp()
future_dates = pd.date_range(start=last_month_start + pd.DateOffset(months=1), periods=3, freq='MS')

# Format and display predictions
predicted_monthly_sales_df = pd.DataFrame({
    'Period': future_dates.strftime('%b %Y'),
    'Predicted Revenue': [f"${x:,.2f}" for x in predicted_monthly_sales],
    'Month Sequence': [max_month + 1, max_month + 2, max_month + 3]
})
print(predicted_monthly_sales_df.to_string(index=False))

### 9. Executive Summary

In [None]:
# Month labels of the rows holding the earliest and latest dates
earliest_idx = df['Date'].idxmin()
latest_idx = df['Date'].idxmax()
first_month = df.loc[earliest_idx, 'year_month_display']
last_month = df.loc[latest_idx, 'year_month_display']

# Headline metrics over the whole dataset
total_revenue = df['money'].sum()
total_transactions = df.shape[0]
avg_transaction = df['money'].mean()
unique_products = df['coffee_name'].nunique()

# Highlights taken from the earlier EDA results (scalar lookups keep dtypes intact)
best_hour = peak_hours.loc[peak_hours['total_revenue'].idxmax(), 'hour_of_day']
best_hour_revenue = peak_hours['total_revenue'].max()
best_day = revenue_by_day.loc[revenue_by_day['total_revenue'].idxmax(), 'Weekday']
best_day_revenue = revenue_by_day['total_revenue'].max()
best_seller = top_sellers.iloc[0]

# Assemble the report and print it in one go
summary_lines = [
    f"Analysis Period: {first_month} to {last_month}",
    f"Total Duration: {df['month_sequence'].nunique()} months",
    f"Date Range: {df['Date'].min().date()} to {df['Date'].max().date()}",
    f"\nTotal Transactions: {total_transactions:,}",
    f"Total Revenue: ${total_revenue:,.2f}",
    f"Average Transaction: ${avg_transaction:.2f}",
    f"Unique Products: {unique_products}",
    f"\nPeak Hour: {best_hour}:00 (${best_hour_revenue:,.2f})",
    f"Best Day: {best_day} (${best_day_revenue:,.2f})",
    f"Top Product: {best_seller['coffee_name']} (${best_seller['total_revenue']:,.2f} revenue)",
]
print("\n".join(summary_lines))

### 10. Close the Connection

In [None]:
# Close the SQLite connection opened earlier; run_query calls using `conn`
# will fail after this point.
conn.close()