# Brewed Insights: Coffee Sales Analysis

### 1. Install and import the relevant libraries

In [18]:
import sqlite3
import pandas as pd
import os
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import numpy as np

### 2. Load data

In [19]:
# Load CSV data into a Pandas DataFrame
base_path = os.path.dirname(os.getcwd())  # go up from notebooks folder
data_path = os.path.join(base_path, 'data', 'coffee_sales.csv')

df = pd.read_csv(data_path)

### 3. Handle Cross-Year Data (Mar 2024 - Mar 2025)
- Handling the non-standard fiscal year and a single continuous time series

In [20]:
# Parse the Date and Time columns
df['Date'] = pd.to_datetime(df['Date'])
# Handle inconsistent time formats (some have milliseconds, some don't)
df['DateTime'] = pd.to_datetime(df['Date'].astype(str) + ' ' + df['Time'].astype(str), format='mixed') # mixed format parsing

# Extract Year for calculations
df['Year'] = df['Date'].dt.year

# Create sequential month number (handles cross-year properly)
# This counts unique year-month combinations
df['month_sequence'] = (
    (df['Date'].dt.year - df['Date'].dt.year.min()) * 12 + 
    df['Date'].dt.month
)
# Normalize to start from 1
df['month_sequence'] = df['month_sequence'] - df['month_sequence'].min() + 1

# Create display labels for months (e.g., "Mar 2024", "Apr 2024")
df['year_month_display'] = df['Date'].dt.strftime('%b %Y')

# Create year_month_sort for proper ordering
df['year_month_sort'] = df['Date'].dt.year * 100 + df['Date'].dt.month

# Create fiscal year fields (March = Month 1)
FISCAL_START_MONTH = 3  # March
df['fiscal_month'] = ((df['Date'].dt.month - FISCAL_START_MONTH) % 12) + 1 # Fiscal month 1 = March
df['fiscal_year'] = df['Date'].dt.year # Initial fiscal year
df.loc[df['Date'].dt.month < FISCAL_START_MONTH, 'fiscal_year'] -= 1 # Adjust fiscal year for Jan/Feb

# Create fiscal period label for display
df['fiscal_period'] = 'FY' + df['fiscal_year'].astype(str) + '-M' + df['fiscal_month'].astype(str).str.zfill(2)

print("✓ Cross-year data handling complete")
print(f"Data spans: {df['year_month_display'].min()} to {df['year_month_display'].max()}")
print(f"Total unique months: {df['month_sequence'].nunique()}")
print(f"Date range: {df['Date'].min().date()} to {df['Date'].max().date()}")

✓ Cross-year data handling complete
Data spans: Apr 2024 to Sep 2024
Total unique months: 13
Date range: 2024-03-01 to 2025-03-23


### 4. Create a temporary database using SQLite and insert the table

In [21]:
# Create a temporary SQLite database
conn = sqlite3.connect('coffee.db')

# Write the DataFrame into a SQL table
df.to_sql('coffee_sales', conn, index=False, if_exists='replace')

3547

### 5. Create a function that will be reutilized later

In [22]:
def run_query(query, conn):
    """Helper function to run SQL queries and return DataFrame."""
    return pd.read_sql_query(query, conn)

### 6. Exploratory Data Analysis (EDA)

#### 6.1 Top-Selling Coffee Products

In [23]:
top_sellers = run_query("""
SELECT coffee_name, 
       COUNT(*) AS total_sales, 
       ROUND(SUM(money), 2) AS total_revenue,
       ROUND(AVG(money), 2) AS avg_price
FROM coffee_sales
GROUP BY coffee_name
ORDER BY total_revenue DESC
LIMIT 5;
""", conn)

print(top_sellers.to_string(index=False)) # to_string to avoid truncation

        coffee_name  total_sales  total_revenue  avg_price
              Latte          757       26875.30      35.50
Americano with Milk          809       24751.12      30.59
         Cappuccino          486       17439.14      35.88
          Americano          564       14650.26      25.98
      Hot Chocolate          276        9933.46      35.99


#### 6.2 Peak Hours

In [24]:
peak_hours = run_query("""
SELECT hour_of_day, 
       ROUND(SUM(money), 2) AS total_revenue,
       COUNT(*) AS transactions
FROM coffee_sales
GROUP BY hour_of_day
ORDER BY hour_of_day;
""", conn)

print(peak_hours.to_string(index=False))
print(f"\nPeak Hour: {peak_hours.loc[peak_hours['total_revenue'].idxmax(), 'hour_of_day']}:00")
print(f"Peak Revenue: ${peak_hours['total_revenue'].max():,.2f}")

 hour_of_day  total_revenue  transactions
           6         149.40             5
           7        2846.02            88
           8        7017.88           235
           9        7264.28           242
          10       10198.52           328
          11        8453.10           283
          12        7419.62           241
          13        7028.76           225
          14        7173.80           225
          15        7476.02           236
          16        9031.84           278
          17        7659.76           237
          18        7162.60           218
          19        7751.96           229
          20        5578.92           169
          21        6397.94           195
          22        3635.16           113

Peak Hour: 10:00
Peak Revenue: $10,198.52


#### 6.3 Revenue by Day of the Week

In [25]:
revenue_by_day = run_query("""
SELECT Weekday, 
       ROUND(SUM(money), 2) AS total_revenue,
       COUNT(*) AS transactions,
       ROUND(AVG(money), 2) AS avg_transaction
FROM coffee_sales
GROUP BY Weekday
ORDER BY Weekdaysort;
""", conn)

print(revenue_by_day.to_string(index=False))

Weekday  total_revenue  transactions  avg_transaction
    Mon       17363.10           544            31.92
    Tue       18168.38           572            31.76
    Wed       15750.46           500            31.50
    Thu       16091.40           510            31.55
    Fri       16802.66           532            31.58
    Sat       14733.52           470            31.35
    Sun       13336.06           419            31.83


#### 6.4 Average Sale per Hour

In [26]:
avg_sale_hour = run_query("""
SELECT hour_of_day,
       ROUND(AVG(money), 2) AS avg_sale_per_hour,
       COUNT(*) AS transactions
FROM coffee_sales
GROUP BY hour_of_day
ORDER BY hour_of_day;
""", conn)

print(avg_sale_hour.to_string(index=False))

 hour_of_day  avg_sale_per_hour  transactions
           6              29.88             5
           7              32.34            88
           8              29.86           235
           9              30.02           242
          10              31.09           328
          11              29.87           283
          12              30.79           241
          13              31.24           225
          14              31.88           225
          15              31.68           236
          16              32.49           278
          17              32.32           237
          18              32.86           218
          19              33.85           229
          20              33.01           169
          21              32.81           195
          22              32.17           113


#### 6.5 Monthly Sales Performance: Growth Rate by Month

In [27]:
# Use month_sequence for proper ordering
monthly_sales = df.groupby(['year_month_display', 'month_sequence', 'year_month_sort'])['money'].agg([
    ('total_revenue', 'sum'),
    ('transactions', 'count'),
    ('avg_transaction', 'mean')
]).reset_index()
monthly_sales = monthly_sales.sort_values('month_sequence')

# Calculate growth rate
monthly_sales['growth_rate'] = (
    monthly_sales['total_revenue'].pct_change().fillna(0) * 100
).round(2)

# Display with proper labels
monthly_display = monthly_sales[['year_month_display', 'total_revenue', 'transactions', 'growth_rate']].copy()
monthly_display['total_revenue'] = monthly_display['total_revenue'].apply(lambda x: f"${x:,.2f}")
monthly_display['growth_rate'] = monthly_display['growth_rate'].apply(lambda x: f"{x:.2f}%")
monthly_display.columns = ['Period', 'Revenue', 'Transactions', 'Growth Rate']
print(monthly_display.to_string(index=False))

  Period    Revenue  Transactions Growth Rate
Mar 2024  $5,905.20           175       0.00%
Apr 2024  $5,719.56           168      -3.14%
May 2024  $8,164.42           241      42.75%
Jun 2024  $7,617.76           223      -6.70%
Jul 2024  $6,915.94           237      -9.21%
Aug 2024  $7,613.84           272      10.09%
Sep 2024  $9,988.64           344      31.19%
Oct 2024 $13,891.16           426      39.07%
Nov 2024  $8,590.54           259     -38.16%
Dec 2024  $8,237.74           259      -4.11%
Jan 2025  $6,398.86           201     -22.32%
Feb 2025 $13,215.48           423     106.53%
Mar 2025  $9,986.44           319     -24.43%


#### 6.6 Sales by Time of Day

In [28]:
time_of_day_analysis = run_query("""
SELECT Time_of_Day,
       COUNT(*) AS transactions,
       ROUND(SUM(money), 2) AS total_revenue,
       ROUND(AVG(money), 2) AS avg_transaction
FROM coffee_sales
GROUP BY Time_of_Day
ORDER BY 
    CASE Time_of_Day
        WHEN 'Morning' THEN 1
        WHEN 'Afternoon' THEN 2
        WHEN 'Night' THEN 3
    END;
""", conn)

print(time_of_day_analysis.to_string(index=False))

Time_of_Day  transactions  total_revenue  avg_transaction
    Morning          1181       35929.20            30.42
  Afternoon          1205       38130.04            31.64
      Night          1161       38186.34            32.89


### 7. Outliers: Extreme Sales

#### 7.1 Overview

In [29]:
# Threshold for top 25% transactions
threshold = df['money'].quantile(0.75)
high_value_sales = df[df['money'] >= threshold]

# Summary statistics for high-value transactions
num_outliers = len(high_value_sales)
total_outliers = high_value_sales['money'].sum()
avg_outliers = high_value_sales['money'].mean()
pct_of_total = total_outliers / df['money'].sum() * 100

# Display summary
summary = pd.DataFrame({
    "Metric": [
        "High-value transactions", 
        "Total revenue", 
        "Average value", 
        "% of total revenue",
        "Threshold value"
    ], 
    "Value": [
        f"{num_outliers:,}",
        f"${total_outliers:,.2f}", 
        f"${avg_outliers:.2f}", 
        f"{pct_of_total:.2f}%",
        f"${threshold:.2f}"
    ]
})
print(summary.to_string(index=False))

                 Metric      Value
High-value transactions      1,415
          Total revenue $51,511.80
          Average value     $36.40
     % of total revenue     45.89%
        Threshold value     $35.76


#### 7.2 High-Value Coffee Transactions: Item Contribution to Top Sales


In [30]:
# Aggregate high-value sales by coffee item
coffee_high_value = high_value_sales.groupby('coffee_name').agg(
    total_revenue=('money', 'sum'), # total revenue from high-value sales
    transaction_count=('money', 'count'), # number of high-value transactions
    avg_value=('money', 'mean') # average value of high-value transactions
).sort_values(by='total_revenue', ascending=False)

# Calculate percentage of high-value revenue for each coffee item
coffee_high_value['pct_of_high_value_revenue'] = (
    coffee_high_value['total_revenue'] / high_value_sales['money'].sum() * 100
).round(2)

# Format monetary values and percentages for display
coffee_high_value['total_revenue'] = coffee_high_value['total_revenue'].apply(lambda x: f"${x:,.2f}")
coffee_high_value['avg_value'] = coffee_high_value['avg_value'].apply(lambda x: f"${x:.2f}")
coffee_high_value['pct_of_high_value_revenue'] = coffee_high_value['pct_of_high_value_revenue'].apply(lambda x: f"{x:.2f}%")

print(coffee_high_value)

              total_revenue  transaction_count avg_value  \
coffee_name                                                
Latte            $20,508.22                563    $36.43   
Cappuccino       $14,288.42                390    $36.64   
Hot Chocolate     $9,080.14                250    $36.32   
Cocoa             $7,635.02                212    $36.01   

              pct_of_high_value_revenue  
coffee_name                              
Latte                            39.81%  
Cappuccino                       27.74%  
Hot Chocolate                    17.63%  
Cocoa                            14.82%  


### 8. Predictive Analysis

#### 8.1 Hourly Sales Forecast (6:00 - 22:00)

In [31]:
# Aggregate average sales by hour
hourly_sales = df.groupby('hour_of_day')['money'].mean().reset_index()
X = hourly_sales[['hour_of_day']]
y = hourly_sales['money']

# Polynomial regression model (degree 2), for capturing peak/off-peak trends
poly_hour_model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
poly_hour_model.fit(X, y)

# Predict average sales for each hour from 6:00 to 22:00
future_hours = pd.DataFrame({'hour_of_day': range(6, 23)})
predicted_hourly_sales = poly_hour_model.predict(future_hours)

# Format and display predictions
predicted_hourly_sales_df = future_hours.copy()
predicted_hourly_sales_df['predicted_avg_transaction'] = predicted_hourly_sales
predicted_hourly_sales_df['predicted_avg_transaction'] = predicted_hourly_sales_df['predicted_avg_transaction'].apply(
    lambda x: f"${x:.2f}"
)
predicted_hourly_sales_df['hour'] = predicted_hourly_sales_df['hour_of_day'].apply(lambda x: f"{x}:00")
print(predicted_hourly_sales_df[['hour', 'predicted_avg_transaction']].to_string(index=False))

 hour predicted_avg_transaction
 6:00                    $30.16
 7:00                    $30.34
 8:00                    $30.52
 9:00                    $30.70
10:00                    $30.89
11:00                    $31.07
12:00                    $31.26
13:00                    $31.45
14:00                    $31.64
15:00                    $31.83
16:00                    $32.02
17:00                    $32.22
18:00                    $32.41
19:00                    $32.61
20:00                    $32.81
21:00                    $33.02
22:00                    $33.22


#### 8.2 Monthly Sales Forecast (Next 3 Months)

In [32]:
# Use month_sequence for proper temporal ordering
monthly_sales_seq = df.groupby('month_sequence')['money'].sum().reset_index()
monthly_sales_seq = monthly_sales_seq.sort_values('month_sequence')

# Polynomial regression model (degree 2) for monthly sales trend, capturing non-linear growth
X_month = monthly_sales_seq[['month_sequence']]
y_month = monthly_sales_seq['money']

poly_month_model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
poly_month_model.fit(X_month, y_month)

# Predict next 3 months after the last month in your data
max_month = df['month_sequence'].max()
future_months = pd.DataFrame({'month_sequence': [max_month + 1, max_month + 2, max_month + 3]})
predicted_monthly_sales = poly_month_model.predict(future_months)

# Create readable labels based on your last date
last_date = df['Date'].max()
future_dates = pd.date_range(start=last_date + pd.DateOffset(months=1), periods=3, freq='MS')

# Format and display predictions
predicted_monthly_sales_df = pd.DataFrame({
    'Period': future_dates.strftime('%b %Y'),
    'Predicted Revenue': [f"${x:,.2f}" for x in predicted_monthly_sales],
    'Month Sequence': [max_month + 1, max_month + 2, max_month + 3]
})
print(predicted_monthly_sales_df.to_string(index=False))

  Period Predicted Revenue  Month Sequence
May 2025        $10,021.35              14
Jun 2025         $9,886.23              15
Jul 2025         $9,684.47              16


### 9. Executive Summary

In [33]:
# Get date range information
first_month = df.loc[df['Date'].idxmin(), 'year_month_display']
last_month = df.loc[df['Date'].idxmax(), 'year_month_display']

# Calculate summary statistics
total_revenue = df['money'].sum()
total_transactions = len(df)
avg_transaction = df['money'].mean()
unique_products = df['coffee_name'].nunique()

# Display summary
print(f"Analysis Period: {first_month} to {last_month}")
print(f"Total Duration: {df['month_sequence'].nunique()} months")
print(f"Date Range: {df['Date'].min().date()} to {df['Date'].max().date()}")
print(f"\nTotal Transactions: {total_transactions:,}")
print(f"Total Revenue: ${total_revenue:,.2f}")
print(f"Average Transaction: ${avg_transaction:.2f}")
print(f"Unique Products: {unique_products}")
print(f"\nPeak Hour: {peak_hours.loc[peak_hours['total_revenue'].idxmax(), 'hour_of_day']}:00 (${peak_hours['total_revenue'].max():,.2f})")
print(f"Best Day: {revenue_by_day.loc[revenue_by_day['total_revenue'].idxmax(), 'Weekday']} (${revenue_by_day['total_revenue'].max():,.2f})")
print(f"Top Product: {top_sellers.iloc[0]['coffee_name']} (${top_sellers.iloc[0]['total_revenue']:,.2f} revenue)")

Analysis Period: Mar 2024 to Mar 2025
Total Duration: 13 months
Date Range: 2024-03-01 to 2025-03-23

Total Transactions: 3,547
Total Revenue: $112,245.58
Average Transaction: $31.65
Unique Products: 8

Peak Hour: 10:00 ($10,198.52)
Best Day: Tue ($18,168.38)
Top Product: Latte ($26,875.30 revenue)


### 10. Close the Connection

In [34]:
conn.close()