## ☕ Coffee Sales Dataset Schema

This table describes the columns (features) found in the `Coffe_sales.csv` file, their data types, and their purpose.

| Feature Name | Description | Data Type / Format |
| :--- | :--- | :--- |
| **`hour_of_day`** | Hour of purchase | Integer (0–23) |
| **`cash_type`** | Mode of payment | Categorical (cash / card) |
| **`money`** | Transaction amount | Numeric (local currency) |
| **`coffee_name`** | Type of coffee purchased | String (e.g., Latte, Americano) |
| **`Time_of_Day`** | Categorized time of purchase | Categorical (Morning, Afternoon, Night) |
| **`Weekday`** | Day of the week | String (e.g., Mon, Tue, ...) |
| **`Month_name`** | Month of purchase | String (e.g., Jan, Feb, Mar) |
| **`Weekdaysort`** | Numeric representation for weekday ordering | Integer (1 = Mon, 7 = Sun) |
| **`Monthsort`** | Numeric representation for month ordering | Integer (1 = Jan, 12 = Dec) |
| **`Date`** | Date of transaction | Date (YYYY-MM-DD) |
| **`Time`** | Exact time of transaction | Time (HH:MM:SS) |

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine, inspect
from injest_db import engine

In [2]:
inspector = inspect(engine)

table_names = inspector.get_table_names()

print(f"Tables found in the database: {table_names}")

Tables found in the database: ['sales']


# Data Cleaning & Transformation

In [3]:
pd.read_sql_query("DESCRIBE sales;", engine)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,hour_of_day,bigint,YES,,,
1,cash_type,text,YES,,,
2,money,double,YES,,,
3,coffee_name,text,YES,,,
4,Time_of_Day,text,YES,,,
5,Weekday,text,YES,,,
6,Month_name,text,YES,,,
7,Weekdaysort,bigint,YES,,,
8,Monthsort,bigint,YES,,,
9,Date,text,YES,,,


In [4]:
pd.read_sql_query("SELECT * FROM sales LIMIT 5;", engine)

Unnamed: 0,hour_of_day,cash_type,money,coffee_name,Time_of_Day,Weekday,Month_name,Weekdaysort,Monthsort,Date,Time
0,10,card,38.7,Latte,Morning,Fri,Mar,5,3,2024-03-01,10:15:50.520000
1,12,card,38.7,Hot Chocolate,Afternoon,Fri,Mar,5,3,2024-03-01,12:19:22.539000
2,12,card,38.7,Hot Chocolate,Afternoon,Fri,Mar,5,3,2024-03-01,12:20:18.089000
3,13,card,28.9,Americano,Afternoon,Fri,Mar,5,3,2024-03-01,13:46:33.006000
4,13,card,38.7,Latte,Afternoon,Fri,Mar,5,3,2024-03-01,13:48:14.626000


In [5]:
# Check basic dataset information

pd.read_sql_query('''
SELECT 
    COUNT(*) AS total_records,
    MIN(Date) AS earliest_date,
    MAX(Date) AS latest_date,
    COUNT(DISTINCT coffee_name) AS unique_products
FROM sales;''', engine)

Unnamed: 0,total_records,earliest_date,latest_date,unique_products
0,3547,2024-03-01,2025-03-23,8


In [6]:
# Check for NULL values in critical columns

pd.read_sql_query('''
SELECT 
    'hour_of_day' AS column_name,
    COUNT(*) AS null_count
FROM sales 
WHERE hour_of_day IS NULL
UNION ALL
SELECT 'cash_type', COUNT(*) FROM sales WHERE cash_type IS NULL
UNION ALL
SELECT 'money', COUNT(*) FROM sales WHERE money IS NULL
UNION ALL
SELECT 'coffee_name', COUNT(*) FROM sales WHERE coffee_name IS NULL
UNION ALL
SELECT 'Date', COUNT(*) FROM sales WHERE Date IS NULL
UNION ALL
SELECT 'Time', COUNT(*) FROM sales WHERE Time IS NULL;''', engine)

Unnamed: 0,column_name,null_count
0,hour_of_day,0
1,cash_type,0
2,money,0
3,coffee_name,0
4,Date,0
5,Time,0


In [7]:
# Check for duplicate records

pd.read_sql_query('''
SELECT 
    Date,
    Time,
    coffee_name,
    money,
    COUNT(*) AS duplicate_count
FROM sales
GROUP BY Date, Time, coffee_name, money
HAVING COUNT(*) > 1;''', engine)

Unnamed: 0,Date,Time,coffee_name,money,duplicate_count


In [8]:
# Verify coffee name consistency (check for variations) 

pd.read_sql_query('''
SELECT 
    coffee_name,
    COUNT(*) AS transaction_count
FROM sales
GROUP BY coffee_name
ORDER BY coffee_name;''', engine)

Unnamed: 0,coffee_name,transaction_count
0,Americano,564
1,Americano with Milk,809
2,Cappuccino,486
3,Cocoa,239
4,Cortado,287
5,Espresso,129
6,Hot Chocolate,276
7,Latte,757


In [9]:
# from sqlalchemy import text
# with engine.connect() as connection:
#     # The 'begin()' block ensures your change is saved (committed)
#     with connection.begin() as transaction:
#         connection.execute(text("""
#             UPDATE sales
#             SET Date = STR_TO_DATE(Date, '%m/%d/%Y');
                                
#         """))

# Exploratory Data Analysis (EDA)

In [10]:
# Calculate total number of transactions

pd.read_sql_query("SELECT COUNT(cash_type) as transaction_count FROM sales;", engine).values[0]

array([3547])

In [11]:
# Count distinct products sold

pd.read_sql_query("SELECT COUNT(DISTINCT coffee_name) as productcounts FROM sales", engine).values[0]

array([8])

In [12]:
# Calculate basic statistics (min, max, average, median) for transaction amounts

pd.read_sql_query("SELECT MIN(money) AS minimum_amount, MAX(money) AS max_amount, AVG(money) AS average_amount FROM sales", engine)

Unnamed: 0,minimum_amount,max_amount,average_amount
0,18.12,38.7,31.645216


In [13]:
# Median Transaction Amount

pd.read_sql_query('''WITH RankedAmounts AS (
    SELECT
        money AS amount,
        ROW_NUMBER() OVER (ORDER BY money) AS rn,
        COUNT(*) OVER () AS total_count
    FROM
        sales
)
SELECT
    AVG(amount) AS median_transaction_amount
FROM
    RankedAmounts
WHERE
    rn IN (FLOOR((total_count + 1) / 2), CEIL((total_count + 1) / 2));''', engine).values[0]

array([32.82])

In [14]:
# Total revenue across all transactions

pd.read_sql_query("SELECT SUM(money) AS total_revenue FROM sales", engine).values[0]

array([112245.58])

In [15]:
# Distribution of payment types (card vs cash if cash exists)

pd.read_sql_query(''' SELECT 
    cash_type,
    COUNT(*) AS count,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM sales), 2) AS percentage
FROM sales
GROUP BY cash_type;''', engine)

Unnamed: 0,cash_type,count,percentage
0,card,3547,100.0


# Time-Based Analysis

In [16]:
# Sales by hour of day (identify peak hours)

pd.read_sql_query("SELECT hour_of_day, SUM(money) AS hourly_sales FROM sales GROUP BY hour_of_day;", engine)

Unnamed: 0,hour_of_day,hourly_sales
0,10,10198.52
1,12,7419.62
2,13,7028.76
3,15,7476.02
4,16,9031.84
5,18,7162.6
6,19,7751.96
7,11,8453.1
8,14,7173.8
9,17,7659.76


In [17]:
# Shows Date ranges

pd.read_sql_query('''SELECT 
    YEAR(Date) AS year,
    MIN(Date) AS first_transaction,
    MAX(Date) AS last_transaction,
    COUNT(DISTINCT Date) AS operating_days,
    COUNT(*) AS total_transactions
FROM sales
GROUP BY YEAR(Date)
ORDER BY year;''', engine)

Unnamed: 0,year,first_transaction,last_transaction,operating_days,total_transactions
0,2024,2024-03-01,2024-12-31,302,2604
1,2025,2025-01-02,2025-03-23,79,943


In [18]:
# Revenue by time of day (Morning, Afternoon, Night)

pd.read_sql_query("SELECT Time_of_Day, SUM(money) AS total_sales FROM sales GROUP BY Time_of_Day;", engine)

Unnamed: 0,Time_of_Day,total_sales
0,Morning,35929.2
1,Afternoon,38130.04
2,Night,38186.34


In [19]:
# Average transactions per hour
pd.read_sql_query('''SELECT HOUR(Time) AS hour, COUNT(cash_type) AS transaction_per_hour FROM sales
                  GROUP BY hour;''', engine)

Unnamed: 0,hour,transaction_per_hour
0,10,328
1,12,241
2,13,225
3,15,236
4,16,279
5,18,218
6,19,229
7,11,283
8,14,225
9,17,236


In [20]:
# Busiest hours for specific coffee types

pd.read_sql_query('''WITH coffeerankedhours AS(
                  SELECT hour_of_day, coffee_name, COUNT(*) AS transaction_count,
                  DENSE_RANK() OVER(PARTITION BY coffee_name ORDER BY COUNT(*) DESC) rnk_hours
                  FROM sales
                  GROUP BY hour_of_day, coffee_name
                  )
                  SELECT hour_of_day, coffee_name, transaction_count AS total_transactions
                  FROM coffeerankedhours
                  WHERE  rnk_hours <=5
                  ORDER BY
                  coffee_name,
                  total_transactions DESC;''', engine)

Unnamed: 0,hour_of_day,coffee_name,total_transactions
0,11,Americano,71
1,10,Americano,59
2,12,Americano,55
3,8,Americano,48
4,13,Americano,46
5,10,Americano with Milk,101
6,9,Americano with Milk,94
7,11,Americano with Milk,65
8,13,Americano with Milk,56
9,12,Americano with Milk,55


### Weekly Patterns

In [21]:
# Sales by day of week

pd.read_sql_query('''SELECT SUM(money) as revenue, Weekday FROM sales GROUP BY Weekday;''', engine)

Unnamed: 0,revenue,Weekday
0,16802.66,Fri
1,14733.52,Sat
2,13336.06,Sun
3,17363.1,Mon
4,18168.38,Tue
5,15750.46,Wed
6,16091.4,Thu


In [22]:
# Weekend vs weekday performance
pd.read_sql_query('''SELECT SUM(money) as revenue, Weekday, Weekdaysort From sales
                  GROUP BY Weekday, Weekdaysort
                  HAVING Weekdaysort = 6 AND 7;''', engine)

Unnamed: 0,revenue,Weekday,Weekdaysort
0,14733.52,Sat,6


### Monthly & Seasonal Trends

In [23]:
# Monthly revenue trends
pd.read_sql_query('''SELECT DATE_FORMAT(Date, '%Y-%m') AS month, SUM(money) AS revenue FROM sales 
                  GROUP BY month
                  ORDER BY month''', engine)

Unnamed: 0,month,revenue
0,2024-03,5905.2
1,2024-04,5719.56
2,2024-05,8164.42
3,2024-06,7617.76
4,2024-07,6915.94
5,2024-08,7613.84
6,2024-09,9988.64
7,2024-10,13891.16
8,2024-11,8590.54
9,2024-12,8237.74


In [24]:
# Month-over-month growth rates
pd.read_sql_query('''WITH MonthlyRevenue AS (
    SELECT
        DATE_FORMAT(Date, '%Y-%m') AS sales_month,
        SUM(money) AS current_month_revenue
    FROM
        sales
    GROUP BY
        sales_month
),
MoMCalculation AS (
    SELECT
        sales_month,
        current_month_revenue,
        LAG(current_month_revenue, 1) OVER (ORDER BY sales_month) AS previous_month_revenue
    FROM
        MonthlyRevenue
)
SELECT
    sales_month,
    current_month_revenue,
    previous_month_revenue,
                  
    -- Calculate the MoM Growth Rate using the formula:
    CASE
        WHEN previous_month_revenue IS NULL OR previous_month_revenue = 0 THEN NULL
        ELSE ROUND(
            ((current_month_revenue - previous_month_revenue) / previous_month_revenue) * 100,
            2
        )
    END AS mom_growth_rate_percent
FROM
    MoMCalculation
ORDER BY
    sales_month;''', engine)

Unnamed: 0,sales_month,current_month_revenue,previous_month_revenue,mom_growth_rate_percent
0,2024-03,5905.2,,
1,2024-04,5719.56,5905.2,-3.14
2,2024-05,8164.42,5719.56,42.75
3,2024-06,7617.76,8164.42,-6.7
4,2024-07,6915.94,7617.76,-9.21
5,2024-08,7613.84,6915.94,10.09
6,2024-09,9988.64,7613.84,31.19
7,2024-10,13891.16,9988.64,39.07
8,2024-11,8590.54,13891.16,-38.16
9,2024-12,8237.74,8590.54,-4.11


##  Product Analysis

### Coffee Performance

In [25]:
# Best-selling coffee types by total count

pd.read_sql_query('''SELECT coffee_name, COUNT(coffee_name) AS coffee_count 
                  FROM sales
                  GROUP BY coffee_name
                  ORDER BY coffee_count DESC''', engine)

Unnamed: 0,coffee_name,coffee_count
0,Americano with Milk,809
1,Latte,757
2,Americano,564
3,Cappuccino,486
4,Cortado,287
5,Hot Chocolate,276
6,Cocoa,239
7,Espresso,129


In [26]:
# Revenue by coffee type

pd.read_sql_query('''SELECT coffee_name, SUM(money) AS revenue 
                  FROM sales
                  GROUP BY coffee_name
                  ORDER BY revenue DESC''', engine)

Unnamed: 0,coffee_name,revenue
0,Latte,26875.3
1,Americano with Milk,24751.12
2,Cappuccino,17439.14
3,Americano,14650.26
4,Hot Chocolate,9933.46
5,Cocoa,8521.16
6,Cortado,7384.86
7,Espresso,2690.28


In [27]:
# Average price per coffee type

pd.read_sql_query('''SELECT coffee_name, AVG(money) AS avg_price 
                  FROM sales
                  GROUP BY coffee_name
                  ORDER BY avg_price DESC''', engine)

Unnamed: 0,coffee_name,avg_price
0,Hot Chocolate,35.990797
1,Cappuccino,35.883004
2,Cocoa,35.653389
3,Latte,35.502378
4,Americano with Milk,30.59471
5,Americano,25.975638
6,Cortado,25.73122
7,Espresso,20.854884


In [28]:
# Preferred coffee types by time of day
pd.read_sql_query('''WITH RankedCoffee AS (
    SELECT
        Time_of_Day,
        coffee_name,
        COUNT(*) AS transaction_count,
        RANK() OVER (
            PARTITION BY Time_of_Day
            ORDER BY COUNT(*) DESC, coffee_name -- Use coffee_name as a tie-breaker
        ) as rank_by_popularity
    FROM
        sales
    GROUP BY
        Time_of_Day,
        coffee_name
    HAVING
        Time_of_Day IS NOT NULL
)
SELECT
    Time_of_Day,
    coffee_name,
    transaction_count AS total_transactions
FROM
    RankedCoffee
WHERE
    rank_by_popularity <= 3
ORDER BY
    Time_of_Day,
    total_transactions DESC;
 ''', engine)

Unnamed: 0,Time_of_Day,coffee_name,total_transactions
0,Afternoon,Latte,270
1,Afternoon,Americano with Milk,239
2,Afternoon,Americano,233
3,Morning,Americano with Milk,331
4,Morning,Americano,219
5,Morning,Latte,215
6,Night,Latte,272
7,Night,Americano with Milk,239
8,Night,Cappuccino,200
