In [1]:
import duckdb

In [2]:
# Open the database read-only: the cells in this notebook only run SELECT
# queries, a read-only handle does not take the write lock, and it raises if
# the file is missing instead of creating a new empty database.
conn = duckdb.connect('../fetch.db', read_only=True)

# What are the top 5 brands by receipts scanned for most recent month?
+ Note
    + Top 5 by number of receipts scanned with a specific brand code
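        + N_Receipts may count the same receipt under more than one brand, because brand codes are attached to items, not to receipts (a receipt with items from two brands is counted once for each brand)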
    + Top 5 by number of receipt items scanned with a specific brand code

##### **There are no brands with receipt items for the month of March 2021**
+ There were 12 receipts that contained items with no associated brand (24 items)



In [3]:
# Top 5 brands for the most recent scanned month.
# Each receipt item is tagged with the month its receipt was scanned; months
# are ranked newest-first with DENSE_RANK, so month_rank = 1 is the latest
# month that has receipt items. Items without a matching receipt get a NULL
# month, which NULLS LAST keeps out of the top ranks.
# NOTE: the month label printed below is hard-coded; update it if the data changes.
print("Top 5 Brands by receipts scanned and receipt items scanned")
print("March 2021")
conn.sql("""
    WITH receipt_dates AS (
        SELECT _receipt_id, date_scanned FROM receipts
    ),
    ranked_receipt_items AS (
        SELECT
            receipt_items._receipt_id,
            receipt_items._receipt_item_id,
            receipt_items.brand_code,
            DATE_TRUNC('month', date_scanned)::date AS ym,
            DENSE_RANK() OVER (ORDER BY ym DESC NULLS LAST) AS month_rank
        FROM receipt_items
        LEFT JOIN receipt_dates ON receipt_items._receipt_id = receipt_dates._receipt_id
    )
    SELECT
        brand_code,
        COUNT(DISTINCT _receipt_id) AS N_Receipts,
        COUNT(DISTINCT _receipt_item_id) AS N_Receipt_Items
    FROM ranked_receipt_items
    WHERE month_rank = 1
    GROUP BY brand_code
    -- Secondary keys make the order (and the LIMIT cut-off) deterministic on ties.
    ORDER BY N_Receipts DESC, N_Receipt_Items DESC, brand_code
    LIMIT 5
;""")

Top 5 Brands by receipts scanned and receipt items scanned
March 2021


┌────────────┬────────────┬─────────────────┐
│ brand_code │ N_Receipts │ N_Receipt_Items │
│  varchar   │   int64    │      int64      │
├────────────┼────────────┼─────────────────┤
│ NULL       │         12 │              24 │
└────────────┴────────────┴─────────────────┘

# How does the ranking of the top 5 brands by receipts scanned for the recent month compare to the ranking for the previous month?
##### **Unlike March 2021, February 2021 has receipt items with non-null brand codes, so brands can be ranked**
##### **The top brands we have codes for are BRAND, MISSION, and VIVA for the month of February 2021**
+ BRAND was in 3 Receipts (3 items)
+ MISSION was in 2 Receipts (2 items)
+ VIVA was in 1 Receipt (1 item)
+ There were 115 receipts that contained items with no associated brand (186 items)

In [4]:
# Top 5 brands for the month before the most recent scanned month.
# Each receipt item is tagged with the month its receipt was scanned; months
# are ranked newest-first with DENSE_RANK, so month_rank = 2 is the second
# most recent month that has receipt items (not necessarily the calendar month
# immediately before the latest one). Items without a matching receipt get a
# NULL month, which NULLS LAST keeps out of the top ranks.
# NOTE: the month label printed below is hard-coded; update it if the data changes.
print("Top 5 Brands by receipts scanned and receipt items scanned")
print("February 2021")
conn.sql("""
    WITH receipt_dates AS (
        SELECT _receipt_id, date_scanned FROM receipts
    ),
    ranked_receipt_items AS (
        SELECT
            receipt_items._receipt_id,
            receipt_items._receipt_item_id,
            receipt_items.brand_code,
            DATE_TRUNC('month', date_scanned)::date AS ym,
            DENSE_RANK() OVER (ORDER BY ym DESC NULLS LAST) AS month_rank
        FROM receipt_items
        LEFT JOIN receipt_dates ON receipt_items._receipt_id = receipt_dates._receipt_id
    )
    SELECT
        brand_code,
        COUNT(DISTINCT _receipt_id) AS N_Receipts,
        COUNT(DISTINCT _receipt_item_id) AS N_Receipt_Items
    FROM ranked_receipt_items
    WHERE month_rank = 2
    GROUP BY brand_code
    -- Secondary keys make the order (and the LIMIT cut-off) deterministic on ties.
    ORDER BY N_Receipts DESC, N_Receipt_Items DESC, brand_code
    LIMIT 5
;""")

Top 5 Brands by receipts scanned and receipt items scanned
February 2021


┌────────────┬────────────┬─────────────────┐
│ brand_code │ N_Receipts │ N_Receipt_Items │
│  varchar   │   int64    │      int64      │
├────────────┼────────────┼─────────────────┤
│ NULL       │        115 │             186 │
│ BRAND      │          3 │               3 │
│ MISSION    │          2 │               2 │
│ VIVA       │          1 │               1 │
└────────────┴────────────┴─────────────────┘

# When considering average spend from receipts with 'rewardsReceiptStatus’ of ‘Accepted’ or ‘Rejected’, which is greater?
##### **The average spend is higher for FINISHED (Accepted) receipts.**
+ No status is named 'Accepted' in the data; assuming 'FINISHED' is the equivalent of 'Accepted'


In [5]:
# Sanity check before averaging: list any FINISHED / REJECTED receipts whose
# total_spent is missing.
missing_spend_rows = conn.sql("""
         SELECT total_spent, rewards_receipt_status FROM receipts
         WHERE (rewards_receipt_status = 'FINISHED'
         OR rewards_receipt_status = 'REJECTED')
         AND total_spent IS NULL
         """)
missing_spend_rows.show()

# Enumerate the status values that actually occur in the receipts table.
distinct_statuses = conn.sql("""
         SELECT DISTINCT rewards_receipt_status FROM receipts
         """)
distinct_statuses

┌───────────────┬────────────────────────┐
│  total_spent  │ rewards_receipt_status │
│ decimal(10,2) │        varchar         │
├───────────────┴────────────────────────┤
│                 0 rows                 │
└────────────────────────────────────────┘



┌────────────────────────┐
│ rewards_receipt_status │
│        varchar         │
├────────────────────────┤
│ FINISHED               │
│ SUBMITTED              │
│ PENDING                │
│ FLAGGED                │
│ REJECTED               │
└────────────────────────┘

In [6]:
# Average total_spent per status for FINISHED vs REJECTED receipts, highest
# average first.
# NOTE: the headline printed below is a hard-coded conclusion, not derived
# from the query result; re-check it if the data changes.
print("The average spend is higher for FINISHED (Accepted) receipts.")
conn.sql("""
         SELECT AVG(total_spent) AS average_spend, rewards_receipt_status
         FROM receipts
         WHERE rewards_receipt_status IN ('FINISHED', 'REJECTED')
         GROUP BY rewards_receipt_status
         ORDER BY average_spend DESC
         """
         )

The average spend is higher for FINISHED (Accepted) receipts.


┌───────────────────┬────────────────────────┐
│   average_spend   │ rewards_receipt_status │
│      double       │        varchar         │
├───────────────────┼────────────────────────┤
│ 80.85430501930502 │ FINISHED               │
│ 23.32605633802817 │ REJECTED               │
└───────────────────┴────────────────────────┘

# When considering total number of items purchased from receipts with 'rewardsReceiptStatus’ of ‘Accepted’ or ‘Rejected’, which is greater?
##### **The total number of items purchased is higher for FINISHED (Accepted) receipts.**
+ No status is named 'Accepted' in the data; assuming 'FINISHED' is the equivalent of 'Accepted'
+ Assuming "total number of items purchased" refers to purchasedItemCount in receipts.json, not the number of entries in the receipt's rewardsReceiptItemList
    + The length of a receipt's rewards receipt item list is not always equal to the total number of items purchased on that receipt

In [7]:
# Sanity check before summing: list any FINISHED / REJECTED receipts whose
# purchased_item_count is missing or negative.
invalid_item_counts = conn.sql("""
         SELECT purchased_item_count, rewards_receipt_status FROM receipts
         WHERE (rewards_receipt_status = 'FINISHED'
                OR rewards_receipt_status = 'REJECTED')
         AND (purchased_item_count IS NULL
              OR purchased_item_count < 0)
         """)
invalid_item_counts.show()

┌──────────────────────┬────────────────────────┐
│ purchased_item_count │ rewards_receipt_status │
│        int32         │        varchar         │
├──────────────────────┴────────────────────────┤
│                    0 rows                     │
└───────────────────────────────────────────────┘



In [8]:
# Total purchased_item_count per status for FINISHED vs REJECTED receipts,
# largest total first.
# NOTE: the headline printed below is a hard-coded conclusion, not derived
# from the query result; re-check it if the data changes.
print("The total number of items purchased is higher for FINISHED (Accepted) receipts.")
conn.sql("""
         SELECT SUM(purchased_item_count) AS N_Items_Purchased, rewards_receipt_status
         FROM receipts
         WHERE rewards_receipt_status IN ('FINISHED', 'REJECTED')
         GROUP BY rewards_receipt_status
         ORDER BY N_Items_Purchased DESC
         """
         )

The total number of items purchased is higher for FINISHED (Accepted) receipts.


┌───────────────────┬────────────────────────┐
│ N_Items_Purchased │ rewards_receipt_status │
│      int128       │        varchar         │
├───────────────────┼────────────────────────┤
│              8184 │ FINISHED               │
│               173 │ REJECTED               │
└───────────────────┴────────────────────────┘

# Which brand has the most spend among users who were created within the past 6 months?

##### **Anchoring in 2025, no brands have any spend among users created within the past 6 months**

In [9]:
# Users created on or after the point six months before the start of the
# current month. The cutoff is computed from now(), so the result depends on
# when the notebook is run.
users_created_last_6mo = conn.sql("""
         SELECT _user_id, created_date, active, role FROM users
         WHERE created_date >= DATE_TRUNC('month', now()) - interval '6 month'
         """)
users_created_last_6mo

┌──────────┬──────────────┬─────────┬─────────┐
│ _user_id │ created_date │ active  │  role   │
│ varchar  │  timestamp   │ boolean │ varchar │
├──────────┴──────────────┴─────────┴─────────┤
│                   0 rows                    │
└─────────────────────────────────────────────┘

##### **Anchoring between 2020-09-01 and 2021-03-01, Ben and Jerry's has the most spend**
+ Since the max date_scanned in the receipts JSON was 2021-03-01, I am choosing to anchor my answer as of March 2021.

In [10]:
# Brand with the highest summed final_price among receipt items belonging to
# users created between 2020-09-01 and 2021-03-01.
# Open questions (not applied here):
#   - Should users with role = 'fetch-staff' be considered?
#   - Should active = true be considered?
#   - Where final_price is NULL, only user_flagged_price seems to be filled;
#     COALESCE(final_price, user_flagged_price) could be summed instead.
#     It does not matter for the current data.
# NOTE(review): created_date is a timestamp, so the '2021-03-01' upper bound
# means midnight at the start of that day; users created later on 2021-03-01
# are excluded. Confirm this is the intended window.
conn.sql("""
         WITH receipt_item_user_6mo AS (
            SELECT brand_code, final_price
            FROM receipt_items
            WHERE receipt_items._user_id IN (
                SELECT _user_id
                FROM users
                WHERE created_date <= '2021-03-01'
                AND created_date >= '2020-09-01'
            )
         )
         SELECT brand_code, ROUND(SUM(final_price::numeric), 2) as total_spend
         FROM receipt_item_user_6mo
         WHERE brand_code IS NOT NULL
         GROUP BY brand_code
         -- NULLS LAST keeps brands with no priced items from winning;
         -- brand_code makes the LIMIT 1 pick deterministic on ties.
         ORDER BY total_spend DESC NULLS LAST, brand_code
         LIMIT 1

;"""
)

┌────────────────┬───────────────┐
│   brand_code   │  total_spend  │
│    varchar     │ decimal(38,2) │
├────────────────┼───────────────┤
│ BEN AND JERRYS │       1217.40 │
└────────────────┴───────────────┘

# Which brand has the most transactions among users who were created within the past 6 months?
##### **Brand code 'BRAND' has the most transactions among users created within the past 6 months**
+ Assuming one receipt is one transaction (rather than one receipt item being one transaction)

In [11]:
# Brand appearing on the most distinct receipts among receipt items belonging
# to users created between 2020-09-01 and 2021-03-01.
# Open questions (not applied here):
#   - Should users with role = 'fetch-staff' be considered?
#   - Should active = true be considered?
conn.sql("""
         WITH receipt_item_user_6mo AS (
            SELECT _receipt_id, brand_code
            FROM receipt_items
            WHERE receipt_items._user_id IN (
                SELECT _user_id
                FROM users
                WHERE created_date <= '2021-03-01'
                AND created_date >= '2020-09-01'
            )
         )
         SELECT brand_code, COUNT(DISTINCT _receipt_id) as N_Transactions
         FROM receipt_item_user_6mo
         WHERE brand_code IS NOT NULL
         GROUP BY brand_code
         -- brand_code makes the LIMIT 1 pick deterministic on ties.
         ORDER BY N_Transactions DESC, brand_code
         LIMIT 1

;"""
)

┌────────────┬────────────────┐
│ brand_code │ N_Transactions │
│  varchar   │     int64      │
├────────────┼────────────────┤
│ BRAND      │             20 │
└────────────┴────────────────┘

In [12]:
# Close the DuckDB connection now that all queries have run.
conn.close()