In [1]:
import pandas as pd
from pandasql import sqldf
from datetime import datetime, timedelta

In [2]:
users = pd.read_csv("Updated_Datasets/USER_CLEANED.csv")
transactions = pd.read_csv("Updated_Datasets/TRANSACTION_CLEANED.csv")
products = pd.read_csv("Updated_Datasets/PRODUCTS_CLEANED.csv")

In [3]:
# Function to run SQL queries on pandas DataFrames
def pysqldf(q):
    return sqldf(q, globals())

# What are the top 5 brands by receipts scanned among users 21 and over?

In [4]:
# SQL query to find the top 5 brands by receipts scanned among users 21 and over
query = """
SELECT p.BRAND, COUNT(DISTINCT t.RECEIPT_ID) AS receipt_count
FROM users u
JOIN transactions t ON u.ID = t.USER_ID
JOIN products p ON t.BARCODE = p.BARCODE
WHERE u.AGE >= 21 
  AND p.BRAND IS NOT NULL 
  AND p.BRAND <> ''
GROUP BY p.BRAND
ORDER BY receipt_count DESC
LIMIT 5;
"""

# Execute the query
top_5_brands_by_receipts = pysqldf(query)
print(top_5_brands_by_receipts)

             BRAND  receipt_count
0      NERDS CANDY              3
1             DOVE              3
2          TRIDENT              2
3  SOUR PATCH KIDS              2
4           MEIJER              2


# What are the top 5 brands by sales among users that have had their account for at least six months?

In [5]:
# SQL query to find the top 5 brands by sales among users with accounts at least six months old
query = """
SELECT p.BRAND, SUM(t.FINAL_SALE) AS total_sales
FROM users u
JOIN transactions t ON u.ID = t.USER_ID
JOIN products p ON t.BARCODE = p.BARCODE
WHERE julianday('now') - julianday(u.CREATED_DATE) >= 180
GROUP BY p.BRAND
ORDER BY total_sales DESC
LIMIT 5;
"""
# Execute the query
top_5_brands_by_sales = pysqldf(query)
print(top_5_brands_by_sales)

         BRAND  total_sales
0          CVS        72.00
1         DOVE        30.91
2      TRIDENT        23.36
3  COORS LIGHT        17.48
4     TRESEMMÉ        14.58


In [10]:
print("Users columns:", users.columns.tolist())

Users columns: ['ID', 'CREATED_DATE', 'BIRTH_DATE', 'STATE', 'LANGUAGE', 'GENDER', 'ACCOUNT_AGE', 'AGE', 'CREATED_YEAR', 'CREATED_MONTH', 'CREATED_DAYOFWEEK']


# At what percent has Fetch grown year over year?

In [8]:
# SQL query using the 'CREATED_YEAR' column from USERS CLEANED dataset for Year on Year Growth
query = """
SELECT 
    u1.CREATED_YEAR AS year,
    COUNT(u1.ID) AS current_year_users,
    (SELECT COUNT(u2.ID)
     FROM users u2
     WHERE u2.CREATED_YEAR = u1.CREATED_YEAR - 1) AS previous_year_users,
    (CAST(COUNT(u1.ID) AS FLOAT) - (SELECT COUNT(u2.ID)
                                    FROM users u2
                                    WHERE u2.CREATED_YEAR = u1.CREATED_YEAR - 1)) 
    / NULLIF((SELECT COUNT(u2.ID)
              FROM users u2
              WHERE u2.CREATED_YEAR = u1.CREATED_YEAR - 1), 0) * 100 AS yoy_growth_percent
FROM users u1
GROUP BY u1.CREATED_YEAR
ORDER BY u1.CREATED_YEAR;
"""

# Execute the query
yoy_growth = pysqldf(query)
print(yoy_growth)

    year  current_year_users  previous_year_users  yoy_growth_percent
0   2014                  30                    0                 NaN
1   2015                  51                   30           70.000000
2   2016                  70                   51           37.254902
3   2017                 644                   70          820.000000
4   2018                2168                  644          236.645963
5   2019                7093                 2168          227.167897
6   2020               16883                 7093          138.023403
7   2021               19159                16883           13.481016
8   2022               26807                19159           39.918576
9   2023               15464                26807          -42.313575
10  2024               11631                15464          -24.786601


In [9]:
# SQL query to find the leading brand in the "Dips & Salsa" category
query = """
SELECT p.BRAND, SUM(t.FINAL_SALE) AS Total_Sales
FROM transactions t
JOIN products p ON t.BARCODE = p.BARCODE
WHERE p.CATEGORY_1 = 'Dips & Salsa' OR p.CATEGORY_2 = 'Dips & Salsa' OR
      p.CATEGORY_3 = 'Dips & Salsa' OR p.CATEGORY_4 = 'Dips & Salsa'
GROUP BY p.BRAND
ORDER BY Total_Sales DESC
LIMIT 1;
"""

# Execute the query
leading_brand_dips_salsa = pysqldf(query)
print(leading_brand_dips_salsa)

      BRAND  Total_Sales
0  TOSTITOS        181.3
