In [3]:
import pandas as pd
from pandasql import sqldf
from datetime import datetime, timedelta

In [4]:
users = pd.read_csv("Updated_Datasets/USER_CLEANED.csv")
transactions = pd.read_csv("Updated_Datasets/TRANSACTION_CLEANED.csv")
products = pd.read_csv("Updated_Datasets/PRODUCTS_CLEANED.csv")

In [5]:
# Function to run SQL queries on pandas DataFrames
def pysqldf(q):
    return sqldf(q, globals())

# What are the top 5 brands by receipts scanned among users 21 and over?

In [6]:
# SQL query to find the top 5 brands by receipts scanned among users 21 and over
query = """
SELECT p.BRAND, COUNT(DISTINCT t.RECEIPT_ID) AS receipt_count
FROM users u
JOIN transactions t ON u.ID = t.USER_ID
JOIN products p ON t.BARCODE = p.BARCODE
WHERE u.AGE >= 21 
  AND p.BRAND IS NOT NULL 
  AND p.BRAND <> ''
GROUP BY p.BRAND
ORDER BY receipt_count DESC
LIMIT 5;
"""


top_5_brands_by_receipts = pysqldf(query)
top_5_brands_by_receipts

Unnamed: 0,BRAND,receipt_count
0,NERDS CANDY,3
1,DOVE,3
2,TRIDENT,2
3,SOUR PATCH KIDS,2
4,MEIJER,2


In [13]:
top_5_brands_by_receipts.to_csv("SQL_Outputs/top_5_brands_by_receipts.csv", index = False)

# What are the top 5 brands by sales among users that have had their account for at least six months?

In [7]:
# SQL query to find the top 5 brands by sales among users with accounts at least six months old
query = """
SELECT p.BRAND, SUM(t.FINAL_SALE) AS total_sales
FROM users u
JOIN transactions t ON u.ID = t.USER_ID
JOIN products p ON t.BARCODE = p.BARCODE
WHERE julianday('now') - julianday(u.CREATED_DATE) >= 180
GROUP BY p.BRAND
ORDER BY total_sales DESC
LIMIT 5;
"""

top_5_brands_by_sales = pysqldf(query)
top_5_brands_by_sales

Unnamed: 0,BRAND,total_sales
0,CVS,72.0
1,DOVE,30.91
2,TRIDENT,23.36
3,COORS LIGHT,17.48
4,TRESEMMÉ,14.58


In [15]:
top_5_brands_by_sales.to_csv("SQL_Outputs/top_5_brands_by_sales.csv", index = False)

In [8]:
print("Users columns:", users.columns.tolist())

Users columns: ['ID', 'CREATED_DATE', 'BIRTH_DATE', 'STATE', 'LANGUAGE', 'GENDER', 'ACCOUNT_AGE', 'AGE', 'CREATED_YEAR', 'CREATED_MONTH', 'CREATED_DAYOFWEEK']


# At what percent has Fetch grown year over year?

In [9]:
# SQL query using the 'CREATED_YEAR' column from USERS CLEANED dataset for Year on Year Growth
query = """
SELECT 
    u1.CREATED_YEAR AS year,
    COUNT(u1.ID) AS current_year_users,
    (SELECT COUNT(u2.ID)
     FROM users u2
     WHERE u2.CREATED_YEAR = u1.CREATED_YEAR - 1) AS previous_year_users,
    (CAST(COUNT(u1.ID) AS FLOAT) - (SELECT COUNT(u2.ID)
                                    FROM users u2
                                    WHERE u2.CREATED_YEAR = u1.CREATED_YEAR - 1)) 
    / NULLIF((SELECT COUNT(u2.ID)
              FROM users u2
              WHERE u2.CREATED_YEAR = u1.CREATED_YEAR - 1), 0) * 100 AS yoy_growth_percent
FROM users u1
GROUP BY u1.CREATED_YEAR
ORDER BY u1.CREATED_YEAR;
"""


yoy_growth = pysqldf(query)
yoy_growth

Unnamed: 0,year,current_year_users,previous_year_users,yoy_growth_percent
0,2014,30,0,
1,2015,51,30,70.0
2,2016,70,51,37.254902
3,2017,644,70,820.0
4,2018,2168,644,236.645963
5,2019,7093,2168,227.167897
6,2020,16883,7093,138.023403
7,2021,19159,16883,13.481016
8,2022,26807,19159,39.918576
9,2023,15464,26807,-42.313575


In [17]:
yoy_growth.to_csv("SQL_Outputs/yoy_growth.csv", index = False)

In [10]:
# SQL query to find the leading brand in the "Dips & Salsa" category
query = """
SELECT p.BRAND, SUM(t.FINAL_SALE) AS Total_Sales
FROM transactions t
JOIN products p ON t.BARCODE = p.BARCODE
WHERE p.CATEGORY_1 = 'Dips & Salsa' OR p.CATEGORY_2 = 'Dips & Salsa' OR
      p.CATEGORY_3 = 'Dips & Salsa' OR p.CATEGORY_4 = 'Dips & Salsa'
GROUP BY p.BRAND
ORDER BY Total_Sales DESC
LIMIT 1;
"""

# Execute the query
leading_brand_dips_salsa = pysqldf(query)
print(leading_brand_dips_salsa)

      BRAND  Total_Sales
0  TOSTITOS        181.3


In [11]:
# Data Quality issue regarding purchase dates and scan dates

query = """
SELECT *
FROM transactions
WHERE purchase_date > scan_date
   OR purchase_date < '2000-01-01' 
   OR purchase_date > '2099-12-31'
   OR scan_date < '2000-01-01'
   OR scan_date > '2099-12-31';
   """

abc = pysqldf(query)
abc

Unnamed: 0,RECEIPT_ID,PURCHASE_DATE,SCAN_DATE,STORE_NAME,USER_ID,BARCODE,FINAL_QUANTITY,FINAL_SALE
0,5c10e8df-9a9b-4ce0-bfed-51ab7adf2eb5,2024-07-06,2024-07-05 13:18:03.207 Z,DOLLAR GENERAL STORE,6230b0508096d0349b6c86f7,80878195749.0,1.0,10.0
1,05023b3d-5f83-47a7-a17c-8e8521d0bc94,2024-09-08,2024-09-07 22:22:29.903 Z,SHOP RITE,666a43c77c0469953bfd9ae0,64144041640.0,2.0,2.98
2,bfa85c94-a644-4f81-8a83-90ade603cf31,2024-06-15,2024-06-14 21:26:00.636 Z,WALMART,5dc24cdb682fcf1229d04bd6,78742236964.0,1.0,0.47
3,878fe0e9-42e5-4de2-b9d1-da82ecf25db1,2024-06-29,2024-06-28 20:04:50.390 Z,DOLLAR GENERAL STORE,64761a27a55bb77a0e27a417,732748017254.0,1.0,1.25
4,801eeda5-e4fe-41e1-97c2-03026765cc47,2024-08-31,2024-08-30 19:27:02.783 Z,WALMART,5dc24cdb682fcf1229d04bd6,21000658831.0,1.0,1.24
5,9abbb4f5-ac03-465f-97a2-13158fdf4043,2024-06-16,2024-06-15 08:11:23.497 Z,CVS,62fbe32f92710024ad021b5e,50428332221.0,1.0,5.99
6,c33e6c37-6ef2-4b4c-8b74-f96edce24995,2024-06-26,2024-06-25 22:03:31.809 Z,WALMART,6335c6fb0c625b72ae9d1af6,38000138638.0,1.0,2.0
7,2e841c20-2560-484e-b363-6b2b2f7530a6,2024-09-05,2024-09-04 21:03:53.240 Z,MACEY'S,5e8cec07f70de813e00885d6,38000199240.0,1.0,4.99
8,9c4f0e47-b911-4759-8b7a-ff07a89c4d0b,2024-06-22,2024-06-21 19:59:42.685 Z,WALMART,615ca042c2fb932121380f5d,49000012781.0,1.0,11.78
9,706ce3f7-5e22-49d0-92c6-3baf3c7b72bd,2024-06-27,2024-06-26 21:57:07.771 Z,WALMART,634aee03305e373439460ac3,51000025500.0,1.0,1.52
