In [1]:
import os
from sqlalchemy import create_engine
import pandas as pd
import pickle
from urllib.parse import quote_plus
from dotenv import load_dotenv

load_dotenv()

password = os.getenv("POSTGRES_PASSWORD")
username = os.getenv("POSTGRES_USERNAME")
host = os.getenv("POSTGRES_HOST")
port = os.getenv("POSTGRES_PORT")
db = os.getenv("POSTGRES_DB")

password_encoded = quote_plus(password)

# Load your pickle file
with open("/Users/evro/Documents/code/python/fetch/data/validated/users.pkl", "rb") as f:
    users = pickle.load(f)
with open("/Users/evro/Documents/code/python/fetch/data/validated/receipts.pkl", "rb") as f:
    receipts = pickle.load(f)
with open("/Users/evro/Documents/code/python/fetch/data/validated/receipt_item_data.pkl", "rb") as f:
    receipt_items = pickle.load(f)
with open("/Users/evro/Documents/code/python/fetch/data/validated/brands.pkl", "rb") as f:
    brands = pickle.load(f)

# Create a SQLAlchemy engine tied to the same connection
engine = create_engine(f"postgresql+psycopg2://{username}:{password_encoded}@{host}:{port}/{db}")

# Write the DataFrames to PostgreSQL tables.
users.to_sql('users', engine, index=False, if_exists='replace')
receipts.to_sql('receipts', engine, index=False, if_exists='replace')
receipt_items.to_sql('receipt_items', engine, index=False, if_exists='replace')
brands.to_sql('brands', engine, index=False, if_exists='replace')

167

In [2]:
# Run a query using pandas
query_one = """
WITH recent_month AS (
  SELECT
    DATE( MAX("date_scanned")) AS start_date,
    DATE( MAX("date_scanned")) + INTERVAL '1 month' AS end_date
  FROM receipts
)
SELECT
  b.name AS brand_name,
  COUNT(DISTINCT r.receipt_id) AS receipt_count
FROM receipts r
JOIN receipt_items ri ON r.receipt_id = ri.receipt_id
JOIN brands b ON ri.brand_code = b.brand_code
JOIN recent_month rm
    ON DATE(r.date_scanned) >= rm.start_date
    AND DATE(r.date_scanned) < rm.end_date
GROUP BY b.name
ORDER BY receipt_count DESC
LIMIT 5;
"""

pd.read_sql_query(query_one, engine)

Unnamed: 0,brand_name,receipt_count


In [3]:
pd.read_sql_query("""
SELECT
  b.name AS brand_name,
  COUNT(DISTINCT r.receipt_id) AS receipt_count,
  r.date_scanned as dates
FROM receipts r
JOIN receipt_items ri ON r.receipt_id = ri.receipt_id
JOIN brands b ON ri.brand_code = b.brand_code

GROUP BY b.name, dates
ORDER BY dates DESC
""", engine)

Unnamed: 0,brand_name,receipt_count,dates
0,Viva,1,2021-02-10
1,Huggies,1,2021-01-29
2,Huggies,1,2021-01-28
3,Cheetos,4,2021-01-25
4,Classico,3,2021-01-25
...,...,...,...
103,KNORR,1,2021-01-14
104,Kleenex,2,2021-01-14
105,Doritos,1,2021-01-06
106,Kleenex,1,2021-01-06


In [4]:
pd.read_sql_query(
    """
    WITH latest_month AS (
      SELECT DATE_TRUNC('month', MAX(r.date_scanned)) AS most_recent_month
      FROM receipts r
      JOIN receipt_items ri ON r.receipt_id = ri.receipt_id
      JOIN brands b ON ri.brand_code = b.brand_code
    )
    SELECT
      b.name AS brand_name,
      COUNT(DISTINCT r.receipt_id) AS receipt_count
    FROM receipts r
    JOIN receipt_items ri ON r.receipt_id = ri.receipt_id
    JOIN brands b ON ri.brand_code = b.brand_code
    WHERE DATE_TRUNC('month', r.date_scanned) = (SELECT most_recent_month FROM latest_month)
    GROUP BY b.name
    ORDER BY receipt_count DESC
    LIMIT 5;
    """, engine
)

Unnamed: 0,brand_name,receipt_count
0,Viva,1


In [5]:
query_two = """
WITH recent AS (
  SELECT b.name AS brand_name,
         COUNT(DISTINCT r.receipt_id) AS receipt_count,
         'recent' AS period
  FROM receipts r
  JOIN receipt_items ri ON r.receipt_id = ri.receipt_id
  JOIN brands b ON ri.brand_code = b.brand_code
  WHERE r.date_scanned >= date_trunc('month', current_date) - interval '1 month'
    AND r.date_scanned < date_trunc('month', current_date)
  GROUP BY b.name
),
previous AS (
  SELECT b.name AS brand_name,
         COUNT(DISTINCT r.receipt_id) AS receipt_count,
         'previous' AS period
  FROM receipts r
  JOIN receipt_items ri ON r.receipt_id = ri.receipt_id
  JOIN brands b ON ri.brand_code = b.brand_code
  WHERE r.date_scanned >= date_trunc('month', current_date) - interval '2 month'
    AND r.date_scanned < date_trunc('month', current_date) - interval '1 month'
  GROUP BY b.name
),
combined AS (
  SELECT * FROM recent
  UNION ALL
  SELECT * FROM previous
)
SELECT period,
       brand_name,
       receipt_count,
       RANK() OVER (PARTITION BY period ORDER BY receipt_count DESC) AS rank_position
FROM combined
ORDER BY period, rank_position;

"""

pd.read_sql_query(query_two, engine)

Unnamed: 0,period,brand_name,receipt_count,rank_position


In [10]:
five = """
    SELECT
      b.name AS brand_name,
      SUM(r.total_spent) AS total_spend
    FROM receipts r
    JOIN users u ON r.user_id::text = u.user_id
    JOIN receipt_items ri ON r.receipt_id = ri.receipt_id
    JOIN brands b ON ri.brand_code = b.brand_code
    WHERE u.created_date >= current_date - interval '6 month'
    GROUP BY b.name
    ORDER BY total_spend DESC
    LIMIT 1;

"""
pd.read_sql_query(five, engine)


ProgrammingError: (psycopg2.errors.UndefinedFunction) operator does not exist: text >= timestamp without time zone
LINE 9:     WHERE u.created_date >= current_date - interval '6 month...
                                 ^
HINT:  No operator matches the given name and argument types. You might need to add explicit type casts.

[SQL: 
    SELECT
      b.name AS brand_name,
      SUM(r.total_spent) AS total_spend
    FROM receipts r
    JOIN users u ON r.user_id::text = u.user_id
    JOIN receipt_items ri ON r.receipt_id = ri.receipt_id
    JOIN brands b ON ri.brand_code = b.brand_code
    WHERE u.created_date >= current_date - interval '6 month'
    GROUP BY b.name
    ORDER BY total_spend DESC
    LIMIT 1;

]
(Background on this error at: https://sqlalche.me/e/20/f405)

Unnamed: 0,period,brand_name,receipt_count,rank_position


In [16]:
query_six = """
SELECT b.name AS brand_name,
       SUM(r.total_spent) AS total_spend
FROM receipts r
JOIN users u ON r.user_id = u.user_id
JOIN receipt_items ri ON r.receipt_id = ri.receipt_id
JOIN brands b ON ri.brand_code = b.brand_code
WHERE u.create_date >= current_date - interval '6 month'
GROUP BY b.name
ORDER BY total_spend DESC
LIMIT 1;

"""

pd.read_sql_query(query_six, engine)

ProgrammingError: (psycopg2.errors.UndefinedFunction) operator does not exist: double precision = text
LINE 5: JOIN users u ON r.user_id = u.user_id
                                  ^
HINT:  No operator matches the given name and argument types. You might need to add explicit type casts.

[SQL: 
SELECT b.name AS brand_name,
       SUM(r.total_spent) AS total_spend
FROM receipts r
JOIN users u ON r.user_id = u.user_id
JOIN receipt_items ri ON r.receipt_id = ri.receipt_id
JOIN brands b ON ri.brand_code = b.brand_code
WHERE u.create_date >= current_date - interval '6 month'
GROUP BY b.name
ORDER BY total_spend DESC
LIMIT 1;

]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [11]:
q1 = """
SELECT b.name AS brand_name, COUNT(r.receipt_id) AS receipts_scanned
FROM receipts r
JOIN receipt_items ri ON r.receipt_id = ri.receipt_id
JOIN brands b ON ri.brand_code = b.brand_code
WHERE r.purchase_date >= DATE_TRUNC('month', CURRENT_DATE - INTERVAL '1 month')
  AND r.purchase_date < DATE_TRUNC('month', CURRENT_DATE)
GROUP BY b.name
ORDER BY receipts_scanned DESC
LIMIT 5;

"""

pd.read_sql_query(
    q1, engine
)

Unnamed: 0,brand_name,receipts_scanned


In [12]:
q2 = """
WITH current_month AS (
    SELECT b.name AS brand_name, COUNT(r.receipt_id) AS receipts_scanned
    FROM receipts r
    JOIN receipt_items ri ON r.receipt_id = ri.receipt_id
    JOIN brands b ON ri.brand_code = b.brand_code
    WHERE r.purchase_date >= DATE_TRUNC('month', CURRENT_DATE - INTERVAL '1 month')
      AND r.purchase_date < DATE_TRUNC('month', CURRENT_DATE)
    GROUP BY b.name
    ORDER BY receipts_scanned DESC
    LIMIT 5
),
previous_month AS (
    SELECT b.name AS brand_name, COUNT(r.receipt_id) AS receipts_scanned
    FROM receipts r
    JOIN receipt_items ri ON r.receipt_id = ri.receipt_id
    JOIN brands b ON ri.brand_code = b.brand_code
    WHERE r.purchase_date >= DATE_TRUNC('month', CURRENT_DATE - INTERVAL '2 month')
      AND r.purchase_date < DATE_TRUNC('month', CURRENT_DATE - INTERVAL '1 month')
    GROUP BY b.name
    ORDER BY receipts_scanned DESC
    LIMIT 5
)
SELECT cm.brand_name AS current_brand, cm.receipts_scanned AS current_count,
       pm.brand_name AS previous_brand, pm.receipts_scanned AS previous_count
FROM current_month cm
FULL OUTER JOIN previous_month pm ON cm.brand_name = pm.brand_name;

"""

pd.read_sql_query(q2, engine)

Unnamed: 0,current_brand,current_count,previous_brand,previous_count


In [13]:
q3 = """
SELECT rewards_receipt_status, AVG(total_spent) AS avg_spent
FROM receipts
WHERE rewards_receipt_status IN ('FINISHED', 'REJECTED')
GROUP BY rewards_receipt_status;

"""

pd.read_sql_query(q3, engine)

Unnamed: 0,rewards_receipt_status,avg_spent
0,REJECTED,23.326056
1,FINISHED,80.854305


In [14]:
q4 = """
SELECT rewards_receipt_status, SUM(purchased_item_count) AS total_items
FROM receipts
WHERE rewards_receipt_status IN ('FINISHED', 'REJECTED')
GROUP BY rewards_receipt_status;

"""
pd.read_sql_query(q4, engine)

Unnamed: 0,rewards_receipt_status,total_items
0,REJECTED,173.0
1,FINISHED,8184.0


In [15]:
q5 = """
SELECT b.name AS brand_name, SUM(ri.final_price) AS total_spend
FROM users u
JOIN receipts r ON u.user_id = r.user_id
JOIN receipt_items ri ON r.receipt_id = ri.receipt_id
JOIN brands b ON ri.brand_code = b.brand_code
WHERE u.create_date >= CURRENT_DATE - INTERVAL '6 months'
GROUP BY b.name
ORDER BY total_spend DESC
LIMIT 1;
"""

pd.read_sql_query(q5, engine)

ProgrammingError: (psycopg2.errors.UndefinedFunction) operator does not exist: text = double precision
LINE 4: JOIN receipts r ON u.user_id = r.user_id
                                     ^
HINT:  No operator matches the given name and argument types. You might need to add explicit type casts.

[SQL: 
SELECT b.name AS brand_name, SUM(ri.final_price) AS total_spend
FROM users u
JOIN receipts r ON u.user_id = r.user_id
JOIN receipt_items ri ON r.receipt_id = ri.receipt_id
JOIN brands b ON ri.brand_code = b.brand_code
WHERE u.create_date >= CURRENT_DATE - INTERVAL '6 months'
GROUP BY b.name
ORDER BY total_spend DESC
LIMIT 1;
]
(Background on this error at: https://sqlalche.me/e/20/f405)