### Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time

### Woolworths weekly catalogue scraper

In [3]:
# api endpoint to get category list (use "view" endpoint)
url_view = "https://embed.salefinder.com.au/productlist/view/62467/"

# parameters for the request
params = {
    'locationId': '4679',
    'token': '570f5c4a44505b5f51477f531a03180a0e0b1c1362352b2e21363226253968717d7a787d6468626562612b',
    'saleGroup': '0',
    'rows_per_page': '10',
}

# headers to make our request look like a real browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': '*/*',
}

# fetch the main catalogue page
response = requests.get(url_view, params=params, headers=headers)
json_text = response.text.strip()[1:-1]
data = json.loads(json_text)

print(f"Fetched: {data['saleName']}")
print(f"Valid: {data['startDate']} to {data['endDate']}\n")

# parse the HTML to extract categories
soup = BeautifulSoup(data['content'], 'html.parser')
categories = {}

for link in soup.find_all('a', class_='sf-navcategory-link'):
    href = link.get('href', '')
    if 'categoryId=' in href:
        category_id = href.split('categoryId=')[1].split('&')[0]
        category_name = link.text.strip()
        categories[category_id] = category_name

Fetched: Weekly Catalogue NSW
Valid: 2025-12-03T00:00:00 to 2025-12-09T23:59:59



In [11]:
url_category = "https://embed.salefinder.com.au/productlist/category/62467/"

# list to store ALL products from ALL categories
all_products = []

# loop through each category
for category_id, category_name in categories.items():
    
    # parameters for this specific category
    params = {
        'locationId': '4679',
        'token': '570f5c4a44505b5f51477f531a03180a0e0b1c1362352b2e21363226253968717d7a787d6468626562612b',
        'saleGroup': '0',
        'categoryId': category_id, 
        'rows_per_page': '500',
    }
    
    # api request
    response = requests.get(url_category, params=params, headers=headers)
    json_text = response.text.strip()[1:-1]
    data = json.loads(json_text)
    
    # parse
    soup = BeautifulSoup(data['content'], 'html.parser')
    
    # extract products
    for product in soup.find_all('div', class_='shelfProductStamp'):
        name_tag = product.find('span', class_='sf-item-heading')
        sale_price_tag = product.find('span', class_='sf-pricedisplay')
        savings_tag = product.find('span', class_='sf-regprice')
        
        if not name_tag or not sale_price_tag:
            continue
        
        name = name_tag.text.strip()
        sale_price = sale_price_tag.text.strip()
        stock_code = product.get('data-stockcode')
        
        sale_value = float(sale_price.replace('$', '').replace(',', ''))    
            
        if savings_tag:
            savings = savings_tag.text.strip()
            savings_value = float(savings.replace('$', ''))
            original_value = sale_value + savings_value
        else:
            savings_value = 0.0
            original_value = sale_value
        
        all_products.append({
            'category': category_name,
            'stock_code': stock_code,
            'name': name,
            'sale_price': sale_value,
            'original_price': original_value,
            'savings': savings_value
        })
    
    time.sleep(0.3) 

# Create DataFrame
df = pd.DataFrame(all_products)

In [12]:
df = pd.DataFrame(all_products)

df.groupby('category').first().reset_index()

Unnamed: 0,category,stock_code,name,sale_price,original_price,savings
0,Baby,20391.0,Huggies Baby Wipes Pk 72-80,2.5,5.0,2.5
1,Bakery,91895.0,Woolworths Large Pavlova Base,7.5,7.5,0.0
2,Baking,893892.0,Betty Crocker Cake or Cupcake Baking Mixes 370...,3.0,6.0,3.0
3,Beauty,6030444.0,The Kind Collective Mini Trio Sweet Fix,7.0,10.0,3.0
4,Biscuits & Snacks,384250.0,Arnott’s Jatz Crackers Biscuits 225g,2.0,4.0,2.0
5,Breakfast Foods,701706.0,Kellogg’s Nutri-Grain 290g or Froot Loops Cere...,3.5,7.0,3.5
6,Canned & Packet food,6022474.0,Thomas Dux Salmon Gravlax 100g,8.0,8.0,0.0
7,Clothing,6030298.0,Christmas Licensed Socks Pk 2 Assorted,7.0,10.0,3.0
8,Condiments,,"Woolworths Citrus, Honey & Mustard Glaze 150g",4.0,4.0,0.0
9,Confectionery,108635.0,Cadbury Christmas Stocking 180g,6.0,8.0,2.0


### Australia food guidelines

https://www.eatforhealth.gov.au/food-essentials/how-much-do-we-need-each-day/recommended-number-serves-adults

In [13]:
# converting the table above into a df
dietary_guidelines = {
    'Group': [
        'Men 19-50',
        'Men 51-70', 
        'Men 70+',
        'Women 19-50',
        'Women 51-70',
        'Women 70+',
        'Pregnant',
        'Lactating'
    ],
    'Vegetables & Legumes': [6, 5.5, 5, 5, 5, 5, 5, 7.5],
    'Fruit': [2, 2, 2, 2, 2, 2, 2, 2],
    'Grains (Wholegrain)': [6, 6, 4.5, 6, 4, 3, 8.5, 9],
    'Lean Meat & Alternatives': [3, 2.5, 2.5, 2.5, 2, 2, 3.5, 2.5],
    'Dairy (Reduced Fat)': [2.5, 2.5, 3.5, 2.5, 4, 4, 2.5, 2.5],
    'Additional Serves Range': ['0-3', '0-2.5', '0-2.5', '0-2.5', '0-2.5', '0-2', '0-2.5', '0-2.5']
}

df_guidelines = pd.DataFrame(dietary_guidelines)

df_guidelines

Unnamed: 0,Group,Vegetables & Legumes,Fruit,Grains (Wholegrain),Lean Meat & Alternatives,Dairy (Reduced Fat),Additional Serves Range
0,Men 19-50,6.0,2,6.0,3.0,2.5,0-3
1,Men 51-70,5.5,2,6.0,2.5,2.5,0-2.5
2,Men 70+,5.0,2,4.5,2.5,3.5,0-2.5
3,Women 19-50,5.0,2,6.0,2.5,2.5,0-2.5
4,Women 51-70,5.0,2,4.0,2.0,4.0,0-2.5
5,Women 70+,5.0,2,3.0,2.0,4.0,0-2
6,Pregnant,5.0,2,8.5,3.5,2.5,0-2.5
7,Lactating,7.5,2,9.0,2.5,2.5,0-2.5


### Categorising sale items into the australian reccomended food groups

In [14]:
import gensim.downloader as api
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# glove model
model = api.load('glove-wiki-gigaword-300')

# define non-food categories to ignore
NON_FOOD_CATEGORIES = [
    'Baby', 'Beauty', 'Toiletries', 'Household Cleaning', 
    'Home & Outdoor', 'Stationery & Media', 'Clothing', 'Pet care'
]

# define dietary guideline categories with some additional keywords
dietary_guidelines = {
    'Vegetables & Legumes': ['vegetables', 'vegetable', 'legumes', 'beans', 'peas', 'broccoli', 'carrot', 'salad', 'greens'],
    'Fruit': ['fruit', 'fruits', 'apple', 'banana', 'orange', 'berry', 'grape', 'melon'],
    'Grains (Wholegrain)': ['bread', 'rice', 'pasta', 'noodles', 'cereal', 'oats', 'grain', 'wheat', 'crackers', 'muffin', 'bagel', 'bun', 'roll', 'wrap', 'pita', 'tortilla'],
    'Lean Meat & Alternatives': ['meat', 'beef', 'chicken', 'pork', 'lamb', 'fish', 'salmon', 'tuna', 'eggs', 'pie', 'sausage', 'bacon', 'ham', 'protein'],
    'Dairy (Reduced Fat)': ['milk', 'cheese', 'yoghurt', 'yogurt', 'dairy', 'cream', 'butter'],
    'Discretionary': ['chocolate', 'candy', 'chips', 'biscuits', 'cookies', 'cake', 'icecream', 'soft-drink', 'lollies', 'snack']
}

def get_text_embedding(words):
    """
    convert a list of words into a single vector by averaging
    """
    vectors = []
    for word in words:
        word_clean = word.lower().strip()
        if word_clean in model:
            vectors.append(model[word_clean])
    
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(300)

def classify_product(woolworths_category, product_name):
    """
    combine Woolworths category + product name to classify into dietary guidelines
    """
    # check if this is a non-food category first
    if woolworths_category in NON_FOOD_CATEGORIES:
        return 'Non-Food', 1.0
    
    # combine category and product name into words
    combined_text = f"{woolworths_category} {product_name}"
    
    # tokenize
    words = []
    for word in combined_text.split():
        # remove numbers and short words
        if len(word) > 2 and not word.isdigit():
            words.append(word)
    
    # get embedding for this product
    product_embedding = get_text_embedding(words)
    
    # get embeddings for each dietary category
    best_category = None
    best_score = -1
    
    for category_name, keywords in dietary_guidelines.items():
        category_embedding = get_text_embedding(keywords)
        
        # calculate similarity
        similarity = cosine_similarity(
            product_embedding.reshape(1, -1),
            category_embedding.reshape(1, -1)
        )[0][0]
        
        if similarity > best_score:
            best_score = similarity
            best_category = category_name
    
    return best_category, best_score

In [15]:
df['dietary_category'] = df.apply(
    lambda row: classify_product(row['category'], row['name'])[0],
    axis=1
)

df['dietary_confidence'] = df.apply(
    lambda row: classify_product(row['category'], row['name'])[1],
    axis=1
)

In [16]:
df_food = df[df['dietary_category'] != 'Non-Food']
df_food.to_csv('food_items_classified_manual_inspect.csv', index=False)

In [17]:
df_food.groupby('dietary_category').first().reset_index()

Unnamed: 0,dietary_category,category,stock_code,name,sale_price,original_price,savings,dietary_confidence
0,Dairy (Reduced Fat),Bakery,91895,Woolworths Large Pavlova Base,7.5,7.5,0.0,0.335953
1,Discretionary,Bakery,552336,Woolworths Decorated Gingerbread House 70g,3.5,3.5,0.0,0.38528
2,Fruit,Bakery,6030831,Woolworths Luscious Berry Trifle 1.2 kg,25.0,25.0,0.0,0.463502
3,Grains (Wholegrain),Bakery,328164,Gold Roasted Almond Shortbread 500g,19.0,19.0,0.0,0.625547
4,Lean Meat & Alternatives,Breakfast Foods,786273,Carman’s Protein Bars 150-200g Pk 5,5.5,7.9,2.4,0.588123
5,Vegetables & Legumes,Biscuits & Snacks,800481,Maretti Bruschette Roasted Garlic 150g,3.0,3.0,0.0,0.7372


In [18]:
test = df_food.groupby('dietary_category').first().reset_index()

### Extracting key ingredient

In [20]:
import anthropic
import time
import os
from dotenv import load_dotenv

load_dotenv() 
client = anthropic.Anthropic(
    api_key=os.getenv("ANTHROPIC_API_KEY")
)

def extract_key_ingredient_claude(product_name):
    """
    use claude to extract the main ingredient from a product name
    """
    try:
        message = client.messages.create(
            model="claude-3-5-haiku-20241022", 
            max_tokens=20,
            messages=[{
                "role": "user",
                "content": f"""Extract ONLY the main food ingredient from this product name. This ingredient is what best describes the item.  
Return just one to three words in singular form, lowercase, no punctuation.

Examples:
- "Woolworths Free Range Eggs 700g Pk 12" → egg
- "Always Fresh Sicilian Olives Pitted 230g" → olive
- "Four'N Twenty Angus Pies 700g" → beef pie
- "Mr Whisk Vanilla Meringues 130g" → meringue
- "Golden Crumpet Rounds Pk 6" → crumpet
- "Philadelphia Original Cream Cheese Twin Block 500g" → cream cheese 
- "Farmers Union Greek Yogurt Pots 150g" → greek yogurt

Product: {product_name}

Main ingredient:"""
            }]
        )
        
        return message.content[0].text.strip().lower()
    
    except Exception as e:
        print(f"Error processing '{product_name}': {e}")
        return 'Unknown'

results = []
for idx, product_name in enumerate(df_food['name'], 1):
    ingredient = extract_key_ingredient_claude(product_name)
    results.append(ingredient)

df_food['key_ingredient'] = results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_food['key_ingredient'] = results


In [21]:
df_food.to_csv('base_table_food.csv', index=False)

In [22]:
df_food.groupby('dietary_category').first().reset_index()

Unnamed: 0,dietary_category,category,stock_code,name,sale_price,original_price,savings,dietary_confidence,key_ingredient
0,Dairy (Reduced Fat),Bakery,91895,Woolworths Large Pavlova Base,7.5,7.5,0.0,0.335953,pavlova
1,Discretionary,Bakery,552336,Woolworths Decorated Gingerbread House 70g,3.5,3.5,0.0,0.38528,gingerbread
2,Fruit,Bakery,6030831,Woolworths Luscious Berry Trifle 1.2 kg,25.0,25.0,0.0,0.463502,berry trifle
3,Grains (Wholegrain),Bakery,328164,Gold Roasted Almond Shortbread 500g,19.0,19.0,0.0,0.625547,almond
4,Lean Meat & Alternatives,Breakfast Foods,786273,Carman’s Protein Bars 150-200g Pk 5,5.5,7.9,2.4,0.588123,protein bar
5,Vegetables & Legumes,Biscuits & Snacks,800481,Maretti Bruschette Roasted Garlic 150g,3.0,3.0,0.0,0.7372,bruschette


### Recipes

In [23]:
load_dotenv()
API_KEY = os.getenv("SPOONACULAR_API_KEY", "")

BASE_URL = "https://api.spoonacular.com"

In [24]:
COOKING_CATEGORIES = [
    'Baking', 'Canned & Packet food', 'Condiments',
    'Cooking, Seasoning & Gravy', 'Dairy', 'Deli & Chilled',
    'Fruit & Vegetables', 'International Foods', 'Jams & Spreads',
    'Meat', 'Seafood'
]

df_cooking = df_food[df_food['category'].isin(COOKING_CATEGORIES)]

In [25]:
ingredients = df_cooking['key_ingredient'].unique()
ingredient_string = ','.join(ingredients)

# api call
response = requests.get(
    f"{BASE_URL}/recipes/findByIngredients",
    params={
        'apiKey': API_KEY,
        'ingredients': ingredient_string,
        'number': 50,
        'ranking': 2
    }
)

In [26]:
if response.status_code == 200:
    recipes = response.json()
    
    sale_ingredients_list = df_cooking['key_ingredient'].unique()
    
    recipe_data = []
    for recipe in recipes:
        recipe_ingredients = [ing['name'] for ing in recipe['usedIngredients']] + \
                            [ing['name'] for ing in recipe['missedIngredients']]
        
        matched_ingredients = set()
        
        for recipe_ing in recipe_ingredients:
            for sale_ing in sale_ingredients_list:
                if sale_ing in recipe_ing.lower() or recipe_ing.lower() in sale_ing:
                    matched_ingredients.add(sale_ing)
        
        product_names = []
        total_savings = 0
        
        for matched_ing in matched_ingredients:
            product = df_cooking[df_cooking['key_ingredient'] == matched_ing].iloc[0]
            
            product_names.append(product['name'])
            total_savings += product['savings']
        
        recipe_data.append({
            'recipe_id': recipe['id'],
            'recipe_name': recipe['title'],
            'no_sale_items': len(matched_ingredients),
            'sale_items': list(matched_ingredients),
            'sale_product_names': product_names,
            'total_savings': round(total_savings, 2),
            'all_ingredients': recipe_ingredients
        })
    
    df_recipes = pd.DataFrame(recipe_data)
    df_recipes = df_recipes.sort_values('no_sale_items', ascending=False)

In [27]:
df_recipes

Unnamed: 0,recipe_id,recipe_name,no_sale_items,sale_items,sale_product_names,total_savings,all_ingredients
9,652359,Monte Carlo Sandwich,15,"[ham, fetta cheese, egg, peanut butter, blue c...",[Gold Glaze & Bake Davidson Plum & Honey Ham 8...,16.1,"[butter, cheese, ham, eggs, milk, turkey, sand..."
1,633741,Baked Potato Soup,10,"[potato, sour cream, peanut butter, onion, but...","[Woolworths Deli Style Potato Chips 175g, Bull...",12.55,"[slc bacon, baking potatoes, butter, cheddar c..."
31,633133,Avocado Chicken Parmigiana,10,"[egg, chicken, pasta sauce, cheese, pasta, sau...",[Praise Whole Egg Mayonnaise or Garlic Aioli 4...,19.8,"[chicken breast fillet, egg, tomato pasta sauc..."
24,157426,Stuffed Shells with Beef and Broc,10,"[tomato, tomato sauce, onion, carrot broccoli ...","[Woolworths Sundried Tomato Strips 270g, Leggo...",21.65,"[cream cheese, ground beef, onion, pasta shell..."
11,654935,Pasta with Peas and Italian Sausage,9,"[peanut butter, butter, pasta, pork sausage, s...","[Pic’s Peanut Butter 380g, Western Star Cultur...",21.35,"[butter, canned tomatoes, farfalle pasta, saus..."
6,651707,Mexican Stuffed Potatoes,9,"[potato, sour cream, salsa, onion, cheese, cre...","[Woolworths Deli Style Potato Chips 175g, Bull...",10.5,"[extra beef, salsa, yukon gold potatoes, chedd..."
14,655424,Pear Dutch Baby,8,"[egg, peanut butter, pork, butter, pear, sausa...",[Praise Whole Egg Mayonnaise or Garlic Aioli 4...,15.05,"[milk, eggs, butter, pork link sausage, usa bo..."
27,640509,Cream Cheese Stuffed Chicken Breasts,8,"[onion, chicken, butter, cream, bacon, cheese,...","[Woolworths Cocktail Onions 500g, Ingham’s Chi...",9.65,"[bacon, to 3 butter, cream cheese, onion, chic..."
8,647417,Hot Cheesy Bacon Party Dip,8,"[tomato, onion, cream, meat, mayonnaise, bacon...","[Woolworths Sundried Tomato Strips 270g, Woolw...",9.6,"[bacon, cream cheese, mayo, cheddar cheese), r..."
10,910030,Bacon Scallion Deviled Eggs,7,"[egg, oil, yogurt, greek yogurt, mayonnaise, b...",[Praise Whole Egg Mayonnaise or Garlic Aioli 4...,39.35,"[hardboiled eggs, bacon, mayonnaise, greek yog..."


In [28]:
def get_recipe_info(recipe_id):
    endpoint = f"{BASE_URL}/recipes/{recipe_id}/information"
    params = {'apiKey': API_KEY}
    response = requests.get(endpoint, params=params)
    if response.status_code == 200:
        return response.json()
    return None

recipe_instructions = []

for recipe_id in df_recipes['recipe_id']:
    recipe_info = get_recipe_info(recipe_id)
    
    if recipe_info:
        recipe_instructions.append({
            'recipe_id': recipe_id,
            'instructions': recipe_info.get('instructions', 'No instructions available'),
            'cooking_time': recipe_info.get('readyInMinutes', None),
            'servings': recipe_info.get('servings', None),
            'source_url': recipe_info.get('sourceUrl', None)
        })
    
    time.sleep(0.5)

df_instructions = pd.DataFrame(recipe_instructions)

df_recipes_full = df_recipes.merge(df_instructions, on='recipe_id', how='left')

In [29]:
df_recipes_full

Unnamed: 0,recipe_id,recipe_name,no_sale_items,sale_items,sale_product_names,total_savings,all_ingredients,instructions,cooking_time,servings,source_url
0,652359,Monte Carlo Sandwich,15,"[ham, fetta cheese, egg, peanut butter, blue c...",[Gold Glaze & Bake Davidson Plum & Honey Ham 8...,16.1,"[butter, cheese, ham, eggs, milk, turkey, sand...",<ol><li>Spread butter on each slice of bread. ...,45.0,9.0,https://www.foodista.com/recipe/QSKMGBYY/monte...
1,633741,Baked Potato Soup,10,"[potato, sour cream, peanut butter, onion, but...","[Woolworths Deli Style Potato Chips 175g, Bull...",12.55,"[slc bacon, baking potatoes, butter, cheddar c...",<ol><li>Bake the potatoes until done. Let cool...,45.0,1.0,http://www.foodista.com/recipe/M5LKMX6J/baked-...
2,633133,Avocado Chicken Parmigiana,10,"[egg, chicken, pasta sauce, cheese, pasta, sau...",[Praise Whole Egg Mayonnaise or Garlic Aioli 4...,19.8,"[chicken breast fillet, egg, tomato pasta sauc...",Preheat oven to 200C/400F fan-forced. Place ch...,45.0,8.0,https://www.foodista.com/recipe/XWFPFM56/avoca...
3,157426,Stuffed Shells with Beef and Broc,10,"[tomato, tomato sauce, onion, carrot broccoli ...","[Woolworths Sundried Tomato Strips 270g, Leggo...",21.65,"[cream cheese, ground beef, onion, pasta shell...",<p>1. Steam your frozen broccoli and cut it in...,60.0,2.0,http://spoonacular.com/-1384892574462
4,654935,Pasta with Peas and Italian Sausage,9,"[peanut butter, butter, pasta, pork sausage, s...","[Pic’s Peanut Butter 380g, Western Star Cultur...",21.35,"[butter, canned tomatoes, farfalle pasta, saus...",<ol><li>Cook the pasta following the package d...,45.0,4.0,https://www.foodista.com/recipe/6TFQVSP7/pasta...
5,651707,Mexican Stuffed Potatoes,9,"[potato, sour cream, salsa, onion, cheese, cre...","[Woolworths Deli Style Potato Chips 175g, Bull...",10.5,"[extra beef, salsa, yukon gold potatoes, chedd...",Pierce potatoes and microwave the potatoes on ...,45.0,4.0,https://www.foodista.com/recipe/JVXXNK3M/mexic...
6,655424,Pear Dutch Baby,8,"[egg, peanut butter, pork, butter, pear, sausa...",[Praise Whole Egg Mayonnaise or Garlic Aioli 4...,15.05,"[milk, eggs, butter, pork link sausage, usa bo...","Combine flour, milk, eggs and salt; beat until...",45.0,6.0,https://www.foodista.com/recipe/L5BQSLQV/pear-...
7,640509,Cream Cheese Stuffed Chicken Breasts,8,"[onion, chicken, butter, cream, bacon, cheese,...","[Woolworths Cocktail Onions 500g, Ingham’s Chi...",9.65,"[bacon, to 3 butter, cream cheese, onion, chic...",<ol><li>Preheat the oven to 350 F.</li><li>In ...,45.0,4.0,https://www.foodista.com/recipe/X5RRF6VF/cream...
8,647417,Hot Cheesy Bacon Party Dip,8,"[tomato, onion, cream, meat, mayonnaise, bacon...","[Woolworths Sundried Tomato Strips 270g, Woolw...",9.6,"[bacon, cream cheese, mayo, cheddar cheese), r...",Preheat oven to 350\nMix together cream cheese...,45.0,16.0,https://www.foodista.com/recipe/HP3YKYFX/hot-c...
9,910030,Bacon Scallion Deviled Eggs,7,"[egg, oil, yogurt, greek yogurt, mayonnaise, b...",[Praise Whole Egg Mayonnaise or Garlic Aioli 4...,39.35,"[hardboiled eggs, bacon, mayonnaise, greek yog...",Halve eggs lengthwise and remove yolks. Place ...,45.0,24.0,https://fullbellysisters.blogspot.com/2017/07/...
