### Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time

### Woolworths weekly catalogue scraper

In [3]:
# api endpoint to get category list (use "view" endpoint)
url_view = "https://embed.salefinder.com.au/productlist/view/62222/"

# parameters for the request
params = {
    'locationId': '4679',
    'token': '570f5c4a44505b5f51477f531a03180a0e0b1c1362352b2e21363226253968717d7a787d6468626562612b',
    'saleGroup': '0',
    'rows_per_page': '10',
}

# headers to make our request look like a real browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': '*/*',
}

# fetch the main catalogue page
response = requests.get(url_view, params=params, headers=headers)
json_text = response.text.strip()[1:-1]
data = json.loads(json_text)

print(f"Fetched: {data['saleName']}")
print(f"Valid: {data['startDate']} to {data['endDate']}\n")

# parse the HTML to extract categories
soup = BeautifulSoup(data['content'], 'html.parser')
categories = {}

for link in soup.find_all('a', class_='sf-navcategory-link'):
    href = link.get('href', '')
    if 'categoryId=' in href:
        category_id = href.split('categoryId=')[1].split('&')[0]
        category_name = link.text.strip()
        categories[category_id] = category_name

Fetched: Weekly Catalogue NSW
Valid: 2025-11-19T00:00:00 to 2025-11-25T23:59:59



In [7]:
url_category = "https://embed.salefinder.com.au/productlist/category/62222/"

# list to store ALL products from ALL categories
all_products = []

# loop through each category
for category_id, category_name in categories.items():
    print(f"Scraping {category_name}...", end=" ")
    
    # parameters for this specific category
    params = {
        'locationId': '4679',
        'token': '570f5c4a44505b5f51477f531a03180a0e0b1c1362352b2e21363226253968717d7a787d6468626562612b',
        'saleGroup': '0',
        'categoryId': category_id, 
        'rows_per_page': '500',
    }
    
    # api request
    response = requests.get(url_category, params=params, headers=headers)
    json_text = response.text.strip()[1:-1]
    data = json.loads(json_text)
    
    # parse
    soup = BeautifulSoup(data['content'], 'html.parser')
    
    # extract products
    for product in soup.find_all('div', class_='shelfProductStamp'):
        name_tag = product.find('span', class_='sf-item-heading')
        sale_price_tag = product.find('span', class_='sf-pricedisplay')
        savings_tag = product.find('span', class_='sf-regprice')
        
        if not name_tag or not sale_price_tag:
            continue
        
        name = name_tag.text.strip()
        sale_price = sale_price_tag.text.strip()
        stock_code = product.get('data-stockcode')
        
        sale_value = float(sale_price.replace('$', '').replace(',', ''))    
            
        if savings_tag:
            savings = savings_tag.text.strip()
            savings_value = float(savings.replace('$', ''))
            original_value = sale_value + savings_value
        else:
            savings_value = 0.0
            original_value = sale_value
        
        all_products.append({
            'category': category_name,
            'stock_code': stock_code,
            'name': name,
            'sale_price': sale_value,
            'original_price': original_value,
            'savings': savings_value
        })
    
    print(f"{len([p for p in all_products if p['category'] == category_name])} products")
    time.sleep(0.3) 

# Create DataFrame
df = pd.DataFrame(all_products)

Scraping Baby... 6 products
Scraping Bakery... 22 products
Scraping Baking... 8 products
Scraping Beauty... 64 products
Scraping Biscuits & Snacks... 17 products
Scraping Breakfast Foods... 8 products
Scraping Canned & Packet food... 21 products
Scraping Clothing... 4 products
Scraping Condiments... 14 products
Scraping Confectionery... 14 products
Scraping Cooking, Seasoning & Gravy... 6 products
Scraping Dairy... 16 products
Scraping Deli & Chilled... 31 products
Scraping Desserts... 1 products
Scraping Drinks... 31 products
Scraping Frozen Food... 30 products
Scraping Fruit & Vegetables... 19 products
Scraping Health & Wellbeing... 77 products
Scraping Health Foods... 4 products
Scraping Home & Outdoor... 45 products
Scraping Household Cleaning... 8 products
Scraping Jams & Spreads... 4 products
Scraping Meat... 32 products
Scraping Papergoods, Wraps & Bags... 3 products
Scraping Pet care... 10 products
Scraping Seafood... 8 products
Scraping Stationery & Media... 14 products
Scrapi

In [8]:
df = pd.DataFrame(all_products)

df.groupby('category').first().reset_index()

Unnamed: 0,category,stock_code,name,sale_price,original_price,savings
0,Baby,612568,Tommee Tippee Night Time Soothers 18-36m Assor...,7.2,12.0,4.8
1,Bakery,6044304,35hr Sourdough Baguette Pk 2#,3.5,4.0,0.5
2,Baking,37736,CSR Dark Brown Sugar 1 kg,4.6,4.6,0.0
3,Beauty,6036022,Little Urchin SPF 50+ Face Natural Clear Zinc ...,12.5,25.0,12.5
4,Biscuits & Snacks,956027,Sakata Rice Crackers 80-90g,1.25,2.5,1.25
5,Breakfast Foods,199725,Kellogg’s Nutri Grain 765g or Coco Pops Cereal...,6.0,10.8,4.8
6,Canned & Packet food,666347,Woolworths Antipasto Varieties 110-135g – From...,6.5,7.5,1.0
7,Clothing,282756,Bonds Men’s Cushioned 1/4 Crew Sock Assorted Pk 3,8.0,16.0,8.0
8,Condiments,923892,Obela Dips 220g – From the Fridge,4.0,5.0,1.0
9,Confectionery,320104,Cadbury Medium Bars 30-55g,1.2,3.0,1.8


In [9]:
df['category'].unique()

array(['Baby', 'Bakery', 'Baking', 'Beauty', 'Biscuits & Snacks',
       'Breakfast Foods', 'Canned & Packet food', 'Clothing',
       'Condiments', 'Confectionery', 'Cooking, Seasoning & Gravy',
       'Dairy', 'Deli & Chilled', 'Desserts', 'Drinks', 'Frozen Food',
       'Fruit & Vegetables', 'Health & Wellbeing', 'Health Foods',
       'Home & Outdoor', 'Household Cleaning', 'Jams & Spreads', 'Meat',
       'Papergoods, Wraps & Bags', 'Pet care', 'Seafood',
       'Stationery & Media', 'Toiletries'], dtype=object)

### Australia food guidelines

https://www.eatforhealth.gov.au/food-essentials/how-much-do-we-need-each-day/recommended-number-serves-adults

In [10]:
# converting the table above into a df
dietary_guidelines = {
    'Group': [
        'Men 19-50',
        'Men 51-70', 
        'Men 70+',
        'Women 19-50',
        'Women 51-70',
        'Women 70+',
        'Pregnant',
        'Lactating'
    ],
    'Vegetables & Legumes': [6, 5.5, 5, 5, 5, 5, 5, 7.5],
    'Fruit': [2, 2, 2, 2, 2, 2, 2, 2],
    'Grains (Wholegrain)': [6, 6, 4.5, 6, 4, 3, 8.5, 9],
    'Lean Meat & Alternatives': [3, 2.5, 2.5, 2.5, 2, 2, 3.5, 2.5],
    'Dairy (Reduced Fat)': [2.5, 2.5, 3.5, 2.5, 4, 4, 2.5, 2.5],
    'Additional Serves Range': ['0-3', '0-2.5', '0-2.5', '0-2.5', '0-2.5', '0-2', '0-2.5', '0-2.5']
}

df_guidelines = pd.DataFrame(dietary_guidelines)

df_guidelines

Unnamed: 0,Group,Vegetables & Legumes,Fruit,Grains (Wholegrain),Lean Meat & Alternatives,Dairy (Reduced Fat),Additional Serves Range
0,Men 19-50,6.0,2,6.0,3.0,2.5,0-3
1,Men 51-70,5.5,2,6.0,2.5,2.5,0-2.5
2,Men 70+,5.0,2,4.5,2.5,3.5,0-2.5
3,Women 19-50,5.0,2,6.0,2.5,2.5,0-2.5
4,Women 51-70,5.0,2,4.0,2.0,4.0,0-2.5
5,Women 70+,5.0,2,3.0,2.0,4.0,0-2
6,Pregnant,5.0,2,8.5,3.5,2.5,0-2.5
7,Lactating,7.5,2,9.0,2.5,2.5,0-2.5


### Categorising sale items into the australian reccomended food groups

In [11]:
import gensim.downloader as api
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# glove model
model = api.load('glove-wiki-gigaword-300')

# define non-food categories to ignore
NON_FOOD_CATEGORIES = [
    'Baby', 'Beauty', 'Toiletries', 'Household Cleaning', 
    'Home & Outdoor', 'Stationery & Media', 'Clothing', 'Pet care'
]

# define dietary guideline categories with some additional keywords
dietary_guidelines = {
    'Vegetables & Legumes': ['vegetables', 'vegetable', 'legumes', 'beans', 'peas', 'broccoli', 'carrot', 'salad', 'greens'],
    'Fruit': ['fruit', 'fruits', 'apple', 'banana', 'orange', 'berry', 'grape', 'melon'],
    'Grains (Wholegrain)': ['bread', 'rice', 'pasta', 'noodles', 'cereal', 'oats', 'grain', 'wheat', 'crackers', 'muffin', 'bagel', 'bun', 'roll', 'wrap', 'pita', 'tortilla'],
    'Lean Meat & Alternatives': ['meat', 'beef', 'chicken', 'pork', 'lamb', 'fish', 'salmon', 'tuna', 'eggs', 'pie', 'sausage', 'bacon', 'ham', 'protein'],
    'Dairy (Reduced Fat)': ['milk', 'cheese', 'yoghurt', 'yogurt', 'dairy', 'cream', 'butter'],
    'Discretionary': ['chocolate', 'candy', 'chips', 'biscuits', 'cookies', 'cake', 'icecream', 'soft-drink', 'lollies', 'snack']
}

def get_text_embedding(words):
    """
    convert a list of words into a single vector by averaging
    """
    vectors = []
    for word in words:
        word_clean = word.lower().strip()
        if word_clean in model:
            vectors.append(model[word_clean])
    
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(300)

def classify_product(woolworths_category, product_name):
    """
    combine Woolworths category + product name to classify into dietary guidelines
    """
    # check if this is a non-food category first
    if woolworths_category in NON_FOOD_CATEGORIES:
        return 'Non-Food', 1.0
    
    # combine category and product name into words
    combined_text = f"{woolworths_category} {product_name}"
    
    # tokenize
    words = []
    for word in combined_text.split():
        # remove numbers and short words
        if len(word) > 2 and not word.isdigit():
            words.append(word)
    
    # get embedding for this product
    product_embedding = get_text_embedding(words)
    
    # get embeddings for each dietary category
    best_category = None
    best_score = -1
    
    for category_name, keywords in dietary_guidelines.items():
        category_embedding = get_text_embedding(keywords)
        
        # calculate similarity
        similarity = cosine_similarity(
            product_embedding.reshape(1, -1),
            category_embedding.reshape(1, -1)
        )[0][0]
        
        if similarity > best_score:
            best_score = similarity
            best_category = category_name
    
    return best_category, best_score

In [12]:
df['dietary_category'] = df.apply(
    lambda row: classify_product(row['category'], row['name'])[0],
    axis=1
)

df['dietary_confidence'] = df.apply(
    lambda row: classify_product(row['category'], row['name'])[1],
    axis=1
)

In [13]:
df_food = df[df['dietary_category'] != 'Non-Food']
df_food.to_csv('food_items_classified_manual_inspect.csv', index=False)

In [14]:
df_food.groupby('dietary_category').first().reset_index()

Unnamed: 0,dietary_category,category,stock_code,name,sale_price,original_price,savings,dietary_confidence
0,Dairy (Reduced Fat),Bakery,159266,National Pies Varieties 360g Pk 2 – From the F...,6.8,8.5,1.7,0.532467
1,Discretionary,Bakery,6021067,Woolworths Chocolate Hazelnut Tarts Pk 6,7.5,7.5,0.0,0.737364
2,Fruit,Bakery,733768,Apple Turnover with Fresh Cream#,4.5,5.0,0.5,0.553427
3,Grains (Wholegrain),Bakery,6044304,35hr Sourdough Baguette Pk 2#,3.5,4.0,0.5,0.587197
4,Lean Meat & Alternatives,Biscuits & Snacks,71929,Infuzions Air Fries 90g – From the Health Food...,2.45,4.9,2.45,0.545252
5,Vegetables & Legumes,Canned & Packet food,335122,Always Fresh Olives 240g,4.8,10.8,6.0,0.658829


In [15]:
test = df_food.groupby('dietary_category').first().reset_index()

### Extracting key ingredient

In [16]:
import anthropic
import time
import os
from dotenv import load_dotenv

load_dotenv() 
client = anthropic.Anthropic(
    api_key=os.getenv("ANTHROPIC_API_KEY")
)

def extract_key_ingredient_claude(product_name):
    """
    use claude to extract the main ingredient from a product name
    """
    try:
        message = client.messages.create(
            model="claude-3-5-haiku-20241022", 
            max_tokens=20,
            messages=[{
                "role": "user",
                "content": f"""Extract ONLY the main food ingredient from this product name. This ingredient is what best describes the item.  
Return just one to three words in singular form, lowercase, no punctuation.

Examples:
- "Woolworths Free Range Eggs 700g Pk 12" → egg
- "Always Fresh Sicilian Olives Pitted 230g" → olive
- "Four'N Twenty Angus Pies 700g" → beef pie
- "Mr Whisk Vanilla Meringues 130g" → meringue
- "Golden Crumpet Rounds Pk 6" → crumpet
- "Philadelphia Original Cream Cheese Twin Block 500g" → cream cheese 
- "Farmers Union Greek Yogurt Pots 150g" → greek yogurt

Product: {product_name}

Main ingredient:"""
            }]
        )
        
        return message.content[0].text.strip().lower()
    
    except Exception as e:
        print(f"Error processing '{product_name}': {e}")
        return 'Unknown'

results = []
for idx, product_name in enumerate(df_food['name'], 1):
    ingredient = extract_key_ingredient_claude(product_name)
    results.append(ingredient)

df_food['key_ingredient'] = results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_food['key_ingredient'] = results


In [17]:
df_food.to_csv('base_table_food.csv', index=False)

In [18]:
df_food.groupby('dietary_category').first().reset_index()

Unnamed: 0,dietary_category,category,stock_code,name,sale_price,original_price,savings,dietary_confidence,key_ingredient
0,Dairy (Reduced Fat),Bakery,159266,National Pies Varieties 360g Pk 2 – From the F...,6.8,8.5,1.7,0.532467,pie
1,Discretionary,Bakery,6021067,Woolworths Chocolate Hazelnut Tarts Pk 6,7.5,7.5,0.0,0.737364,hazelnut tart
2,Fruit,Bakery,733768,Apple Turnover with Fresh Cream#,4.5,5.0,0.5,0.553427,apple
3,Grains (Wholegrain),Bakery,6044304,35hr Sourdough Baguette Pk 2#,3.5,4.0,0.5,0.587197,sourdough
4,Lean Meat & Alternatives,Biscuits & Snacks,71929,Infuzions Air Fries 90g – From the Health Food...,2.45,4.9,2.45,0.545252,potato
5,Vegetables & Legumes,Canned & Packet food,335122,Always Fresh Olives 240g,4.8,10.8,6.0,0.658829,olive


### Recipes

In [19]:
load_dotenv()
API_KEY = os.getenv("SPOONACULAR_API_KEY", "")

BASE_URL = "https://api.spoonacular.com"

In [20]:
COOKING_CATEGORIES = [
    'Baking', 'Canned & Packet food', 'Condiments',
    'Cooking, Seasoning & Gravy', 'Dairy', 'Deli & Chilled',
    'Fruit & Vegetables', 'International Foods', 'Jams & Spreads',
    'Meat', 'Seafood'
]

df_cooking = df_food[df_food['category'].isin(COOKING_CATEGORIES)]

In [21]:
ingredients = df_cooking['key_ingredient'].unique()
ingredient_string = ','.join(ingredients)

# api call
response = requests.get(
    f"{BASE_URL}/recipes/findByIngredients",
    params={
        'apiKey': API_KEY,
        'ingredients': ingredient_string,
        'number': 50,
        'ranking': 2
    }
)

In [22]:
if response.status_code == 200:
    recipes = response.json()
    
    sale_ingredients_list = df_cooking['key_ingredient'].unique()
    
    recipe_data = []
    for recipe in recipes:
        recipe_ingredients = [ing['name'] for ing in recipe['usedIngredients']] + \
                            [ing['name'] for ing in recipe['missedIngredients']]
        
        matched_ingredients = set()
        
        for recipe_ing in recipe_ingredients:
            for sale_ing in sale_ingredients_list:
                if sale_ing in recipe_ing.lower() or recipe_ing.lower() in sale_ing:
                    matched_ingredients.add(sale_ing)
        
        recipe_data.append({
            'recipe_id': recipe['id'],
            'recipe_name': recipe['title'],
            'no_sale_items': len(matched_ingredients),
            'sale_items': list(matched_ingredients),  # Just the key ingredients
            'all_ingredients': recipe_ingredients
        })
    
    df_recipes = pd.DataFrame(recipe_data)
    df_recipes = df_recipes.sort_values('no_sale_items', ascending=False)

In [23]:
df_recipes

Unnamed: 0,recipe_id,recipe_name,no_sale_items,sale_items,all_ingredients
43,651707,Mexican Stuffed Potatoes,9,"[tomato, avocado, cream cheese, potato, cheese...","[extra beef, salsa, yukon gold potatoes, chedd..."
45,645821,Grilled Potato Skins,8,"[cream cheese, potato, cheese, cream, butter, ...","[baking potatoes, bacon, finely-chopped ham, c..."
16,647956,Involtini Di Pollo - Ham and Cheese Stuffed Ch...,7,"[cream cheese, cheese, butter, chicken breast,...","[long and chicken breasts, ham, cheese, butter..."
46,647417,Hot Cheesy Bacon Party Dip,7,"[tomato, cream cheese, mayonnaise, cheese, cre...","[bacon, cream cheese, mayo, cheddar cheese), r..."
6,633132,Avocado Chicken Salad,7,"[tomato, avocado, cherry, mayonnaise, salad, c...","[avocados, cherry tomatoes, mayonnaise, salad,..."
9,654373,Palak Paneer,7,"[tomato, cream cheese, tomato sauce, cheese, b...","[spoon butter, cheese, garam masala, spinach, ..."
22,1003670,Stuffed Chicken Breast,6,"[cream cheese, cheese, cream, chicken breast, ...","[chicken seasoning, cream cheese, chicken brea..."
0,648303,Itty Bitty Burgers,6,"[tomato, cheese, mustard, beef, ham, spinach]","[a handful of baby spinach, grape tomatoes, ha..."
7,642583,"Farfalle with Peas, Ham and Cream",6,"[cheese, pea, cream, butter, pasta, ham]","[butter, ham, farfalle pasta, heavy cream, par..."
5,715563,Pierogi Casserole,6,"[potato, cheese, butter, milk, noodle, cheddar...","[butter, milk, sharp cheddar cheese, potatoes,..."
