### Imports

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time

### Woolworths weekly catalogue scraper

In [4]:
# api endpoint to get category list (use "view" endpoint)
url_view = "https://embed.salefinder.com.au/productlist/view/62107/"

# parameters for the request
params = {
    'locationId': '4679',
    'token': '570f5c4a44505b5f51477f531a03180a0e0b1c1362352b2e21363226253968717d7a787d6468626562612b',
    'saleGroup': '0',
    'rows_per_page': '10',
}

# headers to make our request look like a real browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': '*/*',
}

# fetch the main catalogue page
response = requests.get(url_view, params=params, headers=headers)
json_text = response.text.strip()[1:-1]
data = json.loads(json_text)

print(f"Fetched: {data['saleName']}")
print(f"Valid: {data['startDate']} to {data['endDate']}\n")

# parse the HTML to extract categories
soup = BeautifulSoup(data['content'], 'html.parser')
categories = {}

for link in soup.find_all('a', class_='sf-navcategory-link'):
    href = link.get('href', '')
    if 'categoryId=' in href:
        category_id = href.split('categoryId=')[1].split('&')[0]
        category_name = link.text.strip()
        categories[category_id] = category_name

Fetched: Weekly Catalogue NSW
Valid: 2025-11-12T00:00:00 to 2025-11-18T23:59:59



In [5]:
url_category = "https://embed.salefinder.com.au/productlist/category/62107/"

# list to store ALL products from ALL categories
all_products = []

# loop through each category
for category_id, category_name in categories.items():
    print(f"Scraping {category_name}...", end=" ")
    
    # parameters for this specific category
    params = {
        'locationId': '4679',
        'token': '570f5c4a44505b5f51477f531a03180a0e0b1c1362352b2e21363226253968717d7a787d6468626562612b',
        'saleGroup': '0',
        'categoryId': category_id, 
        'rows_per_page': '500',
    }
    
    # api request
    response = requests.get(url_category, params=params, headers=headers)
    json_text = response.text.strip()[1:-1]
    data = json.loads(json_text)
    
    # parse
    soup = BeautifulSoup(data['content'], 'html.parser')
    
    # extract products
    for product in soup.find_all('div', class_='shelfProductStamp'):
        name_tag = product.find('span', class_='sf-item-heading')
        sale_price_tag = product.find('span', class_='sf-pricedisplay')
        savings_tag = product.find('span', class_='sf-regprice')
        
        if not name_tag or not sale_price_tag:
            continue
        
        name = name_tag.text.strip()
        sale_price = sale_price_tag.text.strip()
        stock_code = product.get('data-stockcode')
        
        sale_value = float(sale_price.replace('$', ''))
        
        if savings_tag:
            savings = savings_tag.text.strip()
            savings_value = float(savings.replace('$', ''))
            original_value = sale_value + savings_value
        else:
            savings_value = 0.0
            original_value = sale_value
        
        all_products.append({
            'category': category_name,
            'stock_code': stock_code,
            'name': name,
            'sale_price': sale_value,
            'original_price': original_value,
            'savings': savings_value
        })
    
    print(f"{len([p for p in all_products if p['category'] == category_name])} products")
    time.sleep(0.3) 

# Create DataFrame
df = pd.DataFrame(all_products)

Scraping Baby... 5 products
Scraping Bakery... 30 products
Scraping Baking... 2 products
Scraping Beauty... 13 products
Scraping Biscuits & Snacks... 19 products
Scraping Breakfast Foods... 10 products
Scraping Canned & Packet food... 22 products
Scraping Condiments... 4 products
Scraping Confectionery... 22 products
Scraping Cooking, Seasoning & Gravy... 3 products
Scraping Dairy... 19 products
Scraping Deli & Chilled... 21 products
Scraping Desserts... 3 products
Scraping Drinks... 27 products
Scraping Frozen Food... 36 products
Scraping Fruit & Vegetables... 13 products
Scraping Health & Wellbeing... 9 products
Scraping Health Foods... 6 products
Scraping Home & Outdoor... 49 products
Scraping Household Cleaning... 10 products
Scraping International Foods... 2 products
Scraping Jams & Spreads... 1 products
Scraping Meat... 25 products
Scraping Papergoods, Wraps & Bags... 2 products
Scraping Pet care... 9 products
Scraping Seafood... 5 products
Scraping Stationery & Media... 12 produ

In [6]:
df = pd.DataFrame(all_products)

df.groupby('category').first().reset_index()

Unnamed: 0,category,stock_code,name,sale_price,original_price,savings
0,Baby,200703,Huggies Ultra Dry Nappies Pk 30-54,13.0,22.0,9.0
1,Bakery,49622,Golden Crumpet Rounds Pk 6,2.4,4.8,2.4
2,Baking,205222,Woolworths Free Range Eggs 700g Pk 12,6.5,13.1,6.6
3,Beauty,225912,Dove Beauty Cream Bar Original 2 x 90g,2.75,5.5,2.75
4,Biscuits & Snacks,479848,Pringles Potato Chips 53g,1.35,2.7,1.35
5,Breakfast Foods,49622,Golden Crumpet Rounds Pk 6,2.4,4.8,2.4
6,Canned & Packet food,259537,Ho Mai Spring Roll or Entertainer Pack 1 kg Pk...,8.0,10.0,2.0
7,Condiments,323081,Yumi’s Dips 200g – From the Fridge,3.5,4.5,1.0
8,Confectionery,814558,"Cadbury Dairy Milk, Bubbly or Marvellous Creat...",4.0,8.0,4.0
9,"Cooking, Seasoning & Gravy",6049885,Maggi Gravy 1 kg,12.5,12.5,0.0


### Australia food guidelines

https://www.eatforhealth.gov.au/food-essentials/how-much-do-we-need-each-day/recommended-number-serves-adults

In [7]:
# converting the table above into a df
dietary_guidelines = {
    'Group': [
        'Men 19-50',
        'Men 51-70', 
        'Men 70+',
        'Women 19-50',
        'Women 51-70',
        'Women 70+',
        'Pregnant',
        'Lactating'
    ],
    'Vegetables & Legumes': [6, 5.5, 5, 5, 5, 5, 5, 7.5],
    'Fruit': [2, 2, 2, 2, 2, 2, 2, 2],
    'Grains (Wholegrain)': [6, 6, 4.5, 6, 4, 3, 8.5, 9],
    'Lean Meat & Alternatives': [3, 2.5, 2.5, 2.5, 2, 2, 3.5, 2.5],
    'Dairy (Reduced Fat)': [2.5, 2.5, 3.5, 2.5, 4, 4, 2.5, 2.5],
    'Additional Serves Range': ['0-3', '0-2.5', '0-2.5', '0-2.5', '0-2.5', '0-2', '0-2.5', '0-2.5']
}

df_guidelines = pd.DataFrame(dietary_guidelines)

df_guidelines

Unnamed: 0,Group,Vegetables & Legumes,Fruit,Grains (Wholegrain),Lean Meat & Alternatives,Dairy (Reduced Fat),Additional Serves Range
0,Men 19-50,6.0,2,6.0,3.0,2.5,0-3
1,Men 51-70,5.5,2,6.0,2.5,2.5,0-2.5
2,Men 70+,5.0,2,4.5,2.5,3.5,0-2.5
3,Women 19-50,5.0,2,6.0,2.5,2.5,0-2.5
4,Women 51-70,5.0,2,4.0,2.0,4.0,0-2.5
5,Women 70+,5.0,2,3.0,2.0,4.0,0-2
6,Pregnant,5.0,2,8.5,3.5,2.5,0-2.5
7,Lactating,7.5,2,9.0,2.5,2.5,0-2.5


### Categorising sale items into the australian reccomended food groups

In [8]:
import gensim.downloader as api
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# glove model
model = api.load('glove-wiki-gigaword-300')

# define non-food categories to ignore
NON_FOOD_CATEGORIES = [
    'Baby', 'Beauty', 'Toiletries', 'Household Cleaning', 
    'Home & Outdoor', 'Stationery & Media', 'Clothing', 'Pet care'
]

# define dietary guideline categories with some additional keywords
dietary_guidelines = {
    'Vegetables & Legumes': ['vegetables', 'vegetable', 'legumes', 'beans', 'peas', 'broccoli', 'carrot', 'salad', 'greens'],
    'Fruit': ['fruit', 'fruits', 'apple', 'banana', 'orange', 'berry', 'grape', 'melon'],
    'Grains (Wholegrain)': ['bread', 'rice', 'pasta', 'noodles', 'cereal', 'oats', 'grain', 'wheat', 'crackers', 'muffin', 'bagel', 'bun', 'roll', 'wrap', 'pita', 'tortilla'],
    'Lean Meat & Alternatives': ['meat', 'beef', 'chicken', 'pork', 'lamb', 'fish', 'salmon', 'tuna', 'eggs', 'pie', 'sausage', 'bacon', 'ham', 'protein'],
    'Dairy (Reduced Fat)': ['milk', 'cheese', 'yoghurt', 'yogurt', 'dairy', 'cream', 'butter'],
    'Discretionary': ['chocolate', 'candy', 'chips', 'biscuits', 'cookies', 'cake', 'icecream', 'soft-drink', 'lollies', 'snack']
}

def get_text_embedding(words):
    """
    convert a list of words into a single vector by averaging
    """
    vectors = []
    for word in words:
        word_clean = word.lower().strip()
        if word_clean in model:
            vectors.append(model[word_clean])
    
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(300)

def classify_product(woolworths_category, product_name):
    """
    combine Woolworths category + product name to classify into dietary guidelines
    """
    # check if this is a non-food category first
    if woolworths_category in NON_FOOD_CATEGORIES:
        return 'Non-Food', 1.0
    
    # combine category and product name into words
    combined_text = f"{woolworths_category} {product_name}"
    
    # tokenize
    words = []
    for word in combined_text.split():
        # remove numbers and short words
        if len(word) > 2 and not word.isdigit():
            words.append(word)
    
    # get embedding for this product
    product_embedding = get_text_embedding(words)
    
    # get embeddings for each dietary category
    best_category = None
    best_score = -1
    
    for category_name, keywords in dietary_guidelines.items():
        category_embedding = get_text_embedding(keywords)
        
        # calculate similarity
        similarity = cosine_similarity(
            product_embedding.reshape(1, -1),
            category_embedding.reshape(1, -1)
        )[0][0]
        
        if similarity > best_score:
            best_score = similarity
            best_category = category_name
    
    return best_category, best_score

In [9]:
df['dietary_category'] = df.apply(
    lambda row: classify_product(row['category'], row['name'])[0],
    axis=1
)

df['dietary_confidence'] = df.apply(
    lambda row: classify_product(row['category'], row['name'])[1],
    axis=1
)

In [10]:
df_food = df[df['dietary_category'] != 'Non-Food']
df_food.to_csv('food_items_classified_manual_inspect.csv', index=False)

In [11]:
df_food.groupby('dietary_category').first().reset_index()

Unnamed: 0,dietary_category,category,stock_code,name,sale_price,original_price,savings,dietary_confidence
0,Dairy (Reduced Fat),Bakery,6040676,Mr Whisk Vanilla Meringues 130g,6.5,6.5,0.0,0.653899
1,Discretionary,Bakery,157944,Woolworths Petite Vol Au Vents Pk 12,4.5,4.5,0.0,0.269072
2,Fruit,Bakery,904886,Walker's Cranberry & Clementine Fruit Mince Pi...,6.5,6.5,0.0,0.675821
3,Grains (Wholegrain),Bakery,49622,Golden Crumpet Rounds Pk 6,2.4,4.8,2.4,0.395892
4,Lean Meat & Alternatives,Baking,205222,Woolworths Free Range Eggs 700g Pk 12,6.5,13.1,6.6,0.513886
5,Vegetables & Legumes,Condiments,465039,Always Fresh Sicilian Olives Pitted 230g,3.3,4.8,1.5,0.605417


In [12]:
test = df_food.groupby('dietary_category').first().reset_index()

### Extracting key ingredient

In [15]:
import anthropic
import time
import os
from dotenv import load_dotenv

load_dotenv() 
client = anthropic.Anthropic(
    api_key=os.getenv("ANTHROPIC_API_KEY")
)

def extract_key_ingredient_claude(product_name):
    """
    use claude to extract the main ingredient from a product name
    """
    try:
        message = client.messages.create(
            model="claude-3-5-haiku-20241022", 
            max_tokens=20,
            messages=[{
                "role": "user",
                "content": f"""Extract ONLY the main food ingredient from this product name. 
Return just one or two words in singular form, lowercase, no punctuation.

Examples:
- "Woolworths Free Range Eggs 700g Pk 12" → egg
- "Always Fresh Sicilian Olives Pitted 230g" → olive
- "Four'N Twenty Angus Pies 700g" → beef pie
- "Mr Whisk Vanilla Meringues 130g" → meringue
- "Golden Crumpet Rounds Pk 6" → crumpet

Product: {product_name}

Main ingredient:"""
            }]
        )
        
        return message.content[0].text.strip().lower()
    
    except Exception as e:
        print(f"Error processing '{product_name}': {e}")
        return 'Unknown'

results = []
for idx, product_name in enumerate(df_food['name'], 1):
    ingredient = extract_key_ingredient_claude(product_name)
    results.append(ingredient)

df_food['key_ingredient'] = results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_food['key_ingredient'] = results


In [17]:
df_food.to_csv('base_table_food.csv', index=False)

In [18]:
df_food.groupby('dietary_category').first().reset_index()

Unnamed: 0,dietary_category,category,stock_code,name,sale_price,original_price,savings,dietary_confidence,key_ingredient
0,Dairy (Reduced Fat),Bakery,6040676,Mr Whisk Vanilla Meringues 130g,6.5,6.5,0.0,0.653899,meringue
1,Discretionary,Bakery,157944,Woolworths Petite Vol Au Vents Pk 12,4.5,4.5,0.0,0.269072,pastry
2,Fruit,Bakery,904886,Walker's Cranberry & Clementine Fruit Mince Pi...,6.5,6.5,0.0,0.675821,fruit
3,Grains (Wholegrain),Bakery,49622,Golden Crumpet Rounds Pk 6,2.4,4.8,2.4,0.395892,crumpet
4,Lean Meat & Alternatives,Baking,205222,Woolworths Free Range Eggs 700g Pk 12,6.5,13.1,6.6,0.513886,egg
5,Vegetables & Legumes,Condiments,465039,Always Fresh Sicilian Olives Pitted 230g,3.3,4.8,1.5,0.605417,olive


### Recipe extraction