In [13]:
import re
import time
import math
import requests
import pandas as pd
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError, RequestException

In [14]:
# regex + cleaning helpers

# Matches a quantity such as "500g", "1.5 l" or "6 x" (number + optional space + unit).
unit_re = re.compile(
    r"\b(\d+(\.\d+)?\s?(g|kg|ml|l|cl|oz|lb|pack|pcs|pc|x))\b",
    re.IGNORECASE
)
# Any run of whitespace, collapsed to a single space during cleaning.
multispace_re = re.compile(r"\s+")
# Everything except word characters, whitespace, '&', apostrophe and '-'.
_punct_re = re.compile(r"[^\w\s&'-]")


def clean_name(s: str) -> str:
    """Normalise a product name so it looks like a shopping-list item.

    Steps: trim and lower-case, turn underscores into spaces, drop quantity
    tokens (see ``unit_re``), replace punctuation other than ``&``, ``'`` and
    ``-`` with spaces, then collapse repeated whitespace.

    Args:
        s: Raw product name. Anything that is not a string (``None``, or the
            float NaN pandas uses for blank CSV cells) is treated as missing.

    Returns:
        The cleaned name, or ``""`` when the input is missing.
    """
    # NaN is a *truthy* float, so an `s or ""` guard would let it through and
    # crash on .strip(); check the type explicitly instead.
    if not isinstance(s, str):
        return ""
    name = s.strip().lower().replace("_", " ")
    name = unit_re.sub("", name)
    name = _punct_re.sub(" ", name)
    return multispace_re.sub(" ", name).strip()


In [15]:
# categories to scrape
TARGET_CATEGORIES = [
    "Produce",
    "Meat & Seafood",
    "Dairy & Eggs",
    "Bakery",
    "Pantry",
    "Frozen Foods",
    "Beverages",
    "Snacks",
    "Personal Care",
    "Household",
    "Pet Supplies",
    "Deli",
    "Condiments & Sauces",
    "Canned Goods",
    "Pasta & Grains",
    "Other"
]

# Lower-cased label -> canonical label. Derived from TARGET_CATEGORIES so the
# list and the lookup cannot drift apart, and built once rather than per call.
_CATEGORY_LOOKUP = {label.lower(): label for label in TARGET_CATEGORIES}


def normalise_category(cat: str) -> str:
    """Map a free-form category label onto one of ``TARGET_CATEGORIES``.

    Matching ignores case and surrounding whitespace.

    Args:
        cat: Category label. Non-string values (``None``, or the float NaN
            pandas produces for blank CSV cells) are treated as unknown.

    Returns:
        The canonical label, or ``"Other"`` for unknown, empty or missing input.
    """
    # NaN is truthy, so a plain `if not cat` check would let it reach .strip().
    if not isinstance(cat, str):
        return "Other"
    return _CATEGORY_LOOKUP.get(cat.strip().lower(), "Other")


In [16]:
# open food facts (off) -> manual category mapping -- used ChatGPT to help generate this list
# Manual category (lower-case) -> Open Food Facts category tags that imply it.
# Insertion order matters: map_off_to_category returns the first entry that matches.
off_to_manual_category = {
    "produce": [
        "en:fruits",
        "en:vegetables",
        "en:fruit",
        "en:vegetable",
        "en:produce",
        "en:salads",
        "en:herbs",
    ],
    "dairy & eggs": [
        "en:dairies",
        "en:dairy",
        "en:milk-and-yogurt",
        "en:cheeses",
        "en:yogurts",
        "en:eggs",
        "en:butter",
        "en:cream",
    ],
    "meat & seafood": [
        "en:meats",
        "en:meat",
        "en:poultry",
        "en:sausages",
        "en:fish-and-seafood",
    ],
    "bakery": [
        "en:breads",
        "en:bread",
        "en:bakery-products",
        "en:cakes",
        "en:biscuits",
        "en:pastries",
    ],
    "snacks": [
        "en:snacks",
        "en:salty-snacks",
        "en:crisps",
        "en:chips",
        "en:snack-foods",
        "en:confectioneries",
    ],
    "beverages": [
        "en:beverages",
        "en:drinks",
        "en:soft-drinks",
        "en:juices",
        "en:teas",
        "en:coffees",
    ],
    "canned goods": [
        "en:canned-foods",
        "en:canned-vegetables",
        "en:canned-fruits",
        "en:canned-fish",
    ],
    "condiments & sauces": [
        "en:condiments",
        "en:sauces",
        "en:ketchups",
        "en:mustards",
        "en:mayonnaises",
        "en:salad-dressings",
    ],
    "pasta & grains": [
        "en:pasta",
        "en:rices",
        "en:cereals",
        "en:flours",
        "en:grains",
    ],
    "frozen foods": [
        "en:frozen-foods",
        "en:frozen",
        "en:ice-creams",
    ],
}


def map_off_to_category(off_tags):
    """Return the first manual category whose tag list overlaps ``off_tags``.

    Args:
        off_tags: Iterable of Open Food Facts category tags; ``None`` or an
            empty iterable is accepted.

    Returns:
        A lower-case key of ``off_to_manual_category``, or ``"other"`` when no
        tag matches.
    """
    present = set(off_tags or [])
    return next(
        (
            label
            for label, candidates in off_to_manual_category.items()
            if not present.isdisjoint(candidates)
        ),
        "other",
    )

In [17]:
# scrape products from Open Food Facts
def fetch_off_page(country: str, page: int, page_size: int = 1000):
    """Fetch one page of search results from the Open Food Facts search endpoint.

    Args:
        country: Value sent as the ``country`` query parameter.
        page: Page number to request.
        page_size: Number of products requested per page.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: When the response status is an error
            (raised by ``raise_for_status``).
    """
    endpoint = "https://uk.openfoodfacts.org/cgi/search.pl"
    # Only the fields needed downstream are requested.
    wanted_fields = (
        "product_name,product_name_en,"
        "generic_name,generic_name_en,"
        "categories_tags,brands,quantity"
    )
    query = dict(
        search_simple=1,
        action="process",
        json=1,
        page=page,
        page_size=page_size,
        fields=wanted_fields,
        country=country,
    )
    response = requests.get(endpoint, params=query, timeout=30)
    response.raise_for_status()
    return response.json()


def build_off_dataset(country="united-kingdom", pages=10, page_size=500, sleep_s=1.0) -> pd.DataFrame:
    """Download products from Open Food Facts and label them with manual categories.

    For each product the first non-empty of ``product_name_en``,
    ``product_name``, ``generic_name_en`` and ``generic_name`` is cleaned with
    ``clean_name``; names shorter than 3 characters and names already seen are
    skipped. The category comes from ``categories_tags`` via
    ``map_off_to_category`` and ``normalise_category``.

    Args:
        country: Passed through to ``fetch_off_page``.
        pages: Maximum number of pages to request, starting at page 1.
        page_size: Products requested per page.
        sleep_s: Pause in seconds between consecutive page requests.

    Returns:
        DataFrame with columns ``Item`` and ``Category``. The columns are
        present even when nothing was collected, so callers can always select
        ``df["Category"]``.

    Raises:
        Whatever ``fetch_off_page`` raises; request errors are not caught here.

    NOTE(review): the run output recorded in this notebook shows 100 products
    per page even though page_size=500 was requested, so the server may cap the
    page size — confirm before relying on ``pages * page_size`` rows.
    """
    name_fields = ("product_name_en", "product_name", "generic_name_en", "generic_name")
    rows = []
    seen = set()

    for page in range(1, pages + 1):
        data = fetch_off_page(country=country, page=page, page_size=page_size)
        products = data.get("products") or []
        if not products:
            # An empty page means we have run past the last result; further
            # requests would only add load on the API.
            print(f"[OFF] Page {page}/{pages}: no products returned, stopping early")
            break

        for p in products:
            # First non-empty name field wins; fall back to "" so clean_name
            # returns an empty string and the product is skipped below.
            name = next((p.get(field) for field in name_fields if p.get(field)), "")
            name_clean = clean_name(name)
            if len(name_clean) < 3 or name_clean in seen:
                continue
            seen.add(name_clean)

            manual_cat = map_off_to_category(p.get("categories_tags", []))
            rows.append({"Item": name_clean, "Category": normalise_category(manual_cat)})

        print(
            f"[OFF] Page {page}/{pages}: +{len(products)} products, "
            f"dataset size now {len(rows)}"
        )
        # Politeness delay between requests; pointless after the final page.
        if page < pages:
            time.sleep(sleep_s)

    # Explicit columns keep the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=["Item", "Category"])

In [25]:
# supermarket scraping helpers
# Desktop Chrome-on-macOS User-Agent string; the scraping helpers pass
# WEB_HEADERS as the `headers` argument of their requests.get calls.
_BROWSER_USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/123.0.0.0 Safari/537.36"
)
WEB_HEADERS = {"User-Agent": _BROWSER_USER_AGENT}


def scrape_titles(url: str,
                  category: str,
                  title_tags=("h3", "h2"),
                  sleep_s: float = 1.0) -> pd.DataFrame:
    """Scrape product titles from heading tags on a single page.

    Args:
        url: Page to download.
        category: Label for every scraped item; passed through
            ``normalise_category``.
        title_tags: Tag names searched for titles, processed in the given order.
        sleep_s: Pause in seconds after a successful fetch.

    Returns:
        DataFrame with columns ``Item`` and ``Category`` holding one row per
        distinct cleaned title (at least 3 characters long). The same
        two-column empty frame is returned when the request fails or when no
        title matches, so the schema never depends on the outcome.
    """
    columns = ["Item", "Category"]
    try:
        print(f"[WEB] Fetching {category}: {url}")
        r = requests.get(url, headers=WEB_HEADERS, timeout=30)
        r.raise_for_status()
    except HTTPError as e:
        print(f"[WEB] HTTP error for {url}: {e}")
        return pd.DataFrame(columns=columns)
    except RequestException as e:
        print(f"[WEB] Request error for {url}: {e}")
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(r.text, "html.parser")
    # The label is identical for every row, so normalise it once.
    label = normalise_category(category)
    rows = []
    seen = set()

    for tag_name in title_tags:
        for el in soup.find_all(tag_name):
            # Empty text cleans to "" and is rejected by the length check.
            name_clean = clean_name(el.get_text(strip=True))
            if len(name_clean) < 3 or name_clean in seen:
                continue
            seen.add(name_clean)
            rows.append({"Item": name_clean, "Category": label})

    time.sleep(sleep_s)
    # `seen` already guarantees unique items, so no drop_duplicates is needed;
    # explicit columns keep the frame well-formed when `rows` is empty.
    df = pd.DataFrame(rows, columns=columns)
    print(f"[WEB] {category}: collected {len(df)} items from {url}")
    return df

In [None]:
from requests.exceptions import HTTPError, RequestException

def scrape_titles_css(url: str, category: str, css_selector: str, sleep_s: float = 1.0) -> pd.DataFrame:
    """Scrape product titles from the elements matched by a CSS selector.

    Args:
        url: Page to download.
        category: Label for every scraped item; passed through
            ``normalise_category``.
        css_selector: Selector handed to ``BeautifulSoup.select`` to locate the
            title elements.
        sleep_s: Pause in seconds after a successful fetch.

    Returns:
        DataFrame with columns ``Item`` and ``Category`` holding one row per
        distinct cleaned title (at least 3 characters long). The same
        two-column empty frame is returned when the request fails or when the
        selector matches nothing.
    """
    columns = ["Item", "Category"]
    try:
        print(f"[WEB] Fetching {category}: {url}")
        r = requests.get(url, headers=WEB_HEADERS, timeout=30)
        r.raise_for_status()
    except HTTPError as e:
        print(f"[WEB] HTTP error for {url}: {e}")
        return pd.DataFrame(columns=columns)
    except RequestException as e:
        print(f"[WEB] Request error for {url}: {e}")
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(r.text, "html.parser")
    elements = soup.select(css_selector)

    # The label is identical for every row, so normalise it once.
    label = normalise_category(category)
    rows = []
    seen = set()

    for el in elements:
        # Empty text cleans to "" and is rejected by the length check.
        name_clean = clean_name(el.get_text(strip=True))
        if len(name_clean) < 3 or name_clean in seen:
            continue
        seen.add(name_clean)
        rows.append({"Item": name_clean, "Category": label})

    time.sleep(sleep_s)
    # `seen` already guarantees unique items; explicit columns keep the frame
    # well-formed when `rows` is empty.
    df = pd.DataFrame(rows, columns=columns)
    # Report matched elements and surviving unique items separately; the two
    # numbers differ when titles repeat or are too short.
    print(f"[WEB] {category}: collected {len(elements)} items from {url} ({len(df)} unique)")
    return df


In [56]:
def build_pets_at_home() -> pd.DataFrame:
    """Build the "Pet Supplies" dataset from Pets at Home listing pages.

    Each listing URL is scraped with ``scrape_titles_css``; the non-empty
    results are concatenated and de-duplicated on (Item, Category).

    Returns:
        DataFrame with columns ``Item`` and ``Category``; empty when no page
        yielded any item.
    """
    listing_urls = (
        "https://www.petsathome.com/product/listing/dog/dog-food",
        "https://www.petsathome.com/product/listing/cat/cat-food/dry-cat-food",
        "https://www.petsathome.com/product/listing/fish/fish-food",
        "https://www.petsathome.com/product/listing/bird-and-wildlife/wildbird-food",
        "https://www.petsathome.com/product/listing/small-animal/rabbit/rabbit-food-and-feeding-hay",
        "https://www.petsathome.com/product/listing/small-animal/hamster/hamster-food",
    )
    # product titles: <h3 class="product-info_title__2XVM2">
    title_selector = "h3.product-info_title__2XVM2"

    scraped = [
        scrape_titles_css(
            listing_url,
            category="Pet Supplies",
            css_selector=title_selector,
        )
        for listing_url in listing_urls
    ]
    frames = [frame for frame in scraped if not frame.empty]

    if len(frames) == 0:
        return pd.DataFrame(columns=["Item", "Category"])

    combined = pd.concat(frames, ignore_index=True)
    return combined.drop_duplicates(subset=["Item", "Category"]).reset_index(drop=True)






def build_eataly_food() -> pd.DataFrame:
    """Build food categories from Eataly listing pages.

    Covers "Pasta & Grains", "Condiments & Sauces" and "Canned Goods", one
    listing URL each. Every page is scraped with ``scrape_titles_css``; the
    non-empty results are concatenated and de-duplicated on (Item, Category).

    Returns:
        DataFrame with columns ``Item`` and ``Category``; empty when no page
        yielded any item.
    """
    sources = (
        ("Pasta & Grains", "https://www.eataly.com/us_en/nationwide-shipping/pasta"),
        ("Condiments & Sauces", "https://www.eataly.com/us_en/nationwide-shipping/pantry/salt-and-spices"),
        ("Canned Goods", "https://www.eataly.com/us_en/nationwide-shipping/pantry/canned-goods"),
    )
    # product titles: <div class="product-card-name" data-test-e2e="product-card-product-name">
    title_selector = 'div.product-card-name[data-test-e2e="product-card-product-name"]'

    scraped = [
        scrape_titles_css(
            page_url,
            category=label,
            css_selector=title_selector,
        )
        for label, page_url in sources
    ]
    frames = [frame for frame in scraped if not frame.empty]

    if len(frames) == 0:
        return pd.DataFrame(columns=["Item", "Category"])

    combined = pd.concat(frames, ignore_index=True)
    return combined.drop_duplicates(subset=["Item", "Category"]).reset_index(drop=True)


In [58]:

# Preview the first 30 items of two scraped categories.
# NOTE: `df` is only assigned by the "build final dataset" cell further down,
# so on a fresh kernel that cell has to run before this one.
print(df.loc[df["Category"] == "Pet Supplies", "Item"].head(30))
print(df.loc[df["Category"] == "Pasta & Grains", "Item"].head(30))

575    wainwright's sensitive adult dry dog food atla...
576    wainwright's complete adult dry dog food salmo...
577    wainwright's dry adult dog food beef with supe...
578    wainwright's complete adult dry dog food turke...
579    hill's science plan sensitive stomach & skin m...
580    hill's science plan adult medium breed dry dog...
581    hill's science plan perfect weight large breed...
582    hill's science plan perfect weight medium bree...
583    james wellbeloved hypoallergenic senior dry do...
584    james wellbeloved hypoallergenic adult dry dog...
585    james wellbeloved adult dry dog food chicken &...
586    james wellbeloved senior dry dog food lamb & rice
587            ava sensitive skin & stomach dry dog food
588          ava medium breed adult dry dog food chicken
589           ava large breed adult dry dog food chicken
590           ava small breed adult dry dog food chicken
591    pro plan medium everyday nutrition adult dry d...
592    pro plan sensitive skin 

In [59]:
# combine all scraped data
def build_web_dataset() -> pd.DataFrame:
    """Combine every web-scraped source into one de-duplicated frame.

    Sources, in call order: Pets at Home -> Pet Supplies, Boots -> Personal
    Care, Eataly -> Pasta & Grains, Condiments & Sauces, Canned Goods.

    Returns:
        DataFrame with columns ``Item`` and ``Category``; an empty frame (after
        printing a notice) when no source produced any rows.
    """
    # NOTE(review): build_boots_personal_care is not defined in any cell
    # visible in this notebook — confirm it exists before a fresh Run All.
    scraped = (
        build_pets_at_home(),
        build_boots_personal_care(),
        build_eataly_food(),
    )
    frames = [frame for frame in scraped if not frame.empty]

    if len(frames) == 0:
        print("[WEB] No web data collected.")
        return pd.DataFrame(columns=["Item", "Category"])

    merged = pd.concat(frames, ignore_index=True)
    return merged.drop_duplicates(subset=["Item", "Category"]).reset_index(drop=True)

In [60]:
# rebalance categories: cap each one at 600 rows
def rebalance_by_cap(df: pd.DataFrame, cap: int = 600) -> pd.DataFrame:
    """Down-sample every category to at most ``cap`` rows.

    Categories with more than ``cap`` rows are randomly sampled with a fixed
    seed (``random_state=42``) so the result is reproducible; smaller
    categories are kept whole. Rows whose ``Category`` is missing are dropped,
    because ``groupby`` skips NaN keys by default.

    Args:
        df: Frame with a ``Category`` column.
        cap: Maximum number of rows kept per category.

    Returns:
        A new frame with a fresh 0..n-1 index, grouped by category in sorted
        order. Empty input (or input with only missing categories) yields an
        empty frame with the same columns instead of raising.
    """
    # pd.concat raises "No objects to concatenate" on an empty list, so handle
    # the nothing-to-group cases explicitly.
    if df.empty:
        return df.reset_index(drop=True)

    groups = [
        g.sample(cap, random_state=42) if len(g) > cap else g
        for _, g in df.groupby("Category")
    ]
    if not groups:
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(groups, ignore_index=True)

In [61]:
# build final dataset
if __name__ == "__main__":
    # --- 1. Open Food Facts -------------------------------------------------
    df_off = build_off_dataset(country="united-kingdom", pages=10, page_size=500, sleep_s=1.0)
    # "Other" means no OFF tag matched; keep only confidently labelled rows.
    df_off = df_off.loc[df_off["Category"].ne("Other")]

    # --- 2. Web datasets (Pets at Home, Boots, Eataly) ----------------------
    df_web = build_web_dataset()

    # --- 3. Manually curated items ------------------------------------------
    manual_path = "data/extra_manual_items.csv"
    df_manual = pd.read_csv(manual_path)
    df_manual["Item"] = df_manual["Item"].map(clean_name)
    df_manual["Category"] = df_manual["Category"].map(normalise_category)

    # --- 4-6. Combine, filter, de-duplicate, cap ----------------------------
    # Categories actually used downstream ("Other" is deliberately excluded).
    keep = {
        "Produce", "Meat & Seafood", "Dairy & Eggs", "Bakery", "Pantry",
        "Frozen Foods", "Beverages", "Snacks", "Personal Care", "Household",
        "Pet Supplies", "Deli", "Condiments & Sauces", "Canned Goods", "Pasta & Grains",
    }
    df = (
        pd.concat([df_off, df_web, df_manual], ignore_index=True)
        .assign(Category=lambda frame: frame["Category"].map(normalise_category))
        .loc[lambda frame: frame["Category"].isin(keep)]
        .drop_duplicates(subset=["Item", "Category"])
        .reset_index(drop=True)
    )
    # Cap large categories so they don't dominate the dataset.
    df = rebalance_by_cap(df, cap=600)

    # --- 7. Save ------------------------------------------------------------
    out_path = "data/off_grocery_dataset.csv"  # same name as before
    df.to_csv(out_path, index=False)
    print("Saved:", out_path)
    print(df["Category"].value_counts())

[OFF] Page 1/10: +100 products, dataset size now 92
[OFF] Page 2/10: +100 products, dataset size now 185
[OFF] Page 3/10: +100 products, dataset size now 274
[OFF] Page 4/10: +100 products, dataset size now 360
[OFF] Page 5/10: +100 products, dataset size now 446
[OFF] Page 6/10: +100 products, dataset size now 533
[OFF] Page 7/10: +100 products, dataset size now 622
[OFF] Page 8/10: +100 products, dataset size now 711
[OFF] Page 9/10: +100 products, dataset size now 795
[OFF] Page 10/10: +100 products, dataset size now 878
[WEB] Fetching Pet Supplies: https://www.petsathome.com/product/listing/dog/dog-food
[WEB] Pet Supplies: collected 40 items from https://www.petsathome.com/product/listing/dog/dog-food (40 unique)
[WEB] Fetching Pet Supplies: https://www.petsathome.com/product/listing/cat/cat-food/dry-cat-food
[WEB] Pet Supplies: collected 36 items from https://www.petsathome.com/product/listing/cat/cat-food/dry-cat-food (36 unique)
[WEB] Fetching Pet Supplies: https://www.petsathom

In [62]:
# Number of distinct items per category, largest first
# (uses `df` built by the final-dataset cell above).
(
    df.groupby("Category")["Item"]
    .nunique()
    .sort_values(ascending=False)
)

Category
Pet Supplies           197
Snacks                 157
Bakery                 150
Condiments & Sauces     95
Dairy & Eggs            94
Beverages               80
Produce                 51
Canned Goods            43
Meat & Seafood          29
Pasta & Grains          29
Frozen Foods            28
Household               27
Name: Item, dtype: int64