In [None]:
import re
import time
import requests
import pandas as pd

In [27]:
# regex + cleaning helpers
unit_re = re.compile(
    r"\b(\d+(\.\d+)?\s?(g|kg|ml|l|cl|oz|lb|pack|pcs|pc|x))\b",
    re.IGNORECASE
)
multispace_re = re.compile(r"\s+")

def clean_name(s: str) -> str:
    s = (s or "").strip().lower()
    s = s.replace("_", " ")
    s = unit_re.sub("", s)
    s = re.sub(r"[^\w\s&'-]", " ", s)
    s = multispace_re.sub(" ", s).strip()
    return s

In [28]:
# categories to scrape
TARGET_CATEGORIES = [
    "Produce",
    "Meat & Seafood",
    "Dairy & Eggs",
    "Bakery",
    "Pantry",
    "Frozen Foods",
    "Beverages",
    "Snacks",
    "Personal Care",
    "Household",
    "Pet Supplies",
    "Deli",
    "Condiments & Sauces",
    "Canned Goods",
    "Pasta & Grains",
    "Other"
]


In [29]:
# open food facts (off) -> manual category mapping
off_to_manual_category = {
    "produce": [
        "en:fruits", "en:vegetables", "en:fruit", "en:vegetable",
        "en:produce", "en:salads", "en:herbs"
    ],
    "dairy & eggs": [
        "en:dairies", "en:dairy", "en:milk-and-yogurt", "en:cheeses",
        "en:yogurts", "en:eggs", "en:butter", "en:cream"
    ],
    "meat & seafood": [
        "en:meats", "en:meat", "en:poultry",
        "en:sausages", "en:fish-and-seafood"
    ],
    "bakery": [
        "en:breads", "en:bread", "en:bakery-products",
        "en:pastries", "en:cakes", "en:biscuits"
    ],
    "beverages": [
        "en:beverages", "en:drinks", "en:soft-drinks",
        "en:juices", "en:teas", "en:coffees", "en:waters"
    ],
    "snacks": [
        "en:snacks", "en:chocolates", "en:confectioneries",
        "en:crisps", "en:nuts", "en:snack-bars"
    ],
    "pantry": [
        "en:groceries", "en:rice", "en:pasta", "en:cereals",
        "en:flours", "en:sugars", "en:oils", "en:vinegars",
        "en:spices", "en:condiments"
    ],
    "frozen foods": [
        "en:frozen-foods", "en:frozen", "en:ice-creams"
    ]
}


def map_off_to_category(off_tags):
    off_tags = set(off_tags or [])
    for manual_cat, off_cats in off_to_manual_category.items():
        if any(tag in off_tags for tag in off_cats):
            return manual_cat
    return "other"

In [30]:
# scrape products from Open Food Facts
def fetch_off_page(country: str, page: int, page_size: int = 1000):
    # only requesting fields I need to reduce payload size
    url = "https://uk.openfoodfacts.org/cgi/search.pl"
    params = {
        "search_simple": 1,
        "action": "process",
        "json": 1,
        "page": page,
        "page_size": page_size,
        "fields": "product_name,product_name_en,generic_name,generic_name_en,categories_tags,brands,quantity",
        "country": country,
    }
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    return r.json()

def build_dataset(country="united-kingdom", pages=5, page_size=500, sleep_s=1.0):
    rows = []
    seen = set()

    for page in range(1, pages + 1):
        data = fetch_off_page(country=country, page=page, page_size=page_size)
        products = data.get("products", [])

        for p in products:
            name = p.get("product_name_en") or p.get("product_name") or p.get("generic_name_en") or p.get("generic_name") or ""
            name_clean = clean_name(name)

            if len(name_clean) < 3:
                continue

            if name_clean in seen:
                continue
            seen.add(name_clean)

            cat = map_to_category(p.get("categories_tags", []), name_clean)
            rows.append({"Item": name_clean, "Category": cat})

        print(f"Page {page}/{pages}: +{len(products)} products, dataset size now {len(rows)}")
        time.sleep(sleep_s)

    return pd.DataFrame(rows)


In [31]:
# run and save
if __name__ == "__main__":
    df = build_dataset(country="united-kingdom", pages=10, page_size=500, sleep_s=1.0)

    # drop "Other" to only take confident labels
    df = df[df["Category"] != "Other"]

    # keep categories I already use
    keep = {
        "Produce", "Meat & Seafood", "Dairy & Eggs", "Bakery", "Pantry",
        "Frozen Foods", "Beverages", "Snacks", "Personal Care", "Household",
        "Pet Supplies", "Deli", "Condiments & Sauces", "Canned Goods", "Pasta & Grains"
    }
    df = df[df["Category"].isin(keep)]

    # cap each category to 600 examples so training isn't skewed
    CAP = 600
    df = df.groupby("Category", group_keys=False).apply(lambda g: g.sample(min(len(g), CAP), random_state=42))

    df.to_csv("data/off_grocery_dataset.csv", index=False)
    print("Saved:", "off_grocery_dataset.csv")
    print(df["Category"].value_counts())

Page 1/10: +100 products, dataset size now 91
Page 2/10: +100 products, dataset size now 183
Page 3/10: +100 products, dataset size now 271
Page 4/10: +100 products, dataset size now 356
Page 5/10: +100 products, dataset size now 444
Page 6/10: +100 products, dataset size now 532
Page 7/10: +100 products, dataset size now 620
Page 8/10: +100 products, dataset size now 709
Page 9/10: +100 products, dataset size now 791
Page 10/10: +100 products, dataset size now 879
Saved: off_grocery_dataset.csv
Category
Snacks            171
Bakery            153
Dairy & Eggs      108
Pantry             86
Beverages          82
Produce            25
Meat & Seafood     16
Canned Goods       13
Frozen Foods       10
Pasta & Grains      4
Name: count, dtype: int64


  df = df.groupby("Category", group_keys=False).apply(lambda g: g.sample(min(len(g), CAP), random_state=42))


In [32]:
df = pd.read_csv("data/off_grocery_dataset.csv")
print(df["Category"].value_counts())

Category
Snacks            171
Bakery            153
Dairy & Eggs      108
Pantry             86
Beverages          82
Produce            25
Meat & Seafood     16
Canned Goods       13
Frozen Foods       10
Pasta & Grains      4
Name: count, dtype: int64


In [33]:
counts = df["Category"].value_counts()
print((counts / counts.sum()).round(3))

Category
Snacks            0.256
Bakery            0.229
Dairy & Eggs      0.162
Pantry            0.129
Beverages         0.123
Produce           0.037
Meat & Seafood    0.024
Canned Goods      0.019
Frozen Foods      0.015
Pasta & Grains    0.006
Name: count, dtype: float64
