In [1]:
import re
import time
import math
import requests
import pandas as pd
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError, RequestException

In [2]:
# regex + cleaning helpers

# A number (optionally decimal) followed by a unit / pack token,
# e.g. "500g", "1.5 l", "4 pack", "2 x".
unit_re = re.compile(
    r"\b(\d+(\.\d+)?\s?(g|kg|ml|l|cl|oz|lb|pack|pcs|pc|x))\b",
    re.IGNORECASE
)
multispace_re = re.compile(r"\s+")
# Everything except word characters, whitespace, "&", "'" and "-".
_punct_re = re.compile(r"[^\w\s&'-]")


def clean_name(s: str) -> str:
    """Normalise a product name so it looks like a shopping-list item.

    Steps, in order: trim and lower-case, turn underscores into spaces,
    remove quantity tokens matched by ``unit_re``, replace punctuation
    (except ``&``, ``'`` and ``-``) with spaces, collapse whitespace.

    Parameters
    ----------
    s : str
        Raw product name. Missing values (``None``, float NaN) and other
        falsy non-strings yield ``""``; any other non-string is converted
        with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned name, possibly empty.
    """
    if not isinstance(s, str):
        # pandas represents an empty CSV cell as float NaN, which is truthy,
        # so a plain `s or ""` would let it through and `.strip()` would fail.
        missing = not s or (isinstance(s, float) and math.isnan(s))
        s = "" if missing else str(s)

    s = s.strip().lower()
    s = s.replace("_", " ")
    s = unit_re.sub("", s)
    # remove most punctuation but keep &, ' and -
    s = _punct_re.sub(" ", s)
    return multispace_re.sub(" ", s).strip()


In [3]:
# categories to scrape
# Canonical category labels; every label produced by the notebook is one of these.
TARGET_CATEGORIES = [
    "Produce",
    "Meat & Seafood",
    "Dairy & Eggs",
    "Bakery",
    "Pantry",
    "Frozen Foods",
    "Beverages",
    "Snacks",
    "Personal Care",
    "Household",
    "Pet Supplies",
    "Deli",
    "Condiments & Sauces",
    "Canned Goods",
    "Pasta & Grains",
    "Other"
]

# Lower-cased label -> canonical label, derived once from TARGET_CATEGORIES so
# the two can never drift apart.
_CATEGORY_LOOKUP = {name.lower(): name for name in TARGET_CATEGORIES}


def normalise_category(cat):
    """Map a free-form category label onto one of ``TARGET_CATEGORIES``.

    Matching ignores case and surrounding whitespace.

    Parameters
    ----------
    cat : str or any
        Raw label. Anything that is not a string (``None``, float NaN from
        an empty CSV cell, numbers, ...) is treated as unknown.

    Returns
    -------
    str
        The canonical label, or ``"Other"`` when the input is empty,
        not a string, or not a known category.
    """
    if not isinstance(cat, str):
        # NaN is truthy, so a plain `if not cat` check would let it reach
        # `.strip()` and raise AttributeError.
        return "Other"

    return _CATEGORY_LOOKUP.get(cat.strip().lower(), "Other")


In [4]:
# open food facts (off) -> manual category mapping -- used ChatGPT to help generate this list
off_to_manual_category = {
    "produce": [
        "en:fruits", "en:vegetables", "en:fruit", "en:vegetable",
        "en:produce", "en:salads", "en:herbs"
    ],
    "dairy & eggs": [
        "en:dairies", "en:dairy", "en:milk-and-yogurt", "en:cheeses",
        "en:yogurts", "en:eggs", "en:butter", "en:cream"
    ],
    "meat & seafood": [
        "en:meats", "en:meat", "en:poultry",
        "en:sausages", "en:fish-and-seafood"
    ],
    "bakery": [
        "en:breads", "en:bread", "en:bakery-products",
        "en:cakes", "en:biscuits", "en:pastries"
    ],
    "snacks": [
        "en:snacks", "en:salty-snacks", "en:crisps",
        "en:chips", "en:snack-foods", "en:confectioneries"
    ],
    "beverages": [
        "en:beverages", "en:drinks", "en:soft-drinks",
        "en:juices", "en:teas", "en:coffees"
    ],
    "canned goods": [
        "en:canned-foods", "en:canned-vegetables",
        "en:canned-fruits", "en:canned-fish"
    ],
    "condiments & sauces": [
        "en:condiments", "en:sauces", "en:ketchups", "en:mustards",
        "en:mayonnaises", "en:salad-dressings"
    ],
    "pasta & grains": [
        "en:pasta", "en:rices", "en:cereals", "en:flours",
        "en:grains"
    ],
    "frozen foods": [
        "en:frozen-foods", "en:frozen", "en:ice-creams"
    ],
}


def map_off_to_category(off_tags):
    off_tags = set(off_tags or [])
    for manual_cat, off_cats in off_to_manual_category.items():
        if any(tag in off_tags for tag in off_cats):
            return manual_cat
    return "other"

In [5]:
# scrape products from Open Food Facts
def fetch_off_page(country: str, page: int, page_size: int = 1000):
    """Fetch one page of Open Food Facts search results.

    Parameters
    ----------
    country : str
        Value sent as the ``country`` query parameter.
    page : int
        Page number to request.
    page_size : int
        Number of products requested per page.

    Returns
    -------
    dict
        The decoded JSON payload. An empty dict is returned (after printing
        the reason) when the request fails, the status is not 200, or the
        body is not a JSON object, so callers can always use ``.get``.
    """
    url = "https://uk.openfoodfacts.org/cgi/search.pl"
    params = {
        "search_simple": 1,
        "action": "process",
        "json": 1,
        "page": page,
        "page_size": page_size,
        "fields": (
            "product_name,product_name_en,"
            "generic_name,generic_name_en,"
            "categories_tags,brands,quantity"
        ),
        "country": country,
    }

    try:
        r = requests.get(url, params=params, timeout=30)
    except RequestException as exc:
        # timeouts / connection errors must not abort a multi-page crawl
        print(f"[OFF] Page {page}: request failed ({exc})")
        return {}

    if r.status_code != 200:
        print(f"[OFF] Page {page}: unexpected HTTP status {r.status_code}")
        return {}

    try:
        payload = r.json()
    except ValueError as exc:
        # e.g. an HTML error page served with status 200
        print(f"[OFF] Page {page}: response was not valid JSON ({exc})")
        return {}

    return payload if isinstance(payload, dict) else {}


def build_off_dataset(country="united-kingdom", pages=10, page_size=500, sleep_s=1.0) -> pd.DataFrame:
    """Build an (Item, Category) table from Open Food Facts search pages.

    Pages ``1..pages`` are fetched with ``fetch_off_page``. For each product
    the first non-empty of ``product_name_en``, ``product_name``,
    ``generic_name_en``, ``generic_name`` is cleaned with ``clean_name``.
    Names shorter than 3 characters or already seen are skipped, so the
    first occurrence of a name decides its category.

    Parameters
    ----------
    country : str
        Passed through to ``fetch_off_page``.
    pages : int
        Number of pages to request.
    page_size : int
        Products requested per page.
    sleep_s : float
        Pause between consecutive page requests, in seconds.

    Returns
    -------
    pd.DataFrame
        Columns ``Item`` and ``Category``; the columns are present even
        when no product was collected.
    """
    rows = []
    seen = set()

    for page in range(1, pages + 1):
        data = fetch_off_page(country=country, page=page, page_size=page_size)
        # "products" can be missing (failed request) or null; both mean an empty page
        products = data.get("products") or []

        for p in products:
            name = (
                p.get("product_name_en")
                or p.get("product_name")
                or p.get("generic_name_en")
                or p.get("generic_name")
                or ""
            )
            name_clean = clean_name(name)
            if len(name_clean) < 3 or name_clean in seen:
                continue
            seen.add(name_clean)

            manual_cat = map_off_to_category(p.get("categories_tags", []))
            rows.append({"Item": name_clean, "Category": normalise_category(manual_cat)})

        print(
            f"[OFF] Page {page}/{pages}: +{len(products)} products, "
            f"dataset size now {len(rows)}"
        )
        # be polite between requests, but do not wait after the final page
        if page < pages:
            time.sleep(sleep_s)

    # explicit columns keep the schema stable when nothing was collected
    return pd.DataFrame(rows, columns=["Item", "Category"])

In [7]:
def scrape_titles_css(url, category, css_selector):
    """Scrape product titles from one page into an (Item, Category) table.

    Parameters
    ----------
    url : str
        Page to download.
    category : str
        Category label assigned to every item found (passed through
        ``normalise_category``).
    css_selector : str
        CSS selector matching the elements whose text is the product title.

    Returns
    -------
    pd.DataFrame
        De-duplicated rows with columns ``Item`` and ``Category``. The frame
        is empty (but still has both columns) when the download fails, the
        server answers with an error status, or nothing usable matches.
    """
    print("Fetching", category, "from", url)

    try:
        # timeout: without one a stalled server blocks the notebook forever
        response = requests.get(url, timeout=30)
        # turn 4xx/5xx answers into errors instead of parsing an error page
        response.raise_for_status()
    except RequestException as e:
        print("Error fetching page:", e)
        return pd.DataFrame(columns=["Item", "Category"])

    soup = BeautifulSoup(response.text, "html.parser")
    category_label = normalise_category(category)
    rows = []

    for el in soup.select(css_selector):
        # empty text cleans to "" and is rejected by the length check below
        name_clean = clean_name(el.get_text(strip=True))
        if len(name_clean) < 3:
            continue

        rows.append({"Item": name_clean, "Category": category_label})

    df = pd.DataFrame(rows, columns=["Item", "Category"])
    df = df.drop_duplicates(subset=["Item", "Category"]).reset_index(drop=True)

    print("Collected", len(df), "items for", category)
    return df


In [13]:
def build_pets_at_home():
    """Build the "Pet Supplies" table from Pets at Home listing pages.

    Each listing URL is scraped with ``scrape_titles_css``; product titles
    are read from ``<h3 class="product-info_title__2XVM2">`` elements.
    Non-empty results are concatenated and de-duplicated on
    (Item, Category). Returns an empty frame with columns ``Item`` and
    ``Category`` when no page yielded anything.
    """
    listing_urls = (
        "https://www.petsathome.com/product/listing/dog/dog-food",
        "https://www.petsathome.com/product/listing/cat/cat-food/dry-cat-food",
        "https://www.petsathome.com/product/listing/fish/fish-food",
        "https://www.petsathome.com/product/listing/bird-and-wildlife/wildbird-food",
        "https://www.petsathome.com/product/listing/small-animal/rabbit/rabbit-food-and-feeding-hay",
        "https://www.petsathome.com/product/listing/small-animal/hamster/hamster-food",
    )

    # pages are scraped lazily, one after another, in the order listed above
    scraped = (
        scrape_titles_css(listing, "Pet Supplies", "h3.product-info_title__2XVM2")
        for listing in listing_urls
    )
    frames = [frame for frame in scraped if not frame.empty]

    if not frames:
        return pd.DataFrame(columns=["Item", "Category"])

    combined = pd.concat(frames, ignore_index=True)
    return combined.drop_duplicates(subset=["Item", "Category"]).reset_index(drop=True)




def build_eataly_food():
    """Build food-category tables from Eataly listing pages.

    One page is scraped per category (Pasta & Grains, Condiments & Sauces,
    Canned Goods) with ``scrape_titles_css``, reading titles from
    ``div.product-card-name span`` elements. Non-empty results are
    concatenated and de-duplicated on (Item, Category). Returns an empty
    frame with columns ``Item`` and ``Category`` when nothing was scraped.
    """
    category_pages = [
        ("Pasta & Grains", "https://www.eataly.com/us_en/nationwide-shipping/pasta"),
        ("Condiments & Sauces", "https://www.eataly.com/us_en/nationwide-shipping/pantry/salt-and-spices"),
        ("Canned Goods", "https://www.eataly.com/us_en/nationwide-shipping/pantry/canned-goods"),
    ]

    frames = []
    for label, page_url in category_pages:
        scraped = scrape_titles_css(page_url, label, 'div.product-card-name span')
        if scraped.empty:
            continue
        frames.append(scraped)

    if not frames:
        return pd.DataFrame(columns=["Item", "Category"])

    combined = pd.concat(frames, ignore_index=True)
    return combined.drop_duplicates(subset=["Item", "Category"]).reset_index(drop=True)


In [15]:
# build the datasets
# Each builder below issues live HTTP requests, so this cell is slow and its
# results depend on what the remote sites return at run time.
pets_df = build_pets_at_home()
eataly_df = build_eataly_food()
# default arguments: 10 pages of 500 products, with a 1 s pause between pages
off_df = build_off_dataset()


Fetching Pet Supplies from https://www.petsathome.com/product/listing/dog/dog-food
Collected 40 items for Pet Supplies
Fetching Pet Supplies from https://www.petsathome.com/product/listing/cat/cat-food/dry-cat-food
Collected 36 items for Pet Supplies
Fetching Pet Supplies from https://www.petsathome.com/product/listing/fish/fish-food
Collected 38 items for Pet Supplies
Fetching Pet Supplies from https://www.petsathome.com/product/listing/bird-and-wildlife/wildbird-food
Collected 37 items for Pet Supplies
Fetching Pet Supplies from https://www.petsathome.com/product/listing/small-animal/rabbit/rabbit-food-and-feeding-hay
Collected 36 items for Pet Supplies
Fetching Pet Supplies from https://www.petsathome.com/product/listing/small-animal/hamster/hamster-food
Collected 12 items for Pet Supplies
Fetching Pasta & Grains from https://www.eataly.com/us_en/nationwide-shipping/pasta
Collected 0 items for Pasta & Grains
Fetching Condiments & Sauces from https://www.eataly.com/us_en/nationwide-s

In [20]:
# Stack the three scraped frames, re-normalise the category labels and drop
# repeated (Item, Category) pairs, all in one pipeline.
scraped_df = (
    pd.concat([pets_df, eataly_df, off_df], ignore_index=True)
    .assign(Category=lambda frame: frame["Category"].apply(normalise_category))
    .drop_duplicates(subset=["Item", "Category"])
    .reset_index(drop=True)
)

scraped_df

Unnamed: 0,Item,Category
0,wainwright's sensitive adult dry dog food atla...,Pet Supplies
1,wainwright's complete adult dry dog food salmo...,Pet Supplies
2,wainwright's complete adult dry dog food turke...,Pet Supplies
3,wainwright's dry adult dog food beef with supe...,Pet Supplies
4,hill's science plan sensitive stomach & skin m...,Pet Supplies
...,...,...
1069,barista coconut,Beverages
1070,grissini breadsticks,Bakery
1071,unsalted roasted nuts,Other
1072,salted microwave popcorn,Snacks


In [28]:
# build final dataset and save - combine scraped dataset with data.csv and extra_manual_items.csv


def _load_labelled_items(csv_path):
    """Read a CSV with ``Item`` and ``Category`` columns and clean both.

    Rows without an ``Item`` are dropped and missing ``Category`` cells are
    replaced by an empty string first, so ``clean_name`` and
    ``normalise_category`` always receive strings rather than NaN.
    """
    frame = pd.read_csv(csv_path).dropna(subset=["Item"])
    return frame.assign(
        Item=frame["Item"].astype(str).apply(clean_name),
        Category=frame["Category"].fillna("").apply(normalise_category),
    )


# load + clean the existing CSV datasets
data_df = _load_labelled_items("data/data.csv")
manual_df = _load_labelled_items("data/extra_manual_items.csv")

# combine scraped data with CSV data
df = pd.concat([scraped_df, data_df, manual_df], ignore_index=True)

# clean categories one last time
df["Category"] = df["Category"].apply(normalise_category)

# a name can become empty after cleaning (e.g. it was only a quantity); such rows carry no information
df = df[df["Item"].notna() & (df["Item"] != "")]

# remove duplicates
df = df.drop_duplicates(subset=["Item", "Category"]).reset_index(drop=True)

# save final dataset
df.to_csv("data/final_grocery_dataset.csv", index=False)

print(df["Category"].value_counts())

Category
Other                  269
Pet Supplies           212
Snacks                 171
Bakery                 166
Dairy & Eggs           108
Beverages               93
Condiments & Sauces     89
Produce                 62
Meat & Seafood          43
Frozen Foods            36
Household               35
Canned Goods            32
Pasta & Grains          21
Pantry                  18
Personal Care           16
Deli                    13
Name: count, dtype: int64


In [22]:
# Number of distinct items in each category, largest category first.
# (df["Category"].value_counts() gives the per-category row counts for comparison.)
df.groupby("Category")["Item"].nunique().sort_values(ascending=False)

Category
Other                  269
Pet Supplies           212
Snacks                 171
Bakery                 166
Dairy & Eggs           108
Beverages               93
Condiments & Sauces     89
Produce                 62
Meat & Seafood          43
Frozen Foods            36
Household               35
Canned Goods            32
Pasta & Grains          21
Pantry                  18
Personal Care           16
Deli                    13
Name: Item, dtype: int64