<a href="https://colab.research.google.com/github/gatesz33/asian_recipe_classifier_ds4002/blob/main/ingredients.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import json
import xml.etree.ElementTree as ET

import pandas as pd
import requests
from bs4 import BeautifulSoup

# --- Target country cuisines ---
# Lower-case category names that scrape_all() keeps; any other category found
# in the sitemap is ignored. Kept in alphabetical order for easy scanning.
TARGET_CUISINES = {
    "chinese",
    "filipino",
    "indian",
    "japanese",
    "korean",
    "malaysian",
    "thai",
    "vietnamese",
}

# --- Parse recipe JSON-LD (ingredients only) ---
def _iter_jsonld_nodes(data):
    """Yield every dict found in a decoded JSON-LD payload, in document order.

    Descends into lists and into "@graph" containers; values that are neither
    a dict nor a list are skipped.
    """
    pending = [data]
    while pending:
        node = pending.pop()
        if isinstance(node, list):
            # Reverse so that pop() hands the entries back in original order.
            pending.extend(reversed(node))
        elif isinstance(node, dict):
            yield node
            graph = node.get("@graph")
            if isinstance(graph, (list, dict)):
                pending.append(graph)


def _is_recipe_node(node):
    """Return True when a JSON-LD node's "@type" is, or includes, "Recipe"."""
    node_type = node.get("@type")
    if isinstance(node_type, list):
        return "Recipe" in node_type
    return node_type == "Recipe"


def parse_ingredients_jsonld(url, timeout=30):
    """Fetch a recipe page and return the ingredient list from its JSON-LD.

    Every <script type="application/ld+json"> tag is inspected in page order.
    The first node whose "@type" is (or contains) "Recipe" wins, whether it
    sits at the top level, inside a list, or inside an "@graph" container.

    Args:
        url: Address of the recipe page.
        timeout: Seconds passed to requests.get so that a stalled server
            cannot hang the notebook indefinitely.

    Returns:
        The recipe's "recipeIngredient" value as a list. A single string is
        wrapped in a one-element list. An empty list is returned when the page
        has no usable Recipe node or the value is missing or not a list.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
    """
    resp = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(resp.text, "lxml")

    for script in soup.find_all("script", type="application/ld+json"):
        raw = script.string
        if not raw:
            # Empty tag, or a tag with nested markup that has no single string.
            continue
        try:
            data = json.loads(raw)
        except ValueError:
            # Malformed JSON in one tag must not hide a valid tag further down.
            continue

        for node in _iter_jsonld_nodes(data):
            if not _is_recipe_node(node):
                continue
            ingredients = node.get("recipeIngredient", [])
            if isinstance(ingredients, str):
                # A bare string would otherwise be extended character by
                # character by callers that use list.extend().
                return [ingredients]
            return ingredients if isinstance(ingredients, list) else []
    return []

# --- Get recipe links from category feed ---
def get_recipe_links_feed(feed_url, limit=20, timeout=30):
    """Return up to `limit` recipe URLs listed in an RSS category feed.

    Args:
        feed_url: Address of the RSS feed.
        limit: Maximum number of links to return (expected to be
            non-negative). None means every link in the feed.
        timeout: Seconds passed to requests.get so that a stalled server
            cannot hang the notebook indefinitely.

    Returns:
        Whitespace-stripped <link> texts of the feed's <item> elements, in
        feed order. An empty list is returned when the response status is not
        200 or the body is not well-formed XML.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
    """
    resp = requests.get(feed_url, timeout=timeout)
    if resp.status_code != 200:
        return []
    try:
        root = ET.fromstring(resp.content)
    except ET.ParseError:
        # Treat an unparseable body (e.g. an HTML error page served with
        # status 200) the same way as a failed status: no links.
        return []

    links = []
    for item in root.findall("./channel/item"):
        # Apply the limit to collected links rather than to raw items, so an
        # item without a usable <link> does not use up the quota.
        if limit is not None and len(links) >= limit:
            break
        link_text = (item.findtext("link") or "").strip()
        if link_text:
            links.append(link_text)
    return links

# --- Get all category feed URLs from sitemap (with namespace fix) ---
CATEGORY_SITEMAP_URL = "https://tasteasianfood.com/category-sitemap.xml"


def get_all_category_feeds(sitemap_url=CATEGORY_SITEMAP_URL, timeout=30):
    """Map each category name in the sitemap to that category's feed URL.

    Args:
        sitemap_url: Address of the category sitemap XML.
        timeout: Seconds passed to requests.get so that a stalled server
            cannot hang the notebook indefinitely.

    Returns:
        A dict of lower-cased category name (the last path segment of the
        category URL) to "<category URL>/feed/". Only <loc> entries containing
        "/category/" followed by a name are included. When two URLs end in the
        same segment, the later one wins.

    Raises:
        requests.RequestException: if the request fails, times out, or
            returns an error status (via raise_for_status).
        xml.etree.ElementTree.ParseError: if the body is not well-formed XML.
    """
    resp = requests.get(sitemap_url, timeout=timeout)
    resp.raise_for_status()
    root = ET.fromstring(resp.content)

    # <loc> lives in the sitemap namespace; un-namespaced lookups find nothing.
    ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    feeds = {}
    for loc in root.findall(".//sm:loc", ns):
        # Normalise away the trailing slash so the name and feed URL are
        # built the same way whether or not the sitemap includes it; an empty
        # <loc> (text is None) becomes "" and is skipped below.
        cat_url = (loc.text or "").strip().rstrip("/")
        if "/category/" not in cat_url:
            continue
        cat_name = cat_url.rsplit("/", 1)[-1].lower()   # normalize name
        feeds[cat_name] = cat_url + "/feed/"
    return feeds

# --- Scrape only target country categories ---
def scrape_all(limit_per_cat=20):
    """Collect recipe ingredients for every target cuisine found on the site.

    Args:
        limit_per_cat: Maximum number of recipe links to read from each
            cuisine's feed.

    Returns:
        A dict mapping cuisine name to one flat list of the ingredient
        entries from all of that cuisine's scraped recipes. A cuisine whose
        feed cannot be read maps to an empty list. A cuisine in
        TARGET_CUISINES that has no category on the site is absent from the
        dict and is reported on stdout.

    Raises:
        Whatever get_all_category_feeds() raises; without the category list
        there is nothing to scrape. Failures on individual feeds or recipe
        pages are reported on stdout and skipped.
    """
    feeds_by_category = get_all_category_feeds()
    # filter to only country cuisines
    categories = {
        name: feed
        for name, feed in feeds_by_category.items()
        if name in TARGET_CUISINES
    }

    # Make silently-missing classes visible: they would otherwise just be
    # absent from the resulting dataset.
    missing = sorted(name for name in TARGET_CUISINES if name not in categories)
    if missing:
        print(f"No category feed found for: {', '.join(missing)}")

    all_ingredients = {}
    for name, feed_url in categories.items():
        print(f"Scraping {name}...")
        try:
            links = get_recipe_links_feed(feed_url, limit=limit_per_cat)
        except (requests.RequestException, ET.ParseError) as exc:
            print(f"  Skipping {name}: could not read feed ({exc})")
            links = []

        cat_ingredients = []
        for link in links:
            try:
                ingredients = parse_ingredients_jsonld(link)
            except requests.RequestException as exc:
                # One unreachable page must not discard everything that has
                # already been collected.
                print(f"  Skipping {link}: {exc}")
                continue
            if ingredients:
                cat_ingredients.extend(ingredients)
        all_ingredients[name] = cat_ingredients
    return all_ingredients

# --- Run ---
# Scrape with a cap of 20 feed links per cuisine, then persist the mapping as
# indented UTF-8 JSON (non-ASCII characters are written as-is, not escaped).
all_ingredients = scrape_all(limit_per_cat=20)

with open("asian_ingredients_by_country.json", mode="w", encoding="utf-8") as f:
    f.write(json.dumps(all_ingredients, ensure_ascii=False, indent=2))

print("✅ Done! Saved to asian_ingredients_by_country.json")


Scraping chinese...
Scraping indian...
Scraping japanese...
Scraping korean...
Scraping malaysian...
Scraping thai...
Scraping vietnamese...
✅ Done! Saved to asian_ingredients_by_country.json


In [29]:
import os

# Show the contents of the current working directory (e.g. to check that the
# exported file is there).
os.listdir(os.curdir)


['.config',
 'asian_recipes.csv',
 'asian_ingredients_by_country.json',
 'asian_ingredients.json',
 'asian_recipes.json',
 'sample_data']

In [30]:
# Offer the JSON export as a browser download. google.colab only exists inside
# Google Colab, so skip the download elsewhere instead of failing the run.
try:
    from google.colab import files
except ImportError:
    print("Not running in Colab; asian_ingredients_by_country.json was left in the working directory.")
else:
    files.download("asian_ingredients_by_country.json")




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [31]:
# Save CSV (flattened): one row per (country, ingredient) pair.
rows = [
    {"country": country, "ingredient": ing}
    for country, ingredients in all_ingredients.items()
    for ing in ingredients
]

# Explicit columns keep the header row in the CSV even when nothing was
# scraped and `rows` is empty.
df = pd.DataFrame(rows, columns=["country", "ingredient"])
df.to_csv("asian_ingredients_by_country.csv", index=False)

print("✅ Done! Saved to asian_ingredients_by_country.json and asian_ingredients_by_country.csv")

✅ Done! Saved to asian_ingredients_by_country.json and asian_ingredients_by_country.csv


In [32]:
# Offer the CSV export as a browser download. google.colab only exists inside
# Google Colab, so skip the download elsewhere instead of failing the run.
try:
    from google.colab import files
except ImportError:
    print("Not running in Colab; asian_ingredients_by_country.csv was left in the working directory.")
else:
    files.download("asian_ingredients_by_country.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>