<a href="https://colab.research.google.com/github/katrinag2004/asian_recipe_classifier_ds4002/blob/main/ingredients.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import json
import pandas as pd

# --- Target country cuisines ---
# Compared against the lower-cased category slugs taken from the site's
# category sitemap, so every entry must be spelled exactly as the site spells
# its slug. The run output prints "Scraping philipines...", i.e. that spelling
# matches a real category, so it is intentionally left as-is.
TARGET_CUISINES = {
    "chinese",
    "thai",
    "indian",
    "malaysian",
    "japanese",
    "philipines",
    "vietnamese",
    "indonesian"
}

# --- Parse recipe JSON-LD (title + ingredients + instructions) ---
def _find_recipe_node(data):
    """Return the first dict typed as ``Recipe`` in a decoded JSON-LD payload, else None."""
    pending = [data]
    while pending:
        node = pending.pop(0)
        if isinstance(node, list):
            # Breadth-first: top-level entries are checked before nested graphs.
            pending.extend(node)
        elif isinstance(node, dict):
            node_type = node.get("@type")
            # JSON-LD allows "@type" to be a single string or a list of strings.
            types = node_type if isinstance(node_type, list) else [node_type]
            if "Recipe" in types:
                return node
            graph = node.get("@graph")
            if isinstance(graph, list):
                pending.extend(graph)
    return None


def parse_recipe_jsonld(url):
    """Fetch a recipe page and extract its schema.org ``Recipe`` JSON-LD data.

    Every ``<script type="application/ld+json">`` tag is inspected in document
    order and the first node whose ``@type`` is (or includes) ``"Recipe"`` is
    used. The payload may be a single object, a list of objects, or an object
    that nests its nodes under an ``@graph`` key.

    Args:
        url: Address of the recipe page to download.

    Returns:
        A dict with the keys ``title``, ``url``, ``ingredients`` and
        ``instructions``, or ``None`` when the page cannot be fetched or
        contains no Recipe node.
    """
    try:
        # A timeout keeps one stalled server from hanging the whole scrape.
        resp = requests.get(url, timeout=30)
    except requests.RequestException:
        return None
    soup = BeautifulSoup(resp.text, "lxml")

    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string)
        except (TypeError, ValueError):
            # Empty tag (``script.string`` is None) or malformed JSON: try the next tag.
            continue

        recipe = _find_recipe_node(data)
        if recipe is not None:
            return {
                "title": recipe.get("name"),
                "url": url,
                "ingredients": recipe.get("recipeIngredient", []),
                "instructions": recipe.get("recipeInstructions", []),
            }
    return None

# --- Get recipe links from category feed ---
def get_recipe_links_feed(feed_url, limit=20):
    """Return up to ``limit`` recipe URLs listed in a category RSS feed.

    Args:
        feed_url: URL of the RSS feed to download.
        limit: Maximum number of ``<item>`` entries read from the feed.

    Returns:
        A list of whitespace-stripped ``<link>`` texts in feed order. The list
        is empty when the request fails, the status is not 200, or the body is
        not well-formed XML.
    """
    try:
        # A timeout keeps one stalled server from hanging the whole scrape.
        resp = requests.get(feed_url, timeout=30)
    except requests.RequestException:
        return []
    if resp.status_code != 200:
        return []
    try:
        root = ET.fromstring(resp.content)
    except ET.ParseError:
        # Treat an unreadable feed like a missing one instead of aborting the run.
        return []

    links = []
    for item in root.findall("./channel/item")[:limit]:
        link_text = (item.findtext("link") or "").strip()
        if link_text:  # skip items with a missing or blank <link>
            links.append(link_text)
    return links

# --- Get all category feed URLs from sitemap (with namespace fix) ---
def get_all_category_feeds():
    """Map each category slug in the site's category sitemap to its RSS feed URL.

    Downloads ``category-sitemap.xml`` from tasteasianfood.com, reads every
    ``<loc>`` element in the sitemaps.org namespace and keeps the URLs that
    contain ``/category/``.

    Returns:
        A dict whose keys are the lower-cased last path segment of each
        category URL and whose values are that URL followed by ``feed/``.

    Raises:
        Whatever ``resp.raise_for_status()`` raises for an error status, and
        ``xml.etree.ElementTree.ParseError`` if the sitemap is not valid XML.
    """
    sitemap_url = "https://tasteasianfood.com/category-sitemap.xml"
    # A timeout keeps a stalled server from hanging the notebook indefinitely.
    resp = requests.get(sitemap_url, timeout=30)
    resp.raise_for_status()
    root = ET.fromstring(resp.content)

    # Sitemap elements live in this namespace; un-prefixed lookups find nothing.
    ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    feeds = {}
    for loc in root.findall(".//sm:loc", ns):
        cat_url = (loc.text or "").strip()  # an empty <loc/> has text None
        if "/category/" not in cat_url:
            continue
        # Normalise the trailing slash so the slug and the feed URL come out
        # the same whether or not the sitemap entry ends with "/".
        base = cat_url.rstrip("/")
        slug = base.rsplit("/", 1)[-1].lower()
        feeds[slug] = base + "/feed/"
    return feeds

# --- Scrape only target country categories ---
def scrape_all(limit_per_cat=20):
    """Scrape the recipes of every target cuisine.

    Args:
        limit_per_cat: Passed as ``limit`` to ``get_recipe_links_feed`` for
            each category.

    Returns:
        Dict mapping each category name found in ``TARGET_CUISINES`` to the
        list of recipe dicts that ``parse_recipe_jsonld`` returned for its
        links (falsy results are dropped).
    """
    results = {}
    for cuisine, feed in get_all_category_feeds().items():
        if cuisine not in TARGET_CUISINES:
            continue
        print(f"Scraping {cuisine}...")
        # The link list is fetched up front; each page is then parsed in turn.
        parsed = (
            parse_recipe_jsonld(page_url)
            for page_url in get_recipe_links_feed(feed, limit=limit_per_cat)
        )
        results[cuisine] = [found for found in parsed if found]
    return results

# --- Run ---
# Scrape with a cap of 30 feed items per target cuisine.
all_recipes = scrape_all(limit_per_cat=30)

# Save JSON (by country → by recipe → ingredients)
with open("asian_recipes_by_country.json", "w", encoding="utf-8") as f:
    json.dump(all_recipes, f, ensure_ascii=False, indent=2)

# Save CSV (flattened: one row per ingredient per recipe).
# The recipe's instructions value is repeated unchanged on every ingredient row.
rows = [
    {
        "country": country,
        "recipe_title": recipe["title"],
        "recipe_url": recipe["url"],
        "ingredient": ingredient,
        "instructions": recipe["instructions"],
    }
    for country, recipes in all_recipes.items()
    for recipe in recipes
    for ingredient in recipe["ingredients"]
]

df = pd.DataFrame(rows)
df.to_csv("asian_recipes_by_country.csv", index=False)

print("✅ Done! Saved to asian_recipes_by_country.json and asian_recipes_by_country.csv")

Scraping chinese...
Scraping indian...
Scraping indonesian...
Scraping japanese...
Scraping malaysian...
Scraping philipines...
Scraping thai...
Scraping vietnamese...
✅ Done! Saved to asian_recipes_by_country.json and asian_recipes_by_country.csv


In [20]:
# --- Get ALL recipe links from category feed, with pagination ---
def get_recipe_links_feed(feed_url, max_pages=10, max_links=None):
    """Collect recipe URLs from a paginated category RSS feed.

    Pages are requested as ``{feed_url}?paged=1``, ``?paged=2``, ... Scanning
    stops at the first page that cannot be downloaded, returns a non-200
    status, is not well-formed XML, or contains no ``<item>`` elements; links
    gathered from earlier pages are still returned.

    Args:
        feed_url: Base URL of the feed; the ``paged`` query is appended to it.
        max_pages: Upper bound on the number of pages requested.
        max_links: Return as soon as this many links have been collected.
            A falsy value (``None`` or ``0``) means no cap.

    Returns:
        List of whitespace-stripped ``<link>`` texts in feed order.
    """
    links = []

    for page in range(1, max_pages + 1):
        url = f"{feed_url}?paged={page}"
        try:
            # A timeout keeps one stalled page from hanging the whole scrape.
            resp = requests.get(url, timeout=30)
        except requests.RequestException:
            break
        if resp.status_code != 200:
            break

        try:
            root = ET.fromstring(resp.content)
        except ET.ParseError:
            # Keep what was collected so far rather than losing it to one bad page.
            break
        items = root.findall("./channel/item")
        if not items:
            break  # no more pages

        for item in items:
            link_text = (item.findtext("link") or "").strip()
            if not link_text:  # missing or blank <link>
                continue
            links.append(link_text)
            if max_links and len(links) >= max_links:
                return links

    return links


# --- Scrape only target categories, with pagination ---
def scrape_all(max_pages=10, limit_per_cat=None):
    """Scrape the recipes of every target cuisine, following feed pagination.

    Args:
        max_pages: Passed as ``max_pages`` to ``get_recipe_links_feed``.
        limit_per_cat: Passed as ``max_links`` to ``get_recipe_links_feed``.

    Returns:
        Dict mapping each category name found in ``TARGET_CUISINES`` to the
        list of truthy results ``parse_recipe_jsonld`` produced for its links.
    """
    wanted = {
        slug: feed
        for slug, feed in get_all_category_feeds().items()
        if slug in TARGET_CUISINES
    }

    scraped = {}
    for slug in wanted:
        print(f"Scraping {slug}...")
        page_links = get_recipe_links_feed(
            wanted[slug], max_pages=max_pages, max_links=limit_per_cat
        )
        # filter(None, ...) drops pages for which no recipe was found.
        scraped[slug] = list(filter(None, map(parse_recipe_jsonld, page_links)))
    return scraped


# --- Run full scrape (10 pages per category) ---
# Scrapes again over the network and rebinds `all_recipes`, replacing the result
# of the earlier `scrape_all(limit_per_cat=30)` run. `limit_per_cat=None` means
# no cap on links per category; only `max_pages` bounds the crawl.
all_recipes = scrape_all(max_pages=10, limit_per_cat=None)

import re

# --- Ingredient cleaner (remove measurements, units, etc.) ---
def clean_ingredient(ing):
    """Reduce one raw ingredient line to its descriptive words.

    Steps, in order: drop parenthesised notes, drop numbers and numeric
    ranges/fractions, drop common measurement units, drop every character that
    is neither a letter nor whitespace, then collapse runs of whitespace.

    Args:
        ing: Raw ingredient text, e.g. ``"1 cup (240 ml) coconut milk"``.

    Returns:
        The cleaned text with single spaces between words (``"coconut milk"``
        for the example above). Returns ``""`` for non-string input.
    """
    if not isinstance(ing, str):
        return ""

    ing = re.sub(r"\([^)]*\)", "", ing)  # remove ( ... )
    ing = re.sub(r"\d+[\d\s\/\.\-]*", "", ing)  # remove numbers/fractions
    ing = re.sub(
        r"\b(?:tsp|tbsp|cups?|cup|g|kg|ml|l|oz|pounds?|pound|teaspoons?|tablespoons?|grams?|kilograms?|liters?|milliliters?|ounce|ounces)\b",
        "",
        ing,
        flags=re.IGNORECASE,
    )
    # Keep letters and whitespace only. str.isalpha() is Unicode-aware, so
    # accented names ("jalapeño") stay intact, while symbols, punctuation and
    # vulgar fractions ("½") are dropped.
    ing = "".join(ch for ch in ing if ch.isalpha() or ch.isspace())
    # Removed tokens leave gaps behind; collapsing them makes otherwise equal
    # ingredients compare equal when they are de-duplicated later.
    return " ".join(ing.split())


# --- Group ingredients by recipe title and join them into a single string ---
def clean_ingredients(ingredients_list):
    """Clean, de-duplicate and join a recipe's ingredient lines.

    Args:
        ingredients_list: Iterable of raw ingredient strings. A single bare
            string is treated as a one-item list and ``None`` as empty;
            entries that are not strings are skipped.

    Returns:
        The distinct, non-empty results of ``clean_ingredient`` sorted
        alphabetically and joined with ``", "`` (``""`` when nothing is left).
    """
    if ingredients_list is None:
        return ""
    if isinstance(ingredients_list, str):
        # Iterating a str would "clean" it one character at a time.
        ingredients_list = [ingredients_list]

    cleaned = {
        clean_ingredient(ing)
        for ing in ingredients_list
        if isinstance(ing, str) and ing.strip()
    }
    cleaned.discard("")  # lines that were nothing but quantities/units
    return ", ".join(sorted(cleaned))


# --- Build grouped dataset ---
# One row per recipe: the ingredient list is collapsed into a single cleaned
# string, while the instructions value is carried over unchanged.
processed_rows = [
    {
        "country": country,
        "recipe_title": recipe["title"],
        "ingredients": clean_ingredients(recipe["ingredients"]),
        "instructions": recipe["instructions"],
    }
    for country, recipes in all_recipes.items()
    for recipe in recipes
]

processed_df = pd.DataFrame(processed_rows)

# Save to CSV
processed_df.to_csv("asian_recipes_by_country_cleaned.csv", index=False)

print("✅ Done! Saved grouped + cleaned data to asian_recipes_by_country_cleaned.csv")


Scraping chinese...
Scraping indian...
Scraping indonesian...
Scraping japanese...
Scraping malaysian...
Scraping philipines...
Scraping thai...
Scraping vietnamese...
✅ Done! Saved grouped + cleaned data to asian_recipes_by_country_cleaned.csv


In [21]:
# Hand the cleaned CSV to the browser as a download via Colab's `files` helper.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — confirm (or guard the import) before running this cell elsewhere.
from google.colab import files
files.download("asian_recipes_by_country_cleaned.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>