In [1]:
import re, time, requests, pandas as pd
from difflib import SequenceMatcher
from collections import deque
from tqdm.autonotebook import tqdm


  from tqdm.autonotebook import tqdm


In [None]:
USER_AGENT = "mlife_diet_nutrition/1.0 (lee55@email.sc.edu)"

# IN_CSV  = "/Users/clee/Documents/Lab/mlife/data/tmp/test.csv"
# OUT_CSV = "/Users/clee/Documents/Lab/mlife/data/tmp/output_filled.csv"

IN_CSV = r"C:\Users\chris\Documents\lab\mlife\data\tmp\missing_diet_log_fitbit.csv"
OUT_CSV = r"C:\Users\chris\Documents\lab\mlife\data\tmp\missing_diet_log_fitbit_filled.csv"

SEARCH_MAX_PER_MIN  = 8

FIELDS = ",".join([
    "code","brands","brands_tags","product_name","serving_size","countries_tags",
    "nutriments.energy-kcal_serving","nutriments.energy-kcal_100g",
    "nutriments.carbohydrates_serving","nutriments.carbohydrates_100g",
    "nutriments.fat_serving","nutriments.fat_100g",
    "nutriments.fiber_serving","nutriments.fiber_100g",
    "nutriments.proteins_serving","nutriments.proteins_100g",
    "nutriments.sodium_serving","nutriments.sodium_100g"
])

# ---- Helpers ----
def _norm(s):
    if not s: return ""
    s = s.lower().replace("\u2013","-").replace("\u2014","-")
    s = re.sub(r"\s+"," ", s)
    s = re.sub(r"[^\w\s\-\&'()/]", "", s)
    return s.strip()

def _tokens(s):
    return {t for t in re.split(r"[^\w&'/()-]+", _norm(s)) if t and t not in {"the","and","with","of"}}

def _score(a,b): 
    return SequenceMatcher(a=_norm(a), b=_norm(b)).ratio()

def _name_overlap(qname, prod_name):
    qa, qb = _tokens(qname), _tokens(prod_name)
    if not qa or not qb: return 0.0
    inter = len(qa & qb)
    return inter / max(len(qa), len(qb))

def _brand_set(p):
    raw = (p.get("brands") or "")
    tags = p.get("brands_tags") or []
    out = { _norm(x) for x in raw.split(",") if x.strip() }
    out |= { _norm(str(x)) for x in tags }
    return {x for x in out if x}

# ---- Rate limiter + GET wrapper (applies to all search calls) ----
class RateLimiter:
    def __init__(self, max_per_min): 
        self.max = max_per_min
        self.window = 60.0
        self.events = deque()
    def acquire(self):
        now = time.monotonic()
        while self.events and now - self.events[0] > self.window:
            self.events.popleft()
        if len(self.events) >= self.max:
            time.sleep(self.window - (now - self.events[0]) + 0.02)
        self.events.append(time.monotonic())

rl_search = RateLimiter(SEARCH_MAX_PER_MIN)

def _get(url, params):
    rl_search.acquire()
    r = requests.get(url, headers={"User-Agent": USER_AGENT}, params=params, timeout=20)
    r.raise_for_status()
    return r

# ---- OFF search wrappers (NO product-by-code) ----
def off_v2_search(params, page_size=100):
    p = dict(params); p["page_size"] = page_size; p["fields"] = FIELDS
    return _get("https://world.openfoodfacts.net/api/v2/search", p).json().get("products", [])

def off_legacy_search(q, page_size=100, extra=None):
    p = {"search_terms": q, "search_simple": 1, "json": 1, "page_size": page_size}
    if extra: p.update(extra)
    return _get("https://world.openfoodfacts.org/cgi/search.pl", p).json().get("products", [])

# ---- Scoring + extraction ----
TARGET_KEYS = [
    ("energy-kcal_serving","energy-kcal_100g"),
    ("carbohydrates_serving","carbohydrates_100g"),
    ("fat_serving","fat_100g"),
    ("fiber_serving","fiber_100g"),
    ("proteins_serving","proteins_100g"),
    ("sodium_serving","sodium_100g"),
]

def nutrient_count(p):
    nu = p.get("nutriments") or {}
    return sum((nu.get(a) is not None) or (nu.get(b) is not None) for a,b in TARGET_KEYS)

def per_is_serving(p):
    nu = p.get("nutriments") or {}
    return any(nu.get(k) is not None for k,_ in TARGET_KEYS)

def country_us(p):
    tags = [str(x).lower() for x in (p.get("countries_tags") or [])]
    return any(x.endswith(":united-states") or x=="en:united-states" for x in tags)

def extract_row(p):
    nu = p.get("nutriments") or {}
    def pick(serv, g100):
        v, per = nu.get(serv), "serving"
        if v is None: v, per = nu.get(g100), "100g"
        return v, per
    cal,p1 = pick("energy-kcal_serving","energy-kcal_100g")
    carbs,p2 = pick("carbohydrates_serving","carbohydrates_100g")
    fat, p3 = pick("fat_serving","fat_100g")
    fiber,p4 = pick("fiber_serving","fiber_100g")
    protein,p5 = pick("proteins_serving","proteins_100g")
    sodium,p6 = pick("sodium_serving","sodium_100g")
    per = "serving" if [p1,p2,p3,p4,p5,p6].count("serving") >= 3 else "100g"
    return {
        "serving_size_fill": p.get("serving_size"),
        "per_fill": per,
        "Calories_fill": cal,
        "Carbs_fill": carbs,
        "Fat_fill": fat,
        "Fiber_fill": fiber,
        "Protein_fill": protein,
        "Sodium_fill": sodium,
        "OFF_code": p.get("code"),
        "OFF_name": p.get("product_name"),
        "OFF_brands": p.get("brands")
    }

# ---- Main: best match from brand+name (tries order variants) ----
def best_from_off_brand_name(brand, name, top_n=12,
                             min_score_if_brand_ok=0.70,
                             min_score_if_brand_miss=0.92,
                             min_overlap_if_brand_miss=0.60):
    brand = brand or ""
    name  = name or ""
    q1 = f"{brand} {name}".strip()
    q2 = f"{name} {brand}".strip() if brand and name else q1
    q3 = name.strip()

    prods = []
    prods += off_v2_search({"product_name": q1}, 200)
    if brand: prods += off_v2_search({"product_name": q1, "brands": brand}, 200)

    if q2 != q1:
        prods += off_v2_search({"product_name": q2}, 200)
        if brand: prods += off_v2_search({"product_name": q2, "brands": brand}, 200)

    if q3:
        prods += off_v2_search({"product_name": q3}, 200)
        if brand: prods += off_v2_search({"product_name": q3, "brands": brand}, 200)

    # legacy once with the broadest query
    prods += off_legacy_search(q3 or q1, 200)
    if brand:
        prods += off_legacy_search(q3 or q1, 200, extra={"tagtype_0":"brands","tag_contains_0":"contains","tag_0":brand})

    bnorm = _norm(brand)
    seen, rows = set(), []
    for p in prods:
        code = p.get("code")
        if not code or code in seen: 
            continue
        seen.add(code)
        pname = p.get("product_name") or ""
        brand_ok = (bnorm in _brand_set(p)) if bnorm else True
        s_text   = _score(q1, pname)          # fuzzy ratio on full query
        s_tokens = _name_overlap(name, pname) # token overlap vs NAME only
        rows.append({
            "brand_ok": brand_ok,
            "score_text": s_text,
            "score_tokens": s_tokens,
            "nutrient_count": nutrient_count(p),
            "per_serving": per_is_serving(p),
            "country_us": country_us(p),
            "product": p
        })

    if not rows:
        return None

    dfc = pd.DataFrame(rows).sort_values(
        ["brand_ok","score_text","score_tokens","nutrient_count","per_serving","country_us"],
        ascending=[False, False, False, False, False, False]
    ).head(top_n).reset_index(drop=True)

    top = dfc.loc[0]
    if top["brand_ok"]:
        if top["score_text"] < min_score_if_brand_ok:
            return None
    else:
        if not (top["score_text"] >= min_score_if_brand_miss and top["score_tokens"] >= min_overlap_if_brand_miss):
            return None

    return extract_row(top["product"])

# ---- Run over CSV (fill only '-' cells) ----
df = pd.read_csv(IN_CSV)
for c in ["Calories","Carbs","Fat","Fiber","Protein","Sodium"]:
    if c not in df.columns: df[c] = "-"

need_mask = (df[["Calories","Carbs","Fat","Fiber","Protein","Sodium"]]=="-").any(axis=1)
cache = {}
for i in tqdm(df[need_mask].index, desc="OFF lookups"):
    brand = str(df.at[i, "Brand"] or "").strip()
    name  = str(df.at[i, "Name"]  or "").strip()
    key = (brand, name)

    if key in cache:
        info = cache[key]
    else:
        info = best_from_off_brand_name(brand, name)
        cache[key] = info

    if not info:
        continue

    if df.at[i,"Calories"] == "-": df.at[i,"Calories"] = info["Calories_fill"]
    if df.at[i,"Carbs"]    == "-": df.at[i,"Carbs"]    = info["Carbs_fill"]
    if df.at[i,"Fat"]      == "-": df.at[i,"Fat"]      = info["Fat_fill"]
    if df.at[i,"Fiber"]    == "-": df.at[i,"Fiber"]    = info["Fiber_fill"]
    if df.at[i,"Protein"]  == "-": df.at[i,"Protein"]  = info["Protein_fill"]
    if df.at[i,"Sodium"]   == "-": df.at[i,"Sodium"]   = info["Sodium_fill"]

    # audit columns
    df.at[i,"serving_size_fill"] = info.get("serving_size_fill")
    df.at[i,"per_fill"]          = info.get("per_fill")
    df.at[i,"OFF_code"]          = info.get("OFF_code")
    df.at[i,"OFF_name"]          = info.get("OFF_name")
    df.at[i,"OFF_brands"]        = info.get("OFF_brands")

df.to_csv(OUT_CSV, index=False)
pd.DataFrame([{
    "input": IN_CSV, "output": OUT_CSV,
    "rows_with_missing": int(need_mask.sum()),
    "unique_keys": len(cache),
    "filled_rows": int(((df[["Calories","Carbs","Fat","Fiber","Protein","Sodium"]]!="-").all(axis=1) & need_mask).sum())
}])

OFF lookups: 100%|██████████| 99/99 [1:13:22<00:00, 44.47s/it]


Unnamed: 0,input,output,rows_with_missing,unique_keys,filled_rows
0,C:\Users\chris\Documents\lab\mlife\data\tmp\te...,C:\Users\chris\Documents\lab\mlife\data\tmp\ou...,99,74,31


In [31]:
USER_AGENT = "mlife_diet_nutrition/1.0 (lee55@email.sc.edu)"
def _norm(s):
    if not s: return ""
    s = s.lower().replace("\u2013","-").replace("\u2014","-")
    s = re.sub(r"\s+"," ", s)
    s = re.sub(r"[^\w\s\-\&'()/]", "", s)
    return s.strip()

def _score(a, b):
    return round(SequenceMatcher(a=_norm(a), b=_norm(b)).ratio(), 4)

def off_v2_search(params, page_size=200):
    base = "https://world.openfoodfacts.net/api/v2/search"
    p = dict(params)
    p["page_size"] = page_size
    r = requests.get(base, headers={"User-Agent": USER_AGENT}, params=p, timeout=20)
    r.raise_for_status()
    return r.json().get("products", [])

def off_legacy_search(q, page_size=200, extra=None):
    base = "https://world.openfoodfacts.org/cgi/search.pl"
    p = {"search_terms": q, "search_simple": 1, "json": 1, "page_size": page_size}
    if extra: p.update(extra)
    r = requests.get(base, headers={"User-Agent": USER_AGENT}, params=p, timeout=20)
    r.raise_for_status()
    return r.json().get("products", [])

def debug_candidates(query, brand_hint=None, country_hint=None):
    rows = []
    seen = set()

    # v2: product_name + optional brands + optional countries filter
    v2_params = {"product_name": query}
    if brand_hint: v2_params["brands"] = brand_hint
    if country_hint: v2_params["countries"] = country_hint
    prods = off_v2_search(v2_params, 200)

    # legacy: free text
    prods += off_legacy_search(query, 200)

    # legacy: structured brand filter (increases brand precision)
    if brand_hint:
        prods += off_legacy_search(
            query,
            200,
            extra={
                "tagtype_0": "brands",
                "tag_contains_0": "contains",
                "tag_0": brand_hint
            },
        )

    for p in prods:
        code = p.get("code")
        if not code or code in seen:
            continue
        seen.add(code)
        pn = p.get("product_name") or ""
        pb = p.get("brands") or ""
        rows.append({
            "score_vs_query": _score(query, pn),
            "product_name": pn,
            "brands": pb,
            "code": code,
            "serving_size": p.get("serving_size"),
            "countries": p.get("countries") or p.get("countries_tags")
        })
    df = pd.DataFrame(rows).sort_values(["score_vs_query"], ascending=False).reset_index(drop=True)
    return df

# Try the exact phrase first (no brand filter)
q = "Cinnamon French Toast Sticks Morning Delight"
df_try1 = debug_candidates(query=q)

# Try with brand filter + US bias
df_try2 = debug_candidates(query=q, brand_hint="Morning Delight", country_hint="United States")

# Show top 25 from each
print("=== Top matches (free text only) ===")
display(df_try1.head(25))
print("=== Top matches (brand filter + US) ===")
display(df_try2.head(25))

=== Top matches (free text only) ===


Unnamed: 0,score_vs_query,product_name,brands,code,serving_size,countries
0,0.3611,Mayonnaise recette originale,Star,6111184004129,,Maroc
1,0.3288,100G EXCELLENCE NOIR 85% LINDT,Lindt,3046920022606,100g,"Austria,Belgium,France,Germany,Italy,Morocco,N..."
2,0.3158,Excellence 70% Cocoa Intense Dark,Lindt,3046920028004,,"Austrália,Bélgica,Canadá,República Checa,Dinam..."
3,0.3103,Aïn Atlas 50cl,Aïn Atlas,6111035001673,50cl,Maroc
4,0.3103,Original Taste,Coca-Cola,5449000054227,250 ml,"Argélia,Áustria,Bélgica,Bulgária,Camarões,Croá..."
5,0.3099,100G EXCELLENCE NR 90% LINDT,Lindt,3046920029759,10 g,"Argelia,Austria,Bélgica,Bulgaria,República Che..."
6,0.3077,Fourrés Chocolat Noir,Bjorg,3229820100234,100g,"France,Luxembourg,Switzerland"
7,0.3077,sauce tomate cuisinée,solis,8445290615350,50g,Morocco
8,0.3077,Marmite Yeast Extract,"Marmite,Unilever",50184453,8 g,"Francia,Alemania,Irlanda,Italia,Países Bajos,E..."
9,0.3056,Geröstete Mandel Ohne Zucker,Alpro,5411188112709,100 ml,"Albânia,Áustria,Bélgica,Croácia,Finlândia,Fran..."


=== Top matches (brand filter + US) ===


Unnamed: 0,score_vs_query,product_name,brands,code,serving_size,countries
0,0.3611,Mayonnaise recette originale,Star,6111184004129,,Maroc
1,0.3288,100G EXCELLENCE NOIR 85% LINDT,Lindt,3046920022606,100g,"Austria,Belgium,France,Germany,Italy,Morocco,N..."
2,0.3158,Excellence 70% Cocoa Intense Dark,Lindt,3046920028004,,"Austrália,Bélgica,Canadá,República Checa,Dinam..."
3,0.3103,Aïn Atlas 50cl,Aïn Atlas,6111035001673,50cl,Maroc
4,0.3103,Original Taste,Coca-Cola,5449000054227,250 ml,"Argélia,Áustria,Bélgica,Bulgária,Camarões,Croá..."
5,0.3099,100G EXCELLENCE NR 90% LINDT,Lindt,3046920029759,10 g,"Argelia,Austria,Bélgica,Bulgaria,República Che..."
6,0.3077,Fourrés Chocolat Noir,Bjorg,3229820100234,100g,"France,Luxembourg,Switzerland"
7,0.3077,sauce tomate cuisinée,solis,8445290615350,50g,Morocco
8,0.3077,Marmite Yeast Extract,"Marmite,Unilever",50184453,8 g,"Francia,Alemania,Irlanda,Italia,Países Bajos,E..."
9,0.3056,Geröstete Mandel Ohne Zucker,Alpro,5411188112709,100 ml,"Albânia,Áustria,Bélgica,Croácia,Finlândia,Fran..."


In [22]:
# QUERY = "Outback Steakhouse Gold Coast Coconut Shrimp"
# BRAND_HINT = "Outback Steakhouse"

QUERY = "Market Pantry Apple & Cinnamon Instant Oatmeal"
BRAND_HINT = "Market Pantry"

def _norm(s):
    if not s: return ""
    s = s.lower().replace("\u2013","-").replace("\u2014","-")
    s = re.sub(r"\s+"," ", s)
    s = re.sub(r"[^\w\s\-\&'()/]", "", s)
    return s.strip()

def _score(a, b): return round(SequenceMatcher(a=_norm(a), b=_norm(b)).ratio(), 4)

def off_v2_search(params, page_size=200):
    p = dict(params); p["page_size"] = page_size
    r = requests.get("https://world.openfoodfacts.net/api/v2/search", headers={"User-Agent": USER_AGENT}, params=p, timeout=20)
    r.raise_for_status(); return r.json().get("products", [])

def off_legacy_search(q, page_size=200, extra=None):
    p = {"search_terms": q, "search_simple": 1, "json": 1, "page_size": page_size}
    if extra: p.update(extra)
    r = requests.get("https://world.openfoodfacts.org/cgi/search.pl", headers={"User-Agent": USER_AGENT}, params=p, timeout=20)
    r.raise_for_status(); return r.json().get("products", [])

def fetch_off_product(code):
    r = requests.get(f"https://world.openfoodfacts.net/api/v2/product/{code}", headers={"User-Agent": USER_AGENT}, timeout=20)
    r.raise_for_status(); return r.json().get("product", {}) or {}

TARGET_KEYS = [
    ("energy-kcal_serving","energy-kcal_100g"),
    ("carbohydrates_serving","carbohydrates_100g"),
    ("fat_serving","fat_100g"),
    ("fiber_serving","fiber_100g"),
    ("proteins_serving","proteins_100g"),
    ("sodium_serving","sodium_100g"),
]

def nutrient_count(prod):
    nu = prod.get("nutriments") or {}
    return sum((nu.get(k1) is not None) or (nu.get(k2) is not None) for k1,k2 in TARGET_KEYS)

def per_is_serving(prod):
    nu = prod.get("nutriments") or {}
    return any(nu.get(k1) is not None for k1,_ in TARGET_KEYS)

def country_us(prod):
    c = prod.get("countries_tags") or []
    c = [str(x).lower() for x in c]
    return any(x.endswith(":united-states") or x=="en:united-states" for x in c)

def extract_six(prod):
    nu = prod.get("nutriments") or {}
    def get_pair(k1,k2):
        v = nu.get(k1); per = "serving"
        if v is None: v = nu.get(k2); per = "100g"
        return v, per
    cal,p1 = get_pair("energy-kcal_serving","energy-kcal_100g")
    carbs,p2 = get_pair("carbohydrates_serving","carbohydrates_100g")
    fat,p3 = get_pair("fat_serving","fat_100g")
    fiber,p4 = get_pair("fiber_serving","fiber_100g")
    protein,p5 = get_pair("proteins_serving","proteins_100g")
    sodium,p6 = get_pair("sodium_serving","sodium_100g")
    per = "serving" if [p1,p2,p3,p4,p5,p6].count("serving") >= 3 else "100g"
    return pd.DataFrame([{
        "brand": prod.get("brands"),
        "item": prod.get("product_name"),
        "serving_size": prod.get("serving_size"),
        "per": per,
        "Calories": cal,
        "Carbs": carbs,
        "Fat": fat,
        "Fiber": fiber,
        "Protein": protein,
        "Sodium": sodium,
        "code": prod.get("code")
    }])

def one_cell_query(query, brand_hint=None, top_n=10):
    rows, seen = [], set()
    prods = off_v2_search({"product_name": query}, 200)
    if brand_hint: prods += off_v2_search({"product_name": query, "brands": brand_hint}, 200)
    prods += off_legacy_search(query, 200)
    if brand_hint:
        prods += off_legacy_search(query, 200, extra={"tagtype_0":"brands","tag_contains_0":"contains","tag_0":brand_hint})
    for p in prods:
        code = p.get("code"); 
        if not code or code in seen: continue
        seen.add(code)
        rows.append({
            "score_vs_query": _score(query, p.get("product_name") or ""),
            "code": code,
            "product_name": p.get("product_name") or "",
            "brands": p.get("brands") or ""
        })
    if not rows: return pd.DataFrame()
    dfc = pd.DataFrame(rows).sort_values("score_vs_query", ascending=False).head(top_n).reset_index(drop=True)
    details = []
    for _,r in dfc.iterrows():
        prod = fetch_off_product(r.code)
        details.append({
            "product": prod,
            "score_vs_query": r.score_vs_query,
            "nutrient_count": nutrient_count(prod),
            "per_serving": per_is_serving(prod),
            "country_us": country_us(prod)
        })
    dd = pd.DataFrame(details).sort_values(["score_vs_query","nutrient_count","per_serving","country_us"], ascending=[False,False,False,False]).reset_index(drop=True)
    return extract_six(dd.loc[0,"product"])

result_df = one_cell_query(QUERY, BRAND_HINT, top_n=10)
result_df

Unnamed: 0,brand,item,serving_size,per,Calories,Carbs,Fat,Fiber,Protein,Sodium,code
0,Market Pantry,Apples & Cinnamon Instant Oatmeal,1 PACKET (35 g),serving,130,27,1.5,3.01,3,0.16,85239981771


In [27]:
QUERY = "Morning delight Cinnamon French Toast Sticks "
BRAND_HINT = "Morning Delight"

FIELDS = ",".join([
    "code","brands","brands_tags","product_name","serving_size","countries_tags",
    "nutriments.energy-kcal_serving","nutriments.energy-kcal_100g",
    "nutriments.carbohydrates_serving","nutriments.carbohydrates_100g",
    "nutriments.fat_serving","nutriments.fat_100g",
    "nutriments.fiber_serving","nutriments.fiber_100g",
    "nutriments.proteins_serving","nutriments.proteins_100g",
    "nutriments.sodium_serving","nutriments.sodium_100g"
])

def _norm(s):
    if not s: return ""
    s = s.lower().replace("\u2013","-").replace("\u2014","-")
    s = re.sub(r"\s+"," ", s)
    s = re.sub(r"[^\w\s\-\&'()/]", "", s)
    return s.strip()

def _score(a,b): return round(SequenceMatcher(a=_norm(a), b=_norm(b)).ratio(), 4)

def off_v2_search(params, page_size=200):
    p = dict(params); p["page_size"] = page_size; p["fields"] = FIELDS
    r = requests.get("https://world.openfoodfacts.net/api/v2/search",
                     headers={"User-Agent": USER_AGENT}, params=p, timeout=20)
    r.raise_for_status(); return r.json().get("products", [])

def off_legacy_search(q, page_size=200, extra=None):
    p = {"search_terms": q, "search_simple": 1, "json": 1, "page_size": page_size}
    if extra: p.update(extra)
    r = requests.get("https://world.openfoodfacts.org/cgi/search.pl",
                     headers={"User-Agent": USER_AGENT}, params=p, timeout=20)
    r.raise_for_status(); return r.json().get("products", [])

def nutrient_count(p):
    nu = p.get("nutriments") or {}
    keys = [
        ("energy-kcal_serving","energy-kcal_100g"),
        ("carbohydrates_serving","carbohydrates_100g"),
        ("fat_serving","fat_100g"),
        ("fiber_serving","fiber_100g"),
        ("proteins_serving","proteins_100g"),
        ("sodium_serving","sodium_100g"),
    ]
    return sum((nu.get(a) is not None) or (nu.get(b) is not None) for a,b in keys)

def per_is_serving(p):
    nu = p.get("nutriments") or {}
    return any(nu.get(k) is not None for k in [
        "energy-kcal_serving","carbohydrates_serving","fat_serving",
        "fiber_serving","proteins_serving","sodium_serving"
    ])

def country_us(p):
    tags = [str(x).lower() for x in (p.get("countries_tags") or [])]
    return any(x.endswith(":united-states") or x=="en:united-states" for x in tags)

def extract_row(p):
    nu = p.get("nutriments") or {}
    def pick(serv, g100):
        v, per = nu.get(serv), "serving"
        if v is None: v, per = nu.get(g100), "100g"
        return v, per
    cal,p1 = pick("energy-kcal_serving","energy-kcal_100g")
    carbs,p2 = pick("carbohydrates_serving","carbohydrates_100g")
    fat, p3 = pick("fat_serving","fat_100g")
    fiber,p4 = pick("fiber_serving","fiber_100g")
    protein,p5 = pick("proteins_serving","proteins_100g")
    sodium,p6 = pick("sodium_serving","sodium_100g")
    per = "serving" if [p1,p2,p3,p4,p5,p6].count("serving") >= 3 else "100g"
    return {
        "brand": p.get("brands"),
        "item": p.get("product_name"),
        "serving_size": p.get("serving_size"),
        "per": per,
        "Calories": cal, "Carbs": carbs, "Fat": fat, "Fiber": fiber, "Protein": protein, "Sodium": sodium,
        "code": p.get("code")
    }

# --- Search by name/brand, rank, and extract directly from search payload ---
prods = off_v2_search({"product_name": QUERY}, 200)
if BRAND_HINT: prods += off_v2_search({"product_name": QUERY, "brands": BRAND_HINT}, 200)
prods += off_legacy_search(QUERY, 200)
if BRAND_HINT:
    prods += off_legacy_search(QUERY, 200, extra={"tagtype_0":"brands","tag_contains_0":"contains","tag_0":BRAND_HINT})

# dedupe by code and score/rank
seen, rows = set(), []
for p in prods:
    code = p.get("code")
    if not code or code in seen: 
        continue
    seen.add(code)
    rows.append({
        "score_vs_query": _score(QUERY, p.get("product_name") or ""),
        "nutrient_count": nutrient_count(p),
        "per_serving": per_is_serving(p),
        "country_us": country_us(p),
        "product": p
    })

if rows:
    dd = pd.DataFrame(rows).sort_values(
        ["score_vs_query","nutrient_count","per_serving","country_us"],
        ascending=[False, False, False, False]
    ).reset_index(drop=True)
    best = dd.loc[0,"product"]
    result_df = pd.DataFrame([extract_row(best)])
else:
    result_df = pd.DataFrame([{"error":"no candidates with nutriments from search"}])

result_df

Unnamed: 0,brand,item,serving_size,per,Calories,Carbs,Fat,Fiber,Protein,Sodium,code
0,Jason’s,Sourdough White Ciabattin Bread,1 portion (48 g),serving,110,21.2,0.384,1.39,4.85,0.174,5025125000006


In [35]:
BRAND = "Morning Delight"
NAME  = "Cinnamon French Toast Sticks"

FIELDS = ",".join([
    "code","brands","brands_tags","product_name","serving_size","countries_tags",
    "nutriments.energy-kcal_serving","nutriments.energy-kcal_100g",
    "nutriments.carbohydrates_serving","nutriments.carbohydrates_100g",
    "nutriments.fat_serving","nutriments.fat_100g",
    "nutriments.fiber_serving","nutriments.fiber_100g",
    "nutriments.proteins_serving","nutriments.proteins_100g",
    "nutriments.sodium_serving","nutriments.sodium_100g"
])

def _norm(s):
    if not s: return ""
    s = s.lower().replace("\u2013","-").replace("\u2014","-")
    s = re.sub(r"\s+"," ", s)
    s = re.sub(r"[^\w\s\-\&'()/]", "", s)
    return s.strip()

def _tokens(s):
    return {t for t in re.split(r"[^\w&'/()-]+", _norm(s)) if t and t not in {"the","and","with","of"}}

def _score(a,b): 
    return SequenceMatcher(a=_norm(a), b=_norm(b)).ratio()

def _name_overlap(qname, prod_name):
    qa, qb = _tokens(qname), _tokens(prod_name)
    if not qa or not qb: return 0.0
    inter = len(qa & qb)
    return inter / max(len(qa), len(qb))

def _brand_set(p):
    raw = (p.get("brands") or "")
    tags = p.get("brands_tags") or []
    out = { _norm(x) for x in raw.split(",") if x.strip() }
    out |= { _norm(str(x)) for x in tags }
    return {x for x in out if x}

def off_v2_search(params, page_size=200):
    p = dict(params); p["page_size"] = page_size; p["fields"] = FIELDS
    r = requests.get("https://world.openfoodfacts.net/api/v2/search",
                     headers={"User-Agent": USER_AGENT}, params=p, timeout=20)
    r.raise_for_status(); return r.json().get("products", [])

def off_legacy_search(q, page_size=200, extra=None):
    p = {"search_terms": q, "search_simple": 1, "json": 1, "page_size": page_size}
    if extra: p.update(extra)
    r = requests.get("https://world.openfoodfacts.org/cgi/search.pl",
                     headers={"User-Agent": USER_AGENT}, params=p, timeout=20)
    r.raise_for_status(); return r.json().get("products", [])

def nutrient_count(p):
    nu = p.get("nutriments") or {}
    keys = [
        ("energy-kcal_serving","energy-kcal_100g"),
        ("carbohydrates_serving","carbohydrates_100g"),
        ("fat_serving","fat_100g"),
        ("fiber_serving","fiber_100g"),
        ("proteins_serving","proteins_100g"),
        ("sodium_serving","sodium_100g"),
    ]
    return sum((nu.get(a) is not None) or (nu.get(b) is not None) for a,b in keys)

def per_is_serving(p):
    nu = p.get("nutriments") or {}
    return any(nu.get(k) is not None for k in [
        "energy-kcal_serving","carbohydrates_serving","fat_serving",
        "fiber_serving","proteins_serving","sodium_serving"
    ])

def country_us(p):
    tags = [str(x).lower() for x in (p.get("countries_tags") or [])]
    return any(x.endswith(":united-states") or x=="en:united-states" for x in tags)

def extract_row(p):
    nu = p.get("nutriments") or {}
    def pick(serv, g100):
        v, per = nu.get(serv), "serving"
        if v is None: v, per = nu.get(g100), "100g"
        return v, per
    cal,p1 = pick("energy-kcal_serving","energy-kcal_100g")
    carbs,p2 = pick("carbohydrates_serving","carbohydrates_100g")
    fat, p3 = pick("fat_serving","fat_100g")
    fiber,p4 = pick("fiber_serving","fiber_100g")
    protein,p5 = pick("proteins_serving","proteins_100g")
    sodium,p6 = pick("sodium_serving","sodium_100g")
    per = "serving" if [p1,p2,p3,p4,p5,p6].count("serving") >= 3 else "100g"
    return {
        "brand": p.get("brands"),
        "item": p.get("product_name"),
        "serving_size": p.get("serving_size"),
        "per": per,
        "Calories": cal, "Carbs": carbs, "Fat": fat, "Fiber": fiber, "Protein": protein, "Sodium": sodium,
        "code": p.get("code")
    }

def debug_and_pick(brand, name,
                   min_brand_ok_text=0.55, min_brand_ok_tokens=0.60,
                   min_miss_text=0.92,    min_miss_tokens=0.60,
                   top_n=15):
    q1 = f"{brand} {name}".strip()
    q2 = f"{name} {brand}".strip() if brand and name else q1
    q3 = name.strip()

    prods = []
    prods += off_v2_search({"product_name": q1}, 200)
    if brand: prods += off_v2_search({"product_name": q1, "brands": brand}, 200)

    if q2 != q1:
        prods += off_v2_search({"product_name": q2}, 200)
        if brand: prods += off_v2_search({"product_name": q2, "brands": brand}, 200)

    if q3:
        prods += off_v2_search({"product_name": q3}, 200)
        if brand: prods += off_v2_search({"product_name": q3, "brands": brand}, 200)

    prods += off_legacy_search(q3 or q1, 200)
    if brand:
        prods += off_legacy_search(q3 or q1, 200, extra={"tagtype_0":"brands","tag_contains_0":"contains","tag_0":brand})

    bnorm = _norm(brand)
    seen, rows = set(), []
    for p in prods:
        code = p.get("code")
        if not code or code in seen:
            continue
        seen.add(code)
        pname = p.get("product_name") or ""
        brand_ok = (bnorm in _brand_set(p)) if bnorm else True
        s_text   = _score(q1, pname)
        s_tokens = _name_overlap(name, pname)
        rows.append({
            "brand_ok": brand_ok,
            "score_text": round(s_text, 4),
            "score_tokens": round(s_tokens, 4),
            "nutrient_count": nutrient_count(p),
            "per_serving": per_is_serving(p),
            "country_us": country_us(p),
            "product_name": pname,
            "brands": p.get("brands"),
            "code": code,
            "product": p
        })

    if not rows:
        return pd.DataFrame(), None, "no candidates"

    dfc = pd.DataFrame(rows).sort_values(
        ["brand_ok","score_text","score_tokens","nutrient_count","per_serving","country_us"],
        ascending=[False, False, False, False, False, False]
    ).reset_index(drop=True)

    # --- No-brand re-rank (drop no-nutrient items, prefer strongest name) ---
    any_brand_ok = bool((dfc["brand_ok"] == True).any())
    if not any_brand_ok:
        df_nb = dfc[dfc["nutrient_count"] >= 3].copy()
        if df_nb.empty:
            df_nb = dfc[dfc["nutrient_count"] >= 1].copy()
        if not df_nb.empty:
            df_nb = df_nb.sort_values(
                ["score_tokens","score_text","nutrient_count","country_us","per_serving"],
                ascending=[False, False, False, False, False]
            ).reset_index(drop=True)
            dfc = df_nb

    # --- Option A acceptance ---
    top = dfc.iloc[0]
    reason = ""
    if any_brand_ok:
        if top["brand_ok"]:
            if not (top["score_text"] >= min_brand_ok_text or top["score_tokens"] >= min_brand_ok_tokens):
                reason = f"rejected: brand_ok but low scores (text {top['score_text']}, tokens {top['score_tokens']})"
                return dfc.head(top_n), None, reason
        else:
            if not (top["score_text"] >= min_miss_text and top["score_tokens"] >= min_miss_tokens):
                reason = f"rejected: brand miss and scores not strong enough (text {top['score_text']}, tokens {top['score_tokens']})"
                return dfc.head(top_n), None, reason
    else:
        # No brand matches at all -> allow strong name-only with nutrients
        fallback_tokens = 0.90
        fallback_text = 0.75
        fallback_min_nutrients = 3
        if not (top["score_tokens"] >= fallback_tokens and
                top["score_text"]  >= fallback_text  and
                top["nutrient_count"] >= fallback_min_nutrients):
            reason = (
                f"rejected: no brand matches; name-only not strong enough or too few nutrients "
                f"(text {top['score_text']}, tokens {top['score_tokens']}, nutrients {top['nutrient_count']})"
            )
            return dfc.head(top_n), None, reason

    # ✅ the missing line: return the accepted pick
    return dfc.head(top_n), extract_row(top["product"]), "accepted"

# ---- Run just this item ----
cands, picked, status = debug_and_pick(BRAND, NAME)
print("STATUS:", status)
display(cands)
if picked:
    print("\nPICKED RESULT:")
    display(pd.DataFrame([picked]))

STATUS: accepted


Unnamed: 0,brand_ok,score_text,score_tokens,nutrient_count,per_serving,country_us,product_name,brands,code,product
0,False,0.7778,1.0,6,True,True,Cinnamon French Toast Sticks,Great Value,78742020938,"{'_id': '0078742020938', '_keywords': ['and', ..."
1,False,0.7778,1.0,6,True,True,Cinnamon French Toast Sticks,,36800181632,"{'_id': '0036800181632', '_keywords': ['cinnam..."
2,False,0.7778,1.0,6,True,True,Cinnamon french toast sticks,,75450151220,"{'_id': '0075450151220', '_keywords': ['stick'..."
3,False,0.7778,1.0,6,True,True,Cinnamon french toast sticks,,41415201545,"{'_id': '0041415201545', '_keywords': ['cinnam..."
4,False,0.7778,1.0,6,True,True,Cinnamon French Toast Sticks,Members Mark,193968351854,"{'_id': '0193968351854', '_keywords': ['cinnam..."
5,False,0.7778,1.0,6,True,True,Cinnamon French Toast Sticks,Rich Products,41322241085,"{'_id': '0041322241085', '_keywords': ['cinnam..."
6,False,0.7778,1.0,6,True,True,Cinnamon French Toast Sticks,Market Pantry,85239571620,"{'_id': '0085239571620', '_keywords': ['cinnam..."
7,False,0.7778,1.0,6,False,True,Cinnamon french toast sticks,,75450242676,"{'_id': '0075450242676', '_keywords': ['french..."
8,False,0.7778,1.0,6,False,True,Cinnamon French Toast Sticks,,810038680330,"{'_id': '0810038680330', '_keywords': ['artifi..."
9,False,0.7778,1.0,6,False,True,Cinnamon French Toast Sticks,,51933368866,"{'_id': '0051933368866', '_keywords': ['cinnam..."



PICKED RESULT:


Unnamed: 0,brand,item,serving_size,per,Calories,Carbs,Fat,Fiber,Protein,Sodium,code
0,Great Value,Cinnamon French Toast Sticks,5 sticks (110 g),serving,330,44,15,2,5,0.34,78742020938
