In [1]:
import pandas as pd

# Read the branded-food table, restricted to the product id and the
# free-text ingredient statement (the only columns used in this notebook).
brandfood = pd.read_csv(
    "../DATA/branded_food.csv",
    usecols=["fdc_id", "ingredients"],
)

# Quick sanity check of the first rows
brandfood.head()


Unnamed: 0,fdc_id,ingredients
0,1105904,Vegetable Oil
1,1105905,"INGREDIENTS: BEEF STOCK, CONTAINS LESS THAN 2%..."
2,1105906,"INGREDIENTS: CLAM STOCK, POTATOES, CLAMS, CREA..."
3,1105907,"INGREDIENTS: WATER, CREAM, BROCCOLI, CELERY, V..."
4,1105908,"INGREDIENTS: CHICKEN STOCK, CONTAINS LESS THAN..."


In [3]:
# Load the id -> description lookup; only these two columns are needed.
food = pd.read_csv("../DATA/food.csv", usecols=["fdc_id", "description"])

# Cast the join key to the same nullable integer dtype on both frames so the
# merge compares like with like.
brandfood["fdc_id"] = brandfood["fdc_id"].astype("Int64")
food["fdc_id"] = food["fdc_id"].astype("Int64")

food = food.rename(columns={"description": "food_name"})

# Make the cell safe to re-run: if a previous execution already attached
# food_name, merging again would create food_name_x / food_name_y and break
# later references to brandfood["food_name"]. Drop any stale copy first.
brandfood = (
    brandfood
    .drop(columns=["food_name"], errors="ignore")
    .merge(food, on="fdc_id", how="left")   # left join keeps every branded row
)

brandfood.head()

Unnamed: 0,fdc_id,ingredients,food_name
0,1105904,Vegetable Oil,WESSON Vegetable Oil 1 GAL
1,1105905,"INGREDIENTS: BEEF STOCK, CONTAINS LESS THAN 2%...",SWANSON BROTH BEEF
2,1105906,"INGREDIENTS: CLAM STOCK, POTATOES, CLAMS, CREA...",CAMPBELL'S SLOW KETTLE SOUP CLAM CHOWDER
3,1105907,"INGREDIENTS: WATER, CREAM, BROCCOLI, CELERY, V...",CAMPBELL'S SLOW KETTLE SOUP CHEESE BROCCOLI
4,1105908,"INGREDIENTS: CHICKEN STOCK, CONTAINS LESS THAN...",SWANSON BROTH CHICKEN


In [5]:
import re

def clean_ingredients_text(s: str) -> str:
    """Normalize a raw ingredient statement into a comma-separated string.

    Steps, in order: lowercase; drop a leading "ingredient(s):" label; remove
    threshold phrases such as "contains less than 2% of:" while keeping the
    ingredients that follow; remove parenthetical breakdowns (nested ones
    included); turn "and/or" and ";" into commas; collapse whitespace and
    repeated commas; trim leading/trailing spaces and commas.

    Args:
        s: Raw ingredient text. Any non-string value (e.g. None or NaN) is
            treated as missing.

    Returns:
        The cleaned text, or "" when ``s`` is not a string.
    """
    if not isinstance(s, str):
        return ""
    s = s.lower()
    s = re.sub(r"^\s*ingredients?\s*:\s*", "", s)              # strip prefix

    # A percentage may be fractional ("0.5%") or written with a space before
    # the sign ("2 %"). Matching only integer digits would either miss the
    # phrase or match from the fractional part and leave "0." / "1." behind.
    pct = r"\d+(?:\.\d+)?\s*%"

    # Remove ONLY the threshold phrase (plus an optional "of" and ":"),
    # keeping the ingredients that follow it.
    s = re.sub(
        rf"\b(?:contains\s+less\s+than\s*{pct}|"
        rf"contains\s*{pct}\s*or\s*less|"
        rf"{pct}\s*or\s*less|"
        rf"less\s+than\s*{pct})\s*(?:of)?\s*:?\s*",
        "", s
    )

    # Drop parenthetical breakdowns (e.g. "vegetable oil (corn, canola)" ->
    # "vegetable oil"). Each pass removes the innermost groups, so repeat
    # until a pass removes nothing to handle nesting.
    removed = 1
    while removed:
        s, removed = re.subn(r"\([^()]*\)", "", s)

    # Normalize separators
    s = re.sub(r"\band\/or\b", ",", s)
    s = s.replace(";", ",")

    # Tidy spaces/commas
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"\s*,\s*", ", ", s)
    s = re.sub(r"(,\s*){2,}", ", ", s)
    s = s.strip(" ,")

    return s

In [None]:
# 0) Keep a raw copy for traceability.
#    Missing values are replaced with "" before the string cast: a bare
#    .astype(str) can turn NaN into the literal text "nan", which would pass
#    the cleaner's isinstance(str) guard and be emitted as an ingredient.
brandfood["ingredients_raw"] = brandfood["ingredients"].fillna("").astype(str)

# 1) Apply the cleaner (clean_ingredients_text, defined in an earlier cell)
brandfood["ingredients_clean"] = brandfood["ingredients_raw"].apply(clean_ingredients_text)

# 2) Tokenize to a list (optional but useful later)
import re, json

def _norm(t: str) -> str:
    """Collapse whitespace runs to one space, then trim spaces, dots and hyphens from both ends."""
    collapsed = re.sub(r"\s+", " ", t)
    return collapsed.strip(" .-")

# Split the cleaned text on commas/semicolons, normalise every piece with
# _norm, and discard pieces that are empty or whitespace-only.
brandfood["ingredients_list"] = (
    brandfood["ingredients_clean"]
      .str.split(r"[;,]")
      .apply(
          lambda parts: [
              _norm(piece)
              for piece in (parts or [])
              if isinstance(piece, str) and piece.strip()
          ]
      )
)

# Derived views of the token list: count, pipe-joined text, JSON text
brandfood["num_ingredients"] = brandfood["ingredients_list"].apply(len)
brandfood["ingredients_list_str"] = brandfood["ingredients_list"].apply(
    lambda tokens: " | ".join(tokens)
)
brandfood["ingredients_list_json"] = brandfood["ingredients_list"].apply(json.dumps)

# 3) Display name: copy of the 'food_name' column merged in from food.csv
brandfood["product_name"] = brandfood["food_name"]

# 4) Keep only the output columns, in this order; names that are absent from
#    the frame are skipped rather than raising.
front = [
    col
    for col in ("fdc_id", "product_name", "ingredients_list_json", "num_ingredients")
    if col in brandfood.columns
]
brandfood = brandfood[front]

# NOTE: this is not the last expression of the cell, so Jupyter does not render it.
brandfood.head()

# 5) Save
from pathlib import Path
# Output directory; created (with parents) if missing, no error if it exists
OUTDIR = Path("../DATA/processed")
OUTDIR.mkdir(parents=True, exist_ok=True)

# Write the reduced table without the row index and report the absolute path
out_path = OUTDIR.joinpath("branded_food_clean.csv")
brandfood.to_csv(out_path, index=False)
print("Saved:", out_path.resolve())
