In [5]:
import json
import re

# Parenthesised notes, digit runs and fraction slashes. These are stripped in a
# first pass (replaced by a space) so that a unit glued to a number ("100g")
# becomes a standalone word the unit pattern can then match.
_NOTES_AND_NUMBERS_RE = re.compile(r'\(.*?\)|\d+|/')

# Measurement words, singular and plural, plus the dotted abbreviations.
# "ea." / "pkg." are matched together with their period and without a trailing
# word boundary: a boundary after "." only exists when a word character follows,
# so the usual "pkg. cream cheese" form would otherwise never match.
_UNITS_RE = re.compile(
    r'\b(?:ea|pkg)\.'
    r'|\b(?:teaspoons?|tablespoons?|cups?|grams?|g|ml|ounces?|pounds?|tbsps?|tsps?|'
    r'lbs?|oz|kg|quarts?|liters?|dash(?:es)?|pinch(?:es)?|scoops?|slices?|parts?)\b',
    flags=re.IGNORECASE,
)


def clean_ingredient(ingredient):
    """
    Reduce an ingredient line to its name.

    Removes parenthesised text, digits, fraction slashes and measurement units
    (case-insensitive, singular or plural), then drops every character that is
    neither a letter nor whitespace and collapses whitespace runs to one space.
    Letter case is preserved and accented letters are kept.

    Args:
        ingredient (str): Ingredient text, e.g. "2 cups flour (sifted)".

    Returns:
        str: The cleaned name, or '' when nothing but quantities/units was present.
    """
    # Pass 1: notes and numbers -> space (not ''), so neighbours are not fused.
    text = _NOTES_AND_NUMBERS_RE.sub(' ', ingredient)
    # Pass 2: units, now guaranteed to stand on word boundaries.
    text = _UNITS_RE.sub(' ', text)
    # Keep letters (any alphabet) and whitespace; punctuation is dropped.
    letters_only = ''.join(ch for ch in text if ch.isalpha() or ch.isspace())
    # split()/join trims the ends and collapses interior whitespace in one step,
    # so "salt and 1 tsp pepper" and "salt and pepper" clean to the same string.
    return ' '.join(letters_only.split())

def process_ingredients(input_file_path, output_file_path):
    """
    Clean every ingredient in a JSON list and write the unique results.

    Args:
        input_file_path: Path of a JSON file expected to hold an array of
            ingredient strings (each element is passed to ``clean_ingredient``).
        output_file_path: Path the result is written to as a JSON array
            (indent 4) of unique, non-empty cleaned names in sorted order.

    Returns:
        None. The result is only written to ``output_file_path``.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(input_file_path, 'r', encoding='utf-8') as file:
        ingredients = json.load(file)

    # Clean each entry exactly once and drop entries that clean to ''.
    cleaned_ingredients = {
        cleaned for cleaned in map(clean_ingredient, ingredients) if cleaned
    }

    # Sort before dumping: iteration order of a set of strings differs between
    # interpreter runs, which would make the output file non-reproducible.
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(sorted(cleaned_ingredients), outfile, indent=4)

# Run the cleaning pass: read ./all_ingredients.json, write ./cleaned_ingredients_2.json.
input_path, output_path = './all_ingredients.json', './cleaned_ingredients_2.json'
process_ingredients(input_path, output_path)


In [None]:
import json

# Load the ingredients from the JSON file.
# NOTE(review): this reads 'cleaned_ingredients.json', whereas the previous cell
# writes './cleaned_ingredients_2.json' — confirm which file is intended.
# UTF-8 is given explicitly so the read does not depend on the locale encoding.
with open('cleaned_ingredients.json', 'r', encoding='utf-8') as file:
    ingredients = json.load(file)

# Filter out empty elements and elements containing numbers
filtered_ingredients = [
    ingredient
    for ingredient in ingredients
    if ingredient and not any(char.isdigit() for char in ingredient)
]

# Remove duplicates while keeping first-seen order. dict.fromkeys is used
# instead of set() because set iteration order for strings changes between
# interpreter runs, which made the saved file differ from run to run.
unique_ingredients = list(dict.fromkeys(filtered_ingredients))

# Save the cleaned ingredients back to a JSON file
with open('cleaned_ingredients_bis.json', 'w', encoding='utf-8') as file:
    json.dump(unique_ingredients, file, indent=4)

print("Cleaning done")

In [None]:
import json

# Load the ingredients from the JSON file.
# NOTE(review): 'cleaned_ingredients_bis_sorted_bis.json' is not written by any
# cell visible here — presumably produced by hand or elsewhere; confirm.
# UTF-8 is given explicitly so the read does not depend on the locale encoding.
with open('cleaned_ingredients_bis_sorted_bis.json', 'r', encoding='utf-8') as file:
    ingredients = json.load(file)

# Remove duplicates and sort. The default string ordering is by code point,
# so capitalised names sort before lowercase ones.
unique_sorted_ingredients = sorted(set(ingredients))

# Save the cleaned ingredients back to a JSON file
with open('final_ingredients_sorted.json', 'w', encoding='utf-8') as file:
    json.dump(unique_sorted_ingredients, file, indent=4)

print("Sorted and unique ingredients have been saved to 'final_ingredients_sorted.json'")