In [77]:
import datetime
import requests
import json
import re
import os
from copy import deepcopy
from collections import Counter

In [78]:
# API key is read from the environment; the value is None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [79]:
# Canonical category labels (one string per line).
CANON_CATEGORIES = [
    "Weight Management",
    "Digestive & Gut Health",
    "Energy",
    "Sports Nutrition",
    "Joint & Mobility",
    "Cognitive Health",
    "Immune Health",
    "Hair, Skin & Nails",
    "Men's Wellness",
    "Women's Wellness",
    "Prenatal & Postnatal",
    "Liver & Detox",
    "Sleep & Relaxation",
    "Stress & Mood",
    "Bone Health",
    "Heart Health",
    "Blood Sugar Support",
    "Healthy Aging",
    "Vision & Eye Health",
    "Inflammation & Pain",
]
# Flat list of canonical ingredient name strings.
# The list is NOT alphabetical throughout: entries such as "Bovine Spleen" sit
# after "Lithium", and everything from "Guar Gum" onward follows "Zucchini",
# so do not rely on ordering (e.g. for binary search).
# NOTE(review): "St. John’s Wort" uses a typographic apostrophe (’) while other
# entries ("Cat's claw", "Lion's Mane", ...) use an ASCII one — confirm this is
# intended, since an exact string comparison against "St. John's Wort" will not match.
CANON_INGREDIENTS = ["5-HTP", "Acacia", "Acai", "Adenosine triphosphate", "Adrenal", "Agarikon", "Agmatine", "Akkermansia", "Alfalfa", "Algae", "Algae Oil", "Allicin", "Aloe vera", "Alpha-ketoglutaric acid", "Alpha-lipoic acid", "Amla", "Amylase", "Andrographis", "Anise", "Apigenin", "Apple", "Arabinogalactan", "Argan oil", "Arginine", "Arjuna", "Artichoke", "Ashwagandha", "Asparagine", "Aspartic acid", "Astaxanthin", "Astragalus", "Bacillus coagulans", "Bacillus subtilis", "Bacopa", "Bamboo", "Banaba", "Baobab", "Barberry", "Barley", "Bee propolis", "Beet", "Berberine", "Bergamot", "Beta-Alanine", "Beta-glucan", "Betaine", "Bifidobacterium adolescentis", "Bifidobacterium bifidum", "Bifidobacterium breve", "Bifidobacterium infantis", "Bifidobacterium longum", "Bilberry", "Biotin", "Bitter melon", "Black cherry", "Black cohosh", "Black currant", "Black pepper", "Black radish", "Black walnut", "Bladderwrack", "Blessed thistle", "Blue flag", "Blueberry", "Boldo", "Bone marrow", "Borage Oil", "Boron", "Boswellia", "Brewers yeast", "Broccoli", "Bromelain", "Buchu", "Buckthorn", "Buckwheat", "Bupleurum", "Burdock", "Butcher's broom", "Butterbur", "Butyrate", "Cabbage", "Caffeine", "Calcium", "Camelina Oil", "Camu camu", "Cannabidiol", "Cannabigerol", "Capsicum", "Cardamom", "Carnitine", "Carnosine", "Carrot", "Cascara sagrada", "Cat's claw", "Catalase", "Catuaba", "Cayenne", "Celery", "Cetyl myristoleate", "Chaga", "Chamomile", "Chanca piedra", "Chaste tree", "Chicory", "Chinese Yam", "Chitosan", "Chlorella", "Chloride", "Chlorophyll", "Choline", "Chondroitin", "Chromium", "Chrysin", "Chymotrypsin", "Cinnamon", "Cissus quadrangularis", "Citicoline", "Citrulline", "Citrus bioflavonoids", "Clostridium Butyricum", "Clove", "Cobalt", "Cocoa", "Coconut", "Coenzyme Q10", "Coleus forskohlii", "Collagen", "Colostrum", "Conjugated linoleic acid", "Copper", "Coptis", "Cordyceps", "Coriander", "Corn silk", "Couch grass", "Cramp bark", "Cranberry", "Creatine", "Cumin", "Curcumin", 
"D-Mannose", "D-Ribose", "Damiana", "Dandelion", "Dehydroepiandrosterone", "Devil's claw", "Dihydromyricetin (DHM)", "Diindolylmethane", "Dimethylglycine", "Diosmin", "Docosahexaenoic acid (DHA)", "Dong quai", "DPA", "Dulse", "Duodenum", "Ecdysterone", "Echinacea", "Ecklonia cava", "Eicosapentaenoic acid (EPA)", "Elderberry", "Elderflower", "Elecampane", "Eleuthero", "Epicatechin", "Epigallocatechin gallate", "Ergothioneine", "Eucalyptus", "Evening primrose oil", "Eyebright", "Fennel", "Fenugreek", "Feverfew", "Fish oil", "Flaxseed Oil", "Flower Pollen Extract", "Fo-ti (He Shou Wu)", "Folate", "Forskolin", "Fructooligosaccharides", "Fucoidan", "Fucoxanthin", "Fulvic acid", "GABA", "Gamma linolenic acid", "Garcinia cambogia", "Garlic", "Gelatin", "Gentian", "Germanium", "Ginger", "Ginkgo", "Ginseng", "Glucomannan", "Glucosamine", "Glutamic acid", "Glutamine", "Glutathione", "Goldenseal", "Gotu kola", "Grape", "Grapefruit", "Gravel root", "Graviola", "Green coffee bean", "Guarana", "Guggul", "Gymnema sylvestre", "Gynostemma", "Halostachine", "Hawthorn", "Hemp", "Hesperidin", "Hibiscus", "Higenamine", "Histidine", "HMB", "Holy Basil", "Honey", "Hoodia", "Hops", "Hordenine", "Horny goat weed", "Horse chestnut", "Horsetail", "Huperzine", "Hyaluronic acid", "Hydroxyproline", "Hyssop", "Immunoglobulins", "Indole-3-carbinol", "Inositol", "Inositol Hexaphosphate", "Inulin", "Iodine", "Iron", "Isatis", "Isoflavones", "Isoleucine", "Jerusalem artichoke", "Jujube", "Juniper", "Kale", "Kanna", "Kava kava", "Kelp", "Keratin", "Kidney", "Kola nut", "Kudzu", "Lactase", "Lactobacillus acidophilus", "Lactobacillus brevis", "Lactobacillus bulgaricus", "Lactobacillus casei", "Lactobacillus paracasei", "Lactobacillus plantarum", "Lactobacillus reuteri", "Lactobacillus rhamnosus", "Lactoferrin", "Lavender", "Lemon", "Lemon balm", "Leucine", "Licorice", "Lignans", "Linoleic acid", "Lion's Mane", "Lipase", "Lithium", "Bovine Spleen", "Bovine Pancreas", "Bovine Kidney", "Bovine Heart", 
"Bovine Liver", "Bovine Glandular", "Lobelia", "Long pepper", "Lumbrokinase", "Lutein", "Luteolin", "Lycopene", "Lysine", "Maca", "Magnesium Citrate", "Magnesium Glycinate", "Magnesium Malate", "Magnesium Taurinate", "Magnesium Threonate", "Magnesium Oxide", "Magnolia", "Maitake", "Malic acid", "Manganese", "Mangosteen", "Maqui Berry", "Marigold", "Marine Phytoplankton", "Marshmallow", "Matcha", "Medium chain triglycerides", "Melatonin", "Methionine", "Methylsulfonylmethane", "Milk thistle", "Mineral", "Molybdenum", "Monolaurin", "Moringa", "Motherwort", "Mucuna pruriens", "Muira puama", "Mullein", "Mussel", "Mustard", "Myricetin", "Myrrh", "N-Acetyl-Cysteine", "NADH", "Naringin", "Natto", "Nattokinase", "Neem", "Nettle", "Nickel", "NMN (Nicotinamide mononucleotide)", "Noni", "Nopal cactus", "Nucleic acid", "Oat", "Oleic acid", "Olive", "Orange", "Oregano", "Oregon grape", "Ornithine", "Osha", "Ox bile", "Oyster", "Palmitoylethanolamide", "Pancreas", "Pancreatin", "Papaya", "Paprika", "Para-aminobenzoic acid", "Parsley", "Passion flower", "Pau d'arco", "Pea protein", "Pectin", "Pectinase", "Peppermint", "Pepsin", "Perilla Oil", "Phenylalanine", "Phenylethylamine", "Phosphatidylcholine", "Phosphatidylserine", "Phosphorus", "Phytoceramides", "Phytosterols", "Picrorhiza", "Pine", "Pine bark", "Pineapple", "Piperine", "Plantain", "Plum", "Policosanol", "Polyphenol", "Polypodium vulgare", "Pomegranate", "Poria", "Potassium", "Pregnenolone", "Prickly ash", "Prickly pear", "Proanthocyanidins", "Proline", "Protease", "Hydrolyzed Whey Protein", "Prune", "Psyllium", "Pterostilbene", "Pumpkin", "Pumpkin Seed Oil", "Pygeum", "Quercetin", "Raspberry", "Rauwolscine", "Red clover", "Red root", "Red wine", "Red yeast rice", "Rehmannia", "Reishi", "Resveratrol", "Rhodiola", "Rhubarb", "Riboflavin", "Rice bran", "RNA", "Rose hips", "Rosemary", "Royal jelly", "Rubidium", "Rutin", "Saccharomyces boulardii", "Safflower Oil", "Saffron", "Sage", "Sarsaparilla", "Saw palmetto", 
"Schisandra", "Scute (Scutellaria)", "Sea Buckthorn", "Sea cucumber", "Selenium", "Senna", "Serine", "Serrapeptase", "Sesame", "Shark cartilage", "Shatavari", "Shiitake", "Shilajit", "Silicon", "Slippery elm", "Sodium", "Sophora Japonica", "Soybean", "Spermidine", "Spirulina", "Spleen", "Squalene", "St. John’s Wort", "Star anise", "Stevia", "Stoneroot", "Strawberry", "Streptococcus thermophilus", "Strontium", "Succinic acid", "Sulbutiamine", "Sulforaphane", "Suma", "Sunflower", "Superoxide dismutase", "Sweet cherry", "Synephrine", "Tangerine", "Tart cherry", "Taurine", "Theacrine", "Theanine", "Theobromine", "Threonine", "Thyme", "Thymus", "Thyroid", "Tin", "Tomato", "Tongkat Ali", "Toothed clubmoss", "Tribulus terrestris", "Triphala", "TUDCA (Tauroursodeoxycholic acid)", "Turkey rhubarb", "Turkey tail", "Tyrosine", "Uridine", "Urolithin A", "Uva ursi", "Valerian", "Valine", "Vanadium", "Vegetable glycerin", "Vinpocetine", "Vitamin A", "Vitamin B1", "Vitamin B12", "Vitamin B3", "Vitamin B5", "Vitamin B6", "Vitamin C", "Vitamin D3", "Vitamin E", "Wasabi", "Watercress", "Watermelon", "Wheat", "Whey protein", "White mulberry", "Wild yam", "Willow bark", "Witch hazel", "Wormwood", "Xylitol", "Xylooligosaccharides", "Yarrow", "Yellow dock", "Yellow pea", "Yerba mate", "Yerba santa", "Yohimbe", "Yucca", "Zeaxanthin", "Zinc", "Zucchini", "Guar Gum", "Chia", "Flaxseed", "Chickpea", "Quinoa", "Sweet Potato", "PQQ", "Citrulline Malate", "Alpha-GPC", "Vitamin K2", "Phosphatidic Acid", "Alpinia galanga", "Kiwifruit", "Mango", "Panax notoginseng", "Saccharomyces cerevisiae", "Black turmeric", "Krill Oil", "Aronia berry", "Terminalia bellerica", "Bifidobacterium lactis", "Dextrin", "Beta-Carotene", "Galactooligosaccharides", "Streptococcus salivarius", "Lactobacillus gasseri", "Apple cider vinegar", "Grains of Paradise", "Cellulase", "Ceramides", "Lactobacillus helveticus", "Coffee fruit", "DMAE", "Methylliberine", "Fermented Yeast", "Melon", "Tapioca", "Beta-hydroxybutyrate", 
"Honokiol", "Glycerin", "Kombucha", "Nicotinamide riboside", "Green Lipped Mussel", "Black Seed Oil", "Nitrate salts", "Red Spinach", "Rosa roxburghii", "Green Tea", "Alanine", "Erythritol", "Theobroma cacao", "Glycine", "Casein peptides", "7-Keto DHEA"]

In [None]:
# Read the JSON file (path is relative to the notebook's working directory)
# as UTF-8 text and parse it into `studies`.
with open("../output/simplified_studies.json", encoding="utf-8") as f:
    studies = json.loads(f.read())

In [81]:
# Display the size of the loaded `studies` object (the saved output of this cell shows 5184).
len(studies)

5184

In [82]:
# Ingredient name used by later cells (usage not shown in this part of the notebook);
# "Dextrin" is one of the entries in CANON_INGREDIENTS.
ingredient = "Dextrin"

In [83]:
# Normalize ingredient
def normalize_ingredient_name(name: str) -> str:
    if not isinstance(name, str):
        return ""
    
    normalized = name.lower().strip()
    normalized = re.sub(r"\d+\s*(mg|mcg|iu|g|ml|μg|units?)\b", "", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\s+(supplement|capsule|tablet|powder|extract|oil)s?\b", "", normalized)
    normalized = re.sub(r"\s+", " ", normalized).strip()

    SYNONYM_MAP = {
        "\"apigenin\" and \"glycyrrhizin\"": "Apigenin",
        "(R)-3-hydroxybutyl (R)-3-hydroxybutyrate ketone monoester": "Beta-hydroxybutyrate",
        "(R)-3-hydroxybutyl (R)-3-hydroxybutyrate)": "Beta-hydroxybutyrate",
        "0.9% NaCl physiological saline": "Sodium",
        "0.9%Nacl": "Sodium",
        "1 dose of Beetroot juice intake": "Beet",
        "100 % Maltodextrin": "Dextrin",
        "140 ml per day of Beet-It nitrate beverage (James White Drinks Ltd., Ipswich, UK)": "Nitrate salts",
        "200mg GCE +200mg ALA": "Alpha-lipoic acid",
        "21st Century Gelatin Capsules": "Gelatin",
        "25 hydroxy cholecalciferol (Vitamin D) /25(OH)D3.": "Vitamin D3",
        "25(OH)D3": "Vitamin D3",
        "25-OH Vitamin D": "Vitamin D3",
        "3 mg/kg of caffeine": "Caffeine",
        "3-hydroxybutyrat": "Beta-hydroxybutyrate",
        "3-hydroxybutyrate": "Beta-hydroxybutyrate",
        "315 mg Alpha-GPC": "Alpha-GPC",
        "36 mg/day astaxanthin intervention": "Astaxanthin",
        "4000 IU of Vitamin D3": "Vitamin D3",
        "4000IU Vitamin D3": "Vitamin D3",
        "4g of taurine daily": "Taurine",
        "500mg seaweed": "Algae",
        "A combination of the vegetable bouillon, the microalgae and the beta-Glucan": "Beta-glucan",
        "Acacia Gum": "Acacia",
        "Acetate (Apple Cider Vinegar)": "Apple cider vinegar",
        "Acetyl cysteine": "N-Acetyl-Cysteine",
        "Acetyl-L-Carnitin, L Arginine, Co-Q10": "Carnitine",
        "Akkermansia muciniphila": "Akkermansia",
        "Alanine": "Alanine",
        "Algae oil 1": "Algae Oil",
        "Algae oil 2": "Algae Oil",
        "Allium sativum Extract": "Garlic",
        "Alpha lipoic acid (ALA)": "Alpha-lipoic acid",
        "Alpha-GPC, Creatine and Ashwagandha (Sensoril®)": "Alpha-GPC",
        "Alpha-Glycerophosphocholine（α-GPC）": "Alpha-GPC",
        "Alpha-ketoglutarate": "Alpha-ketoglutaric acid",
        "Alpha-ketoglutarate supplied as 1000mg capsule": "Alpha-ketoglutaric acid",
        "Alpha-lipoic acid": "Alpha-lipoic acid",
        "AMPK Charge+® with semaglutide": "Semaglutide",
        "ANKASCIN 568-P Red yeast rice capsules": "Red yeast rice",
        "Apple cider vinegar": "Apple cider vinegar",
        "Apple Polyphenols": "Polyphenol",
        "Apigenin": "Apigenin",
        "Arginine and Glutamine Oral Suspension": "Arginine",
        "Ashwagandha": "Ashwagandha",
        "Ashwagandha 300 mg standardized root extract": "Ashwagandha",
        "Ascorbic Acid (Vitamin C)": "Vitamin C",
        "Ascorbic Acid 500Mg Tab": "Vitamin C",
        "Ascorbic Acid 500Mg/Ml Inj": "Vitamin C",
        "Ascorbic acid": "Vitamin C",
        "Ascorbic acid (Vitamin C)": "Vitamin C",
        "Astaxanthin": "Astaxanthin",
        "Astaxanthin (12 mg/day) intervention": "Astaxanthin",
        "Astaxanthin (AST trial)": "Astaxanthin",
        "Astaxanthin 6 mg daily": "Astaxanthin",
        "Astaxanthin Oral Capsule": "Astaxanthin",
        "AstraGin (Ginseng and Astragalus Saponin Extract)": "Ginseng",
        "Astragalus": "Astragalus",
        "Astragalus extract": "Astragalus",
        "Bacillus Coagulans": "Bacillus coagulans",
        "Bacillus Subtilis": "Bacillus subtilis",
        "Bacillus coagulans": "Bacillus coagulans",
        "Bacillus subtilis ATCC 122264": "Bacillus subtilis",
        "Bacillus subtilis Capsules": "Bacillus subtilis",
        "Bacillus subtilis enteric-coated capsules": "Bacillus subtilis",
        "Bacopa monnieri": "Bacopa",
        "Beet": "Beet",
        "Beet Juice": "Beet",
        "Beet Root": "Beet",
        "Beet-root juice": "Beet",
        "Beetroot Juice": "Beet",
        "Beetroot Juice (BJ)": "Beet",
        "Beetroot Juice (Beet-It Stamina Shot) & Supervised Exercise Training": "Beet",
        "Beetroot Juice - Active": "Beet",
        "Beetroot Juice supplement": "Beet",
        "Beetroot drink": "Beet",
        "Beetroot extract": "Beet",
        "Beetroot juice": "Beet",
        "Beetroot juice (James White, UK)": "Beet",
        "Beetroot juice combined with caffeine (COM)": "Beet",
        "Beetroot juice concentrate (140 ml)": "Beet",
        "Beetroot juice plus vitamin C tablet": "Beet",
        "Beetroot juice rich in nitrate": "Beet",
        "Beetroot juice supplementation or placebo": "Beet",
        "Beetroot juice with caffeine placebo (BRJ)": "Beet",
        "Beetroot juice with nitrate extracted": "Beet",
        "Berberine": "Berberine",
        "Beta Alanine high dose": "Beta-Alanine",
        "Beta Alanine low dose": "Beta-Alanine",
        "Beta Glucan": "Beta-glucan",
        "Beta-Hydroxybutyrate": "Beta-hydroxybutyrate",
        "Beta-alanine": "Beta-Alanine",
        "Beta-glucan": "Beta-glucan",
        "Beta-glucan supplement group": "Beta-glucan",
        "Beta-hydroxybutyrate": "Beta-hydroxybutyrate",
        "Betaine": "Betaine",
        "Biotin supplement": "Biotin",
        "Black Seed Oil": "Black Seed Oil",
        "Blueberry": "Blueberry",
        "Blueberry Supplementation": "Blueberry",
        "Blueberry drink": "Blueberry",
        "Blueberry gel": "Blueberry",
        "Blueberry powder": "Blueberry",
        "Blueberry powder group": "Blueberry",
        "Blueberry powder supplement": "Blueberry",
        "Broccoli Sprout/Broccoli Seed Extract Supplement": "Broccoli",
        "Broccoli seed extract": "Broccoli",
        "Broccoli seed extract with mustard seed powder": "Broccoli",
        "Broccoli sprouts extract supplementation": "Broccoli",
        "Bromelain": "Bromelain",
        "Buckwheat": "Buckwheat",
        "Butyrate": "Butyrate",
        "Caffeine": "Caffeine",
        "Caffeine (200 mg) + Placebo (300 mg)": "Caffeine",
        "Caffeine 0 mg": "Caffeine",
        "Caffeine 2 mg": "Caffeine",
        "Caffeine 3 mg/kg Oral Powder": "Caffeine",
        "Caffeine 4 mg": "Caffeine",
        "Caffeine 6 mg/kg Oral Powder": "Caffeine",
        "Caffeine 6mg/kg": "Caffeine",
        "Caffeine 75mg": "Caffeine",
        "Caffeine 9 mg/kg Oral Powder": "Caffeine",
        "Caffeine Gum": "Caffeine",
        "Caffeine Gum 3mg/kg": "Caffeine",
        "Caffeine Supplement": "Caffeine",
        "Caffeine supplementation": "Caffeine",
        "Calcium": "Calcium",
        "Calcium Carbonate": "Calcium",
        "Calcium carbonate": "Calcium",
        "Camu Camu Capsules (Camu Camu powder encapsulated (500mg each) + ICI": "Camu camu",
        "Cannabidiol": "Cannabidiol",
        "Cannabidiol (CBD)": "Cannabidiol",
        "Cannabidiol (CBD) Broad-Spectrum Oil": "Cannabidiol",
        "Cannabidiol (CBD) Extract": "Cannabidiol",
        "Cannabidiol (CBD) powder formulation": "Cannabidiol",
        "Cannabidiol (CBD) supplementation": "Cannabidiol",
        "Cannabidiol (CBD)-Rich Broad Spectrum Hemp Extract Oil": "Cannabidiol",
        "Cannabidiol Capsules": "Cannabidiol",
        "Carnitine": "Carnitine",
        "Carnosine": "Carnosine",
        "Carnosine supplementation": "Carnosine",
        "Casein": "Casein peptides",
        "Chaga Mushroom": "Chaga",
        "Chamomile": "Chamomile",
        "Chamomile (Matricaria recutita)": "Chamomile",
        "Chamomile Extract Capsule": "Chamomile",
        "Chamomile Tea": "Chamomile",
        "Chia seed": "Chia",
        "Chickpea Pasta": "Chickpea",
        "Chlorella pyrenoidosa": "Chlorella",
        "Chlorella supplementation": "Chlorella",
        "Cholecalciferol": "Vitamin D3",
        "Choline": "Choline",
        "Cinnamon Capsules": "Cinnamon",
        "Citicoline": "Citicoline",
        "Clostridium butyricum": "Clostridium Butyricum",
        "Clostridum Butyricum Capsule": "Clostridium Butyricum",
        "Co-Enzyme Q10": "Coenzyme Q10",
        "CoQ10": "Coenzyme Q10",
        "CoQ10 1200 mg orally with Glutathione 1000 mg orally": "Coenzyme Q10",
        "CoQ10 supplementation": "Coenzyme Q10",
        "Cocoa": "Cocoa",
        "Cocoa Powder": "Cocoa",
        "Cocoa flavonoids": "Cocoa",
        "Coconut": "Coconut",
        "Coconut Oil": "Coconut",
        "Coenzyme Q 10": "Coenzyme Q10",
        "Coenzyme Q10": "Coenzyme Q10",
        "Coenzyme Q10 100 MG Oral Tablet": "Coenzyme Q10",
        "Coenzyme Q10 100 Milligrams Oral Capsule": "Coenzyme Q10",
        "Collagen": "Collagen",
        "Collagen + vitamin C": "Collagen",
        "Collagen Drinks": "Collagen",
        "Collagen Hydrolysate Supplement": "Collagen",
        "Collagen Peptide": "Collagen",
        "Collagen Peptides": "Collagen",
        "Collagen Supplement": "Collagen",
        "Collagen and Vitamin C": "Collagen",
        "Collagen hydrolysate": "Collagen",
        "Collagen hydrolyzed peptides": "Collagen",
        "Collagen peptide supplement": "Collagen",
        "Collagen protein": "Collagen",
        "Collagen sachet": "Collagen",
        "Coriander Seed Oil - Dose 1": "Coriander",
        "Coriander Seed Oil - Dose 2": "Coriander",
        "Cranberry": "Cranberry",
        "Cranberry Juice": "Cranberry",
        "Cranberry Juice A": "Cranberry",
        "Cranberry Juice B": "Cranberry",
        "Cranberry extract": "Cranberry",
        "Cranberry juice": "Cranberry",
        "Cranberry powder": "Cranberry",
        "Creatine": "Creatine",
        "Creatine Bolus 3": "Creatine",
        "Creatine Bolus 5": "Creatine",
        "Creatine Group": "Creatine",
        "Creatine Intermittent 5": "Creatine",
        "Creatine Monohydrate": "Creatine",
        "Creatine Monohydrate Supplementation": "Creatine",
        "Creatine Supplementation": "Creatine",
        "Creatine Whey Protein": "Creatine",
        "Creatine monohydrate": "Creatine",
        "Creatine supplementation": "Creatine",
        "Cumin Seed (Cuminum Cyminum)": "Cumin",
        "Curcumin": "Curcumin",
        "Curcumin (Longvida™)": "Curcumin",
        "Curcumin + Piperine": "Curcumin",
        "Curcumin Gummies": "Curcumin",
        "Curcumin Supplementation": "Curcumin",
        "Curcumin and virgin coconut oil extract (KurCo Smart)": "Curcumin",
        "Curcumin capsules 1 gm": "Curcumin",
        "Curcumin plus Piperine": "Curcumin",
        "Curcumin therapy": "Curcumin",
        "Curcumin, Omega-3 and Vitamin-D (COD)": "Curcumin",
        "Curcumin, vitamin d and green tea extract": "Curcumin",
        "Curcumin-Berberine (coptis)": "Curcumin",
        "Curcumin/ Demethoxycurcumin/Bisdemethoxycurcumin-containing Supplement": "Curcumin",
        "Curcumin/Boswellia Serrata/Ascorbic acid mixture": "Curcumin",
        "D-Mannose": "D-Mannose",
        "DHA": "Docosahexaenoic acid (DHA)",
        "DHA and EPA": "Docosahexaenoic acid (DHA)",  # (EPA also present in canon)
        "Dehydroepiandrosterone (DHEA)": "Dehydroepiandrosterone",
        "Dihydroberberine（DHB）400 mg": "Berberine",
        "Docosahexaenoic Acid": "Docosahexaenoic acid (DHA)",
        "Dried plums": "Prune",
        "EPA": "Eicosapentaenoic acid (EPA)",
        "EPA supplementation": "Eicosapentaenoic acid (EPA)",
        "Elderberry": "Elderberry",
        "Egg White Powder": "Egg",
        "Eggs": "Egg",
        "Epicatechin extract": "Epicatechin",
        "Epigallocatechin Gallate": "Epigallocatechin gallate",
        "Epigallocatechin-3-Gallate (EGCG)": "Epigallocatechin gallate",
        "Erythritol": "Erythritol",
        "Fenugreek (Trigonella foenum graecum)": "Fenugreek",
        "Fenugreek Seed Powder": "Fenugreek",
        "Fenugreek Seeds and Indian Rennet": "Fenugreek",
        "Fenugreek seed": "Fenugreek",
        "FeSO4": "Iron",
        "FeSO4 + Lf": "Iron",
        "FeSO4 + OTf": "Iron",
        "Ferrous bisglycinate": "Iron",
        "Ferrous fumarate": "Iron",
        "Ferrous sulfate": "Iron",
        "Ferrous sulphate 200mg oral tablet providing 65 mg of elemental iron": "Iron",
        "Fisetin": "Fisetin",
        "Fish Oil": "Fish oil",
        "Fish Oil Concentrate, 1000 Mg Oral Capsule": "Fish oil",
        "Fish oil": "Fish oil",
        "Fish oil (control)": "Fish oil",
        "Fish oil + Roasted wheat flour": "Fish oil",
        "Fish oil + Vegetable and fruit extracts": "Fish oil",
        "Fish oil + wild orange essential oil supplement": "Fish oil",
        "Fish oil enteral supplementation": "Fish oil",
        "Fish oil intervention": "Fish oil",
        "Fish oil oral supplementation": "Fish oil",
        "Fish oil supplement": "Fish oil",
        "Fish oil supplement 1": "Fish oil",
        "Fish oil supplement 2": "Fish oil",
        "Flaxseed": "Flaxseed",
        "Flaxseed oil capsule": "Flaxseed Oil",
        "Folate": "Folate",
        "Folic Acid": "Folate",
        "Folic acid": "Folate",
        "Folic acid and vitamin B12 fortified flour": "Folate",
        "Folic acid supplement": "Folate",
        "Folinic acid (oral); Cyanocobalamin sublingual": "Folate",
        "Fucoidan": "Fucoidan",
        "Galactooligosaccharides (GOS)": "Galactooligosaccharides",
        "Ginger": "Ginger",
        "Ginseng extract": "Ginseng",
        "Glucosamine/Chondroitin": "Glucosamine",
        "Glucose": "Glucose",
        "Glutamine": "Glutamine",
        "Glycine": "Glycine",
        "Grape Seed Extract": "Grape",
        "Grape extract (VinteraTM Premium Red Grape)": "Grape",
        "Grape powder supplementation": "Grape",
        "Grape seed extract": "Grape",
        "Green Tea": "Green Tea",
        "Green Tea Extract": "Green Tea",
        "Green tea": "Green Tea",
        "Hawthorn supplement": "Hawthorn",
        "HMB": "HMB",
        "HMB + Vitamin D3": "HMB",
        "HMB supplementation": "HMB",
        "Honey": "Honey",
        "Honokiol": "Honokiol",
        "Hyaluronic Acid (HA)": "Hyaluronic acid",
        "Hyaluronic acid 50 mg/capsule": "Hyaluronic acid",
        "Hyaluronic acid 75 mg/capsule": "Hyaluronic acid",
        "Hyaluronic acid, HA": "Hyaluronic acid",
        "Inulin": "Inulin",
        "Iron": "Iron",
        "Isoflavones": "Isoflavones",
        "Isolated Whey Protein": "Whey protein",
        "Kefir": "Kefir",
        "Kefir peptide": "Kefir",
        "Krill Oil": "Krill Oil",
        "Krill Oil 500 MG": "Krill Oil",
        "Krill oil": "Krill Oil",
        "Krill oil supplementation": "Krill Oil",
        "L-Arginine Powder": "Arginine",
        "L-Carnitine": "Carnitine",
        "L-Carnitine 4g": "Carnitine",
        "L-Citrulline": "Citrulline",
        "L-carnitine": "Carnitine",
        "L-citrulline": "Citrulline",
        "L-leucine": "Leucine",
        "L-theanine": "Theanine",
        "Lysine Group - Lysine-Fortified Bread": "Lysine",
        "Luteolin": "Luteolin",
        "Lycopene": "Lycopene",
        "Maca": "Maca",
        "Mango": "Mango",
        "Melatonin": "Melatonin",
        "Melatonin 0.5 mg": "Melatonin",
        "Melatonin 10 MG": "Melatonin",
        "Melatonin 10 MG Oral Tablet": "Melatonin",
        "Melatonin 3 MG Oral Tablet": "Melatonin",
        "Melatonin 3 mg": "Melatonin",
        "Melatonin 5 mg": "Melatonin",
        "Melatonin 6 mg": "Melatonin",
        "Melatonin intervention": "Melatonin",
        "Melatonin supplement intervention": "Melatonin",
        "Melatonin supplementation": "Melatonin",
        "Melatonin tablet 3 mg once daily": "Melatonin",
        "Melatonin treatment": "Melatonin",
        "Methylcobalamin": "Vitamin B12",
        "Metformin": "Metformin",
        "Metformin (1000 mg Twice a day)": "Metformin",
        "Metformin (Standard Treatment for Type 2 Diabetes)": "Metformin",
        "Metformin 500 mg/day": "Metformin",
        "Metformin Hydrochloride (HCL)": "Metformin",
        "Metformin Hydrochloride 500Mg Tablet": "Metformin",
        "Metformin Monotherapy": "Metformin",
        "Moringa Oleifera": "Moringa",
        "Moringa Oleifera Leaf Micronized Powders in Capsule": "Moringa",
        "Moringa leaf powder": "Moringa",
        "Moringa oleifera": "Moringa",
        "Moringa Oleifera Leaf Micronized Powders in Capsule": "Moringa",
        "Moringa leaf powder": "Moringa",
        "Moringa oleifera": "Moringa",
        "N Acetyl Cysteine": "N-Acetyl-Cysteine",
        "N Acetyl L Cysteine": "N-Acetyl-Cysteine",
        "N-Acetyl cysteine": "N-Acetyl-Cysteine",
        "N-Acetylcysteine": "N-Acetyl-Cysteine",
        "N-acetylcysteine": "N-Acetyl-Cysteine",
        "N-acetylcysteine (NAC)": "N-Acetyl-Cysteine",
        "NAC": "N-Acetyl-Cysteine",
        "NAC (N-acetyl cysteine), Alpha lipoic acid (ALA), liposomal glutathione (GSH)": "N-Acetyl-Cysteine",
        "NMN": "NMN (Nicotinamide mononucleotide)",
        "NMN capsule": "NMN (Nicotinamide mononucleotide)",
        "NMN intervention": "NMN (Nicotinamide mononucleotide)",
        "Nicotinamide": "Nicotinamide",
        "Nicotinamide Mononucleotide": "NMN (Nicotinamide mononucleotide)",
        "Nicotinamide Mononucleotide (NMN)": "NMN (Nicotinamide mononucleotide)",
        "Nicotinamide Riboside": "Nicotinamide riboside",
        "Nicotinamide Riboside (NR)": "Nicotinamide riboside",
        "Nicotinamide Riboside 1g (oral)": "Nicotinamide riboside",
        "Nicotinamide Riboside Chloride": "Nicotinamide riboside",
        "Nicotinamide riboside": "Nicotinamide riboside",
        "Nigella Sativa Oil capsule": "Black Seed Oil",
        "Nigella sativa extract (Nisatol®)": "Black Seed Oil",
        "Nitrate": "Nitrate salts",
        "Nitrate group": "Nitrate salts",
        "Nitrate-Rich Beetroot Juice": "Nitrate salts",
        "Nitrate-rich beetroot Juice": "Nitrate salts",
        "Nitrate-rich beetroot juice": "Nitrate salts",
        "Nitrates and Citrulline Malate": "Citrulline Malate",
        "Niacin": "Vitamin B3",
        "Omega-3": "Omega-3",
        "Omega-3 (EPA+DHA)": "Omega-3",
        "Omega-3 Fatty ACids": "Omega-3",
        "Omega-3 Fatty Acids": "Omega-3",
        "Omega-3 Fatty Acids (EPA plus DHA)": "Omega-3",
        "Omega-3 Polyunsaturated Fatty Acid.": "Omega-3",
        "Omega-3 Suplementation": "Omega-3",
        "Omega-3 Supplementation": "Omega-3",
        "Omega-3 fatty acid": "Omega-3",
        "Omega-3 polyunsaturated fatty acid": "Omega-3",
        "Omega-3 supplementation": "Omega-3",
        "OmegaBoost": "Omega-3",
        "Opuntia ficus-indica Supplementation": "Nopal cactus",
        "Oral Elemental Iron (Ferrous Sulfate) 80 mg/day": "Iron",
        "Pectin": "Pectin",
        "Peppermint Oil": "Peppermint",
        "Peppermint oil": "Peppermint",
        "Pea protein": "Pea protein",
        "Peanuts": "Peanut",
        "Potassium chloride supplement": "Potassium",
        "Potassium nitrate and inulin": "Inulin",
        "Prune": "Prune",
        "Prunes": "Prune",
        "Psyllium": "Psyllium",
        "Psyllium husk": "Psyllium",
        "Psyllium powder": "Psyllium",
        "Pterostilbene-silybin-nicotinamide riboside": "Nicotinamide riboside",
        "Pumpkin Seed Oil \" Ronkin®, KMT PHARMA, Egypt. \"": "Pumpkin Seed Oil",
        "Pectin": "Pectin",
        "Quercetin": "Quercetin",
        "Quercetin (dietary supplement)": "Quercetin",
        "Quercetin 1.250 mg (oral)": "Quercetin",
        "Raspberry": "Raspberry",
        "Red chili peppers": "Capsicum",
        "Red yeast rice": "Red yeast rice",
        "Reishi": "Reishi",
        "Resistant Dextrin": "Dextrin",
        "Resistant dextrin": "Dextrin",
        "Resistant Potato Starch": "Starch",
        "Resistant Potato Starch, Corn Starch": "Starch",
        "Resistant Starch": "Starch",
        "Resistant potato starch": "Starch",
        "Resistant starch": "Starch",
        "Resveratrol": "Resveratrol",
        "Rice Bran": "Rice bran",
        "Riboflavin": "Vitamin B2",
        "Rose Apple Extract Drink": "Rose",
        "Royal Jelly": "Royal jelly",
        "Safflower Oil": "Safflower Oil",
        "Safflower oil": "Safflower Oil",
        "Selenium": "Selenium",
        "Selenium nutritional supplementation": "Selenium",
        "Semaglutide": "Semaglutide",
        "Senna Tab": "Senna",
        "Shatavari": "Shatavari",
        "Spirulina": "Spirulina",
        "Spirulina Arthrospira platensis (microalgae)": "Spirulina",
        "Spirulina maxima supplementation": "Spirulina",
        "Stevia": "Stevia",
        "Starch": "Starch",
        "Strawberry": "Strawberry",
        "Sulforaphane": "Sulforaphane",
        "Superoxide dismutase": "Superoxide dismutase",
        "Sweet cherries, 280 g daily for 42 days": "Sweet cherry",
        "Synephrine": "Synephrine",
        "Starch (placebo)": "Starch",
        "Starch Only": "Starch",
        "Strawberry Intervention": "Strawberry",
        "Strawberry lyophilisate": "Strawberry",
        "Sucralose": "Sucralose",
        "Sulforaphane (Avmacol Extra Strength)": "Sulforaphane",
        "Sunflower Oil": "Sunflower",
        "Sunflower oil": "Sunflower",
        "Superoxide Dismutase": "Superoxide dismutase",
        "Synbiotic": "Synbiotic",
        "Synbiotic Supplement": "Synbiotic",
        "Synbiotic supplement": "Synbiotic",
        "Synbiotics": "Synbiotic",
        "Tart Cherry": "Tart cherry",
        "Taurine": "Taurine",
        "Taurine (Tau)": "Taurine",
        "Taurine and Exercise (Tau+Ex)": "Taurine",
        "Taurine supplementation": "Taurine",
        "TeTrimTeas welsh Herbal tea with senna": "Senna",
        "Thiamine": "Vitamin B1",
        "Trimethylglycine": "Trimethylglycine",
        "Urolithin A": "Urolithin A",
        "Urolithin A (Mitopure)": "Urolithin A",
        "Ursodeoxycholic Acid 250 Mg Oral Capsule": "Ursodeoxycholic acid",
        "Vitamin A": "Vitamin A",
        "Vitamin B12": "Vitamin B12",
        "Vitamin B3 100 MG Oral Tablet": "Vitamin B3",
        "Vitamin B6": "Vitamin B6",
        "Vitamin C": "Vitamin C",
        "Vitamin D": "Vitamin D3",
        "Vitamin D (Cholecalciferol )": "Vitamin D3",
        "Vitamin D3": "Vitamin D3",
        "Vitamin E": "Vitamin E",
        "Vitamin K2": "Vitamin K2",
        "Whey protein": "Whey protein",
        "Whey protein isolate": "Whey protein",
        "Wild Blueberry": "Blueberry",
        "Wild blueberry": "Blueberry",
        "Wild blueberry powder": "Blueberry",
        "Yohimbine": "Yohimbe",
        "Zinc": "Zinc",
        "cow's milk kefir": "Kefir",
        "creatine": "Creatine",
        "creatine monohydrate": "Creatine",
        "creatine supplementation": "Creatine",
        "curcumin": "Curcumin",
        "curcumin, high phenolic extra virgin olive oil (HP-EVOO)": "Curcumin",
        "curcumin, omega-3, and vitamin D (COD)": "Curcumin",
        "dextrin": "Dextrin",
        "dietary nitrate": "Nitrate salts",
        "docosahexaenoic acid (DHA)": "Docosahexaenoic acid (DHA)",
        "donor human milk": "Human milk",
        "eicosapentaenoic acid": "Eicosapentaenoic acid (EPA)",
        "ferrous sulfate": "Iron",
        "fish oil": "Fish oil",
        "flaxseed": "Flaxseed",
        "fructose": "Fructose",
        "fucoidan": "Fucoidan",
        "genistein": "Genistein",
        "geranylgeraniol": "Geranylgeraniol",
        "ginger extract": "Ginger",
        "glucose": "Glucose",
        "green tea": "Green Tea",
        "inulin": "Inulin",
        "iron supplementation": "Iron",
        "kefir": "Kefir",
        "krill oil": "Krill Oil",
        "lutein": "Lutein",
        "maca": "Maca",
        "magnesium": "Magnesium Citrate",
        "melatonin": "Melatonin",
        "protein supplement": "Hydrolyzed Whey Protein",
        "protein supplementation": "Hydrolyzed Whey Protein",
        "psyllium": "Psyllium",
        "psyllium husk": "Psyllium",
        "resveratrol": "Resveratrol",
        "soy isoflavones": "Isoflavones",
        "soy protein": "Pea protein",
        "starch": "Starch",
        "sucralose": "Sucralose",
        "sweet cherries, 280 g daily for 42 days": "Sweet cherry",
        "taurine": "Taurine",
        "tributyrin": "Tributyrin",
        "vitamin B6+ Vitamin B12": "Vitamin B6",
        "vitamin B6+ Vitamin B12 +vitamin C": "Vitamin B6",
        "vitamin B9": "Folate",
        "vitamin C": "Vitamin C",
        "vitamin D (cholecalciferol) supplementation": "Vitamin D3",
        "vitamin D3": "Vitamin D3",
        "walnuts": "Walnut",
        "wheat dextrin": "Dextrin",
        "whey protein": "Whey protein",
        "whey protein isolate": "Whey protein",
        "white rice": "White rice",
        "yohimbine": "Yohimbe",
        "zeaxanthine": "Zeaxanthin",
        "zinc acetate": "Zinc",
        "β-glucan": "Beta-glucan",
        "β-hydroxy β-methylbutyrate (HMB)": "HMB",
        "β-hydroxy β-methylbutyrate (HMB) supplement": "HMB",
    }
    
    for pattern, replacement in SYNONYM_MAP.items():
        if pattern in normalized:
            normalized = replacement
            break
    
    return normalized


In [84]:
# Select every study with at least one intervention name that normalizes to
# the target `ingredient`; scanning of a study stops at its first match.
filtered_studies = []
for study in studies:
    raw_names = study.get("interventions", {}).get("intervention_names", [])
    for raw_name in raw_names:
        if normalize_ingredient_name(raw_name) == ingredient:
            filtered_studies.append(study)
            break
len(filtered_studies)

101

In [85]:
# Preview only the first records; rendering the whole list bloats the saved
# notebook and buries the narrative under raw output.
filtered_studies[:2]

[{'nctId': 'NCT07193927',
  'title': 'Investigation of the Efficacy of a Probiotic Mixture in Moderate Metabolic Dysfunction-Associated Steatotic Liver Disease (MASLD): A Mechanistic Trial',
  'status': None,
  'start_date': '2025-10-13',
  'start_year': 2025,
  'conditions': ['MASLD - Metabolic Dysfunction-Associated Steatotic Liver Disease',
   'Fatty Liver Disease, Nonalcoholic',
   'Overweight (BMI &gt; 25)'],
  'keywords': [],
  'interventions': {'intervention_names': ['Lactiplantibacillus plantarum Probiotic Mixture',
    'Placebo comparator containing maltodextrin'],
   'intervention_types': ['DIETARY_SUPPLEMENT', 'DIETARY_SUPPLEMENT']},
  'enrollment': 60,
  'sponsor': 'AB Biotics, SA'},
 {'nctId': 'NCT05416151',
  'title': 'Effect of B.Lactis Consumption on Gastro-Intestinal (GI) Symptoms in Healthy Women Reporting Minor GI Symptoms',
  'status': None,
  'start_date': '2022-05-24',
  'start_year': 2022,
  'conditions': ['Gastro-intestinal Symptoms in Healthy Subjects'],
  'key

In [None]:
# Category normalization only needs each study's id and raw condition strings.
FIELDS_TO_KEEP = {"nctId", "conditions"}

# Build the reduced records from a deep copy so they never alias `filtered_studies`.
trimmed_studies_for_categories = [
    {field: value for field, value in study.items() if field in FIELDS_TO_KEEP}
    for study in deepcopy(filtered_studies)
]
trimmed_studies_for_categories[:2]

[{'nctId': 'NCT07193927',
  'conditions': ['MASLD - Metabolic Dysfunction-Associated Steatotic Liver Disease',
   'Fatty Liver Disease, Nonalcoholic',
   'Overweight (BMI &gt; 25)']},
 {'nctId': 'NCT05416151',
  'conditions': ['Gastro-intestinal Symptoms in Healthy Subjects']},
 {'nctId': 'NCT05906589', 'conditions': ['Psyllium', 'Inulin']},
 {'nctId': 'NCT06297083', 'conditions': ['Peanut Allergy']},
 {'nctId': 'NCT05135351', 'conditions': ['Multiple Myeloma', 'Lymphoma']},
 {'nctId': 'NCT05744700',
  'conditions': ['Digestive Health', 'Gastrointestinal Health']},
 {'nctId': 'NCT07171411',
  'conditions': ['Muscle Hypertrophy',
   'Bone Mineral Density Loss',
   'Resistance Training Adaptation']},
 {'nctId': 'NCT07201909', 'conditions': ['Stress']},
 {'nctId': 'NCT06920667', 'conditions': ['Type 2 Diabetes Mellitus']},
 {'nctId': 'NCT06570330', 'conditions': ['Healthy Adults']},
 {'nctId': 'NCT05201651',
  'conditions': ['Young Healthy Adults (no Medical Condition/Disease)']},
 {'nctI

In [87]:
# Ingredient normalization only needs each study's id and raw intervention names.
filtered_studies_copy = deepcopy(filtered_studies)
trimmed_studies_for_ingredients = []

for study in filtered_studies_copy:
    # Carry the id over only when the record actually has one.
    trimmed = {"nctId": study["nctId"]} if "nctId" in study else {}

    # Intervention names are read only from a dict-shaped "interventions" value.
    interventions = study.get("interventions", {})
    names = (
        interventions.get("intervention_names")
        if isinstance(interventions, dict)
        else None
    )
    if names is not None:
        trimmed["intervention_names"] = names

    trimmed_studies_for_ingredients.append(trimmed)

trimmed_studies_for_ingredients[:2]

[{'nctId': 'NCT07193927',
  'intervention_names': ['Lactiplantibacillus plantarum Probiotic Mixture',
   'Placebo comparator containing maltodextrin']},
 {'nctId': 'NCT05416151',
  'intervention_names': ['Freeze-dried biotic', 'Maltodextrin']}]

In [99]:
# The summary prompt works from titles, keywords, conditions and interventions.
FIELDS_TO_KEEP = {"nctId", "keywords", "title", "conditions", "interventions"}

# Build the reduced records from a deep copy so they never alias `filtered_studies`.
trimmed_studies_for_summary = [
    {field: value for field, value in study.items() if field in FIELDS_TO_KEEP}
    for study in deepcopy(filtered_studies)
]
trimmed_studies_for_summary[:2]

[{'nctId': 'NCT07193927',
  'title': 'Investigation of the Efficacy of a Probiotic Mixture in Moderate Metabolic Dysfunction-Associated Steatotic Liver Disease (MASLD): A Mechanistic Trial',
  'conditions': ['MASLD - Metabolic Dysfunction-Associated Steatotic Liver Disease',
   'Fatty Liver Disease, Nonalcoholic',
   'Overweight (BMI &gt; 25)'],
  'keywords': [],
  'interventions': {'intervention_names': ['Lactiplantibacillus plantarum Probiotic Mixture',
    'Placebo comparator containing maltodextrin'],
   'intervention_types': ['DIETARY_SUPPLEMENT', 'DIETARY_SUPPLEMENT']}},
 {'nctId': 'NCT05416151',
  'title': 'Effect of B.Lactis Consumption on Gastro-Intestinal (GI) Symptoms in Healthy Women Reporting Minor GI Symptoms',
  'conditions': ['Gastro-intestinal Symptoms in Healthy Subjects'],
  'keywords': ['Gastro intestinal symptoms',
   'Healthy women',
   'Freeze-dried probiotic product',
   'Randomized controlled trial'],
  'interventions': {'intervention_names': ['Freeze-dried bio

In [None]:
# System prompt for the category-normalization request: instructs the model to
# map each study's raw conditions onto the supplied canonical categories and to
# answer with a JSON object of the form {"study_categories": [...]}.
categories_system_prompt = """
You are a data-normalization engine.

You will be given an input object with:
- studies_condition: an array of objects, each containing:
  - nctId
  - conditions (array of raw condition strings)
- canon_categories (canonical category list/mapping)

Your task:
- For EACH study in studies_condition:
  - Map each condition to the best matching category from canon_categories.
  - Use semantic meaning, not exact string matching.
  - Deduplicate categories.
  - Ignore conditions that do not clearly map.

Return ONLY valid JSON in this exact format:

{
  "study_categories": [
    {
      "nctId": "<nctId>",
      "categories": []
    }
  ]
}

Rules:
- Do NOT return explanations.
- Do NOT include the original conditions.
- Do NOT invent categories not present in canon_categories.
- If a study has no matches, return an empty categories array for it.
- Maintain the same order as the input studies.
"""
# Request: map every study's raw conditions onto CANON_CATEGORIES.
# The payload keys mirror the field names announced in the system prompt
# ("studies_condition" and "canon_categories") so the model does not have to
# guess which field carries the studies.
user_content = {
    "studies_condition": trimmed_studies_for_categories,
    "canon_categories": CANON_CATEGORIES
}

payload = {
    "model": "gpt-5-mini",
    "response_format": {"type": "json_object"},
    "messages": [
        {"role": "system", "content": categories_system_prompt},
        {"role": "user", "content": json.dumps(user_content)}
    ]
}

try:
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        json=payload,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {OPENAI_API_KEY}",
        },
        timeout=180
    )

    if response.status_code >= 400:
        raise Exception(f"Bad status code: {response.status_code} — {response.text}")

    ai = response.json()

    # Also treats an empty "choices" list as a malformed response.
    if not ai.get("choices"):
        raise Exception("Missing 'choices' field in response")

    content = ai["choices"][0]["message"].get("content")
    if not content:
        raise Exception("Empty content returned from model")

    # Parsed model reply; the prompt asks for {"study_categories": [...]}.
    study_categories = json.loads(content)

except Exception as e:
    # Report and re-raise: continuing would leave `study_categories` undefined
    # (fresh kernel) or stale (earlier run), silently corrupting later cells.
    print(f"Upstream error: {e}")
    raise

In [95]:
# Inspect the parsed model reply (the prompt requests {"study_categories": [...]}).
study_categories

{'study_categories': [{'nctId': 'NCT07193927', 'category': 'Liver & Detox'},
  {'nctId': 'NCT05416151', 'category': 'Digestive & Gut Health'},
  {'nctId': 'NCT05906589', 'category': 'Digestive & Gut Health'},
  {'nctId': 'NCT06297083', 'category': 'Immune Health'},
  {'nctId': 'NCT05135351', 'category': None},
  {'nctId': 'NCT05744700', 'category': 'Digestive & Gut Health'},
  {'nctId': 'NCT07171411', 'category': 'Sports Nutrition'},
  {'nctId': 'NCT07201909', 'category': 'Stress & Mood'},
  {'nctId': 'NCT06920667', 'category': 'Blood Sugar Support'},
  {'nctId': 'NCT06570330', 'category': None},
  {'nctId': 'NCT05201651', 'category': None},
  {'nctId': 'NCT06628869', 'category': 'Digestive & Gut Health'},
  {'nctId': 'NCT06682715', 'category': None},
  {'nctId': 'NCT05801042', 'category': 'Healthy Aging'},
  {'nctId': 'NCT05769101', 'category': 'Sports Nutrition'},
  {'nctId': 'NCT05178667', 'category': 'Weight Management'},
  {'nctId': 'NCT05480696', 'category': 'Liver & Detox'},
  {

In [None]:
def count_all_categories(study_categories_data):
    """Tally how many times each category appears across all studies.

    Args:
        study_categories_data: dict whose "study_categories" entry is a list of
            per-study records; only dict records whose "categories" value is a
            list contribute to the tally.

    Returns:
        A list of {"category": ..., "count": ...} dicts ordered from most to
        least frequent. Falsy category values (e.g. empty strings) are skipped.
    """
    tally = Counter()

    for entry in study_categories_data.get("study_categories", []):
        # Ignore malformed records instead of failing on them.
        if not isinstance(entry, dict):
            continue

        labels = entry.get("categories", [])
        if isinstance(labels, list):
            tally.update(label for label in labels if label)

    ranked = []
    for label, total in tally.most_common():
        ranked.append({"category": label, "count": total})
    return ranked

# Aggregate the per-study category assignments into ranked category counts.
categories = count_all_categories(study_categories)
categories

[{'category': 'Digestive & Gut Health', 'count': 16},
 {'category': 'Sports Nutrition', 'count': 11},
 {'category': 'Liver & Detox', 'count': 7},
 {'category': 'Stress & Mood', 'count': 7},
 {'category': 'Immune Health', 'count': 6},
 {'category': 'Weight Management', 'count': 6},
 {'category': 'Blood Sugar Support', 'count': 5},
 {'category': 'Healthy Aging', 'count': 5},
 {'category': 'Cognitive Health', 'count': 4},
 {'category': 'Hair, Skin & Nails', 'count': 3},
 {'category': 'Heart Health', 'count': 2},
 {'category': 'Inflammation & Pain', 'count': 2},
 {'category': 'Joint & Mobility', 'count': 2},
 {'category': 'Sleep & Relaxation', 'count': 1},
 {'category': 'Bone Health', 'count': 1},
 {'category': "Women's Wellness", 'count': 1},
 {'category': "Men's Wellness", 'count': 1}]

In [None]:
# System prompt for the ingredient-normalization request: instructs the model
# to map each study's raw intervention names onto the supplied canonical
# ingredients and to answer with a JSON object of the form
# {"study_ingredients": [...]}.
ingredients_system_prompt = """
You are a data-normalization engine.

You will be given an input object with:
- studies_interventions: an array of objects, each containing:
  - nctId
  - intervention_names (array of raw intervention strings)
- canon_ingredients (canonical ingredient list/mapping)

Your task:
- For EACH study in studies_interventions:
  - Map each intervention name to the best matching ingredient from canon_ingredients.
  - Use semantic meaning, not exact string matching.
  - Deduplicate ingredients.
  - Ignore placebo, comparators, maltodextrin, and non-active controls.
  - Ignore intervention names that do not clearly map to a real ingredient.

Return ONLY valid JSON in this exact format:

{
  "study_ingredients": [
    {
      "nctId": "<nctId>",
      "ingredients": []
    }
  ]
}

Rules:
- Do NOT return explanations.
- Do NOT include the original intervention names.
- Do NOT invent ingredients not present in canon_ingredients.
- If a study has no matches, return an empty ingredients array for it.
- Maintain the same order as the input studies.
"""

# Request: map every study's raw intervention names onto CANON_INGREDIENTS.
# The canonical list is sent under "canon_ingredients", the field name the
# system prompt tells the model to read.
user_content = {
    "studies_interventions": trimmed_studies_for_ingredients,
    "canon_ingredients": CANON_INGREDIENTS
}

payload = {
    "model": "gpt-5-mini",
    "response_format": {"type": "json_object"},
    "messages": [
        {"role": "system", "content": ingredients_system_prompt},
        {"role": "user", "content": json.dumps(user_content)}
    ]
}

try:
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        json=payload,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {OPENAI_API_KEY}",
        },
        # Without a timeout a stalled connection would hang the kernel forever.
        timeout=180
    )

    if response.status_code >= 400:
        raise Exception(f"Bad status code: {response.status_code} — {response.text}")

    ai = response.json()

    # Also treats an empty "choices" list as a malformed response.
    if not ai.get("choices"):
        raise Exception("Missing 'choices' field in response")

    content = ai["choices"][0]["message"].get("content")
    if not content:
        raise Exception("Empty content returned from model")

    # Parsed model reply; the prompt asks for {"study_ingredients": [...]}.
    study_ingredients = json.loads(content)

except Exception as e:
    # Report and re-raise: continuing would leave `study_ingredients` undefined
    # (fresh kernel) or stale (earlier run), silently corrupting later cells.
    print(f"Upstream error: {e}")
    raise


In [None]:
# Inspect the parsed model reply (the prompt requests {"study_ingredients": [...]}).
study_ingredients

{'study_ingredients': [{'nctId': 'NCT07193927',
   'ingredients': ['Lactobacillus plantarum']},
  {'nctId': 'NCT05416151', 'ingredients': []},
  {'nctId': 'NCT05906589', 'ingredients': ['Inulin', 'Psyllium']},
  {'nctId': 'NCT06297083', 'ingredients': ['Lactobacillus rhamnosus']},
  {'nctId': 'NCT05135351', 'ingredients': []},
  {'nctId': 'NCT05744700', 'ingredients': []},
  {'nctId': 'NCT07171411', 'ingredients': ['Whey protein', 'Collagen']},
  {'nctId': 'NCT07201909', 'ingredients': []},
  {'nctId': 'NCT06920667',
   'ingredients': ['Creatine', 'Fish oil', 'Vitamin D3']},
  {'nctId': 'NCT06570330', 'ingredients': []},
  {'nctId': 'NCT05201651', 'ingredients': ['Creatine']},
  {'nctId': 'NCT06628869', 'ingredients': []},
  {'nctId': 'NCT06682715', 'ingredients': ['Whey protein']},
  {'nctId': 'NCT05801042', 'ingredients': ['Lactobacillus rhamnosus']},
  {'nctId': 'NCT05769101', 'ingredients': []},
  {'nctId': 'NCT05178667', 'ingredients': []},
  {'nctId': 'NCT05480696',
   'ingredien

In [None]:
def compute_associate_ingredients(study_ingredients, ingredient):
    """Count how often other ingredients appear alongside ``ingredient``.

    Args:
        study_ingredients: dict whose "study_ingredients" entry is a list of
            per-study records, each expected to be a dict with an
            "ingredients" list. A missing or null entry is treated as empty.
        ingredient: the ingredient being profiled; it is left out of the tally.

    Returns:
        A list of {"ingredient": ..., "association_count": ...} dicts ordered
        from most to least frequent. Every occurrence in a study's list is
        counted, so a name repeated within one study counts more than once.
    """
    counter = Counter()

    # `or []` covers both a missing key and an explicit null from the model.
    for study in study_ingredients.get("study_ingredients") or []:
        # Skip malformed records, matching the guards in count_all_categories.
        if not isinstance(study, dict):
            continue

        ing_list = study.get("ingredients", [])
        if not isinstance(ing_list, list):
            continue

        for ing in ing_list:
            # Ignore empty names as well as the profiled ingredient itself.
            if ing and ing != ingredient:
                counter[ing] += 1

    return [
        {"ingredient": ing, "association_count": count}
        for ing, count in counter.most_common()
    ]

# Rank the ingredients that appear next to `ingredient` in the normalized studies.
associated_ingredients = compute_associate_ingredients(study_ingredients, ingredient)
associated_ingredients

[{'ingredient': 'Inulin', 'association_count': 15},
 {'ingredient': 'Creatine', 'association_count': 7},
 {'ingredient': 'Whey protein', 'association_count': 6},
 {'ingredient': 'Fructooligosaccharides', 'association_count': 3},
 {'ingredient': 'Lactobacillus rhamnosus', 'association_count': 2},
 {'ingredient': 'Collagen', 'association_count': 2},
 {'ingredient': 'Resistant Dextrin', 'association_count': 2},
 {'ingredient': 'Spirulina', 'association_count': 2},
 {'ingredient': 'Blueberry', 'association_count': 2},
 {'ingredient': 'Saffron', 'association_count': 2},
 {'ingredient': 'Beta-Alanine', 'association_count': 2},
 {'ingredient': 'Beta-hydroxybutyrate', 'association_count': 2},
 {'ingredient': 'Lactobacillus plantarum', 'association_count': 1},
 {'ingredient': 'Psyllium', 'association_count': 1},
 {'ingredient': 'Fish oil', 'association_count': 1},
 {'ingredient': 'Vitamin D3', 'association_count': 1},
 {'ingredient': 'Kombucha', 'association_count': 1},
 {'ingredient': 'NMN (Ni

In [None]:
# Count studies per start year; records whose start_year is not an int are left out.
year_counter = Counter(
    start_year
    for start_year in (study.get("start_year") for study in filtered_studies)
    if isinstance(start_year, int)
)

# Emit the counts in ascending year order.
study_trial_per_year = []
for year, count in sorted(year_counter.items()):
    study_trial_per_year.append({"year": year, "count": count})

study_trial_per_year

[{'year': 2022, 'count': 27},
 {'year': 2023, 'count': 24},
 {'year': 2024, 'count': 26},
 {'year': 2025, 'count': 24}]

In [None]:
# System prompt for the summary request: instructs the model to synthesize one
# paragraph on how the ingredient is used across the supplied studies and to
# answer with a JSON object of the form {"summary": "..."}.
summary_system_prompt = """ 
You are a scientific study summarization engine.

You will be given:
- ingredient: a single ingredient name (string)
- studies: an array of objects, each containing:
  - nctId
  - title
  - keywords (array, may be empty)
  - conditions (array)
  - interventions (object)

Important Context:
- The provided studies have already been filtered in a prior normalization step to be relevant to the given ingredient.
- The ingredient may NOT be explicitly mentioned in the title, keywords, conditions, or interventions.
- You must still assume the ingredient is the active focus of these studies.

Your task:
- Write ONE concise, well-structured paragraph summarizing how the given ingredient is being utilized across the provided studies.
- Base the summary ONLY on:
  - Study titles
  - Keywords
  - Conditions
  - Interventions (form, delivery, comparator context if implied)
- Infer usage context such as:
  - target condition
  - population
  - mechanism or functional role
  - delivery format (if implied)

Rules:
- Focus ONLY on the given ingredient.
- Do NOT summarize each study separately.
- Synthesize into a single unified paragraph.
- Do NOT mention NCT IDs.
- Do NOT invent mechanisms, claims, or outcomes not supported by the input.
- If keywords are empty, infer only from title, conditions, and interventions.
- Keep the tone factual, neutral, and clinical.
- Do NOT include placebo or comparator effects as benefits.

Return ONLY valid JSON in this exact format:

{
  "summary": "<single paragraph summary>"
}

Do NOT return explanations, markdown, or any text outside the JSON.
"""


# Request: one synthesized paragraph describing how `ingredient` is used
# across the trimmed studies.
user_content = {
    "studies": trimmed_studies_for_summary,
    "ingredient": ingredient
}

payload = {
    "model": "gpt-5-mini",
    "response_format": {"type": "json_object"},
    "messages": [
        {"role": "system", "content": summary_system_prompt},
        {"role": "user", "content": json.dumps(user_content)}
    ]
}

try:
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        json=payload,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {OPENAI_API_KEY}",
        },
        # Without a timeout a stalled connection would hang the kernel forever.
        timeout=180
    )

    if response.status_code >= 400:
        raise Exception(f"Bad status code: {response.status_code} — {response.text}")

    ai = response.json()

    # Also treats an empty "choices" list as a malformed response.
    if not ai.get("choices"):
        raise Exception("Missing 'choices' field in response")

    content = ai["choices"][0]["message"].get("content")
    if not content:
        raise Exception("Empty content returned from model")

    # The prompt asks for {"summary": "..."}; a missing key yields None.
    summary = json.loads(content).get("summary")

except Exception as e:
    # Report and re-raise: continuing would leave `summary` undefined (fresh
    # kernel) or stale (earlier run), silently corrupting the final profile.
    print(f"Upstream error: {e}")
    raise

In [102]:
# Inspect the summary paragraph extracted from the model's JSON reply.
summary

'Across these trials, dextrin—most commonly as maltodextrin but also as resistant/soluble dextrin variants—is used in two principal ways: (1) as an inactive carbohydrate comparator or formulation carrier (placebo, powder, drink, biscuit or tablet excipient) in a wide range of nutrition, probiotic, supplement and drug‑adjunct studies involving healthy adults, older adults, infants/children, athletes and patient groups (e.g., diabetes, obesity, NAFLD, cancer, cystic fibrosis, psychiatric disorders, post‑COVID), and (2) as an active dietary‑fiber ingredient (resistant/soluble dextrins or oligodextrin) studied for prebiotic/fermentation effects on the gut microbiota, short‑chain fatty acids, intestinal fermentation/gas, glycaemic/metabolic responses, and gut–brain or gut–skin axis outcomes. Delivery formats implied by interventions include powders, drinks, biscuits, chewable tablets and capsules, and dextrin is frequently combined with or contrasted against proteins, probiotics, prebiotics

In [None]:
# Assemble the final profile for `ingredient`.
# The categories, associated ingredients, per-year counts and summary are all
# derived from `filtered_studies`, so the study count is taken from that same
# subset rather than from the full `studies` list.
ingredient_profile = {
    "ingredient": ingredient,
    "study_summary": summary,
    "number_of_involved_studies": len(filtered_studies),
    "categories": categories,
    "associated_ingredients": associated_ingredients,
    "study_trial_per_year": study_trial_per_year
}
ingredient_profile

{'ingredient': 'Dextrin',
 'study_summary': 'Across the assembled trials, dextrin is being investigated primarily as an orally administered dietary fiber / prebiotic ingredient—often as a standalone fiber or as part of multi‑ingredient prebiotic blends—with the intent to modulate the gut microbiota and related metabolic, gastrointestinal and neurobehavioral outcomes. Studies use dextrin-type interventions to evaluate effects on GI symptoms and bowel function (including constipation and post‑challenge bloating/gas), on microbiome composition and short‑chain fatty acid production, on intestinal permeability and inflammatory markers, and on metabolic endpoints such as glycemic variability, NAFLD and obesity‑related measures. Trials span healthy volunteers and specific groups (older adults, children, people post‑stem cell transplant, patients with IBS or metabolic disease), and routinely assess safety/tolerability, changes in microbiome/metabolites, and clinical or quality‑of‑life outcomes

In [None]:
output_dir = "../output"

# Create the target directory up front so a fresh checkout does not fail with
# FileNotFoundError; exist_ok makes re-running the cell harmless.
os.makedirs(output_dir, exist_ok=True)

# ensure_ascii=False keeps non-ASCII characters readable in the written JSON.
with open(f"{output_dir}/ingredient_profile.json", "w", encoding="utf-8") as f:
    json.dump(ingredient_profile, f, indent=2, ensure_ascii=False)