In [28]:
import time
import json
import logging
import re
import os
from pathlib import Path
from datetime import datetime
from collections import Counter, defaultdict
from typing import Any, Dict, List, Tuple, Optional, Generator
import requests
import numpy as np
import pandas as pd
from dateutil.parser import parse as parse_date


# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


CONFIGS

In [29]:
BASE_URL = "https://clinicaltrials.gov/api/v2/studies"
PAGE_SIZE = 100
REQUEST_DELAY = 0.35

END_DATE = datetime(2025, 12, 31)
START_DATE = END_DATE - pd.Timedelta(days=4*365)

CANON_INGREDIENTS = ["5-HTP", "Acacia", "Acai", "Adenosine triphosphate", "Adrenal", "Agarikon", "Agmatine", "Akkermansia", "Alfalfa", "Algae", "Algae Oil", "Allicin", "Aloe vera", "Alpha-ketoglutaric acid", "Alpha-lipoic acid", "Amla", "Amylase", "Andrographis", "Anise", "Apigenin", "Apple", "Arabinogalactan", "Argan oil", "Arginine", "Arjuna", "Artichoke", "Ashwagandha", "Asparagine", "Aspartic acid", "Astaxanthin", "Astragalus", "Bacillus coagulans", "Bacillus subtilis", "Bacopa", "Bamboo", "Banaba", "Baobab", "Barberry", "Barley", "Bee propolis", "Beet", "Berberine", "Bergamot", "Beta-Alanine", "Beta-glucan", "Betaine", "Bifidobacterium adolescentis", "Bifidobacterium bifidum", "Bifidobacterium breve", "Bifidobacterium infantis", "Bifidobacterium longum", "Bilberry", "Biotin", "Bitter melon", "Black cherry", "Black cohosh", "Black currant", "Black pepper", "Black radish", "Black walnut", "Bladderwrack", "Blessed thistle", "Blue flag", "Blueberry", "Boldo", "Bone marrow", "Borage Oil", "Boron", "Boswellia", "Brewers yeast", "Broccoli", "Bromelain", "Buchu", "Buckthorn", "Buckwheat", "Bupleurum", "Burdock", "Butcher's broom", "Butterbur", "Butyrate", "Cabbage", "Caffeine", "Calcium", "Camelina Oil", "Camu camu", "Cannabidiol", "Cannabigerol", "Capsicum", "Cardamom", "Carnitine", "Carnosine", "Carrot", "Cascara sagrada", "Cat's claw", "Catalase", "Catuaba", "Cayenne", "Celery", "Cetyl myristoleate", "Chaga", "Chamomile", "Chanca piedra", "Chaste tree", "Chicory", "Chinese Yam", "Chitosan", "Chlorella", "Chloride", "Chlorophyll", "Choline", "Chondroitin", "Chromium", "Chrysin", "Chymotrypsin", "Cinnamon", "Cissus quadrangularis", "Citicoline", "Citrulline", "Citrus bioflavonoids", "Clostridium Butyricum", "Clove", "Cobalt", "Cocoa", "Coconut", "Coenzyme Q10", "Coleus forskohlii", "Collagen", "Colostrum", "Conjugated linoleic acid", "Copper", "Coptis", "Cordyceps", "Coriander", "Corn silk", "Couch grass", "Cramp bark", "Cranberry", "Creatine", "Cumin", "Curcumin", "D-Mannose", "D-Ribose", "Damiana", "Dandelion", "Dehydroepiandrosterone", "Devil's claw", "Dihydromyricetin (DHM)", "Diindolylmethane", "Dimethylglycine", "Diosmin", "Docosahexaenoic acid (DHA)", "Dong quai", "DPA", "Dulse", "Duodenum", "Ecdysterone", "Echinacea", "Ecklonia cava", "Eicosapentaenoic acid (EPA)", "Elderberry", "Elderflower", "Elecampane", "Eleuthero", "Epicatechin", "Epigallocatechin gallate", "Ergothioneine", "Eucalyptus", "Evening primrose oil", "Eyebright", "Fennel", "Fenugreek", "Feverfew", "Fish oil", "Flaxseed Oil", "Flower Pollen Extract", "Fo-ti (He Shou Wu)", "Folate", "Forskolin", "Fructooligosaccharides", "Fucoidan", "Fucoxanthin", "Fulvic acid", "GABA", "Gamma linolenic acid", "Garcinia cambogia", "Garlic", "Gelatin", "Gentian", "Germanium", "Ginger", "Ginkgo", "Ginseng", "Glucomannan", "Glucosamine", "Glutamic acid", "Glutamine", "Glutathione", "Goldenseal", "Gotu kola", "Grape", "Grapefruit", "Gravel root", "Graviola", "Green coffee bean", "Guarana", "Guggul", "Gymnema sylvestre", "Gynostemma", "Halostachine", "Hawthorn", "Hemp", "Hesperidin", "Hibiscus", "Higenamine", "Histidine", "HMB", "Holy Basil", "Honey", "Hoodia", "Hops", "Hordenine", "Horny goat weed", "Horse chestnut", "Horsetail", "Huperzine", "Hyaluronic acid", "Hydroxyproline", "Hyssop", "Immunoglobulins", "Indole-3-carbinol", "Inositol", "Inositol Hexaphosphate", "Inulin", "Iodine", "Iron", "Isatis", "Isoflavones", "Isoleucine", "Jerusalem artichoke", "Jujube", "Juniper", "Kale", "Kanna", "Kava kava", "Kelp", "Keratin", "Kidney", "Kola nut", "Kudzu", "Lactase", "Lactobacillus acidophilus", "Lactobacillus brevis", "Lactobacillus bulgaricus", "Lactobacillus casei", "Lactobacillus paracasei", "Lactobacillus plantarum", "Lactobacillus reuteri", "Lactobacillus rhamnosus", "Lactoferrin", "Lavender", "Lemon", "Lemon balm", "Leucine", "Licorice", "Lignans", "Linoleic acid", "Lion's Mane", "Lipase", "Lithium", "Bovine Spleen", "Bovine Pancreas", "Bovine Kidney", "Bovine Heart", "Bovine Liver", "Bovine Glandular", "Lobelia", "Long pepper", "Lumbrokinase", "Lutein", "Luteolin", "Lycopene", "Lysine", "Maca", "Magnesium Citrate", "Magnesium Glycinate", "Magnesium Malate", "Magnesium Taurinate", "Magnesium Threonate", "Magnesium Oxide", "Magnolia", "Maitake", "Malic acid", "Manganese", "Mangosteen", "Maqui Berry", "Marigold", "Marine Phytoplankton", "Marshmallow", "Matcha", "Medium chain triglycerides", "Melatonin", "Methionine", "Methylsulfonylmethane", "Milk thistle", "Mineral", "Molybdenum", "Monolaurin", "Moringa", "Motherwort", "Mucuna pruriens", "Muira puama", "Mullein", "Mussel", "Mustard", "Myricetin", "Myrrh", "N-Acetyl-Cysteine", "NADH", "Naringin", "Natto", "Nattokinase", "Neem", "Nettle", "Nickel", "NMN (Nicotinamide mononucleotide)", "Noni", "Nopal cactus", "Nucleic acid", "Oat", "Oleic acid", "Olive", "Orange", "Oregano", "Oregon grape", "Ornithine", "Osha", "Ox bile", "Oyster", "Palmitoylethanolamide", "Pancreas", "Pancreatin", "Papaya", "Paprika", "Para-aminobenzoic acid", "Parsley", "Passion flower", "Pau d'arco", "Pea protein", "Pectin", "Pectinase", "Peppermint", "Pepsin", "Perilla Oil", "Phenylalanine", "Phenylethylamine", "Phosphatidylcholine", "Phosphatidylserine", "Phosphorus", "Phytoceramides", "Phytosterols", "Picrorhiza", "Pine", "Pine bark", "Pineapple", "Piperine", "Plantain", "Plum", "Policosanol", "Polyphenol", "Polypodium vulgare", "Pomegranate", "Poria", "Potassium", "Pregnenolone", "Prickly ash", "Prickly pear", "Proanthocyanidins", "Proline", "Protease", "Hydrolyzed Whey Protein", "Prune", "Psyllium", "Pterostilbene", "Pumpkin", "Pumpkin Seed Oil", "Pygeum", "Quercetin", "Raspberry", "Rauwolscine", "Red clover", "Red root", "Red wine", "Red yeast rice", "Rehmannia", "Reishi", "Resveratrol", "Rhodiola", "Rhubarb", "Riboflavin", "Rice bran", "RNA", "Rose hips", "Rosemary", "Royal jelly", "Rubidium", "Rutin", "Saccharomyces boulardii", "Safflower Oil", "Saffron", "Sage", "Sarsaparilla", "Saw palmetto", "Schisandra", "Scute (Scutellaria)", "Sea Buckthorn", "Sea cucumber", "Selenium", "Senna", "Serine", "Serrapeptase", "Sesame", "Shark cartilage", "Shatavari", "Shiitake", "Shilajit", "Silicon", "Slippery elm", "Sodium", "Sophora Japonica", "Soybean", "Spermidine", "Spirulina", "Spleen", "Squalene", "St. John’s Wort", "Star anise", "Stevia", "Stoneroot", "Strawberry", "Streptococcus thermophilus", "Strontium", "Succinic acid", "Sulbutiamine", "Sulforaphane", "Suma", "Sunflower", "Superoxide dismutase", "Sweet cherry", "Synephrine", "Tangerine", "Tart cherry", "Taurine", "Theacrine", "Theanine", "Theobromine", "Threonine", "Thyme", "Thymus", "Thyroid", "Tin", "Tomato", "Tongkat Ali", "Toothed clubmoss", "Tribulus terrestris", "Triphala", "TUDCA (Tauroursodeoxycholic acid)", "Turkey rhubarb", "Turkey tail", "Tyrosine", "Uridine", "Urolithin A", "Uva ursi", "Valerian", "Valine", "Vanadium", "Vegetable glycerin", "Vinpocetine", "Vitamin A", "Vitamin B1", "Vitamin B12", "Vitamin B3", "Vitamin B5", "Vitamin B6", "Vitamin C", "Vitamin D3", "Vitamin E", "Wasabi", "Watercress", "Watermelon", "Wheat", "Whey protein", "White mulberry", "Wild yam", "Willow bark", "Witch hazel", "Wormwood", "Xylitol", "Xylooligosaccharides", "Yarrow", "Yellow dock", "Yellow pea", "Yerba mate", "Yerba santa", "Yohimbe", "Yucca", "Zeaxanthin", "Zinc", "Zucchini", "Guar Gum", "Chia", "Flaxseed", "Chickpea", "Quinoa", "Sweet Potato", "PQQ", "Citrulline Malate", "Alpha-GPC", "Vitamin K2", "Phosphatidic Acid", "Alpinia galanga", "Kiwifruit", "Mango", "Panax notoginseng", "Saccharomyces cerevisiae", "Black turmeric", "Krill Oil", "Aronia berry", "Terminalia bellerica", "Bifidobacterium lactis", "Dextrin", "Beta-Carotene", "Galactooligosaccharides", "Streptococcus salivarius", "Lactobacillus gasseri", "Apple cider vinegar", "Grains of Paradise", "Cellulase", "Ceramides", "Lactobacillus helveticus", "Coffee fruit", "DMAE", "Methylliberine", "Fermented Yeast", "Melon", "Tapioca", "Beta-hydroxybutyrate", "Honokiol", "Glycerin", "Kombucha", "Nicotinamide riboside", "Green Lipped Mussel", "Black Seed Oil", "Nitrate salts", "Red Spinach", "Rosa roxburghii", "Green Tea", "Alanine", "Erythritol", "Theobroma cacao", "Glycine", "Casein peptides", "7-Keto DHEA"]

SUPPLEMENT_TERMS = [
    "dietary supplement",
    "food supplement", 
    "nutraceutical",
    "vitamin",
    "mineral supplement",
    "herbal supplement",
    "probiotic",
    "prebiotic",
    "omega-3",
    "fish oil",
    "amino acid supplement"
]

RETURN_FIELDS = [
    "NCTId",
    "BriefTitle",
    "StartDate",
    "Condition",
    "Phase",
    "StudyType",
    "InterventionName",
    "InterventionType",
    "Keyword",
    "EnrollmentCount",
    "LeadSponsorName"
]

CUSTOM_KEYWORDS = [
    "Weight Management OR fat loss OR appetite control OR thermogenesis OR metabolic rate",
    "Digestive & Gut Health OR microbiome OR probiotics OR prebiotics OR gut barrier OR IBS",
    "Energy OR ATP production OR fatigue OR mitochondrial support OR stamina",
    "Sports Nutrition OR muscle recovery OR endurance OR performance OR hypertrophy",
    "Joint & Mobility OR cartilage OR osteoarthritis OR inflammation OR flexibility",
    "Cognitive Health OR memory OR focus OR neuroprotection OR neurotransmitters",
    "Immune Health OR innate immunity OR adaptive immunity OR viral defense OR inflammation control",
    "Hair Skin & Nails OR collagen synthesis OR keratin OR elasticity OR skin aging",
    "Men’s Wellness OR testosterone OR prostate health OR fertility OR muscle mass",
    "Women’s Wellness OR hormonal balance OR fertility OR menopause OR PCOS",
    "Prenatal & Postnatal OR fetal development OR maternal nutrition OR lactation OR postpartum recovery",
    "Liver & Detox OR hepatic function OR detoxification enzymes OR fatty liver OR oxidative stress",
    "Sleep & Relaxation OR circadian rhythm OR melatonin OR sleep latency OR sleep quality",
    "Stress & Mood OR cortisol OR anxiety OR depression OR serotonergic function",
    "Bone Health OR bone mineral density OR osteoblast activity OR calcium metabolism",
    "Heart Health OR cardiovascular disease OR lipid profile OR blood pressure OR endothelial function",
    "Blood Sugar Support OR glucose control OR insulin sensitivity OR HbA1c OR metabolic syndrome",
    "Healthy Aging OR longevity OR cellular senescence OR resilience OR frailty",
    "Vision & Eye Health OR retinal health OR macular degeneration OR visual acuity",
    "Inflammation & Pain OR chronic inflammation OR cytokines OR nociception OR pain modulation"
]




CLINICAL TRIAL API

In [30]:
class ClinicalTrialsClient:
    """
    Client for interacting with ClinicalTrials.gov API v2.
    Handles pagination, rate limiting, and error recovery.
    """
    
    def __init__(self, base_url: str, page_size: int = 100, delay: float = 0.35):
        self.base_url = base_url
        self.page_size = page_size
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update({
            "Accept": "application/json",
            "User-Agent": "ClinicalTrialsResearch/1.0"
        })
    
    def search(
        self,
        query_term: Optional[str] = None,
        query_cond: Optional[str] = None,
        query_intr: Optional[str] = None,
        query_outc: Optional[str] = None,
        filter_advanced: Optional[str] = None,
        fields: Optional[list] = None
    ) -> Generator[Dict[str, Any], None, None]:

        params = {
            "pageSize": self.page_size,
            "format": "json"
        }

        if query_term:
            params["query.term"] = query_term
        if query_cond:
            params["query.cond"] = query_cond
        if query_intr:
            params["query.intr"] = query_intr
        if query_outc:
            params["query.outc"] = query_outc
        if filter_advanced:
            params["filter.advanced"] = filter_advanced
        if fields:
            params["fields"] = fields
        
        page_token = None
        total_fetched = 0

        while True:
            if page_token:
                params["pageToken"] = page_token
            else:
                params.pop("pageToken", None)

            response = self._make_request(params)

            if response is None:
                break

            studies = response.get("studies", [])

            for study in studies:
                yield study
                total_fetched += 1

            page_token = response.get("nextPageToken")

            if not page_token:
                logger.info(f"Fetch complete: {total_fetched} studies retrieved")
                break

            logger.info(f"Progress: {total_fetched}")
            time.sleep(self.delay)

    def _make_request(
        self, 
        params: Dict[str, Any], 
        max_retries: int = 3
    ) -> Optional[Dict[str, Any]]:

        for attempt in range(max_retries):
            try:
                response = self.session.get(
                    self.base_url, 
                    params=params, 
                    timeout=30
                )
                response.raise_for_status()
                return response.json()
                
            except requests.exceptions.HTTPError as e:
                if response.status_code == 429:
                    wait_time = 2 ** attempt * 5
                    logger.warning(f"Rate limited. Waiting {wait_time}s...")
                    time.sleep(wait_time)
                elif response.status_code >= 500:
                    logger.warning(f"Server error {response.status_code}. Retry {attempt+1}")
                    time.sleep(2 ** attempt)
                else:
                    logger.error(f"HTTP error: {e}")
                    raise
                    
            except requests.exceptions.RequestException as e:
                logger.warning(f"Request failed: {e}. Retry {attempt+1}")
                time.sleep(2 ** attempt)
        
        logger.error("Max retries exceeded")
        return None


DATA CLEANUP

In [31]:
def extract_nested_field(record: Dict, path: str) -> Any:
    keys = path.split(".")
    value = record
    
    for key in keys:
        if isinstance(value, dict):
            value = value.get(key)
        else:
            return None
        if value is None:
            return None
    
    return value


def flatten_study_record(study: Dict) -> Dict[str, Any]:
    protocol = study.get("protocolSection", {})
    
    interventions_raw = extract_nested_field(
        study, 
        "protocolSection.armsInterventionsModule.interventions"
    ) or []
    
    intervention_names = []
    intervention_types = []
    for intr in interventions_raw:
        if isinstance(intr, dict):
            if intr.get("name"):
                intervention_names.append(intr["name"])
            if intr.get("type"):
                intervention_types.append(intr["type"])
    
    start_date_raw = extract_nested_field(
        study,
        "protocolSection.statusModule.startDateStruct.date"
    )
    start_date = None
    start_year = None
    if start_date_raw:
        try:
            start_date = parse_date(start_date_raw)
            start_year = start_date.year
        except (ValueError, TypeError):
            pass
    
    phases = extract_nested_field(study, "protocolSection.designModule.phases") or []
    phase_str = "; ".join(phases) if isinstance(phases, list) else str(phases)
    
    return {
        "nct_id": extract_nested_field(study, "protocolSection.identificationModule.nctId"),
        "title": extract_nested_field(study, "protocolSection.identificationModule.briefTitle"),
        "status": extract_nested_field(study, "protocolSection.statusModule.overallStatus"),
        "start_date": start_date,
        "start_year": start_year,
        "phase": phase_str,
        "study_type": extract_nested_field(study, "protocolSection.designModule.studyType"),
        "conditions": extract_nested_field(study, "protocolSection.conditionsModule.conditions") or [],
        "keywords": extract_nested_field(study, "protocolSection.conditionsModule.keywords") or [],
        "intervention_names": intervention_names,
        "intervention_types": intervention_types,
        "enrollment": extract_nested_field(study, "protocolSection.designModule.enrollmentInfo.count"),
        "sponsor": extract_nested_field(study, "protocolSection.sponsorCollaboratorsModule.leadSponsor.name")
    }


def build_dataframe(studies: List[Dict]) -> pd.DataFrame:
    flattened = [flatten_study_record(s) for s in studies]
    df = pd.DataFrame(flattened)
    
    if "start_date" in df.columns:
        df["start_date"] = pd.to_datetime(df["start_date"], errors="coerce")
    
    return df


def explode_list_column(
    df: pd.DataFrame, 
    column: str, 
    new_column: Optional[str] = None
) -> pd.DataFrame:
    if new_column is None:
        new_column = f"{column}_item"
    
    result = df.copy()
    result = result.explode(column)
    result = result.rename(columns={column: new_column})
    result = result.dropna(subset=[new_column])
    
    return result


def normalize_ingredient_name(name: str) -> str:
    if not isinstance(name, str):
        return ""
    
    normalized = name.lower().strip()
    normalized = re.sub(r"\d+\s*(mg|mcg|iu|g|ml|μg|units?)\b", "", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\s+(supplement|capsule|tablet|powder|extract|oil)s?\b", "", normalized)
    normalized = re.sub(r"\s+", " ", normalized).strip()

    SYNONYM_MAP = {
        "\"apigenin\" and \"glycyrrhizin\"": "Apigenin",
        "(R)-3-hydroxybutyl (R)-3-hydroxybutyrate ketone monoester": "Beta-hydroxybutyrate",
        "(R)-3-hydroxybutyl (R)-3-hydroxybutyrate)": "Beta-hydroxybutyrate",
        "0.9% NaCl physiological saline": "Sodium",
        "0.9%Nacl": "Sodium",
        "1 dose of Beetroot juice intake": "Beet",
        "100 % Maltodextrin": "Dextrin",
        "140 ml per day of Beet-It nitrate beverage (James White Drinks Ltd., Ipswich, UK)": "Nitrate salts",
        "200mg GCE +200mg ALA": "Alpha-lipoic acid",
        "21st Century Gelatin Capsules": "Gelatin",
        "25 hydroxy cholecalciferol (Vitamin D) /25(OH)D3.": "Vitamin D3",
        "25(OH)D3": "Vitamin D3",
        "25-OH Vitamin D": "Vitamin D3",
        "3 mg/kg of caffeine": "Caffeine",
        "3-hydroxybutyrat": "Beta-hydroxybutyrate",
        "3-hydroxybutyrate": "Beta-hydroxybutyrate",
        "315 mg Alpha-GPC": "Alpha-GPC",
        "36 mg/day astaxanthin intervention": "Astaxanthin",
        "4000 IU of Vitamin D3": "Vitamin D3",
        "4000IU Vitamin D3": "Vitamin D3",
        "4g of taurine daily": "Taurine",
        "500mg seaweed": "Algae",
        "A combination of the vegetable bouillon, the microalgae and the beta-Glucan": "Beta-glucan",
        "Acacia Gum": "Acacia",
        "Acetate (Apple Cider Vinegar)": "Apple cider vinegar",
        "Acetyl cysteine": "N-Acetyl-Cysteine",
        "Acetyl-L-Carnitin, L Arginine, Co-Q10": "Carnitine",
        "Akkermansia muciniphila": "Akkermansia",
        "Alanine": "Alanine",
        "Algae oil 1": "Algae Oil",
        "Algae oil 2": "Algae Oil",
        "Allium sativum Extract": "Garlic",
        "Alpha lipoic acid (ALA)": "Alpha-lipoic acid",
        "Alpha-GPC, Creatine and Ashwagandha (Sensoril®)": "Alpha-GPC",
        "Alpha-Glycerophosphocholine（α-GPC）": "Alpha-GPC",
        "Alpha-ketoglutarate": "Alpha-ketoglutaric acid",
        "Alpha-ketoglutarate supplied as 1000mg capsule": "Alpha-ketoglutaric acid",
        "Alpha-lipoic acid": "Alpha-lipoic acid",
        "AMPK Charge+® with semaglutide": "Semaglutide",
        "ANKASCIN 568-P Red yeast rice capsules": "Red yeast rice",
        "Apple cider vinegar": "Apple cider vinegar",
        "Apple Polyphenols": "Polyphenol",
        "Apigenin": "Apigenin",
        "Arginine and Glutamine Oral Suspension": "Arginine",
        "Ashwagandha": "Ashwagandha",
        "Ashwagandha 300 mg standardized root extract": "Ashwagandha",
        "Ascorbic Acid (Vitamin C)": "Vitamin C",
        "Ascorbic Acid 500Mg Tab": "Vitamin C",
        "Ascorbic Acid 500Mg/Ml Inj": "Vitamin C",
        "Ascorbic acid": "Vitamin C",
        "Ascorbic acid (Vitamin C)": "Vitamin C",
        "Astaxanthin": "Astaxanthin",
        "Astaxanthin (12 mg/day) intervention": "Astaxanthin",
        "Astaxanthin (AST trial)": "Astaxanthin",
        "Astaxanthin 6 mg daily": "Astaxanthin",
        "Astaxanthin Oral Capsule": "Astaxanthin",
        "AstraGin (Ginseng and Astragalus Saponin Extract)": "Ginseng",
        "Astragalus": "Astragalus",
        "Astragalus extract": "Astragalus",
        "Bacillus Coagulans": "Bacillus coagulans",
        "Bacillus Subtilis": "Bacillus subtilis",
        "Bacillus coagulans": "Bacillus coagulans",
        "Bacillus subtilis ATCC 122264": "Bacillus subtilis",
        "Bacillus subtilis Capsules": "Bacillus subtilis",
        "Bacillus subtilis enteric-coated capsules": "Bacillus subtilis",
        "Bacopa monnieri": "Bacopa",
        "Beet": "Beet",
        "Beet Juice": "Beet",
        "Beet Root": "Beet",
        "Beet-root juice": "Beet",
        "Beetroot Juice": "Beet",
        "Beetroot Juice (BJ)": "Beet",
        "Beetroot Juice (Beet-It Stamina Shot) & Supervised Exercise Training": "Beet",
        "Beetroot Juice - Active": "Beet",
        "Beetroot Juice supplement": "Beet",
        "Beetroot drink": "Beet",
        "Beetroot extract": "Beet",
        "Beetroot juice": "Beet",
        "Beetroot juice (James White, UK)": "Beet",
        "Beetroot juice combined with caffeine (COM)": "Beet",
        "Beetroot juice concentrate (140 ml)": "Beet",
        "Beetroot juice plus vitamin C tablet": "Beet",
        "Beetroot juice rich in nitrate": "Beet",
        "Beetroot juice supplementation or placebo": "Beet",
        "Beetroot juice with caffeine placebo (BRJ)": "Beet",
        "Beetroot juice with nitrate extracted": "Beet",
        "Berberine": "Berberine",
        "Beta Alanine high dose": "Beta-Alanine",
        "Beta Alanine low dose": "Beta-Alanine",
        "Beta Glucan": "Beta-glucan",
        "Beta-Hydroxybutyrate": "Beta-hydroxybutyrate",
        "Beta-alanine": "Beta-Alanine",
        "Beta-glucan": "Beta-glucan",
        "Beta-glucan supplement group": "Beta-glucan",
        "Beta-hydroxybutyrate": "Beta-hydroxybutyrate",
        "Betaine": "Betaine",
        "Biotin supplement": "Biotin",
        "Black Seed Oil": "Black Seed Oil",
        "Blueberry": "Blueberry",
        "Blueberry Supplementation": "Blueberry",
        "Blueberry drink": "Blueberry",
        "Blueberry gel": "Blueberry",
        "Blueberry powder": "Blueberry",
        "Blueberry powder group": "Blueberry",
        "Blueberry powder supplement": "Blueberry",
        "Broccoli Sprout/Broccoli Seed Extract Supplement": "Broccoli",
        "Broccoli seed extract": "Broccoli",
        "Broccoli seed extract with mustard seed powder": "Broccoli",
        "Broccoli sprouts extract supplementation": "Broccoli",
        "Bromelain": "Bromelain",
        "Buckwheat": "Buckwheat",
        "Butyrate": "Butyrate",
        "Caffeine": "Caffeine",
        "Caffeine (200 mg) + Placebo (300 mg)": "Caffeine",
        "Caffeine 0 mg": "Caffeine",
        "Caffeine 2 mg": "Caffeine",
        "Caffeine 3 mg/kg Oral Powder": "Caffeine",
        "Caffeine 4 mg": "Caffeine",
        "Caffeine 6 mg/kg Oral Powder": "Caffeine",
        "Caffeine 6mg/kg": "Caffeine",
        "Caffeine 75mg": "Caffeine",
        "Caffeine 9 mg/kg Oral Powder": "Caffeine",
        "Caffeine Gum": "Caffeine",
        "Caffeine Gum 3mg/kg": "Caffeine",
        "Caffeine Supplement": "Caffeine",
        "Caffeine supplementation": "Caffeine",
        "Calcium": "Calcium",
        "Calcium Carbonate": "Calcium",
        "Calcium carbonate": "Calcium",
        "Camu Camu Capsules (Camu Camu powder encapsulated (500mg each) + ICI": "Camu camu",
        "Cannabidiol": "Cannabidiol",
        "Cannabidiol (CBD)": "Cannabidiol",
        "Cannabidiol (CBD) Broad-Spectrum Oil": "Cannabidiol",
        "Cannabidiol (CBD) Extract": "Cannabidiol",
        "Cannabidiol (CBD) powder formulation": "Cannabidiol",
        "Cannabidiol (CBD) supplementation": "Cannabidiol",
        "Cannabidiol (CBD)-Rich Broad Spectrum Hemp Extract Oil": "Cannabidiol",
        "Cannabidiol Capsules": "Cannabidiol",
        "Carnitine": "Carnitine",
        "Carnosine": "Carnosine",
        "Carnosine supplementation": "Carnosine",
        "Casein": "Casein peptides",
        "Chaga Mushroom": "Chaga",
        "Chamomile": "Chamomile",
        "Chamomile (Matricaria recutita)": "Chamomile",
        "Chamomile Extract Capsule": "Chamomile",
        "Chamomile Tea": "Chamomile",
        "Chia seed": "Chia",
        "Chickpea Pasta": "Chickpea",
        "Chlorella pyrenoidosa": "Chlorella",
        "Chlorella supplementation": "Chlorella",
        "Cholecalciferol": "Vitamin D3",
        "Choline": "Choline",
        "Cinnamon Capsules": "Cinnamon",
        "Citicoline": "Citicoline",
        "Clostridium butyricum": "Clostridium Butyricum",
        "Clostridum Butyricum Capsule": "Clostridium Butyricum",
        "Co-Enzyme Q10": "Coenzyme Q10",
        "CoQ10": "Coenzyme Q10",
        "CoQ10 1200 mg orally with Glutathione 1000 mg orally": "Coenzyme Q10",
        "CoQ10 supplementation": "Coenzyme Q10",
        "Cocoa": "Cocoa",
        "Cocoa Powder": "Cocoa",
        "Cocoa flavonoids": "Cocoa",
        "Coconut": "Coconut",
        "Coconut Oil": "Coconut",
        "Coenzyme Q 10": "Coenzyme Q10",
        "Coenzyme Q10": "Coenzyme Q10",
        "Coenzyme Q10 100 MG Oral Tablet": "Coenzyme Q10",
        "Coenzyme Q10 100 Milligrams Oral Capsule": "Coenzyme Q10",
        "Collagen": "Collagen",
        "Collagen + vitamin C": "Collagen",
        "Collagen Drinks": "Collagen",
        "Collagen Hydrolysate Supplement": "Collagen",
        "Collagen Peptide": "Collagen",
        "Collagen Peptides": "Collagen",
        "Collagen Supplement": "Collagen",
        "Collagen and Vitamin C": "Collagen",
        "Collagen hydrolysate": "Collagen",
        "Collagen hydrolyzed peptides": "Collagen",
        "Collagen peptide supplement": "Collagen",
        "Collagen protein": "Collagen",
        "Collagen sachet": "Collagen",
        "Coriander Seed Oil - Dose 1": "Coriander",
        "Coriander Seed Oil - Dose 2": "Coriander",
        "Cranberry": "Cranberry",
        "Cranberry Juice": "Cranberry",
        "Cranberry Juice A": "Cranberry",
        "Cranberry Juice B": "Cranberry",
        "Cranberry extract": "Cranberry",
        "Cranberry juice": "Cranberry",
        "Cranberry powder": "Cranberry",
        "Creatine": "Creatine",
        "Creatine Bolus 3": "Creatine",
        "Creatine Bolus 5": "Creatine",
        "Creatine Group": "Creatine",
        "Creatine Intermittent 5": "Creatine",
        "Creatine Monohydrate": "Creatine",
        "Creatine Monohydrate Supplementation": "Creatine",
        "Creatine Supplementation": "Creatine",
        "Creatine Whey Protein": "Creatine",
        "Creatine monohydrate": "Creatine",
        "Creatine supplementation": "Creatine",
        "Cumin Seed (Cuminum Cyminum)": "Cumin",
        "Curcumin": "Curcumin",
        "Curcumin (Longvida™)": "Curcumin",
        "Curcumin + Piperine": "Curcumin",
        "Curcumin Gummies": "Curcumin",
        "Curcumin Supplementation": "Curcumin",
        "Curcumin and virgin coconut oil extract (KurCo Smart)": "Curcumin",
        "Curcumin capsules 1 gm": "Curcumin",
        "Curcumin plus Piperine": "Curcumin",
        "Curcumin therapy": "Curcumin",
        "Curcumin, Omega-3 and Vitamin-D (COD)": "Curcumin",
        "Curcumin, vitamin d and green tea extract": "Curcumin",
        "Curcumin-Berberine (coptis)": "Curcumin",
        "Curcumin/ Demethoxycurcumin/Bisdemethoxycurcumin-containing Supplement": "Curcumin",
        "Curcumin/Boswellia Serrata/Ascorbic acid mixture": "Curcumin",
        "D-Mannose": "D-Mannose",
        "DHA": "Docosahexaenoic acid (DHA)",
        "DHA and EPA": "Docosahexaenoic acid (DHA)",  # (EPA also present in canon)
        "Dehydroepiandrosterone (DHEA)": "Dehydroepiandrosterone",
        "Dihydroberberine（DHB）400 mg": "Berberine",
        "Docosahexaenoic Acid": "Docosahexaenoic acid (DHA)",
        "Dried plums": "Prune",
        "EPA": "Eicosapentaenoic acid (EPA)",
        "EPA supplementation": "Eicosapentaenoic acid (EPA)",
        "Elderberry": "Elderberry",
        "Egg White Powder": "Egg",
        "Eggs": "Egg",
        "Epicatechin extract": "Epicatechin",
        "Epigallocatechin Gallate": "Epigallocatechin gallate",
        "Epigallocatechin-3-Gallate (EGCG)": "Epigallocatechin gallate",
        "Erythritol": "Erythritol",
        "Fenugreek (Trigonella foenum graecum)": "Fenugreek",
        "Fenugreek Seed Powder": "Fenugreek",
        "Fenugreek Seeds and Indian Rennet": "Fenugreek",
        "Fenugreek seed": "Fenugreek",
        "FeSO4": "Iron",
        "FeSO4 + Lf": "Iron",
        "FeSO4 + OTf": "Iron",
        "Ferrous bisglycinate": "Iron",
        "Ferrous fumarate": "Iron",
        "Ferrous sulfate": "Iron",
        "Ferrous sulphate 200mg oral tablet providing 65 mg of elemental iron": "Iron",
        "Fisetin": "Fisetin",
        "Fish Oil": "Fish oil",
        "Fish Oil Concentrate, 1000 Mg Oral Capsule": "Fish oil",
        "Fish oil": "Fish oil",
        "Fish oil (control)": "Fish oil",
        "Fish oil + Roasted wheat flour": "Fish oil",
        "Fish oil + Vegetable and fruit extracts": "Fish oil",
        "Fish oil + wild orange essential oil supplement": "Fish oil",
        "Fish oil enteral supplementation": "Fish oil",
        "Fish oil intervention": "Fish oil",
        "Fish oil oral supplementation": "Fish oil",
        "Fish oil supplement": "Fish oil",
        "Fish oil supplement 1": "Fish oil",
        "Fish oil supplement 2": "Fish oil",
        "Flaxseed": "Flaxseed",
        "Flaxseed oil capsule": "Flaxseed Oil",
        "Folate": "Folate",
        "Folic Acid": "Folate",
        "Folic acid": "Folate",
        "Folic acid and vitamin B12 fortified flour": "Folate",
        "Folic acid supplement": "Folate",
        "Folinic acid (oral); Cyanocobalamin sublingual": "Folate",
        "Fucoidan": "Fucoidan",
        "Galactooligosaccharides (GOS)": "Galactooligosaccharides",
        "Ginger": "Ginger",
        "Ginseng extract": "Ginseng",
        "Glucosamine/Chondroitin": "Glucosamine",
        "Glucose": "Glucose",
        "Glutamine": "Glutamine",
        "Glycine": "Glycine",
        "Grape Seed Extract": "Grape",
        "Grape extract (VinteraTM Premium Red Grape)": "Grape",
        "Grape powder supplementation": "Grape",
        "Grape seed extract": "Grape",
        "Green Tea": "Green Tea",
        "Green Tea Extract": "Green Tea",
        "Green tea": "Green Tea",
        "Hawthorn supplement": "Hawthorn",
        "HMB": "HMB",
        "HMB + Vitamin D3": "HMB",
        "HMB supplementation": "HMB",
        "Honey": "Honey",
        "Honokiol": "Honokiol",
        "Hyaluronic Acid (HA)": "Hyaluronic acid",
        "Hyaluronic acid 50 mg/capsule": "Hyaluronic acid",
        "Hyaluronic acid 75 mg/capsule": "Hyaluronic acid",
        "Hyaluronic acid, HA": "Hyaluronic acid",
        "Inulin": "Inulin",
        "Iron": "Iron",
        "Isoflavones": "Isoflavones",
        "Isolated Whey Protein": "Whey protein",
        "Kefir": "Kefir",
        "Kefir peptide": "Kefir",
        "Krill Oil": "Krill Oil",
        "Krill Oil 500 MG": "Krill Oil",
        "Krill oil": "Krill Oil",
        "Krill oil supplementation": "Krill Oil",
        "L-Arginine Powder": "Arginine",
        "L-Carnitine": "Carnitine",
        "L-Carnitine 4g": "Carnitine",
        "L-Citrulline": "Citrulline",
        "L-carnitine": "Carnitine",
        "L-citrulline": "Citrulline",
        "L-leucine": "Leucine",
        "L-theanine": "Theanine",
        "Lysine Group - Lysine-Fortified Bread": "Lysine",
        "Luteolin": "Luteolin",
        "Lycopene": "Lycopene",
        "Maca": "Maca",
        "Mango": "Mango",
        "Melatonin": "Melatonin",
        "Melatonin 0.5 mg": "Melatonin",
        "Melatonin 10 MG": "Melatonin",
        "Melatonin 10 MG Oral Tablet": "Melatonin",
        "Melatonin 3 MG Oral Tablet": "Melatonin",
        "Melatonin 3 mg": "Melatonin",
        "Melatonin 5 mg": "Melatonin",
        "Melatonin 6 mg": "Melatonin",
        "Melatonin intervention": "Melatonin",
        "Melatonin supplement intervention": "Melatonin",
        "Melatonin supplementation": "Melatonin",
        "Melatonin tablet 3 mg once daily": "Melatonin",
        "Melatonin treatment": "Melatonin",
        "Methylcobalamin": "Vitamin B12",
        "Metformin": "Metformin",
        "Metformin (1000 mg Twice a day)": "Metformin",
        "Metformin (Standard Treatment for Type 2 Diabetes)": "Metformin",
        "Metformin 500 mg/day": "Metformin",
        "Metformin Hydrochloride (HCL)": "Metformin",
        "Metformin Hydrochloride 500Mg Tablet": "Metformin",
        "Metformin Monotherapy": "Metformin",
        "Moringa Oleifera": "Moringa",
        "Moringa Oleifera Leaf Micronized Powders in Capsule": "Moringa",
        "Moringa leaf powder": "Moringa",
        "Moringa oleifera": "Moringa",
        "Moringa Oleifera Leaf Micronized Powders in Capsule": "Moringa",
        "Moringa leaf powder": "Moringa",
        "Moringa oleifera": "Moringa",
        "N Acetyl Cysteine": "N-Acetyl-Cysteine",
        "N Acetyl L Cysteine": "N-Acetyl-Cysteine",
        "N-Acetyl cysteine": "N-Acetyl-Cysteine",
        "N-Acetylcysteine": "N-Acetyl-Cysteine",
        "N-acetylcysteine": "N-Acetyl-Cysteine",
        "N-acetylcysteine (NAC)": "N-Acetyl-Cysteine",
        "NAC": "N-Acetyl-Cysteine",
        "NAC (N-acetyl cysteine), Alpha lipoic acid (ALA), liposomal glutathione (GSH)": "N-Acetyl-Cysteine",
        "NMN": "NMN (Nicotinamide mononucleotide)",
        "NMN capsule": "NMN (Nicotinamide mononucleotide)",
        "NMN intervention": "NMN (Nicotinamide mononucleotide)",
        "Nicotinamide": "Nicotinamide",
        "Nicotinamide Mononucleotide": "NMN (Nicotinamide mononucleotide)",
        "Nicotinamide Mononucleotide (NMN)": "NMN (Nicotinamide mononucleotide)",
        "Nicotinamide Riboside": "Nicotinamide riboside",
        "Nicotinamide Riboside (NR)": "Nicotinamide riboside",
        "Nicotinamide Riboside 1g (oral)": "Nicotinamide riboside",
        "Nicotinamide Riboside Chloride": "Nicotinamide riboside",
        "Nicotinamide riboside": "Nicotinamide riboside",
        "Nigella Sativa Oil capsule": "Black Seed Oil",
        "Nigella sativa extract (Nisatol®)": "Black Seed Oil",
        "Nitrate": "Nitrate salts",
        "Nitrate group": "Nitrate salts",
        "Nitrate-Rich Beetroot Juice": "Nitrate salts",
        "Nitrate-rich beetroot Juice": "Nitrate salts",
        "Nitrate-rich beetroot juice": "Nitrate salts",
        "Nitrates and Citrulline Malate": "Citrulline Malate",
        "Niacin": "Vitamin B3",
        "Omega-3": "Omega-3",
        "Omega-3 (EPA+DHA)": "Omega-3",
        "Omega-3 Fatty ACids": "Omega-3",
        "Omega-3 Fatty Acids": "Omega-3",
        "Omega-3 Fatty Acids (EPA plus DHA)": "Omega-3",
        "Omega-3 Polyunsaturated Fatty Acid.": "Omega-3",
        "Omega-3 Suplementation": "Omega-3",
        "Omega-3 Supplementation": "Omega-3",
        "Omega-3 fatty acid": "Omega-3",
        "Omega-3 polyunsaturated fatty acid": "Omega-3",
        "Omega-3 supplementation": "Omega-3",
        "OmegaBoost": "Omega-3",
        "Opuntia ficus-indica Supplementation": "Nopal cactus",
        "Oral Elemental Iron (Ferrous Sulfate) 80 mg/day": "Iron",
        "Pectin": "Pectin",
        "Peppermint Oil": "Peppermint",
        "Peppermint oil": "Peppermint",
        "Pea protein": "Pea protein",
        "Peanuts": "Peanut",
        "Potassium chloride supplement": "Potassium",
        "Potassium nitrate and inulin": "Inulin",
        "Prune": "Prune",
        "Prunes": "Prune",
        "Psyllium": "Psyllium",
        "Psyllium husk": "Psyllium",
        "Psyllium powder": "Psyllium",
        "Pterostilbene-silybin-nicotinamide riboside": "Nicotinamide riboside",
        "Pumpkin Seed Oil \" Ronkin®, KMT PHARMA, Egypt. \"": "Pumpkin Seed Oil",
        "Pectin": "Pectin",
        "Quercetin": "Quercetin",
        "Quercetin (dietary supplement)": "Quercetin",
        "Quercetin 1.250 mg (oral)": "Quercetin",
        "Raspberry": "Raspberry",
        "Red chili peppers": "Capsicum",
        "Red yeast rice": "Red yeast rice",
        "Reishi": "Reishi",
        "Resistant Dextrin": "Dextrin",
        "Resistant dextrin": "Dextrin",
        "Resistant Potato Starch": "Starch",
        "Resistant Potato Starch, Corn Starch": "Starch",
        "Resistant Starch": "Starch",
        "Resistant potato starch": "Starch",
        "Resistant starch": "Starch",
        "Resveratrol": "Resveratrol",
        "Rice Bran": "Rice bran",
        "Riboflavin": "Vitamin B2",
        "Rose Apple Extract Drink": "Rose",
        "Royal Jelly": "Royal jelly",
        "Safflower Oil": "Safflower Oil",
        "Safflower oil": "Safflower Oil",
        "Selenium": "Selenium",
        "Selenium nutritional supplementation": "Selenium",
        "Semaglutide": "Semaglutide",
        "Senna Tab": "Senna",
        "Shatavari": "Shatavari",
        "Spirulina": "Spirulina",
        "Spirulina Arthrospira platensis (microalgae)": "Spirulina",
        "Spirulina maxima supplementation": "Spirulina",
        "Stevia": "Stevia",
        "Starch": "Starch",
        "Strawberry": "Strawberry",
        "Sulforaphane": "Sulforaphane",
        "Superoxide dismutase": "Superoxide dismutase",
        "Sweet cherries, 280 g daily for 42 days": "Sweet cherry",
        "Synephrine": "Synephrine",
        "Starch (placebo)": "Starch",
        "Starch Only": "Starch",
        "Strawberry Intervention": "Strawberry",
        "Strawberry lyophilisate": "Strawberry",
        "Sucralose": "Sucralose",
        "Sulforaphane (Avmacol Extra Strength)": "Sulforaphane",
        "Sunflower Oil": "Sunflower",
        "Sunflower oil": "Sunflower",
        "Superoxide Dismutase": "Superoxide dismutase",
        "Synbiotic": "Synbiotic",
        "Synbiotic Supplement": "Synbiotic",
        "Synbiotic supplement": "Synbiotic",
        "Synbiotics": "Synbiotic",
        "Tart Cherry": "Tart cherry",
        "Taurine": "Taurine",
        "Taurine (Tau)": "Taurine",
        "Taurine and Exercise (Tau+Ex)": "Taurine",
        "Taurine supplementation": "Taurine",
        "TeTrimTeas welsh Herbal tea with senna": "Senna",
        "Thiamine": "Vitamin B1",
        "Trimethylglycine": "Trimethylglycine",
        "Urolithin A": "Urolithin A",
        "Urolithin A (Mitopure)": "Urolithin A",
        "Ursodeoxycholic Acid 250 Mg Oral Capsule": "Ursodeoxycholic acid",
        "Vitamin A": "Vitamin A",
        "Vitamin B12": "Vitamin B12",
        "Vitamin B3 100 MG Oral Tablet": "Vitamin B3",
        "Vitamin B6": "Vitamin B6",
        "Vitamin C": "Vitamin C",
        "Vitamin D": "Vitamin D3",
        "Vitamin D (Cholecalciferol )": "Vitamin D3",
        "Vitamin D3": "Vitamin D3",
        "Vitamin E": "Vitamin E",
        "Vitamin K2": "Vitamin K2",
        "Whey protein": "Whey protein",
        "Whey protein isolate": "Whey protein",
        "Wild Blueberry": "Blueberry",
        "Wild blueberry": "Blueberry",
        "Wild blueberry powder": "Blueberry",
        "Yohimbine": "Yohimbe",
        "Zinc": "Zinc",
        "cow's milk kefir": "Kefir",
        "creatine": "Creatine",
        "creatine monohydrate": "Creatine",
        "creatine supplementation": "Creatine",
        "curcumin": "Curcumin",
        "curcumin, high phenolic extra virgin olive oil (HP-EVOO)": "Curcumin",
        "curcumin, omega-3, and vitamin D (COD)": "Curcumin",
        "dextrin": "Dextrin",
        "dietary nitrate": "Nitrate salts",
        "docosahexaenoic acid (DHA)": "Docosahexaenoic acid (DHA)",
        "donor human milk": "Human milk",
        "eicosapentaenoic acid": "Eicosapentaenoic acid (EPA)",
        "ferrous sulfate": "Iron",
        "fish oil": "Fish oil",
        "flaxseed": "Flaxseed",
        "fructose": "Fructose",
        "fucoidan": "Fucoidan",
        "genistein": "Genistein",
        "geranylgeraniol": "Geranylgeraniol",
        "ginger extract": "Ginger",
        "glucose": "Glucose",
        "green tea": "Green Tea",
        "inulin": "Inulin",
        "iron supplementation": "Iron",
        "kefir": "Kefir",
        "krill oil": "Krill Oil",
        "lutein": "Lutein",
        "maca": "Maca",
        "magnesium": "Magnesium Citrate",
        "melatonin": "Melatonin",
        "protein supplement": "Hydrolyzed Whey Protein",
        "protein supplementation": "Hydrolyzed Whey Protein",
        "psyllium": "Psyllium",
        "psyllium husk": "Psyllium",
        "resveratrol": "Resveratrol",
        "soy isoflavones": "Isoflavones",
        "soy protein": "Pea protein",
        "starch": "Starch",
        "sucralose": "Sucralose",
        "sweet cherries, 280 g daily for 42 days": "Sweet cherry",
        "taurine": "Taurine",
        "tributyrin": "Tributyrin",
        "vitamin B6+ Vitamin B12": "Vitamin B6",
        "vitamin B6+ Vitamin B12 +vitamin C": "Vitamin B6",
        "vitamin B9": "Folate",
        "vitamin C": "Vitamin C",
        "vitamin D (cholecalciferol) supplementation": "Vitamin D3",
        "vitamin D3": "Vitamin D3",
        "walnuts": "Walnut",
        "wheat dextrin": "Dextrin",
        "whey protein": "Whey protein",
        "whey protein isolate": "Whey protein",
        "white rice": "White rice",
        "yohimbine": "Yohimbe",
        "zeaxanthine": "Zeaxanthin",
        "zinc acetate": "Zinc",
        "β-glucan": "Beta-glucan",
        "β-hydroxy β-methylbutyrate (HMB)": "HMB",
        "β-hydroxy β-methylbutyrate (HMB) supplement": "HMB",
    }
    
    for pattern, replacement in SYNONYM_MAP.items():
        if pattern in normalized:
            normalized = replacement
            break
    
    return normalized


def normalize_condition_name(condition: str) -> str:
    if not isinstance(condition, str):
        return ""
    
    normalized = condition.lower().strip()
    
    CONDITION_CATEGORIES = {
        r"diabetes|diabetic|glycemic|hba1c|blood sugar": "Blood Sugar Support",
        r"obesity|overweight|weight loss|bmi": "Weight Management",
        r"cardiovascular|heart|cardiac|coronary|hypertension|blood pressure": "Heart Health",
        r"cognitive|cognition|memory|alzheimer|dementia|brain": "Cognitive Health",
        r"depression|anxiety|mood|mental health|psychiatric": "Stress & Mood",
        r"cancer|tumor|oncology|carcinoma|neoplasm": "Healthy Aging",
        r"inflammation|inflammatory|arthritis|autoimmune": "Inflammation & Pain",
        r"gut|intestinal|digestive|ibs|ibd|microbiome|gastrointestinal": "Digestive & Gut Health",
        r"bone|osteoporosis|fracture|skeletal": "Bone Health",
        r"pregnancy|prenatal|maternal|fertility|reproductive": "Prenatal & Postnatal",
        r"aging|elderly|geriatric|longevity": "Healthy Aging",
        r"sleep|insomnia|circadian": "Sleep & Relaxation",
        r"skin|dermatology|atopic": "Hair, Skin & Nails"
    }

    for pattern, category in CONDITION_CATEGORIES.items():
        if re.search(pattern, normalized):
            return category
    
    return normalized


ANALYSIS

In [32]:
def count_by_field_with_evidence(
    df: pd.DataFrame, 
    field: str, 
    evidence_source_field: str,
    normalize_func=None,
    top_n: int = 5,
    max_evidences: int = 10
) -> pd.DataFrame:

    item_counter = Counter()
    item_evidence_map = defaultdict(list)

    for _, row in df.iterrows():
        items = row.get(field)
        evidence = row.get(evidence_source_field)

        if not isinstance(items, list):
            continue

        for item in items:
            if normalize_func:
                item = normalize_func(item)

            if not item:
                continue

            item_lower = item.lower()
            if item_lower in {"placebo", "placebos", "control", "placebo group"}:
                continue

            item_counter[item] += 1

            if evidence and len(item_evidence_map[item]) < max_evidences:
                item_evidence_map[item].append(evidence)

    total = sum(item_counter.values())

    result = pd.DataFrame([
        {
            "item": item,
            "count": count,
            "percentage": round(count / total * 100, 1),
            "evidences": item_evidence_map[item]
        }
        for item, count in item_counter.most_common(top_n)
    ])

    return result


def calculate_yearly_counts(
    df: pd.DataFrame,
    group_field: str,
    date_field: str = "start_year",
    normalize_func=None
) -> pd.DataFrame:
    """
    Calculate trial counts per year for each category.
    
    Args:
        df: Source DataFrame
        group_field: Field to group by (e.g., conditions)
        date_field: Date/year field for temporal grouping
        normalize_func: Optional normalization function
        
    Returns:
        pd.DataFrame: Pivot table with years as columns
    """
    # Explode list field if necessary
    if df[group_field].apply(lambda x: isinstance(x, list)).any():
        exploded = df.explode(group_field).copy()
    else:
        exploded = df.copy()
    
    # Normalize if function provided
    if normalize_func:
        exploded[group_field] = exploded[group_field].apply(
            lambda x: normalize_func(x) if pd.notna(x) else x
        )
    
    # Remove empty/null values
    exploded = exploded.dropna(subset=[group_field, date_field])
    exploded = exploded[exploded[group_field] != ""]
    
    # Group and count
    yearly = exploded.groupby([group_field, date_field]).size().reset_index(name="count")
    
    # Pivot to wide format
    pivot = yearly.pivot(index=group_field, columns=date_field, values="count").fillna(0)
    
    return pivot


def calculate_momentum(
    yearly_counts: pd.DataFrame,
    df: pd.DataFrame,
    normalize_func,
    min_total_trials: int = 10
) -> pd.DataFrame:
    """
    Calculate growth momentum (CAGR) for each category.
    
    Momentum is calculated as compound annual growth rate (CAGR):
    CAGR = (ending_value / beginning_value)^(1/years) - 1
    
    For categories with zero starting value, uses simple growth classification.
    
    Args:
        yearly_counts: Pivot table from calculate_yearly_counts
        min_total_trials: Minimum total trials to include in analysis
        
    Returns:
        pd.DataFrame: Categories with momentum metrics
    """

    years = sorted(yearly_counts.columns)
    if len(years) < 2:
        raise ValueError("Need at least 2 years of data for momentum calculation")
    
    first_year = years[0]
    last_year = years[-1]
    # added for filtering and evidence
    df_recent = df[df["start_year"] == last_year]
    n_years = last_year - first_year
    
    results = []
    
    for category in yearly_counts.index:
        row = yearly_counts.loc[category]
        total = row.sum()
        
        # Skip categories with too few trials
        if total < min_total_trials:
            continue
        
        start_val = row[first_year]
        end_val = row[last_year]
        
        # Calculate CAGR where possible
        if start_val > 0 and end_val > 0:
            cagr = (end_val / start_val) ** (1 / n_years) - 1
        elif start_val == 0 and end_val > 0:
            cagr = float('inf')  # New/emerging category
        elif end_val == 0:
            cagr = -1.0  # Declining to zero
        else:
            cagr = 0.0
        
        # Calculate simple YoY change for most recent period
        if len(years) >= 2:
            recent_change = row[years[-1]] - row[years[-2]]
            recent_pct = (recent_change / row[years[-2]] * 100) if row[years[-2]] > 0 else 0
        else:
            recent_change = 0
            recent_pct = 0
        
        # Classify momentum
        if cagr == float('inf'):
            classification = "EMERGING"
        elif cagr > 0.15:
            classification = "ACCELERATING"
        elif cagr > 0.05:
            classification = "GROWING"
        elif cagr > -0.05:
            classification = "STABLE"
        elif cagr > -0.15:
            classification = "SLOWING"
        else:
            classification = "DECLINING"

        evidence_rows = []

        for _, row2 in df_recent.iterrows():
            conds = row2.get("conditions")

            if not isinstance(conds, list):
                continue

            # Find the EXACT raw condition text that mapped to this category
            matched_texts = [
                c for c in conds if normalize_func(c) == category
            ]

            if matched_texts:
                evidence_rows.append({
                    "nct_id": row2["nct_id"],
                    "start_date": row2["start_date"].date().isoformat(),  
                    "condition": matched_texts[0]                      
                })

        
        results.append({
            "category": category,
            "total_trials": int(total),
            f"trials_{first_year}": int(start_val),
            f"trials_{last_year}": int(end_val),
            "cagr": round(cagr * 100, 1) if cagr != float('inf') else None,
            "recent_yoy_change": int(recent_change),
            "recent_yoy_pct": round(recent_pct, 1),
            "momentum": classification,
            "evidence_trials": evidence_rows[:5]
        })
    
    return pd.DataFrame(results).sort_values("total_trials", ascending=False)


def classify_thematic_area(
    title: str,
    conditions: List[str],
    interventions: List[str],
    keywords: List[str]
) -> List[str]:
    """
    Classify a trial into one or more thematic priority areas.
    
    Args:
        title: Trial title
        conditions: List of conditions
        interventions: List of interventions
        keywords: List of keywords
        
    Returns:
        List[str]: Matched thematic areas
    """
    # Combine all text for searching
    all_text = " ".join([
        title or "",
        " ".join(conditions or []),
        " ".join(interventions or []),
        " ".join(keywords or [])
    ]).lower()
    
    THEMES = {
        "Weight Management": [
            r"weight", r"fat loss", r"body fat", r"appetite", r"thermogen", r"glucose",
            r"metabolic rate", r"satiety", r"calorie"
        ],

        "Digestive & Gut Health": [
            r"gut", r"digest", r"microbiome", r"microbiota", r"probiotic", r"prebiotic",
            r"ibs", r"constipation", r"bloating", r"gut.?brain"
        ],

        "Energy": [
            r"energy", r"fatigue", r"stamina", r"vitality", r"atp", r"mitochondri",
            r"endurance"
        ],

        "Sports Nutrition": [
            r"muscle", r"hypertrophy", r"strength", r"endurance", r"recovery",
            r"performance", r"athlet", r"exercise"
        ],

        "Joint & Mobility": [
            r"joint", r"cartilage", r"osteoarthritis", r"mobility", r"flexibility",
            r"arthritis", r"connective tissue", r"stiffness"
        ],

        "Cognitive Health": [
            r"cognit", r"memory", r"focus", r"attention", r"neuroprotect",
            r"brain", r"nootropic", r"mental"
        ],

        "Immune Health": [
            r"immune", r"immunomod", r"antiviral", r"antibod", r"infection",
            r"innate immunity", r"adaptive immunity"
        ],

        "Hair, Skin & Nails": [
            r"skin", r"hair", r"nail", r"collagen", r"keratin", r"elasticity",
            r"wrinkle", r"dermatolog"
        ],

        "Men's Wellness": [
            r"testosterone", r"androgen", r"prostate", r"male fertility",
            r"erectile", r"sperm", r"men.?s health"
        ],

        "Women's Wellness": [
            r"women.?s health", r"hormonal", r"estrogen", r"progesterone",
            r"pcos", r"menopause", r"premenopause", r"ovarian"
        ],

        "Prenatal & Postnatal": [
            r"prenatal", r"postnatal", r"maternal", r"pregnan", r"fetal",
            r"lactation", r"breastfeeding", r"infant"
        ],

        "Liver & Detox": [
            r"liver", r"hepatic", r"detox", r"masld", r"nafld", r"fatty liver",
            r"glutathione", r"detoxification"
        ],

        "Sleep & Relaxation": [
            r"sleep", r"insomnia", r"melatonin", r"circadian", r"rest",
            r"relax", r"sleep quality"
        ],

        "Stress & Mood": [
            r"stress", r"cortisol", r"mood", r"anxiety", r"depress",
            r"serotonin", r"calming", r"adaptogen"
        ],

        "Bone Health": [
            r"bone", r"osteoblast", r"osteoclast", r"osteoporosis",
            r"bone density", r"calcium", r"vitamin d"
        ],

        "Heart Health": [
            r"cardio", r"cvd", r"blood pressure", r"hypertension", r"lipid",
            r"cholesterol", r"endothelial"
        ],

        "Blood Sugar Support": [
            r"glucose", r"glycemic", r"insulin", r"hba1c", r"blood sugar",
            r"insulin sensitivity", r"metabolic"
        ],

        "Healthy Aging": [
            r"aging", r"longevity", r"senescence", r"frailty", r"resilience",
            r"muscle mass", r"vitality"
        ],

        "Vision & Eye Health": [
            r"vision", r"eye", r"retina", r"macula", r"ocular", r"visual"
        ],

        "Inflammation & Pain": [
            r"inflamm", r"cytokine", r"pain", r"nocicept", r"crp",
            r"il-6", r"tnf", r"analges"
        ]
    }

    matched = []
    
    for theme, patterns in THEMES.items():
        for pattern in patterns:
            if re.search(pattern, all_text):
                matched.append(theme)
                break  # Only match each theme once
    
    return matched


def analyze_thematic_priorities(df: pd.DataFrame) -> pd.DataFrame:
    """
    Analyze trial counts across custom thematic priority areas.
    
    Args:
        df: DataFrame with title, conditions, intervention_names, keywords columns
        
    Returns:
        pd.DataFrame: Theme counts and rankings
    """
    theme_counts = Counter()
    theme_trials = {
        category: [] for category in [
            "Weight Management",
            "Digestive & Gut Health",
            "Energy",
            "Sports Nutrition",
            "Joint & Mobility",
            "Cognitive Health",
            "Immune Health",
            "Hair, Skin & Nails",
            "Men's Wellness",
            "Women's Wellness",
            "Prenatal & Postnatal",
            "Liver & Detox",
            "Sleep & Relaxation",
            "Stress & Mood",
            "Bone Health",
            "Heart Health",
            "Blood Sugar Support",
            "Healthy Aging",
            "Vision & Eye Health",
            "Inflammation & Pain"
        ]
    }

    for _, row in df.iterrows():
        themes = classify_thematic_area(
            row.get("title", ""),
            row.get("conditions", []),
            row.get("intervention_names", []),
            row.get("keywords", [])
        )
        
        for theme in themes:
            theme_counts[theme] += 1
            if row.get("nct_id"):
                theme_trials[theme].append(row["nct_id"])
    
    total_themed = sum(theme_counts.values())
    
    results = []
    for theme, count in theme_counts.most_common():
        results.append({
            "theme": theme,
            "trial_count": count,
            "percentage_of_themed": round(count / total_themed * 100, 1) if total_themed > 0 else 0,
            "example_nct_ids": theme_trials[theme][:5]
        })
    
    for theme in theme_trials.keys():
        if theme not in theme_counts:
            results.append({
                "theme": theme,
                "trial_count": 0,
                "percentage_of_themed": 0,
                "example_nct_ids": []
            })
    
    return pd.DataFrame(results).sort_values("trial_count", ascending=False)


In [33]:
def build_search_query() -> str:
    terms = " OR ".join([f'"{term}"' for term in SUPPLEMENT_TERMS])
    return terms

def build_fields() -> str:
    return ",".join(RETURN_FIELDS)

def build_broad_keyword_query() -> str:
    return " OR ".join(CUSTOM_KEYWORDS)

def build_date_filter() -> str:
    start_str = START_DATE.strftime("%Y-%m-%d")
    end_str = END_DATE.strftime("%Y-%m-%d")
    return f"AREA[StartDate]RANGE[{start_str},{end_str}]"

def inject_dual_evidence_from_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Inject two evidence columns:
    - evidence_ingredients
    - evidence_conditions
    """

    df = df.copy()

    def build_ingredient_evidence(row):
        return {
            "nct_id": row.get("nct_id"),
            "title": row.get("title"),
            "start_date": (
                row.get("start_date").date().isoformat()
                if row.get("start_date") else None
            ),
            "intervention_names": row.get("intervention_names"),
        }

    def build_condition_evidence(row):
        return {
            "nct_id": row.get("nct_id"),
            "title": row.get("title"),
            "start_date": (
                row.get("start_date").date().isoformat()
                if row.get("start_date") else None
            ),
            "conditions": row.get("conditions"),
        }

    df["evidence_ingredients"] = df.apply(build_ingredient_evidence, axis=1)
    df["evidence_conditions"] = df.apply(build_condition_evidence, axis=1)

    return df


In [34]:
def export_distinct_ingredients_to_json(
    df: pd.DataFrame,
    field: str = "intervention_names",
    output_path: str = "output/distinct_ingredients.json"
):

    all_items = set()

    for items in df[field].dropna():
        if isinstance(items, list):
            for item in items:
                if isinstance(item, str):
                    clean = item.strip()
                    if clean:
                        all_items.add(clean)

    distinct_ingredients = sorted(all_items)

    output_path = Path(output_path)
    output_path.parent.mkdir(exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(distinct_ingredients, f, indent=2, ensure_ascii=False)

    return distinct_ingredients


APPLICATION

In [35]:
output_dir = Path(os.getenv("OUTPUT_DIR"))
output_dir.mkdir(exist_ok=True)

client = ClinicalTrialsClient(
    base_url=BASE_URL,
    page_size=PAGE_SIZE,
    delay=REQUEST_DELAY
)

intervention_query = build_search_query()
date_filter = build_date_filter()
fields = build_fields()
broad_keywords = build_broad_keyword_query()

logger.info("Fetching dietary supplement trials...")
logger.info(f"Query: {broad_keywords}")
logger.info(f"Date filter: {date_filter}")

studies = list(client.search(
    query_intr=intervention_query,
    query_term=broad_keywords,
    filter_advanced=date_filter,
    fields=fields
))

logger.info(f"Retrieved {len(studies)} studies")


2025-12-11 17:35:41,666 - INFO - Fetching dietary supplement trials...
2025-12-11 17:35:41,666 - INFO - Query: Weight Management OR fat loss OR appetite control OR thermogenesis OR metabolic rate OR Digestive & Gut Health OR microbiome OR probiotics OR prebiotics OR gut barrier OR IBS OR Energy OR ATP production OR fatigue OR mitochondrial support OR stamina OR Sports Nutrition OR muscle recovery OR endurance OR performance OR hypertrophy OR Joint & Mobility OR cartilage OR osteoarthritis OR inflammation OR flexibility OR Cognitive Health OR memory OR focus OR neuroprotection OR neurotransmitters OR Immune Health OR innate immunity OR adaptive immunity OR viral defense OR inflammation control OR Hair Skin & Nails OR collagen synthesis OR keratin OR elasticity OR skin aging OR Men’s Wellness OR testosterone OR prostate health OR fertility OR muscle mass OR Women’s Wellness OR hormonal balance OR fertility OR menopause OR PCOS OR Prenatal & Postnatal OR fetal development OR maternal nutr

In [36]:
# This assumes build_dataframe exists in your notebook or imported earlier
df = build_dataframe(studies)

df = inject_dual_evidence_from_df(df)

logger.info(f"DataFrame shape: {df.shape}")

df.head()


2025-12-11 17:36:28,941 - INFO - DataFrame shape: (5185, 15)


Unnamed: 0,nct_id,title,status,start_date,start_year,phase,study_type,conditions,keywords,intervention_names,intervention_types,enrollment,sponsor,evidence_ingredients,evidence_conditions
0,NCT05315388,"Effects of ""Vitamin N"" Nature Immersion Therap...",,2022-10-15,2022,,INTERVENTIONAL,"[Cortisol Excess, Stress, Psychological, Anxie...","[Forest therapy, Nature therapy, Cortisol, Psy...",[Immersion therapy in nature - Vitamin N],[OTHER],118,Jeadran N. Malagón-Rojas,"{'nct_id': 'NCT05315388', 'title': 'Effects of...","{'nct_id': 'NCT05315388', 'title': 'Effects of..."
1,NCT07013305,Pre- and Post-Treatment Investigation of B12 a...,,2025-06-11,2025,,OBSERVATIONAL,"[Fibromyalgia (FM), Vitamin B 12 Deficiency, F...","[Antiepileptic (anticonvulsant) drugs, Vitamin...",[],[],80,Gaziosmanpasa Research and Education Hospital,"{'nct_id': 'NCT07013305', 'title': 'Pre- and P...","{'nct_id': 'NCT07013305', 'title': 'Pre- and P..."
2,NCT06699537,Lacticaseibacillus Rhamnosus LRa05 for Allevia...,,2024-12-01,2024,,INTERVENTIONAL,[Allergic Rhinitis (AR)],[],"[probiotic product, Maltodextrin]","[DIETARY_SUPPLEMENT, DIETARY_SUPPLEMENT]",70,"Wecare Probiotics Co., Ltd.","{'nct_id': 'NCT06699537', 'title': 'Lacticasei...","{'nct_id': 'NCT06699537', 'title': 'Lacticasei..."
3,NCT04963777,Prebiotics in Patients With Type 1 Diabetes,,2022-03-29,2022,,INTERVENTIONAL,[Type 1 Diabetes],"[Gut microbiota, Glycemic control, Prebiotic, ...","[Prebiotic, Placebo]","[DIETARY_SUPPLEMENT, DIETARY_SUPPLEMENT]",144,University of Calgary,"{'nct_id': 'NCT04963777', 'title': 'Prebiotics...","{'nct_id': 'NCT04963777', 'title': 'Prebiotics..."
4,NCT06032442,A Study to Assess Efficacy of Supporting Prope...,,2022-03-21,2022,,INTERVENTIONAL,"[Osteoarthritis, Osteo Arthritis Knee, Osteoar...","[undenatured type ii collagen, collagen, Boswe...","[ARTNEO, Artra]","[DIETARY_SUPPLEMENT, DRUG]",70,NPO Petrovax,"{'nct_id': 'NCT06032442', 'title': 'A Study to...","{'nct_id': 'NCT06032442', 'title': 'A Study to..."


In [37]:
# Distinct ingredients
distinct_ingredients = export_distinct_ingredients_to_json(
    df,
    field="intervention_names",
    output_path="output/distinct_ingredients.json"
)

len(distinct_ingredients)


7535

BUILD SIMPLIFIED JSON

In [38]:
logger.info("Building simplified studies JSON from flattened DataFrame...")

simplified_studies = []

for _, row in df.iterrows():
    simplified_studies.append({
        "nctId": row.get("nct_id"),
        "title": row.get("title"),
        "status": row.get("status"),
        "study_type": row.get("study_type"),
        "phase": row.get("phase"),
        "start_date": (
            row.get("start_date").date().isoformat()
            if pd.notna(row.get("start_date")) else None
        ),
        "start_year": row.get("start_year"),
        "conditions": row.get("conditions", []),
        "keywords": row.get("keywords", []),

        "interventions": {
            "intervention_names": row.get("intervention_names", []),
            "intervention_types": row.get("intervention_types", [])
        },

        "enrollment": row.get("enrollment"),
        "sponsor": row.get("sponsor"),
    })

# === SAVE JSON FILE ===
simplified_json_path = output_dir / "simplified_studies.json"

with open(simplified_json_path, "w", encoding="utf-8") as f:
    json.dump(simplified_studies, f, indent=2, ensure_ascii=False)

logger.info(f"Simplified studies JSON exported to {simplified_json_path}")

# === ALSO RETURN AS DATAFRAME FOR INSPECTION ===
df_simplified = pd.DataFrame(simplified_studies)
df_simplified.head()

2025-12-11 17:36:28,975 - INFO - Building simplified studies JSON from flattened DataFrame...
2025-12-11 17:36:29,282 - INFO - Simplified studies JSON exported to output\simplified_studies.json


Unnamed: 0,nctId,title,status,study_type,phase,start_date,start_year,conditions,keywords,interventions,enrollment,sponsor
0,NCT05315388,"Effects of ""Vitamin N"" Nature Immersion Therap...",,INTERVENTIONAL,,2022-10-15,2022,"[Cortisol Excess, Stress, Psychological, Anxie...","[Forest therapy, Nature therapy, Cortisol, Psy...",{'intervention_names': ['Immersion therapy in ...,118,Jeadran N. Malagón-Rojas
1,NCT07013305,Pre- and Post-Treatment Investigation of B12 a...,,OBSERVATIONAL,,2025-06-11,2025,"[Fibromyalgia (FM), Vitamin B 12 Deficiency, F...","[Antiepileptic (anticonvulsant) drugs, Vitamin...","{'intervention_names': [], 'intervention_types...",80,Gaziosmanpasa Research and Education Hospital
2,NCT06699537,Lacticaseibacillus Rhamnosus LRa05 for Allevia...,,INTERVENTIONAL,,2024-12-01,2024,[Allergic Rhinitis (AR)],[],"{'intervention_names': ['probiotic product', '...",70,"Wecare Probiotics Co., Ltd."
3,NCT04963777,Prebiotics in Patients With Type 1 Diabetes,,INTERVENTIONAL,,2022-03-29,2022,[Type 1 Diabetes],"[Gut microbiota, Glycemic control, Prebiotic, ...","{'intervention_names': ['Prebiotic', 'Placebo'...",144,University of Calgary
4,NCT06032442,A Study to Assess Efficacy of Supporting Prope...,,INTERVENTIONAL,,2022-03-21,2022,"[Osteoarthritis, Osteo Arthritis Knee, Osteoar...","[undenatured type ii collagen, collagen, Boswe...","{'intervention_names': ['ARTNEO', 'Artra'], 'i...",70,NPO Petrovax


In [39]:
df.to_pickle(output_dir / "raw_trials.pkl")
df.to_csv(output_dir / "raw_trials.csv", index=False)


In [40]:
# === DELIVERABLE 1: Top Ingredients and Conditions ===
logger.info("Analyzing top ingredients...")

top_ingredients = count_by_field_with_evidence(
    df,
    field="intervention_names",
    evidence_source_field="evidence_ingredients",
    normalize_func=normalize_ingredient_name,
    top_n=5,
    max_evidences=5
)
top_ingredients.to_csv(output_dir / "top_ingredients.csv", index=False)

logger.info("Analyzing top conditions...")

top_conditions = count_by_field_with_evidence(
    df,
    field="conditions",
    evidence_source_field="evidence_conditions",
    normalize_func=normalize_condition_name,
    top_n=25,
    max_evidences=5
)
top_conditions.to_csv(output_dir / "top_conditions.csv", index=False)


# === DELIVERABLE 2: Momentum Analysis ===
logger.info("Calculating condition momentum...")

yearly_conditions = calculate_yearly_counts(
    df,
    "conditions",
    normalize_func=normalize_condition_name
)

condition_momentum = calculate_momentum(
    yearly_conditions,
    df,
    normalize_condition_name
)

condition_momentum.to_csv(output_dir / "condition_momentum.csv", index=False)


logger.info("Calculating ingredient momentum...")

yearly_ingredients = calculate_yearly_counts(
    df,
    "intervention_names",
    normalize_func=normalize_ingredient_name
)

ingredient_momentum = calculate_momentum(
    yearly_ingredients,
    df,
    normalize_condition_name
)

ingredient_momentum.to_csv(output_dir / "ingredient_momentum.csv", index=False)


# === DELIVERABLE 3: Thematic Priority Analysis ===
logger.info("Analyzing thematic priorities...")

thematic_analysis = analyze_thematic_priorities(df)
thematic_analysis.to_csv(output_dir / "thematic_priorities.csv", index=False)


# === Summary Report ===
logger.info("Generating summary report...")

summary = {
    "analysis_date": datetime.now().isoformat(),
    "date_range": f"{START_DATE.date()} to {END_DATE.date()}",
    "total_trials": len(df),
    "top_5_ingredients": top_ingredients.head(5).to_dict("records"),
    "top_5_conditions": top_conditions.head(5).to_dict("records"),
    "accelerating_areas": condition_momentum[
        condition_momentum["momentum"] == "ACCELERATING"
    ][["category", "recent_yoy_pct", "cagr", "evidence_trials"]]
    .head(5)
    .to_dict("records"),
    "emerging_areas": condition_momentum[
        condition_momentum["momentum"] == "EMERGING"
    ][["category", "recent_yoy_pct", "cagr", "evidence_trials"]]
    .head(5)
    .to_dict("records"),
    "thematic_summary": thematic_analysis[["theme", "trial_count"]].to_dict("records"),
}

with open(output_dir / "summary.json", "w") as f:
    json.dump(summary, f, indent=2, default=str)

logger.info(f"Analysis complete. Output saved to {output_dir}/")


# === CONSOLE SUMMARY OUTPUT ===
print("\n" + "=" * 60)
print("CLINICAL TRIALS ANALYSIS SUMMARY")
print("=" * 60)
print(f"\nTotal trials analyzed: {len(df)}")
print(f"Date range: {START_DATE.date()} to {END_DATE.date()}")

print("\nTop 5 Ingredients:")
print(top_ingredients.head(5).to_string(index=False))

print("\nTop 5 Conditions:")
print(top_conditions.head(5).to_string(index=False))

print("\nThematic Priority Rankings:")
print(thematic_analysis[["theme", "trial_count"]].to_string(index=False))


2025-12-11 17:36:29,415 - INFO - Analyzing top ingredients...
2025-12-11 17:36:30,015 - INFO - Analyzing top conditions...
2025-12-11 17:36:30,174 - INFO - Calculating condition momentum...
2025-12-11 17:36:33,530 - INFO - Calculating ingredient momentum...
2025-12-11 17:36:36,125 - INFO - Analyzing thematic priorities...
2025-12-11 17:36:36,559 - INFO - Generating summary report...
2025-12-11 17:36:36,563 - INFO - Analysis complete. Output saved to output/



CLINICAL TRIALS ANALYSIS SUMMARY

Total trials analyzed: 5185
Date range: 2022-01-01 to 2025-12-31

Top 5 Ingredients:
        item  count  percentage                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 