## Step 1: Setup and Data Loading

In [40]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import json

# Set display options for better readability
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 100)

print("Libraries imported successfully")

Libraries imported successfully


In [5]:
# Load CSV files, skipping the first 6 lines of metadata
file_paths = {
    2015: './data/nih/2015-total-cost-keywords.csv',
    2016: './data/nih/2016-total-cost-keywords.csv',
    2017: './data/nih/2017-total-cost-keywords.csv',
    2018: './data/nih/2018-total-cost-keywords.csv',
    2019: './data/nih/2019-total-cost-keywords.csv',
    2020: './data/nih/2020-total-cost-keywords.csv',
    2021: './data/nih/2021-total-cost-keywords.csv',
    2022: './data/nih/2022-total-cost-keywords.csv', 
    2023: './data/nih/2023-total-cost-keywords.csv',
    2024: './data/nih/2024-total-cost-keywords.csv',
    2025: './data/nih/2025-total-cost-keywords.csv'
}

# Load all dataframes
dataframes = {}
for year, path in file_paths.items():
    print(f"Loading {year} data...")
    df = pd.read_csv(path, skiprows=6)  # Skip first 6 lines of metadata
    print(f"  - Raw shape: {df.shape}")
    dataframes[year] = df
    
print("\nAll files loaded successfully!")

Loading 2015 data...
  - Raw shape: (9484, 4)
Loading 2016 data...
  - Raw shape: (10502, 4)
Loading 2017 data...
  - Raw shape: (10472, 4)
Loading 2018 data...
  - Raw shape: (11533, 4)
Loading 2019 data...
  - Raw shape: (11769, 4)
Loading 2020 data...
  - Raw shape: (12142, 4)
Loading 2021 data...
  - Raw shape: (11811, 4)
Loading 2022 data...
  - Raw shape: (11921, 4)
Loading 2023 data...
  - Raw shape: (11647, 4)
Loading 2024 data...
  - Raw shape: (10603, 4)
Loading 2025 data...
  - Raw shape: (8493, 4)

All files loaded successfully!


## Step 2: Data Preprocessing

In [6]:
# Examine the structure of one dataframe
print("Sample dataframe structure (2015):")
print(f"Columns: {list(dataframes[2015].columns)}")
print(f"\nFirst few rows:")
print(dataframes[2015].head())
print(f"\nData types:")
print(dataframes[2015].dtypes)

Sample dataframe structure (2015):
Columns: ['Project Terms', 'Total Cost', 'Total Cost(Sub Projects)', 'Unnamed: 3']

First few rows:
                                                                                         Project Terms  \
0  Accounting;Adult;Affect;African American;Alleles;American;Angiotensin II;Angiotensins;Animal Mod...   
1  Aging;Apical;Astrocytes;Binding (Molecular Function);Blood Circulation;Brain;Carrier Proteins;Ce...   
2  Activities of Daily Living;Address;Adult;Aerobic;Aerobic Exercise;Aftercare;Age;Awareness;Behavi...   
3  Address;Adherence (attribute);Adult;Age;Area;Arteries;Atherosclerosis;Cellular Phone;Clinical;Co...   
4  Address;Adult;Affect;African American;Age;Aging;Cessation of life;Characteristics;Cognitive;Cogn...   

  Total Cost Total Cost(Sub Projects)  Unnamed: 3  
0     469306                                  NaN  
1     419392                                  NaN  
2     149209                                  NaN  
3     300000        

In [7]:
def preprocess_dataframe(df, year):
    """
    Clean and preprocess the dataframe:
    1. Drop unnecessary columns
    2. Remove rows with empty Project Terms or Total Cost
    3. Clean and validate data types
    """
    print(f"Preprocessing {year} data...")
    
    # Make a copy to avoid modifying original
    df_clean = df.copy()
    
    # Drop unnecessary columns
    columns_to_drop = []
    if 'Total Cost(Sub Projects)' in df_clean.columns:
        columns_to_drop.append('Total Cost(Sub Projects)')
    if 'Unnamed: 3' in df_clean.columns:
        columns_to_drop.append('Unnamed: 3')
    
    if columns_to_drop:
        df_clean = df_clean.drop(columns=columns_to_drop)
        print(f"  - Dropped columns: {columns_to_drop}")
    
    # Remove rows where Project Terms or Total Cost are empty/null
    initial_rows = len(df_clean)
    
    # Remove rows with empty or whitespace-only Project Terms
    df_clean = df_clean[df_clean['Project Terms'].notna()]
    df_clean = df_clean[df_clean['Project Terms'].str.strip() != '']
    
    # Remove rows with empty or null Total Cost
    df_clean = df_clean[df_clean['Total Cost'].notna()]
    
    # Convert Total Cost to numeric, handling any string formatting
    df_clean['Total Cost'] = pd.to_numeric(df_clean['Total Cost'], errors='coerce')
    df_clean = df_clean[df_clean['Total Cost'].notna()]
    
    final_rows = len(df_clean)
    rows_removed = initial_rows - final_rows
    
    print(f"  - Initial rows: {initial_rows}")
    print(f"  - Final rows: {final_rows}")
    print(f"  - Rows removed: {rows_removed}")
    print(f"  - Total funding: ${df_clean['Total Cost'].sum():,.0f}")
    
    return df_clean

In [8]:
# Preprocess all dataframes
clean_dataframes = {}
for year, df in dataframes.items():
    clean_dataframes[year] = preprocess_dataframe(df, year)
    print()

print("Data preprocessing completed!")

Preprocessing 2015 data...
  - Dropped columns: ['Total Cost(Sub Projects)', 'Unnamed: 3']
  - Initial rows: 9484
  - Final rows: 8904
  - Rows removed: 580
  - Total funding: $3,774,359,436

Preprocessing 2016 data...
  - Dropped columns: ['Total Cost(Sub Projects)', 'Unnamed: 3']
  - Initial rows: 10502
  - Final rows: 9944
  - Rows removed: 558
  - Total funding: $4,477,508,466

Preprocessing 2017 data...
  - Dropped columns: ['Total Cost(Sub Projects)', 'Unnamed: 3']
  - Initial rows: 10472
  - Final rows: 10130
  - Rows removed: 342
  - Total funding: $4,824,194,380

Preprocessing 2018 data...
  - Dropped columns: ['Total Cost(Sub Projects)', 'Unnamed: 3']
  - Initial rows: 11533
  - Final rows: 11019
  - Rows removed: 514
  - Total funding: $5,097,004,938

Preprocessing 2019 data...
  - Dropped columns: ['Total Cost(Sub Projects)', 'Unnamed: 3']
  - Initial rows: 11769
  - Final rows: 11195
  - Rows removed: 574
  - Total funding: $6,163,816,348

Preprocessing 2020 data...
  - Dr

## Step 3: Keyword Extraction and Normalization

In [9]:
def extract_keywords_from_dataframe(df, year):
    """
    Extract all unique keywords from a dataframe's Project Terms column.
    Keywords are separated by semicolons and need to be normalized.
    Also adds a 'keywords_list' column to the dataframe for efficient lookup.
    """
    print(f"Extracting keywords from {year} data...")
    
    all_keywords = set()
    keywords_lists = []
    
    for project_terms in df['Project Terms']:
        if pd.notna(project_terms):
            # Split on semicolon and normalize
            keywords = [kw.strip().lower() for kw in str(project_terms).split(';')]
            # Remove empty keywords
            keywords = [kw for kw in keywords if kw and kw != '']
            all_keywords.update(keywords)
            keywords_lists.append(keywords)
        else:
            keywords_lists.append([])
    
    # Add the keywords list as a new column
    df['keywords_list'] = keywords_lists
    
    print(f"  - Found {len(all_keywords)} unique keywords")
    print("  - Added keywords_list column to dataframe")
    return all_keywords

In [10]:
clean_dataframes.keys()

dict_keys([2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025])

In [11]:
clean_dataframes.get(2021).keys()

Index(['Project Terms', 'Total Cost'], dtype='object')

In [12]:
# Extract keywords from each year and update the dataframes
yearly_keywords = {}
for year, df in clean_dataframes.items():
    yearly_keywords[year] = extract_keywords_from_dataframe(df, year)
    # The dataframe is modified in-place by the function, so clean_dataframes[year] now has the keywords_list column
    print(f"  - Updated dataframe columns: {list(df.columns)}")
    print()

Extracting keywords from 2015 data...
  - Found 24718 unique keywords
  - Added keywords_list column to dataframe
  - Updated dataframe columns: ['Project Terms', 'Total Cost', 'keywords_list']

Extracting keywords from 2016 data...
  - Found 25872 unique keywords
  - Added keywords_list column to dataframe
  - Updated dataframe columns: ['Project Terms', 'Total Cost', 'keywords_list']

Extracting keywords from 2017 data...
  - Found 26402 unique keywords
  - Added keywords_list column to dataframe
  - Updated dataframe columns: ['Project Terms', 'Total Cost', 'keywords_list']

Extracting keywords from 2018 data...
  - Found 27543 unique keywords
  - Added keywords_list column to dataframe
  - Updated dataframe columns: ['Project Terms', 'Total Cost', 'keywords_list']

Extracting keywords from 2019 data...
  - Found 28221 unique keywords
  - Added keywords_list column to dataframe
  - Updated dataframe columns: ['Project Terms', 'Total Cost', 'keywords_list']

Extracting keywords from 

In [13]:
# Print sample keywords from 2021
list(yearly_keywords.get(2021))[:10]

['b-cell antigen receptor',
 'tupaiidae',
 'transfer agreement',
 'differential equation',
 'body on a chip',
 'rhomboid',
 'metric system',
 'anti-pd-1/pd-l1',
 'ptpn11 gene',
 'cross-species transmission']

In [14]:
clean_dataframes.get(2021).head()

Unnamed: 0,Project Terms,Total Cost,keywords_list
2,Accounting;Address;Adult;Adverse effects;Adverse event;Age;Alcohol consumption;Alcohols;American...,693883.0,"[accounting, address, adult, adverse effects, adverse event, age, alcohol consumption, alcohols,..."
3,Affect;African American;Area;Attitude;Big Data;Biological;Biological Markers;Biological Process;...,374998.0,"[affect, african american, area, attitude, big data, biological, biological markers, biological ..."
4,Activities of Daily Living;Address;Affect;Age;Aging;Anxiety;Biological;Biological Aging;Body mas...,885747.0,"[activities of daily living, address, affect, age, aging, anxiety, biological, biological aging,..."
5,Acute;Adult;Affect;Age;Age of Onset;Aging;Animal Model;Animals;Area;Behavior;Behavioral;Biology;...,839120.0,"[acute, adult, affect, age, age of onset, aging, animal model, animals, area, behavior, behavior..."
6,Address;Affect;Affective;Age;Aging;Alzheimer's Disease;Animal Model;Animals;Autonomic nervous sy...,793643.0,"[address, affect, affective, age, aging, alzheimer's disease, animal model, animals, autonomic n..."


In [15]:
'accounting' in yearly_keywords.get(2021)

True

In [16]:
# Create master list of all unique keywords
all_unique_keywords = set()
for keywords in yearly_keywords.values():
    all_unique_keywords.update(keywords)

print(f"\nTotal unique keywords across all years: {len(all_unique_keywords)}")

# Display some sample keywords
sample_keywords = list(all_unique_keywords)[:10]
print("\nSample keywords:")
sample_keywords


Total unique keywords across all years: 48275

Sample keywords:


['myelocyte',
 'b-cell antigen receptor',
 'tupaiidae',
 'differential equation',
 'transfer agreement',
 'body on a chip',
 'trapp transport protein particle',
 'rhomboid',
 'image translation',
 'gerotherapeutic']

## Step 4: Calculate Annual Keyword Metrics

In [17]:
def calculate_keyword_metrics_for_year(df, year, all_keywords):
    """
    For each keyword, calculate:
    - Number of grants containing the keyword
    - Total funding for grants containing the keyword
    """
    print(f"Calculating metrics for {year}...")
    
    keyword_metrics = {}
    total_grants = len(df)
    total_funding = df['Total Cost'].sum()
    
    print(f"  - Processing {len(all_keywords)} keywords for {total_grants} grants")
    
    # Use tqdm for progress bar on large keyword sets
    for keyword in tqdm(all_keywords, desc=f"Processing {year}"):
        # Find grants that contain this keyword using list membership
        mask = df['keywords_list'].apply(lambda kw_list: keyword in kw_list)
        grants_with_keyword = df[mask]
        
        grant_count = len(grants_with_keyword)
        funding_total = grants_with_keyword['Total Cost'].sum() if grant_count > 0 else 0
        
        keyword_metrics[keyword] = {
            'grant_count': grant_count,
            'funding_total': funding_total,
            'grant_percentage': grant_count / total_grants,
            'funding_proportion': funding_total / total_funding
        }
    
    return keyword_metrics, total_grants, total_funding

In [18]:
# Calculate metrics for each year
annual_metrics = {}
annual_totals = {}

for year, df in clean_dataframes.items():
    metrics, total_grants, total_funding = calculate_keyword_metrics_for_year(
        df, year, all_unique_keywords
    )
    annual_metrics[year] = metrics
    annual_totals[year] = {
        'total_grants': total_grants,
        'total_funding': total_funding
    }
    print(f"  - Completed {year}: {total_grants} grants, ${total_funding:,.0f} total funding\n")

print("Annual metrics calculation completed!")

Calculating metrics for 2015...
  - Processing 48275 keywords for 8904 grants


Processing 2015: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 48275/48275 [04:35<00:00, 175.36it/s]


  - Completed 2015: 8904 grants, $3,774,359,436 total funding

Calculating metrics for 2016...
  - Processing 48275 keywords for 9944 grants


Processing 2016: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 48275/48275 [05:10<00:00, 155.34it/s]


  - Completed 2016: 9944 grants, $4,477,508,466 total funding

Calculating metrics for 2017...
  - Processing 48275 keywords for 10130 grants


Processing 2017: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 48275/48275 [05:07<00:00, 157.25it/s]


  - Completed 2017: 10130 grants, $4,824,194,380 total funding

Calculating metrics for 2018...
  - Processing 48275 keywords for 11019 grants


Processing 2018: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 48275/48275 [05:42<00:00, 141.10it/s]


  - Completed 2018: 11019 grants, $5,097,004,938 total funding

Calculating metrics for 2019...
  - Processing 48275 keywords for 11195 grants


Processing 2019: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 48275/48275 [05:53<00:00, 136.50it/s]


  - Completed 2019: 11195 grants, $6,163,816,348 total funding

Calculating metrics for 2020...
  - Processing 48275 keywords for 11190 grants


Processing 2020: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 48275/48275 [05:49<00:00, 138.08it/s]


  - Completed 2020: 11190 grants, $5,935,565,816 total funding

Calculating metrics for 2021...
  - Processing 48275 keywords for 11279 grants


Processing 2021: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 48275/48275 [05:54<00:00, 136.21it/s]


  - Completed 2021: 11279 grants, $6,121,521,522 total funding

Calculating metrics for 2022...
  - Processing 48275 keywords for 11402 grants


Processing 2022: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 48275/48275 [06:00<00:00, 133.89it/s]


  - Completed 2022: 11402 grants, $7,449,682,187 total funding

Calculating metrics for 2023...
  - Processing 48275 keywords for 11126 grants


Processing 2023: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 48275/48275 [06:01<00:00, 133.36it/s]


  - Completed 2023: 11126 grants, $6,506,773,772 total funding

Calculating metrics for 2024...
  - Processing 48275 keywords for 10194 grants


Processing 2024: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 48275/48275 [05:32<00:00, 145.05it/s]


  - Completed 2024: 10194 grants, $6,166,728,801 total funding

Calculating metrics for 2025...
  - Processing 48275 keywords for 8171 grants


Processing 2025: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 48275/48275 [04:27<00:00, 180.72it/s]

  - Completed 2025: 8171 grants, $5,819,319,619 total funding

Annual metrics calculation completed!





In [41]:
# Save annual_metrics as JSON
print("Saving annual_metrics to JSON...")
metrics_json_path = 'export/nih/annual_metrics.json'

# Convert numpy types to native Python types for JSON serialization
annual_metrics_serializable = {}
for year, keywords_dict in annual_metrics.items():
    annual_metrics_serializable[str(year)] = {}
    for keyword, metrics in keywords_dict.items():
        annual_metrics_serializable[str(year)][keyword] = {
            'grant_count': int(metrics['grant_count']),
            'funding_total': float(metrics['funding_total']),
            'grant_percentage': float(metrics['grant_percentage']),
            'funding_proportion': float(metrics['funding_proportion'])
        }

with open(metrics_json_path, 'w') as f:
    json.dump(annual_metrics_serializable, f)
print(f"  - Annual metrics saved: {metrics_json_path}")

# Save annual_totals as JSON
print("Saving annual_totals to JSON...")
totals_json_path = 'export/nih/annual_totals.json'

annual_totals_serializable = {}
for year, totals in annual_totals.items():
    annual_totals_serializable[str(year)] = {
        'total_grants': int(totals['total_grants']),
        'total_funding': float(totals['total_funding'])
    }

with open(totals_json_path, 'w') as f:
    json.dump(annual_totals_serializable, f)
print(f"  - Annual totals saved: {totals_json_path}")

print("\nJSON files saved successfully! ðŸ’¾")

Saving annual_metrics to JSON...
  - Annual metrics saved: export/nih/annual_metrics.json
Saving annual_totals to JSON...
  - Annual totals saved: export/nih/annual_totals.json

JSON files saved successfully! ðŸ’¾


In [32]:
annual_metrics.get(2025).get('extend lifespan')

{'grant_count': 71,
 'funding_total': np.float64(60885527.0),
 'grant_percentage': 0.008689266919593685,
 'funding_proportion': np.float64(0.010462653881599763)}

In [33]:
annual_totals

{2015: {'total_grants': 8904, 'total_funding': np.float64(3774359436.0)},
 2016: {'total_grants': 9944, 'total_funding': np.float64(4477508466.0)},
 2017: {'total_grants': 10130, 'total_funding': np.float64(4824194380.0)},
 2018: {'total_grants': 11019, 'total_funding': np.float64(5097004938.0)},
 2019: {'total_grants': 11195, 'total_funding': np.float64(6163816348.0)},
 2020: {'total_grants': 11190, 'total_funding': np.float64(5935565816.0)},
 2021: {'total_grants': 11279, 'total_funding': np.float64(6121521522.0)},
 2022: {'total_grants': 11402, 'total_funding': np.float64(7449682187.0)},
 2023: {'total_grants': 11126, 'total_funding': np.float64(6506773772.0)},
 2024: {'total_grants': 10194, 'total_funding': np.float64(6166728801.0)},
 2025: {'total_grants': 8171, 'total_funding': np.float64(5819319619.0)}}

## Step 5: Compute Proportional Comparisons

In [34]:
# Calculate 2015-2024 average metrics
print("Calculating 2015-2024 average metrics...")

baseline_years = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
average_metrics = {}

print("Average period (2015-2024):")
for year in baseline_years:
    print(f"  - {year}: {annual_totals[year]['total_grants']:,} grants, ${annual_totals[year]['total_funding']:,.0f} funding")

Calculating 2015-2024 average metrics...
Average period (2015-2024):
  - 2015: 8,904 grants, $3,774,359,436 funding
  - 2016: 9,944 grants, $4,477,508,466 funding
  - 2017: 10,130 grants, $4,824,194,380 funding
  - 2018: 11,019 grants, $5,097,004,938 funding
  - 2019: 11,195 grants, $6,163,816,348 funding
  - 2020: 11,190 grants, $5,935,565,816 funding
  - 2021: 11,279 grants, $6,121,521,522 funding
  - 2022: 11,402 grants, $7,449,682,187 funding
  - 2023: 11,126 grants, $6,506,773,772 funding
  - 2024: 10,194 grants, $6,166,728,801 funding


In [35]:
# Calculate average metrics for each keyword across 2015-2024
for keyword in tqdm(all_unique_keywords, desc="Calculating average metrics"):
    # Get proportions and percentages for each year
    yearly_funding_props = [annual_metrics[year][keyword]['funding_proportion'] for year in baseline_years]
    yearly_grant_pcts = [annual_metrics[year][keyword]['grant_percentage'] for year in baseline_years]
    
    # Calculate averages
    avg_funding_proportion = sum(yearly_funding_props) / len(baseline_years)
    avg_grant_percentage = sum(yearly_grant_pcts) / len(baseline_years)
    
    average_metrics[keyword] = {
        'funding_proportion': avg_funding_proportion,
        'grant_percentage': avg_grant_percentage
    }

Calculating average metrics: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 48275/48275 [00:00<00:00, 251858.68it/s]


In [36]:
average_metrics_sample = dict(list(average_metrics.items())[:5])
print("\nSample average metrics (2015-2024):")
for keyword, metrics in average_metrics_sample.items():
    print(f"  - {keyword}: {metrics}")


Sample average metrics (2015-2024):
  - myelocyte: {'funding_proportion': np.float64(1.0867991564125036e-05), 'grant_percentage': 1.7869109669714832e-05}
  - b-cell antigen receptor: {'funding_proportion': np.float64(0.0017097373779808335), 'grant_percentage': 0.0017550187895784934}
  - tupaiidae: {'funding_proportion': np.float64(0.0003328511380182717), 'grant_percentage': 0.00020637449116083298}
  - differential equation: {'funding_proportion': np.float64(0.0005490994136209613), 'grant_percentage': 0.0006870576719080163}
  - transfer agreement: {'funding_proportion': np.float64(0.00015668870558611296), 'grant_percentage': 7.591977742124416e-05}


## Step 6: Compile Final Results

In [37]:
# Create final comparison dataframe
print("Compiling final results...")

results_data = []

for keyword in all_unique_keywords:
    # Get 2025 metrics
    metrics_2025 = annual_metrics[2025][keyword]
    metrics_average = average_metrics[keyword]
    
    # Calculate changes
    funding_prop_change = metrics_2025['funding_proportion'] - metrics_average['funding_proportion']
    funding_relative_change = (funding_prop_change / metrics_average['funding_proportion'] * 100) if metrics_average['funding_proportion'] != 0 else np.nan
    grant_pct_change = metrics_2025['grant_percentage'] - metrics_average['grant_percentage']
    grant_relative_change = (grant_pct_change / metrics_average['grant_percentage'] * 100) if metrics_average['grant_percentage'] != 0 else np.nan
    
    results_data.append({
        'keyword': keyword,
        'funding_proportion_2025': metrics_2025['funding_proportion'],
        'funding_proportion_2015_2024_avg': metrics_average['funding_proportion'],
        'funding_relative_change': funding_relative_change,
        'funding_proportion_change': funding_prop_change,
        'grant_percentage_2025': metrics_2025['grant_percentage'],
        'grant_percentage_2015_2024_avg': metrics_average['grant_percentage'],
        'grant_percentage_change': grant_pct_change,
        'grant_relative_change': grant_relative_change
    })

# Create DataFrame and sort by funding impact
results_df = pd.DataFrame(results_data)

print(f"Results compiled for {len(results_df)} keywords")
print(f"Results dataframe shape: {results_df.shape}")

Compiling final results...
Results compiled for 48275 keywords
Results dataframe shape: (48275, 9)


In [38]:
# Filter for keywords with meaningful presence (more than 0.1% funding in both periods)
significant_keywords = results_df[
    (results_df['funding_proportion_2025'] >= 0.001) & 
    (results_df['funding_proportion_2015_2024_avg'] >= 0.001)
].copy()

print("\n=== SIGNIFICANT KEYWORDS ANALYSIS ===")
print("Keywords meeting significance criteria: {len(significant_keywords):,}")
print("(More than 0.1% funding in both periods)")

print("\nTop 20 keywords by 2025 funding proportion:")
top_significant = significant_keywords.nlargest(20, 'funding_proportion_2025')
display_cols = ['keyword', 'funding_proportion_2025', 'funding_proportion_2015_2024_avg', 
                'funding_proportion_change']
print(top_significant[display_cols].to_string(index=False))


=== SIGNIFICANT KEYWORDS ANALYSIS ===
Keywords meeting significance criteria: {len(significant_keywords):,}
(More than 0.1% funding in both periods)

Top 20 keywords by 2025 funding proportion:
    keyword  funding_proportion_2025  funding_proportion_2015_2024_avg  funding_proportion_change
       data                 0.680227                          0.663730                   0.016497
    testing                 0.650408                          0.674517                  -0.024109
      novel                 0.576807                          0.574065                   0.002741
      goals                 0.559675                          0.587930                  -0.028255
development                 0.554145                          0.557199                  -0.003053
   research                 0.553996                          0.516676                   0.037320
   improved                 0.539220                          0.475109                   0.064111
   modeling          

In [39]:
# Save results to CSV files
print("Saving results to CSV files...")

# Save complete results
results_df.to_csv('export/nih/nih_keyword_analysis_complete.csv', index=False)
print(f"  - Complete results saved: export/nih/nih_keyword_analysis_complete.csv ({len(results_df)} rows)")

# Save significant keywords only
significant_keywords.to_csv('export/nih/nih_keyword_analysis_significant.csv', index=False)
print(f"  - Significant keywords saved: export/nih/nih_keyword_analysis_significant.csv ({len(significant_keywords)} rows)")

# Save top changers for easy review
top_changers = pd.concat([
    significant_keywords.nlargest(50, 'funding_proportion_change'),
    significant_keywords.nsmallest(50, 'funding_proportion_change')
]).drop_duplicates()

top_changers.to_csv('export/nih/nih_keyword_analysis_top_changes.csv', index=False)
print(f"  - Top changes saved: export/nih/nih_keyword_analysis_top_changes.csv ({len(top_changers)} rows)")

print("\nAnalysis complete! ðŸŽ‰")

Saving results to CSV files...
  - Complete results saved: export/nih/nih_keyword_analysis_complete.csv (48275 rows)
  - Significant keywords saved: export/nih/nih_keyword_analysis_significant.csv (6988 rows)
  - Top changes saved: export/nih/nih_keyword_analysis_top_changes.csv (100 rows)

Analysis complete! ðŸŽ‰
