## Step 1: Setup and Data Loading

In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Set display options for better readability
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 100)

print("Libraries imported successfully")

Libraries imported successfully


In [5]:
# Load CSV files, skipping the first 6 lines of metadata
file_paths = {
    2021: './data/2021-total-cost-keywords.csv',
    2022: './data/2022-total-cost-keywords.csv', 
    2023: './data/2023-total-cost-keywords.csv',
    2024: './data/2024-total-cost-keywords.csv',
    2025: './data/2025-total-cost-keywords.csv'
}

# Load all dataframes
dataframes = {}
for year, path in file_paths.items():
    print(f"Loading {year} data...")
    df = pd.read_csv(path, skiprows=6)  # Skip first 6 lines of metadata
    print(f"  - Raw shape: {df.shape}")
    dataframes[year] = df
    
print("\nAll files loaded successfully!")

Loading 2021 data...
  - Raw shape: (11811, 4)
Loading 2022 data...
  - Raw shape: (11921, 4)
Loading 2023 data...
  - Raw shape: (11647, 4)
Loading 2024 data...
  - Raw shape: (10603, 4)
Loading 2025 data...
  - Raw shape: (8493, 4)

All files loaded successfully!
  - Raw shape: (11647, 4)
Loading 2024 data...
  - Raw shape: (10603, 4)
Loading 2025 data...
  - Raw shape: (8493, 4)

All files loaded successfully!


## Step 2: Data Preprocessing

In [6]:
# Examine the structure of one dataframe
print("Sample dataframe structure (2021):")
print(f"Columns: {list(dataframes[2021].columns)}")
print(f"\nFirst few rows:")
print(dataframes[2021].head())
print(f"\nData types:")
print(dataframes[2021].dtypes)

Sample dataframe structure (2021):
Columns: ['Project Terms', 'Total Cost', 'Total Cost(Sub Projects)', 'Unnamed: 3']

First few rows:
                                                                                         Project Terms  \
0                                                                                                        
1                                                                                                        
2  Accounting;Address;Adult;Adverse effects;Adverse event;Age;Alcohol consumption;Alcohols;American...   
3  Affect;African American;Area;Attitude;Big Data;Biological;Biological Markers;Biological Process;...   
4  Activities of Daily Living;Address;Affect;Age;Aging;Anxiety;Biological;Biological Aging;Body mas...   

  Total Cost Total Cost(Sub Projects)  Unnamed: 3  
0     119029                                  NaN  
1     111964                                  NaN  
2     693883                                  NaN  
3     374998        

In [7]:
def preprocess_dataframe(df, year):
    """
    Clean and preprocess the dataframe:
    1. Drop unnecessary columns
    2. Remove rows with empty Project Terms or Total Cost
    3. Clean and validate data types
    """
    print(f"Preprocessing {year} data...")
    
    # Make a copy to avoid modifying original
    df_clean = df.copy()
    
    # Drop unnecessary columns
    columns_to_drop = []
    if 'Total Cost(Sub Projects)' in df_clean.columns:
        columns_to_drop.append('Total Cost(Sub Projects)')
    if 'Unnamed: 3' in df_clean.columns:
        columns_to_drop.append('Unnamed: 3')
    
    if columns_to_drop:
        df_clean = df_clean.drop(columns=columns_to_drop)
        print(f"  - Dropped columns: {columns_to_drop}")
    
    # Remove rows where Project Terms or Total Cost are empty/null
    initial_rows = len(df_clean)
    
    # Remove rows with empty or whitespace-only Project Terms
    df_clean = df_clean[df_clean['Project Terms'].notna()]
    df_clean = df_clean[df_clean['Project Terms'].str.strip() != '']
    
    # Remove rows with empty or null Total Cost
    df_clean = df_clean[df_clean['Total Cost'].notna()]
    
    # Convert Total Cost to numeric, handling any string formatting
    df_clean['Total Cost'] = pd.to_numeric(df_clean['Total Cost'], errors='coerce')
    df_clean = df_clean[df_clean['Total Cost'].notna()]
    
    final_rows = len(df_clean)
    rows_removed = initial_rows - final_rows
    
    print(f"  - Initial rows: {initial_rows}")
    print(f"  - Final rows: {final_rows}")
    print(f"  - Rows removed: {rows_removed}")
    print(f"  - Total funding: ${df_clean['Total Cost'].sum():,.0f}")
    
    return df_clean

# Preprocess all dataframes
clean_dataframes = {}
for year, df in dataframes.items():
    clean_dataframes[year] = preprocess_dataframe(df, year)
    print()

print("Data preprocessing completed!")

Preprocessing 2021 data...
  - Dropped columns: ['Total Cost(Sub Projects)', 'Unnamed: 3']
  - Initial rows: 11811
  - Final rows: 11279
  - Rows removed: 532
  - Total funding: $6,121,521,522

Preprocessing 2022 data...
  - Dropped columns: ['Total Cost(Sub Projects)', 'Unnamed: 3']
  - Initial rows: 11921
  - Final rows: 11402
  - Rows removed: 519
  - Total funding: $7,449,682,187

Preprocessing 2023 data...
  - Dropped columns: ['Total Cost(Sub Projects)', 'Unnamed: 3']
  - Initial rows: 11647
  - Final rows: 11126
  - Rows removed: 521
  - Total funding: $6,506,773,772

Preprocessing 2024 data...
  - Dropped columns: ['Total Cost(Sub Projects)', 'Unnamed: 3']
  - Initial rows: 10603
  - Final rows: 10194
  - Rows removed: 409
  - Total funding: $6,166,728,801

Preprocessing 2025 data...
  - Dropped columns: ['Total Cost(Sub Projects)', 'Unnamed: 3']
  - Initial rows: 8493
  - Final rows: 8171
  - Rows removed: 322
  - Total funding: $5,819,319,619

Data preprocessing completed!


## Step 3: Keyword Extraction and Normalization

In [8]:
def extract_keywords_from_dataframe(df, year):
    """
    Extract all unique keywords from a dataframe's Project Terms column.
    Keywords are separated by semicolons and need to be normalized.
    Also adds a 'keywords_list' column to the dataframe for efficient lookup.
    """
    print(f"Extracting keywords from {year} data...")
    
    all_keywords = set()
    keywords_lists = []
    
    for project_terms in df['Project Terms']:
        if pd.notna(project_terms):
            # Split on semicolon and normalize
            keywords = [kw.strip().lower() for kw in str(project_terms).split(';')]
            # Remove empty keywords
            keywords = [kw for kw in keywords if kw and kw != '']
            all_keywords.update(keywords)
            keywords_lists.append(keywords)
        else:
            keywords_lists.append([])
    
    # Add the keywords list as a new column
    df['keywords_list'] = keywords_lists
    
    print(f"  - Found {len(all_keywords)} unique keywords")
    print(f"  - Added keywords_list column to dataframe")
    return all_keywords

In [9]:
clean_dataframes.keys()

dict_keys([2021, 2022, 2023, 2024, 2025])

In [10]:
clean_dataframes.get(2021).keys()

Index(['Project Terms', 'Total Cost'], dtype='object')

In [11]:
# Extract keywords from each year and update the dataframes
yearly_keywords = {}
for year, df in clean_dataframes.items():
    yearly_keywords[year] = extract_keywords_from_dataframe(df, year)
    # The dataframe is modified in-place by the function, so clean_dataframes[year] now has the keywords_list column
    print(f"  - Updated dataframe columns: {list(df.columns)}")
    print()

Extracting keywords from 2021 data...
  - Found 28914 unique keywords
  - Added keywords_list column to dataframe
  - Updated dataframe columns: ['Project Terms', 'Total Cost', 'keywords_list']

Extracting keywords from 2022 data...
  - Found 29244 unique keywords
  - Added keywords_list column to dataframe
  - Updated dataframe columns: ['Project Terms', 'Total Cost', 'keywords_list']

Extracting keywords from 2023 data...
  - Found 29677 unique keywords
  - Added keywords_list column to dataframe
  - Updated dataframe columns: ['Project Terms', 'Total Cost', 'keywords_list']

Extracting keywords from 2024 data...
  - Found 29244 unique keywords
  - Added keywords_list column to dataframe
  - Updated dataframe columns: ['Project Terms', 'Total Cost', 'keywords_list']

Extracting keywords from 2023 data...
  - Found 29677 unique keywords
  - Added keywords_list column to dataframe
  - Updated dataframe columns: ['Project Terms', 'Total Cost', 'keywords_list']

Extracting keywords from 

In [12]:
# Print sample keywords from 2021
list(yearly_keywords.get(2021))[:10]

['hygromycin a',
 'trimethoprim-sulfamethoxazole',
 'drug resistance in tuberculosis',
 'hormone receptor',
 'neuregulins',
 'cloud storage',
 'artificial intelligence',
 'ambiguous genitalia',
 'biomphalaria',
 'electron transport complex iii']

In [13]:
clean_dataframes.get(2021).head()

Unnamed: 0,Project Terms,Total Cost,keywords_list
2,Accounting;Address;Adult;Adverse effects;Adverse event;Age;Alcohol consumption;Alcohols;American...,693883.0,"[accounting, address, adult, adverse effects, adverse event, age, alcohol consumption, alcohols,..."
3,Affect;African American;Area;Attitude;Big Data;Biological;Biological Markers;Biological Process;...,374998.0,"[affect, african american, area, attitude, big data, biological, biological markers, biological ..."
4,Activities of Daily Living;Address;Affect;Age;Aging;Anxiety;Biological;Biological Aging;Body mas...,885747.0,"[activities of daily living, address, affect, age, aging, anxiety, biological, biological aging,..."
5,Acute;Adult;Affect;Age;Age of Onset;Aging;Animal Model;Animals;Area;Behavior;Behavioral;Biology;...,839120.0,"[acute, adult, affect, age, age of onset, aging, animal model, animals, area, behavior, behavior..."
6,Address;Affect;Affective;Age;Aging;Alzheimer's Disease;Animal Model;Animals;Autonomic nervous sy...,793643.0,"[address, affect, affective, age, aging, alzheimer's disease, animal model, animals, autonomic n..."


In [14]:
'accounting' in yearly_keywords.get(2021)

True

In [15]:

# Create master list of all unique keywords
all_unique_keywords = set()
for keywords in yearly_keywords.values():
    all_unique_keywords.update(keywords)

print(f"\nTotal unique keywords across all years: {len(all_unique_keywords)}")

# Display some sample keywords
sample_keywords = list(all_unique_keywords)[:10]
print("\nSample keywords:")
sample_keywords


Total unique keywords across all years: 41742

Sample keywords:


['hygromycin a',
 'trimethoprim-sulfamethoxazole',
 'drug resistance in tuberculosis',
 'hormone receptor',
 'neuregulins',
 'gemin5 gene',
 'cloud storage',
 'artificial intelligence',
 'ambiguous genitalia',
 'biomphalaria']

## Step 4: Calculate Annual Keyword Metrics

In [16]:
def calculate_keyword_metrics_for_year(df, year, all_keywords):
    """
    For each keyword, calculate:
    - Number of grants containing the keyword
    - Total funding for grants containing the keyword
    """
    print(f"Calculating metrics for {year}...")
    
    keyword_metrics = {}
    total_grants = len(df)
    total_funding = df['Total Cost'].sum()
    
    print(f"  - Processing {len(all_keywords)} keywords for {total_grants} grants")
    
    # Use tqdm for progress bar on large keyword sets
    for keyword in tqdm(all_keywords, desc=f"Processing {year}"):
        # Find grants that contain this keyword using list membership
        mask = df['keywords_list'].apply(lambda kw_list: keyword in kw_list)
        grants_with_keyword = df[mask]
        
        grant_count = len(grants_with_keyword)
        funding_total = grants_with_keyword['Total Cost'].sum() if grant_count > 0 else 0
        
        keyword_metrics[keyword] = {
            'grant_count': grant_count,
            'funding_total': funding_total,
            'grant_percentage': grant_count / total_grants,
            'funding_proportion': funding_total / total_funding
        }
    
    return keyword_metrics, total_grants, total_funding

In [17]:
# Calculate metrics for each year
annual_metrics = {}
annual_totals = {}

for year, df in clean_dataframes.items():
    metrics, total_grants, total_funding = calculate_keyword_metrics_for_year(
        df, year, all_unique_keywords
    )
    annual_metrics[year] = metrics
    annual_totals[year] = {
        'total_grants': total_grants,
        'total_funding': total_funding
    }
    print(f"  - Completed {year}: {total_grants} grants, ${total_funding:,.0f} total funding\n")

print("Annual metrics calculation completed!")

Calculating metrics for 2021...
  - Processing 41742 keywords for 11279 grants


Processing 2021: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 41742/41742 [05:06<00:00, 136.08it/s]
Processing 2021: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 41742/41742 [05:06<00:00, 136.08it/s]


  - Completed 2021: 11279 grants, $6,121,521,522 total funding

Calculating metrics for 2022...
  - Processing 41742 keywords for 11402 grants


Processing 2022: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 41742/41742 [05:20<00:00, 130.17it/s]
Processing 2022: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 41742/41742 [05:20<00:00, 130.17it/s]


  - Completed 2022: 11402 grants, $7,449,682,187 total funding

Calculating metrics for 2023...
  - Processing 41742 keywords for 11126 grants


Processing 2023: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 41742/41742 [05:05<00:00, 136.46it/s]
Processing 2023: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 41742/41742 [05:05<00:00, 136.46it/s]


  - Completed 2023: 11126 grants, $6,506,773,772 total funding

Calculating metrics for 2024...
  - Processing 41742 keywords for 10194 grants


Processing 2024: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 41742/41742 [04:40<00:00, 148.79it/s]
Processing 2024: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 41742/41742 [04:40<00:00, 148.79it/s]


  - Completed 2024: 10194 grants, $6,166,728,801 total funding

Calculating metrics for 2025...
  - Processing 41742 keywords for 8171 grants


Processing 2025: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 41742/41742 [03:44<00:00, 185.99it/s]

  - Completed 2025: 8171 grants, $5,819,319,619 total funding

Annual metrics calculation completed!





In [23]:
annual_metrics.get(2025).get('extend lifespan')

{'grant_count': 71,
 'funding_total': np.float64(60885527.0),
 'grant_percentage': 0.008689266919593685,
 'funding_proportion': np.float64(0.010462653881599763)}

In [24]:
annual_totals

{2021: {'total_grants': 11279, 'total_funding': np.float64(6121521522.0)},
 2022: {'total_grants': 11402, 'total_funding': np.float64(7449682187.0)},
 2023: {'total_grants': 11126, 'total_funding': np.float64(6506773772.0)},
 2024: {'total_grants': 10194, 'total_funding': np.float64(6166728801.0)},
 2025: {'total_grants': 8171, 'total_funding': np.float64(5819319619.0)}}

## Step 5: Compute Proportional Comparisons

In [25]:
# Calculate 2021-2024 average metrics
print("Calculating 2021-2024 average metrics...")

baseline_years = [2021, 2022, 2023, 2024]
average_metrics = {}

print(f"Average period (2021-2024):")
for year in baseline_years:
    print(f"  - {year}: {annual_totals[year]['total_grants']:,} grants, ${annual_totals[year]['total_funding']:,.0f} funding")

Calculating 2021-2024 average metrics...
Average period (2021-2024):
  - 2021: 11,279 grants, $6,121,521,522 funding
  - 2022: 11,402 grants, $7,449,682,187 funding
  - 2023: 11,126 grants, $6,506,773,772 funding
  - 2024: 10,194 grants, $6,166,728,801 funding


In [26]:
# Calculate average metrics for each keyword across 2021-2024
for keyword in tqdm(all_unique_keywords, desc="Calculating average metrics"):
    # Get proportions and percentages for each year
    yearly_funding_props = [annual_metrics[year][keyword]['funding_proportion'] for year in baseline_years]
    yearly_grant_pcts = [annual_metrics[year][keyword]['grant_percentage'] for year in baseline_years]
    
    # Calculate averages
    avg_funding_proportion = sum(yearly_funding_props) / len(baseline_years)
    avg_grant_percentage = sum(yearly_grant_pcts) / len(baseline_years)
    
    average_metrics[keyword] = {
        'funding_proportion': avg_funding_proportion,
        'grant_percentage': avg_grant_percentage
    }


Calculating average metrics: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 41742/41742 [00:00<00:00, 188713.16it/s]
Calculating average metrics: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 41742/41742 [00:00<00:00, 188713.16it/s]


In [28]:
average_metrics_sample = dict(list(average_metrics.items())[:5])
print("\nSample average metrics (2021-2024):")
for keyword, metrics in average_metrics_sample.items():
    print(f"  - {keyword}: {metrics}")


Sample average metrics (2021-2024):
  - hygromycin a: {'funding_proportion': np.float64(3.161170211666577e-05), 'grant_percentage': 6.680006146139561e-05}
  - trimethoprim-sulfamethoxazole: {'funding_proportion': np.float64(0.00016886371541529179), 'grant_percentage': 0.0002525351956814835}
  - drug resistance in tuberculosis: {'funding_proportion': np.float64(0.0015145684614997837), 'grant_percentage': 0.001641937980746032}
  - hormone receptor: {'funding_proportion': np.float64(0.0009790053132811688), 'grant_percentage': 0.0012835211932472712}
  - neuregulins: {'funding_proportion': np.float64(9.515393005277545e-05), 'grant_percentage': 0.0001386232164763954}


## Step 6: Compile Final Results

In [37]:
# Create final comparison dataframe
print("Compiling final results...")

results_data = []

for keyword in all_unique_keywords:
    # Get 2025 metrics
    metrics_2025 = annual_metrics[2025][keyword]
    metrics_average = average_metrics[keyword]
    
    # Calculate changes
    funding_prop_change = metrics_2025['funding_proportion'] - metrics_average['funding_proportion']
    funding_relative_change = (funding_prop_change / metrics_average['funding_proportion'] * 100) if metrics_average['funding_proportion'] != 0 else np.nan
    grant_pct_change = metrics_2025['grant_percentage'] - metrics_average['grant_percentage']
    grant_relative_change = (grant_pct_change / metrics_average['grant_percentage'] * 100) if metrics_average['grant_percentage'] != 0 else np.nan
    
    results_data.append({
        'keyword': keyword,
        'funding_proportion_2025': metrics_2025['funding_proportion'],
        'funding_proportion_2021_2024_avg': metrics_average['funding_proportion'],
        'funding_relative_change': funding_relative_change,
        'funding_proportion_change': funding_prop_change,
        'grant_percentage_2025': metrics_2025['grant_percentage'],
        'grant_percentage_2021_2024_avg': metrics_average['grant_percentage'],
        'grant_percentage_change': grant_pct_change,
        'grant_relative_change': grant_relative_change
    })

# Create DataFrame and sort by funding impact
results_df = pd.DataFrame(results_data)

print(f"Results compiled for {len(results_df)} keywords")
print(f"Results dataframe shape: {results_df.shape}")

Compiling final results...
Results compiled for 41742 keywords
Results dataframe shape: (41742, 9)


In [45]:
# Filter for keywords with meaningful presence (appear in at least 10 grants or $1M+ funding)
significant_keywords = results_df[
    (results_df['funding_proportion_2025'] >= 0.001) & 
    (results_df['funding_proportion_2021_2024_avg'] >= 0.001)
].copy()

print(f"\n=== SIGNIFICANT KEYWORDS ANALYSIS ===")
print(f"Keywords meeting significance criteria: {len(significant_keywords):,}")
print(f"(At least 10 grants in any period OR $1M+ funding in any period)")

print(f"\nTop 20 keywords by 2025 funding proportion:")
top_significant = significant_keywords.nlargest(20, 'funding_proportion_2025')
display_cols = ['keyword', 'funding_proportion_2025', 'funding_proportion_2021_2024_avg', 
                'funding_proportion_change']
print(top_significant[display_cols].to_string(index=False))


=== SIGNIFICANT KEYWORDS ANALYSIS ===
Keywords meeting significance criteria: 7,386
(At least 10 grants in any period OR $1M+ funding in any period)

Top 20 keywords by 2025 funding proportion:
    keyword  funding_proportion_2025  funding_proportion_2021_2024_avg  funding_proportion_change
       data                 0.680227                          0.672186                   0.008041
    testing                 0.650408                          0.660386                  -0.009978
      novel                 0.576807                          0.580245                  -0.003438
      goals                 0.559675                          0.596788                  -0.037113
development                 0.554145                          0.557139                  -0.002994
   research                 0.553996                          0.532708                   0.021287
   improved                 0.539220                          0.496699                   0.042521
   modeling          

In [46]:
# Save results to CSV files
print("Saving results to CSV files...")

# Save complete results
results_df.to_csv('nih_keyword_analysis_complete.csv', index=False)
print(f"  - Complete results saved: nih_keyword_analysis_complete.csv ({len(results_df)} rows)")

# Save significant keywords only
significant_keywords.to_csv('nih_keyword_analysis_significant.csv', index=False)
print(f"  - Significant keywords saved: nih_keyword_analysis_significant.csv ({len(significant_keywords)} rows)")

# Save top changers for easy review
top_changers = pd.concat([
    significant_keywords.nlargest(50, 'funding_proportion_change'),
    significant_keywords.nsmallest(50, 'funding_proportion_change')
]).drop_duplicates()

top_changers.to_csv('nih_keyword_analysis_top_changes.csv', index=False)
print(f"  - Top changes saved: nih_keyword_analysis_top_changes.csv ({len(top_changers)} rows)")

print("\nAnalysis complete! ðŸŽ‰")

Saving results to CSV files...
  - Complete results saved: nih_keyword_analysis_complete.csv (41742 rows)
  - Significant keywords saved: nih_keyword_analysis_significant.csv (7386 rows)
  - Top changes saved: nih_keyword_analysis_top_changes.csv (100 rows)

Analysis complete! ðŸŽ‰
  - Complete results saved: nih_keyword_analysis_complete.csv (41742 rows)
  - Significant keywords saved: nih_keyword_analysis_significant.csv (7386 rows)
  - Top changes saved: nih_keyword_analysis_top_changes.csv (100 rows)

Analysis complete! ðŸŽ‰
