<a href="https://colab.research.google.com/github/chris-creditdesign/nih-grant-terminations/blob/main/most_common_terms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Setup
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Load the raw grant-termination export (path is the Colab upload location)
df = pd.read_csv(
  "/content/2024-03-31-NIH Grant Terminations_ Auto-Generated - grant_data.csv"
)

# Keep only rows whose 'cancellation_source' is one of the three accepted values:
# 'HHS reported', 'Self reported' or 'Self and HHS reported'; every other row is dropped
df_confirmed = df.loc[
  df['cancellation_source'].isin(['HHS reported', 'Self reported', 'Self and HHS reported'])
]
total_grants_count = df_confirmed.shape[0]
print(f"There are {total_grants_count} rows in the dataframe.")

There are 704 rows in the dataframe.


In [10]:
# Flatten the comma-separated 'terms' column into a flat list with one entry
# per term occurrence (lower-cased, surrounding whitespace removed).
#
# Missing values are dropped before splitting: str(NaN) is the text 'nan',
# which would otherwise be counted as a real term for every row without terms.
all_terms = [
  term
  for terms_string in df_confirmed["terms"].dropna()
  for term in (part.strip().lower() for part in str(terms_string).split(','))
  if term  # skip empty fragments left by trailing or doubled commas
]

all_terms_series = pd.Series(all_terms)

In [55]:
# Count the occurrences of each distinct term, ordered from most to least frequent
all_terms_count = (
  all_terms_series
  .value_counts()
  .sort_values(ascending=False)
)

In [122]:
# Report total term occurrences, distinct terms, and terms seen more than once
print(
  f"There are {all_terms_series.size} terms listed.",
  f"With {all_terms_count.size} individual terms.",
  f"{int((all_terms_count > 1).sum())} terms are listed more than once.",
  sep="\n",
)

There are 68445 terms listed.
With 7734 individual terms.
4376 terms are listed more than once.


In [75]:
# Terms (lower-case, exact match) grouped under the trans-health topic
trans_filter_terms = [
  "affirming care",
  "assigned at birth",
  "assigned male at birth",
  "assigned female at birth",
  "gender diversity",
  "gender identity",
  "gender affirming care",
  "non-binary",
  "nonbinary",
  "transgender",
]

In [99]:
# Terms (lower-case, exact match) grouped under the COVID-19 topic
covid_filter_terms = [
    "covid-19 assay",
    "covid-19 complications",
    "covid-19 detection",
    "covid-19 diagnosis",
    "covid-19 diagnostic",
    "covid-19 disparity",
    "covid-19 impact",
    "covid-19 incidence",
    "covid-19 misinformation",
    "covid-19 monitoring",
    "covid-19 morbidity",
    "covid-19 mortality",
    "covid-19 outbreak",
    "covid-19 pandemic effects",
    "covid-19 pandemic",
    "covid-19 pathogenesis",
    "covid-19 patient",
    "covid-19 pneumonia",
    "covid-19 point of care",
    "covid-19 prevalence",
    "covid-19 prevention network",
    "covid-19 prevention",
    "covid-19 risk",
    "covid-19 screening",
    "covid-19 severity",
    "covid-19 stress",
    "covid-19 surveillance",
    "covid-19 survivors",
    "covid-19 susceptibility",
    "covid-19 test",
    "covid-19 testing",
    "covid-19 therapeutics",
    "covid-19 treatment",
    "covid-19 vaccination",
    "covid-19 vaccine",
    "covid-19",
]

In [96]:
# Terms (lower-case, exact match) grouped under the HIV topic
hiv_filter_terms = [
    "hiv anti-retroviral",
    "hiv antiretroviral",
    "hiv diagnosis",
    "hiv disparities",
    "hiv drug resistance",
    "hiv infections",
    "hiv prevention trial",
    "hiv prevention trials network",
    "hiv prevention",
    "hiv problem",
    "hiv reservoir",
    "hiv resistance",
    "hiv risk",
    "hiv seronegativity",
    "hiv seropositivity",
    "hiv seroprevalence",
    "hiv therapy",
    "hiv transmission",
    "hiv vaccine trials network",
    "hiv vaccine",
    "hiv",
    "hiv-1 integrase",
    "hiv-1",
    "hiv-associated neurocognitive disorder",
    "hiv-exposed uninfected infant",
    "hiv-infected adolescents",
    "hiv/aids",
    "hiv/hcv",
    "hiv/std",
    "hiv/tb",
]

In [100]:
# Terms (lower-case, exact match) grouped under the climate topic
climate_filter_terms = [
    "climate adaptation",
    "climate change",
    "climate crisis",
    "climate data",
    "climate disaster",
    "climate impact",
    "climate policy",
    "climate science",
    "climate variability",
    "climate zone",
    "climate",
    "climate-related health",
]

In [102]:
# Map each topic's display label to the list of terms belonging to that topic;
# insertion order is the order in which topics are reported
filters = dict(
    [
        ("Trans Health Related", trans_filter_terms),
        ("COVID-19 Related", covid_filter_terms),
        ("HIV Related", hiv_filter_terms),
        ("Climate related", climate_filter_terms),
    ]
)

In [114]:
# Convert the term -> frequency Series into a two-column table ('term', 'count')
all_terms_count_df = all_terms_count.rename_axis('term').reset_index(name='count')

In [132]:
# Print the first ten rows of the frequency table as "term: count" lines
print("Top 10 terms:")
for term, count in all_terms_count_df[['term', 'count']].head(10).itertuples(index=False, name=None):
  print(f"{term}: {count}")

Top 10 terms:
research: 546
data: 458
address: 436
experience: 423
population: 394
individual: 390
health: 382
improved: 379
goals: 379
outcome: 373


In [124]:
# For every topic in `filters`, print the five most frequent terms that belong to it.
# The (label, terms) pair is unpacked into descriptive names rather than bound to
# `filter`, which would shadow Python's built-in filter() in every later cell.
for topic_label, topic_terms in filters.items():
  print(f"{topic_label}:")
  in_topic = all_terms_count_df['term'].isin(topic_terms)
  top_5 = (
    all_terms_count_df.loc[in_topic, ['term', 'count']]
    .sort_values(by='count', ascending=False)
    .head(5)
  )
  for term, count in top_5.itertuples(index=False, name=None):
    print(f"{term}: {count}")

  # Visual gap between topics
  print("\n")

Trans Health Related:
transgender: 125
gender identity: 98
nonbinary: 42
gender diversity: 23
gender affirming care: 21


COVID-19 Related:
covid-19: 94
covid-19 pandemic: 66
covid-19 vaccine: 30
covid-19 vaccination: 22
covid-19 impact: 17


HIV Related:
hiv: 208
hiv infections: 91
hiv risk: 55
hiv seronegativity: 43
hiv seropositivity: 35


Climate related:
climate: 28
climate change: 12
climate-related health: 8
climate zone: 3
climate adaptation: 3


