### This code is meant to extract specific information from the NIH website related to various RADx projects
#### Each block of code parses the downloaded page, pulls out one specific field for every project, and stores the values in a list
#### Once all of the lists are built, they are combined into a single pandas DataFrame

In [2]:
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
import requests


# RADx funding page on nih.gov. The '#radx-tech-atp-funded' part is a URL fragment
# (an in-page anchor for browsers); the whole page is downloaded regardless.
url = 'https://www.nih.gov/research-training/medical-research-initiatives/radx/funding#radx-tech-atp-funded'

# requests applies no timeout unless one is given, so without it a stalled
# connection would leave this cell hanging indefinitely.
req = requests.get(url, timeout=30)

# Fail here on a 4xx/5xx answer rather than letting the later cells parse an
# error page and quietly produce empty or misleading lists.
req.raise_for_status()

print(req)

<Response [200]>


In [3]:
# Parse the downloaded HTML into a searchable tree. The parser is named explicitly:
# when it is left out, BeautifulSoup emits a warning and picks whichever parser
# library happens to be installed, so the tree could differ between machines.
# 'html.parser' ships with Python, so no extra dependency is needed.
soup = BeautifulSoup(req.content, 'html.parser')
#print(soup.prettify())

In [4]:
# Collect every project title into "pTitles" and, index-aligned with it, the RADx
# program label of that project into "proj".
callout_class = 'callout callout-blue-light callout-no-rounded-corners callout-with-blue-bottom-border'
elements = soup.find_all('div', class_=callout_class)

pTitles, proj = [], []
curProj = 'RADx-UP'  # label applied until the first "Description:" callout is encountered
for callout in elements:
    callout_text = callout.get_text()
    if "Description:" not in callout_text:
        # Everything before the 'Awardee' label holds the title. The first 16
        # characters are dropped -- presumably the title label plus leading
        # whitespace; verify against the page markup if the layout changes.
        before_awardee = callout_text.split('Awardee')[0]
        pTitles.append(before_awardee[16:])
        proj.append(curProj)
    else:
        # "Description:" callouts (the RADx-TECH entries, per the original note)
        # are skipped; every project found after one is labelled RADx-rad.
        curProj = 'RADx-rad'

#print(len(pTitles))
#print(proj)


In [5]:
# Build the "awardees" list: one awardee string per project callout.
callout_class = 'callout callout-blue-light callout-no-rounded-corners callout-with-blue-bottom-border'
elements = soup.find_all('div', class_=callout_class)

awardees = []
for callout in elements:
    callout_text = callout.get_text()
    if "Description:" in callout_text:
        # "Description:" callouts (RADx-TECH, per the original note) are skipped.
        continue
    # Keep what sits between the 'Awardee' label and the 'Project Number' label,
    # then trim the label's colon and the surrounding whitespace.
    after_awardee_label = callout_text.split('Awardee')[1]
    raw_awardee = after_awardee_label.split('Project Number')[0]
    awardees.append(raw_awardee.strip(':').strip())
#print(len(awardees))

In [6]:
# Collect each project's number into "pNums" and the link attached to that number
# into "pNumLinks". Exactly one entry is appended to EACH list per project, so both
# lists stay index-aligned with the other per-project lists built in this notebook.

pNums = []
pNumLinks = []
elements = soup.find_all('div', class_ = 'callout callout-blue-light callout-no-rounded-corners callout-with-blue-bottom-border' )
for element in elements:
    text = element.get_text()
    if "Description:" in text:
        # "Description:" callouts (RADx-TECH, per the original note) are skipped.
        continue

    # Text between the 'Project Number' label and the 'Funding Opportunity
    # Announcement' label, with the label's colon and outer whitespace trimmed.
    pNumClean = text.split('Project Number')[1].split('Funding Opportunity Announcement')[0].strip(':').strip()

    if "Summary" in pNumClean:
        # This callout has no FOA label, so the slice above ran on into the summary.
        # Cut it at 'Summary' and trim the whitespace left in front of that label.
        pNums.append(pNumClean.split('Summary')[0].strip())
        pNumLinks.append('NA')  # such callouts have no project number link
        continue

    pNums.append(pNumClean)
    # Take the first anchor whose text contains the project number. Fall back to 'NA'
    # when none matches, and ignore any further matches: appending zero or several
    # links here would shift every later row's link onto the wrong project.
    matching_hrefs = (link.get('href') for link in element.find_all('a') if pNumClean in link.get_text())
    pNumLinks.append(next(matching_hrefs, 'NA'))

        
#print(len(pNumLinks))

        

In [7]:
# Collect the Funding Opportunity Announcement (FOA) number(s) of each project into
# "foas" and the matching link(s) into "foaLinks". Exactly one entry is appended to
# EACH list per project so both stay index-aligned with the other per-project lists.
foas = []
foaLinks = []

elements = soup.find_all('div', class_ = 'callout callout-blue-light callout-no-rounded-corners callout-with-blue-bottom-border' )
for element in elements:
    text = element.get_text()
    if "Description:" in text:
        # "Description:" callouts (the RADx-Tech sections) are skipped.
        continue

    if 'Funding Opportunity Announcement' not in text:
        # No FOA in this callout: record a placeholder in both lists.
        foas.append('NA')
        foaLinks.append('NA')
        continue

    # Text between the FOA label and the 'Summary' label, trimmed; non-breaking
    # spaces (seen when two or more FOAs are listed) become ordinary spaces.
    foaClean = text.split('Funding Opportunity Announcement')[1].split('Summary')[0].strip(':').strip()
    foaClean = foaClean.replace('\xa0', ' ')
    foas.append(foaClean)

    links = element.find_all('a')
    if len(links) == 3 and "PA-" in links[1].get_text():
        # Two FOAs: the 2nd and 3rd anchors are stored together as one entry.
        foaLinks.append(links[1].get('href') + " " + links[2].get('href'))
    else:
        # Single FOA: take the first anchor whose text contains it, comparing with the
        # same non-breaking-space normalisation applied to foaClean. Fall back to 'NA'
        # when none matches; appending zero or several links would misalign the lists.
        matching_hrefs = (
            link.get('href')
            for link in links
            if foaClean in link.get_text().replace('\xa0', ' ')
        )
        foaLinks.append(next(matching_hrefs, 'NA'))

#print(foas)
#print(foaLinks)    

In [8]:
# Collect the summary text of each project into "summarys", one entry per project.
summarys = []

elements = soup.find_all('div', class_ = 'callout callout-blue-light callout-no-rounded-corners callout-with-blue-bottom-border' )
for element in elements:
    text = element.get_text()
    if "Description:" in text:
        # "Description:" callouts (the RADx-Tech sections) are skipped.
        continue

    # Split only at the FIRST 'Summary' label. Splitting at every occurrence and
    # keeping piece [1] would cut the summary short whenever the summary text
    # itself contains the word "Summary".
    _, label, summary_text = text.partition('Summary')

    # 'label' is empty when the callout has no 'Summary' at all; record 'NA' (the
    # placeholder used elsewhere in this notebook) so the list keeps one entry per project.
    summarys.append(summary_text.strip(':').strip() if label else 'NA')

#print(len(summarys))

In [9]:
# The three RADx programs are RADx-UP, RADx-TECH, and RADx-rad.

# Pair each output column with the list that fills it; the order of the pairs is
# the column order of the resulting table.
column_sources = [
    ('radx_project', proj),
    ('project_title', pTitles),
    ('awardee', awardees),
    ('project_number', pNums),
    ('pNum_link', pNumLinks),
    ('foa', foas),
    ('foa_link', foaLinks),
    ('summary', summarys),
]

# Start from an empty frame that already has every column, then fill the columns
# one by one. pandas raises if a list's length does not match the rows already set.
myDf = pd.DataFrame(columns=[column_name for column_name, _ in column_sources])
for column_name, column_values in column_sources:
    myDf[column_name] = column_values

# Last expression of the cell: renders the table as the cell output.
myDf

Unnamed: 0,radx_project,project_title,awardee,project_number,pNum_link,foa,foa_link,summary
0,RADx-UP,Safer At School Early Alert (SASEA),University of California San Diego,1U01HD108787-01,https://reporter.nih.gov/search/44g5CsIy-EGIvK...,RFA-OD-21-008,https://grants.nih.gov/grants/guide/rfa-files/...,"This project plans to expand, evaluate, and im..."
1,RADx-UP,Sin Duda: a community-driven approach to expan...,Johns Hopkins University,1U01MD017412-01,https://reporter.nih.gov/search/KxUeA7_GskO4nP...,RFA-OD-21-008,https://grants.nih.gov/grants/guide/rfa-files/...,This project will expand on their Phase 1 proj...
2,RADx-UP,Using the multiphase optimization strategy (MO...,New York University,1U01MD017418-01,https://reporter.nih.gov/search/HI_N1l8IJEunmc...,RFA-OD-21-008,https://grants.nih.gov/grants/guide/rfa-files/...,This study addresses community-engaged interve...
3,RADx-UP,COVID-19 testing and vaccination social networ...,University of Chicago,1U01MD017414-01,https://reporter.nih.gov/search/DIQGq754ekySmh...,RFA-OD-21-008,https://grants.nih.gov/grants/guide/rfa-files/...,This study will test the efficacy of an implem...
4,RADx-UP,Motivational enhancement to augment contingenc...,University of Oregon,1U01DA055982-01,https://reporter.nih.gov/search/xV5Tg1M_dkWjeW...,RFA-OD-21-008,https://grants.nih.gov/grants/guide/rfa-files/...,This study will be a randomized comparative ef...
...,...,...,...,...,...,...,...,...
173,RADx-rad,Broad-spectrum Detection of VOC and Non-VOC Bi...,University of Washington,3U01HL152401-02S1,https://projectreporter.nih.gov/project_info_d...,PA-20-272 and NOT-OD-21-035,https://grants.nih.gov/grants/guide/pa-files/p...,This project will develop a biomimetic olfacti...
174,RADx-rad,A Rapid Saliva Antigen Test for SARS-CoV-2 Det...,Brigham and Women’s Hospital,3U54HL119145-07S1,https://projectreporter.nih.gov/project_info_d...,PA-20-272 and NOT-OD-21-035,https://grants.nih.gov/grants/guide/pa-files/p...,This project will develop an ultrasensitive po...
175,RADx-rad,A Rapid Breathalyzer Diagnostics Platform for ...,Rutgers University,5U01HL150852-02,https://projectreporter.nih.gov/project_info_d...,PA-20-272 and NOT-OD-21-035,https://grants.nih.gov/grants/guide/pa-files/p...,This project will develop a novel testing plat...
176,RADx-rad,"RADx-rad: A Rapid, Sensitive, Point-of-care, A...",Boston Biomedical Innovation Center (B-BIC),5U54HL119145-07,https://projectreporter.nih.gov/project_info_d...,PA-20-272 and NOT-OD-21-035,https://grants.nih.gov/grants/guide/pa-files/p...,"This project will develop a simple, low-cost, ..."


In [10]:
# Save the assembled table as a CSV file in the current working directory.
# The row index is written as the first (unnamed) column, which is the pandas default.
output_csv = 'radx_projects.csv'
myDf.to_csv(output_csv, index=True)