# Downloading all 510(k)s and De Novos from the FDA website

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import os
import PyPDF2
import fitz
import re

In [2]:
# Collect the identifier (column 0) of every record in the pipe-delimited
# 510(k) release file whose date column (index 11) has a year of 2015 or later.
# NOTE(review): index 11 is presumably the decision date (MM/DD/YYYY) -- confirm
# against the header row of pmn96cur.txt.
knums = []
with open('raw_files/pmn96cur.txt', 'r', errors='ignore') as fp:
    lines = fp.readlines()
    for line in lines[1:]:  # lines[0] is the header row
        features = line.split('|')
        # Index 11 requires at least 12 fields; the previous `>= 11` guard let
        # 11-field rows through and raised IndexError on features[11].
        if len(features) <= 11:
            continue
        date = features[11]
        try:
            year = int(date.split('/')[2])
        except (IndexError, ValueError):
            # Blank or malformed date: skip the row instead of aborting the read.
            continue
        if year >= 2015:
            knums.append(features[0])


In [3]:
# Display how many identifiers were collected into knums.
len(knums)

17817

In [8]:
# Scrape loop for 510(k) / De Novo records.
#
# For every identifier in `knums` this cell:
#   1. downloads the FDA database page to FDA_htmls/<id>.html,
#   2. parses the metadata table rows into a dict using `mapping`
#      (substring of the row's <th> text -> output field name),
#   3. downloads the linked summary PDF to FDA_pdfs/,
#   4. extracts the PDF text, collapses every non-alphanumeric run to a single
#      space, and writes it to FDA_texts/.
#
# Identifiers whose HTML file already exists are skipped entirely (no parsing,
# no PDF download), so a re-run only processes what is still missing and
# `entries` only contains records handled in the current run.
#
# `link_deno`, `link_510k` and `mapping` are expected to be defined by earlier
# cells.
entries = []


def _clean_text(value):
    """Trim, lower-case, and replace newlines/carriage returns/tabs with spaces."""
    return value.strip().lower().replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')


for i, approval in enumerate(knums):
    if i % 1000 == 0:
        print(i)
    if approval.startswith('DEN'):
        fda_link = link_deno
    elif approval.startswith('K'):
        fda_link = link_510k
    else:
        # Unknown prefix: skip this record. (This used to `break`, which
        # silently abandoned every remaining identifier.)
        continue

    # Download HTML from FDA website
    html_path = 'FDA_htmls/{}.html'.format(approval)
    if os.path.exists(html_path):
        continue
    try:
        # A timeout keeps one stalled connection from hanging the whole run;
        # raise_for_status keeps error pages from being cached as valid HTML.
        r = requests.get(fda_link + approval, timeout=60)
        r.raise_for_status()
    except requests.RequestException as err:
        print('Could not download HTML for {}: {}'.format(approval, err))
        continue
    # Explicit UTF-8 so the write/read round trip does not depend on the
    # platform's default encoding.
    with open(html_path, 'w', encoding='utf-8') as fp:
        fp.write(r.text)

    # Parse metadata from FDA website
    # NOTE(review): no parser is named, so bs4 picks the best one installed.
    with open(html_path, 'r', encoding='utf-8') as fp:
        soup = BeautifulSoup(fp)
    entry = {'query_id': approval}
    for table in soup.findAll('table', {'align': 'center', 'style': 'text-transform: capitalize'}):
        for tr in table.findAll('tr'):
            header = tr.find('th')
            if header is None:
                continue
            th = header.text

            for k, v in mapping.items():
                if k not in th:
                    continue
                if v == 'summary_link':
                    # Rows without a usable link are skipped rather than
                    # crashing on a missing <a> tag.
                    anchor = tr.find('a', href=True)
                    if anchor is not None and anchor['href']:
                        entry[v] = _clean_text(anchor['href'])
                else:
                    cell = tr.find('td')
                    if cell is not None and cell.text:
                        entry[v] = _clean_text(cell.text)

    # `entry` always holds 'query_id', so the old `len(entry) > 0` test could
    # never fail. Keep the record and report pages where nothing was parsed.
    if len(entry) == 1:
        print('No values found for HTML file {}'.format(approval))
    entries.append(entry)
    if i % 100 == 0:
        # Periodic checkpoint.
        # NOTE(review): this path differs from the final 'aiml_dfs/scrape.csv'.
        df = pd.DataFrame(entries)
        df.to_csv('scrape.csv')

    # Download PDF from FDA website
    if 'summary_link' not in entry:
        continue
    # Fall back to the queried identifier when the page gave no approval number.
    doc_id = entry.get('approval_number', approval)
    pdf_path = 'FDA_pdfs/{}.pdf'.format(doc_id)
    try:
        r = requests.get(entry['summary_link'], timeout=60)
        r.raise_for_status()
    except requests.RequestException as err:
        print('Could not download PDF for {}: {}'.format(approval, err))
        continue
    with open(pdf_path, 'wb') as fp:
        fp.write(r.content)

    # Extract text from the downloaded PDF
    try:
        with fitz.open(pdf_path) as doc:
            # Newer PyMuPDF names this Page.get_text(); older releases only
            # have getText().
            pdf_text = ''.join(
                page.get_text() if hasattr(page, 'get_text') else page.getText()
                for page in doc
            )
    except Exception as err:
        # Best effort: an unreadable PDF should not stop the scrape, but say so.
        print('Could not extract text for {}: {}'.format(approval, err))
        continue
    pdf_text = re.sub('[^0-9a-zA-Z]+', ' ', pdf_text)
    with open('FDA_texts/{}.txt'.format(doc_id), 'w') as fp:
        fp.write(pdf_text)

df = pd.DataFrame(entries)
df.to_csv('aiml_dfs/scrape.csv')

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000


# Downloading PMAs from the FDA database

In [10]:
# Collect the identifier (column 0) of every record in the pipe-delimited PMA
# release file whose date column (index 17) has a year of 2015 or later.
# NOTE(review): index 17 is presumably the decision date (MM/DD/YYYY) -- confirm
# against the header row of pma.txt.
nums = []
with open('raw_files/pma.txt', 'r', errors='ignore') as fp:
    lines = fp.readlines()
    for line in lines[1:]:  # lines[0] is the header row
        features = line.split('|')
        # Index 17 requires at least 18 fields; the previous `>= 17` guard let
        # 17-field rows through and raised IndexError on features[17].
        if len(features) <= 17:
            continue
        date = features[17]
        try:
            year = int(date.split('/')[2])
        except (IndexError, ValueError):
            # Blank or malformed date: skip the row instead of aborting the read.
            continue
        if year >= 2015:
            nums.append(features[0])


In [11]:
# URL prefix for the FDA PMA database page; the download loop appends a PMA
# identifier to it to build each request URL.
link_pma = 'https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpma/pma.cfm?id='


In [20]:
# Scrape loop for PMA records.
#
# For every identifier in `nums` this cell:
#   1. downloads the FDA database page to FDA_htmls/<id>.html,
#   2. looks for a table row whose text contains 'Summary' and takes its link,
#   3. downloads the linked PDF to FDA_pdfs/<id>.pdf,
#   4. extracts the PDF text, collapses every non-alphanumeric run to a single
#      space, and writes it to FDA_texts/<id>.txt.
#
# Identifiers whose HTML file already exists are skipped entirely, so a re-run
# only processes what is still missing. `entries` is initialised but nothing is
# appended to it or written to CSV in this cell.
entries = []

for i, approval in enumerate(nums):
    if i % 1000 == 0:
        print(i)

    # Download HTML from FDA website
    html_path = 'FDA_htmls/{}.html'.format(approval)
    if os.path.exists(html_path):
        continue
    try:
        # A timeout keeps one stalled connection from hanging the whole run;
        # raise_for_status keeps error pages from being cached as valid HTML.
        r = requests.get(link_pma + approval, timeout=60)
        r.raise_for_status()
    except requests.RequestException as err:
        print('Could not download HTML for {}: {}'.format(approval, err))
        continue
    # Explicit UTF-8 so the write/read round trip does not depend on the
    # platform's default encoding.
    with open(html_path, 'w', encoding='utf-8') as fp:
        fp.write(r.text)

    # Parse metadata from FDA website
    # NOTE(review): no parser is named, so bs4 picks the best one installed.
    with open(html_path, 'r', encoding='utf-8') as fp:
        soup = BeautifulSoup(fp)
    entry = {'query_id': approval}
    for table in soup.findAll('table', {'align': 'center', 'style': 'text-transform: capitalize'}):
        for tr in table.findAll('tr'):
            if 'Summary' not in tr.text:
                continue
            # A 'Summary' row without a usable link is skipped rather than
            # crashing on a missing <a> tag.
            anchor = tr.find('a', href=True)
            if anchor is None or not anchor['href']:
                continue
            link = anchor['href']
            entry['summary_link'] = link.strip().lower().replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')

    # Download PDF from FDA website
    if 'summary_link' not in entry:
        continue
    pdf_path = 'FDA_pdfs/{}.pdf'.format(entry['query_id'])
    try:
        r = requests.get(entry['summary_link'], timeout=60)
        r.raise_for_status()
    except requests.RequestException as err:
        print('Could not download PDF for {}: {}'.format(approval, err))
        continue
    with open(pdf_path, 'wb') as fp:
        fp.write(r.content)

    # Extract text from the downloaded PDF
    try:
        with fitz.open(pdf_path) as doc:
            # Newer PyMuPDF names this Page.get_text(); older releases only
            # have getText().
            pdf_text = ''.join(
                page.get_text() if hasattr(page, 'get_text') else page.getText()
                for page in doc
            )
    except Exception as err:
        # Best effort: an unreadable PDF should not stop the scrape, but say so.
        print('Could not extract text for {}: {}'.format(approval, err))
        continue
    pdf_text = re.sub('[^0-9a-zA-Z]+', ' ', pdf_text)
    with open('FDA_texts/{}.txt'.format(entry['query_id']), 'w') as fp:
        fp.write(pdf_text)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
