In [None]:
# Libs for web scraping
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re


In [None]:
# Listing page that links to every NICE DSU Technical Support Document
url = "https://sheffield.ac.uk/nice-dsu/tsds/full-list"
# Directory (relative to the working directory) that receives the PDFs
folder_name = "NICE_TSD_PDFs"

# Create the folder if it doesn't exist. Attempting the creation and handling
# "already there" avoids the gap between an os.path.exists() check and the
# actual makedirs() call.
try:
    os.makedirs(folder_name)
except FileExistsError:
    pass  # already present; nothing to report
else:
    print(f"Created folder: {folder_name}")

Created folder: NICE_TSD_PDFs


In [4]:
# Fetch the listing page. requests applies no timeout by default, so set one
# explicitly to keep a stalled server from hanging the kernel.
response = requests.get(url, timeout=30)
# Raise on 4xx/5xx so an error page is never parsed as if it were the listing
response.raise_for_status()

In [5]:
# Build the parse tree from the response body using Python's built-in HTML parser
soup = BeautifulSoup(markup=response.text, features='html.parser')
# Echo the parsed document as the cell output
soup

<!DOCTYPE html>

<html class="no-js" dir="ltr" lang="en" prefix="og: https://ogp.me/ns#">
<head>
<!-- OneTrust Cookies Consent Notice start for sheffield.ac.uk -->
<script src="https://cdn-ukwest.onetrust.com/consent/a3110129-5f04-4e81-9c5e-b494a7836315/OtAutoBlock.js" type="text/javascript"></script>
<script charset="UTF-8" data-domain-script="a3110129-5f04-4e81-9c5e-b494a7836315" src="https://cdn-ukwest.onetrust.com/scripttemplates/otSDKStub.js" type="text/javascript"></script>
<script type="text/javascript">
function OptanonWrapper() { }
</script>
<!-- OneTrust Cookies Consent Notice end for sheffield.ac.uk -->
<meta charset="utf-8"/>
<meta content="All the technical support documents produced by the Nice DSU. The TSDs have the aim of providing further information about how to implement the approaches described in the current Guide to the Methods of Technology Appraisal (2022)." name="description"/>
<link href="https://sheffield.ac.uk/nice-dsu/tsds/full-list" rel="canonical"/>
<meta

In [6]:
# Collect every <a> whose href contains "/download?attachment" (case-insensitive)
links = soup.find_all(
    'a',
    attrs={'href': re.compile(r'/download\?attachment', re.I)},
)
# Echo the matched anchors as the cell output
links

[<a class="uoslink" data-entity-substitution="media_download" data-entity-type="media" data-entity-uuid="edf8dcab-0b02-4f69-94dd-d07a898b7785" href="/media/99966/download?attachment" title="TSD 27">Prioritising studies and outcomes for consideration in NICE HealthTech literature reviews</a>,
 <a class="uoslink" data-entity-substitution="media_download" data-entity-type="media" data-entity-uuid="87656a69-d711-4a1c-b723-195e2d5cab27" href="/media/94031/download?attachment" title="TSD26">Expert elicitation for long-term survival outcomes</a>,
 <a data-entity-substitution="media_download" data-entity-type="media" data-entity-uuid="c12c8af9-1d09-4b4a-83dc-36b03b08bf5f" href="/media/83861/download?attachment" title="TSD 25: Evidence Synthesis of Diagnostic Test Accuracy for Decision Making">Evidence Synthesis of Diagnostic Test Accuracy for Decision Making</a>,
 <a data-entity-substitution="media_download" data-entity-type="media" data-entity-uuid="93b93149-53b5-4764-8bf1-8a9913adcc77" href=

In [7]:
# Number of download anchors matched on the listing page
len(links)

27

In [12]:
def clean_filename(text):
    """Turn link text into a tidy string that is safe to use as a file name.

    Steps, in order:
      1. Control characters (including newlines and tabs that come from
         HTML source formatting) are turned into spaces.
      2. A "(PDF, XXXKB)" style suffix is removed.
      3. Characters that are not allowed in file names are removed.
      4. Runs of whitespace are collapsed to one space and the ends trimmed.

    Args:
        text: Raw text, typically the visible text of a link.

    Returns:
        The cleaned string. It may be empty if nothing usable remains.
    """
    # Normalise control characters first so the "(PDF, ...)" pattern below
    # still matches when the suffix is broken across lines.
    text = re.sub(r'[\x00-\x1f\x7f]', ' ', text)
    # Remove the (PDF, XXXKB) part often found in the link text
    text = re.sub(r'\(PDF,.*?\)', '', text, flags=re.IGNORECASE)
    # Remove characters that aren't allowed in filenames
    text = re.sub(r'[\\/*?:"<>|]', "", text)
    # Collapse whitespace runs (removals above can leave doubles) and trim
    return " ".join(text.split())

In [None]:
def download_pdfs(timeout=30):
    """Download every document linked from the listing page into the folder.

    Reads the module-level ``url`` (listing page) and ``folder_name``
    (destination directory). Anchors whose ``href`` contains
    ``/download?attachment`` are treated as documents; each one is saved as
    ``<cleaned link text>.pdf``. A failure on one document is printed and
    skipped so the remaining documents are still attempted.

    Args:
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout a stalled server would block the kernel.

    Returns:
        None. Progress, failures and a final summary are printed.
    """
    print(f"Connecting to {url}...")
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all anchor tags that link to TSD document download
    links = soup.find_all('a', href=re.compile(r'/download\?attachment', re.IGNORECASE))

    print(f"Found {len(links)} PDF links. Starting download...")

    # Do not rely on an earlier cell having created the destination folder
    os.makedirs(folder_name, exist_ok=True)

    used_names = set()  # lower-cased names already taken in this run
    saved = 0
    failed = 0

    for link in links:
        pdf_url = urljoin(url, link['href'])

        # Use the link text as the filename for better metadata/organization
        stem = clean_filename(link.get_text())
        if not stem:
            # Blank link text: fall back to the last two URL path segments
            # (e.g. "99966_download"), which stay distinct per document,
            # unlike the shared trailing "download?attachment".
            url_path = pdf_url.split('?', 1)[0].rstrip('/')
            stem = clean_filename("_".join(url_path.split('/')[-2:]))

        # Two links with the same text must not overwrite each other
        filename = stem + ".pdf"
        suffix = 2
        while filename.lower() in used_names:
            filename = f"{stem} ({suffix}).pdf"
            suffix += 1
        used_names.add(filename.lower())

        filepath = os.path.join(folder_name, filename)

        try:
            print(f"Downloading: {filename}...")
            pdf_data = requests.get(pdf_url, timeout=timeout)
            pdf_data.raise_for_status()

            with open(filepath, 'wb') as f:
                f.write(pdf_data.content)
            saved += 1
        except Exception as e:
            # Best effort: one bad document must not abort the whole batch
            failed += 1
            print(f"Failed to download {pdf_url}: {e}")

    print(f"\nDownload complete ({saved} saved, {failed} failed). Check the '{folder_name}' folder.")

In [17]:
# Run the scraper: fetches the listing page and writes each linked PDF into folder_name
download_pdfs()

Connecting to https://sheffield.ac.uk/nice-dsu/tsds/full-list...
Found 27 PDF links. Starting download...
Downloading: Prioritising studies and outcomes for consideration in NICE HealthTech literature reviews.pdf...
Downloading: Expert elicitation for long-term survival outcomes.pdf...
Downloading: Evidence Synthesis of Diagnostic Test Accuracy for Decision Making.pdf...
Downloading: Adjusting survival time estimates in the presence of treatment switching [Update of TSD16].pdf...
Downloading: A guide to calculating severity shortfall for nice evaluations.pdf...
Downloading: Mapping to estimate health state utilities.pdf...
Downloading: Flexible methods for survival analysis.pdf...
Downloading: Multivariate meta-analysis of summary data for combining treatment effects on correlated outcomes and evaluating surrogate endpoints.pdf...
Downloading: Partitioned survival analysis as a decision modelling tool.pdf...
Downloading: Methods for population-adjusted indirect comparisons in submissio

# Ingestion