# Web Scraping

In [2]:
# Libs for web scraping
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re


In [3]:
url = "https://sheffield.ac.uk/nice-dsu/tsds/full-list"
folder_name = "NICE_TSD_PDFs"

# Create the folder if it doesn't exist
if not os.path.exists(folder_name):
    os.makedirs(folder_name)
    print(f"Created folder: {folder_name}")

In [4]:
response = requests.get(url)
response.raise_for_status()

In [5]:
soup = BeautifulSoup(response.text, 'html.parser')
soup

<!DOCTYPE html>

<html class="no-js" dir="ltr" lang="en" prefix="og: https://ogp.me/ns#">
<head>
<!-- OneTrust Cookies Consent Notice start for sheffield.ac.uk -->
<script src="https://cdn-ukwest.onetrust.com/consent/a3110129-5f04-4e81-9c5e-b494a7836315/OtAutoBlock.js" type="text/javascript"></script>
<script charset="UTF-8" data-domain-script="a3110129-5f04-4e81-9c5e-b494a7836315" src="https://cdn-ukwest.onetrust.com/scripttemplates/otSDKStub.js" type="text/javascript"></script>
<script type="text/javascript">
function OptanonWrapper() { }
</script>
<!-- OneTrust Cookies Consent Notice end for sheffield.ac.uk -->
<meta charset="utf-8"/>
<meta content="All the technical support documents produced by the Nice DSU. The TSDs have the aim of providing further information about how to implement the approaches described in the current Guide to the Methods of Technology Appraisal (2022)." name="description"/>
<link href="https://sheffield.ac.uk/nice-dsu/tsds/full-list" rel="canonical"/>
<meta

In [6]:
links = soup.find_all('a', href=re.compile(r'/download\?attachment', re.IGNORECASE))
links

[<a class="uoslink" data-entity-substitution="media_download" data-entity-type="media" data-entity-uuid="edf8dcab-0b02-4f69-94dd-d07a898b7785" href="/media/99966/download?attachment" title="TSD 27">Prioritising studies and outcomes for consideration in NICE HealthTech literature reviews</a>,
 <a class="uoslink" data-entity-substitution="media_download" data-entity-type="media" data-entity-uuid="87656a69-d711-4a1c-b723-195e2d5cab27" href="/media/94031/download?attachment" title="TSD26">Expert elicitation for long-term survival outcomes</a>,
 <a data-entity-substitution="media_download" data-entity-type="media" data-entity-uuid="c12c8af9-1d09-4b4a-83dc-36b03b08bf5f" href="/media/83861/download?attachment" title="TSD 25: Evidence Synthesis of Diagnostic Test Accuracy for Decision Making">Evidence Synthesis of Diagnostic Test Accuracy for Decision Making</a>,
 <a data-entity-substitution="media_download" data-entity-type="media" data-entity-uuid="93b93149-53b5-4764-8bf1-8a9913adcc77" href=

In [7]:
len(links)

27

In [12]:
def clean_filename(text):
    """Removes illegal characters and technical suffixes from filenames."""
    # Remove the (PDF, XXXKB) part often found in the link text
    text = re.sub(r'\(PDF,.*?\)', '', text, flags=re.IGNORECASE)
    # Remove characters that aren't allowed in filenames
    text = re.sub(r'[\\/*?:"<>|]', "", text)
    return text.strip()

In [None]:
def download_pdfs():
    print(f"Connecting to {url}...")
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all anchor tags that link to TSD document download
    links = soup.find_all('a', href=re.compile(r'/download\?attachment', re.IGNORECASE))
    
    print(f"Found {len(links)} PDF links. Starting download...")

    for link in links:
        pdf_url = urljoin(url, link['href'])
        
        # Use the link text as the filename for better metadata/organization
        raw_name = link.get_text() or pdf_url.split('/')[-1]
        filename = clean_filename(raw_name) + ".pdf"
        filepath = os.path.join(folder_name, filename)

        try:
            print(f"Downloading: {filename}...")
            pdf_data = requests.get(pdf_url)
            pdf_data.raise_for_status()

            with open(filepath, 'wb') as f:
                f.write(pdf_data.content)
        except Exception as e:
            print(f"Failed to download {pdf_url}: {e}")

    print("\nDownload complete. Check the 'NICE_TSD_PDFs' folder.")

In [17]:
download_pdfs()

Connecting to https://sheffield.ac.uk/nice-dsu/tsds/full-list...
Found 27 PDF links. Starting download...
Downloading: Prioritising studies and outcomes for consideration in NICE HealthTech literature reviews.pdf...
Downloading: Expert elicitation for long-term survival outcomes.pdf...
Downloading: Evidence Synthesis of Diagnostic Test Accuracy for Decision Making.pdf...
Downloading: Adjusting survival time estimates in the presence of treatment switching [Update of TSD16].pdf...
Downloading: A guide to calculating severity shortfall for nice evaluations.pdf...
Downloading: Mapping to estimate health state utilities.pdf...
Downloading: Flexible methods for survival analysis.pdf...
Downloading: Multivariate meta-analysis of summary data for combining treatment effects on correlated outcomes and evaluating surrogate endpoints.pdf...
Downloading: Partitioned survival analysis as a decision modelling tool.pdf...
Downloading: Methods for population-adjusted indirect comparisons in submissio

# Ingestion

## PyMuPDF

In [5]:
import fitz  # PyMuPDF
from pathlib import Path

Path.joinpath(Path.cwd(), folder_name)

WindowsPath('c:/Users/Public/Documents/MARIO/NICE_TSD_PDFs')

In [6]:
file = "NICE_TSD_PDFs\A general linear modelling framework for pair-wise and network meta-analysis of randomised controlled trials.pdf"
doc = fitz.open(file)

  file = "NICE_TSD_PDFs\A general linear modelling framework for pair-wise and network meta-analysis of randomised controlled trials.pdf"


In [8]:
doc.metadata

{'format': 'PDF 1.5',
 'title': 'Microsoft Word - TSD2 General meta analysis corrected 2Sep2016v2.docx',
 'author': 'epsdsd',
 'subject': '',
 'keywords': '',
 'creator': '',
 'producer': 'PDF Writer - bioPDF / http://www.biopdf.com / CP / The University Of Bristol',
 'creationDate': "D:20160902140031+01'00'",
 'modDate': "D:20160902140031+01'00'",
 'trapped': '',
 'encryption': None}

In [25]:
from pprint import pprint
pprint(doc.load_page(10).get_text())

('11 \n'
 ' \n'
 '2 DEVELOPMENT OF THE CORE MODELS: BINOMIAL DATA \n'
 'WITH LOGIT LINK \n'
 'Consider a set of M trials comparing two treatments 1 and 2 in a '
 'pre-specified target patient \n'
 'population, which are to be synthesised in a meta-analysis. A fixed effect '
 'analysis would \n'
 'assume that each study i generates an estimate of the same parameter d12, '
 'subject to sampling \n'
 'error. In a random effects model, each study i provides an estimate of the '
 'study-specific \n'
 'treatment effects δi,12 which are assumed not to be equal but rather '
 'exchangeable. This means \n'
 'that all δi,12 are ‘similar’ in a way which assumes that the trial labels, '
 'i, attached to the treatment \n'
 'effects δi,12 are irrelevant. In other words, the information that the '
 'trials provide is independent \n'
 'of the order in which they were carried out, over the population of '
 'interest.30 The \n'
 'exchangeability assumption is equivalent to saying that the trial-specific 

## Docling

In [26]:
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
doc = converter.convert(file)

  from .autonotebook import tqdm as notebook_tqdm
[32m[INFO] 2026-01-28 19:43:44,101 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-28 19:43:44,292 [RapidOCR] device_config.py:50: Using CPU device[0m
[32m[INFO] 2026-01-28 19:43:44,311 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.5.0/torch/PP-OCRv4/det/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-28 19:43:46,746 [RapidOCR] download_file.py:82: Download size: 13.83MB[0m
[32m[INFO] 2026-01-28 19:43:50,088 [RapidOCR] download_file.py:95: Successfully saved to: C:\Users\Public\Documents\MARIO\MAR\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-28 19:43:50,094 [RapidOCR] main.py:50: Using C:\Users\Public\Documents\MARIO\MAR\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-28 19:43:50,559 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-28 19:43:50

In [30]:
print(doc.document.export_to_markdown())

## NICE DSU TECHNICAL SUPPORT DOCUMENT 2: A GENERALISED LINEAR MODELLING FRAMEWORK FOR PAIRWISE AND NETWORK META-ANALYSIS OF RANDOMISED CONTROLLED TRIALS

## R EPORT BY THE D ECISION S UPPORT U NIT

August 2011 (last updated September 2016)

Sofia Dias 1 , Nicky J Welton 1 , Alex J Sutton 2 , AE Ades 1

1 School of Social and Community Medicine, University of Bristol, Canynge Hall, 39 Whatley Road, Bristol BS8 2PS, UK

2 Department  of  Health  Sciences,  University  of  Leicester,  2nd  Floor  Adrian  Building, University Road, Leicester LE1 7RH, UK

Decision Support Unit, ScHARR, University of Sheffield, Regent Court, 30 Regent Street Sheffield, S1 4DA;

Tel (+44) (0)114 222 0734

E-mail dsuadmin@sheffield.ac.uk

## A BOUT THE D ECISION S UPPORT U NIT

The Decision Support Unit (DSU) is a collaboration between the Universities of Sheffield, York and Leicester. We also have members at the University of Bristol, London School of Hygiene and Tropical Medicine and Brunel University.

The

In [33]:
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import PdfFormatOption

pipeline_options = PdfPipelineOptions()

pipeline_options.do_formula_enrichment = True

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)


In [34]:
doc = converter.convert(file)

[32m[INFO] 2026-01-28 20:03:56,773 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-28 20:03:56,774 [RapidOCR] device_config.py:50: Using CPU device[0m
[32m[INFO] 2026-01-28 20:03:56,844 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\Public\Documents\MARIO\MAR\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-28 20:03:56,844 [RapidOCR] main.py:50: Using C:\Users\Public\Documents\MARIO\MAR\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-28 20:03:57,009 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-28 20:03:57,010 [RapidOCR] device_config.py:50: Using CPU device[0m
[32m[INFO] 2026-01-28 20:03:57,034 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\Public\Documents\MARIO\MAR\Lib\site-packages\rapidocr\models\ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2026-01-28 20:03:57,035 [RapidOCR] main.py:50: Using C:\Users\Public\Doc

OSError: [WinError 1314] A required privilege is not held by the client: '..\\..\\blobs\\a6344aac8c09253b3b630fb776ae94478aa0275b' -> 'C:\\Users\\DevanshUpadhyay\\.cache\\huggingface\\hub\\models--docling-project--CodeFormulaV2\\snapshots\\ecedbe111d15c2dc60bfd4a823cbe80127b58af4\\.gitattributes'