#### This short script displays examples of downloading and transforming datasets from various websites.

In [2]:
import os
import requests
from bs4 import BeautifulSoup
import json
import csv
# Display the current working directory: the downloads below open their
# output files with relative names, so this is where they are written.
os.getcwd()

'C:\\Users\\AndrewMcDevitt\\Documents\\Service Development\\automate_datasets'

# Index of Multiple Deprivation (2019)

#### Using requests library to retrieve relevant dataset

In [11]:
# Download the IMD 2019 workbook (File 1) from GOV.UK and save it in the
# working directory under its original file name.
IMD_url = "https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/833970/File_1_-_IMD2019_Index_of_Multiple_Deprivation.xlsx"
response = requests.get(IMD_url, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page as an .xlsx file.
response.raise_for_status()

with open("File_1_-_IMD2019_Index_of_Multiple_Deprivation.xlsx", "wb") as file:
    file.write(response.content)

# Hospital Episode Statistics 

#### Using BeautifulSoup to find and download the relevant .csv and .xlsx files

In [12]:
HES_url = "https://digital.nhs.uk/data-and-information/publications/statistical/hospital-accident--emergency-activity/2021-22"

# Send a GET request to the publication page. Raise on an HTTP error so that
# an error page is never parsed as if it were the publication itself.
response = requests.get(HES_url, timeout=60)
response.raise_for_status()
html_content = response.content

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")

# Collect every anchor (<a>) tag on the page; only some of them point at
# dataset files, so they still need filtering by file extension.
download_links = soup.find_all("a")

# Define a function to download a dataset
def download_dataset(url, filename):
    """Download ``url`` and write the response body to ``filename``.

    Args:
        url: Address of the file to fetch.
        filename: Path the downloaded bytes are written to (opened in "wb"
            mode, so an existing file is overwritten).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            an error page is never saved under the dataset's name.
    """
    response = requests.get(url, timeout=60)
    # Check the status before touching the disk: no file is created on failure.
    response.raise_for_status()
    with open(filename, "wb") as file:
        file.write(response.content)

# Loop through the download links and download each dataset
for link in download_links:
    link_url = link.get("href")
    # Anchors without an href attribute yield None; skip them instead of
    # crashing on None.endswith(...).
    if not link_url:
        continue
    # str.endswith accepts a tuple: extend it to match other dataset formats.
    if link_url.endswith((".csv", ".xlsx")):
        dataset_name = link_url.split("/")[-1]  # Extract filename from URL
        download_dataset(link_url, dataset_name)

## CVD Prevent 

#### Scraping data, removing irrelevant files, extracting information and creating new file names

In [4]:
# Root of the CVDPREVENT indicator API; the indicator ID and the query string
# are appended to it for each request.
base_url = "https://api.cvdprevent.nhs.uk/indicator"

# Time period sent to the API as timePeriodID. Per the original note, ID 8 is
# the period running to March 2023 — TODO confirm against the API.
time_period_id = 8

def download_datasets(url):
    """Download the raw CSV for every indicator / system-level combination.

    Each HTTP 200 response is saved in the working directory as
    ``indicator_<indicator>_level_<system_level_id>.csv``. Any other status
    is reported with a printed message and that combination is skipped.
    The time period comes from the module-level ``time_period_id``.

    Args:
        url: Base indicator endpoint. ``/<indicator>/rawDataCSV?...`` is
            appended to it for each request.
    """
    # Loop through indicator values (1 to 34)
    for indicator in range(1, 35):
        # Loop through system level IDs (1 to 8)
        for system_level_id in range(1, 9):
            # Build the request from the caller-supplied base URL. A separate
            # name is used so the ``url`` argument is not overwritten.
            request_url = (
                f"{url}/{indicator}/rawDataCSV"
                f"?timePeriodID={time_period_id}&systemLevelID={system_level_id}"
            )

            response = requests.get(request_url)

            if response.status_code == 200:
                # Save the CSV payload exactly as received
                filename = f"indicator_{indicator}_level_{system_level_id}.csv"
                with open(filename, "wb") as csv_file:
                    csv_file.write(response.content)
            else:
                print(f"Failed to fetch URL for indicator {indicator}, level {system_level_id}")
                
# Run the download: one request per indicator (1-34) and system level (1-8)
download_datasets(base_url)

In [5]:
## Remove irrelevant files (there are no level 2-3 or indicator 5-6 or 16-17)

# Snapshot of the file names in the working directory at the time this runs
files_in_directory = os.listdir()

# Keywords identifying files to delete, because the website has no data for
# these indicators and levels. They are matched as plain substrings of the
# file name, so any file in the directory containing one of them is removed.
keywords_to_filter = ["level_2", "level_3", "indicator_5", "indicator_6", "indicator_16", "indicator_17"]

def remove_irrelevant_files(files_in_directory):
    """Delete every listed file whose name contains a filtered keyword.

    Args:
        files_in_directory: File names to inspect. They are passed to
            ``os.remove`` as given, so relative names resolve against the
            current working directory.

    A name matches when any entry of the module-level ``keywords_to_filter``
    occurs in it as a substring. A failed removal is printed and the loop
    carries on with the next file.
    """
    for candidate in files_in_directory:
        # Guard clause: leave files that match none of the keywords alone
        if not any(keyword in candidate for keyword in keywords_to_filter):
            continue
        try:
            os.remove(candidate)
        except Exception as error:
            print(f"Failed to remove {candidate}: {error}")
                
# Delete the matching files from the directory snapshot taken above
remove_irrelevant_files(files_in_directory)

In [7]:
## Rename files

# Lists of indicators and levels present on the CVD website as of 05/09/23 (update these if more are introduced)
# NOTE(review): none of these three lists is referenced by the renaming code
# visible in this file; they appear to be kept for reference only.
indicator_list = [1, 7, 20, 11, 4, 32, 2, 3, 21, 18, 10, 9, 23, 34, 22, 14, 30, 33, 13, 15, 8, 29, 19, 31, 12, 24, 26, 25, 27, 28]
sorted_indicator_list = sorted(indicator_list)
level_list = [1, 4, 5, 6, 7, 8]

# Mapping of indicator IDs (the number in "indicator_<id>_level_<n>.csv") to
# their descriptive names. The values become part of the renamed file name:
# the ":" is stripped by rename_files, and "_" is used where a "/" would
# otherwise appear (e.g. "support_treatment", "mg_mmol").
# NOTE(review): "Cholestrol" / "Colestrol" look like misspellings of
# "Cholesterol"; correcting them changes the generated file names, so confirm
# before editing.
indicator_names = {
    1: "AF: prevalence",
    2: "Hypertension: treatment to recommended age specific (79 and under) threshold",
    3: "Hypertension: treatment to recommended age specific (80 and over) threshold",
    4: "Hypertension: BP monitoring",
    7: "AF: treatment with anticoagulants",
    8: "CKD: prevalence",
    9: "FH: Possible, probable and confirmed prevalence",
    10: "FH: genetically confirmed prevalence",
    11: "Hypertension: prevalence",
    12: "CVD: prevalence",
    13: "CKD: uncoded case finder",
    14: "Cholestrol: QRISK 20% or more treated with LLT",
    15: "CKD: high risk case finder",
    18: "Cholestrol: investigation for FH",
    19: "CKD: treatment with renin-angiotensin system antagonists",
    20: "Hypertension: high risk case finder",
    21: "Hypertension: potential overtreatment",
    22: "Cholestrol: QRISK 10% or more treated with LLT",
    23: "Colestrol: CKD treated with LLT",
    24: "Smoking: record of smoking status",
    25: "NDH: high risk case finder",
    26: "Smoking: current smokers offered support_treatment",
    27: "Diabetes: uncoded case finder",
    28: "Diabetes: high risk case finder",
    29: "CKD: recorded eGFR",
    30: "Cholestrol: CVD treated to threshold",
    31: "CKD: ACR less than 70mg_mmol with BP to target",
    32: "Hypertension: treatment to recommended age specific thresholds (all ages)",
    33: "Cholestrol: primary intervention of CVD treat treated with LLT",
    34: "Cholestrol: CVD treated with LLT"  
}

# Mapping of system level IDs (the number after "level_" in the downloaded
# file name) to their descriptive names. IDs 2 and 3 are absent; lookups for
# missing IDs fall back to "Unknown Level" in create_new_filename.
level_names = {
    1: "England",
    4: "PCN",
    5: "Practice",
    6: "Region",
    7: "ICB",
    8: "Sub-ICB",  
}

## Find files in the working directory whose name contains "indicator"
# The match is a plain substring test, so every selected name must follow the
# "indicator_<n>_level_<m>.csv" pattern: extract_info converts the pieces with
# int() and raises on anything else.
directory = os.getcwd()
files = os.listdir(directory)
indicator_files = [filename for filename in files if "indicator" in filename]


## function used to extract file_name and separate indicator and level number
def extract_info(filename):
    """Return ``(indicator_number, level_number)`` parsed from a file name.

    The name is split on "_": the second piece is the indicator number, and
    the last piece, cut at its first ".", is the level number. For example
    "indicator_10_level_1.csv" gives ``(10, 1)``.

    Raises:
        IndexError: If the name contains no "_".
        ValueError: If either piece is not an integer.
    """
    tokens = filename.split("_")
    # Drop the extension (everything from the first "." onwards) from the last piece
    level_token = tokens[-1].partition(".")[0]
    return int(tokens[1]), int(level_token)

## function used to create a new file name for the for loop
def create_new_filename(indicator_number, level_number):
    """Build the descriptive CSV file name for an indicator/level pair.

    Args:
        indicator_number: Key looked up in the module-level ``indicator_names``.
        level_number: Key looked up in the module-level ``level_names``.

    Returns:
        ``"<indicator name>_level_<level name>.csv"``, with
        "Unknown Indicator" / "Unknown Level" substituted for any number that
        has no entry in its mapping.
    """
    indicator_label = indicator_names.get(indicator_number, "Unknown Indicator")
    level_label = level_names.get(level_number, "Unknown Level")
    return f"{indicator_label}_level_{level_label}.csv"

def rename_files(directory, indicator_files):
    """Rename downloaded indicator files to their descriptive names.

    Args:
        directory: Folder that contains the files.
        indicator_files: File names of the form
            "indicator_<n>_level_<m>.csv" inside ``directory``.

    For each file the indicator and level numbers are parsed out of the old
    name, turned into a descriptive name, and the file is renamed in place.
    One "Renamed: old -> new" line is printed per file.
    """
    for current_name in indicator_files:
        descriptive_name = create_new_filename(*extract_info(current_name))
        # Remove ":" entirely (it is deleted, not replaced by a space)
        descriptive_name = descriptive_name.replace(":", "")

        os.rename(
            os.path.join(directory, current_name),
            os.path.join(directory, descriptive_name),
        )

        print(f"Renamed: {current_name} -> {descriptive_name}")

# Rename every indicator file found in the working directory
rename_files(directory, indicator_files)

Renamed: indicator_10_level_1.csv -> FH genetically confirmed prevalence_level_England.csv
Renamed: indicator_10_level_4.csv -> FH genetically confirmed prevalence_level_PCN.csv
Renamed: indicator_10_level_5.csv -> FH genetically confirmed prevalence_level_Practice.csv
Renamed: indicator_10_level_6.csv -> FH genetically confirmed prevalence_level_Region.csv
Renamed: indicator_10_level_7.csv -> FH genetically confirmed prevalence_level_ICB.csv
Renamed: indicator_10_level_8.csv -> FH genetically confirmed prevalence_level_Sub-ICB.csv
Renamed: indicator_11_level_1.csv -> Hypertension prevalence_level_England.csv
Renamed: indicator_11_level_4.csv -> Hypertension prevalence_level_PCN.csv
Renamed: indicator_11_level_5.csv -> Hypertension prevalence_level_Practice.csv
Renamed: indicator_11_level_6.csv -> Hypertension prevalence_level_Region.csv
Renamed: indicator_11_level_7.csv -> Hypertension prevalence_level_ICB.csv
Renamed: indicator_11_level_8.csv -> Hypertension prevalence_level_Sub-ICB.