# Download

This notebook downloads the data from Project CCHAIN.

In [4]:
import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

## All CSV files from a URL

In [7]:
# source url: the Project CCHAIN dataset page that is scanned for links to CSV files
url = "https://data.humdata.org/dataset/project-cchain"

# output folder for the raw downloads (relative path, resolved against the
# current working directory); created here if it does not exist yet
download_folder = "../data/01_raw"
os.makedirs(download_folder, exist_ok=True)

# function
def download_csv_files(url, download_folder, timeout=60, chunk_size=1024 * 1024):
    """Download every CSV file linked from the web page at ``url``.

    The page is fetched and parsed, every ``<a href=...>`` whose URL *path*
    ends in ``.csv`` is collected (query strings and fragments are ignored for
    this check, and duplicate links are dropped), and each file is saved into
    ``download_folder`` under the last component of its URL path.

    Progress and errors are reported with ``print``; request and file errors
    are not raised. A file that fails to download is reported and skipped so
    the remaining files are still fetched.

    Parameters
    ----------
    url : str
        Page to scan for CSV links. Relative links are resolved against it.
    download_folder : str
        Existing directory the files are written to.
    timeout : float or tuple, optional
        Passed to ``requests.get`` as ``timeout`` so a stalled server cannot
        hang the notebook indefinitely. Defaults to 60 seconds.
    chunk_size : int, optional
        Number of bytes requested per chunk while streaming a file to disk.
        Defaults to 1 MiB.

    Returns
    -------
    None
    """
    try:
        # get url content
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # parse html
        soup = BeautifulSoup(response.content, "html.parser")

        # find links with .csv extension; the check is done on the path of the
        # resolved url so that "file.csv?download=1" is recognised as well
        csv_urls = []
        for link in soup.find_all("a", href=True):
            csv_url = urljoin(url, link["href"])
            is_csv = urlparse(csv_url).path.lower().endswith(".csv")
            if is_csv and csv_url not in csv_urls:
                csv_urls.append(csv_url)
    except Exception as e:
        print(f"Error occurred: {e}")
        return

    if not csv_urls:
        # an empty result usually means the page layout changed or the request
        # was served a different page, so do not report success
        print(f"No CSV links found at: {url}")
        return

    # download csv
    failed = []
    for csv_url in csv_urls:
        part_path = None
        try:
            # file name comes from the url path only (no query string/fragment)
            file_name = os.path.basename(urlparse(csv_url).path)
            file_path = os.path.join(download_folder, file_name)
            part_path = file_path + ".part"

            # stream to a temporary ".part" file so large files are not held
            # in memory and an interrupted download never leaves a truncated
            # csv under its final name
            with requests.get(csv_url, stream=True, timeout=timeout) as csv_response:
                csv_response.raise_for_status()  # Ensure the request was successful
                with open(part_path, "wb") as file:
                    for chunk in csv_response.iter_content(chunk_size=chunk_size):
                        file.write(chunk)

            os.replace(part_path, file_path)
            print(f"Downloaded: {file_name}")
        except Exception as e:
            # report and continue with the next file instead of aborting
            failed.append(csv_url)
            print(f"Error occurred while downloading {csv_url}: {e}")
            if part_path is not None:
                try:
                    os.remove(part_path)
                except OSError:
                    pass  # nothing was written, or the partial file is already gone

    if failed:
        print(f"Finished with {len(failed)} failed download(s) out of {len(csv_urls)}.")
    else:
        print("All CSV files downloaded successfully!")

# apply: scan the dataset page and save every linked CSV into the raw-data folder
# (one "Downloaded: <file>" line is printed per file)
download_csv_files(url, download_folder)

Downloaded: disease_pidsr_totals.csv
Downloaded: disease_fhsis_totals.csv
Downloaded: disease_psa_totals.csv
Downloaded: disease_lgu_disaggregated_totals.csv
Downloaded: climate_atmosphere.csv
Downloaded: climate_atmosphere_downscaled.csv
Downloaded: climate_air_quality.csv
Downloaded: climate_indices.csv
Downloaded: climate_land.csv
Downloaded: climate_timestep_check.csv
Downloaded: geoportal_doh_poi_health.csv
Downloaded: osm_poi_amenity.csv
Downloaded: osm_poi_health.csv
Downloaded: osm_poi_sanitation.csv
Downloaded: osm_poi_water_body.csv
Downloaded: osm_poi_total.csv
Downloaded: esa_worldcover.csv
Downloaded: project_noah_hazards.csv
Downloaded: google_open_buildings.csv
Downloaded: mapbox_health_facility_brgy_isochrones.csv
Downloaded: mapbox_health_facility_city_isochrones.csv
Downloaded: worldpop_population.csv
Downloaded: ookla_internet_speed.csv
Downloaded: nighttime_lights.csv
Downloaded: tm_relative_wealth_index.csv
Downloaded: tm_open_buildings.csv
Downloaded: calendar.csv

## Selected files only

In [9]:
# list of direct download urls, one entry per file to fetch
url_list = [
    "https://data.humdata.org/dataset/5b580664-365e-4d7e-b5e5-2990df8f12a5/resource/eb1341ec-296f-442c-a741-6e78fca31332/download/osm_poi_sanitation.csv",
    # add more urls here
]

# output folder (same raw-data folder as in the section above); created here
# if it does not exist yet
download_folder = "../data/01_raw"
os.makedirs(download_folder, exist_ok=True)

# function
def download_files(url_list, download_folder, timeout=60, chunk_size=1024 * 1024):
    """Download each URL in ``url_list`` into ``download_folder``.

    Every file is saved under the last component of its URL path (query
    strings and fragments are not part of the file name). Progress and errors
    are reported with ``print``; request and file errors are not raised. A URL
    that fails is reported and skipped so the remaining URLs are still fetched.

    Parameters
    ----------
    url_list : iterable of str
        Direct download URLs.
    download_folder : str
        Existing directory the files are written to.
    timeout : float or tuple, optional
        Passed to ``requests.get`` as ``timeout`` so a stalled server cannot
        hang the notebook indefinitely. Defaults to 60 seconds.
    chunk_size : int, optional
        Number of bytes requested per chunk while streaming a file to disk.
        Defaults to 1 MiB.

    Returns
    -------
    None
    """
    total = 0
    failed = []

    # loop through urls and download
    for url in url_list:
        total += 1
        part_path = None
        try:
            print(f"Downloading: {url}")

            # file name comes from the url path only (no query string/fragment)
            file_name = os.path.basename(urlparse(url).path)
            if not file_name:
                raise ValueError("URL does not end with a file name")
            file_path = os.path.join(download_folder, file_name)
            part_path = file_path + ".part"

            # stream to a temporary ".part" file so large files are not held
            # in memory and an interrupted download never leaves a truncated
            # file under its final name
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                with open(part_path, "wb") as file:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        file.write(chunk)

            os.replace(part_path, file_path)
            print(f"Downloaded: {file_name}")
        except Exception as e:
            # report and continue with the next url instead of aborting
            failed.append(url)
            print(f"Error occurred while downloading {url}: {e}")
            if part_path is not None:
                try:
                    os.remove(part_path)
                except OSError:
                    pass  # nothing was written, or the partial file is already gone

    if failed:
        print(f"Finished with {len(failed)} failed download(s) out of {total}.")
    else:
        print("All files downloaded successfully!")

# apply: fetch every url in url_list into the raw-data folder
download_files(url_list, download_folder)

Downloading: https://data.humdata.org/dataset/5b580664-365e-4d7e-b5e5-2990df8f12a5/resource/eb1341ec-296f-442c-a741-6e78fca31332/download/osm_poi_sanitation.csv
Downloaded: osm_poi_sanitation.csv
All files downloaded successfully!


You can then upload the downloaded files from `../data/01_raw` to Google Cloud Storage.