# Automatically download all the datasets needed

You can use this notebook to customize and automatically download the datasets.

## Setup

In [None]:
%pip install requests -q
%pip install tqdm -q

In [3]:
import os
import requests
from tqdm import tqdm
import zipfile
import shutil

In [None]:
# Root folder under which every dataset category gets its own sub-folder.
download_folder = './datasets'

# --- Pollution ---------------------------------------------------------------
# (start, end) of the requested period, formatted as yyyy-mm-dd.
pollution_dates = ('2019-01-01', '2024-12-31')
pollution_links = [
    'https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/'
    'dati-centraline-bologna-storico/exports/csv?lang=it'
    f'&qv1=(data_inizio%3A%5B{pollution_dates[0]}T23%3A00%3A00Z'
    f'%20TO%20{pollution_dates[1]}T22%3A59%3A59Z%5D)'
    '&timezone=Europe%2FRome&use_labels=true&delimiter=%3B',
]

# --- Traffic -----------------------------------------------------------------
traffic_years = tuple(range(2019, 2025))
# The readings dataset is published under a different name starting with 2022.
reading_link_before_2022 = (
    'https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/'
    'rilevazione-autoveicoli-tramite-spire-anno-{year}/exports/csv'
    '?lang=it&timezone=Europe%2FRome&use_labels=true&delimiter=%3B'
)
reading_link_from_2022 = (
    'https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/'
    'rilevazione-flusso-veicoli-tramite-spire-anno-{year}/exports/csv'
    '?lang=it&timezone=Europe%2FRome&use_labels=true&delimiter=%3B'
)
accuracy_link = (
    'https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/'
    'accuratezza-spire-anno-{year}/exports/csv'
    '?lang=it&timezone=Europe%2FRome&use_labels=true&delimiter=%3B'
)
reading_links = [
    (reading_link_from_2022 if year >= 2022 else reading_link_before_2022).format(year=year)
    for year in traffic_years
]
accuracy_links = [accuracy_link.format(year=year) for year in traffic_years]

# --- Weather -----------------------------------------------------------------
weather_years = tuple(range(2019, 2025))
weather_link = 'https://dati-simc.arpae.it/opendata/erg5v2/timeseries/01421/01421_{year}.zip'
weather_links = [weather_link.format(year=year) for year in weather_years]

# Keys are folder names. A dict value describes sub-folders; a list value holds
# the URLs of the files to download into that folder.
download_data = {
    'pollution': pollution_links,
    'traffic': {
        'readings': reading_links,
        'accuracies': accuracy_links,
    },
    'weather': weather_links,
}

# Same layout as download_data: the n-th name is the local file name used for
# the n-th URL of the matching entry.
file_names = {
    'pollution': ['pollution.csv'],
    'traffic': {
        'readings': ['{}_traffic_reading.csv'.format(year) for year in traffic_years],
        'accuracies': ['{}_traffic_accuracy.csv'.format(year) for year in traffic_years],
    },
    'weather': ['{}_weather.zip'.format(year) for year in weather_years],
}

## Downloads

In [5]:
def download_file(url, folder, filename, timeout=(10, 300)):
    """Download ``url`` into ``folder/filename`` while showing a tqdm progress bar.

    The body is streamed into a temporary ``<filename>.part`` file which is
    renamed to its final name only after the transfer has completed. An
    interrupted or failed download therefore never leaves a truncated file
    under the final name, where an "already exists" check could mistake it
    for a complete download.

    Args:
        url: Address of the file to fetch.
        folder: Destination directory; created if it does not exist.
        filename: Name of the file inside ``folder``.
        timeout: Passed to ``requests.get``; a ``(connect, read)`` pair of
            seconds or a single number. ``None`` waits indefinitely.

    Returns:
        True if the file was downloaded and saved, False if the server
        answered with a status code other than 200.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeouts.
        OSError: If the destination cannot be written.
    """
    os.makedirs(folder, exist_ok=True)

    file_path = os.path.join(folder, filename)
    partial_path = file_path + '.part'

    # stream=True avoids loading the whole file in memory; the context manager
    # guarantees the connection is released even if the transfer fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print(f"Failed to download {url} (Status code: {response.status_code})")
            return False

        # Servers using chunked transfer send no content-length; None tells
        # tqdm that the total is unknown.
        total_size = int(response.headers.get('content-length', 0)) or None

        try:
            with open(partial_path, 'wb') as file, tqdm(
                desc=f"Downloading {filename}",
                total=total_size,
                unit='B',
                unit_scale=True,
                ncols=100
            ) as bar:
                # 64 KiB chunks keep memory low without one Python-level
                # iteration per kilobyte.
                for data in response.iter_content(chunk_size=64 * 1024):
                    file.write(data)
                    bar.update(len(data))
            # Atomically publish the finished file (also overwrites on Windows).
            os.replace(partial_path, file_path)
        except BaseException:
            # Includes KeyboardInterrupt (kernel interrupt): drop the partial file.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

    print(f"File downloaded: {file_path}")
    return True

def download_files(download_data, file_names, base_folder):
    """Fetch every file described by ``download_data`` that is not already on disk.

    ``download_data`` maps a category folder either to a list of URLs or to a
    dict of ``{subfolder: [urls]}``; ``file_names`` mirrors that layout with the
    local file name for each URL. Entries whose marker file already exists
    under ``base_folder`` are reported and skipped.
    """
    def iter_targets():
        """Lazily yield (url, destination folder, filename, marker path) tuples."""
        for category, entry in download_data.items():
            category_folder = os.path.join(base_folder, category)

            if not isinstance(entry, dict):
                # Flat category: some files can be zips, but we look for the
                # csv carrying the same stem.
                for url, filename in zip(entry, file_names[category]):
                    stem = os.path.basename(filename).split('.')[0]
                    yield url, category_folder, filename, os.path.join(category_folder, stem + '.csv')
                continue

            # Category split into sub-folders: the marker is the file itself.
            for subfolder, links in entry.items():
                subfolder_name = os.path.join(category_folder, subfolder)
                for url, filename in zip(links, file_names[category][subfolder]):
                    yield url, subfolder_name, filename, os.path.join(subfolder_name, filename)

    for url, target_folder, filename, marker_path in iter_targets():
        if os.path.exists(marker_path):
            print(f"File already exists: {filename}")
            continue
        download_file(url, target_folder, filename)

In [6]:
# Fetch every configured dataset into download_folder; download_files prints
# "File already exists" and skips any entry whose target is already on disk.
download_files(download_data, file_names, download_folder)

Downloading pollution.csv: 47.4MB [03:08, 251kB/s] 


File downloaded: .\datasets/pollution/pollution.csv


Downloading 2019_traffic_reading.csv: 77.1MB [01:23, 927kB/s] 


File downloaded: .\datasets/traffic/readings/2019_traffic_reading.csv


Downloading 2020_traffic_reading.csv: 76.1MB [01:25, 895kB/s] 


File downloaded: .\datasets/traffic/readings/2020_traffic_reading.csv


Downloading 2021_traffic_reading.csv: 78.1MB [01:14, 1.05MB/s]


File downloaded: .\datasets/traffic/readings/2021_traffic_reading.csv


Downloading 2022_traffic_reading.csv: 81.8MB [01:13, 1.11MB/s]


File downloaded: .\datasets/traffic/readings/2022_traffic_reading.csv


Downloading 2023_traffic_reading.csv: 84.6MB [02:10, 648kB/s] 


File downloaded: .\datasets/traffic/readings/2023_traffic_reading.csv


Downloading 2024_traffic_reading.csv: 83.2MB [01:35, 871kB/s] 


File downloaded: .\datasets/traffic/readings/2024_traffic_reading.csv


Downloading 2019_traffic_accuracy.csv: 40.5MB [00:51, 793kB/s] 


File downloaded: .\datasets/traffic/accuracies/2019_traffic_accuracy.csv


Downloading 2020_traffic_accuracy.csv: 41.7MB [00:45, 909kB/s] 


File downloaded: .\datasets/traffic/accuracies/2020_traffic_accuracy.csv


Downloading 2021_traffic_accuracy.csv: 42.4MB [00:40, 1.06MB/s]


File downloaded: .\datasets/traffic/accuracies/2021_traffic_accuracy.csv


Downloading 2022_traffic_accuracy.csv: 44.7MB [00:50, 885kB/s] 


File downloaded: .\datasets/traffic/accuracies/2022_traffic_accuracy.csv


Downloading 2023_traffic_accuracy.csv: 47.2MB [00:55, 858kB/s] 


File downloaded: .\datasets/traffic/accuracies/2023_traffic_accuracy.csv


Downloading 2024_traffic_accuracy.csv: 46.6MB [00:46, 995kB/s] 


File downloaded: .\datasets/traffic/accuracies/2024_traffic_accuracy.csv


Downloading 2019_weather.zip: 100%|███████████████████████████████| 135k/135k [00:00<00:00, 239kB/s]


File downloaded: .\datasets/weather/2019_weather.zip


Downloading 2020_weather.zip: 100%|███████████████████████████████| 134k/134k [00:00<00:00, 234kB/s]


File downloaded: .\datasets/weather/2020_weather.zip


Downloading 2021_weather.zip: 100%|███████████████████████████████| 134k/134k [00:00<00:00, 239kB/s]


File downloaded: .\datasets/weather/2021_weather.zip


Downloading 2022_weather.zip: 100%|███████████████████████████████| 135k/135k [00:00<00:00, 176kB/s]


File downloaded: .\datasets/weather/2022_weather.zip


Downloading 2023_weather.zip: 100%|███████████████████████████████| 136k/136k [00:00<00:00, 245kB/s]


File downloaded: .\datasets/weather/2023_weather.zip


Downloading 2024_weather.zip: 100%|███████████████████████████████| 134k/134k [00:00<00:00, 238kB/s]


File downloaded: .\datasets/weather/2024_weather.zip


## Extract zip archives

The weather downloads are zip files containing daily data and hourly data. We only want to keep the hourly data.

In [7]:
def extract_and_rename_zip(zip_path, weather_folder, keep_zips=True):
    """Extract the hourly CSV from a weather ZIP and save it as ``<zip name>.csv``.

    Only the first archive member whose name ends with ``h.csv`` is extracted.
    It is streamed into ``weather_folder`` through a ``.part`` file and then
    moved into place, so no shared scratch folder is created or removed and a
    failed extraction leaves no half-written CSV behind.

    Args:
        zip_path: Path of the ZIP archive to process.
        weather_folder: Folder receiving the CSV (and the ``zip`` sub-folder).
        keep_zips: If True, move the archive into ``weather_folder/zip``.
            If False, delete it, but only once the hourly CSV has been
            extracted, so the data is never lost.

    Returns:
        Path of the extracted CSV, or None if the archive has no hourly CSV.

    Raises:
        zipfile.BadZipFile: If ``zip_path`` is not a valid ZIP archive.
        OSError: If the CSV or the archive cannot be written, moved or removed.
    """
    os.makedirs(weather_folder, exist_ok=True)

    # The CSV takes the archive's base name: 2019_weather.zip -> 2019_weather.csv
    new_filename = f"{os.path.splitext(os.path.basename(zip_path))[0]}.csv"
    new_file_path = os.path.join(weather_folder, new_filename)
    partial_path = new_file_path + '.part'

    extracted = False
    # The archive is closed before it gets moved or deleted below.
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        hourly_member = next(
            (name for name in zip_ref.namelist() if name.endswith('h.csv')),
            None,
        )
        if hourly_member is not None:
            try:
                with zip_ref.open(hourly_member) as source, open(partial_path, 'wb') as target:
                    shutil.copyfileobj(source, target)
                # os.replace overwrites an existing CSV on every platform,
                # whereas os.rename raises on Windows in that case.
                os.replace(partial_path, new_file_path)
            except BaseException:
                if os.path.exists(partial_path):
                    os.remove(partial_path)
                raise
            extracted = True

    if extracted:
        print(f"Renamed and saved: {new_file_path}")
    else:
        print(f"No hourly CSV found in {zip_path}")

    # Handle ZIP file based on `keep_zips`
    if keep_zips:
        zip_folder = os.path.join(weather_folder, 'zip')
        os.makedirs(zip_folder, exist_ok=True)
        shutil.move(zip_path, os.path.join(zip_folder, os.path.basename(zip_path)))
        print(f"Moved {zip_path} to {zip_folder}")
    elif extracted:
        os.remove(zip_path)
        print(f"Deleted {zip_path}")
    else:
        # Nothing was extracted: deleting the archive would throw the data away.
        print(f"Kept {zip_path} because no hourly CSV was extracted")

    return new_file_path if extracted else None


In [8]:
# Folder where download_files stored the yearly weather archives.
weather_folder = os.path.join(download_folder, 'weather')

# Turn every ZIP found directly inside the weather folder into its hourly CSV;
# keep_zips=False discards the archives afterwards.
for zip_file in os.listdir(weather_folder):
    zip_path = os.path.join(weather_folder, zip_file)
    if not (zip_file.endswith('.zip') and os.path.isfile(zip_path)):
        continue
    extract_and_rename_zip(zip_path, weather_folder, keep_zips=False)

Renamed and saved: .\datasets/weather/2023_weather.csv
Deleted .\datasets/weather/2023_weather.zip
Renamed and saved: .\datasets/weather/2022_weather.csv
Deleted .\datasets/weather/2022_weather.zip
Renamed and saved: .\datasets/weather/2021_weather.csv
Deleted .\datasets/weather/2021_weather.zip
Renamed and saved: .\datasets/weather/2024_weather.csv
Deleted .\datasets/weather/2024_weather.zip
Renamed and saved: .\datasets/weather/2019_weather.csv
Deleted .\datasets/weather/2019_weather.zip
Renamed and saved: .\datasets/weather/2020_weather.csv
Deleted .\datasets/weather/2020_weather.zip
