# Download Sensor.Community Data
This notebook is just a quick way to automatically download sensor.community monthly data from https://archive.sensor.community/csv_per_month/
It starts with the most recent month and works backwards, so run it until it has downloaded enough data for you and then cancel it.
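It sometimes throws an `IncompleteRead` exception (surfaced by `requests` as a `ChunkedEncodingError`) when the connection breaks partway through a large download. I haven't figured out how to fix this yet; just rerun it — zip files that are already complete are not downloaded again.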

In [3]:
from bs4 import BeautifulSoup
import dataclasses
from datetime import datetime
from tqdm.auto import tqdm
from copy import copy
from zipfile import ZipFile, is_zipfile
import re

import requests
import shutil
from pathlib import Path
import requests

# One shared HTTP session used for every request made below.
# `requests.Session()` is the supported constructor; the lowercase
# `requests.session()` factory is kept only for backwards compatibility.
session = requests.Session()

@dataclasses.dataclass
class MonthPage:
    """A month directory listed on the archive index page."""

    # Month the directory covers.
    dt: datetime
    # Link target (href) of the month directory.
    url: str

@dataclasses.dataclass
class MonthFile:
    """A single downloadable file found inside a month directory."""

    # Month the file belongs to.
    dt: datetime
    # Sensor model name taken from the file name (e.g. "bme280").
    sensor_type: str
    # File extension without the leading dot (e.g. "zip").
    filetype: str
    # Address the file can be downloaded from.
    url: str

def download_file_copy_file(session, url, local_filename):
    """Download ``url`` to ``local_filename`` by copying the raw response stream.

    Args:
        session: Object exposing a ``requests.Session``-style ``get`` method.
        url: Address of the file to fetch.
        local_filename: Destination path; it is opened for writing in binary
            mode, so an existing file is overwritten.

    Returns:
        ``local_filename``, unchanged.

    Raises:
        Whatever ``raise_for_status()`` raises for an HTTP error response
        (``requests.HTTPError`` for a real ``requests`` response).
    """
    with session.get(url, stream=True) as r:
        # Check the status before opening the destination so an HTTP error
        # page is never written to disk in place of the real file.
        r.raise_for_status()

        # Content-Length is optional, so the size may be unknown.
        content_length = r.headers.get("Content-Length")
        size = int(content_length) if content_length else None
        size_text = "unknown" if size is None else f"{size / 1e6:.2f}MB"
        print(f"Downloading {url} to {local_filename} size: {size_text}")

        with open(local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)

    return local_filename


def download_file_stream(session, url, local_filename):
    """Stream ``url`` to ``local_filename`` in chunks while showing a progress bar.

    The body is written to a sibling ``<local_filename>.part`` file and moved
    to ``local_filename`` only after the whole response has been read, so an
    interrupted transfer never leaves a truncated file under the final name.

    Args:
        session: Object exposing a ``requests.Session``-style ``get`` method.
        url: Address of the file to fetch.
        local_filename: Destination path (``str`` or ``Path``); an existing
            file is replaced once the download completes.

    Returns:
        ``local_filename``, unchanged.

    Raises:
        Whatever ``raise_for_status()`` raises for an HTTP error response, and
        any error raised while reading the body (the ``.part`` file is left
        behind in that case and is overwritten by the next attempt).
    """
    partial_path = Path(f"{local_filename}.part")

    with session.get(url, stream=True) as r:
        r.raise_for_status()

        # Content-Length is optional; tqdm accepts total=None for an
        # open-ended progress bar.
        content_length = r.headers.get("Content-Length")
        size = int(content_length) if content_length else None
        size_text = "unknown" if size is None else f"{size / 1e6:.2f}MB"
        print(f"Downloading {url} to {local_filename} size: {size_text}")

        # Using tqdm as a context manager closes the bar even when the
        # transfer fails partway through.
        with tqdm(total=size, unit="B", unit_scale=True) as pbar:
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=100_000):
                    f.write(chunk)
                    pbar.update(len(chunk))

    # Only a fully read response reaches this point.
    partial_path.replace(local_filename)
    return local_filename

# Local root for downloads, and the root URL of the monthly archive.
data_dir = Path("./data/").expanduser()
base = "https://archive.sensor.community/csv_per_month/"

# Fetch the archive index. Fail loudly on an HTTP error: parsing an error page
# would otherwise produce an empty month list and nothing would be downloaded.
index_response = session.get(base)
index_response.raise_for_status()
soup = BeautifulSoup(index_response.content, "lxml")

# Month directories are the links whose href has the form "YYYY-MM/".
links = soup.find_all("a", dict(href=re.compile(r"^\d\d\d\d-\d\d/$")))
months = [
    # The trailing "/" is dropped before the month is parsed.
    MonthPage(url=m["href"], dt=datetime.strptime(m["href"][:-1], "%Y-%m"))
    for m in links
]

# Archive files are named "<YYYY-MM>_<sensor type>.<extension>".
file_regex = re.compile(r"^(\d\d\d\d-\d\d)_([^\.]+)\.(.+)$")

# Visit the month pages in the reverse of their order on the index page.
for month in reversed(months):
    # Fail loudly on an HTTP error instead of parsing an error page and
    # silently skipping the whole month.
    month_response = session.get(base + month.url)
    month_response.raise_for_status()
    soup = BeautifulSoup(month_response.content, "lxml")
    links = soup.find_all("a", dict(href=file_regex))
    for link in links:
        date, sensor_type, filetype = file_regex.match(link["href"]).groups()
        month_file = MonthFile(
            # MonthFile.dt is declared as a datetime, so parse the "YYYY-MM" text.
            dt=datetime.strptime(date, "%Y-%m"),
            sensor_type=sensor_type,
            filetype=filetype,
            url=base + month.url + link["href"],
        )

        p = data_dir / f"inputs/sensor_community/{date}/{date}_{sensor_type}.{filetype}"
        p.parent.mkdir(exist_ok=True, parents=True)
        # A file that exists but is not a readable zip is treated as a broken
        # download and fetched again. NOTE: is_zipfile is also False for any
        # non-zip file type, so such files are re-downloaded on every run.
        if not p.exists() or not is_zipfile(p):
            download_file_stream(session, month_file.url, p)

        if p.suffix == ".zip":
            # Extraction is skipped when a CSV named after the archive exists.
            unzipped_filename = p.parent / f"{p.stem}.csv"
            if not unzipped_filename.exists():
                print("Unzipping data")
                with ZipFile(p) as archive:
                    archive.extractall(path=p.parent)

print("Done!")

Downloading https://archive.sensor.community/csv_per_month/2023-08/2023-08_bme280.zip to data/inputs/sensor_community/2023-08/2023-08_bme280.zip size: 2332.02MB


  0%|          | 0.00/2.33G [00:00<?, ?B/s]

Unzipping data
Downloading https://archive.sensor.community/csv_per_month/2023-08/2023-08_bmp180.zip to data/inputs/sensor_community/2023-08/2023-08_bmp180.zip size: 25.67MB


  0%|          | 0.00/25.7M [00:00<?, ?B/s]

Unzipping data
Downloading https://archive.sensor.community/csv_per_month/2023-08/2023-08_bmp280.zip to data/inputs/sensor_community/2023-08/2023-08_bmp280.zip size: 52.94MB


  0%|          | 0.00/52.9M [00:00<?, ?B/s]

Unzipping data
Downloading https://archive.sensor.community/csv_per_month/2023-08/2023-08_dht22.zip to data/inputs/sensor_community/2023-08/2023-08_dht22.zip size: 1651.87MB


  0%|          | 0.00/1.65G [00:00<?, ?B/s]

Unzipping data
Downloading https://archive.sensor.community/csv_per_month/2023-08/2023-08_ds18b20.zip to data/inputs/sensor_community/2023-08/2023-08_ds18b20.zip size: 3.26MB


  0%|          | 0.00/3.26M [00:00<?, ?B/s]

Unzipping data
Downloading https://archive.sensor.community/csv_per_month/2023-08/2023-08_hpm.zip to data/inputs/sensor_community/2023-08/2023-08_hpm.zip size: 0.67MB


  0%|          | 0.00/670k [00:00<?, ?B/s]

Unzipping data
Downloading https://archive.sensor.community/csv_per_month/2023-08/2023-08_htu21d.zip to data/inputs/sensor_community/2023-08/2023-08_htu21d.zip size: 18.84MB


  0%|          | 0.00/18.8M [00:00<?, ?B/s]

Unzipping data
Downloading https://archive.sensor.community/csv_per_month/2023-08/2023-08_pms1003.zip to data/inputs/sensor_community/2023-08/2023-08_pms1003.zip size: 0.84MB


  0%|          | 0.00/836k [00:00<?, ?B/s]

Unzipping data
Downloading https://archive.sensor.community/csv_per_month/2023-08/2023-08_pms3003.zip to data/inputs/sensor_community/2023-08/2023-08_pms3003.zip size: 0.81MB


  0%|          | 0.00/811k [00:00<?, ?B/s]

Unzipping data
Downloading https://archive.sensor.community/csv_per_month/2023-08/2023-08_pms5003.zip to data/inputs/sensor_community/2023-08/2023-08_pms5003.zip size: 38.03MB


  0%|          | 0.00/38.0M [00:00<?, ?B/s]

Unzipping data
Downloading https://archive.sensor.community/csv_per_month/2023-08/2023-08_pms6003.zip to data/inputs/sensor_community/2023-08/2023-08_pms6003.zip size: 0.10MB


  0%|          | 0.00/101k [00:00<?, ?B/s]

Unzipping data
Downloading https://archive.sensor.community/csv_per_month/2023-08/2023-08_pms7003.zip to data/inputs/sensor_community/2023-08/2023-08_pms7003.zip size: 30.02MB


  0%|          | 0.00/30.0M [00:00<?, ?B/s]

Unzipping data
Downloading https://archive.sensor.community/csv_per_month/2023-08/2023-08_ppd42ns.zip to data/inputs/sensor_community/2023-08/2023-08_ppd42ns.zip size: 1.61MB


  0%|          | 0.00/1.61M [00:00<?, ?B/s]

Unzipping data
Downloading https://archive.sensor.community/csv_per_month/2023-08/2023-08_sds011.zip to data/inputs/sensor_community/2023-08/2023-08_sds011.zip size: 4357.02MB


  0%|          | 0.00/4.36G [00:00<?, ?B/s]

Unzipping data
Downloading https://archive.sensor.community/csv_per_month/2023-07/2023-07_bme280.zip to data/inputs/sensor_community/2023-07/2023-07_bme280.zip size: 2270.40MB


  0%|          | 0.00/2.27G [00:00<?, ?B/s]

ChunkedEncodingError: ('Connection broken: IncompleteRead(406061056 bytes read, 1864335850 more expected)', IncompleteRead(406061056 bytes read, 1864335850 more expected))