get data

pollutants

In [1]:
import pandas as pd
import requests
from io import StringIO
from datetime import datetime, timedelta

# Data source website; the station locations can also be found here: https://luftdaten.berlin.de/pollution/


def clean_station_data(df: pd.DataFrame, measurement: str) -> pd.DataFrame:
    """
    Reshape a raw wide station table into a tidy long-format table.

    Steps:
    - discard the first three rows (metadata rows of the export)
    - rename the first column to 'time'; every other column is one station
    - unpivot to one row per (time, station)
    - parse 'time' using the format '%d.%m.%Y %H:%M'
    - coerce 'value' to numeric (unparseable entries become NaN)
    - tag every row with the given measurement label

    Args:
        df: Raw table as read from the CSV export; it is not modified.
        measurement: Label stored in the 'measurement' column of every row.

    Returns:
        DataFrame with the columns 'time', 'station', 'value', 'measurement'.
    """
    first_column = df.columns[0]

    # Strip the metadata rows and give the timestamp column a fixed name
    readings = df.iloc[3:].copy().rename(columns={first_column: 'time'})

    # Wide -> long: one row per timestamp and station
    tidy = readings.melt(id_vars='time', var_name='station', value_name='value')

    # 'time' and 'value' are replaced in place, 'measurement' is appended last,
    # which yields the column order time, station, value, measurement
    return tidy.assign(
        time=lambda frame: pd.to_datetime(frame['time'], format='%d.%m.%Y %H:%M'),
        value=lambda frame: pd.to_numeric(frame['value'], errors='coerce'),
        measurement=measurement,
    )


def download_and_clean(pollutant: str, start: datetime, end: datetime,
                       timeout: float = 60.0) -> pd.DataFrame:
    """
    Downloads and cleans data for a given pollutant from Berlin Luftdaten.

    The period from `start` to `end` is requested in consecutive windows of at
    most 30 days. Each window is downloaded as a semicolon-separated CSV,
    cleaned with `clean_station_data` and the results are concatenated.

    Args:
        pollutant: Pollutant identifier used in the URL path (e.g. 'no2').
        start: First hour to request.
        end: Last hour to request. If `end` lies before `start`, nothing is
            downloaded.
        timeout: Timeout in seconds handed to `requests.get`, so a stalled
            connection raises instead of blocking forever.

    Returns:
        Long-format DataFrame with the columns 'time', 'station', 'value',
        'measurement'. It is empty (but has these columns) when no window
        could be downloaded.

    Note:
        A non-200 response is only reported via `print` and that window is
        skipped, so the result can silently miss part of the period.
    """
    # Only {pollutant} is filled in here; the date/hour placeholders are
    # filled in per window via str.format below.
    base_url = (
        f"https://luftdaten.berlin.de/core/{pollutant}.csv"
        "?stationgroup=all&period=1h&timespan=custom"
        "&start[date]={start_date}&start[hour]={start_hour}"
        "&end[date]={end_date}&end[hour]={end_hour}"
    )

    delta = timedelta(days=30)
    all_data = []

    current_start = start
    # `<=` rather than `<`: the next window starts one hour after the previous
    # window's end, i.e. windows are treated as inclusive on both sides. A
    # window that starts exactly at `end` therefore still has one hour to
    # fetch; with `<` that final hour (and the whole start == end case) was
    # silently dropped.
    while current_start <= end:
        current_end = min(current_start + delta, end)

        # Format dates
        start_date = current_start.strftime("%d.%m.%Y")
        start_hour = current_start.strftime("%H")
        end_date = current_end.strftime("%d.%m.%Y")
        end_hour = current_end.strftime("%H")

        # Build URL
        url = base_url.format(start_date=start_date, start_hour=start_hour,
                              end_date=end_date, end_hour=end_hour)

        print(f"Downloading {pollutant.upper()} data from {start_date} {start_hour} to {end_date} {end_hour}")

        # Download CSV
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            df = pd.read_csv(StringIO(response.text), sep=";")
            all_data.append(clean_station_data(df, pollutant))
        else:
            print(f"Error downloading {pollutant}: {response.status_code}")

        # Skip one hour ahead so consecutive windows do not overlap
        current_start = current_end + timedelta(hours=1)

    if all_data:
        return pd.concat(all_data, ignore_index=True)
    return pd.DataFrame(columns=['time', 'station', 'value', 'measurement'])


# Reporting period: first and last hour to request
start = datetime(2024, 1, 1, 0)
end = datetime(2024, 12, 31, 23)

# Fetch one cleaned long-format frame per pollutant
pollutants = ['no2', 'pm10', 'o3']
dfs = list(map(lambda name: download_and_clean(name, start, end), pollutants))

# Stack the per-pollutant frames into a single long dataframe
full_df = pd.concat(dfs, ignore_index=True)

print(full_df.head(20))


Downloading NO2 data from 01.01.2024 00 to 31.01.2024 00
Downloading NO2 data from 31.01.2024 01 to 01.03.2024 01
Downloading NO2 data from 01.03.2024 02 to 31.03.2024 02
Downloading NO2 data from 31.03.2024 03 to 30.04.2024 03
Downloading NO2 data from 30.04.2024 04 to 30.05.2024 04
Downloading NO2 data from 30.05.2024 05 to 29.06.2024 05
Downloading NO2 data from 29.06.2024 06 to 29.07.2024 06
Downloading NO2 data from 29.07.2024 07 to 28.08.2024 07
Downloading NO2 data from 28.08.2024 08 to 27.09.2024 08
Downloading NO2 data from 27.09.2024 09 to 27.10.2024 09
Downloading NO2 data from 27.10.2024 10 to 26.11.2024 10
Downloading NO2 data from 26.11.2024 11 to 26.12.2024 11
Downloading NO2 data from 26.12.2024 12 to 31.12.2024 23
Downloading PM10 data from 01.01.2024 00 to 31.01.2024 00
Downloading PM10 data from 31.01.2024 01 to 01.03.2024 01
Downloading PM10 data from 01.03.2024 02 to 31.03.2024 02
Downloading PM10 data from 31.03.2024 03 to 30.04.2024 03
Downloading PM10 data from 

In [2]:
# Persist the combined pollutant table.
# DataFrame.to_csv does not create missing parent directories, so make sure
# ./data exists first; otherwise the export fails with an OSError after the
# whole download has already run.
from pathlib import Path

pollutants_csv = Path("./data/2024-pollutants.csv")
pollutants_csv.parent.mkdir(parents=True, exist_ok=True)
# The row index is written too (pandas default index=True).
full_df.to_csv(pollutants_csv)

weather

In [3]:
import pandas as pd
from wetterdienst.provider.dwd.observation import DwdObservationRequest
import datetime as dt

# REFERENCE! https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/
# API version 0.113.0
# NOTE(review): "API version" presumably refers to the installed wetterdienst release -- confirm and pin it.
# Berlin station IDs (zero-padded strings, passed to filter_by_station_id below)
berlin_stations = ("00399", "00400", "00403", "00410", "00420", "00424", "00427", "00430", "00433")

# Map of parameters to dataset codes
# Each entry is a 2-tuple; presumably (resolution, dataset name) as expected by
# DwdObservationRequest -- TODO confirm against the wetterdienst documentation.
parameters = [
    ("hourly", "temperature_air"),
    ("hourly", "pressure"),  # Air pressure
    ("hourly", "moisture"),  # Humidity
    ("hourly", "precipitation"),  # precipitation height
    ("hourly", "wind_synop")
]

# Build the observation request for the datasets above over the year 2024.
# NOTE(review): whether end_date is inclusive (i.e. whether 31 Dec is fully covered) is not visible here -- confirm.
request = DwdObservationRequest(
    parameters = parameters,
    start_date="2024-01-01",
    end_date="2024-12-31"
)
# Restrict the request to the Berlin stations listed above.
stations = request.filter_by_station_id(station_id=berlin_stations)
# Fetch all values for those stations and drop null entries.
# NOTE(review): `.drop_nulls()` suggests `.df` is a polars DataFrame rather than a pandas one -- confirm.
df = stations.values.all().df.drop_nulls()
# Bare expression: only rendered by Jupyter if it is the last statement of its cell.
df

# Remember the column names; they are re-assigned after the pandas conversion in the next cell.
cols = df.columns

In [4]:
from pathlib import Path

# Wrap the weather table from the previous cell in a pandas DataFrame.
# NOTE(review): if the source frame is a polars DataFrame, this conversion
# presumably loses the column names (hence the re-assignment below) -- confirm.
df = pd.DataFrame(df)

# Restore the column names captured before the conversion
df.columns = cols

# DataFrame.to_csv does not create missing parent directories, so make sure
# ./data exists first; otherwise the export fails with an OSError.
weather_csv = Path("./data/2024-weather.csv")
weather_csv.parent.mkdir(parents=True, exist_ok=True)
# The row index is written too (pandas default index=True).
df.to_csv(weather_csv)

Citizen science data — done on the cluster.

https://archive.sensor.community/csv_per_month/

In [5]:
import zipfile
from pathlib import Path

import requests  # imported here so this cell also runs on a fresh kernel

# ==============================
# CONFIGURATION
# ==============================
BASE_URL = "https://archive.sensor.community/csv_per_month"
DATA_DIR = Path("data/citsci")
DATA_DIR.mkdir(parents=True, exist_ok=True)

SENSOR_TYPE = "sds011"
YEAR = 2024

# ==============================
# DOWNLOAD LOOP
# ==============================
# One monthly archive per iteration. Each archive is streamed into a
# "<name>.part" file and only renamed to its final name once the download
# finished, so an interrupted run (including Ctrl-C / kernel interrupt) can
# never leave a truncated file that looks like a completed download.
failed = []
for month in range(1, 13):
    month_str = f"{month:02d}"
    url = f"{BASE_URL}/{YEAR}-{month_str}/{YEAR}-{month_str}_{SENSOR_TYPE}.zip"
    dest = DATA_DIR / f"{YEAR}-{month_str}_{SENSOR_TYPE}.zip"
    partial = dest.with_name(dest.name + ".part")

    if dest.exists():
        # is_zipfile only inspects the end of the file, so this stays cheap
        # even for large archives. It catches truncated leftovers written by
        # earlier, interrupted runs.
        if zipfile.is_zipfile(dest):
            print(f"✅ Already downloaded: {dest.name}")
            continue
        print(f"⚠️ Existing file is not a valid zip, downloading again: {dest.name}")

    print(f"⬇️  Downloading {url} ...")
    try:
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        # Publish the finished file under its final name
        partial.replace(dest)
        print(f"💾 Saved to {dest}")
    except Exception as e:
        # Best effort: report, clean up the incomplete file and continue with
        # the next month.
        partial.unlink(missing_ok=True)
        failed.append(dest.name)
        print(f"⚠️ Failed to download {url}: {e}")

if failed:
    print(f"⚠️ {len(failed)} download(s) failed: {', '.join(failed)}")
else:
    print("✅ All downloads complete.")


⬇️  Downloading https://archive.sensor.community/csv_per_month/2024-01/2024-01_sds011.zip ...
💾 Saved to data/citsci/2024-01_sds011.zip
⬇️  Downloading https://archive.sensor.community/csv_per_month/2024-02/2024-02_sds011.zip ...


KeyboardInterrupt: 