This script retrieves air quality data ("CO", "NO2", "O3", "PM10", "SO2") from the German Environment Agency (Umweltbundesamt) for all measurement stations.
You can adjust the time **period** parameter:
- "1SMW" -> hourly average
- "1TMW" -> daily average

In [None]:
import time
from datetime import datetime
from datetime import date
import pandas as pd
import requests
import io
from random import randint
import config

import sqlalchemy
%matplotlib inline

In [None]:
# SQLAlchemy engine for the SQLite database file located at config.DB_PATH;
# shared by every cell that reads from or writes to the database.
engine = sqlalchemy.create_engine(f"sqlite:///{config.DB_PATH}")

## set time period

In [None]:
# Averaging period requested from the API: "1SMW" = hourly, "1TMW" = daily.
period = "1SMW"

## get stations list

In [None]:
def create_stations_list():
    """Download the station index and store it in the ``stations`` SQL table.

    Fetches the JSON document served at the "stations/limits" endpoint,
    builds a DataFrame from its ``stations_idx`` entry, labels the six
    columns, and writes the result through the module-level ``engine``,
    replacing any existing ``stations`` table.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond in time.
    """
    stations_limits_url = "https://www.umweltbundesamt.de/js/uaq/data/stations/limits"

    # A timeout keeps a stalled connection from hanging the notebook forever,
    # and raise_for_status() surfaces HTTP errors directly instead of as a
    # confusing JSON-decoding failure on an error page.
    response = requests.get(stations_limits_url, timeout=60)
    response.raise_for_status()
    payload = response.json()

    # Local name differs from the notebook-level ``df_stations`` on purpose,
    # so this function never appears to touch that global.
    stations = pd.DataFrame(payload['stations_idx'])
    stations.columns = ['idx', 'name', 'region', 'region_name', 'longitude', 'latitude']
    stations = stations.set_index('idx')

    stations.to_sql(name='stations', con=engine, if_exists='replace')

## get data

In [None]:
# Pollutant codes this notebook is meant to retrieve.
components_idx= ["CO", "NO2", "O3", "PM10", "SO2"]
# Load the 'stations' table (the one create_stations_list() writes).
df_stations = pd.read_sql_table('stations', engine)

# Optional filter: keep only the Berlin ('BE') and Brandenburg ('BB') stations.
#df_stations = df_stations[df_stations.region.isin(['BE','BB'])]
# Show (rows, columns) of the station table as the cell output.
df_stations.shape

In [None]:
#set timestamp
def set_ts(y,m,d):
    """Return the Unix timestamp, in whole seconds, of 07:00:00 on a given day.

    The date is a naive datetime, so it is interpreted in the local timezone
    of the machine running the code.

    Args:
        y: Year (int, or a string of digits).
        m: Month, 1-12 (int, or a string of digits).
        d: Day of the month (int, or a string of digits).

    Returns:
        The timestamp as an ``int`` number of seconds since the epoch.

    Raises:
        ValueError: if the values do not form a valid calendar date.
    """
    # Build the datetime directly instead of formatting a string and parsing
    # it back; int() keeps digit-string arguments working as before.
    moment = datetime(int(y), int(m), int(d), 7, 0, 0)
    return int(moment.timestamp())

def set_url(station_id,pollutant, ts_from,ts_to,period):
    """Assemble the CSV download URL for one station, pollutant and time range.

    Args:
        station_id: Station code placed in the ``station[]`` parameter.
        pollutant: Pollutant code placed in the ``pollutant[]`` parameter.
        ts_from: Start of the range, first value of ``range[]``.
        ts_to: End of the range, second value of ``range[]``.
        period: Averaging period placed in the ``scope[]`` parameter.

    Returns:
        The full request URL as a string (values are inserted verbatim,
        without URL-encoding).
    """
    endpoint = 'https://www.umweltbundesamt.de/uaq/csv/stations/data'
    query_parts = (
        f'station[]={station_id}',
        f'pollutant[]={pollutant}',
        f'scope[]={period}',
        'group[]=station',
        f'range[]={ts_from},{ts_to}',
    )
    return endpoint + '?' + '&'.join(query_parts)

def get_station_data(station_id,pollutant,period):
    """Download one station's measurements for a pollutant as a DataFrame.

    The requested range runs from 2016-01-01 to today (both stamped by
    ``set_ts``). The request URL and the shape of the parsed frame are
    printed as progress output.

    Args:
        station_id: Station code to query.
        pollutant: Pollutant code to query.
        period: Averaging period passed on to ``set_url``.

    Returns:
        A DataFrame parsed from the semicolon-separated CSV response.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond in time.
    """
    # Read the date once so year, month and day all come from the same day,
    # even if the call straddles midnight.
    today = date.today()
    ts_from = set_ts(2016, 1, 1)
    ts_to = set_ts(today.year, today.month, today.day)
    url = set_url(station_id, pollutant, ts_from, ts_to, period)
    print(url)

    # Bound the wait and fail loudly on HTTP errors, so an error page is
    # never parsed as if it were CSV data.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    df_data = pd.read_csv(io.StringIO(response.text), sep=';')
    print(df_data.shape)
    return df_data

In [None]:
def get_data_pollutant(pollutant,period):
    """Download one pollutant for every known station and append it to SQL.

    Iterates over the unique ``idx`` values of the module-level
    ``df_stations`` frame, fetches each station with ``get_station_data``
    and appends non-empty results to the table
    ``dwd_data_<pollutant>_<period>`` through the module-level ``engine``.
    A failure on one station is printed and does not stop the loop.

    Args:
        pollutant: Pollutant code to download.
        period: Averaging period passed on to ``get_station_data``.
    """
    table_name = f'dwd_data_{pollutant}_{period}'
    rows_stored = 0

    for station_id in df_stations.idx.unique():
        try:
            data = get_station_data(station_id, pollutant, period)
            if len(data) > 0:
                # NOTE(review): only the station code and timestamp columns
                # are kept here, which appears to drop the measured value --
                # confirm the value column name and add it to this list.
                # rename_axis keeps the index column stored as 'index'.
                data = data[['Stationscode', 'Zeit']].rename_axis('index')
                data.to_sql(name=table_name, con=engine, if_exists='append')
                rows_stored += len(data)
        except Exception as e:
            # Deliberately best-effort: report which station failed and go on.
            print(f'{station_id}: {e}')
        finally:
            # Short pause between requests, whether or not this one succeeded.
            time.sleep(.1)

    # Report what was actually written (the previous summary printed the
    # shape of a frame that was never filled, i.e. always "(0, 0)").
    print(f'{pollutant}:{rows_stored} rows stored')
    

In [None]:
#for pollutant in components_idx:
    #get_data_pollutant(pollutant,period) 

In [None]:
# Download and store the PM10 measurements for every station in df_stations.
get_data_pollutant('PM10',period)

In [None]:
# List every table currently stored in the SQLite database.
# Engine.execute() was removed in SQLAlchemy 2.0; running the statement on an
# explicit connection with text() works on both 1.4 and 2.0.
with engine.connect() as conn:
    res = conn.execute(sqlalchemy.text("SELECT name FROM sqlite_master WHERE type='table';"))

    for r in res:
        print(r[0])

In [None]:
# Fetch a single station/pollutant directly to inspect the raw frame.
data = get_station_data('DETH091','PM10',period)

In [None]:
# Show the column names of the frame downloaded in the previous cell.
data.columns