# Scraping Data from Deutscher Wetterdienst (DWD)
CDC = Climate Data Center

We are only going to look at the **recent** data provided by DWD, which includes the last 500 days until yesterday.

In [96]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from zipfile import ZipFile
import os

In [32]:
# Directory prefix (note the trailing slash) under which the downloaded
# archives are written and unpacked; it is concatenated directly with file names.
DATA_DIR = "retrieved_data/"

In [5]:
# Directory listing that holds the hourly air-temperature archives ("recent" period).
URL_hourly_recent = "https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/air_temperature/recent/"
# The station description file lives in that same directory.
URL_station_info = URL_hourly_recent + "TU_Stundenwerte_Beschreibung_Stationen.txt"

## Load the list of weather stations

In [86]:
def parse_weather_station_info(text):
    """Parse the lines of the station description file into a DataFrame.

    Parameters
    ----------
    text : list of str
        The file content split into lines. The first two entries are
        skipped (presumably the column header and a separator line --
        confirm against the downloaded file).

    Returns
    -------
    pandas.DataFrame
        One row per station with the columns ``station_id``, ``start_date``,
        ``end_date``, ``altitude``, ``latitude``, ``longitude``, ``name`` and
        ``state``. All values are kept as the strings found in the file.

    Notes
    -----
    Blank or whitespace-only lines are skipped. Every other line is split on
    whitespace: the first six tokens are the fixed fields, the last token is
    the state, and everything in between is joined into the station name.
    NOTE(review): this assumes the state is a single token and the last
    column of the file -- verify against the current file layout.
    """
    data = []
    for line in text[2:]:
        e = line.split()
        if not e:
            # Blank line (e.g. trailing newline padding): nothing to parse.
            continue

        station_id = e.pop(0)
        start_date = e.pop(0)
        end_date = e.pop(0)
        altitude = e.pop(0)
        latitude = e.pop(0)
        longitude = e.pop(0)
        state = e.pop(-1)
        # Whatever is left between the fixed fields and the state is the
        # (possibly multi-word) station name.
        station_name = " ".join(e)

        row = [station_id, start_date, end_date, altitude, latitude, longitude, station_name, state]
        data.append(row)

    columns = ["station_id", "start_date", "end_date", "altitude", "latitude", "longitude", "name", "state"]
    df = pd.DataFrame(data, columns=columns)
    return df

# Fetch the station description file and parse it into a DataFrame.
req = requests.get(URL_station_info)
# Stop on an HTTP error instead of parsing an error page as station data.
req.raise_for_status()
lines = req.text.splitlines()
df = parse_weather_station_info(lines)

## Scrape all zip links that are available

In [88]:
def scrape_product_links(soup):
    """Collect the hrefs of all hourly-temperature archives on the page.

    Parameters
    ----------
    soup : object
        A parsed HTML document offering ``find_all("a")``, whose results
        offer ``get("href")`` (e.g. a ``BeautifulSoup`` instance).

    Returns
    -------
    list of str
        The href values, in document order, that start with
        ``"stundenwerte_TU_"`` and end with ``"_akt.zip"``.
    """
    links = []
    for a in soup.find_all("a"):
        ref = a.get("href")
        # Anchors without an href attribute yield None; skip them instead of
        # failing on None.startswith.
        if ref and ref.startswith("stundenwerte_TU_") and ref.endswith("_akt.zip"):
            links.append(ref)
    return links

# Fetch the directory listing and extract the archive file names from it.
req = requests.get(URL_hourly_recent)
# Stop on an HTTP error instead of scraping links from an error page.
req.raise_for_status()
soup = BeautifulSoup(req.content, "html.parser")
file_urls = scrape_product_links(soup)
    

## Download all zip files

In [97]:
def download_all_product_files(file_urls):
    """Download every archive and unpack it into its own folder in DATA_DIR.

    For each entry the archive is fetched from ``URL_hourly_recent``, saved
    below ``DATA_DIR``, extracted into a folder named after the archive
    (without its extension), and then deleted so that only the extracted
    content remains. Progress is printed every 50 files and after the last.

    Parameters
    ----------
    file_urls : list of str
        Archive file names relative to ``URL_hourly_recent``.

    Raises
    ------
    requests.HTTPError
        If the server answers a download request with an error status.

    Notes
    -----
    Failures while unpacking one archive are reported with the file name and
    do not abort the remaining downloads; the archive is then left on disk.
    The function can be re-run: existing target folders are reused.
    """
    # Make sure the target directory exists so the first write cannot fail.
    os.makedirs(DATA_DIR, exist_ok=True)

    total = len(file_urls)
    for i, url in enumerate(file_urls, start=1):
        req = requests.get(URL_hourly_recent + url)
        # Fail loudly rather than saving an HTTP error page as a ".zip".
        req.raise_for_status()

        zip_path = os.path.join(DATA_DIR, url)
        with open(zip_path, "wb") as file:
            file.write(req.content)

        # unzip the file and only keep the extracted content
        dirname = os.path.join(DATA_DIR, os.path.splitext(url)[0])
        try:
            # exist_ok keeps re-runs working when the folder is already there.
            os.makedirs(dirname, exist_ok=True)
            with ZipFile(zip_path, "r") as zippy:
                zippy.extractall(dirname)
            # Delete only after the archive is closed; removing an open file
            # fails on Windows.
            os.remove(zip_path)
        except Exception as e:
            # Best effort per file: report which archive failed and carry on.
            print("%s: %s" % (url, e))

        if i % 50 == 0 or i == total:
            print("%d of %d files downloaded" % (i, total))
    
# Download and unpack every scraped archive (one HTTP request per file;
# progress is printed every 50 files).
download_all_product_files(file_urls)

50 of 508 files downloaded
100 of 508 files downloaded
150 of 508 files downloaded
200 of 508 files downloaded
250 of 508 files downloaded
300 of 508 files downloaded
350 of 508 files downloaded
400 of 508 files downloaded
450 of 508 files downloaded
500 of 508 files downloaded
508 of 508 files downloaded


## Extract temperature and humidity data from all station data
Now, for every available weather station, there is a folder containing the measurements as well as metadata about that station.