In [None]:
# import necessary libraries/functions first
# for web scraping in general
import requests                                         # for sending HTTP/1.1 requests easily
from bs4 import BeautifulSoup                           # for scraping information from website 
import pandas as pd                                     # for dataframe handling
import time                                             # insert extra pauses while scraping

# selenium to scrape more nested data
from selenium import webdriver                          # webdriver 
from selenium.webdriver.common.by import By             # By-Function
from selenium.webdriver.common.keys import Keys         # to work via XPATH oder CSS
from selenium.webdriver.chrome.options import Options   # for webdriver options
from selenium.webdriver.chrome.service import Service   # needed if you want to operate headless

In [None]:
# define function navigating to target website of latest groundwater level data
# scrape latest data and extract all known groundwater stations
### you can easily adjust this part to other parameters published on the website by changing the URL (e.g. surface water level)

def load_current_data(url='https://wasserportal.berlin.de/messwerte.php?anzeige=tabelle&thema=gws&nstoffid=10',
                      timeout=30):
    """Scrape the table of current measurements and return it as a DataFrame.

    Parameters
    ----------
    url : str, optional
        Page containing the ``<table id="pegeltab">`` to scrape. Defaults to
        the groundwater level table. Pass another URL to scrape a different
        parameter, e.g. surface water level:
        'https://wasserportal.berlin.de/messwerte.php?anzeige=tabelle&thema=ows'
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per table row, one column per ``<th>`` header, all values as
        the raw cell text. The chart-link column ('Ganglinie' / 'Ganglinien')
        is removed when present.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no table with id 'pegeltab'.
    """
    # A timeout keeps the notebook from hanging forever on a dead connection;
    # raise_for_status() stops us from parsing an error page as if it were data.
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()

    # Parse the HTML with the lxml parser
    soup = BeautifulSoup(page.text, 'lxml')

    raw_table = soup.find("table", id="pegeltab")
    if raw_table is None:
        raise ValueError(f"No table with id 'pegeltab' found at {url}")

    # Column titles come from the <th> tags; drop hyphens and collapse double spaces
    headers = [th.text.replace('-', '').replace('  ', ' ')
               for th in raw_table.find_all("th")]

    # Collect all rows first and build the DataFrame once
    # (appending row by row via .loc copies the frame on every insert).
    rows = []
    for table_row in raw_table.find_all("tr")[1:]:
        cells = [td.text for td in table_row.find_all("td")]
        if cells:  # skip rows that carry no <td> cells (e.g. extra header rows)
            rows.append(cells)

    df_current_data = pd.DataFrame(rows, columns=headers)

    # The chart-link column is named 'Ganglinie' or 'Ganglinien' depending on
    # the table being scraped; drop whichever is present without failing.
    df_current_data_clean = df_current_data.drop(columns=['Ganglinie', 'Ganglinien'],
                                                 errors='ignore')
    return df_current_data_clean

# run the function once to fetch the current data table (this performs an HTTP request)
df_current_data_clean = load_current_data()

# preview the first rows as a quick check of the scraped table
df_current_data_clean.head()

In [None]:
# define function to extract  stations
def find_stations(df_current_data_clean):
    """Return the 'Messstellennummer' column of the given table.

    Parameters
    ----------
    df_current_data_clean : pandas.DataFrame
        Table that contains a 'Messstellennummer' column.

    Returns
    -------
    pandas.Series
        The 'Messstellennummer' column, index unchanged.
    """
    return df_current_data_clean['Messstellennummer']

# run the function to pull the station numbers out of the scraped table
stations = find_stations(df_current_data_clean)

# preview the first station numbers
stations.head()

In [None]:
# Set some Selenium Chrome options
HEADLESS = False  # change to True if you do not want to see the steps done on screen

chromeOptions = Options()
if HEADLESS:
    # The Options.headless attribute is deprecated/removed in newer Selenium 4
    # releases; passing the browser argument is the supported way.
    chromeOptions.add_argument("--headless")

# define path where to save the downloaded files
prefs = {"download.default_directory" : "/your/favorite/path"}
chromeOptions.add_experimental_option("prefs", prefs)

# create driver object
# The chromedriver location is passed via a Service object: the old
# `executable_path=` keyword of webdriver.Chrome was deprecated in Selenium 4
# and later removed, where it raises a TypeError.
chrome_service = Service(executable_path='/path/to/your/chromdriver/chromedriver.exe')
driver = webdriver.Chrome(service=chrome_service,
                          options=chromeOptions)

# define function to navigate through the website and click the download button
# if you want to download other data, you have to check the XPATH of the buttons
def download_via_browser(Download_URL):
    """Open a station page in the browser and click through to the download.

    Uses the module-level ``driver``. After loading ``Download_URL`` it clicks
    two elements in order: the link that opens the download section, then the
    download button inside the form with id 'form_gw_c'.

    Parameters
    ----------
    Download_URL : str
        URL of the station page to open.
    """
    pause_seconds = 2  # be polite to the host: short break before each action
    click_targets = (
        "/html/body/div[2]/div/div/div/div[4]/div[2]/a[2]",  # link to the download section
        '//*[@id="form_gw_c"]/button',                       # download button
    )

    # let element lookups wait up to 10 s for the page to provide them
    driver.implicitly_wait(10)
    time.sleep(pause_seconds)
    driver.get(Download_URL)
    print ("starting Driver")

    # locate each target, pause, then click it - in exactly this order
    for xpath in click_targets:
        element = driver.find_element(by = By.XPATH, value = xpath)
        time.sleep(pause_seconds)
        element.click()

# run the download for every station, one after another (we don't care about speed here)
# Iterate over the values directly: looking each value up again by its index
# label returns a Series instead of a single value if labels are not unique.
for station_no in stations:
    url_no = str(station_no).strip()  # scraped cell text may carry surrounding whitespace
    if not url_no:
        continue  # nothing to download for an empty station number
    Download_URL = "https://wasserportal.berlin.de/station.php?anzeige=d&thema=gws&station=" + url_no
    download_via_browser(Download_URL)

