# get_stations

# Code for KNMI Monthly Station Data Analysis

Written by Cecile Dai.

Uses information from https://climexp.knmi.nl/about.cgi?id=someone@somewhere

## Setup

In [174]:
# Imports
import numpy as np
import pandas as pd
import sys

from bs4 import BeautifulSoup
from requests_html import HTMLSession
from pprint import pprint

# Link to the KNMI page where you select the stations and data to view a monthly time series.
# The `id` query parameter carries the same value that the form's hidden `email` field submits.
station_link = 'https://climexp.knmi.nl/selectstation.cgi?id=someone@somewhere'

# Initialize one HTTP session so data/cookies are persisted; the later cells that
# fetch the form page and submit the form all go through this object.
session = HTMLSession()

Parse all form tags from a web page.

In [175]:
# Collect every <form> element found on a web page
def get_all_forms(url):
    """Download `url` through the shared `session` and return all of its <form> tags.

    The page is fetched with a GET request, its HTML is parsed with
    BeautifulSoup's "html.parser" backend, and the result of
    `find_all("form")` is returned.
    """
    response = session.get(url)
    page = BeautifulSoup(response.html.html, "html.parser")
    forms_on_page = page.find_all("form")
    return forms_on_page

Get all form details.

In [176]:
# Returns the HTML details of a form, including action, method and list of form controls (inputs, etc)
def get_form_details(form):
    """Summarise a parsed HTML <form> as a plain dictionary.

    Parameters
    ----------
    form
        A parsed form element exposing ``attrs`` and ``find_all`` (the
        BeautifulSoup ``Tag`` interface); its <textarea> children must also
        provide ``get_text``.

    Returns
    -------
    dict
        ``{"action": str, "method": str, "inputs": list[dict]}``.
        Each entry of ``inputs`` has ``type``, ``name`` and ``value`` keys;
        <select> entries additionally carry ``values`` (all non-empty option
        values). Entries are ordered: all <input>s, then <select>s, then
        <textarea>s.
    """
    details = {}
    # get the form action (requested URL); a form without an action attribute
    # yields "" instead of crashing on None.lower()
    action = (form.attrs.get("action") or "").lower()
    # get the form method (POST, GET, DELETE, etc)
    # if not specified, GET is the default in HTML
    method = form.attrs.get("method", "get").lower()

    # collect every <input> with its type (default "text"), name and default value
    inputs = []
    for input_tag in form.find_all("input"):
        inputs.append({
            "type": input_tag.attrs.get("type", "text"),
            "name": input_tag.attrs.get("name"),
            "value": input_tag.attrs.get("value", ""),
        })

    # collect every <select> together with its options and default choice
    for select in form.find_all("select"):
        select_options = []
        select_default_value = ""
        for select_option in select.find_all("option"):
            # the option value used to submit the form; options without one are skipped
            option_value = select_option.attrs.get("value")
            if option_value:
                select_options.append(option_value)
                # `selected` is a boolean attribute: when written bare (<option selected>)
                # its parsed value is "", so test for presence rather than truthiness
                if "selected" in select_option.attrs:
                    select_default_value = option_value
        if not select_default_value and select_options:
            # no explicit default: fall back to the first option
            select_default_value = select_options[0]
        inputs.append({
            "type": "select",
            "name": select.attrs.get("name"),
            "values": select_options,
            "value": select_default_value,
        })

    # collect every <textarea>; its default value is the element's text content,
    # not a `value` attribute (kept only as a fallback for non-standard markup)
    for textarea in form.find_all("textarea"):
        textarea_value = textarea.get_text() or textarea.attrs.get("value", "")
        inputs.append({
            "type": "textarea",
            "name": textarea.attrs.get("name"),
            "value": textarea_value,
        })

    # put everything to the resulting dictionary
    details["action"] = action
    details["method"] = method
    details["inputs"] = inputs
    return details

Run the following block to print out form details, with input types and values.

In [177]:
# get all form tags
forms = get_all_forms(station_link)
# iterate over forms and print the action, method and controls of each one.
# NOTE: `form_details` stays bound to the LAST form on the page after the loop;
# the form-submission cell further down reads its "action" and "method".
for i, form in enumerate(forms, start=1):
    form_details = get_form_details(form)
    print("="*50, f"form #{i}", "="*50)
    print("action: {action}\n\n".format(action=form_details["action"]))
    print("method: {method}\n\n".format(method=form_details["method"]))
    # named `form_input` (not `input`) so the built-in input() is not shadowed
    # for the rest of the kernel session
    for form_input in form_details["inputs"]:
        print("input: {i}\n".format(i=form_input))

action: getstations.cgi


method: post


input: {'type': 'hidden', 'name': 'email', 'value': 'someone@somewhere'}

input: {'type': 'radio', 'name': 'climate', 'value': 'precipitation'}

input: {'type': 'radio', 'name': 'climate', 'value': 'precipitation_all'}

input: {'type': 'radio', 'name': 'climate', 'value': 'sealev'}

input: {'type': 'radio', 'name': 'climate', 'value': 'temperature'}

input: {'type': 'radio', 'name': 'climate', 'value': 'temperature_all'}

input: {'type': 'radio', 'name': 'climate', 'value': 'runoff'}

input: {'type': 'radio', 'name': 'climate', 'value': 'min_temperature'}

input: {'type': 'radio', 'name': 'climate', 'value': 'min_temperature_all'}

input: {'type': 'radio', 'name': 'climate', 'value': 'streamflow'}

input: {'type': 'radio', 'name': 'climate', 'value': 'max_temperature'}

input: {'type': 'radio', 'name': 'climate', 'value': 'max_temperature_all'}

input: {'type': 'radio', 'name': 'climate', 'value': 'snow'}

input: {'type': 'radio', 'name': 'clima

## Create Submission Form

Start off by noting the submission values you want. For the purposes of only getting a monthly station list in a certain area for precipitation, only "radio: climate = precipitation_all", "text: lat1", "text: lat2", "text: lon1", and "text: lon2" will be filled out. These inputs select the precipitation measurements under GHCN-M (all) for all stations within a selected latitude and longitude area.

In [178]:
# Imports
from bs4 import BeautifulSoup
from pprint import pprint
from urllib.parse import urljoin
import webbrowser


# The climate variable ("precipitation_all") is selected in the form payload cell below.

# Define desired latitude and longitude bounds of the search box. Change this for different areas.
# lat1/lat2 and lon1/lon2 are submitted as the form fields of the same names and are
# also used to build the name of the exported CSV file.
lat1, lat2 = 10, 15
lon1, lon2 = 10, 15

In [179]:
# The data body we want to submit, written as one literal so the whole payload
# is visible at a glance. Only the climate variable and the lat/lon box carry
# real choices; the remaining fields are left blank or at the values below.
data = {
    "email": "someone@somewhere",
    "climate": "precipitation_all",
    "name": "",
    "num": 10,
    "lat": "",
    "lon": "",
    "lat1": lat1,
    "lat2": lat2,
    "lon1": lon1,
    "lon2": lon2,
    "list": "# lon1 lon2 lat1 lat2 (optional)\nstation number (one per line)",
    "min": 10,
    "sum": 1,
    "month": -1,
    "yr1": "",
    "yr2": "",
    "dist": "",
    "elevmin": "",
    "elevmax": "",
}

print(data)

{'email': 'someone@somewhere', 'climate': 'precipitation_all', 'name': '', 'num': 10, 'lat': '', 'lon': '', 'lat1': 10, 'lat2': 15, 'lon1': 10, 'lon2': 15, 'list': '# lon1 lon2 lat1 lat2 (optional)\nstation number (one per line)', 'min': 10, 'sum': 1, 'month': -1, 'yr1': '', 'yr2': '', 'dist': '', 'elevmin': '', 'elevmax': ''}


Submit form with desired inputs and extract relevant information from HTTP response.

In [180]:
# join the url with the action (form request URL)
url = urljoin(station_link, form_details["action"])

# pprint(data)
# Submit form with the HTTP method the form declares and get response in res
if form_details["method"] == "post":
    res = session.post(url, data=data)
elif form_details["method"] == "get":
    res = session.get(url, params=data)
elif form_details["method"] == "put":
    res = session.put(url, params=data)
else:
    # without this branch `res` would simply be undefined further down
    raise ValueError("Unsupported form method: {m!r}".format(m=form_details["method"]))

soup = BeautifulSoup(res.content, "html.parser")

import re

# The station listing sits between the end of the search form and the next
# right-hand column <div>; locate both markers in the raw response text.
start = re.search('</div>\n</form>', res.text)
if start is None:
    raise ValueError(
        "Could not find the end of the search form in the response "
        "(HTTP status {code}); the page layout may have changed.".format(code=res.status_code)
    )

# Look for the closing marker only AFTER the opening one, so an earlier
# occurrence of the same markup cannot produce an empty or reversed slice.
stop = re.compile('</div>\n<div class="col-md-4">').search(res.text, start.end())
if stop is None:
    raise ValueError(
        "Could not find the end of the station listing in the response "
        "(HTTP status {code}); the page layout may have changed.".format(code=res.status_code)
    )

# Match objects expose their offsets directly: slice from the end of the
# opening marker up to the start of the closing marker.
l1 = start.end()
l2 = stop.start()

relevant_text = res.text[l1:l2]

print(relevant_text)


Searching for stations in  10.00N: 15.00N,   10.00E:  15.00E<br>
Requiring at least   10 years with data<br>
Found      10 stations<br>
<br>
MAROUA-SALAK        CAMEROON  (CAMEROON)<br>
coordinates:  10.50N,   14.30E,  423m<br>
WMO station code: 64851 (<a href="getprcpall.cgi?id=someone@somewhere&WMO=64851&STATION=MAROUA-SALAK&extraargs=">get data</a>)<br>
Found   49 years with data in 1951-2000<br>
<br>
KAELE               CAMEROON  (CAMEROON)<br>
coordinates:  10.10N,   14.50E,  386m<br>
Near WMO station code: 64851.1 (<a href="getprcpall.cgi?id=someone@somewhere&WMO=64851.1&STATION=KAELE&extraargs=">get data</a>)<br>
Found   39 years with data in 1951-1989<br>
<br>
BOL BEREM           CHAD      (CHAD)<br>
coordinates:  13.40N,   14.70E,  292m<br>
WMO station code: 64702 (<a href="getprcpall.cgi?id=someone@somewhere&WMO=64702&STATION=BOL_BEREM&extraargs=">get data</a>)<br>
Found   65 years with data in 1908-1990<br>
<br>
GOURE               NIGER     (NIGER)<br>
coordinates:  14.00N

Parse through data to create a dataframe.

In [181]:
# Split the listing into its <br>-separated lines (newlines inside the HTML carry no meaning here)
lines = relevant_text.replace('\n', '').split('<br>')

# Column order of the resulting table
header = ["Station Name", "Latitude", "Longitude", "Elevation", "Station Code", "Starting Year", "Ending Year", "Monthly Series Link"]

# Skip the first three lines (search summary) and drop blank separator lines
stripped_lines = [line for i, line in enumerate(lines) if i >= 3 and line.strip()]

# Every station is described by exactly four consecutive non-blank lines:
# name, coordinates/elevation, station code + data link, available years
LINES_PER_STATION = 4
if len(stripped_lines) % LINES_PER_STATION != 0:
    raise ValueError(
        "Expected {n} lines per station but got {total} non-blank lines; "
        "the listing format may have changed.".format(n=LINES_PER_STATION, total=len(stripped_lines))
    )


def _require_match(pattern, text, what):
    """Return the first regex match of `pattern` in `text`, or raise a ValueError naming `what`."""
    found = re.search(pattern, text)
    if found is None:
        raise ValueError("Could not find {w} in station listing line: {t!r}".format(w=what, t=text))
    return found


station_records = []
for first in range(0, len(stripped_lines), LINES_PER_STATION):
    name_line, coord_line, code_line, years_line = stripped_lines[first:first + LINES_PER_STATION]

    # e.g. "coordinates:  10.50N,   14.30E,  423m" -> "10.50N", "14.30E", "423m"
    # (S / W suffixes are tolerated too, in case the server emits them - TODO confirm)
    coords = _require_match(r'(\S+[NS]),\s*(\S+[EW]),\s*(.*m)', coord_line, "coordinates")

    # first run of digits/dots on the line, e.g. "64851" or "64851.1"
    code = _require_match(r'[0-9.]+', code_line, "station code")

    # the quoted href of the "get data" anchor, with the quotes removed
    link = _require_match(r'"(.*)"', code_line, "monthly series link")

    # e.g. "Found   49 years with data in 1951-2000" -> "1951", "2000"
    years = _require_match(r'([0-9]{4})-([0-9]{4})', years_line, "year range")

    station_records.append({
        "Station Name": " ".join(name_line.split()),  # collapse the column padding
        "Latitude": coords.group(1),
        "Longitude": coords.group(2),
        "Elevation": coords.group(3).strip(),
        "Station Code": code.group(0),
        "Starting Year": years.group(1),
        "Ending Year": years.group(2),
        "Monthly Series Link": link.group(0).replace('"', ''),
    })

# Build the table in one go. A separate name is used (not `data`) so the form
# payload submitted earlier is not overwritten when this cell runs.
df = pd.DataFrame(station_records, columns=header)
print(df)

                       Station Name Latitude Longitude Elevation Station Code  \
0  MAROUA-SALAK CAMEROON (CAMEROON)   10.50N    14.30E      423m        64851   
1         KAELE CAMEROON (CAMEROON)   10.10N    14.50E      386m      64851.1   
2             BOL BEREM CHAD (CHAD)   13.40N    14.70E      292m        64702   
3               GOURE NIGER (NIGER)   14.00N    10.30E      460m        61045   
4            N'GUIGMI NIGER (NIGER)   14.30N    13.10E      286m        61049   
5               DIFFA NIGER (NIGER)   13.40N    12.80E      305m        61085   
6         MAINE-SOROA NIGER (NIGER)   13.20N    12.00E      338m        61096   
7           NGURU NIGERIA (NIGERIA)   12.80N    10.50E      344m        65064   
8        POTISKUM NIGERIA (NIGERIA)   11.60N    11.00E      414m        65073   
9       MAIDUGURI NIGERIA (NIGERIA)   11.90N    13.10E      354m        65082   

  Starting Year Ending Year                                Monthly Series Link  
0          1951        2000

Run this code block to download the data as a .csv file.

In [173]:
# Name the CSV after the searched latitude/longitude box, then write the
# station table to it without the DataFrame index column.
filename = f"all_stations_lat={lat1}to{lat2}_lon={lon1}to{lon2}.csv"
df.to_csv(filename, index=False)

## References

Code adapted from: https://www.thepythoncode.com/article/extracting-and-submitting-web-page-forms-in-python

Other Resources:

# get_data