# get_time_series

# Collect KNMI Climate Explorer Time Series Data

Written by Cecile Dai

Uses data from https://climexp.knmi.nl/start.cgi?id=someone@somewhere

Given a specified or list of coordinate longitude-latitude ranges, this script will collect all the raw monthly, annual cycle, and anomaly data for this range's values in the 20th Century Monthly Reanalysis fields. You may choose to download all this data in separate .csv files, as a single large one, or update an existing file written in the same format.

In [71]:
# Imports
import numpy as np
import pandas as pd
import sys

from bs4 import BeautifulSoup
from requests_html import HTMLSession
from pprint import pprint

from bs4 import BeautifulSoup
from pprint import pprint
from urllib.parse import urljoin
import webbrowser
import re

import requests
import datetime
import time

## Functions

### Submit Coordinate Range

This function creates and submits a form to https://climexp.knmi.nl/selectfield_rea.cgi?id=someone@somewhere with a specified latitude-longitude coordinate range for surface precipitation values.

In [72]:
# Function to create and submit form with selected latitude and longitude ranges
def get_series_submit(lat1, lat2, lon1, lon2, session, convert = 0, demand = 30):
    # the data body we want to submit when selecting the renanalysis field to analyze
    data_select = {}

    # Link to the KNMI page where you select the stations and data to view a monthly time series
    time_series_link = "https://climexp.knmi.nl/selectfield_rea.cgi?id=someone@somewhere"

    data_select["email"] = "someone@somewhere"
    data_select["field"] = "c3prate"

    series_url = urljoin(time_series_link, 'select.cgi')
    res_select = session.post(series_url, data=data_select)

    print(f"Response Status Code: {res_select.status_code}")

    if res_select.status_code != 200:
        print(f"Unable to access page.")
        return -1

    # the data body we want to submit when getting the time series for a particular coordinate range
    data_fields = {}

    data_fields["email"] = "someone@somewhere"
    data_fields["field"] = "c3prate"
    data_fields["maskmetadata"] = "" 
    data_fields["lat1"] = lat1
    data_fields["lat2"] = lat2
    data_fields["lon1"] = lon1
    data_fields["lon2"] = lon2
    data_fields["intertype"] = "nearest"
    data_fields["gridpoints"] = "false"
    data_fields["minfac"] = demand
    data_fields["masktype"] = "all"
    if convert == 0:
        data_fields["standardunits"] = ""
    else:
        data_fields["standardunits"] = "standardunits"

    fields_url = urljoin(series_url, 'get_index.cgi')
    res_fields = session.post(fields_url, data=data_fields)

    print(f"Response Status Code: {res_fields.status_code}")

    if res_fields.status_code != 200:
        print(f"Unable to access page.")
        return -1

    soup = BeautifulSoup(res_fields.content, "html.parser")

    divs = soup.find_all("div", {"class": "bijschrift"})

    return divs

### Submit Coordinate Range and Process All Data

This function creates and submits a form to https://climexp.knmi.nl/selectfield_rea.cgi?id=someone@somewhere with a specified latitude-longitude coordinate range as well as processes all the obtained raw, annual cycle, and anomaly data into dataframes. Using this function will allow you to skip over the different function calls but doing this will give you a higher change of missing potential errors. However, it is recommended over the separated approach, as it takes less time overall due to not needing to resubmit data to avoid timeouts.

In [73]:
def get_series_submit_to_dfs(lat1, lat2, lon1, lon2, session, convert = 0, demand = 30):
    # the data body we want to submit when selecting the renanalysis field to analyze
    data_select = {}

    # Link to the KNMI page where you select the stations and data to view a monthly time series
    time_series_link = "https://climexp.knmi.nl/selectfield_rea.cgi?id=someone@somewhere"

    data_select["email"] = "someone@somewhere"
    data_select["field"] = "c3prate"

    series_url = urljoin(time_series_link, 'select.cgi')
    res_select = session.post(series_url, data=data_select)

    print(f"Response Status Code: {res_select.status_code}")

    if res_select.status_code != 200:
        print(f"Unable to access field select page for lat1: {lat1}, lat2: {lat2}, lon1: {lon1}, lon2: {lon2}.")
        return -1

    # the data body we want to submit when getting the time series for a particular coordinate range
    data_fields = {}

    data_fields["email"] = "someone@somewhere"
    data_fields["field"] = "c3prate"
    data_fields["maskmetadata"] = "" 
    data_fields["lat1"] = lat1
    data_fields["lat2"] = lat2
    data_fields["lon1"] = lon1
    data_fields["lon2"] = lon2
    data_fields["intertype"] = "nearest"
    data_fields["gridpoints"] = "false"
    data_fields["minfac"] = demand
    data_fields["masktype"] = "all"
    if convert == 0:
        data_fields["standardunits"] = ""
    else:
        data_fields["standardunits"] = "standardunits"

    fields_url = urljoin(series_url, 'get_index.cgi')
    res_fields = session.post(fields_url, data=data_fields)

    print(f"Response Status Code: {res_fields.status_code}")

    if res_fields.status_code != 200:
        print(f"Unable to access page for lat1: {lat1}, lat2: {lat2}, lon1: {lon1}, lon2: {lon2}.")
        return -1

    soup = BeautifulSoup(res_fields.content, "html.parser")

    divs = soup.find_all("div", {"class": "bijschrift"})

    lat_min, lat_max, lon_min, lon_max, raw_data_link, annual_data_link, anom_data_link = get_series_links(divs)

    raw_data_link = "https://climexp.knmi.nl/" + raw_data_link
    annual_cycle_link = "https://climexp.knmi.nl/"+ annual_data_link
    anom_link = "https://climexp.knmi.nl/"+ anom_data_link

    raw_df = get_raw_df(raw_data_link, session)
    if raw_df.empty:
        print(f"Problem retrieving raw monthly data. Empty dataframe for lat1: {lat1}, lat2: {lat2}, lon1: {lon1}, lon2: {lon2}.")
    print(raw_df)
    
    annual_df = get_annual_df(annual_cycle_link, session)
    if annual_df.empty:
        print(f"Problem retrieving annual cycle data. Empty dataframe for lat1: {lat1}, lat2: {lat2}, lon1: {lon1}, lon2: {lon2}.")
    print(annual_df)
    
    anom_df = get_anom_df(anom_link, session)
    if anom_df.empty:
        print(f"Problem retrieving anomaly data. Empty dataframe for lat1: {lat1}, lat2: {lat2}, lon1: {lon1}, lon2: {lon2}")
    print(anom_df)

    return lat_min, lat_max, lon_min, lon_max, raw_df, annual_df, anom_df

### Get Adjusted Coordinate Ranges and Data Links

This function parses through the data received from the coordinate range submission to return the monthly raw, annual cycle, and anomaly data associated with that range. It also returns the adjusted coordinate ranges as the Monthly Reanalysis queries automatically adjust the user-submitted coordinate range into one fitting pre-determined coordinate boundaries.

In [74]:
def get_series_links(div_list):
    lat1, lat2, lon1, lon2 = -999, -999, -999, -999
    raw_data_link, annual_data_link, anom_data_link = "", "", ""

    for div in range(len(div_list)):
        # print(div_list[div])
        # print("")

        if div == 0:
            lons = re.search("lon=\s+(-?[0-9]+\.500\s+-?[0-9]+\.500),\s+lat=", str(div_list[0]))
            lats = re.search("lat=\s+(-?[0-9]+\.500\s+-?[0-9]+\.500),\n", str(div_list[0]))
            lons = lons.group(1).split()
            lats = lats.group(1).split()
            print(lons)
            print(lats)
            lat1 = lats[0]
            lat2 = lats[1]
            lon1 = lons[0]
            lon2 = lons[1]

            # raw_data_link = re.search("(data/ic3prate_-?[0-9]+\.?[0-9]+?-{1,2}[0-9]+\.?[0-9]+?E_-?[0-9]+\.?[0-9]+?-{1,2}[0-9]+\.?[0-9]+?N_n.dat)", str(div_list[0]))
            raw_data_link = re.search(">metadata</a>, <a href=(.*)>raw data<", str(div_list[0]))
            raw_data_link = raw_data_link.group(1).replace("\"", '')
            print(raw_data_link)
        elif div == 1:
            # annual_data_link = re.search("(data/ic3prate_-?[0-9]+--?[0-9]+E_-?[0-9]+--?[0-9]+N_n__yr.txt)", str(div_list[1]))
            annual_data_link = re.search(">pdf</a>,\s*\n<a href=(.*)>raw data<", str(div_list[1]))
            annual_data_link = annual_data_link.group(1).replace("\"", '')
            print(annual_data_link)
        elif div == 2:
            # anom_data_link = re.search("(data/ic3prate_-?[0-9]+--?[0-9]+E_-?[0-9]+--?[0-9]+N_n_a.txt)", str(div_list[2]))
            anom_data_link = re.search(">pdf</a>,\s*\n<a href=(.*)>raw data<", str(div_list[2]))
            anom_data_link = anom_data_link.group(1).replace("\"", '')
            print(anom_data_link)
        else:
            break

    return lat1, lat2, lon1, lon2, raw_data_link, annual_data_link, anom_data_link

### Get Raw Monthly, Annual Cycle, and Anomaly Data

These functions take links to the raw monthly, annual cycle, and anomaly data and return the information in separate dataframes.

In [76]:
def get_raw_df(raw_data_link, session):

    datapage = session.get(raw_data_link)
    # print(datapage)
        
    print("Response Status: {a}\n".format(a=datapage.status_code))
    if datapage.status_code != 200:
        return pd.DataFrame()

    # print(datapage.text)
    # raw_data_headers = ['Year', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    raw_data_list = []
    lines = datapage.text.split("\n")
    for line in lines:
        elements = line.split()
        # print(elements)
        if len(elements) == 0:
            continue
        if elements[0] == "#":
            continue
        else:
            # csvline = ','.join(elements)
            # print(csvline)
            raw_data_list.append(elements)
    raw_data_df = pd.DataFrame(raw_data_list, columns=['Year', 'January', 'February', 'March', 'April', 'May', 'June',
                                                       'July', 'August', 'September', 'October', 'November',
                                                       'December'])
    # print(raw_data_df)
    # print("confirm")
    return raw_data_df


def get_annual_df(annual_cycle_link, session):

    annualpage = session.get(annual_cycle_link)

    print("Response Status: {a}\n".format(a=annualpage.status_code))
    if annualpage.status_code != 200:
        return pd.DataFrame()

    # annualpage = requests.get(annual_cycle_link)
    # print(annualpage.text)
    raw_data_yr_list = []
    lines = annualpage.text.split("\n")
    for line in lines:
        elements = line.split()
        if len(elements) == 0:
            continue
        if elements[0] == "#":
            continue
        else:
            monthint = re.findall('2{1}0{3,4}([1-9]{1}[012]?)01', elements[0])
            # print(monthint)
            if monthint and len(monthint) != 0:
                elements[0] = datetime.date(1900, int(monthint[0]), 1).strftime('%B')
                # print(len(elements))
                # print(elements)
                raw_data_yr_list.append(elements)
    raw_data_yr_df = pd.DataFrame(raw_data_yr_list, columns=['Month', 'Mean', '2.5%', '17%', '50%', '83%', '97.5%'])
    # print(raw_data_yr_df)
    # print("confirm")
    return raw_data_yr_df


def get_anom_df(anom_link, session):

    anompage = session.get(anom_link)

    print("Response Status: {a}\n".format(a=anompage.status_code))
    if anompage.status_code != 200:
        return pd.DataFrame()

    # anompage = requests.get(anom_link)
    # print(anompage.text)
    raw_data_anom_dict = {}
    raw_data_anom_list = []
    lines = anompage.text.split("\n")
    for line in lines:
        elements = line.split()
        if len(elements) == 0:
            continue
        if elements[0] == "#":
            continue
        else:
            date = re.findall('([0-9]{4})\.([0-9]{4})', elements[0])
            if date and len(date[0]) == 2:
                # Not all months included
                if date[0][0] not in raw_data_anom_dict:
                    # Initialize with invalid numbers
                    raw_data_anom_dict[date[0][0]] = {'January': '-999.9', 'February': '-999.9', 'March': '-999.9',
                                                      'April': '-999.9', 'May': '-999.9', 'June': '-999.9',
                                                      'July': '-999.9', 'August': '-999.9', 'September': '-999.9',
                                                      'October': '-999.9', 'November': '-999.9', 'December': '-999.9'}
                monthint = round((float('0.' + date[0][1]) * 12) + 1)   # Multiply by 12 to get the actual month integer
                monthint = datetime.date(1900, int(monthint), 1).strftime('%B')
                raw_data_anom_dict[date[0][0]][monthint] = elements[1]
    for key in raw_data_anom_dict:
        dlist = []
        dlist.append(key)
        for d in raw_data_anom_dict[key]:
            dlist.append(raw_data_anom_dict[key][d])
        raw_data_anom_list.append(dlist)
    raw_data_anom_df = pd.DataFrame(raw_data_anom_list, columns=['Year', 'January', 'February', 'March', 'April', 'May',
                                                                 'June', 'July', 'August', 'September', 'October',
                                                                 'November', 'December'])
    return raw_data_anom_df

### Dataframe Compiler

This function compiles a list of dataframes of the same format into a single large dataframe, adding its associated coordinate range as new identifier columns.

In [77]:
# Reformats all the dataframes in the input list and adds station codes. Note that this assumes that everything was processed in the same order.
def concat_format_wide(df_list, lat_min_list, lat_max_list, lon_min_list, lon_max_list, existing_file=''):
    full_list_df = []
    dfs = []    #list of dataframes to concatenate
    for box in range(len(df_list)):
        lat_mins = [lat_min_list[box] for row in range(df_list[box].shape[0])]
        lat_maxes = [lat_max_list[box] for row in range(df_list[box].shape[0])]
        lon_mins = [lon_min_list[box] for row in range(df_list[box].shape[0])]
        lon_maxes = [lon_max_list[box] for row in range(df_list[box].shape[0])]
        new_df = df_list[box].copy()
        new_df.insert(0, 'Minimum Latitude', lat_mins)
        new_df.insert(1, 'Maximum Latitude', lat_maxes)
        new_df.insert(2, 'Minimum Longitude', lon_mins)
        new_df.insert(3, 'Maximum Longitude', lon_maxes)
        dfs.append(new_df)
    if len(existing_file) != 0:
        existing_df = pd.read_csv(existing_file)
        # Check if this data already exists in the file, remove from list to be concatenated if so
        for coord in range(len(df_list)):
            if not existing_df.loc(lat_min_list[coord]).loc(lat_max_list[coord]).loc(lon_min_list[coord].loc(lon_max_list[coord])).empty:
                dfs.remove(dfs[box])
        dfs.append(existing_df)
    full_list_df = pd.concat(dfs)
    return full_list_df
    # print(full_raw_list)

# Run Functions
Run the functions on the data to analyze, create and/or update the files with the desired data.

Upload Coordinate Boxes to Analyze

In [78]:
coordinates = pd.read_csv("for_analysis/BoxCoordinates.csv")
print(coordinates)

     Lat min  Lat max  Long min  Long max
0        2.5      3.5      29.5      30.5
1        2.5      3.5      30.5      31.5
2        2.5      3.5      31.5      32.5
3        2.5      3.5      32.5      33.5
4        2.5      3.5      33.5      34.5
..       ...      ...       ...       ...
122     -9.5     -8.5      32.5      33.5
123     -9.5     -8.5      33.5      34.5
124     -9.5     -8.5      34.5      35.5
125     -9.5     -8.5      35.5      36.5
126     -9.5     -8.5      36.5      37.5

[127 rows x 4 columns]


In [79]:
# initialize HTTP session so data/cookies are persisted
session = HTMLSession()
series_div_list = []

### Run Separately

Run the blocks below to submit data, process responses, and get dataframes in separate function calls. This will let you view the return values of each function in between the functions and potentially catch problems more easily as well as validate the responses. However, there may be computational difficulties or timeouts.

In [151]:
for index in range(coordinates.shape[0]):
    lat1 = coordinates.iloc[index]["Lat min"]
    lat2 = coordinates.iloc[index]["Lat max"]
    lon1 = coordinates.iloc[index]["Long min"]
    lon2 = coordinates.iloc[index]["Long max"]

    print(f"Processing page {index + 1} of {coordinates.shape[0]}: lat1: {lat1}, lat2: {lat2}, lon1: {lon1}, lon2: {lon2}")

    series_divs = (get_series_submit(lat1, lat2, lon1, lon2, session))

    if series_divs == -1:
        print(f"Bad response. Ending loop on index {index}...")
        break
    else:
        print("Success")
        series_div_list.append(series_divs)
    
print("Done getting coordinate box data.")

Processing page 1 of 127: lat1: 2.5, lat2: 3.5, lon1: 29.5, lon2: 30.5
Response Status Code: 200
Response Status Code: 200
Success
Processing page 2 of 127: lat1: 2.5, lat2: 3.5, lon1: 30.5, lon2: 31.5
Response Status Code: 200
Response Status Code: 200
Success
Processing page 3 of 127: lat1: 2.5, lat2: 3.5, lon1: 31.5, lon2: 32.5
Response Status Code: 200
Response Status Code: 200
Success
Processing page 4 of 127: lat1: 2.5, lat2: 3.5, lon1: 32.5, lon2: 33.5
Response Status Code: 200
Response Status Code: 200
Success
Processing page 5 of 127: lat1: 2.5, lat2: 3.5, lon1: 33.5, lon2: 34.5
Response Status Code: 200
Response Status Code: 200
Success
Processing page 6 of 127: lat1: 2.5, lat2: 3.5, lon1: 34.5, lon2: 35.5
Response Status Code: 200
Response Status Code: 200
Success
Processing page 7 of 127: lat1: 2.5, lat2: 3.5, lon1: 35.5, lon2: 36.5
Response Status Code: 200
Response Status Code: 200
Success
Processing page 8 of 127: lat1: 2.5, lat2: 3.5, lon1: 36.5, lon2: 37.5
Response Sta

In [1]:
lat_mins = []
lat_maxes = []
lon_mins = []
lon_maxes = []
raw_data_link_list = []
annual_data_link_list = []
anom_data_link_list = []

for index in range(coordinates.shape[0]):

    lat_min, lat_max, lon_min, lon_max, raw_data_link, annual_data_link, anom_data_link = get_series_links(series_div_list[index])
                                                                                                           
    if lat_min or lat_max or lon_min or lon_max == -999:
        print(f"Bad response. Ending loop on index {index}...")
        break
    if len(raw_data_link) or len(annual_data_link) or len(anom_data_link) == 0:
        print(f"Bad response. Ending loop on index {index}...")
        break
    else:
        lat_mins.append(lat_min)
        lat_maxes.append(lat_max)
        lon_mins.append(lon_min)
        lon_maxes.append(lon_maxes)
        raw_data_link_list.append(raw_data_link)
        annual_data_link_list.append(annual_data_link)
        anom_data_link_list.append(anom_data_link)

NameError: NameError: name 'coordinates' is not defined

In [50]:
all_raw_df_list = []
all_annual_df_list = []
all_anom_df_list = []

In [51]:
for index in range(len(raw_data_link_list)): #all_stations.shape[0]
    raw_data_link = "https://climexp.knmi.nl/" + raw_data_link_list[index]
    annual_cycle_link = "https://climexp.knmi.nl/"+ annual_data_link_list[index]
    anom_link = "https://climexp.knmi.nl/"+ anom_data_link_list[index]

    print(f'Now processing coordinate box {index} of {len(raw_data_link_list)}...')
    print(raw_data_link)
    print(annual_cycle_link)
    print(anom_link)
    print('')

    raw_df = get_raw_df(raw_data_link, session)
    if raw_df.empty:
        print(f"Problem retrieving raw monthly data. Ending on loop {index}...")
        break
    print(raw_df)
    all_raw_df_list.append(raw_df)
    
    annual_df = get_annual_df(annual_cycle_link, session)
    if annual_df.empty:
        print(f"Problem retrieving annual cycle data. Ending on loop {index}...")
        break
    print(annual_df)
    all_annual_df_list.append(annual_df)
    
    anom_df = get_anom_df(anom_link, session)
    if anom_df.empty:
        print(f"Problem retrieving anomaly data. Ending on loop {index}...")
        break
    print(anom_df)
    all_anom_df_list.append(anom_df)

print("Done retrieving data from all stations.")
print(f"Raw DF List: {len(all_raw_df_list)} of {len(raw_data_link_list)}\nAnnual Cycle DF List: {len(all_annual_df_list)} of {len(raw_data_link_list)}\nAnomaly DF List: {len(all_anom_df_list)} of {len(raw_data_link_list)}")

NameError: NameError: name 'raw_data_link_list' is not defined

### Run Together (recommended)

Alternatively, run the blocks below to get all the data frames from website submission in one function call.

In [80]:
lat_mins = []
lat_maxes = []
lon_mins = []
lon_maxes = []

all_raw_df_list = []
all_annual_df_list = []
all_anom_df_list = []

In [81]:
for index in range(coordinates.shape[0]):
    lat1 = coordinates.iloc[index]["Lat min"]
    lat2 = coordinates.iloc[index]["Lat max"]
    lon1 = coordinates.iloc[index]["Long min"]
    lon2 = coordinates.iloc[index]["Long max"]

    print("\n=====================================================================")
    print(f"Processing page {index + 1} of {coordinates.shape[0]}: lat1: {lat1}, lat2: {lat2}, lon1: {lon1}, lon2: {lon2}")

    lat_min, lat_max, lon_min, lon_max, raw_data_df, annual_df, anom_df = get_series_submit_to_dfs(lat1, lat2, lon1, lon2, session)

    print(f"lat_min: {lat_min}, lat_max: {lat_max}, lon_min: {lon_min}, lon_max: {lon_max}, raw_data_df: {raw_data_df.shape}, annual_df: {annual_df.shape}, anom_df: {anom_df.shape}")

    if lat_min == -999 or lat_max == -999 or lon_min == -999 or lon_max == -999:
        print(f"Bad response. Ending loop on index {index}...")
        break
    if raw_data_df.empty or annual_df.empty or anom_df.empty:
        print(f"Bad response. Ending loop on index {index}...")
        break
    else:
        lat_mins.append(lat_min)
        lat_maxes.append(lat_max)
        lon_mins.append(lon_min)
        lon_maxes.append(lon_max)
        all_raw_df_list.append(raw_data_df)
        all_annual_df_list.append(annual_df)
        all_anom_df_list.append(anom_df)

    print(f"Successfully processed coordinate box {index + 1} of {coordinates.shape[0]}")

print("Done processing all coordinate boxes.")


Processing page 1 of 127: lat1: 2.5, lat2: 3.5, lon1: 29.5, lon2: 30.5
Response Status Code: 200
Response Status Code: 200
['29.500', '30.500']
['2.500', '3.500']
data/ic3prate_29.5-30.5E_2.5-3.5N_n.dat
data/ic3prate_29.5-30.5E_2.5-3.5N_n__yr.txt
data/ic3prate_29.5-30.5E_2.5-3.5N_n_a.txt
Response Status: 200

     Year        January       February          March          April  \
0    1836  0.9459678E-05  0.1894828E-04  0.7164517E-04  0.1020292E-03   
1    1837  0.1689919E-04  0.3032589E-04  0.8032661E-04  0.1148792E-03   
2    1838  0.1727419E-04  0.3991518E-04  0.8220968E-04  0.1119792E-03   
3    1839  0.1911290E-04  0.2697768E-04  0.7022177E-04  0.1225333E-03   
4    1840  0.1995564E-04  0.1775431E-04  0.7165727E-04  0.1119583E-03   
..    ...            ...            ...            ...            ...   
175  2011  0.8358870E-05  0.1036607E-04  0.2770968E-04  0.6302082E-04   
176  2012  0.3068548E-05  0.1193104E-04  0.3188306E-04  0.8372917E-04   
177  2013  0.1506452E-04  0.169

In [82]:
all_raw_series_df = concat_format_wide(all_raw_df_list, lat_mins, lat_maxes, lon_mins, lon_maxes)
all_annual_series_df = concat_format_wide(all_annual_df_list, lat_mins, lat_maxes, lon_mins, lon_maxes)
all_anom_series_df = concat_format_wide(all_anom_df_list, lat_mins, lat_maxes, lon_mins, lon_maxes)

## Download as .csv

Run the block below to download all raw monthly data as separate .csv files.

In [0]:
for df in range(len(all_raw_df_list)):
    # Change the path variable to download to another location
    path = "for_analysis/"
    filename = f"{path}lat={lat_mins[df]}to{lat_maxes[df]}_lon={lon_mins[df]}to{lon_maxes[df]}timeseries_monthly_raw.csv"
    all_raw_df_list[df].to_csv(filename, index=False)

Run the block below to download all raw annual cycle data as separate .csv files.

In [0]:
for df in range(len(all_raw_df_list)):
    # Change the path variable to download to another location
    path = "for_analysis/"
    filename = f"{path}lat={lat_mins[df]}to{lat_maxes[df]}_lon={lon_mins[df]}to{lon_maxes[df]}timeseries_annual.csv"
    all_annual_df_list[df].to_csv(filename, index=False)

Run the block below to download all raw anomaly data as separate .csv files.

In [0]:
for df in range(len(all_raw_df_list)):
    # Change the path variable to download to another location
    path = "for_analysis/"
    filename = f"{path}lat={lat_mins[df]}to{lat_maxes[df]}_lon={lon_mins[df]}to{lon_maxes[df]}timeseries_monthly_raw.csv"
    all_anom_df_list[df].to_csv(filename, index=False)

Run the cell below to download the concatenated time series dataframes as separate .csv files.

In [83]:
path = "for_analysis/"
filename = f"{path}all_timeseries_monthly_raw.csv"
all_raw_series_df.to_csv(filename, index=False)

path = "for_analysis/"
filename = f"{path}all_timeseries_annual_cycle.csv"
all_annual_series_df.to_csv(filename, index=False)

path = "for_analysis/"
filename = f"{path}all_timeseries_anomaly.csv"
all_anom_series_df.to_csv(filename, index=False)

# get_monthly_station_data

# Collect KNMI Climate Explorer Monthly Station Data

Written by Cecile Dai

Uses data from https://climexp.knmi.nl/start.cgi?id=someone@somewhere

Given a specified or list of coordinate longitude-latitude ranges, this script will collect all the stations located here, raw monthly, annual cycle, and anomaly data for this range. You may choose to download all this data in separate .csv files, as a single large one, or update an existing file written in the same format.

In [1]:
 # Imports
import numpy as np
import pandas as pd
import sys

from bs4 import BeautifulSoup
from requests_html import HTMLSession
from pprint import pprint

from bs4 import BeautifulSoup
from pprint import pprint
from urllib.parse import urljoin
import webbrowser
import re

import requests
import datetime
import time

## Functions

### Submit Coordinate Range

This function creates and submits a form to https://climexp.knmi.nl/selectstation.cgi?id=someone@somewhere with a specified latitude-longitude coordinate range for surface precipitation values.

In [2]:
# Function to create and submit form with selected latitude and longitude ranges
def get_stations_submit(lat1, lat2, lon1, lon2, session):
    # the data body we want to submit
    data = {}

    # Link to the KNMI page where you select the stations and data to view a monthly time series
    station_link = 'https://climexp.knmi.nl/selectstation.cgi?id=someone@somewhere'

    data["email"] = "someone@somewhere"
    data["climate"] = "precipitation_all"
    data["name"] = ""
    data["num"] = 10
    data["lat"] = ""
    data["lon"] = ""
    data["lat1"] = lat1
    data["lat2"] = lat2
    data["lon1"] = lon1
    data["lon2"] = lon2
    data["list"] = """# lon1 lon2 lat1 lat2 (optional)
    station number (one per line)"""
    data["min"] = 10
    data["sum"] = 1
    data["month"] = -1
    data["yr1"] = ""
    data["yr2"] = ""
    data["dist"] = ""
    data["elevmin"] = ""
    data["elevmax"] = ""

    # print("Submission body:")
    # for key in data.keys():
    #     print(f"{key}: {data[key]}")

    # join the url with the action (form request URL)
    # url = urljoin(station_link, form_details["action"])
    url = urljoin(station_link, 'getstations.cgi')
    res = session.post(url, data=data)

    print(f"Response Status Code: {res.status_code}")

    if res.status_code != 200:
        print(f"Unable to access page. Stopped at lat1: {lat1}, lat2: {lat2}, lon1: {lon1}, lon2: {lon2}")
        return -1

    soup = BeautifulSoup(res.content, "html.parser")

    start = re.search('</div>\n</form>', res.text)
    stop = re.search('</div>\n<div class="col-md-4">', res.text)

    # I will admit this is not the prettiest way to extract the desired text but the pure regex method just wasn't working :(
    limit1 = re.split(', ', str(start.span()))
    limit2 = re.split(', ', str(stop.span()))
    l1 = int(limit1[1].replace(')', ''))
    l2 = int(limit2[0].replace('(', ''))

    # print(res.text[l1:l2])

    relevant_text = res.text[l1:l2]

    return relevant_text

### Get Stations

This function parses through the text obtained running the get_station_submit function to retrieve all the climate stations and their information in a dataframe.

In [3]:
def get_stations_df(text):
    lines = text.replace('\n', '').split('<br>')
        
    header = ["Station Name", "Latitude", "Longitude", "Elevation", "Station Code", "Starting Year", "Ending Year", "Monthly Series Link"]
    station_names = []
    lats = []
    lons = []
    elevs = []
    codes = []
    start_year = []
    end_year = []
    links = []

    stripped_lines = []

    for i in range(len(lines)):
        if len("".join(lines[i].split())) != 0 and i >= 3:
            stripped_lines.append(lines[i])
            # print(lines[i])
        else:
            continue

    for j in range(len(stripped_lines)):
        # Station names
        if j%4 == 0:
            # print(" ".join(stripped_lines[j].split()))
            station_names.append(" ".join(stripped_lines[j].split()))

        # Coordinates and elevation
        elif j%4 == 1:
            latitude = re.search('\s+(.*)N', stripped_lines[j])
            # print(latitude.group(0).strip())
            lats.append(latitude.group(0).strip())

            longitude = re.search(',\s+(.*)E', stripped_lines[j])
            # print(longitude.group(0).replace(',','').strip())
            lons.append(longitude.group(0).replace(',','').strip())

            elevation = re.search('E,\s+(.*m)', stripped_lines[j])
            # print(elevation.group(0).replace("E,", '').strip())
            elevs.append(elevation.group(0).replace("E,", '').strip())

        # Station code and monthly series link
        elif j%4 == 2:
            code = re.search('([0-9\.]+)', stripped_lines[j])
            # print(code.group(0))
            codes.append(code.group(0))

            link = re.search('\"(.*)\"', stripped_lines[j])
            # print(link.group(0).replace('"', ''))
            links.append(link.group(0).replace('"', ''))

        # Data start and end year
        elif j%4 == 3:
            years = re.search('([0-9]{4}\-[0-9]{4})', stripped_lines[j])
            # print(years.group(0))
            # print(years.group(0)[:4])
            # print(years.group(0)[5:])
            start_year.append(years.group(0)[:4])
            end_year.append(years.group(0)[5:])

    data = {'Station Name': station_names, 'Latitude': lats, 'Longitude': lons, 'Elevation': elevs, 'Station Code': codes, 'Starting Year': start_year, 'Ending Year': end_year, 'Monthly Series Link': links}

    df = pd.DataFrame(data)
    # print(df)
    return df

### Get All Station Data Links

This function takes a list of climate stations and returns separate lists of dataframes of all the raw monthly, annual cycle, and anomaly data at these stations.

In [4]:
def get_station_data(links, station_names, codes):

    raw_df_list = []
    annual_df_list = []
    anom_df_list = []

    # Visit all links, use array indices to keep track of stations
    for i in range(len(links)):
        station_data_url = "https://climexp.knmi.nl/" + links[i]
        station_data_name = station_names[i]
        station_data_code = codes[i]
        print("{a} | {b} | {c}".format(a=station_data_name, b=station_data_code, c=station_data_url))
        print("================================================================")

        raw_data_link = "https://climexp.knmi.nl/data/pa" + station_data_code + ".dat"
        annual_cycle_link = "https://climexp.knmi.nl/data/pa"+ station_data_code +"__yr.txt"
        anom_link = "https://climexp.knmi.nl/data/pa"+ station_data_code +"_a.txt"

        print("Station {a} of {b}".format(a = i + 1, b = len(links)))

        visit = requests.get(station_data_url)

        # wait if there is no response (due to timeouts)
        code = requests.head(raw_data_link)
        print("Response Status: {a}\n".format(a=code.status_code))
        if code.status_code != 200:
            print("Issue accessing raw monthly data. Waiting...")
            time.sleep(60)

        datapage = requests.get(raw_data_link)

        # print(datapage.text)
        # raw_data_headers = ['Year', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
        raw_data_list = []
        lines = datapage.text.split("\n")
        for line in lines:
            elements = line.split()
            # print(elements)
            if len(elements) == 0:
                continue
            if elements[0] == "#":
                continue
            else:
                # csvline = ','.join(elements)
                # print(csvline)
                raw_data_list.append(elements)
        raw_data_df = pd.DataFrame(raw_data_list, columns = ['Year', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'])
        raw_df_list.append(raw_data_df)
        print(raw_data_df)
        # print("confirm")

        # wait if there is no response (due to timeouts)
        code = requests.head(annual_cycle_link)
        print("Response Status: {a}\n".format(a=code.status_code))
        if code.status_code != 200:
            print("Issue accessing annual cycle data. Waiting...")
            time.sleep(60)

        annualpage = requests.get(annual_cycle_link)
        # print(annualpage.text)
        raw_data_yr_list = []
        lines = annualpage.text.split("\n")
        for line in lines:
            elements = line.split()
            if len(elements) == 0:
                continue
            if elements[0] == "#":
                continue
            else:
                monthint = re.findall('2{1}0{3,4}([1-9]{1}[012]?)01', elements[0])
                # print(monthint)
                if monthint and len(monthint) != 0:
                    elements[0] = datetime.date(1900, int(monthint[0]), 1).strftime('%B')
                    # print(len(elements))
                    # print(elements)
                    raw_data_yr_list.append(elements)
        raw_data_yr_df = pd.DataFrame(raw_data_yr_list, columns = ['Month', 'Mean', '2.5%', '17%', '50%', '83%', '97.5%'])
        annual_df_list.append(raw_data_yr_df)
        print(raw_data_yr_df)
        # print("confirm")

        # wait if there is no response (due to timeouts)
        code = requests.head(anom_link)
        print("Response Status: {a}\n".format(a=code.status_code))
        if code.status_code != 200:
            print("Issue accessing anomaly data. Waiting...")
            time.sleep(60)

        anompage = requests.get(anom_link)
        # print(anompage.text)
        # raw_data_anom_list = {'Year': [], 'January': [], 'February': [], 'March': [], 'April': [], 'May': [], 'June': [], 'July': [], 'August': [], 'September': [], 'October': [], 'November': [], 'December': []}
        raw_data_anom_dict = {}
        raw_data_anom_list = []
        lines = anompage.text.split("\n")
        for line in lines:
            elements = line.split()
            if len(elements) == 0:
                continue
            if elements[0] == "#":
                continue
            else:
                date = re.findall('([0-9]{4})\.([0-9]{4})', elements[0])
                if date and len(date[0]) == 2:
                    if date[0][0] not in raw_data_anom_dict:
                        raw_data_anom_dict[date[0][0]] = {'January': '-999.9', 'February': '-999.9', 'March':'-999.9', 'April': '-999.9', 'May': '-999.9', 'June': '-999.9', 'July': '-999.9', 'August': '-999.9', 'September': '-999.9', 'October': '-999.9', 'November': '-999.9', 'December': '-999.9'}
                    monthint = round((float('0.' + date[0][1]) * 12) + 1)
                    monthint = datetime.date(1900, int(monthint), 1).strftime('%B')
                    raw_data_anom_dict[date[0][0]][monthint] = elements[1]
        for key in raw_data_anom_dict:
            dlist = []
            dlist.append(key)
            for d in raw_data_anom_dict[key]:
                dlist.append(raw_data_anom_dict[key][d])
            raw_data_anom_list.append(dlist)
        raw_data_anom_df = pd.DataFrame(raw_data_anom_list, columns = ['Year', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'])
        anom_df_list.append(raw_data_anom_df)
        print(raw_data_anom_df)
        # print("confirm")
    
    return raw_df_list, annual_df_list, anom_df_list


### Get Station Data Separately

The cell below contains functions to obtain the raw, annual cycle, and anomaly data individually given their associated links.

In [93]:
def get_raw_df(raw_data_link, session):
    # wait if there is no response (due to timeouts)
    # code = requests.head(raw_data_link)
    # print("Response Status: {a}\n".format(a=code.status_code))
    # if code.status_code != 200:
    #     # print("Issue accessing raw monthly data. Waiting...")
    #     # time.sleep(60)
    #     return pd.DataFrame()

    datapage = session.get(raw_data_link)
    # datapage = requests.get(raw_data_link)
    # if datapage.status_code

    # print(datapage)
        
    print("Response Status: {a}\n".format(a=datapage.status_code))
    if datapage.status_code != 200:
        return pd.DataFrame()

    # print(datapage.text)
    # raw_data_headers = ['Year', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    raw_data_list = []
    lines = datapage.text.split("\n")
    for line in lines:
        elements = line.split()
        # print(elements)
        if len(elements) == 0:
            continue
        if elements[0] == "#":
            continue
        else:
            # csvline = ','.join(elements)
            # print(csvline)
            raw_data_list.append(elements)
    raw_data_df = pd.DataFrame(raw_data_list, columns=['Year', 'January', 'February', 'March', 'April', 'May', 'June',
                                                       'July', 'August', 'September', 'October', 'November',
                                                       'December'])
    # print(raw_data_df)
    # print("confirm")
    return raw_data_df


def get_annual_df(annual_cycle_link, session):
    # wait if there is no response (due to timeouts)
    # code = requests.head(annual_cycle_link)
    # print("Response Status: {a}\n".format(a=code.status_code))
    # if code.status_code != 200:
    #     # print("Issue accessing annual cycle data. Waiting...")
    #     # time.sleep(60)
    #     return pd.DataFrame()

    annualpage = session.get(annual_cycle_link)

    print("Response Status: {a}\n".format(a=annualpage.status_code))
    if annualpage.status_code != 200:
        return pd.DataFrame()

    # annualpage = requests.get(annual_cycle_link)
    # print(annualpage.text)
    raw_data_yr_list = []
    lines = annualpage.text.split("\n")
    for line in lines:
        elements = line.split()
        if len(elements) == 0:
            continue
        if elements[0] == "#":
            continue
        else:
            monthint = re.findall('2{1}0{3,4}([1-9]{1}[012]?)01', elements[0])
            # print(monthint)
            if monthint and len(monthint) != 0:
                elements[0] = datetime.date(1900, int(monthint[0]), 1).strftime('%B')
                # print(len(elements))
                # print(elements)
                raw_data_yr_list.append(elements)
    raw_data_yr_df = pd.DataFrame(raw_data_yr_list, columns=['Month', 'Mean', '2.5%', '17%', '50%', '83%', '97.5%'])
    # print(raw_data_yr_df)
    # print("confirm")
    return raw_data_yr_df


def get_anom_df(anom_link, session):
    # wait if there is no response (due to timeouts)
    # code = requests.head(anom_link)
    # print("Response Status: {a}\n".format(a=code.status_code))
    # if code.status_code != 200:
    #     # print("Issue accessing anomaly data. Waiting...")
    #     # time.sleep(60)
    #     return pd.DataFrame()

    anompage = session.get(anom_link)

    print("Response Status: {a}\n".format(a=anompage.status_code))
    if anompage.status_code != 200:
        return pd.DataFrame()

    # anompage = requests.get(anom_link)
    # print(anompage.text)
    # raw_data_anom_list = {'Year': [], 'January': [], 'February': [], 'March': [], 'April': [], 'May': [], 'June': [], 'July': [], 'August': [], 'September': [], 'October': [], 'November': [], 'December': []}
    raw_data_anom_dict = {}
    raw_data_anom_list = []
    lines = anompage.text.split("\n")
    for line in lines:
        elements = line.split()
        if len(elements) == 0:
            continue
        if elements[0] == "#":
            continue
        else:
            date = re.findall('([0-9]{4})\.([0-9]{4})', elements[0])
            if date and len(date[0]) == 2:
                # Not all months included
                if date[0][0] not in raw_data_anom_dict:
                    # Initialize with invalid numbers
                    raw_data_anom_dict[date[0][0]] = {'January': '-999.9', 'February': '-999.9', 'March': '-999.9',
                                                      'April': '-999.9', 'May': '-999.9', 'June': '-999.9',
                                                      'July': '-999.9', 'August': '-999.9', 'September': '-999.9',
                                                      'October': '-999.9', 'November': '-999.9', 'December': '-999.9'}
                monthint = round((float('0.' + date[0][1]) * 12) + 1)   # Multiply by 12 to get the actual month integer
                monthint = datetime.date(1900, int(monthint), 1).strftime('%B')
                raw_data_anom_dict[date[0][0]][monthint] = elements[1]
    for key in raw_data_anom_dict:
        dlist = []
        dlist.append(key)
        for d in raw_data_anom_dict[key]:
            dlist.append(raw_data_anom_dict[key][d])
        raw_data_anom_list.append(dlist)
    raw_data_anom_df = pd.DataFrame(raw_data_anom_list, columns=['Year', 'January', 'February', 'March', 'April', 'May',
                                                                 'June', 'July', 'August', 'September', 'October',
                                                                 'November', 'December'])
    return raw_data_anom_df

### Dataframe Compiler

This function concatenates a list of dataframes of the same format into a single large dataframe, adding their associated station names for each as a new identifier column.

In [5]:
# Reformats all the dataframes in the input list and adds station codes. Note that this assumes that everything was processed in the same order.
def concat_format_wide(df_list, station_list, existing_file=''):
    full_list_df = []
    dfs = []    #list of dataframes to concatenate
    for station_df in range(len(station_list)):
        codes = [station_list[station_df] for row in range(df_list[station_df].shape[0])]
        new_df = df_list[station_df].copy()
        new_df.insert(0, 'Station', codes)
        dfs.append(new_df)
    if len(existing_file) != 0:
        existing_df = pd.read_csv(existing_file)
        # Check if this data already exists in the file, remove from list to be concatenated if so
        for station in range(len(station_list)):
            if not existing_df.loc(station_list[station]):
                dfs.remove(dfs[station])
        dfs.append(existing_df)
    full_list_df = pd.concat(dfs)
    return full_list_df
    # print(full_raw_list)

## Run Functions
Run the functions on the data to analyze, create and/or update the files with the desired data.

Upload Coordinate Boxes to Analyze

In [73]:
coordinates = pd.read_csv("for_analysis/BoxCoordinates.csv")
print(coordinates)

     Lat min  Lat max  Long min  Long max
0        2.5      3.5      29.5      30.5
1        2.5      3.5      30.5      31.5
2        2.5      3.5      31.5      32.5
3        2.5      3.5      32.5      33.5
4        2.5      3.5      33.5      34.5
..       ...      ...       ...       ...
122     -9.5     -8.5      32.5      33.5
123     -9.5     -8.5      33.5      34.5
124     -9.5     -8.5      34.5      35.5
125     -9.5     -8.5      35.5      36.5
126     -9.5     -8.5      36.5      37.5

[127 rows x 4 columns]


Start session and submit data.

In [74]:
# initialize HTTP session so data/cookies are persisted
session = HTMLSession()

stations_df_list = []

for index in range(coordinates.shape[0]):
    lat1 = coordinates.iloc[index]["Lat min"]
    lat2 = coordinates.iloc[index]["Lat max"]
    lon1 = coordinates.iloc[index]["Long min"]
    lon2 = coordinates.iloc[index]["Long max"]

    print(f"Processing page {index + 1} of {coordinates.shape[0]}: lat1: {lat1}, lat2: {lat2}, lon1: {lon1}, lon2: {lon2}")

    pagetext = get_stations_submit(lat1, lat2, lon1, lon2, session)
    # print(pagetext)

    if pagetext == -1:
        print(f"Bad response. Ending loop on index {index}...")
        break
    else:
        print("Success")
        df = get_stations_df(pagetext)
        stations_df_list.append(df)
    
print("Done getting station data.")
        

Processing page 1 of 127: lat1: 2.5, lat2: 3.5, lon1: 29.5, lon2: 30.5
Response Status Code: 200
Success
Processing page 2 of 127: lat1: 2.5, lat2: 3.5, lon1: 30.5, lon2: 31.5
Response Status Code: 200
Success
Processing page 3 of 127: lat1: 2.5, lat2: 3.5, lon1: 31.5, lon2: 32.5
Response Status Code: 200
Success
Processing page 4 of 127: lat1: 2.5, lat2: 3.5, lon1: 32.5, lon2: 33.5
Response Status Code: 200
Success
Processing page 5 of 127: lat1: 2.5, lat2: 3.5, lon1: 33.5, lon2: 34.5
Response Status Code: 200
Success
Processing page 6 of 127: lat1: 2.5, lat2: 3.5, lon1: 34.5, lon2: 35.5
Response Status Code: 200
Success
Processing page 7 of 127: lat1: 2.5, lat2: 3.5, lon1: 35.5, lon2: 36.5
Response Status Code: 200
Success
Processing page 8 of 127: lat1: 2.5, lat2: 3.5, lon1: 36.5, lon2: 37.5
Response Status Code: 200
Success
Processing page 9 of 127: lat1: 1.5, lat2: 2.5, lon1: 29.5, lon2: 30.5
Response Status Code: 200
Success
Processing page 10 of 127: lat1: 1.5, lat2: 2.5, lon1: 

Run cell below to save all stations in the specified coordinate ranges to a .csv file.

In [31]:
path = "for_analysis/"
filename = f"{path}box_coordinate_stations.csv"
all_stations = pd.concat(stations_df_list)
all_stations.to_csv(filename, index=False)

Get data from all stations in coordinate range.

In [94]:
all_raw_df_list = []
all_annual_df_list = []
all_anom_df_list = []

In [101]:
for station in range(all_stations.shape[0]): #all_stations.shape[0]
    station_data_code = all_stations.iloc[station]["Station Code"]
    raw_data_link = "https://climexp.knmi.nl/data/pa" + station_data_code + ".dat"
    annual_cycle_link = "https://climexp.knmi.nl/data/pa"+ station_data_code +"__yr.txt"
    anom_link = "https://climexp.knmi.nl/data/pa"+ station_data_code +"_a.txt"
    page_link = "https://climexp.knmi.nl/" + all_stations.iloc[station]["Monthly Series Link"]

    print(f'Now processing station {index} of {all_stations.shape[0]}...')
    print(raw_data_link)
    print(annual_cycle_link)
    print(anom_link)
    print('')

    visit_page = session.get(page_link)

    raw_df = get_raw_df(raw_data_link, session)
    if raw_df.empty:
        print(f"Problem retrieving raw monthly data. Ending on loop {station}...")
        break
    all_raw_df_list.append(raw_df)
    
    annual_df = get_annual_df(annual_cycle_link, session)
    if annual_df.empty:
        print(f"Problem retrieving annual cycle data. Ending on loop {station}...")
        break
    all_annual_df_list.append(annual_df)
    
    anom_df = get_anom_df(anom_link, session)
    if anom_df.empty:
        print(f"Problem retrieving anomaly data. Ending on loop {station}...")
        break
    all_anom_df_list.append(anom_df)

print("Done retrieving data from all stations.")
print(f"Raw DF List: {len(all_raw_df_list)} of {all_stations.shape[0]}\nAnnual Cycle DF List: {len(all_annual_df_list)} of {all_stations.shape[0]}\nAnomaly DF List: {len(all_anom_df_list)} of {all_stations.shape[0]}\n")

Now processing...
https://climexp.knmi.nl/data/pa63630.1.dat
https://climexp.knmi.nl/data/pa63630.1__yr.txt
https://climexp.knmi.nl/data/pa63630.1_a.txt

Response Status: 200

Response Status: 200

Response Status: 200

Now processing...
https://climexp.knmi.nl/data/pa63630.5.dat
https://climexp.knmi.nl/data/pa63630.5__yr.txt
https://climexp.knmi.nl/data/pa63630.5_a.txt

Response Status: 200

Response Status: 200

Response Status: 200

Now processing...
https://climexp.knmi.nl/data/pa63658.3.dat
https://climexp.knmi.nl/data/pa63658.3__yr.txt
https://climexp.knmi.nl/data/pa63658.3_a.txt

Response Status: 200

Response Status: 200

Response Status: 200

Now processing...
https://climexp.knmi.nl/data/pa63612.dat
https://climexp.knmi.nl/data/pa63612__yr.txt
https://climexp.knmi.nl/data/pa63612_a.txt

Response Status: 200

Response Status: 200

Response Status: 200

Now processing...
https://climexp.knmi.nl/data/pa63628.1.dat
https://climexp.knmi.nl/data/pa63628.1__yr.txt
https://climexp.kn

In [104]:
# all_raw_df = pd.concat(all_raw_df_list)
# all_annual_df = pd.concat(all_annual_df_list)
# all_anom_df = pd.concat(all_anom_df_list)

all_raw_df = concat_format_wide(all_raw_df_list, all_stations["Station Code"].tolist())
all_annual_df = concat_format_wide(all_annual_df_list, all_stations["Station Code"].tolist())
all_anom_df = concat_format_wide(all_anom_df_list, all_stations["Station Code"].tolist())

Run the cell below to download all the monthly raw, annual cycle, and anomaly data in three separate .csv files.

In [105]:
path = "for_analysis/"
filename = f"{path}all_monthly_raw.csv"
all_raw_df.to_csv(filename, index=False)

path = "for_analysis/"
filename = f"{path}all_annual_cycle.csv"
all_annual_df.to_csv(filename, index=False)

path = "for_analysis/"
filename = f"{path}all_anomaly.csv"
all_anom_df.to_csv(filename, index=False)

# analysis

In [0]:
 # Imports
import numpy as np
import pandas as pd
import sys

from bs4 import BeautifulSoup
from requests_html import HTMLSession
from pprint import pprint

from bs4 import BeautifulSoup
from pprint import pprint
from urllib.parse import urljoin
import webbrowser
import re

import requests
import datetime
import time

In [0]:
station_monthly_raws = pd.read_csv("for_analysis/all_monthly_raw.csv")
station_annual_raws = pd.read_csv("for_analysis/all_annual_cycle.csv")
station_anom_raws = pd.read_csv("for_analysis/all_anomaly.csv")
series_monthly_raws = pd.read_csv("for_analysis/all_timeseries_monthly_raw.csv")
series_annual_raws = pd.read_csv("for_analysis/all_timeseries_annual_cycle.csv")
series_anom_raws = pd.read_csv("for_analysis/all_timeseries_anomaly.csv")
coordinates = pd.read_csv("for_analysis/BoxCoordinates.csv")
stations = pd.read_csv("for_analysis/box_coordinate_stations.csv")

# utilities

# Unrelated Utilities

Below are some utilities I used to organize my working files. Completely unrelated to the KNMI Climate Explorer web scraping tools.

In [104]:
# remove("raw_data\station_61045_raw_data")

from pathlib import Path

file_path = Path('C:\\Users\\Cecile Dai\\Downloads\\all_stations_lat=-5--4_lon=20-25.csv.csv')


try:
    # file_path.unlink()
    file_path.unlink()
except OSError as e:
    print("Error: %s : %s" % (file_path, e.strerror))

https://cosmosdb.github.io/labs/dotnet/labs/02-load_data_with_adf.html

# get_stations_SCRAPPED

# Working Code for KNMI Monthly Station Data Analysis

Now outdated. Written by Cecile Dai.

Uses information from https://climexp.knmi.nl/about.cgi?id=someone@somewhere

## Stations

Get the all stations in a specified latitude-longitude coordinate range.

### Setup

Start by getting the form tags so that we know what we need to submit to obtain the desired information.

In [0]:
# Imports
import numpy as np
import pandas as pd
import sys

from bs4 import BeautifulSoup
from requests_html import HTMLSession
from pprint import pprint

# Link to the KNMI page where you select the stations and data to view a monthly time series
station_link = 'https://climexp.knmi.nl/selectstation.cgi?id=someone@somewhere'

# initialize HTTP session so data/cookies are persisted
session = HTMLSession()

Parse all tags from a web page.

In [0]:
# Function to get all form tags from a given web page
def get_all_forms(url):
    # GET request
    res = session.get(url)
    soup = BeautifulSoup(res.html.html, "html.parser")
    return soup.find_all("form")

Get all form details.

In [0]:
# Returns the HTML details of a form, including action, method and list of form controls (inputs, etc)
def get_form_details(form):
    details = {}
    # get the form action (requested URL)
    action = form.attrs.get("action").lower()
    # get the form method (POST, GET, DELETE, etc)
    # if not specified, GET is the default in HTML
    method = form.attrs.get("method", "get").lower()
    # get all form inputs
    inputs = []
    for input_tag in form.find_all("input"):
        # get type of input form control
        input_type = input_tag.attrs.get("type", "text")
        # get name attribute
        input_name = input_tag.attrs.get("name")
        # get the default value of that input tag
        input_value =input_tag.attrs.get("value", "")
        # add everything to that list
        inputs.append({"type": input_type, "name": input_name, "value": input_value})

    # Get all 'select' and 'textarea' input types
    for select in form.find_all("select"):
        # get the name attribute
        select_name = select.attrs.get("name")
        # set the type as select
        select_type = "select"
        select_options = []
        # the default select value
        select_default_value = ""
        # iterate over options and get the value of each
        for select_option in select.find_all("option"):
            # get the option value used to submit the form
            option_value = select_option.attrs.get("value")
            if option_value:
                select_options.append(option_value)
                if select_option.attrs.get("selected"):
                    # if 'selected' attribute is set, set this option as default    
                    select_default_value = option_value
        if not select_default_value and select_options:
            # if the default is not set, and there are options, take the first option as default
            select_default_value = select_options[0]
        # add the select to the inputs list
        inputs.append({"type": select_type, "name": select_name, "values": select_options, "value": select_default_value})
    for textarea in form.find_all("textarea"):
        # get the name attribute
        textarea_name = textarea.attrs.get("name")
        # set the type as textarea
        textarea_type = "textarea"
        # get the textarea value
        textarea_value = textarea.attrs.get("value", "")
        # add the textarea to the inputs list
        inputs.append({"type": textarea_type, "name": textarea_name, "value": textarea_value})
        # put everything to the resulting dictionary
    details["action"] = action
    details["method"] = method
    details["inputs"] = inputs
    return details

Run the following block to print out form details, with input types and values.

In [0]:
# get all form tags
forms = get_all_forms(station_link)
# iteratte over forms
for i, form in enumerate(forms, start=1):
    form_details = get_form_details(form)
    print("="*50, f"form #{i}", "="*50)
    print("action: {action}\n\n".format(action=form_details["action"]))
    print("method: {method}\n\n".format(method=form_details["method"]))
    for input in form_details["inputs"]:
        print("input: {i}\n".format(i=input))

### Create Submission Form

Start off by noting the submission values you want. For the purposes of only getting a monthly station list in a certain area for precipitation, only "select: precipitation_all", "text: lat1", "text: lat2", "text: "lon1", and "text: lon2" will be filled out. These inputs select the precipitation measurements under GHCN-M (all) for all stations within a selected latitude and longitude area.

In [0]:
# Imports
from bs4 import BeautifulSoup
from pprint import pprint
from urllib.parse import urljoin
import webbrowser


# Select precipitation_all

# Define desired latitude and longitude. Change this for different areas.
# lat1, lat2 = 0, 5
# lon1, lon2 = 30, 35

lat1, lat2 = 10, 15
lon1, lon2 = 10, 15

In [0]:
# the data body we want to submit
data = {}

data["email"] = "someone@somewhere"
data["climate"] = "precipitation_all"
data["name"] = ""
data["num"] = 10
data["lat"] = ""
data["lon"] = ""
data["lat1"] = lat1
data["lat2"] = lat2
data["lon1"] = lon1
data["lon2"] = lon2
data["list"] = """# lon1 lon2 lat1 lat2 (optional)
station number (one per line)"""
data["min"] = 10
data["sum"] = 1
data["month"] = -1
data["yr1"] = ""
data["yr2"] = ""
data["dist"] = ""
data["elevmin"] = ""
data["elevmax"] = ""

print(data)

### Parse Submission Response

Submit form with desired inputs and extract relevant information from HTTP response.

In [0]:
# join the url with the action (form request URL)
url = urljoin(station_link, form_details["action"])

# pprint(data)
# Submit form and get response in res
if form_details["method"] == "post":
    res = session.post(url, data=data)
elif form_details["method"] == "get":
    res = session.get(url, params=data)
elif form_details["method"] == "put":
    res = session.put(url, params=data)

soup = BeautifulSoup(res.content, "html.parser")

import re

start = re.search('</div>\n</form>', res.text)
stop = re.search('</div>\n<div class="col-md-4">', res.text)

# I will admit this is not the prettiest way to extract the desired text but the pure regex method just wasn't working :(
limit1 = re.split(', ', str(start.span()))
limit2 = re.split(', ', str(stop.span()))
l1 = int(limit1[1].replace(')', ''))
l2 = int(limit2[0].replace('(', ''))

print(res.text[l1:l2])

relevant_text = res.text[l1:l2]

### Get Stations

Parse through response data and retrieve station links in a dataframe.

In [0]:
lines = relevant_text.replace('\n', '').split('<br>')
        
header = ["Station Name", "Latitude", "Longitude", "Elevation", "Station Code", "Starting Year", "Ending Year", "Monthly Series Link"]
station_names = []
lats = []
lons = []
elevs = []
codes = []
start_year = []
end_year = []
links = []

stripped_lines = []

for i in range(len(lines)):
    if len("".join(lines[i].split())) != 0 and i >= 3:
        stripped_lines.append(lines[i])
        # print(lines[i])
    else:
        continue

for j in range(len(stripped_lines)):
    # Station names
    if j%4 == 0:
        # print(" ".join(stripped_lines[j].split()))
        station_names.append(" ".join(stripped_lines[j].split()))

    # Coordinates and elevation
    elif j%4 == 1:
        latitude = re.search('\s+(.*)N', stripped_lines[j])
        # print(latitude.group(0).strip())
        lats.append(latitude.group(0).strip())

        longitude = re.search(',\s+(.*)E', stripped_lines[j])
        # print(longitude.group(0).replace(',','').strip())
        lons.append(longitude.group(0).replace(',','').strip())

        elevation = re.search('E,\s+(.*m)', stripped_lines[j])
        # print(elevation.group(0).replace("E,", '').strip())
        elevs.append(elevation.group(0).replace("E,", '').strip())

    # Station code and monthly series link
    elif j%4 == 2:
        code = re.search('([0-9\.]+)', stripped_lines[j])
        # print(code.group(0))
        codes.append(code.group(0))

        link = re.search('\"(.*)\"', stripped_lines[j])
        # print(link.group(0).replace('"', ''))
        links.append(link.group(0).replace('"', ''))

    # Data start and end year
    elif j%4 == 3:
        years = re.search('([0-9]{4}\-[0-9]{4})', stripped_lines[j])
        # print(years.group(0))
        # print(years.group(0)[:4])
        # print(years.group(0)[5:])
        start_year.append(years.group(0)[:4])
        end_year.append(years.group(0)[5:])

data = {'Station Name': station_names, 'Latitude': lats, 'Longitude': lons, 'Elevation': elevs, 'Station Code': codes, 'Starting Year': start_year, 'Ending Year': end_year, 'Monthly Series Link': links}

df = pd.DataFrame(data)
print(df)

### Download Station Links in .csv File

Run this code block to download the data as a .csv file.

In [0]:
path = "stations/"
filename_range = "{p}all_stations_lat={a}to{b}_lon={c}to{d}.csv".format(p = path, a = lat1, b = lat2, c = lon1, d = lon2)
df.to_csv(filename_range, index=False)

## Station Data

Get the monthly precipitation, annual cycle, and anomaly data per station in the previously specified latitude-longitude coordinate range.

### Get Station Data

Gets the raw data for monthly precipitation, annual cycles, and anomalies, and puts it into dataframes.

In [0]:
import requests
import datetime
import time

raw_df_list = []
annual_df_list = []
anom_df_list = []

# Visit all links, use array indices to keep track of stations
for i in range(len(links)):
    station_data_url = "https://climexp.knmi.nl/" + links[i]
    station_data_name = station_names[i]
    station_data_code = codes[i]
    print("{a} | {b} | {c}".format(a=station_data_name, b=station_data_code, c=station_data_url))
    print("================================================================")

    raw_data_link = "https://climexp.knmi.nl/data/pa" + station_data_code + ".dat"
    annual_cycle_link = "https://climexp.knmi.nl/data/pa"+ station_data_code +"__yr.txt"
    anom_link = "https://climexp.knmi.nl/data/pa"+ station_data_code +"_a.txt"

    print("Station {a} of {b}".format(a = i + 1, b = len(links)))

    # wait if there is no response (due to timeouts)
    code = requests.head(raw_data_link)
    print("Response Status: {a}\n".format(a=code.status_code))
    if requests.head(raw_data_link).status_code == 404:
        time.sleep(60)

    datapage = requests.get(raw_data_link)

    # print(datapage.text)
    # raw_data_headers = ['Year', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    raw_data_list = []
    lines = datapage.text.split("\n")
    for line in lines:
        elements = line.split()
        # print(elements)
        if len(elements) == 0:
            continue
        if elements[0] == "#":
            continue
        else:
            # csvline = ','.join(elements)
            # print(csvline)
            raw_data_list.append(elements)
    raw_data_df = pd.DataFrame(raw_data_list, columns = ['Year', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'])
    raw_df_list.append(raw_data_df)
    print(raw_data_df)
    # print("confirm")

    annualpage = requests.get(annual_cycle_link)
    # print(annualpage.text)
    raw_data_yr_list = []
    lines = annualpage.text.split("\n")
    for line in lines:
        elements = line.split()
        if len(elements) == 0:
            continue
        if elements[0] == "#":
            continue
        else:
            monthint = re.findall('2{1}0{3,4}([1-9]{1}[012]?)01', elements[0])
            # print(monthint)
            if monthint and len(monthint) != 0:
                elements[0] = datetime.date(1900, int(monthint[0]), 1).strftime('%B')
                # print(len(elements))
                # print(elements)
                raw_data_yr_list.append(elements)
    raw_data_yr_df = pd.DataFrame(raw_data_yr_list, columns = ['Month', 'Mean', '2.5%', '17%', '50%', '83%', '97.5%'])
    annual_df_list.append(raw_data_yr_df)
    print(raw_data_yr_df)
    # print("confirm")

    anompage = requests.get(anom_link)
    # print(anompage.text)
    # raw_data_anom_list = {'Year': [], 'January': [], 'February': [], 'March': [], 'April': [], 'May': [], 'June': [], 'July': [], 'August': [], 'September': [], 'October': [], 'November': [], 'December': []}
    raw_data_anom_dict = {}
    raw_data_anom_list = []
    lines = anompage.text.split("\n")
    for line in lines:
        elements = line.split()
        if len(elements) == 0:
            continue
        if elements[0] == "#":
            continue
        else:
            date = re.findall('([0-9]{4})\.([0-9]{4})', elements[0])
            if date and len(date[0]) == 2:
                if date[0][0] not in raw_data_anom_dict:
                    raw_data_anom_dict[date[0][0]] = {'January': '-999.9', 'February': '-999.9', 'March':'-999.9', 'April': '-999.9', 'May': '-999.9', 'June': '-999.9', 'July': '-999.9', 'August': '-999.9', 'September': '-999.9', 'October': '-999.9', 'November': '-999.9', 'December': '-999.9'}
                monthint = round((float('0.' + date[0][1]) * 12) + 1)
                monthint = datetime.date(1900, int(monthint), 1).strftime('%B')
                raw_data_anom_dict[date[0][0]][monthint] = elements[1]
    for key in raw_data_anom_dict:
        dlist = []
        dlist.append(key)
        for d in raw_data_anom_dict[key]:
            dlist.append(raw_data_anom_dict[key][d])
        raw_data_anom_list.append(dlist)
    raw_data_anom_df = pd.DataFrame(raw_data_anom_list, columns = ['Year', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'])
    anom_df_list.append(raw_data_anom_df)
    print(raw_data_anom_df)
    # print("confirm")

    # ######## not needed ##########
    # page = requests.get(station_data_url)
    # # print(page.text)
    
    # pagesoup = BeautifulSoup(page.text)
    # linkdivs = pagesoup.find_all("div", {"class": "bijschrift"})
    # print(linkdivs)
    # print("")
    # ##############################


### Download Station Data as .csv Files

Run this block to download all raw precipitation data as separate .csv files.

In [0]:
for df in range(len(links)):
    # Change path to save to another directory
    path = "raw_data/"
    filename_station_data = "{p}station_{a}_raw_data.csv".format(p = path, a = codes[df])
    raw_df_list[df].to_csv(filename_station_data, index=False)

Run this block to download all annual cycle data as separate .csv files.

In [0]:
for df in range(len(links)):
    # Change path to save to another directory
    path = 'annual_data/'
    filename_annual_data = "{p}station_{a}_annual_cycle_data.csv".format(p = path, a = codes[df])
    annual_df_list[df].to_csv(filename_annual_data, index=False)

Run this block to download all anomaly data as separate .csv files.

In [0]:
for df in range(len(links)):
    # Change path to save to another directory
    path = 'anomalies_data/'
    filename_anom_data = "{p}station_{a}_anomaly_data.csv".format(p = path, a = codes[df])
    anom_df_list[df].to_csv(filename_anom_data, index=False)

Run this block to concatenate all the raw data into one .csv file, or to add it into an existing file

In [0]:
def concat_raw_format_long(raw_df_list, existing_file=''):
    full_raw_list = []
    columns = ["Year", "Month", "Precipitation (mm)"]
    for df in raw_df_list:
        for row in range(df.shape[0]):
            row_data = df.iloc[row]
            for element in range(len(row_data)):
                if element == 0:    # element 0 is year
                    pass
                else:
                    month = datetime.date(1900, int(element), 1).strftime('%B')
                    full_raw_list.append([row_data[0], month, row_data[element]])
                    print([row_data[0], month, row_data[element]])
    return full_raw_list
    # print(full_raw_list)

def concat_raw_format_wide(raw_df_list, station_list, existing_file=''):
    full_raw_list_df = []
    dfs = []    #list of dataframes to concatenate
    columns = ['Station', 'Year', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    for station_df in range(len(station_list)):
        codes = [station_list[station_df] for row in range(raw_df_list[station_df].shape[0])]
        new_df = raw_df_list[station_df].copy()
        new_df.insert(0, 'Station', codes)
        dfs.append(new_df)
    if len(existing_file) != 0:
        existing_df = pd.read_csv(existing_file)
        # Check if this data already exists in the file, remove from list to be concatenated if so
        for station in range(len(station_list)):
            if not existing_df.loc(station_list[station]):
                dfs.remove(dfs[station])
        dfs.append(existing_df)
    full_raw_list_df = pd.concat(dfs)
    return full_raw_list_df
    # print(full_raw_list)

# concat_raw_format_long(raw_df_list)
full_raw_list = concat_raw_format_wide(raw_df_list, codes)

print(full_raw_list)
print(full_raw_list.shape)

In [0]:
# Run to download file
full_raw_data_df = pd.DataFrame(full_raw_list, columns = ['Station', 'Year', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'])
print(full_raw_data_df)


Run this block to concatenate all the annual cycle data into one .csv file, or to add it into an existing file.

In [0]:
def concat_raw_annual_format_wide(raw_df_list, station_list, existing_file=''):
    full_raw_list = []
    for station_df in range(len(station_list)):
        df = raw_df_list[station_df]
        for row in range(df.shape[0]):
            new_row_data = [station_list[station_df]]
            for element in range(len(df.iloc[row])):
                new_row_data.append(df.iloc[row][element])
            full_raw_list.append(new_row_data)
    # if len(existing_file) != 0:
    #     existing_df = pd.read_csv(existing_file)
    #     # Check if this data already exists in the file
    #     for station in station_list:
    #         if not existing_df.loc[station]:

    return full_raw_list
    # print(full_raw_list)

full_raw_annual_list = concat_raw_annual_format_wide(annual_df_list, codes)

for element in full_raw_annual_list:
    print(element)

In [0]:
# Run to download file
full_annual_data_df = pd.DataFrame(full_raw_annual_list, columns = ['Station', 'Month', 'Mean', '2.5%', '17%', '50%', '83%', '97.5%'])
print(full_annual_data_df)

Run this block to concatenate all the anomaly data into one .csv file, or to add it into an existing file.

In [0]:
# Work in progress
# def concat_raw_anom_format_long(raw_df_list, existing_file=''):
#     full_raw_list = []
#     columns = ["Year", "Month", "Precipitation (mm)"]
#     for df in raw_df_list:
#         for row in range(df.shape[0]):
#             row_data = df.iloc[row]
#             for element in range(len(row_data)):
#                 if element == 0:    # element 0 is year
#                     pass
#                 else:
#                     month = datetime.date(1900, int(element), 1).strftime('%B')
#                     full_raw_list.append([row_data[0], month, row_data[element]])
#                     print([row_data[0], month, row_data[element]])
#     return full_raw_list
#     # print(full_raw_list)

def concat_raw_anom_format_wide(raw_df_list, station_list, existing_file=''):
    full_raw_list = []
    columns = ['Station', 'Year', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    for station_df in range(len(station_list)):
        df = raw_df_list[station_df]
        for row in range(df.shape[0]):
            new_row_data = [station_list[station_df]]
            for element in range(len(df.iloc[row])):
                new_row_data.append(df.iloc[row][element])
            full_raw_list.append(new_row_data)
    return full_raw_list
    # print(full_raw_list)

# concat_raw_anom_format_long(raw_df_list)
full_raw_anom_list = concat_raw_anom_format_wide(anom_df_list, codes)

for element in full_raw_anom_list:
    print(element)

In [0]:
# Run to download file
full_anom_data_df = pd.DataFrame(full_raw_anom_list, columns = ['Station', 'Year', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'])
print(full_anom_data_df)

## References

Code adapted from: https://www.thepythoncode.com/article/extracting-and-submitting-web-page-forms-in-python

Other Resources: