In [1]:
import pandas as pd
import datetime as dt
from fmiopendata.wfs import download_stored_query
from tqdm import tqdm

#### without retry policy, 6 days intervals

In [72]:
import datetime as dt
from fmiopendata.wfs import download_stored_query

start_date = dt.date(2023, 10, 1)  # Example start date
end_date = dt.date(2023, 12, 31)

data_list = []  # List to collect data

current_date = start_date
while current_date <= end_date:
    # Set the start time to the current date
    start_time = current_date.isoformat() + "T00:00:00Z"
    # Calculate the end time 7 days from the start time
    end_date_interval = current_date + dt.timedelta(days=6)
    # Make sure the end date does not exceed the overall end date
    if end_date_interval > end_date:
        end_date_interval = end_date
    end_time = end_date_interval.isoformat() + "T23:59:59Z"

    print(f"Downloading data from {start_time} to {end_time}")  # Optional: print statement to track progress

    # Download data for the current 7-day interval
    obs = download_stored_query("fmi::observations::weather::multipointcoverage",
                                args=["bbox=18,55,35,75",  # whole Finland
                                      "starttime=" + start_time,
                                      "timestep="+str(60*24),  # daily entries
                                      "endtime=" + end_time,
                                      "timeseries=True"])

    # Parse and organize the data
    for station, station_data in obs.data.items():
        times = station_data['times']
        for param, values in station_data.items():
            if param != 'times':  # Skip the 'times' key
                for time, value in zip(times, values['values']):
                    data_list.append({'Timestamp': time, 'Station': station, param: value})

    # Move to the next 7-day interval for the next iteration
    current_date += dt.timedelta(days=6)


Downloading data from 2023-10-01T00:00:00Z to 2023-10-07T23:59:59Z
Downloading data from 2023-10-07T00:00:00Z to 2023-10-13T23:59:59Z
Downloading data from 2023-10-13T00:00:00Z to 2023-10-19T23:59:59Z
Downloading data from 2023-10-19T00:00:00Z to 2023-10-25T23:59:59Z
Downloading data from 2023-10-25T00:00:00Z to 2023-10-31T23:59:59Z
Downloading data from 2023-10-31T00:00:00Z to 2023-11-06T23:59:59Z
Downloading data from 2023-11-06T00:00:00Z to 2023-11-12T23:59:59Z
Downloading data from 2023-11-12T00:00:00Z to 2023-11-18T23:59:59Z
Downloading data from 2023-11-18T00:00:00Z to 2023-11-24T23:59:59Z
Downloading data from 2023-11-24T00:00:00Z to 2023-11-30T23:59:59Z
Downloading data from 2023-11-30T00:00:00Z to 2023-12-06T23:59:59Z
Downloading data from 2023-12-06T00:00:00Z to 2023-12-12T23:59:59Z
Downloading data from 2023-12-12T00:00:00Z to 2023-12-18T23:59:59Z
Downloading data from 2023-12-18T00:00:00Z to 2023-12-24T23:59:59Z
Downloading data from 2023-12-24T00:00:00Z to 2023-12-30T23:59

In [3]:
import datetime as dt
from fmiopendata.wfs import download_stored_query
import time  # For adding delay between retries

start_date = dt.date(2009, 1, 1)  # Example start date
end_date = dt.date(2023, 12, 31)

data_list = []  # List to collect data

current_date = start_date
while current_date <= end_date:
    # Set the start time to the current date
    start_time = current_date.isoformat() + "T00:00:00Z"
    # Calculate the end time 7 days from the start time
    end_date_interval = current_date + dt.timedelta(days=6)
    # Make sure the end date does not exceed the overall end date
    if end_date_interval > end_date:
        end_date_interval = end_date
    end_time = end_date_interval.isoformat() + "T23:59:59Z"

    print(f"Downloading data from {start_time} to {end_time}")  # Optional: print statement to track progress

    max_retries = 5  # Maximum number of retries for each download attempt
    retry_delay = 5  # Delay between retries in seconds

    for attempt in range(max_retries):
        try:
            # Download data for the current 7-day interval
            obs = download_stored_query("fmi::observations::weather::multipointcoverage",
                                        args=["bbox=18,55,35,75",  # whole Finland
                                              "starttime=" + start_time,
                                              "timestep=" + str(60*24),  # daily entries
                                              "endtime=" + end_time,
                                              "timeseries=True"])
            # If download is successful, break out of the retry loop
            break
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)  # Wait before retrying
            else:
                print("Max retries reached, moving to next interval")
                obs = None  # Set obs to None to handle the case where all retries fail

    if obs is not None:
        # Parse and organize the data only if the download was successful
        for station, station_data in obs.data.items():
            times = station_data['times']
            for param, values in station_data.items():
                if param != 'times':  # Skip the 'times' key
                    for time, value in zip(times, values['values']):
                        data_list.append({'Timestamp': time, 'Station': station, param: value})

    # Move to the next 7-day interval for the next iteration
    current_date += dt.timedelta(days=7)  # Fixed to add 7 instead of 6 to avoid overlapping dates


Downloading data from 2009-01-01T00:00:00Z to 2009-01-07T23:59:59Z
No observations found
Downloading data from 2009-01-08T00:00:00Z to 2009-01-14T23:59:59Z
Downloading data from 2009-01-15T00:00:00Z to 2009-01-21T23:59:59Z
Downloading data from 2009-01-22T00:00:00Z to 2009-01-28T23:59:59Z
Downloading data from 2009-01-29T00:00:00Z to 2009-02-04T23:59:59Z
Downloading data from 2009-02-05T00:00:00Z to 2009-02-11T23:59:59Z
Downloading data from 2009-02-12T00:00:00Z to 2009-02-18T23:59:59Z
Downloading data from 2009-02-19T00:00:00Z to 2009-02-25T23:59:59Z
Downloading data from 2009-02-26T00:00:00Z to 2009-03-04T23:59:59Z
Downloading data from 2009-03-05T00:00:00Z to 2009-03-11T23:59:59Z
Downloading data from 2009-03-12T00:00:00Z to 2009-03-18T23:59:59Z
Downloading data from 2009-03-19T00:00:00Z to 2009-03-25T23:59:59Z
Downloading data from 2009-03-26T00:00:00Z to 2009-04-01T23:59:59Z
Downloading data from 2009-04-02T00:00:00Z to 2009-04-08T23:59:59Z
Downloading data from 2009-04-09T00:00:0

Downloading data from 2011-05-05T00:00:00Z to 2011-05-11T23:59:59Z
Downloading data from 2011-05-12T00:00:00Z to 2011-05-18T23:59:59Z
Downloading data from 2011-05-19T00:00:00Z to 2011-05-25T23:59:59Z
Downloading data from 2011-05-26T00:00:00Z to 2011-06-01T23:59:59Z
Downloading data from 2011-06-02T00:00:00Z to 2011-06-08T23:59:59Z
Downloading data from 2011-06-09T00:00:00Z to 2011-06-15T23:59:59Z
Downloading data from 2011-06-16T00:00:00Z to 2011-06-22T23:59:59Z
Downloading data from 2011-06-23T00:00:00Z to 2011-06-29T23:59:59Z
Downloading data from 2011-06-30T00:00:00Z to 2011-07-06T23:59:59Z
Downloading data from 2011-07-07T00:00:00Z to 2011-07-13T23:59:59Z
Downloading data from 2011-07-14T00:00:00Z to 2011-07-20T23:59:59Z
Downloading data from 2011-07-21T00:00:00Z to 2011-07-27T23:59:59Z
Downloading data from 2011-07-28T00:00:00Z to 2011-08-03T23:59:59Z
Downloading data from 2011-08-04T00:00:00Z to 2011-08-10T23:59:59Z
Downloading data from 2011-08-11T00:00:00Z to 2011-08-17T23:59

Downloading data from 2013-09-12T00:00:00Z to 2013-09-18T23:59:59Z
Downloading data from 2013-09-19T00:00:00Z to 2013-09-25T23:59:59Z
Downloading data from 2013-09-26T00:00:00Z to 2013-10-02T23:59:59Z
Downloading data from 2013-10-03T00:00:00Z to 2013-10-09T23:59:59Z
Downloading data from 2013-10-10T00:00:00Z to 2013-10-16T23:59:59Z
Downloading data from 2013-10-17T00:00:00Z to 2013-10-23T23:59:59Z
Downloading data from 2013-10-24T00:00:00Z to 2013-10-30T23:59:59Z
Downloading data from 2013-10-31T00:00:00Z to 2013-11-06T23:59:59Z
Downloading data from 2013-11-07T00:00:00Z to 2013-11-13T23:59:59Z
Downloading data from 2013-11-14T00:00:00Z to 2013-11-20T23:59:59Z
Downloading data from 2013-11-21T00:00:00Z to 2013-11-27T23:59:59Z
Downloading data from 2013-11-28T00:00:00Z to 2013-12-04T23:59:59Z
Downloading data from 2013-12-05T00:00:00Z to 2013-12-11T23:59:59Z
Downloading data from 2013-12-12T00:00:00Z to 2013-12-18T23:59:59Z
Downloading data from 2013-12-19T00:00:00Z to 2013-12-25T23:59

Downloading data from 2016-01-21T00:00:00Z to 2016-01-27T23:59:59Z
Downloading data from 2016-01-28T00:00:00Z to 2016-02-03T23:59:59Z
Downloading data from 2016-02-04T00:00:00Z to 2016-02-10T23:59:59Z
Downloading data from 2016-02-11T00:00:00Z to 2016-02-17T23:59:59Z
Downloading data from 2016-02-18T00:00:00Z to 2016-02-24T23:59:59Z
Downloading data from 2016-02-25T00:00:00Z to 2016-03-02T23:59:59Z
Downloading data from 2016-03-03T00:00:00Z to 2016-03-09T23:59:59Z
Downloading data from 2016-03-10T00:00:00Z to 2016-03-16T23:59:59Z
Downloading data from 2016-03-17T00:00:00Z to 2016-03-23T23:59:59Z
Downloading data from 2016-03-24T00:00:00Z to 2016-03-30T23:59:59Z
Downloading data from 2016-03-31T00:00:00Z to 2016-04-06T23:59:59Z
Downloading data from 2016-04-07T00:00:00Z to 2016-04-13T23:59:59Z
Downloading data from 2016-04-14T00:00:00Z to 2016-04-20T23:59:59Z
Downloading data from 2016-04-21T00:00:00Z to 2016-04-27T23:59:59Z
Downloading data from 2016-04-28T00:00:00Z to 2016-05-04T23:59

Downloading data from 2018-05-31T00:00:00Z to 2018-06-06T23:59:59Z
Downloading data from 2018-06-07T00:00:00Z to 2018-06-13T23:59:59Z
Downloading data from 2018-06-14T00:00:00Z to 2018-06-20T23:59:59Z
Downloading data from 2018-06-21T00:00:00Z to 2018-06-27T23:59:59Z
Downloading data from 2018-06-28T00:00:00Z to 2018-07-04T23:59:59Z
Downloading data from 2018-07-05T00:00:00Z to 2018-07-11T23:59:59Z
Downloading data from 2018-07-12T00:00:00Z to 2018-07-18T23:59:59Z
Downloading data from 2018-07-19T00:00:00Z to 2018-07-25T23:59:59Z
Downloading data from 2018-07-26T00:00:00Z to 2018-08-01T23:59:59Z
Downloading data from 2018-08-02T00:00:00Z to 2018-08-08T23:59:59Z
Downloading data from 2018-08-09T00:00:00Z to 2018-08-15T23:59:59Z
Downloading data from 2018-08-16T00:00:00Z to 2018-08-22T23:59:59Z
Downloading data from 2018-08-23T00:00:00Z to 2018-08-29T23:59:59Z
Downloading data from 2018-08-30T00:00:00Z to 2018-09-05T23:59:59Z
Downloading data from 2018-09-06T00:00:00Z to 2018-09-12T23:59

Downloading data from 2020-10-08T00:00:00Z to 2020-10-14T23:59:59Z
Downloading data from 2020-10-15T00:00:00Z to 2020-10-21T23:59:59Z
Downloading data from 2020-10-22T00:00:00Z to 2020-10-28T23:59:59Z
Downloading data from 2020-10-29T00:00:00Z to 2020-11-04T23:59:59Z
Downloading data from 2020-11-05T00:00:00Z to 2020-11-11T23:59:59Z
Downloading data from 2020-11-12T00:00:00Z to 2020-11-18T23:59:59Z
Downloading data from 2020-11-19T00:00:00Z to 2020-11-25T23:59:59Z
Downloading data from 2020-11-26T00:00:00Z to 2020-12-02T23:59:59Z
Downloading data from 2020-12-03T00:00:00Z to 2020-12-09T23:59:59Z
Downloading data from 2020-12-10T00:00:00Z to 2020-12-16T23:59:59Z
Downloading data from 2020-12-17T00:00:00Z to 2020-12-23T23:59:59Z
Downloading data from 2020-12-24T00:00:00Z to 2020-12-30T23:59:59Z
Downloading data from 2020-12-31T00:00:00Z to 2021-01-06T23:59:59Z
Downloading data from 2021-01-07T00:00:00Z to 2021-01-13T23:59:59Z
Downloading data from 2021-01-14T00:00:00Z to 2021-01-20T23:59

Downloading data from 2023-02-16T00:00:00Z to 2023-02-22T23:59:59Z
Downloading data from 2023-02-23T00:00:00Z to 2023-03-01T23:59:59Z
Downloading data from 2023-03-02T00:00:00Z to 2023-03-08T23:59:59Z
Downloading data from 2023-03-09T00:00:00Z to 2023-03-15T23:59:59Z
Downloading data from 2023-03-16T00:00:00Z to 2023-03-22T23:59:59Z
Downloading data from 2023-03-23T00:00:00Z to 2023-03-29T23:59:59Z
Downloading data from 2023-03-30T00:00:00Z to 2023-04-05T23:59:59Z
Downloading data from 2023-04-06T00:00:00Z to 2023-04-12T23:59:59Z
Downloading data from 2023-04-13T00:00:00Z to 2023-04-19T23:59:59Z
Downloading data from 2023-04-20T00:00:00Z to 2023-04-26T23:59:59Z
Downloading data from 2023-04-27T00:00:00Z to 2023-05-03T23:59:59Z
Downloading data from 2023-05-04T00:00:00Z to 2023-05-10T23:59:59Z
Downloading data from 2023-05-11T00:00:00Z to 2023-05-17T23:59:59Z
Downloading data from 2023-05-18T00:00:00Z to 2023-05-24T23:59:59Z
Downloading data from 2023-05-25T00:00:00Z to 2023-05-31T23:59

#### data sorted by measurement type

In [4]:
# Identify all unique data types (excluding 'Timestamp' and 'Station')
data_types = set(key for entry in data_list for key in entry if key not in ['Timestamp', 'Station'])

# Initialize a dictionary to hold a DataFrame for each data type
dfs = {}

# Extract unique station names and timestamps
station_names = sorted(set(entry['Station'] for entry in data_list))
timestamps = sorted(set(entry['Timestamp'] for entry in data_list))

# Create a DataFrame for each data type
for data_type in data_types:
    # Filter entries for the current data type
    filtered_data = [
        {key: value for key, value in entry.items() if key in ['Timestamp', 'Station', data_type]}
        for entry in data_list if data_type in entry
    ]

    # Initialize an empty DataFrame for the current data type
    df = pd.DataFrame(columns=station_names, index=pd.to_datetime(timestamps))

    # Fill the DataFrame with the current data type's measurements
    for entry in filtered_data:
        timestamp = entry['Timestamp']
        station = entry['Station']
        value = entry.get(data_type)  # Use .get() to handle missing data_type in some entries
        df.at[timestamp, station] = value

    # Filter rows that have data from at least half of the measurement stations
    threshold = len(station_names) // 2  # At least half of the stations must have data
    df_filtered = df.dropna(thresh=threshold)

    # Store the filtered DataFrame in the dictionary
    dfs[data_type] = df_filtered

# Access a specific filtered DataFrame by its data type, for example:
dfs['Wind speed']  # For filtered wind speed data

Unnamed: 0,Alajärvi Möksy,Asikkala Pulkkilanharju,Eckerö Torp,Enontekiö Kilpisjärvi Saana,Enontekiö Kilpisjärvi kyläkeskus,Enontekiö Näkkälä,Enontekiö lentoasema,Espoo Nuuksio,Espoo Sepänkylä,Espoo Tapiola,...,Varkaus Kosulanniemi,Vesanto kirkonkylä,Vieremä Kaarakkala,Vihti Maasoja,Viitasaari Haapaniemi,Virolahti Koivuniemi,Virrat Äijänneva,Ylitornio Meltosjärvi,Ylivieska lentokenttä,Ähtäri Inha
2013-01-03,,1.4,,,,,,,2.7,,...,,,2.9,1.1,1.5,2.6,3.8,,2.8,0.0
2013-01-04,,2.9,,,,,,,1.1,,...,,,1.9,1.0,2.0,1.2,2.3,,2.2,1.1
2013-01-05,,9.9,,,,,,,6.1,,...,,,1.9,5.1,0.7,2.7,4.6,,2.4,1.8
2013-01-06,,5.8,,,,,,,3.8,,...,,,1.4,3.8,1.2,4.3,2.6,,0.7,1.1
2013-01-07,,0.8,,,,,,,0.0,,...,,,2.1,0.7,0.9,3.0,2.6,,0.7,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-27,,2.3,,7.3,4.0,3.2,2.7,2.6,,3.9,...,1.0,,1.3,0.0,1.3,0.8,1.3,0.8,1.3,
2023-12-28,,0.6,,6.9,0.4,1.5,0.6,2.0,,3.0,...,2.7,,1.2,1.1,0.8,0.0,1.3,0.4,2.4,
2023-12-29,,4.0,,5.8,0.6,0.9,0.5,2.9,,3.7,...,1.1,,2.0,1.6,1.1,3.2,3.3,0.6,2.1,
2023-12-30,,4.0,,10.4,2.9,2.4,1.8,3.9,,3.7,...,2.3,,2.5,2.1,1.2,3.4,3.8,1.1,3.6,


In [5]:
import os

# Define the subfolder name
subfolder = 'data_by_measurement_type'

# Check if the subfolder exists, and if not, create it
if not os.path.exists(subfolder):
    os.makedirs(subfolder)

# Iterate over the dfs dictionary to save each DataFrame to a CSV file in the subfolder
for data_type, df in dfs.items():
    # Format the data_type string to create a valid and readable filename
    filename = f"{data_type.replace(' ', '_').lower()}_data.csv"
    # Create the full path by joining the subfolder and filename
    full_path = os.path.join(subfolder, filename)
    
    # Save the DataFrame to a CSV file at the full path
    df.to_csv(full_path)
    
    print(f"Saved {data_type} data to {full_path}")


Saved Relative humidity data to data_by_measurement_type/relative_humidity_data.csv
Saved Precipitation intensity data to data_by_measurement_type/precipitation_intensity_data.csv
Saved Horizontal visibility data to data_by_measurement_type/horizontal_visibility_data.csv
Saved Wind direction data to data_by_measurement_type/wind_direction_data.csv
Saved Cloud amount data to data_by_measurement_type/cloud_amount_data.csv
Saved Dew-point temperature data to data_by_measurement_type/dew-point_temperature_data.csv
Saved Wind speed data to data_by_measurement_type/wind_speed_data.csv
Saved Gust speed data to data_by_measurement_type/gust_speed_data.csv
Saved Air temperature data to data_by_measurement_type/air_temperature_data.csv
Saved Snow depth data to data_by_measurement_type/snow_depth_data.csv
Saved Present weather (auto) data to data_by_measurement_type/present_weather_(auto)_data.csv
Saved Pressure (msl) data to data_by_measurement_type/pressure_(msl)_data.csv
Saved Precipitation am

#### Data grouped by the measurement station

In [6]:

# Identify all unique stations and measurement types
stations = sorted(set(entry['Station'] for entry in data_list))
measurement_types = sorted(set(key for entry in data_list for key in entry if key not in ['Timestamp', 'Station']))

# Initialize a dictionary to hold a DataFrame for each station
station_dfs = {}

# Create a DataFrame for each station
for station in stations:
    # Filter entries for the current station
    station_data = [entry for entry in data_list if entry['Station'] == station]
    
    # Extract timestamps for the current station
    timestamps = sorted(set(entry['Timestamp'] for entry in station_data))
    
    # Initialize an empty DataFrame for the current station
    df = pd.DataFrame(index=pd.to_datetime(timestamps), columns=measurement_types)
    
    # Fill the DataFrame with measurements
    for entry in station_data:
        timestamp = entry['Timestamp']
        for measurement in measurement_types:
            if measurement in entry:
                df.at[timestamp, measurement] = entry[measurement]

    # Store the DataFrame in the dictionary
    station_dfs[station] = df

# Define the subfolder name
subfolder = 'data_by_station'

# Check if the subfolder exists, and if not, create it
if not os.path.exists(subfolder):
    os.makedirs(subfolder)

# Save each station's DataFrame to a CSV file in the subfolder
for station, df in station_dfs.items():
    # Format the station name to create a valid and readable filename
    filename = f"{station.replace(' ', '_').replace('/', '_').lower()}.csv"
    full_path = os.path.join(subfolder, filename)
    
    # Save the DataFrame to a CSV file
    df.to_csv(full_path)
    
    print(f"Saved data for {station} to {full_path}")

Saved data for Alajärvi Möksy to data_by_station/alajärvi_möksy.csv
Saved data for Asikkala Pulkkilanharju to data_by_station/asikkala_pulkkilanharju.csv
Saved data for Eckerö Torp to data_by_station/eckerö_torp.csv
Saved data for Enontekiö Kilpisjärvi Saana to data_by_station/enontekiö_kilpisjärvi_saana.csv
Saved data for Enontekiö Kilpisjärvi kyläkeskus to data_by_station/enontekiö_kilpisjärvi_kyläkeskus.csv
Saved data for Enontekiö Näkkälä to data_by_station/enontekiö_näkkälä.csv
Saved data for Enontekiö lentoasema to data_by_station/enontekiö_lentoasema.csv
Saved data for Espoo Nuuksio to data_by_station/espoo_nuuksio.csv
Saved data for Espoo Sepänkylä to data_by_station/espoo_sepänkylä.csv
Saved data for Espoo Tapiola to data_by_station/espoo_tapiola.csv
Saved data for Haapavesi Mustikkamäki to data_by_station/haapavesi_mustikkamäki.csv
Saved data for Hailuoto Keskikylä to data_by_station/hailuoto_keskikylä.csv
Saved data for Hailuoto Marjaniemi to data_by_station/hailuoto_marjani

Saved data for Multia Karhila to data_by_station/multia_karhila.csv
Saved data for Muonio Alamuonio to data_by_station/muonio_alamuonio.csv
Saved data for Muonio Laukukero to data_by_station/muonio_laukukero.csv
Saved data for Muonio Oustajärvi to data_by_station/muonio_oustajärvi.csv
Saved data for Muonio Sammaltunturi to data_by_station/muonio_sammaltunturi.csv
Saved data for Mustasaari Valassaaret to data_by_station/mustasaari_valassaaret.csv
Saved data for Mäntsälä Hirvihaara to data_by_station/mäntsälä_hirvihaara.csv
Saved data for Nurmes Valtimo to data_by_station/nurmes_valtimo.csv
Saved data for Nurmijärvi Röykkä to data_by_station/nurmijärvi_röykkä.csv
Saved data for Oulu Kaukovainio to data_by_station/oulu_kaukovainio.csv
Saved data for Oulu Oulunsalo Pellonpää to data_by_station/oulu_oulunsalo_pellonpää.csv
Saved data for Oulu Vihreäsaari satama to data_by_station/oulu_vihreäsaari_satama.csv
Saved data for Oulu lentoasema to data_by_station/oulu_lentoasema.csv
Saved data for