In [None]:
import pandas as pd
import datetime as dt
from fmiopendata.wfs import download_stored_query
from tqdm import tqdm

#### Download observations (adjust the start and end dates below as needed)

In [None]:
# Date range to download; both ends are inclusive.
start_date = dt.date(2023, 12, 1)  # Example start date
end_date = dt.date(2023, 12, 31)

# Flat list of records, one per (timestamp, station, measurement) triple:
#   {'Timestamp': <time>, 'Station': <station name>, <measurement name>: <value>}
data_list = []

# Materialise the days up front so tqdm wraps a real iterable with a known
# length. Wrapping the loop *condition* in tqdm creates a brand-new progress
# bar on every pass and never shows actual progress.
day_count = (end_date - start_date).days + 1
days = [start_date + dt.timedelta(days=offset) for offset in range(day_count)]

for current_date in tqdm(days, desc="Downloading", unit="day"):
    start_time = current_date.isoformat() + "T00:00:00Z"
    end_time = current_date.isoformat() + "T23:59:59Z"

    # Download data for the current day
    obs = download_stored_query("fmi::observations::weather::multipointcoverage",
                        args=["bbox=18,55,35,75",  #whole finland
                              "starttime=" + start_time,
                              "timestep="+str(60*24),  #daily entries
                              "endtime=" + end_time,
                              "timeseries=True"])

    # Parse and organize the data: every station maps to a dict holding a
    # shared 'times' list plus one entry per measurement with its 'values'.
    for station, station_data in obs.data.items():
        times = station_data['times']
        for param, values in station_data.items():
            if param == 'times':  # Skip the 'times' key, it is not a measurement
                continue
            for time, value in zip(times, values['values']):
                data_list.append({'Timestamp': time, 'Station': station, param: value})



#### Data sorted by measurement type

In [None]:
# Build one table per measurement type: rows are timestamps, columns are
# stations. Input is `data_list`, whose records each carry 'Timestamp',
# 'Station' and one or more measurement keys.

# Group the records by measurement type in a single pass instead of
# rescanning the whole list once per type.
entries_by_type = {}
for entry in data_list:
    for key in entry:
        if key not in ('Timestamp', 'Station'):
            entries_by_type.setdefault(key, []).append(entry)

# Identify all unique data types (excluding 'Timestamp' and 'Station')
data_types = set(entries_by_type)

# Initialize a dictionary to hold a DataFrame for each data type
dfs = {}

# Extract unique station names and timestamps
station_names = sorted(set(entry['Station'] for entry in data_list))
timestamps = sorted(set(entry['Timestamp'] for entry in data_list))

# A row is kept only if at least half of the stations reported a value.
# Ceiling division: plain `// 2` rounds down, which would accept rows with
# fewer than half of the stations (e.g. 2 of 5) and yields 0 for one station.
threshold = (len(station_names) + 1) // 2

# Create a DataFrame for each data type
for data_type in data_types:
    # Start from an empty (all-NaN) table covering every station and timestamp
    df = pd.DataFrame(columns=station_names, index=pd.to_datetime(timestamps))

    # Fill the table with the current data type's measurements
    for entry in entries_by_type[data_type]:
        df.at[entry['Timestamp'], entry['Station']] = entry[data_type]

    # dropna(thresh=k) keeps rows that have at least k non-missing values
    df_filtered = df.dropna(thresh=threshold)

    # Store the filtered DataFrame in the dictionary
    dfs[data_type] = df_filtered

# Access a specific filtered DataFrame by its data type, for example:
dfs['Wind speed']  # For filtered wind speed data

In [None]:
import os

# Write every per-measurement table in `dfs` to its own CSV file.

# Define the subfolder name
subfolder = 'data_by_measurement_type'

# Create the subfolder if needed; exist_ok makes this safe to re-run and
# avoids the gap between a separate existence check and the creation.
os.makedirs(subfolder, exist_ok=True)

# Iterate over the dfs dictionary to save each DataFrame to a CSV file in the subfolder
for data_type, df in dfs.items():
    # Format the data_type string to create a valid and readable filename.
    # '/' is replaced as well (as the per-station export does) so that a
    # name containing it cannot be interpreted as a directory separator.
    safe_name = data_type.replace(' ', '_').replace('/', '_').lower()
    filename = f"{safe_name}_data.csv"
    # Create the full path by joining the subfolder and filename
    full_path = os.path.join(subfolder, filename)

    # Save the DataFrame to a CSV file at the full path
    df.to_csv(full_path)

    print(f"Saved {data_type} data to {full_path}")


#### Data grouped by the measurement station

In [None]:

# Build one table per station: rows are timestamps, columns are measurement
# types. Input is `data_list`, whose records each carry 'Timestamp',
# 'Station' and one or more measurement keys.

# Identify all unique stations and measurement types
stations = sorted(set(entry['Station'] for entry in data_list))
measurement_types = sorted(set(key for entry in data_list for key in entry if key not in ['Timestamp', 'Station']))

# Group the records by station in a single pass instead of rescanning the
# whole list once per station.
entries_by_station = {}
for entry in data_list:
    entries_by_station.setdefault(entry['Station'], []).append(entry)

# Initialize a dictionary to hold a DataFrame for each station
station_dfs = {}

# Create a DataFrame for each station
for station in stations:
    # Records belonging to the current station
    station_data = entries_by_station[station]

    # Extract timestamps for the current station
    timestamps = sorted(set(entry['Timestamp'] for entry in station_data))

    # Start from an empty (all-NaN) table for the current station
    df = pd.DataFrame(index=pd.to_datetime(timestamps), columns=measurement_types)

    # Fill the table with measurements; every non-metadata key of a record
    # is a measurement name and therefore one of the table's columns.
    for entry in station_data:
        timestamp = entry['Timestamp']
        for measurement, value in entry.items():
            if measurement not in ('Timestamp', 'Station'):
                df.at[timestamp, measurement] = value

    # Store the DataFrame in the dictionary
    station_dfs[station] = df

# Write every per-station table in `station_dfs` to its own CSV file.

# Define the subfolder name
subfolder = 'data_by_station'

# Create the subfolder if needed; exist_ok makes this safe to re-run and
# avoids the gap between a separate existence check and the creation.
os.makedirs(subfolder, exist_ok=True)

# Save each station's DataFrame to a CSV file in the subfolder
for station, df in station_dfs.items():
    # Format the station name to create a valid and readable filename
    safe_name = station.replace(' ', '_').replace('/', '_').lower()
    filename = f"{safe_name}.csv"
    full_path = os.path.join(subfolder, filename)

    # Save the DataFrame to a CSV file
    df.to_csv(full_path)

    print(f"Saved data for {station} to {full_path}")