In [8]:
import pandas as pd
import requests
from io import StringIO
from pathlib import Path

# KNMI monthly-value text files (station 260 in every URL), keyed by the
# quantity each file contains. The keys are later lower-cased into the 'type' column.
data_urls = dict(
    AIRPRESSURE="https://cdn.knmi.nl/knmi/map/page/klimatologie/gegevens/maandgegevens/mndgeg_260_pg.txt",
    PRECIPITATION="https://cdn.knmi.nl/knmi/map/page/klimatologie/gegevens/maandgegevens/mndgeg_260_rh24.txt",
    MINIMUM_TEMPERATURES="https://cdn.knmi.nl/knmi/map/page/klimatologie/gegevens/maandgegevens/mndgeg_260_tng.txt",
    MAXIMUM_TEMPERATURES="https://cdn.knmi.nl/knmi/map/page/klimatologie/gegevens/maandgegevens/mndgeg_260_txg.txt",
    MEAN_TEMPERATURES="https://cdn.knmi.nl/knmi/map/page/klimatologie/gegevens/maandgegevens/mndgeg_260_tg.txt",
)

# Column names applied to every downloaded table: station id, year,
# the twelve month columns, and the trailing YEAR column.
columns = [
    "STN", "YYYY",
    "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
    "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
    "YEAR",
]

# Function to download and convert data to DataFrame
def load_knmi_data(url, data_type, timeout=30):
    """Download one KNMI text file and parse its table into a DataFrame.

    Everything before the first line starting with ``"STN,"`` is treated as
    preamble and discarded; that line itself is the file's own header and is
    skipped in favour of the module-level ``columns`` names.

    Parameters
    ----------
    url : str
        Address of the text file to download.
    data_type : str
        Label for this dataset; stored lower-cased in a new ``type`` column.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table with an added ``type`` column, or ``None`` when
        pandas cannot parse the table (the parser error is printed).

    Raises
    ------
    Exception
        If the server answers with a status code other than 200.
    ValueError
        If the downloaded text contains no line starting with ``"STN,"``.

    Errors raised by ``requests.get`` itself (e.g. timeouts) are not caught here.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to download data from {url}, status code: {response.status_code}")

    # Split the payload into lines (newlines kept) so the header can be located
    lines = StringIO(response.text).readlines()

    # Locate the table header; fail with a clear message instead of leaking a
    # bare StopIteration when the file layout is not what we expect.
    start_index = next(
        (i for i, line in enumerate(lines) if line.startswith("STN,")),
        None,
    )
    if start_index is None:
        raise ValueError(f"No header line starting with 'STN,' found in data from {url}")

    table_text = StringIO(''.join(lines[start_index:]))

    try:
        # skiprows=1 drops the file's own header line; our column names are used instead
        df = pd.read_csv(table_text, sep=',', names=columns, na_values=[" "], skiprows=1, engine='python')
    except pd.errors.ParserError as e:
        # Best effort: report the problem and let the caller skip this dataset
        print(f"Parser error for {url}: {e}")
        return None

    # Tag every row with the dataset it came from
    df['type'] = data_type.lower()
    return df

# Parsed tables keyed by data type; filled in by the loop below
dataframes = {}

# Download every dataset; a dataset whose table could not be parsed
# (load_knmi_data returned None) is left out of the dictionary.
for data_type, url in data_urls.items():
    df = load_knmi_data(url, data_type)
    if df is None:
        continue
    dataframes[data_type] = df

# Function to combine all DataFrames into a single DataFrame
def combine_dataframes(dataframes):
    """Stack every DataFrame in the mapping into one frame.

    Parameters
    ----------
    dataframes : dict
        Mapping whose values are the DataFrames to stack, in mapping order.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the values with a fresh 0..n-1 index.
    """
    frames = list(dataframes.values())
    stacked = pd.concat(frames, ignore_index=True)
    return stacked

# Function to filter data by year range
def filter_years(df, start_year=2006, end_year=2024):
    return df[(df['YYYY'] >= start_year) & (df['YYYY'] <= end_year)]

# Function to convert monthly data to quarterly data
def convert_to_quarters(df):
    """Replace the twelve monthly columns with four quarterly means.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'JAN' ... 'DEC'. It is not modified;
        the function works on a copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the monthly columns and with new columns
        'Q1'..'Q4' appended, each the row-wise mean of its three months.
        Non-numeric month values become NaN, and NaN months are skipped by
        the mean (pandas' default ``skipna=True``), so a quarter with missing
        months is averaged over the months that are present.
    """
    quarter_months = {
        'Q1': ['JAN', 'FEB', 'MAR'],
        'Q2': ['APR', 'MAY', 'JUN'],
        'Q3': ['JUL', 'AUG', 'SEP'],
        'Q4': ['OCT', 'NOV', 'DEC'],
    }
    monthly_columns = [month for months in quarter_months.values() for month in months]

    # Work on a copy so the caller's frame (possibly a slice) is left untouched
    df = df.copy()

    # Plain column assignment replaces the columns and lets them take a numeric
    # dtype. `df.loc[:, cols] = ...` writes into the existing columns in place
    # on pandas >= 2.0, which would leave string-parsed columns as object dtype.
    df[monthly_columns] = df[monthly_columns].apply(pd.to_numeric, errors='coerce')

    for quarter, months in quarter_months.items():
        df[quarter] = df[months].mean(axis=1)

    # Only the monthly columns are removed; every other column is kept
    return df.drop(columns=monthly_columns)

# Updated function to reshape to wide format with each type as a separate column
def reshape_to_wide_format(df):
    """Turn quarterly rows into a wide table with one column per data type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'STN', 'YYYY', 'type' and 'Q1'..'Q4'.
        Any other column is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per 'Year_Quarter' label ("<YYYY>-<Qn>"), sorted by that
        label, with one value column for every distinct 'type'.

    Notes
    -----
    ``DataFrame.pivot`` raises ``ValueError`` if the same Year_Quarter/type
    pair occurs more than once.
    """
    quarter_columns = ['Q1', 'Q2', 'Q3', 'Q4']

    # Long format: one row per (year, type, quarter); the station id is not needed
    long_df = df.drop(columns=['STN']).melt(
        id_vars=['YYYY', 'type'],
        value_vars=quarter_columns,
        var_name='Quarter',
        value_name='value',
    )

    # Build the "<year>-<quarter>" label, then discard the two parts it was made from
    long_df['Year_Quarter'] = long_df['YYYY'].astype(str) + '-' + long_df['Quarter']
    long_df = long_df.drop(columns=['YYYY', 'Quarter'])

    # Spread the types out into columns and order the rows by label
    wide_df = (
        long_df
        .pivot(index='Year_Quarter', columns='type', values='value')
        .reset_index()
        .sort_values(by='Year_Quarter')
        .reset_index(drop=True)
    )
    return wide_df

# Function to export DataFrame to CSV
def export_to_csv(df, filename="knmi_data.csv", folder="data"):
    """Write ``df`` to ``<folder>/<filename>`` as CSV, without the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export.
    filename : str, optional
        Name of the CSV file; may include sub-directories relative to
        ``folder`` (e.g. ``"2024/knmi_data.csv"``).
    folder : str or pathlib.Path, optional
        Directory the file is written under.

    Any missing directories on the way to the file are created. The final
    path is printed once the file has been written.
    """
    file_path = Path(folder) / filename

    # Create the directory that will actually contain the file, so a filename
    # with a sub-directory works too (for a bare filename this is `folder`).
    file_path.parent.mkdir(parents=True, exist_ok=True)

    df.to_csv(file_path, index=False)
    print(f"Data exported to {file_path}")


In [9]:
# Combine all per-type DataFrames into one long frame; the 'type' column
# added at load time identifies which dataset each row came from.
combined_df = combine_dataframes(dataframes)
# Sanity check: (rows, columns) of the combined frame
print(combined_df.shape)
combined_df.head()

(620, 16)


Unnamed: 0,STN,YYYY,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,YEAR,type
0,260,1901,,,,,,,,,,,,,,airpressure
1,260,1902,10196.0,10131.0,10108.0,10147.0,10130.0,10138.0,10163.0,10138.0,10187.0,10160.0,10163.0,10182.0,10154.0,airpressure
2,260,1903,10183.0,10212.0,10137.0,10100.0,10133.0,10167.0,10139.0,10125.0,10181.0,10069.0,10165.0,10114.0,10144.0,airpressure
3,260,1904,10173.0,10037.0,10157.0,10142.0,10156.0,10175.0,10182.0,10171.0,10196.0,10199.0,10167.0,10147.0,10159.0,airpressure
4,260,1905,10251.0,10208.0,10097.0,10114.0,10193.0,10144.0,10177.0,10133.0,10152.0,10143.0,10084.0,10257.0,10163.0,airpressure


In [10]:
# Filter the combined DataFrame using filter_years' default range,
# i.e. the years 2006 to 2024 (both inclusive)
filtered_df = filter_years(combined_df)
filtered_df.head()

Unnamed: 0,STN,YYYY,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,YEAR,type
105,260,2006,10246,10147,10106,10141,10136,10205,10191,10108,10140,10114,10148,10208,10158,airpressure
106,260,2007,10152,10088,10150,10216,10105,10117,10114,10147,10189,10239,10181,10207,10159,airpressure
107,260,2008,10140,10239,10050,10111,10159,10159,10141,10117,10178,10148,10125,10182,10146,airpressure
108,260,2009,10137,10140,10140,10141,10188,10168,10128,10165,10208,10166,10041,10068,10141,airpressure
109,260,2010,10151,10034,10168,10194,10158,10166,10158,10129,10147,10135,10057,10133,10136,airpressure


In [11]:
# Convert the filtered data to quarters: each of Q1..Q4 is the mean of its
# three monthly columns, and the monthly columns themselves are dropped
quarterly_df = convert_to_quarters(filtered_df)
quarterly_df.head()

Unnamed: 0,STN,YYYY,YEAR,type,Q1,Q2,Q3,Q4
105,260,2006,10158,airpressure,10166.333333,10160.666667,10146.333333,10156.666667
106,260,2007,10159,airpressure,10130.0,10146.0,10150.0,10209.0
107,260,2008,10146,airpressure,10143.0,10143.0,10145.333333,10151.666667
108,260,2009,10141,airpressure,10139.0,10165.666667,10167.0,10091.666667
109,260,2010,10136,airpressure,10117.666667,10172.666667,10144.666667,10108.333333


In [12]:
# Reshape to wide format: one row per Year_Quarter label (e.g. "2006-Q1")
# and one column per data type
final_df = reshape_to_wide_format(quarterly_df)
final_df.head()

type,Year_Quarter,airpressure,maximum_temperatures,mean_temperatures,minimum_temperatures,precipitation
0,2006-Q1,10166.333333,58.333333,27.666667,-1.666667,594.0
1,2006-Q2,10160.666667,180.666667,134.0,82.666667,493.333333
2,2006-Q3,10146.333333,241.666667,188.666667,135.0,680.333333
3,2006-Q4,10156.666667,128.333333,97.666667,64.0,922.666667
4,2007-Q1,10130.0,102.666667,70.333333,38.333333,855.0


In [13]:
# Export final_df to CSV using the defaults of export_to_csv
# (folder "data", file name "knmi_data.csv")
export_to_csv(final_df)

Data exported to data/knmi_data.csv
