In [None]:
import pandas as pd
import requests
from io import StringIO
from pathlib import Path

# Download locations of the monthly KNMI series, keyed by series name.
# All URLs share one template and differ only in the code before ".txt".
data_urls = {
    series_name: (
        "https://cdn.knmi.nl/knmi/map/page/klimatologie/gegevens/maandgegevens/"
        f"mndgeg_260_{series_code}.txt"
    )
    for series_name, series_code in [
        ("AIRPRESSURE", "pg"),
        ("PRECIPITATION", "rh24"),
        ("MINIMUM_TEMPERATURES", "tng"),
        ("MAXIMUM_TEMPERATURES", "txg"),
        ("MEAN_TEMPERATURES", "tg"),
    ]
}

# Column labels assigned to every parsed file (the file's own header row is skipped)
columns = ["STN", "YYYY", *"JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC".split(), "YEAR"]

# Function to download and convert data to DataFrame
def load_knmi_data(url, data_type, timeout=30):
    """Download one KNMI text file and parse its data rows into a DataFrame.

    Everything before the first line that starts with ``"STN,"`` is discarded.
    That header line itself is skipped too; the module-level ``columns`` list
    supplies the column names instead.

    Parameters
    ----------
    url : str
        Address of the text file to download.
    data_type : str
        Label for the series; stored lower-cased in a ``type`` column.
    timeout : float, optional
        Passed to ``requests.get`` as ``timeout`` so a stalled connection
        raises instead of blocking the notebook forever. Defaults to 30.

    Returns
    -------
    pandas.DataFrame or None
        The parsed rows with an added ``type`` column, or ``None`` when
        pandas raises a ``ParserError`` (the error is printed).

    Raises
    ------
    Exception
        If the response status code is not 200.
    ValueError
        If no line starting with ``"STN,"`` is present in the download.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to download data from {url}, status code: {response.status_code}")

    # Split the payload into lines so the start of the table can be located
    lines = StringIO(response.text).readlines()

    # Locate the header row; fail with a descriptive error rather than a bare
    # StopIteration when the file layout is not what we expect.
    start_index = next(
        (i for i, line in enumerate(lines) if line.startswith("STN,")),
        None,
    )
    if start_index is None:
        raise ValueError(f"No header line starting with 'STN,' found in data from {url}")

    # Re-assemble the header row plus everything after it for pandas
    table_text = StringIO(''.join(lines[start_index:]))

    try:
        # skiprows=1 drops the file's own header row because `names` replaces it
        df = pd.read_csv(table_text, sep=',', names=columns, na_values=[" "], skiprows=1, engine='python')
    except pd.errors.ParserError as e:
        print(f"Parser error for {url}: {e}")
        return None

    df['type'] = data_type.lower()  # Add type column with data type name in lowercase
    return df

# Successfully parsed DataFrames, keyed by the same names as `data_urls`
dataframes = {}

# Download every series; a series whose parse failed (None) is left out
for data_type, url in data_urls.items():
    df = load_knmi_data(url, data_type)
    if df is None:
        continue
    dataframes[data_type] = df

# Function to stack all per-type DataFrames into one
def combine_dataframes(dataframes):
    """Concatenate the values of ``dataframes`` row-wise with a fresh 0..n-1 index.

    Parameters
    ----------
    dataframes : dict
        Mapping whose values are DataFrames; the keys are not used.

    Returns
    -------
    pandas.DataFrame
        All rows of all frames, in the mapping's iteration order.
    """
    frames = list(dataframes.values())
    stacked = pd.concat(frames, ignore_index=True)
    return stacked

# Function to keep only the rows inside a year range
def filter_years(df, start_year=2008, end_year=2022):
    """Return the rows of ``df`` whose ``YYYY`` lies in ``[start_year, end_year]``.

    Both bounds are inclusive. The original index labels are preserved.
    """
    in_range = df['YYYY'].between(start_year, end_year)
    return df[in_range]

# Function to convert monthly data to quarterly data
def convert_to_quarters(df):
    """Replace the twelve monthly columns with four quarterly mean columns.

    Monthly values are first coerced to numbers (anything unparseable becomes
    NaN). Each quarter is the row-wise mean of its three months; pandas skips
    NaN by default, so a quarter with missing months is the mean of the months
    that are present.

    NOTE(review): the mean is applied to every row regardless of its ``type``
    value - confirm that averaging is the intended aggregation for all series.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``JAN`` ... ``DEC``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``Q1``-``Q4`` appended and the monthly columns
        removed; all other columns are kept.
    """
    quarters = {
        'Q1': ['JAN', 'FEB', 'MAR'],
        'Q2': ['APR', 'MAY', 'JUN'],
        'Q3': ['JUL', 'AUG', 'SEP'],
        'Q4': ['OCT', 'NOV', 'DEC'],
    }
    monthly_columns = [month for months in quarters.values() for month in months]

    # Work on a copy so the caller's frame is left untouched
    df = df.copy()

    # Assign whole columns rather than `df.loc[:, cols] = ...`: `.loc` tries to
    # write into the existing arrays and can leave object dtype in place, while
    # this form replaces the columns so they really become numeric.
    df[monthly_columns] = df[monthly_columns].apply(pd.to_numeric, errors='coerce')

    # One new column per quarter, averaged across its three months
    for quarter, months in quarters.items():
        df[quarter] = df[months].mean(axis=1)

    # Drop only the monthly columns, keeping quarterly data and other columns
    return df.drop(columns=monthly_columns)

# Function to reshape quarterly data so each type becomes its own column
def reshape_to_wide_format(df):
    """Pivot quarterly rows into one row per ``Year_Quarter`` and one column per ``type``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``STN``, ``YYYY``, ``type`` and ``Q1``-``Q4``.
        Any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        A ``Year_Quarter`` column (for example ``"2008-Q1"``) followed by one
        column per distinct ``type`` value, sorted by ``Year_Quarter`` with a
        fresh 0..n-1 index.
    """
    # The station column plays no role in the reshaped table
    without_station = df.drop(columns=['STN'])

    # Long format: one row per (year, type, quarter)
    long_form = pd.melt(
        without_station,
        id_vars=['YYYY', 'type'],
        value_vars=['Q1', 'Q2', 'Q3', 'Q4'],
        var_name='Quarter',
        value_name='value',
    )

    # Build the combined label, then discard the two parts it was built from
    long_form = long_form.assign(
        Year_Quarter=lambda frame: frame['YYYY'].astype(str) + '-' + frame['Quarter']
    ).drop(columns=['YYYY', 'Quarter'])

    # Spread the types across columns and order the rows by label
    return (
        long_form
        .pivot(index='Year_Quarter', columns='type', values='value')
        .reset_index()
        .sort_values(by='Year_Quarter')
        .reset_index(drop=True)
    )

# Function to write a DataFrame to a CSV file
def export_to_csv(df, filename="knmi_data.csv", folder=r'C:\Users\mth2\OneDrive - Gemeente Breda\Bureaublad\Github\uwv\data'):
    """Write ``df`` to ``folder/filename`` as CSV without the index column.

    ``folder`` is created (including missing parents) when it does not exist,
    and a confirmation line with the resulting path is printed.

    NOTE(review): the default ``folder`` is an absolute path on one user's
    machine; pass ``folder`` explicitly when running anywhere else.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to write.
    filename : str
        Name of the CSV file inside ``folder``.
    folder : str
        Directory that receives the file.
    """
    destination = Path(folder)
    destination.mkdir(exist_ok=True, parents=True)

    csv_path = destination.joinpath(filename)
    df.to_csv(csv_path, index=False)

    print(f"Data exported to {csv_path}")


In [None]:
# Stack the per-type DataFrames in `dataframes` into one frame,
# then show its (rows, columns) shape and its first rows
combined_df = combine_dataframes(dataframes)
print(combined_df.shape)
combined_df.head()

In [None]:
# Keep only rows with 2008 <= YYYY <= 2022 (the default bounds of filter_years)
filtered_df = filter_years(combined_df)
filtered_df.head()

In [None]:
# Replace the twelve monthly columns with quarterly means (Q1-Q4)
quarterly_df = convert_to_quarters(filtered_df)
quarterly_df.head()

In [None]:
# Pivot to one row per Year_Quarter with one column per data type
final_df = reshape_to_wide_format(quarterly_df)
final_df.head()

In [None]:
# Write final_df to CSV using export_to_csv's default filename and folder
export_to_csv(final_df)