# 1. prepare dataset

In [None]:
import io
import glob
import os
import zipfile

import pandas as pd
import requests

## 1.1 download raw data

In [None]:
# Root URL under which the per-year folders of monthly zip archives are looked up.
# NOTE: the value already ends with a trailing slash.
SOURCE_BASE_URL = "https://danepubliczne.imgw.pl/data/dane_pomiarowo_obserwacyjne/dane_meteorologiczne/dobowe/klimat/"
# Local directory (relative to the working directory) that receives the extracted raw CSV files.
OUTPUT_DIR = "../data/raw_files"
# Create the directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Download one zip archive per (year, month), extract the single CSV we need
# from it and store it as "<OUTPUT_DIR>/<year>_<month>.csv".
# The loop is best-effort: a month that cannot be fetched is reported and
# skipped, and months that were already saved are not downloaded again.

# Seconds to wait on the server for a single request; without a timeout
# requests.get() may block forever on a stalled connection.
REQUEST_TIMEOUT_S = 30

# Drop any trailing slash so that joining with "/" below never produces "//".
base_url = SOURCE_BASE_URL.rstrip("/")

for year in range(2001, 2026):
    for month in range(1, 13):
        month_str = f"{month:02d}"  # zero-padded, e.g. 1 -> "01"
        zip_url = f"{base_url}/{year}/{year}_{month_str}_k.zip"
        file_to_extract = f"k_d_t_{month_str}_{year}.csv"
        output_path = os.path.join(OUTPUT_DIR, f"{year}_{month_str}.csv")

        # skip if file already exists
        if os.path.exists(output_path):
            print(f"skipping file {output_path} (already exists)")
            continue

        try:
            # download zip; fail early on HTTP errors (e.g. 404 for months
            # that are not published) instead of handing an error page
            # to zipfile and getting a misleading "not a zip file" error
            response = requests.get(zip_url, timeout=REQUEST_TIMEOUT_S)
            response.raise_for_status()

            # extract only the member we want, entirely in memory
            with zipfile.ZipFile(io.BytesIO(response.content)) as z:
                file_bytes = z.read(file_to_extract)

            # save only the file we want
            with open(output_path, "wb") as out_file:
                out_file.write(file_bytes)

        except Exception as e:
            # Deliberately broad: any failure (network, bad archive, missing
            # member, disk error) must only skip this month, not abort the run.
            print(f"error when downloading file: {zip_url}: {e}")
        

## 1.2 consolidate and select one station

In [None]:
# Directory holding the raw per-month CSV files and the consolidated output file.
INPUT_DIR = "../data/raw_files"
OUTPUT_FILE = "../data/one_station.csv"
FILTER_COL_INDEX = 0  # station code
FILTER_VALUE = 252210170  # WARSZAWA-OBSERWATORIUM II
# Names assigned to the columns of the header-less raw files, in file order.
COLUMN_NAMES = [
    "station_code",
    "station_name",
    "year",
    "month",
    "day",
    "avg_day_temp",
    "temp_status",
    "avg_day_humidity",
    "humidity_status",
    "avg_day_wind_speed",
    "wind_status",
    "avg_day_cloud_lvl",
    "cloud_status",
]


def load_station_rows(
    csv_paths,
    filter_col=FILTER_COL_INDEX,
    filter_value=FILTER_VALUE,
    column_names=COLUMN_NAMES,
):
    """Collect the rows of one station from several header-less CSV files.

    Args:
        csv_paths: Iterable of paths to raw CSV files (no header row).
        filter_col: Positional index of the column to filter on.
        filter_value: Value the filter column must equal for a row to be kept.
        column_names: Names assigned to the columns of the result.

    Returns:
        A DataFrame with ``column_names`` as columns containing the matching
        rows of all files. Files are processed in sorted path order so the
        result is deterministic (glob order is arbitrary). When nothing
        matches, an empty DataFrame with the same columns is returned instead
        of letting ``pd.concat`` fail on an empty list.
    """
    frames = []
    for path in sorted(csv_paths):
        try:
            # NOTE(review): encoding_errors="ignore" silently drops bytes that
            # are not valid UTF-8; if the raw files use another encoding,
            # non-ASCII characters in text columns are lost - confirm.
            raw = pd.read_csv(
                path,
                header=None,
                encoding="utf-8",
                encoding_errors="ignore",
                sep=",",
                quotechar='"',
            )
        except pd.errors.EmptyDataError:
            # A zero-byte file must not abort the whole consolidation.
            print(f"skipping empty file {path}")
            continue

        matched = raw[raw[filter_col] == filter_value]
        if not matched.empty:
            frames.append(matched)

    if not frames:
        return pd.DataFrame(columns=column_names)

    result = pd.concat(frames, ignore_index=True)
    result.columns = column_names
    return result


csv_files = glob.glob(os.path.join(INPUT_DIR, "*.csv"))
final_df = load_station_rows(csv_files)

if final_df.empty:
    # Nothing to save; do not write a header-only file over earlier results.
    print(f"No rows found for station {FILTER_VALUE} in {INPUT_DIR}; nothing written to {OUTPUT_FILE}")
else:
    final_df.to_csv(OUTPUT_FILE, index=False, header=True)
    print(f"Saved {len(final_df)} rows to {OUTPUT_FILE}")