# 1. prepare dataset

In [5]:
import io
import glob
import os
import zipfile

import pandas as pd
import requests

## 1.1 download raw data

In [6]:
# Base URL of the remote archive; the monthly zip files are fetched from
# "<base>/<year>/<year>_<month>_k.zip" by the download cell.
SOURCE_BASE_URL = (
    "https://danepubliczne.imgw.pl/data/dane_pomiarowo_obserwacyjne/"
    "dane_meteorologiczne/dobowe/klimat/"
)

# Local folder that receives one extracted CSV per downloaded month.
OUTPUT_DIR = "../data/raw_files"

# Create the folder (including parents); a pre-existing folder is not an error.
os.makedirs(name=OUTPUT_DIR, exist_ok=True)

In [7]:
# Seconds to wait on the server before abandoning a single request, so one
# stalled connection cannot hang the whole cell.
REQUEST_TIMEOUT_S = 60

# SOURCE_BASE_URL ends with "/"; strip it so the joined URL has no "//".
base_url = SOURCE_BASE_URL.rstrip("/")

# Download one zip archive per (year, month), pull the single CSV we need out
# of it, and store it as OUTPUT_DIR/<year>_<month>.csv. Months that are already
# on disk are skipped, so the cell can be re-run to retry only the failures.
for year in range(2001, 2026):
    for month in range(1, 13):
        month_str = f"{month:02d}"
        zip_url = f"{base_url}/{year}/{year}_{month_str}_k.zip"
        file_to_extract = f"k_d_{month_str}_{year}.csv"
        output_path = os.path.join(OUTPUT_DIR, f"{year}_{month_str}.csv")

        # skip if file already exists
        if os.path.exists(output_path):
            print(f"skipping file {output_path} (already exists)")
            continue

        try:
            # download zip; raise on HTTP errors so a 404 page is reported as
            # such instead of as "File is not a zip file"
            response = requests.get(zip_url, timeout=REQUEST_TIMEOUT_S)
            response.raise_for_status()

            # extract only the member we want (fully read before anything is
            # written, so a corrupt archive never leaves a file behind)
            with zipfile.ZipFile(io.BytesIO(response.content)) as z:
                file_bytes = z.read(file_to_extract)

            # save only the file we want
            with open(output_path, "wb") as out_file:
                out_file.write(file_bytes)

        except Exception as e:
            # deliberate best effort: report the month and move on to the next
            print(f"error when downloading file: {zip_url}: {e}")
        

error when downloading file: https://danepubliczne.imgw.pl/data/dane_pomiarowo_obserwacyjne/dane_meteorologiczne/dobowe/klimat//2024/2024_07_k.zip: Bad CRC-32 for file 'k_d_07_2024.csv'
error when downloading file: https://danepubliczne.imgw.pl/data/dane_pomiarowo_obserwacyjne/dane_meteorologiczne/dobowe/klimat//2025/2025_08_k.zip: File is not a zip file
error when downloading file: https://danepubliczne.imgw.pl/data/dane_pomiarowo_obserwacyjne/dane_meteorologiczne/dobowe/klimat//2025/2025_09_k.zip: File is not a zip file
error when downloading file: https://danepubliczne.imgw.pl/data/dane_pomiarowo_obserwacyjne/dane_meteorologiczne/dobowe/klimat//2025/2025_10_k.zip: File is not a zip file
error when downloading file: https://danepubliczne.imgw.pl/data/dane_pomiarowo_obserwacyjne/dane_meteorologiczne/dobowe/klimat//2025/2025_11_k.zip: File is not a zip file
error when downloading file: https://danepubliczne.imgw.pl/data/dane_pomiarowo_obserwacyjne/dane_meteorologiczne/dobowe/klimat//20

## 1.2 consolidate and select one station

In [8]:
INPUT_DIR = "../data/raw_files"
OUTPUT_FILE = "../data/one_station.csv"
FILTER_COL_INDEX = 0  # station code
FILTER_VALUE = 252210170  # WARSZAWA-OBSERWATORIUM II
# Names assigned to the 18 positional columns of the raw (header-less) files.
COLUMN_NAMES = [
    "station_code",
    "station_name",
    "year",
    "month",
    "day",
    "max_day_temp",
    "max_temp_status",
    "min_day_temp",
    "min_temp_status",
    "avg_day_temp",
    "avg_temp_status",
    "min_ground_temp",
    "min_ground_temp_status",
    "rainfall",
    "rainfall_status",
    "rainfall_type",
    "snowfall_height",
    "snowfall_height_status"
]

# glob returns files in arbitrary, OS-dependent order; sorting the
# "<year>_<month>.csv" names makes the output deterministic and chronological.
csv_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.csv")))
all_dfs = []

for file in csv_files:
    # NOTE(review): encoding_errors="ignore" silently drops bytes that are not
    # valid UTF-8, which may strip characters from station names — confirm the
    # real encoding of the source files.
    df = pd.read_csv(
        file,
        header=None,
        encoding="utf-8",
        encoding_errors="ignore",
        sep=",",
        quotechar='"'
    )
    # keep only the rows of the selected station
    filtered = df[df[FILTER_COL_INDEX] == FILTER_VALUE]

    if not filtered.empty:
        all_dfs.append(filtered)

# pd.concat([]) fails with an opaque "No objects to concatenate"; say what
# actually went wrong instead.
if not all_dfs:
    raise ValueError(
        f"no rows for station {FILTER_VALUE} found in "
        f"{len(csv_files)} file(s) under {INPUT_DIR}"
    )

final_df = pd.concat(all_dfs, ignore_index=True)
final_df.columns = COLUMN_NAMES
final_df.to_csv(OUTPUT_FILE, index=False, header=True)

print(f"Saved {len(final_df)} rows to {OUTPUT_FILE}")

Saved 8866 rows to ../data/one_station.csv


## 1.3 data exploration

In [9]:
INPUT_FILE = "../data/one_station.csv"

# Load the consolidated single-station file and summarise its columns.
df = pd.read_csv(INPUT_FILE)
# DataFrame.info() writes the summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8866 entries, 0 to 8865
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   station_code            8866 non-null   int64  
 1   station_name            8866 non-null   object 
 2   year                    8866 non-null   int64  
 3   month                   8866 non-null   int64  
 4   day                     8866 non-null   int64  
 5   max_day_temp            8866 non-null   float64
 6   max_temp_status         4 non-null      float64
 7   min_day_temp            8866 non-null   float64
 8   min_temp_status         3 non-null      float64
 9   avg_day_temp            8866 non-null   float64
 10  avg_temp_status         6 non-null      float64
 11  min_ground_temp         8440 non-null   float64
 12  min_ground_temp_status  8866 non-null   float64
 13  rainfall                8440 non-null   float64
 14  rainfall_status         8371 non-null   

In [17]:
INPUT_FILE = "../data/one_station.csv"

# Every "*_status" column of the consolidated file.
STATUS_COLUMNS = [
    "max_temp_status",
    "min_temp_status",
    "avg_temp_status",
    "min_ground_temp_status",
    "rainfall_status",
    "snowfall_height_status",
]

df = pd.read_csv(INPUT_FILE)
# Rows in which at least one status column equals 8.
# NOTE(review): 8 presumably flags a missing measurement — confirm against the
# data-format description of the source.
with_errors = df[df[STATUS_COLUMNS].eq(8).any(axis=1)]
# info() prints by itself and returns None; print(...) would add a stray "None".
with_errors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8866 entries, 0 to 8865
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   station_code            8866 non-null   int64  
 1   station_name            8866 non-null   object 
 2   year                    8866 non-null   int64  
 3   month                   8866 non-null   int64  
 4   day                     8866 non-null   int64  
 5   max_day_temp            8866 non-null   float64
 6   max_temp_status         4 non-null      float64
 7   min_day_temp            8866 non-null   float64
 8   min_temp_status         3 non-null      float64
 9   avg_day_temp            8866 non-null   float64
 10  avg_temp_status         6 non-null      float64
 11  min_ground_temp         8440 non-null   float64
 12  min_ground_temp_status  8866 non-null   float64
 13  rainfall                8440 non-null   float64
 14  rainfall_status         8371 non-null   

## 1.4 filter out bad columns and rows without measurements 

In [19]:
INPUT_FILE = "../data/one_station.csv"
# Was misspelled "OUTPUT_fILE", so to_csv() below silently reused OUTPUT_FILE
# from the consolidation cell and overwrote INPUT_FILE instead of creating
# final.csv.
OUTPUT_FILE = "../data/final.csv"

# Columns that are not needed in the final data set.
UNUSED_COLUMNS = [
    "station_code",
    "station_name",
    "min_ground_temp",
    "min_ground_temp_status",
    "rainfall",
    "rainfall_status",
    "rainfall_type",
    "snowfall_height",
    "snowfall_height_status"
]
# Status flags of the three temperature measurements that are kept.
TEMP_STATUS_COLUMNS = [
    "max_temp_status",
    "min_temp_status",
    "avg_temp_status",
]

df = pd.read_csv(INPUT_FILE)
df = df.drop(columns=UNUSED_COLUMNS)

# Discard rows where any temperature status equals 8.
# NOTE(review): 8 presumably flags a missing measurement — confirm against the
# data-format description of the source.
has_flagged_temp = df[TEMP_STATUS_COLUMNS].eq(8).any(axis=1)
df = df[~has_flagged_temp]

# The status flags have served their purpose; keep only date and temperatures.
df = df.drop(columns=TEMP_STATUS_COLUMNS)

# info() prints by itself and returns None; print(...) would add a stray "None".
df.info()
df.to_csv(OUTPUT_FILE, index=False, header=True)

<class 'pandas.core.frame.DataFrame'>
Index: 8860 entries, 0 to 8865
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          8860 non-null   int64  
 1   month         8860 non-null   int64  
 2   day           8860 non-null   int64  
 3   max_day_temp  8860 non-null   float64
 4   min_day_temp  8860 non-null   float64
 5   avg_day_temp  8860 non-null   float64
dtypes: float64(3), int64(3)
memory usage: 484.5 KB
None
