### Imports

In [1]:
%load_ext autoreload
%autoreload 2

import os
from pprint import pprint

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

import matplotlib.pyplot as plt
import seaborn as sns

from openaq_anomaly_prediction.config import Configuration as config
from openaq_anomaly_prediction.utils.logging import logger, ProgressLogger
from openaq_anomaly_prediction.load.openmeteo import client as openmeteo
from openaq_anomaly_prediction.utils.helpers import get_trimestrial_periods

### Load the files

In [2]:
# MEASUREMENTS (already cleaned)
# Read the concatenated Seoul measurements export into a pandas DataFrame.
measurements_pq_path = os.path.join(config.DATA_EXPORT_PATH, "seoul_measurements_concatenated.parquet")
pq_measurements = pq.read_table(measurements_pq_path).to_pandas()
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly: wrapping it in pprint() only appended a stray "None" to the output.
pq_measurements.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5763283 entries, 0 to 5763282
Data columns (total 28 columns):
 #   Column                      Dtype                                
---  ------                      -----                                
 0   location_id                 int64                                
 1   sensor_id                   int64                                
 2   name                        object                               
 3   value                       float64                              
 4   parameter.id                int64                                
 5   parameter.name              object                               
 6   parameter.units             object                               
 7   parameter.displayName       object                               
 8   period.datetimeFrom.local   datetime64[ns, pytz.FixedOffset(540)]
 9   period.datetimeTo.local     datetime64[ns, pytz.FixedOffset(540)]
 10  period.datetimeFrom.utc     da

In [3]:
# WEATHER
# Read the Seoul weather export into a pandas DataFrame.
weather_pq_path = os.path.join(config.DATA_EXPORT_PATH, "seoul_weather.int.parquet")
pq_weather = pq.read_table(weather_pq_path).to_pandas()
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly: wrapping it in pprint() only appended a stray "None" to the output.
pq_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 998064 entries, 0 to 998063
Data columns (total 36 columns):
 #   Column                      Non-Null Count   Dtype                     
---  ------                      --------------   -----                     
 0   datetimeWeather.local       998064 non-null  datetime64[ns, Asia/Seoul]
 1   datetimeWeather.utc         998064 non-null  datetime64[ns, UTC]       
 2   location_id                 998064 non-null  int64                     
 3   weather_latitude            998064 non-null  float64                   
 4   weather_longitude           998064 non-null  float64                   
 5   timezone                    998064 non-null  object                    
 6   elevation                   998064 non-null  float64                   
 7   temperature_2m              998064 non-null  float64                   
 8   relative_humidity_2m        998064 non-null  int64                     
 9   dew_point_2m                998064 no

### Merge on time & location_id

In [None]:
# Attach the matching weather row to every measurement; how="left" keeps all
# measurements even when no weather row matches.
# Both time keys are the UTC columns so the join compares like with like
# (the previous version paired the *local* measurement timestamp with the
# *UTC* weather timestamp).
# NOTE(review): assumes pq_measurements has a tz-aware "period.datetimeTo.utc"
# column next to "period.datetimeFrom.utc" — confirm against the info() output.
joined_df = pd.merge(
    pq_measurements,
    pq_weather,
    left_on=["period.datetimeTo.utc", "location_id"],
    right_on=["datetimeWeather.utc", "location_id"],
    how="left",
    # Raise if the weather table has duplicate (time, location_id) keys, which
    # would otherwise silently duplicate measurement rows in the result.
    validate="many_to_one",
)
# info() prints the summary itself and returns None; no pprint() wrapper needed.
joined_df.info()

### Sanity Checks

In [None]:
# Look at the exported weather parquet on its own.
# To inspect a single run instead of the export, point `filepath` at
#   <config.DATA_PARQUET_PATH>/<run_id>/openmeteo/<run_id>_weather.int.parquet
# e.g. run_id = "seoul_2025_T4_2025-10-01_2025-12-31"
weather_filename = "seoul_weather.int.parquet"
filepath = os.path.join(config.DATA_EXPORT_PATH, weather_filename)

# Reload the weather frame from that path and preview the first rows.
pq_weather = pd.read_parquet(filepath)
pq_weather.head(n=5)

In [None]:
# Summarise the loaded weather file: covered period, size and cardinalities.
weather_times = pq_weather["datetimeWeather.local"]

print(f"LOADED WEATHER FILE: {filepath}")
print(f"Start date: {weather_times.min()}")
print(f"End date: {weather_times.max()}")
print(f"Total records: {len(pq_weather)}")
print(f"Unique locations: {pq_weather['location_id'].nunique()}")
# The unique-timestamp count used to be a bare expression on the first line of
# the cell; only a cell's last expression is displayed, so its value was
# computed and thrown away. Print it with the rest of the summary instead.
print(f"Unique timestamps: {weather_times.nunique()}")

In [None]:
# Sanity check: compare the distinct timestamps present in the weather data
# with a gap-free hourly range spanning the same first/last timestamp.
weather_times = pq_weather["datetimeWeather.local"]
full_range = pd.date_range(start=weather_times.min(), end=weather_times.max(), freq="h")
unique_hours_in_period = len(full_range)
unique_hours_in_data = weather_times.nunique()

# Show the result so the check is actually visible; previously the counts were
# computed but never displayed or compared.
# NOTE(review): the difference equals the number of missing hours only if every
# timestamp falls exactly on the hour — confirm.
print(f"Hours in period: {unique_hours_in_period}")
print(f"Hours in data: {unique_hours_in_data}")
print(f"Difference: {unique_hours_in_period - unique_hours_in_data}")

In [None]:
# Read the merged (measurements + weather) parquet file as a pyarrow Table.
# pq.read_table materialises the selected columns in memory as an Arrow table;
# the data is simply not converted to a pandas DataFrame in this cell.
# NOTE(review): the path is relative to the kernel's working directory, unlike
# the other cells which build paths from config.DATA_EXPORT_PATH — confirm.

# Columns up to "timestampTo" in the original selection.
# NOTE(review): presumably the measurement-side columns — confirm.
measurement_columns = [
    "location_id",
    "sensor_id",
    "name",
    "value",
    "parameter.id",
    "parameter.name",
    "parameter.units",
    "parameter.displayName",
    "period.datetimeFrom.local",
    "period.datetimeTo.local",
    "period.datetimeFrom.utc",
    "period.datetimeTo.utc",
    "location.datetimeFirst.utc",
    "location.datetimeLast.utc",
    "coordinates.latitude",
    "coordinates.longitude",
    "location_name",
    "country.code",
    "owner.name",
    "provider.name",
    "timestampTo",
]

# Remaining columns of the selection, starting at the weather timestamps.
weather_columns = [
    "datetimeWeather.local",
    "datetimeWeather.utc",
    "weather_latitude",
    "weather_longitude",
    "timezone",
    "elevation",
    "temperature_2m",
    "relative_humidity_2m",
    "dew_point_2m",
    "apparent_temperature",
    "precipitation",
    "rain",
    "snowfall",
    "snow_depth",
    "shortwave_radiation",
    "direct_radiation",
    "diffuse_radiation",
    "global_tilted_irradiance",
    "direct_normal_irradiance",
    "terrestrial_radiation",
    "weather_code",
    "pressure_msl",
    "surface_pressure",
    "cloud_cover",
    "cloud_cover_low",
    "cloud_cover_mid",
    "cloud_cover_high",
    "vapour_pressure_deficit",
    "et0_fao_evapotranspiration",
    "wind_speed_100m",
    "wind_speed_10m",
    "wind_direction_10m",
    "wind_direction_100m",
    "wind_gusts_10m",
    "is_day",
]

filtered_table = pq.read_table(
    "seoul_complete.int.parquet",
    # Row filtering is currently disabled:
    # filters=combined_filters,
    # Only the listed columns are read, in the same order as before.
    columns=measurement_columns + weather_columns,
)