In [1]:
# Cell 1: Imports

import os
from pathlib import Path
from typing import Dict, Any, List

import pandas as pd
import numpy as np

# plus whatever you already use:
# from sklearn.something import ...
# import matplotlib.pyplot as plt
# ...


In [2]:
# Cell 2: Global config (sensors and dates)

# List of sensors. Adapt paths to your own structure.
SENSORS: List[Dict[str, Any]] = [
    {
        "sensor_id": "S1",
        "location": "Station_1",
        "raw_path": "data/raw/S1/",
        "feature_path": "data/features/S1/",
    },
    {
        "sensor_id": "S2",
        "location": "Station_2",
        "raw_path": "data/raw/S2/",
        "feature_path": "data/features/S2/",
    },
    # add more sensors here...
]

# Default date range for backfill (adapt to your project)
START_DATE = "2024-01-01"
END_DATE   = "2024-12-31"

# Choose mode: "single" or "all"
RUN_MODE = "all"     # "single" or "all"

# If RUN_MODE == "single", which sensor to use:
SINGLE_SENSOR_ID = "S1"


In [3]:
# Cell 3: Helper I/O functions

def ensure_dir(path: str) -> None:
    Path(path).mkdir(parents=True, exist_ok=True)


def load_raw_data(raw_path: str, start_date: str, end_date: str) -> pd.DataFrame:
    """
    Simple placeholder:
    - You can replace this with your own reading logic from the notebook.
    """
    # EXAMPLE: load all CSVs in the folder (you will adapt this)
    all_files = [
        os.path.join(raw_path, f)
        for f in os.listdir(raw_path)
        if f.endswith(".csv")
    ]
    if not all_files:
        return pd.DataFrame()

    df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
    # You can filter by date here if you have a 'timestamp' column
    return df


def save_features(features_df: pd.DataFrame, feature_path: str, start_date: str, end_date: str) -> None:
    """
    Simple placeholder: save features as one Parquet file for the period.
    Replace with your existing saving logic if needed.
    """
    ensure_dir(feature_path)
    out_file = os.path.join(feature_path, f"features_{start_date}_to_{end_date}.parquet")
    features_df.to_parquet(out_file, index=False)


In [4]:
# Cell 4: Main function for one sensor

def run_backfill_for_sensor(sensor_cfg: Dict[str, Any], start_date: str, end_date: str) -> None:
    """
    This function corresponds to what your notebook is doing now, but for ONE sensor.
    You just make it use:
        - sensor_cfg["raw_path"]
        - sensor_cfg["feature_path"]
        - start_date, end_date
    """
    sensor_id   = sensor_cfg["sensor_id"]
    raw_path    = sensor_cfg["raw_path"]
    feature_path = sensor_cfg["feature_path"]

    print(f"\n[BACKFILL] Sensor={sensor_id} ({sensor_cfg.get('location', 'N/A')}) "
          f"from {start_date} to {end_date}")

    # 1) LOAD RAW DATA
    # If you already have code like:
    #   raw_df = pd.read_csv("data/raw/sensorX.csv")
    #   ...
    # replace it with a version that uses raw_path, start_date, end_date.
    # Example using our helper:
    raw_df = load_raw_data(raw_path, start_date, end_date)

    if raw_df.empty:
        print(f"  No raw data found for sensor {sensor_id} in this period.")
        return

    # 2) CLEANING
    # >>> PUT YOUR EXISTING CLEANING / PREPROCESSING CODE HERE <<<
    #
    # Example placeholder:
    clean_df = raw_df.copy()
    # clean_df = clean_df.dropna()
    # clean_df['timestamp'] = pd.to_datetime(clean_df['timestamp'])
    # clean_df = clean_df.sort_values('timestamp')

    # If in your original notebook you did something like:
    #   df_clean = ...
    # just rename to clean_df or keep the same name.

    # 3) FEATURE ENGINEERING
    # >>> PUT YOUR EXISTING FEATURE CREATION CODE HERE <<<
    #
    # Example placeholder:
    features_df = clean_df.copy()
    # features_df["pm25_ma_3"] = features_df["pm25"].rolling(3, min_periods=1).mean()
    # features_df["hour"] = features_df["timestamp"].dt.hour
    # etc...

    # 4) SAVE FEATURES
    # Here you use your existing saving logic, but parameterized by feature_path, start_date, end_date.
    # If in the notebook you had:
    #   features_df.to_csv("data/features/sensorX_features.csv", index=False)
    # replace the path with feature_path.
    #
    # Example using helper:
    save_features(features_df, feature_path, start_date, end_date)

    print(f"  Done. {len(features_df)} feature rows saved for sensor {sensor_id}.")


In [5]:
# Cell 5: Run for one sensor or all sensors

def get_sensor_by_id(sensor_id: str) -> Dict[str, Any]:
    for cfg in SENSORS:
        if cfg["sensor_id"] == sensor_id:
            return cfg
    raise ValueError(f"Unknown sensor_id: {sensor_id}")


if RUN_MODE == "single":
    print(f"Running backfill for SINGLE sensor: {SINGLE_SENSOR_ID}")
    sensor_cfg = get_sensor_by_id(SINGLE_SENSOR_ID)
    run_backfill_for_sensor(sensor_cfg, START_DATE, END_DATE)

elif RUN_MODE == "all":
    print("Running backfill for ALL sensors")
    for sensor_cfg in SENSORS:
        run_backfill_for_sensor(sensor_cfg, START_DATE, END_DATE)

else:
    raise ValueError(f"Unknown RUN_MODE: {RUN_MODE}")


Running backfill for ALL sensors

[BACKFILL] Sensor=S1 (Station_1) from 2024-01-01 to 2024-12-31


FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/S1/'