# irp-dbk24 - "Optimising Demand Response Strategies for Carbon-Intelligent Electricity Use"

# Marginal Emissions Model Development

asdfasdfasd

### Importing Libraries

In [1]:
# ────────────────────────────────────────────────────────────────────────────
# Jupyter/Notebook Setup
# ────────────────────────────────────────────────────────────────────────────
%matplotlib inline
from IPython.display import display

# ────────────────────────────────────────────────────────────────────────────
# Core Data Handling
# ────────────────────────────────────────────────────────────────────────────
import numpy as np
import pandas as pd
import polars as pl

# ────────────────────────────────────────────────────────────────────────────
# Machine Learning & Statistics
# ────────────────────────────────────────────────────────────────────────────
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import SplineTransformer
from sklearn.kernel_approximation import RBFSampler
from sklearn.utils.validation import check_is_fitted
from sklearn.metrics import (
    r2_score,
    root_mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
)
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor

from scipy.stats import skew, kurtosis, zscore

from feature_engine.creation import CyclicalFeatures

# ────────────────────────────────────────────────────────────────────────────
# Visualization
# ────────────────────────────────────────────────────────────────────────────
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

# ────────────────────────────────────────────────────────────────────────────
# Geospatial
# ────────────────────────────────────────────────────────────────────────────
import geopandas as gpd
from shapely.geometry import Point, Polygon
from shapely.wkb import loads
from pyproj import Proj, transform

# ────────────────────────────────────────────────────────────────────────────
# System & Utilities
# ────────────────────────────────────────────────────────────────────────────
import os
import re
import logging
import random
import binascii
import calendar
from datetime import datetime, timedelta
from typing import Dict, List, Tuple, Optional, Any, Union, Callable, Iterable, Sequence
import json
import hashlib
from pathlib import Path
import math
from itertools import product,combinations
from copy import deepcopy
from multiprocessing import Pool, cpu_count, Manager, Lock
from multiprocessing.pool import ThreadPool
from functools import wraps
from tqdm import tqdm
from functools import partial
import inspect

# ────────────────────────────────────────────────────────────────────────────
# Logging Configuration
# ────────────────────────────────────────────────────────────────────────────
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s"
)

### Loading Data from Local Storage

#### Defining File Paths and Directories

In [2]:
# Reminder of data directory structure and contents

# This is a redundant code block, but it is included as a reminder of the directory variables.
base_data_directory = "data"  # Base directory where the dataframes will be saved

hitachi_data_directory = os.path.join(base_data_directory, "hitachi_copy")  # Directory where the dataframes will be saved
meter_save_directory = os.path.join(hitachi_data_directory, "meter_primary_files")

print("\n" + "-" * 120)
print(f"Contents of '{base_data_directory}' and subdirectories:\n" + "-" * 120)
for root, dirs, files in sorted(os.walk(base_data_directory)):
    for f in sorted(files):
        rel_dir = os.path.relpath(root, base_data_directory)
        rel_file = os.path.join(rel_dir, f) if rel_dir != "." else f
        print(f"  - {rel_file}")


------------------------------------------------------------------------------------------------------------------------
Contents of 'data' and subdirectories:
------------------------------------------------------------------------------------------------------------------------
  - .DS_Store
  - era5_data.parquet
  - era5/.DS_Store
  - era5/era5_reanalysis_data_2020-2025.parquet
  - era5/grib_downloads/125ae282169904325e8bc153160be150.grib
  - era5/grib_downloads/125ae282169904325e8bc153160be150.grib.47d85.idx
  - era5/grib_downloads/289f2aac241f8a158ff074a66682452e.grib
  - era5/grib_downloads/289f2aac241f8a158ff074a66682452e.grib.47d85.idx
  - era5/grib_downloads/554832a6209258041784298e5401a7ab.grib
  - era5/grib_downloads/554832a6209258041784298e5401a7ab.grib.47d85.idx
  - era5/grib_downloads/5aee58993569287064988fbc8ad385dd.grib
  - era5/grib_downloads/5aee58993569287064988fbc8ad385dd.grib.47d85.idx
  - era5/grib_downloads/5bcc58c42bdde8ce6b147b00099404bc.grib
  - era5/grib_dow

#### Loading Data

In [3]:
combined_weather_grid_data_filename = "grid_and_weather_stacked_20250714_1401"
combined_weather_grid_data_filepath = os.path.join(hitachi_data_directory, combined_weather_grid_data_filename + ".parquet")
combined_weather_grid_data_pldf = pl.read_parquet(combined_weather_grid_data_filepath)
display(combined_weather_grid_data_pldf.head(1))

timestamp,weather_longitude,weather_latitude,city,era5_longitude,era5_latitude,distance_btwn_weather_and_era5_m,gap_filling,temperature,precipitation_mm,surface_net_solar_radiation_kwh,surface_solar_radiation_downwards_kwh,total_cloud_cover,high_cloud_cover,medium_cloud_cover,low_cloud_cover,wind_speed,wind_direction,thermal_generation,gas_generation,hydro_generation,nuclear_generation,renewable_generation,total_generation,tons_co2,tonnes_CO2_emissions_year_specific,CO2_difference,relative_CO2_difference,g_co2_per_kwh,tons_co2_per_mwh,thermal_emission_factor_tonnes_CO2_per_mwh,gas_emission_factor_tonnes_CO2_per_mwh
datetime[μs],f64,f64,cat,f32,f32,f64,list[struct[2]],f32,f64,f64,f64,f32,f32,f32,f32,f32,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2022-02-08 08:30:00,72.87,18.8,"""mumbai""",72.75,18.75,0.13,[],25.04216,0.000428,3.404626,3.69525,0.018921,0.0,0.0,0.018921,9.812794,119.757309,143411.5,3495.833333,22310.833333,5228.666667,8120.916667,182567.75,70656.6453,70963.727667,307.082367,0.004346,774.04315,0.774043,0.978,0.478


### Data Processing

In [None]:
### Initial Stats about the data

# Stats about the new dataframe
print("\n" + "-" * 120)
print("Combined DataFrame (Grid Readings and Weather):\n" + "-" * 120)
print(f"Number of rows: {combined_weather_grid_data_pldf.shape[0]:,}")
print(f"Number of columns: {combined_weather_grid_data_pldf.shape[1]}")
print(f"Columns: {combined_weather_grid_data_pldf.columns}")
print("\n" + "-" * 80)
print("Time Boundaries:")
print(f"\tStart Time: {combined_weather_grid_data_pldf['timestamp'].min()}")
print(f"\tEnd Time: {combined_weather_grid_data_pldf['timestamp'].max()}")
print(f"\tTotal Duration: {combined_weather_grid_data_pldf['timestamp'].max() - combined_weather_grid_data_pldf['timestamp'].min()}")
print("\n" + "-" * 80)


------------------------------------------------------------------------------------------------------------------------
Combined DataFrame (Grid Readings and Weather):
------------------------------------------------------------------------------------------------------------------------
Number of rows: 3,426,345
Number of columns: 32
Columns: ['timestamp', 'weather_longitude', 'weather_latitude', 'city', 'era5_longitude', 'era5_latitude', 'distance_btwn_weather_and_era5_m', 'gap_filling', 'temperature', 'precipitation_mm', 'surface_net_solar_radiation_kwh', 'surface_solar_radiation_downwards_kwh', 'total_cloud_cover', 'high_cloud_cover', 'medium_cloud_cover', 'low_cloud_cover', 'wind_speed', 'wind_direction', 'thermal_generation', 'gas_generation', 'hydro_generation', 'nuclear_generation', 'renewable_generation', 'total_generation', 'tons_co2', 'tonnes_CO2_emissions_year_specific', 'CO2_difference', 'relative_CO2_difference', 'g_co2_per_kwh', 'tons_co2_per_mwh', 'thermal_emission_fa

In [5]:
print("Memory Usage:\n" + "-" * 80)
print(f"Data size in Memory: {combined_weather_grid_data_pldf.estimated_size() / 1024**2:,.2f} MB")
print(f"Data size on Disk: {os.path.getsize(combined_weather_grid_data_filepath) / 1024**2:,.2f} MB")
print(f"Memory Usage per column:")
for col in combined_weather_grid_data_pldf.columns:
    size_bytes = combined_weather_grid_data_pldf.select(pl.col(col)).estimated_size()
    print(f"{col}: {size_bytes / 1024:.2f} MB")

Memory Usage:
--------------------------------------------------------------------------------
Data size in Memory: 724.89 MB
Data size on Disk: 233.36 MB
Memory Usage per column:
timestamp: 26768.32 MB
weather_longitude: 26768.32 MB
weather_latitude: 26768.32 MB
city: 13384.19 MB
era5_longitude: 13384.16 MB
era5_latitude: 13384.16 MB
distance_btwn_weather_and_era5_m: 26768.32 MB
gap_filling: 46306.61 MB
temperature: 13384.16 MB
precipitation_mm: 26768.32 MB
surface_net_solar_radiation_kwh: 26768.32 MB
surface_solar_radiation_downwards_kwh: 26768.32 MB
total_cloud_cover: 13384.16 MB
high_cloud_cover: 13384.16 MB
medium_cloud_cover: 13384.16 MB
low_cloud_cover: 13384.16 MB
wind_speed: 13384.16 MB
wind_direction: 13384.16 MB
thermal_generation: 26768.32 MB
gas_generation: 26768.32 MB
hydro_generation: 26768.32 MB
nuclear_generation: 26768.32 MB
renewable_generation: 26768.32 MB
total_generation: 26768.32 MB
tons_co2: 26768.32 MB
tonnes_CO2_emissions_year_specific: 26768.32 MB
CO2_differe

#### Train - Validate - Test Splits

In [None]:
train_start_date = combined_weather_grid_data_pldf['timestamp'].min()
train_end_date = datetime(2023, 12,31, 23, 59, 59)
validation_start_date = datetime(2024, 1 , 1)
validation_end_date = datetime(2024, 5 , 31, 23, 59, 59)
test_start_date = datetime(2024, 6 , 1)
test_end_date = combined_weather_grid_data_pldf['timestamp'].max()

print("\n" + "-" * 120)
print(f"Boundaries for Train, Validation, and Test Sets:\n" + "-" * 120)
print(f"Train Set: {train_start_date} to {train_end_date}")
print(f"Validation Set: {validation_start_date} to {validation_end_date}")
print(f"Test Set: {test_start_date} to {test_end_date}")


------------------------------------------------------------------------------------------------------------------------
Boundaries for Train, Validation, and Test Sets:
------------------------------------------------------------------------------------------------------------------------
Train Set: 2021-01-01 00:00:00 to 2023-12-31 23:59:59
Validation Set: 2024-01-01 00:00:00 to 2024-05-31 23:59:59
Test Set: 2024-06-01 00:00:00 to 2025-05-31 23:00:00


In [None]:
# Split the combined DataFrame into training, validation, and test sets based on the revised dates
train_pldf = combined_weather_grid_data_pldf.filter(
    (pl.col("timestamp") >= train_start_date) & (pl.col("timestamp") <= train_end_date)
)
validation_pldf = combined_weather_grid_data_pldf.filter(
    (pl.col("timestamp") >= validation_start_date) & (pl.col("timestamp") <= validation_end_date)
)
test_pldf = combined_weather_grid_data_pldf.filter(
    (pl.col("timestamp") >= test_start_date) & (pl.col("timestamp") <= test_end_date)
)

In [8]:
# Conversion to Pandas DataFrame for compatibility with existing code
train_pdf = train_pldf.to_pandas()
validation_pdf = validation_pldf.to_pandas()
test_pdf = test_pldf.to_pandas()

### Model Development

#### EDA

In [9]:
display(train_pldf.describe())

statistic,timestamp,weather_longitude,weather_latitude,city,era5_longitude,era5_latitude,distance_btwn_weather_and_era5_m,gap_filling,temperature,precipitation_mm,surface_net_solar_radiation_kwh,surface_solar_radiation_downwards_kwh,total_cloud_cover,high_cloud_cover,medium_cloud_cover,low_cloud_cover,wind_speed,wind_direction,thermal_generation,gas_generation,hydro_generation,nuclear_generation,renewable_generation,total_generation,tons_co2,tonnes_CO2_emissions_year_specific,CO2_difference,relative_CO2_difference,g_co2_per_kwh,tons_co2_per_mwh,thermal_emission_factor_tonnes_CO2_per_mwh,gas_emission_factor_tonnes_CO2_per_mwh
str,str,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""2333610""",2333610.0,2333610.0,"""2333610""",2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0,2333610.0
"""null_count""","""0""",0.0,0.0,"""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""","""2022-07-02 13:08:16.424852""",75.701111,25.393333,,75.688889,25.388889,0.094474,,25.489737,1.760304,2.658106,3.196327,0.367619,0.27045,0.157853,0.103301,5.865259,165.680542,127254.206979,3493.420794,18321.211727,4898.900135,17235.898926,171203.638561,62488.527128,62473.077375,-15.449753,-3.3e-05,737.269524,0.73727,0.973968,0.464554
"""std""",,1.969453,4.538596,,1.979167,4.544703,0.035546,,10.671576,6.049977,2.007118,2.413419,0.391935,0.383951,0.24656,0.211769,3.098983,86.422653,15961.404462,1294.633173,8786.321193,630.321572,12800.744352,21462.533537,8389.46305,8291.5231,267.617474,0.004062,73.470494,0.07347,0.003736,0.010994
"""min""","""2021-01-01 00:00:00""",72.77,18.5,,72.75,18.5,0.01,,2.149261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011599,0.00136,75325.666667,21.0,3093.333333,2281.083333,495.083333,106969.666667,8329.1386,8339.84,-443.336742,-0.005538,501.631083,0.501631,0.969,0.42
"""25%""","""2021-09-28 11:00:00""",72.97,19.2,,73.0,19.25,0.06,,21.685486,0.0,0.565495,0.672136,0.0,0.0,0.0,0.0,3.76971,103.95974,115573.083333,2497.416667,10453.9,4394.916667,6512.0,157253.083333,56852.5085,56924.144417,-353.163908,-0.005449,688.547518,0.688548,0.969,0.451
"""50%""","""2022-07-05 17:00:00""",76.94,28.5,,77.0,28.5,0.102956,,26.253143,0.001717,2.817825,3.394795,0.195831,0.002014,0.027802,0.0,5.369909,145.199127,128801.25,3118.416667,17434.416667,4986.833333,13138.333333,172326.166667,63340.8632,63393.516083,66.01575,0.001167,736.58131,0.736581,0.975,0.465
"""75%""","""2023-04-04 16:30:00""",77.14,28.7,,77.25,28.75,0.12083,,29.450287,0.604347,4.328448,5.204252,0.787811,0.576538,0.225677,0.096344,7.346128,238.304382,139695.666667,4266.25,25290.583333,5391.0,25976.583333,186118.25,68761.20965,68724.192667,240.04755,0.004213,797.770909,0.797771,0.978,0.478
"""max""","""2023-12-31 23:30:00""",77.34,28.8,,77.25,28.75,0.156205,,305.190186,136.593109,6.810003,8.057081,1.0,1.0,1.0,1.0,42.517315,359.999969,165972.666667,10157.0,43162.083333,6387.083333,63835.75,243275.75,83028.12695,82584.814625,342.836017,0.00472,878.867047,0.878867,0.978,0.478


In [10]:
display(train_pldf.schema)

Schema([('timestamp', Datetime(time_unit='us', time_zone=None)),
        ('weather_longitude', Float64),
        ('weather_latitude', Float64),
        ('city', Categorical(ordering='physical')),
        ('era5_longitude', Float32),
        ('era5_latitude', Float32),
        ('distance_btwn_weather_and_era5_m', Float64),
        ('gap_filling', List(Struct({'column': String, 'method': String}))),
        ('temperature', Float32),
        ('precipitation_mm', Float64),
        ('surface_net_solar_radiation_kwh', Float64),
        ('surface_solar_radiation_downwards_kwh', Float64),
        ('total_cloud_cover', Float32),
        ('high_cloud_cover', Float32),
        ('medium_cloud_cover', Float32),
        ('low_cloud_cover', Float32),
        ('wind_speed', Float32),
        ('wind_direction', Float32),
        ('thermal_generation', Float64),
        ('gas_generation', Float64),
        ('hydro_generation', Float64),
        ('nuclear_generation', Float64),
        ('renewable_genera

##### Data Quality and Diagnostics Functions

In [None]:
def correlation_checker(df: pd.DataFrame, columns: Optional[List[str]] = None, display_plot: bool = True, display_table: bool = False) -> pd.DataFrame:
    """
    Compute the Pearson correlation matrix for a subset of columns, optionally plot a heatmap and or display the correlation table.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame.
    columns : List[str]
        List of column names to include in the correlation analysis.
        If none, all numeric columns will be used.
    display_plot : bool, optional
        Whether to display a heatmap of the correlation matrix (default is True).
    display_table : bool, optional
        Whether to display the correlation matrix as a table (default is False).

    Returns
    -------
    corr_matrix : pd.DataFrame
        The correlation matrix of the specified columns.
        Also displays a heatmap if `display_plot` is True and a table if `display_table` is True.
    """
    # If no columns specified, use all numeric columns
    if columns is None:
        # If no columns specified, use all numeric columns
        cols = df.select_dtypes(include=[np.number]).columns.tolist()

    # If columns are specified, ensure they are in the dataframe and numeric
    else:
        # Keep only existing numeric columns
        cols = [c for c in columns
                if c in df.columns and pd.api.types.is_numeric_dtype(df[c])]

    # If no valid numeric columns are found, raise an error
    if not cols:
        raise ValueError("No valid numeric columns provided for correlation analysis.")

    # Subset and compute correlation
    corr_matrix = df[cols].corr()

    if display_plot:
        n = len(cols)
        # Dynamic figure size: at least 6x6, scale by number of cols
        fig_size = (max(6, n * 0.8), max(6, n * 0.8))
        fig, ax = plt.subplots(figsize=fig_size)
        sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="vlag", square=True, ax=ax)
        ax.set_title("Correlation Matrix for Selected Numeric Columns")
        plt.tight_layout()
        plt.show()

    if display_table:
        print("-" * 120)
        print("\nCorrelation Matrix for Selected Numeric Columns:\n"+ "-" * 120)
        display(corr_matrix)

    return corr_matrix

In [12]:
def vif_calculator(df: pd.DataFrame, columns: Optional[List[str]] = None, display_table: bool = False) -> pd.DataFrame:
    """
    Compute Variance Inflation Factors (VIF) for a subset of columns.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame.
    columns : List[str]
        List of column names to include in the VIF calculation.
        If none, all numeric columns will be used.

    Returns
    -------
    vif_df : pd.DataFrame
        DataFrame with columns ['Variable', 'VIF'], excluding the constant term.
    """
    # Step 1: Select numeric columns
    if columns is None:
        cols = df.select_dtypes(include=[np.number]).columns.tolist()
    else:
        cols = [c for c in columns if c in df.columns and pd.api.types.is_numeric_dtype(df[c])]

    if not cols:
        raise ValueError("No valid numeric columns provided for VIF calculation.")

    # Step 2: Extract and clean
    X = df[cols].copy()

    # Step 3: Convert all to float64 safely
    for col in X.columns:
        X[col] = pd.to_numeric(X[col], errors='coerce')

    # Step 4: Drop rows with NaN/Inf
    X = X.replace([np.inf, -np.inf], np.nan).dropna()

    # Step 5: Force float64 dtype
    try:
        X = X.astype(np.float64)
    except Exception as e:
        raise TypeError(f"Failed to convert columns to float64. Error: {e}")

    # # Diagnostic: Show dtypes and values before VIF
    # print("Final column dtypes:")
    # print(X.dtypes)
    # print("\nSample values:")
    # print(X.head())

    # Step 6: Add constant
    X = sm.add_constant(X)

    # Step 7: Final type check
    if X.values.dtype != np.float64:
        raise TypeError(f"X.values is not float64! dtype is {X.values.dtype}")

    # Step 8: Calculate VIF
    vif_data = []
    for i in range(X.shape[1]):
        vif = variance_inflation_factor(X.values, i)
        vif_data.append((X.columns[i], vif))

    # Step 9: Drop constant
    vif_df = pd.DataFrame(vif_data, columns=["Variable", "VIF"])
    vif_df = vif_df[vif_df["Variable"] != "const"].reset_index(drop=True)

    if display_table:
        from IPython.display import display
        print("\n" + "-" * 100)
        print("Variance Inflation Factors (VIF):")
        display(vif_df)

    return vif_df


In [13]:
def compute_eigenvalues_and_condition(df: pd.DataFrame, columns: Optional[List[str]] = None, show: bool = False) -> Tuple[np.ndarray, float]:
    """
    Compute eigenvalues of X^T X and the condition number of the design matrix.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame.
    columns : list of str, optional
        Numeric columns to include in the calculation. If None, all numeric columns are used.
    show : bool, default=False
        If True, prints a table of eigenvalues and the condition number.

    Returns
    -------
    eigenvalues : np.ndarray
        Eigenvalues of the covariance‐like matrix X^T X.
    condition_number : float
        Condition number (ratio of largest to smallest singular value) of X.
    """
    # Step 1: Select numeric (non-bool) columns
    if columns is None:
        cols = [
            c for c in df.columns
            if pd.api.types.is_numeric_dtype(df[c]) and not pd.api.types.is_bool_dtype(df[c])
        ]
    else:
        cols = [
            c for c in columns
            if c in df.columns and pd.api.types.is_numeric_dtype(df[c]) and not pd.api.types.is_bool_dtype(df[c])
        ]

    if not cols:
        raise ValueError("No valid numeric (non-boolean) columns provided.")

    # Step 2: Clean and coerce to float64
    X = df[cols].apply(pd.to_numeric, errors="coerce").dropna().astype("float64")

    if X.empty:
        raise ValueError("All rows were dropped after coercion and NaN removal.")

    # Step 3: Safe matrix ops
    cov_mat = X.T @ X
    eigenvalues = np.linalg.eigvals(cov_mat)
    condition_number = np.linalg.cond(X.values)

    # Step 4: Optional display
    if show:
        eig_df = pd.DataFrame({
            "Index": np.arange(1, len(eigenvalues) + 1),
            "Eigenvalue": np.round(eigenvalues, 4)
        }).sort_values("Eigenvalue", ascending=False).reset_index(drop=True)
        print("\n" + "-"*120)
        print("Results of Eigenvalue and Condition Number Calculation:\n" + "-"*120)
        print(f"Condition Number: {condition_number:.4g}\n")
        print("Eigenvalues of X.T X:")
        display(eig_df)

    return eigenvalues, condition_number

In [14]:
def skew_kurtosis_checker(
    df: pd.DataFrame,
    columns: Optional[List[str]] = None,
    display_table: bool = False,
    plot_distributions: bool = False,
    bins: int = 30,
    figsize_per : tuple = (5,3)
) -> pd.DataFrame:
    """
    Compute skewness and kurtosis for specified numeric columns, optionally display results and histograms.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame.
    columns : list of str, optional
        Numeric columns to include; if None, all numeric columns are used.
    display_table : bool, default=False
        If True, prints a table of skewness and kurtosis.
    plot_distributions : bool, default=False
        If True, plots histograms with overlaid KDE for each column.
    bins : int, default=30
        Number of bins for histograms.
    figsize_per : tuple, default=(5, 3)
        Size of each subplot (width, height) in inches.

    Returns
    -------
    stats_df : pd.DataFrame
        DataFrame with index=column names and columns ['skewness','kurtosis'].
    """
    # Determine numeric columns
    if columns is None:
        cols = df.select_dtypes(include=[np.number]).columns.tolist()
    else:
        cols = [c for c in columns if c in df.columns and pd.api.types.is_numeric_dtype(df[c])]
    if not cols:
        raise ValueError("No valid numeric columns provided for skew/kurtosis calculation.")

    # Compute skewness and kurtosis
    data = df[cols]
    skewness_vals = data.apply(lambda x: skew(x.dropna()), axis=0)
    kurtosis_vals = data.apply(lambda x: kurtosis(x.dropna()), axis=0)

    stats_df = pd.DataFrame({'skewness': skewness_vals, 'kurtosis': kurtosis_vals}, index=cols)

    # Display table if requested
    if display_table:
        print("\n" + "-"*120)
        print("Skewness and Kurtosis:")
        print("-"*120)
        display(stats_df)

    # 4) Plot distributions if requested
    if plot_distributions:
        n = len(cols)
        fig, axes = plt.subplots(
            nrows=n, ncols=2,
            figsize=(figsize_per[0]*2, figsize_per[1]*n),
            squeeze=False
        )

        for i, col in enumerate(cols):
            # Skewness plot
            ax1 = axes[i, 0]
            sns.histplot(df[col].dropna(), bins=bins, stat="density", kde=True, ax=ax1)
            ax1.set_title(f"{col}\nskewness = {skewness_vals[col]:.2f}")
            ax1.set_xlabel(col)
            ax1.set_ylabel("Density")

            # Kurtosis plot
            ax2 = axes[i, 1]
            sns.histplot(df[col].dropna(), bins=bins, stat="density", kde=True, ax=ax2)
            ax2.set_title(f"{col}\nkurtosis = {kurtosis_vals[col]:.2f}")
            ax2.set_xlabel(col)
            ax2.set_ylabel("Density")

        plt.tight_layout()
        plt.show()

    return stats_df

In [15]:
def outlier_zscore_checker(
    df: pd.DataFrame,
    columns: Optional[List[str]] = None,
    threshold: float = 3.0,
    display_table: bool = False
) -> pd.DataFrame:
    """
    Identify outliers using the Z-score method for specified numeric columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame.
    columns : list of str, optional
        Columns to analyze. If None, all numeric columns are used.
    threshold : float, default=3.0
        Absolute Z-score above which an observation is considered an outlier.
    display_table : bool, default=False
        If True, prints a summary table of outlier counts and percentages.

    Returns
    -------
    mask_df : pd.DataFrame
        Boolean DataFrame (same index as df) with True for outliers, one column per variable.
    """
    # Select numeric columns if not provided
    if columns is None:
        cols = df.select_dtypes(include=[np.number]).columns.tolist()
    else:
        cols = [c for c in columns if c in df.columns and pd.api.types.is_numeric_dtype(df[c])]
    if not cols:
        raise ValueError("No valid numeric columns provided for Z-score outlier detection.")

    mask_df = pd.DataFrame(index=df.index)
    for col in cols:
        series = df[col]
        # Compute Z-scores ignoring NaNs
        zs = zscore(series.dropna())
        mask = pd.Series(False, index=df.index)
        mask.loc[series.dropna().index] = np.abs(zs) > threshold
        mask_df[col] = mask

    if display_table:
        counts = mask_df.sum()
        percents = counts / len(df) * 100
        summary = pd.DataFrame({'n_outliers': counts, 'pct_outliers': percents.round(2)})
        print("\n" + "-" * 120)
        print("Z-score Outlier Summary:\n" + "-" * 120)
        display(summary)

    return mask_df

In [16]:
def outlier_iqr_checker(
    df: pd.DataFrame,
    columns: Optional[List[str]] = None,
    k: float = 1.5,
    display_table: bool = False
) -> pd.DataFrame:
    """
    Identify outliers using the IQR method for specified numeric columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame.
    columns : list of str, optional
        Columns to analyze. If None, all numeric columns are used.
    k : float, default=1.5
        Multiplier for the IQR to define outlier thresholds.
    display_table : bool, default=False
        If True, prints a summary table of outlier counts and percentages.

    Returns
    -------
    mask_df : pd.DataFrame
        Boolean DataFrame (same index as df) with True for outliers, one column per variable.
    """
    # Select numeric columns if not provided
    if columns is None:
        cols = df.select_dtypes(include=[np.number]).columns.tolist()
    else:
        cols = [c for c in columns if c in df.columns and pd.api.types.is_numeric_dtype(df[c])]
    if not cols:
        raise ValueError("No valid numeric columns provided for IQR outlier detection.")

    mask_df = pd.DataFrame(index=df.index)
    for col in cols:
        series = df[col]
        Q1 = series.quantile(0.25)
        Q3 = series.quantile(0.75)
        IQR = Q3 - Q1
        lower, upper = Q1 - k * IQR, Q3 + k * IQR
        mask = series.lt(lower) | series.gt(upper)
        mask_df[col] = mask.fillna(False)

    if display_table:
        counts = mask_df.sum()
        percents = counts / len(df) * 100
        summary = pd.DataFrame({'n_outliers': counts, 'pct_outliers': percents.round(2)})
        print("\n" + "-" * 120)
        print("IQR Outlier Summary:\n" + "-" * 120)
        display(summary)

    return mask_df

##### Diagnostics and Data Quality - Visualisations

In [None]:
def plot_numeric_distribution(
    df: pd.DataFrame,
    columns: Optional[List[str]] = None,
    bins: int = 30,
    figsize_per: Tuple[float, float] = (5, 3),
    ncols: int = 3,
    hspace: float = 0.4,
    wspace: float = 0.3
) -> None:
    """
    Plot histograms for numeric columns in a grid of subplots with 3 columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame.
    columns : list of str, optional
        Columns to analyze. If None, all numeric columns are used.
    bins : int, default=30
        Number of bins for histograms.
    figsize_per : tuple, default=(5,3)
        Width, height per subplot.
    ncols : int, default=3
        Number of columns in the subplot grid.
    hspace : float, default=0.4
        Vertical space between rows.
    wspace : float, default=0.3
        Horizontal space between columns.

    Returns
    -------
    None
        Displays histograms for each specified column.

    """
    # Select numeric columns
    if columns is None:
        cols = df.select_dtypes(include=[np.number]).columns.tolist()
    else:
        cols = [c for c in columns if c in df.columns and pd.api.types.is_numeric_dtype(df[c])]
    if not cols:
        raise ValueError("No valid numeric columns provided.")

    n = len(cols)
    nrows = math.ceil(n / ncols)

    # Set up the figure and axes grid
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols,
                             figsize=(figsize_per[0] * ncols, figsize_per[1] * nrows),
                             squeeze=False)
    axes_flat = axes.flatten()

    # Plot each distribution
    for i, col in enumerate(cols):
        ax = axes_flat[i]
        sns.histplot(df[col].dropna(), bins=bins, kde=True, ax=ax)
        ax.set_title(f"Distribution of {col}")
        ax.set_xlabel(col)
        ax.set_ylabel("Density")

    # Hide any unused subplots
    for j in range(n, len(axes_flat)):
        axes_flat[j].axis('off')

    # Adjust spacing
    plt.subplots_adjust(hspace=hspace, wspace=wspace)
    plt.tight_layout()
    plt.show()


In [18]:
def plot_categorical_distribution(
    df: pd.DataFrame,
    columns: Optional[List[str]] = None,
    figsize_per: Tuple[float, float] = (6, 4),
    ncols: int = 3,
    hspace: float = 0.4,
    wspace: float = 0.3
) -> None:
    """
    Compute value counts and plot distributions for categorical columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame.
    columns : list of str, optional
        Columns to analyze. If None, all object or category dtypes are used.
    figsize_per : tuple, default=(6,4)
        Width, height per subplot.
    ncols : int, default=3
        Number of columns in the subplot grid.
    hspace : float, default=0.4
        Vertical space between rows.
    wspace : float, default=0.3
        Horizontal space between columns.

    Returns
    -------
    None
        Displays count plots for each specified categorical column.
    """
    # Select categorical columns
    if columns is None:
        cols = df.select_dtypes(include=['category', 'object']).columns.tolist()
    else:
        cols = [c for c in columns if c in df.columns]

    if not cols:
        raise ValueError("No valid categorical columns provided.")

    # Calculate number of rows needed
    n = len(cols)
    nrows = math.ceil(n / ncols)

    # Set up the figure and axes grid
    fig, axes = plt.subplots(
        nrows=nrows, ncols=ncols,
        figsize=(figsize_per[0]*ncols, figsize_per[1]*nrows),
        squeeze=False
    )
    axes_flat = axes.flatten()

    # Plot each categorical distribution
    for i, col in enumerate(cols):
        ax = axes_flat[i]
        order = df[col].value_counts(dropna=False).index
        sns.countplot(y=col, data=df, order=order, ax=ax)
        ax.set_title(f"Counts of {col}")
        ax.set_xlabel("Count")
        ax.set_ylabel(col)

    # Hide any unused subplots
    for j in range(n, len(axes_flat)):
        axes_flat[j].axis('off')

    # Adjust spacing & show
    plt.subplots_adjust(hspace=hspace, wspace=wspace)
    plt.tight_layout()
    plt.show()

    return None

##### Feature Generation Functions

In [None]:
class DateTimeFeatureAdder(BaseEstimator, TransformerMixin):
    """
    Add datetime-based features from a timestamp column.

    Parameters
    ----------
    timestamp_col : str, default="timestamp"
        Name of the column containing datetime strings or pd.Timestamp.

    Raises
    ------
    TypeError
        If `timestamp_col` is not found in the DataFrame.

    See Also
    --------
    sklearn.pipeline.Pipeline : for chaining this transformer with others.
    """
    def __init__(
        self,
        timestamp_col: str = "timestamp",
    ):
        """
        Initialize the feature adder.

        Parameters
        ----------
        timestamp_col : str
            Column name to parse as datetime.
        """
        if not isinstance(timestamp_col, str):
            raise TypeError("timestamp_col must be a string representing the column name.")

        self.timestamp_col = timestamp_col

    def fit(self, X, y=None):
        """
        No-op fit. Exists for sklearn compatibility.

        Returns
        -------
        self : DateTimeFeatureAdder
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input X must be a pandas DataFrame.")
        if self.timestamp_col not in X.columns:
            raise KeyError(f"Column '{self.timestamp_col}' not found in DataFrame.")
        # No fitting logic needed, just return self
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transform X by adding:

        - year (int)
        - month (int)
        - week_of_year (int)
        - day (int)
        - hour (int)
        - half_hour (int, 0-47)
        - day_of_week (int, 1=Mon)
        - is_weekend (0/1)

        Parameters
        ----------
        X : pd.DataFrame
            Input DataFrame with a column named `self.timestamp_col`.

        Returns
        -------
        X_out : pd.DataFrame
            Copy of X with the above new columns appended.

        Raises
        ------
        KeyError
            If `self.timestamp_col` is not present in X.
        """
        df = X.copy()
        # Attempt to convert the timestamp column to datetime (if not already)
        try:
            df[self.timestamp_col] = pd.to_datetime(df[self.timestamp_col], errors='raise')
        except Exception as e:
            raise TypeError(f"Column '{self.timestamp_col}' could not be converted to datetime: {e}")

        dt = df[self.timestamp_col]
        df["year"] = dt.dt.year.astype('int32')
        df["month"] = dt.dt.month.astype('int32')
        df["week_of_year"] = dt.dt.isocalendar().week.astype('int32')
        df["day"] = dt.dt.day.astype('int32')
        df["hour"] = dt.dt.hour.astype('int32')
        df["half_hour"] = ((dt.dt.minute // 30) + (dt.dt.hour * 2)).astype('int32')
        df["day_of_week"] = (dt.dt.dayofweek).astype('int32') + 1  # Monday=1
        df["is_weekend"] = (df["day_of_week"] >= 6).astype('int32')

        return df


In [None]:
class GenerationShareAdder(BaseEstimator, TransformerMixin):
    """
    Add percentage‐share features for specified generation columns relative to a total.

    Parameters
    ----------
    generation_cols : List[str]
        Names of the columns whose share of `total_col` should be computed.
    total_col : str, default="total_generation"
        Name of the column containing the denominator for share calculation.
    suffix : str, default="_share"
        Suffix to append to each new share column.

    Raises
    ------
    KeyError
        If any of the specified `generation_cols` or `total_col` is not present in the DataFrame.

    See Also
    --------
    sklearn.pipeline.Pipeline : chaining this transformer with others.
    """

    def __init__(
        self,
        generation_cols: List[str],
        total_col: str = "total_generation",
        suffix: str = "_share"
    ):
        """
        Initialize the share adder.

        Parameters
        ----------
        generation_cols : List[str]
            Columns to convert into percentage shares.
        total_col : str
            Column used as the denominator in share calculation.
        suffix : str
            Suffix for the new share columns.

        Raises
        ------
        TypeError
            If `generation_cols` is not a list of strings, or if `total_col` or `suffix` are not strings.
        """
        if not isinstance(generation_cols, list) or not all(isinstance(col, str) for col in generation_cols):
            raise TypeError("generation_cols must be a list of strings.")
        if not isinstance(total_col, str):  # Ensure total_col is a string
            raise TypeError("total_col must be a string representing the column name.")
        if not isinstance(suffix, str):  # Ensure suffix is a string
            raise TypeError("suffix must be a string representing the suffix for new columns.")

        self.generation_cols = generation_cols
        self.total_col = total_col
        self.suffix = suffix

    def fit(self, X, y=None):
        """
        No‐op fit for compatibility with sklearn’s transformer API.

        Parameters
        ----------
        X : pd.DataFrame
            Input DataFrame.
        y : Ignored

        Returns
        -------
        self : GenerationShareAdder

        Raises
        ------
        TypeError
            If `X` is not a pandas DataFrame.
        KeyError
            If any of the specified `generation_cols` or `total_col` is not present in the DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input X must be a pandas DataFrame.")

        # Check if all generation_cols and total_col are present in X
        missing_cols = [col for col in self.generation_cols if col not in X.columns]
        if missing_cols:
            raise KeyError(f"Generation columns {missing_cols} not found in input DataFrame")
        if self.total_col not in X.columns:
            raise KeyError(f"Total column '{self.total_col}' not found in input DataFrame")

        # No fitting logic needed, just return self
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Compute and append share columns.

        For each `col` in `generation_cols`, creates a new column
        `col + suffix` = 100 * (X[col] / X[total_col]). Zeros in `total_col`
        are treated as NaN to avoid division‐by‐zero.

        Parameters
        ----------
        X : pd.DataFrame
            Input DataFrame containing `generation_cols` and `total_col`.

        Returns
        -------
        X_out : pd.DataFrame
            Copy of X with additional `<col><suffix>` columns.

        """
        df = X.copy()

        total = df[self.total_col].replace({0: np.nan})
        for col in self.generation_cols:
            share_col = f"{col}{self.suffix}"
            df[share_col] = df[col] / total * 100

        return df


In [None]:
class WindDirectionCategorizer(BaseEstimator, TransformerMixin):
    """
    Convert wind‑direction degrees into cardinal categories.

    Parameters
    ----------
    direction_col : str, default="wind_direction"
        Name of the column containing wind direction in degrees (0–360).
    n_directions : int, default=8
        Number of cardinal bins: must be either 8 or 16.
        - 8 yields: ["N","NE","E","SE","S","SW","W","NW"]
        - 16 yields: ["N","NNE","NE","ENE", …, "NNW"]

    Raises
    ------
    ValueError
        If `n_directions` is not 4, 8 or 16.

    See Also
    --------
    pandas.cut : for binning numeric values.
    """

    def __init__(
        self,
        direction_col: str = "wind_direction",
        n_directions: int = 8
    ):
        """
        Initialize the categorizer.

        Parameters
        ----------
        direction_col : str
            Column name to read wind‑direction degrees from.
        n_directions : int
            Number of bins (4, 8 or 16) for cardinal directions.
        """
        # Validate input parameters
        if not isinstance(direction_col, str):
            raise TypeError("direction_col must be a string representing the column name.")
        if n_directions not in [4, 8, 16]:
            raise ValueError("n_directions must be either 4, 8 or 16.")

        # Assign attributes
        self.direction_col = direction_col
        self.n_directions = n_directions

        # Define cardinal labels based on n_directions
        if n_directions == 8:
            self.labels = ["N", "NE", "E", "SE", "S", "SW", "W", "NW"]
        elif n_directions == 4:
            self.labels = ["N", "E", "S", "W"]
        elif n_directions == 16:
            self.labels = [
                "N", "NNE", "NE", "ENE", "E", "ESE", "SE", "SSE",
                "S", "SSW", "SW", "WSW", "W", "WNW", "NW", "NNW"
            ]
        else:
            raise ValueError("n_directions must be 4, 8 or 16")

    def fit(self, X, y=None):
        """
        No‑op fit for transformer API compatibility.

        Parameters
        ----------
        X : pd.DataFrame
            Input DataFrame.
        y : Ignored
            Not used, exists for sklearn API.

        Returns
        -------
        self : WindDirectionCategorizer
        """
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Bin the wind‑direction column into named cardinal categories.

        Parameters
        ----------
        X : pd.DataFrame
            DataFrame containing `self.direction_col` in degrees.

        Returns
        -------
        X_out : pd.DataFrame
            Copy of X with a new column `"wind_dir_cardinal"` holding
            the categorical labels.

        Raises
        ------
        KeyError
            If `self.direction_col` is not in X.
        """
        df = X.copy()
        if self.direction_col not in df:
            raise KeyError(f"Column '{self.direction_col}' not found in input DataFrame")

        deg = df[self.direction_col] % 360

        step = 360 / self.n_directions
        edges = np.linspace(-step / 2, 360 - step/2, self.n_directions + 1)

        df["wind_dir_cardinal"] = pd.cut(
            deg,
            bins=edges,
            labels=self.labels,
            include_lowest=True,
            right=False
        )
        return df

In [None]:
class ElectricityPeakAdder(BaseEstimator, TransformerMixin):
    """
    Flag morning/evening peak and off‑peak periods per weekday.

    Parameters
    ----------
    timestamp_col : str, default="timestamp"
        Name of the datetime column.
    peak_hours : Dict[int, Dict[str, List[int]]]
        Mapping from day_of_week (0=Mon…6=Sun) to two lists:
          { day: {
              "morning": [h1,h2,…],
              "evening": [h1,h2,…]
            },
            … }
        e.g. {
            0: {"morning":[9,10,11], "evening":[19,20,21]},  # Monday
            1: {"morning":[9,10,11], "evening":[19,20,21]},  # Tuesday
            …,
            6: {"morning":[10,11,12], "evening":[19,20,21]}  # Sunday
        }

    """

    def __init__(
        self,
        timestamp_col: str = "timestamp",
        peak_hours: Dict[int, Dict[str, List[int]]] = None
    ):
        """
        Initialize the peak adder.

        Parameters
        ----------
        timestamp_col : str
            Column name to read timestamps from.
        peak_hours : Dict[int, Dict[str, List[int]]], optional
            Mapping from day_of_week (0=Mon, 6=Sun) to morning/evening hours.
            If None, defaults to a common pattern for Delhi data in Q4
            2021, which can be overridden by the user.

        Raises
        ------
        TypeError
            If `timestamp_col` is not a string or `peak_hours` is not a dictionary (if not None).
        KeyError
            If `peak_hours` is not a dictionary with the correct structure,
            or if any day_of_week is missing from the mapping.
        ValueError
            If `peak_hours` does not contain "morning" and "evening" keys
            for each day_of_week or if lists are not of integers.

        """

        # Validate input of timestamp column
        if not isinstance(timestamp_col, str):
            raise TypeError("timestamp_col must be a string representing the column name.")

        self.timestamp_col = timestamp_col

        # If no peak_hours provided, use defaults
        if peak_hours is None:
            # defaults from quick analysis of Q4 2021 Delhi data
            self.peak_hours = {
                d: {"morning":[9,10,11], "evening":[19,20,21]} for d in range(1, 8)
            }
            self.peak_hours[7] = {"morning":[10,11,12], "evening":[19,20,21]}

        # Validate and set peak_hours if provided
        else:
            if not isinstance(self.peak_hours, dict):
                raise TypeError("peak_hours must be a dict mapping 1–7 to windows")
            if set(self.peak_hours) != set(range(1, 8)):
                raise KeyError("peak_hours must specify all days 1–7 (Mon–Sun)")
            for day, w in self.peak_hours.items():
                if not isinstance(w, dict) or "morning" not in w or "evening" not in w:
                    raise KeyError(f"Day {day}: must map to dict with 'morning' and 'evening'")
                try:
                    m = [int(h) for h in w["morning"]]
                    e = [int(h) for h in w["evening"]]
                except Exception:
                    raise ValueError(f"Day {day}: hours must be int-castable")
                for h in m+e:
                    if not 0 <= h < 24:
                        raise ValueError(f"Day {day}: hour {h} out of acceptable range 0–23 (inclusive)")

            self.peak_hours[day] = {"morning": sorted(set(m)), "evening": sorted(set(e))}


    def fit(self, X, y=None):
        """
        No-op fit for transformer compatibility.
        Parameters
        ----------
        X : pd.DataFrame
            Input DataFrame.
        y : Ignored
            Not used, exists for sklearn API compatibility.

        Returns
        -------
        self : ElectricityPeakAdder

        Raises
        ------
        TypeError
            If `X` is not a pandas DataFrame.
        KeyError
            If `timestamp_col` is not present in the DataFrame.

        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input X must be a pandas DataFrame.")

        if self.timestamp_col not in X:
            raise KeyError(f"Column '{self.timestamp_col}' not in DataFrame.")

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Add three columns:
          - is_morning_peak   (1 if hour in that day’s morning window)
          - is_evening_peak   (1 if hour in that day’s evening window)
          - is_offpeak        (1 otherwise)

        Parameters
        ----------
        X : pd.DataFrame
            Must contain `self.timestamp_col`.

        Returns
        -------
        X_out : pd.DataFrame
            Copy of X with new boolean flag columns.

        Raises
        ------
        KeyError
            If `self.timestamp_col` is not present in the DataFrame.
        RuntimeError
            If `fit()` was not called before `transform()`, missing attributes.
        TypeError
            If `X` is not a pandas DataFrame or if `timestamp_col` cannot be converted to datetime.

        """
        # Validate input
        # Check input is dataframe
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input X must be a pandas DataFrame.")

        # Check fit was called first
        if not hasattr(self, 'peak_hours'):
            raise RuntimeError("You must call fit() before transform(). Object missing attribute 'peak_hours'.")
        if not hasattr(self, 'timestamp_col'):
            raise RuntimeError("You must call fit() before transform(). Object missing attribute 'timestamp_col'.")

        # Check if timestamp_col exists in DataFrame
        if self.timestamp_col not in X.columns:
            raise KeyError(f"Column '{self.timestamp_col}' not found in input DataFrame")

        df = X.copy()
        dt  = pd.to_datetime(df[self.timestamp_col])
        hr  = dt.dt.hour
        dow = dt.dt.dayofweek

        # prepare empty flags
        df["is_morning_peak"] = False
        df["is_evening_peak"] = False

        # map each row
        for day, windows in self.peak_hours.items():
            mask = dow == day
            df.loc[mask, "is_morning_peak"] = hr[mask].isin(windows["morning"])
            df.loc[mask, "is_evening_peak"] = hr[mask].isin(windows["evening"])

        # off-peak = neither morning nor evening
        df["is_offpeak"] = ~(df["is_morning_peak"] | df["is_evening_peak"])

        return df

##### Metrics and Scoring

In [23]:
def mean_absolute_percentage_error(y_true, y_pred, eps: float=1e-6):
    """
    MAPE = mean(|(y_true - y_pred) / y_true|) * 100
    eps is added in denominator to avoid division by zero.
    """
    return np.mean(np.abs((y_true - y_pred) / (y_true + eps))) * 100

In [None]:
def mean_metric(df, metric):
    """ Helper function to compute mean of a metric, handling special cases like MSE.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing the metric values.
    metric : str
        Name of the metric to compute the mean for.

    Returns
    -------
    float
        The mean value of the specified metric.
    """
    if metric == 'mse':
        return (df['rmse'] ** 2).mean()
    else:
        return df[metric].mean()

##### Analysis 1 : 

#### Rebuilding R Analysis

##### Helper Functions

In [None]:
class AnalysisFeatureAdder(BaseEstimator, TransformerMixin):
    """
    Adds the core temporal and quantitative features from the original R analysis.

    New columns:
      • time_id        (HH‑MM string)                        ← R’s `p`
      • generation_sqrd  (total_generation^2)                 ← R’s `Q2`
      • log_generation (log(total_generation + ε))            ← R’s `q`
      • log_generation_sqrd     (log_generation^2)                    ← R’s `q2`
      • log_co2        (log(tons_co2 + ε))                   ← R’s `co2`

    """

    def __init__(
        self,
        timestamp_col: str = "timestamp",
        generation_col: str = "total_generation",
        co2_col: str = "tons_co2",
        epsilon: float = 1e-6
    ):
        """
        Initializes the AnalysisFeatureAdder with the specified column names and epsilon value.

        Parameters
        ----------
        timestamp_col : str
            Name of the datetime column (must be parseable by pd.to_datetime).
        generation_col : str
            Name of the total‐generation column (R’s `total_generation` or `Q`).
        co2_col : str
            Name of the emissions column (R’s `tons_co2` or `CO2`).
        epsilon : float, default=1e-6
            Small constant to avoid log(0).

        Returns
        -------
        None
        Initializes the AnalysisFeatureAdder instance with the specified parameters.

        Raises
        -------
        ValueError
            If any of the specified columns are not strings or if epsilon is not a float or int.
        """
        # input validation
        if not isinstance(timestamp_col, str):
            raise ValueError("timestamp_col must be a string")
        if not isinstance(generation_col, str):
            raise ValueError("generation_col must be a string")
        if not isinstance(co2_col, str):
            raise ValueError("co2_col must be a string")
        if not isinstance(epsilon, (float, int)):
            raise ValueError("epsilon must be a float or int")

        # assign parameters
        self.timestamp_col = timestamp_col
        self.generation_col = generation_col
        self.co2_col = co2_col
        self.epsilon = epsilon

    def fit(self, X, y: pd.Series = None):
        # self._y = y  # store y for later use in transform
        # No fitting needed
        return self

    def transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Transforms the input DataFrame by adding new features.

        Parameters
        ----------
        X : pd.DataFrame
            Input DataFrame containing the data to be transformed.
        y : None, optional
            Not used, present for compatibility with scikit-learn's TransformerMixin.
        Returns
        -------
        pd.DataFrame
            Transformed DataFrame with new features added.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input must be a pandas DataFrame")

        df = X.copy()

        # Ensure timestamp and generation columns are present
        for col in [self.timestamp_col, self.generation_col]:
            if col not in df.columns:
                raise ValueError(f"Missing required column '{col}' in input DataFrame")

        # Ensure timestamp is datetime
        df[self.timestamp_col] = pd.to_datetime(df[self.timestamp_col])

        # ── Temporal features ─────────────────────────────────

        # HH‑MM string for grouping (R’s `p`)
        df["time_id"] = df[self.timestamp_col].dt.strftime("%H-%M").astype("string")

        # ── Quantitative features ───────────────────────────────
        # generation_sq = total_generation^2 (R’s Q2)
        df[f"{self.generation_col}_sqrd"] = df[self.generation_col] ** 2

        # log_generation = log(total_generation + ε) (R’s q)
        df[f"log_{self.generation_col}"] = np.log(df[self.generation_col] + self.epsilon)

        # log_generation_sqrd = (log_generation)^2 (R’s q2)
        df[f"log_{self.generation_col}_sqrd"] = df[f"log_{self.generation_col}"] ** 2

        if self.co2_col in df.columns:
                    df[f"log_{self.co2_col}"] = np.log(df[self.co2_col] + self.epsilon)

        return df

    # not actually used in the analysis, but generated in the R code
    # df['month_time_id'] = df['timestamp'].dt.strftime("%m-%H-%M")

    # Not in the original analysis, but potentially useful for time series analysis
    # df['day_of_week'] = df['timestamp'].dt.dayofweek
    # df['day_of_month'] = df['timestamp'].dt.day
    # df['week_of_year'] = df['timestamp'].dt.isocalendar().week

###### MultiQuantileBinner

In [None]:
class MultiQuantileBinner(BaseEstimator, TransformerMixin):
    """
    Given bin_specs = {'v1': 5, 'v2': 4, ...}, on fit() learn the quantile edges for each var.
    On transform() assign var_bin ∈ [1..n_bins] and compute a single group ID 'k'
    by treating the vector of bins as digits in a mixed‑radix number.
    """
    def __init__(self, bin_specs: dict[str,int], group_col_name: str = "quantile_group_id", retain_flags: bool = True):
        """
        Initialize the MultiQuantileBinner with specifications for binning.

        Parameters
        ----------
        bin_specs : dict[str, int]
            A dictionary where keys are variable names and values are the number of bins to create for each variable.
            Example: {'var1': 5, 'var2': 4} means
            'var1' will be binned into 5 quantiles and 'var2' will be binned into 4 quantiles.
        Raises
        ------
        ValueError
            If bin_specs is empty or not a dictionary.
        TypeError
            If bin_specs keys are not strings or values are not integers.

        """
        # Validate bin_specs
        # Ensure that it is not empty and is a dictionary
        if not isinstance(bin_specs, dict) or not bin_specs:
            raise ValueError("bin_specs must be a non-empty dictionary")

        # Esnure that all keys are strings
        if not all(isinstance(k, str) for k in bin_specs.keys()):
            raise TypeError("All keys in bin_specs must be strings")

        # Ensure that all values are positive integers or can be converted to positive integers
        # store the bin specifications
        self.bin_specs = self.validate_and_convert_bins(bin_specs)
        # dictionary to hold the bin edges for each variable
        self.bins_: dict[str, np.ndarray] = {}
        # column name for the group ID
        self.group_col_name = group_col_name
        self.quantile_edges_: dict[str, list[float]] = {}
        # whether to retain the individual bin flags in the output DataFrame
        self.retain_flags = retain_flags



    def fit(self, X: pd.DataFrame, y=None):
        """
        Fit the MultiQuantileBinner to the DataFrame X.
        This method computes the quantile edges for each variable specified in bin_specs.

        Parameters
        ----------
        X : pd.DataFrame
            The input DataFrame containing the variables to be binned.
        y : None, optional
            Ignored, exists for compatibility with scikit-learn's API.
        Returns
        -------
        self : MultiQuantileBinner
            Returns the instance itself after fitting.
        """
        self.quantile_edges_ = {}
        epsilon = 1e-4  # small padding to avoid edge issues


        # compute bin edges for each variable
        for var, n_bins in self.bin_specs.items():
            if not isinstance(var, str):
                raise TypeError(f"Variable name '{var}' from bin_specs is not a string.\nCheck the instantiation of MultiQuantileBinner and handling of the bin_specs argument.")

            if var not in X.columns:
                raise ValueError(f"Column '{var}' not in DataFrame")

            # # pandas.qcut returns (labels, bin_edges)
            # _, edges = pd.qcut(X[var], q=n_bins, retbins=True, duplicates="drop")
            # self.bins_[var] = edges
            # logging.info(f"Fitted {n_bins} quantiles for '{var}'")

            # Compute quantile bin edges (include 0.0 and 1.0 for full coverage)
            quantiles = np.linspace(0, 1, n_bins + 1)
            raw_edges = X[var].quantile(quantiles, interpolation="midpoint").values

            # ensure full range is captured
            min_val, max_val = X[var].min(), X[var].max()
            edges = np.unique(
                np.concatenate([
                    [min_val - epsilon],
                    raw_edges,
                    [max_val + epsilon]
                ])
            )
            edges.sort()

            self.quantile_edges_[var] = edges.tolist()

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transform the DataFrame X by assigning each variable to its respective bin
        and computing a single group ID 'k' based on the bin assignments.

        Parameters
        ----------
        X : pd.DataFrame
            The input DataFrame containing the variables to be binned.
        Returns
        -------
        pd.DataFrame
            A DataFrame with the original columns plus additional columns for each variable's bin assignment
            and a combined group ID 'k'.
        Raises
        ------
        ValueError
            If the DataFrame X does not contain all the variables specified in bin_specs.
        TypeError
            If the input X is not a pandas DataFrame.

        """

        if not self.quantile_edges_:
            raise RuntimeError("Must fit binner before transform.")
        df = X.copy()
        # assign each var to its bin
        flags = []
        for var, edges in self.quantile_edges_.items():
            df[f"{var}_group"] = pd.cut(df[var], bins=edges, labels=(range(1, len(edges))), include_lowest=True, right=True)

            if var == "month":
                # Just cast to categorical with 1–12 regardless of data contents
                df[f"{var}_group"] = pd.Categorical(
                    df[var].astype(int),  # ensure int
                    categories=range(1, 13),  # 1 to 12
                    ordered=True
                )
                flags.append(df[f"{var}_group"].astype(int))
                continue

            # Check for NaNs (i.e., value outside learned bin edges)
            # if df[f"{var}_group"].isna().any():
            #     bad_values = df.loc[df[f"{var}_group"].isna(), var].unique()
            #     raise ValueError(
            #         f"Values {bad_values} in column '{var}' fall outside the fitted bin edges: {edges}. "
            #         "This usually happens if the test/validation set has more extreme values than the training set. "
            #         "Try adjusting the quantile interpolation or using wider bins."
            #     )
            if df[f"{var}_group"].isna().any():
                # Extract and log the out-of-bounds values
                oob_values = df.loc[df[f"{var}_group"].isna(), var]
                unique_oob = oob_values.unique()
                min_val, max_val = oob_values.min(), oob_values.max()

                logging.warning(
                    f"[MultiQuantileBinner] OOB detected in '{var}' — {len(oob_values)} values. "
                    f"Range: {min_val:.3f} to {max_val:.3f}. Clipping to edge bins."
                )
                logging.debug(f"OOB values for '{var}': {unique_oob}")

                # Clip behavior: Add categories only if needed
                existing = set(df[f"{var}_group"].cat.categories)
                to_add = [c for c in [1, len(edges) - 1] if c not in existing]
                if to_add:
                    df[f"{var}_group"] = df[f"{var}_group"].cat.add_categories(to_add)
                df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


            flags.append(df[f"{var}_group"].astype(int))


        # Combine flags into group ID
        df[self.group_col_name] = 1  # Start from 1
        for i, flag in enumerate(flags[::-1]):  # least significant last
            radix = np.prod([len(self.quantile_edges_[v]) - 1 for v in list(self.bin_specs.keys())[i+1:]])
            df[self.group_col_name] += (flag - 1) * radix

        if not self.retain_flags:
            for var in self.bin_specs:
                del df[f"{var}_group"]
        return df

    @staticmethod
    def validate_and_convert_bins(bin_specs):
        """
        Convert and validate bin_specs to positive integers.

        Parameters
        ----------
        bin_specs : dict
            A dictionary where keys are variable names and values are the number of bins to create for each variable.

        Returns
        -------
        dict
            A dictionary with the same keys as bin_specs, but with values converted to positive integers.

        Raises
        ------
        ValueError
            If any value in bin_specs cannot be converted to a positive integer or is not a positive integer.
        """
        converted = {}

        for k, v in bin_specs.items():

            try:
                # Convert to integer (works for strings, floats, numpy numbers)
                v_int = int(float(v))  # Handle strings and floats first
                if v_int != float(v):  # Check if original was truly integer
                    raise ValueError
                if v_int <= 0:
                    raise ValueError("Value must be positive")
                converted[k] = v_int

            except (ValueError, TypeError) as e:
                raise TypeError(
                    f"Bin spec '{k}' value '{v}' cannot be converted to positive integer"
                ) from e

        return converted


###### MultiMedianBinner

In [None]:
class MultiMedianBinner(BaseEstimator, TransformerMixin):
    """
    Split N variables at their medians, emitting per-variable flags and
    a combined group ID.

    Each variable produces a binary flag (Below=0, Above=1). These flags
    are treated as bits in a binary number (least significant bit = last
    variable), and then shifted to 1-based indexing so group IDs run from
    1 to 2**n_vars.

    Parameters
    ----------
    variables : list of str
        Names of the DataFrame columns to split at their medians.
    group_col_name : str, default="median_group_id"
        Name of the output column that will contain the combined group ID.
    retain_flags : bool, default=True
        Whether to keep the individual per-variable flag columns
        (named `<variable>_group`). If False, only the combined group
        ID column is kept.

    Attributes
    ----------
    medians_ : dict of {str: float}
        The fitted median for each variable in ``variables``. Populated
        after calling ``fit``.

    Raises
    ------
    ValueError
        If ``variables`` is empty, or if any entry is not a string.
    TypeError
        If ``group_col_name`` is not a non-empty string, or
        if ``retain_flags`` is not a bool.

    See Also
    --------
    sklearn.pipeline.Pipeline : For chaining this transformer with others.
    """

    def __init__(
        self,
        variables: list[str],
        group_col_name: str = "median_group_id",
        retain_flags: bool = True
    ):
        """
        Initialize the binner.

        Parameters
        ----------
        variables : list of str
            Columns to median‑split.
        group_col_name : str
            Name of combined‑ID output column.
        retain_flags : bool
            If True, keep each `<var>_group` flag column.
        """
        # Validate variables list
        if not isinstance(variables, list) or len(variables) == 0:
            raise ValueError("`variables` must be a non-empty list of column names.")
        if any(not isinstance(v, str) for v in variables):
            raise TypeError("All entries in `variables` must be strings.")

        # Validate group_col_name
        if not isinstance(group_col_name, str) or not group_col_name:
            raise TypeError("`group_col_name` must be a non-empty string.")

        # Validate retain_flags
        if not isinstance(retain_flags, bool):
            raise TypeError("`retain_flags` must be a boolean value.")

        self.variables = variables
        self.group_col_name = group_col_name
        self.retain_flags = retain_flags
        self.medians_: dict[str, float] = {}


    def fit(self, X, y=None):
        """
        Learn the medians of each variable.

        Parameters
        ----------
        X : DataFrame
            Input table containing the columns in `self.variables`.
        y : ignored
            For API compatibility.

        Returns
        -------
        self : MultiMedianBinner
        """
        # Validate input
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input X must be a pandas DataFrame.")

        # Column existence
        missing = [v for v in self.variables if v not in X.columns]
        if missing:
            raise ValueError(f"Columns not found in input DataFrame: {missing}")

        # Compute medians
        self.medians_ = X[self.variables].median(skipna=True).to_dict()
        for var, med in self.medians_.items():
            logging.info(f"Fitted median for '{var}': {med:.4f}")
        return self

    def transform(self, X):
        """
        Assign Above/Below flags and compute the combined group ID.

        Parameters
        ----------
        X : DataFrame
            Must include the same columns you passed to `fit`.

        Returns
        -------
        X_out : DataFrame
            Copy of X with new `<var>_group` (if retained) and
            `self.group_col_name` columns.
        """
        # Ensure fit() was called
        if not hasattr(self, 'medians_') or not self.medians_:
            raise RuntimeError("Must call fit() before transform().")

        # Type check
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input X must be a pandas DataFrame.")

        # Column existence
        missing = [v for v in self.variables if v not in X.columns]
        if missing:
            raise ValueError(f"Columns missing at transform time: {missing}")

        df = X.copy()

        # Build binary flags: 0=Below median, 1=Above median
        flags = (df[self.variables] > pd.Series(self.medians_)).astype(int)

        # Compute mixed‑radix group ID
        multipliers = 2 ** np.arange(len(self.variables))[::-1]
        df[self.group_col_name] = flags.values.dot(multipliers) + 1
        logging.info(
            f"Created combined group '{self.group_col_name}' with "
            f"{df[self.group_col_name].nunique()} unique values"
        )

        # Optionally retain individual flags
        if self.retain_flags:
            for var in self.variables:
                df[f"{var}_group"] = flags[var]

        return df


    def get_feature_names_out(self, input_features=None):
        """
        Produce the output column names after transformation.

        Parameters
        ----------
        input_features : list of str, optional
            Ignored; exists for compatibility with sklearn.

        Returns
        -------
        names : ndarray of str
        """
        names = []
        if self.retain_flags:
            names += [f"{v}_group" for v in self.variables]
        names.append(self.group_col_name)
        return np.array(names)


###### GroupwiseRegressor

In [None]:
class GroupwiseRegressor(BaseEstimator, TransformerMixin):
    """
    Runs separate OLS regressions in each group and computes marginal emission factors.

    For each group k, we fit:
        y_t = α₁ₖ · x₁_t + α₂ₖ · x₂_t + Σ β_i·C(f_i)_t + ε_t
    and compute the marginal effect:
        ME_t = ∂y_t/∂x₁_t = α₁ₖ + 2·α₂ₖ·x₁_t.

    Parameters
    ----------
    y_var : str
        Target column name (e.g. 'tons_co2').
    x_vars : List[str]
        Predictor columns; first is Q, second is Q².
    fe_vars : List[str], optional
        Categorical fixed-effect columns.
    group_col : str
        Column with integer group IDs.
    min_group_size : int
        Minimum observations per group to run regression.
    track_metrics : bool
        If True, store per-group models and metrics.
    verbose : bool
        If True, log progress and metrics.

    Attributes
    ----------
    group_models_ : dict
        Fitted statsmodels results per group (if track_metrics=True).
    group_metrics_ : dict
        Computed metrics per group (if track_metrics=True).
    """
    def __init__(
        self,
        y_var: str = "tons_co2",
        x_vars: List[str] = ["total_generation", "total_generation_sqrd"],
        fe_vars: Optional[List[str]] = None,
        group_col: str = "k",
        min_group_size: int = 10,
        track_metrics: bool = True,
        verbose: bool = True,
        random_state: int = 12
    ):
        # Parameter validation
        if not isinstance(y_var, str):
            raise TypeError("y_var must be a string")
        if not isinstance(x_vars, list) or not x_vars or not all(isinstance(v, str) for v in x_vars):
            raise TypeError("x_vars must be a non-empty list of strings")
        if fe_vars is not None and (not isinstance(fe_vars, list) or not all(isinstance(v, str) for v in fe_vars)):
            raise TypeError("fe_vars must be a list of strings or None")
        if not isinstance(group_col, str):
            raise TypeError("group_col must be a string")
        if not isinstance(min_group_size, int) or min_group_size < 1:
            raise ValueError("min_group_size must be a positive integer")
        if not isinstance(track_metrics, bool):
            raise TypeError("track_metrics must be a boolean")
        if not isinstance(verbose, bool):
            raise TypeError("verbose must be a boolean")

        self.y_var = y_var
        self.x_vars = x_vars
        self.fe_vars = fe_vars or []
        self.group_col = group_col
        self.min_group_size = min_group_size
        self.track_metrics = track_metrics
        self.verbose = verbose
        self.random_state = random_state
        if self.track_metrics:
            self.group_models_ = {}
            self.group_metrics_ = {}

    def fit(self, X, y=None):
        if self.random_state is not None:
            np.random.seed(self.random_state)
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X must be a pandas DataFrame")
        if y is None:
            raise ValueError("y must be provided for fitting")

        df = X.copy()
        df[self.y_var] = y.values
        uint_cols = [
            col for col in df.columns
            if str(df[col].dtype).startswith(('uint', 'UInt'))
        ]

        if uint_cols:
            df[uint_cols] = df[uint_cols].astype('int64')

        if self.track_metrics:
            self.group_models_ = {}
            self.group_metrics_ = {}

        self._fitted_groups = []

        if 'month' in self.fe_vars:
            df['month'] = pd.Categorical(
                df['month'].astype(int),
                categories=range(1, 13),
                ordered=True
            )
        if 'hour' in self.fe_vars:
            df['hour'] = pd.Categorical(
            df['hour'].astype(int),
            categories=range(24),  # 0-23 for hours
            ordered=True
        )
        if 'day_of_week' in self.fe_vars:
            df['day_of_week'] = pd.Categorical(
                df['day_of_week'].astype(int),
                categories=range(1, 8),  # 1-7 for days of the week
                ordered=True
            )
        if 'week_of_year' in self.fe_vars:
            df['week_of_year'] = pd.Categorical(
                df['week_of_year'].astype(int),
                categories=range(1, 54),  # 1-53 for weeks of the year
                ordered=True
            )
        if 'half_hour' in self.fe_vars:
            df['half_hour'] = pd.Categorical(
                df['half_hour'].astype(int),
                categories=range(0, 48),  # 0-47 for half-hour intervals
                ordered=True
            )


        for grp, df_grp in df.groupby(self.group_col, sort=True):
            n = len(df_grp)
            if n < self.min_group_size:
                if self.verbose:
                    logging.warning(f"Skipping group {grp!r}: only {n} < {self.min_group_size}")
                continue

            # Regression formula
            reg = " + ".join(self.x_vars)
            fe = " + ".join(f"C({f})" for f in self.fe_vars)
            formula = f"{self.y_var} ~ {reg}" + (f" + {fe}" if fe else "")

            model = smf.ols(formula, data=df_grp).fit()
            self._fitted_groups.append(grp)

            if self.track_metrics:
                preds = model.predict(df_grp)
                self.group_models_[grp] = model
                rmse = np.sqrt(((preds - df_grp[self.y_var]) ** 2).mean())
                mae = np.abs(preds - df_grp[self.y_var]).mean()
                mape = mean_absolute_percentage_error(df_grp[self.y_var], preds)
                self.group_metrics_[grp] = {
                    'r2': model.rsquared,
                    'rmse': rmse,
                    'mae': mae,
                    'mape': mape,
                    'n_obs': n
                }

        if not self._fitted_groups:
            raise ValueError("No valid groups found for fitting.")
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Apply groupwise OLS and compute marginal effects ME_t.

        Parameters
        ----------
        X : pd.DataFrame
            Must contain y_var, x_vars, fe_vars, and group_col.

        Returns
        -------
        pd.DataFrame
            Original rows plus 'alpha1', 'alpha2', and 'ME'.

        Raises
        ------
        TypeError
            If X is not a pandas DataFrame.
        ValueError
            If required columns missing or no group qualifies.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input X must be a pandas DataFrame")

        df = X.copy()

        if 'month' in self.fe_vars:
            df['month'] = pd.Categorical(
                df['month'].astype(int),
                categories=range(1, 13),
                ordered=True
            )
        if 'hour' in self.fe_vars:
            df['hour'] = pd.Categorical(
            df['hour'].astype(int),
            categories=range(24),  # 0-23 for hours
            ordered=True
        )
        if 'day_of_week' in self.fe_vars:
            df['day_of_week'] = pd.Categorical(
                df['day_of_week'].astype(int),
                categories=range(7),  # 0-6 for days of the week
                ordered=True
            )
        if 'week_of_year' in self.fe_vars:
            df['week_of_year'] = pd.Categorical(
                df['week_of_year'].astype(int),
                categories=range(1, 54),  # 1-53 for weeks of the year
                ordered=True
            )
        if 'half_hour' in self.fe_vars:
            df['half_hour'] = pd.Categorical(
                df['half_hour'].astype(int),
                categories=range(0, 48),  # 0-47 for half-hour intervals
                ordered=True
            )

        df['alpha1'] = np.nan
        df['alpha2'] = np.nan
        df['ME'] = np.nan

        for grp, df_grp in df.groupby(self.group_col, sort=True):
            model = self.group_models_.get(grp)
            if model is None:
                continue
            a1 = model.params.get(self.x_vars[0], np.nan)
            a2 = model.params.get(self.x_vars[1], 0.0)
            indices = df_grp.index

            df.loc[indices, 'alpha1'] = a1
            df.loc[indices, 'alpha2'] = a2
            df.loc[indices, 'ME'] = a1 + 2 * a2 * df_grp[self.x_vars[0]]

        return df


    def predict(self, X: pd.DataFrame) -> pd.Series:
        """
        Predict marginal effects for each row in X using the fitted group models.

        Parameters
        ----------
        X : pd.DataFrame
            Must contain the same columns used in fit/transform.

        Returns
        -------
        pd.Series
            Series of predicted marginal effects (ME_t) for each row.

        Raises
        ------
        TypeError
            If X is not a pandas DataFrame.
        ValueError
            If required columns are missing or no group qualifies.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input X must be a pandas DataFrame")
        if 'month' in self.fe_vars:
            X = X.copy()
            X['month'] = pd.Categorical(
                X['month'].astype(int),
                categories=range(1, 13),
                ordered=True
            )

        required = [self.y_var] + self.x_vars + self.fe_vars + [self.group_col]
        missing = [c for c in required if c not in X.columns]
        if missing:
            raise ValueError(f"Missing columns in input DataFrame: {missing}")

        df = X.copy()
        df['ME'] = np.nan

        for grp, df_grp in df.groupby(self.group_col, sort=True):
            model = self.group_models_.get(grp)
            if model is not None:
                df.loc[df[self.group_col] == grp, 'ME'] = model.predict(df_grp)

        return df['ME']

    def get_metrics(self, summarise: bool = True) -> Union[dict, pd.DataFrame]:
        """
        Get the metrics for each group.

        Parameters
        ----------
        summarise : bool, default=True
            If True, return a summary DataFrame; otherwise return raw metrics dict.

        Returns
        -------
        dict or pd.DataFrame
            If summarise=True, returns a DataFrame with group metrics.
            If False, returns the raw metrics dictionary.

        Raises
        ------
        RuntimeError
            If track_metrics was not set to True during initialization.
        """
        if not self.track_metrics:
            raise RuntimeError("Metrics tracking is disabled. Set track_metrics=True to enable.")

        if summarise:
            df = pd.DataFrame.from_dict(self.group_metrics_, orient='index')
            df.index.name = self.group_col
            df.reset_index(inplace=True)
            return df
        else:
            return self.group_metrics_

##### Manual Analysis

In [29]:
feature_addition_pipeline = Pipeline([
    ("Add_Datetime_Features", DateTimeFeatureAdder(timestamp_col="timestamp")),
    ("Add_Original_Analysis_Features", AnalysisFeatureAdder(timestamp_col="timestamp", generation_col="total_generation", co2_col="tons_co2")),
])
feature_addition_pipeline.name = "FeatureAdditionPipeline"
train_original_added_features_df = feature_addition_pipeline.fit_transform(train_pdf)

In [30]:
print(feature_addition_pipeline.name)

FeatureAdditionPipeline


In [31]:
# Columns in the training set (post feature transformation):
print("Columns in Training Set (post feature transformation):")
print(train_original_added_features_df.columns)

Columns in Training Set (post feature transformation):
Index(['timestamp', 'weather_longitude', 'weather_latitude', 'city',
       'era5_longitude', 'era5_latitude', 'distance_btwn_weather_and_era5_m',
       'gap_filling', 'temperature', 'precipitation_mm',
       'surface_net_solar_radiation_kwh',
       'surface_solar_radiation_downwards_kwh', 'total_cloud_cover',
       'high_cloud_cover', 'medium_cloud_cover', 'low_cloud_cover',
       'wind_speed', 'wind_direction', 'thermal_generation', 'gas_generation',
       'hydro_generation', 'nuclear_generation', 'renewable_generation',
       'total_generation', 'tons_co2', 'tonnes_CO2_emissions_year_specific',
       'CO2_difference', 'relative_CO2_difference', 'g_co2_per_kwh',
       'tons_co2_per_mwh', 'thermal_emission_factor_tonnes_CO2_per_mwh',
       'gas_emission_factor_tonnes_CO2_per_mwh', 'year', 'month',
       'week_of_year', 'day', 'hour', 'half_hour', 'day_of_week', 'is_weekend',
       'time_id', 'total_generation_sqrd', 'l

In [32]:
# Original Columns from the R Analysis
# y_var = "tons_co2"
# x_vars = Q : "total_generation", Q2 : "total_generation_sqrd"
# fe_vars = mo: "month", h: "hour"
# group_col = "k" (from MultiQuantileBinner on ssr : "surface_net_solar_radiation_kwh" and v2: "wind_speed"

# Dropping to only the relevant columns for reproducing MultiQuantileBinner & Regresion
x_original_relevant_columns = [
    "total_generation", "total_generation_sqrd",
    "surface_net_solar_radiation_kwh", "wind_speed",
    "month", "hour",
]
y_original_relevant_columns = ["tons_co2"]

x_original_train_added_features_df = train_original_added_features_df[x_original_relevant_columns]
y_original_train_added_features_df = train_original_added_features_df[y_original_relevant_columns]

# confirm the columns in the DataFrame
print("Columns in x_original_train_added_features_df:")
print(x_original_train_added_features_df.columns)
print("Columns in y_original_train_added_features_df:")
print(y_original_train_added_features_df.columns)

Columns in x_original_train_added_features_df:
Index(['total_generation', 'total_generation_sqrd',
       'surface_net_solar_radiation_kwh', 'wind_speed', 'month', 'hour'],
      dtype='object')
Columns in y_original_train_added_features_df:
Index(['tons_co2'], dtype='object')


In [33]:
# Add the same features to the validation and test sets
validation_added_features_df = feature_addition_pipeline.transform(validation_pdf)
test_added_features_df = feature_addition_pipeline.transform(test_pdf)

# Split to X and y for validation and test sets
x_validation_added_features_df = validation_added_features_df[x_original_relevant_columns]
y_validation_added_features_df = validation_added_features_df[y_original_relevant_columns]
x_test_added_features_df = test_added_features_df[x_original_relevant_columns]
y_test_added_features_df = test_added_features_df[y_original_relevant_columns]


###### MANUAL Multi - Quantile Binning Model

In [34]:
# Run the MultiQuantileBinner with original parameters
original_multi_binner = MultiQuantileBinner(
    bin_specs={
        "surface_net_solar_radiation_kwh": 5,
        "wind_speed": 5,
    },
    group_col_name="original_quantile_group_id"
)
# Fit the binner on the x_original_train_added_features_df
original_multi_binner.fit(x_original_train_added_features_df)

# Transform the DataFrame to get the group IDs
x_original_multi_binner_train_added_features_df = original_multi_binner.transform(x_original_train_added_features_df)

# Checking the columns in the binned DataFrame
print("Columns in x_original_multi_binner_train_added_features_df:")
print(x_original_multi_binner_train_added_features_df.columns)

Columns in x_original_multi_binner_train_added_features_df:
Index(['total_generation', 'total_generation_sqrd',
       'surface_net_solar_radiation_kwh', 'wind_speed', 'month', 'hour',
       'surface_net_solar_radiation_kwh_group', 'wind_speed_group',
       'original_quantile_group_id'],
      dtype='object')


In [35]:
# Run the GroupwiseRegressor with these original parameters
original_quantile_regressor = GroupwiseRegressor(
    y_var="tons_co2",
    x_vars=["total_generation", "total_generation_sqrd"],
    fe_vars=["month", "hour"],
    group_col="original_quantile_group_id",
    min_group_size=10,
    track_metrics=True,
    verbose=True
)
# Fit and transform the binned DataFrame
original_quantile_regressor_result_df = original_quantile_regressor.fit_transform(x_original_multi_binner_train_added_features_df, y_original_train_added_features_df)
# Checking the columns in the result DataFrame
print("Columns in result_df:")
print(original_quantile_regressor_result_df.columns)



Columns in result_df:
Index(['total_generation', 'total_generation_sqrd',
       'surface_net_solar_radiation_kwh', 'wind_speed', 'month', 'hour',
       'surface_net_solar_radiation_kwh_group', 'wind_speed_group',
       'original_quantile_group_id', 'alpha1', 'alpha2', 'ME'],
      dtype='object')


In [36]:
# Binning the validation and test sets
x_original_multi_binner_validation_added_features_df = original_multi_binner.transform(x_validation_added_features_df)
x_original_multi_binner_test_added_features_df = original_multi_binner.transform(x_test_added_features_df)

# Run the GroupwiseRegressor on the validation set
original_quantile_regressor_validation_result_df = original_quantile_regressor.transform(pd.concat([x_original_multi_binner_validation_added_features_df, y_validation_added_features_df], axis=1))
# Run the GroupwiseRegressor on the test set
original_quantile_regressor_test_result_df = original_quantile_regressor.transform(pd.concat([x_original_multi_binner_test_added_features_df, y_test_added_features_df], axis=1))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1
The behavior will change in pandas 3.0. This inplace method will never work because 

In [37]:
original_quantile_regressor_test_result_df

Unnamed: 0,total_generation,total_generation_sqrd,surface_net_solar_radiation_kwh,wind_speed,month,hour,surface_net_solar_radiation_kwh_group,wind_speed_group,original_quantile_group_id,tons_co2,alpha1,alpha2,ME
0,211533.250000,4.474632e+10,2.076698,7.743866,6,6,4,5,32.0,82239.76420,0.584765,-5.798672e-07,0.339443
1,221341.333333,4.899199e+10,6.182488,4.490737,6,23,6,3,20.0,82649.01790,0.653027,-7.669251e-07,0.313523
2,199481.750000,3.979297e+10,2.144150,6.845677,6,6,4,5,32.0,80018.71225,0.584765,-5.798672e-07,0.353419
3,192567.083333,3.708208e+10,6.043013,3.112697,6,19,6,2,13.0,74628.74880,0.468242,-2.641633e-07,0.366503
4,203072.666667,4.123851e+10,0.202346,4.491776,6,2,2,3,16.0,81194.67905,0.820634,-1.289248e-06,0.297012
...,...,...,...,...,...,...,...,...,...,...,...,...,...
764635,179721.000000,3.229964e+10,4.510326,6.770185,5,10,5,5,33.0,61648.51620,0.623601,-7.038707e-07,0.370601
764636,199069.833333,3.962880e+10,4.695296,5.157878,5,11,6,4,27.0,51125.90030,0.749501,-1.051464e-06,0.330871
764637,210525.166667,4.432085e+10,4.903509,2.167075,5,20,6,2,13.0,72662.22250,0.468242,-2.641633e-07,0.357016
764638,191024.250000,3.649026e+10,0.503446,1.136288,5,3,3,2,10.0,64883.14425,0.875432,-1.433724e-06,0.327680


###### MANUAL Median Binning Models

In [38]:
# Run the MultiMedianBinner with original parameters
original_median_binner = MultiMedianBinner(
    variables=[
        "surface_net_solar_radiation_kwh",
        "wind_speed"
    ],
    group_col_name="median_group_id",
)
# Fit the binner on the x_original_train_added_features_df
original_median_binner.fit(x_original_train_added_features_df)

# Transform the DataFrame to get the group IDs
x_original_median_binner_train_added_features_df = original_median_binner.transform(x_original_train_added_features_df)

# Checking the columns in the binned DataFrame
print("Columns in x_original_median_binner_train_added_features_df:")
print(x_original_median_binner_train_added_features_df.columns)

2025-07-28 17:14:04,904 INFO Fitted median for 'surface_net_solar_radiation_kwh': 2.8178
2025-07-28 17:14:04,905 INFO Fitted median for 'wind_speed': 5.3699
2025-07-28 17:14:04,951 INFO Created combined group 'median_group_id' with 4 unique values


Columns in x_original_median_binner_train_added_features_df:
Index(['total_generation', 'total_generation_sqrd',
       'surface_net_solar_radiation_kwh', 'wind_speed', 'month', 'hour',
       'median_group_id', 'surface_net_solar_radiation_kwh_group',
       'wind_speed_group'],
      dtype='object')


In [39]:
# Run the GroupwiseRegressor with these original parameters
original_median_regressor = GroupwiseRegressor(
    y_var="tons_co2",
    x_vars=["total_generation", "total_generation_sqrd"],
    fe_vars=["month", "hour"],
    group_col="median_group_id",
    min_group_size=10,
    track_metrics=True,
    verbose=True
)
# Fit and transform the binned DataFrame
result_df = original_median_regressor.fit_transform(x_original_median_binner_train_added_features_df, y_original_train_added_features_df)
# Checking the columns in the result DataFrame
print("Columns in result_df:")
print(result_df.columns)

Columns in result_df:
Index(['total_generation', 'total_generation_sqrd',
       'surface_net_solar_radiation_kwh', 'wind_speed', 'month', 'hour',
       'median_group_id', 'surface_net_solar_radiation_kwh_group',
       'wind_speed_group', 'alpha1', 'alpha2', 'ME'],
      dtype='object')


In [40]:
# Binning the validation and test sets
x_original_median_binner_validation_added_features_df = original_median_binner.transform(x_validation_added_features_df)
x_original_median_binner_test_added_features_df = original_median_binner.transform(x_test_added_features_df)

# Run the GroupwiseRegressor on the validation set
original_median_regressor_validation_result_df = original_median_regressor.transform(pd.concat([x_original_median_binner_validation_added_features_df, y_validation_added_features_df], axis=1))
# Run the GroupwiseRegressor on the test set
original_median_regressor_test_result_df = original_median_regressor.transform(pd.concat([x_original_median_binner_test_added_features_df, y_test_added_features_df], axis=1))

2025-07-28 17:14:14,381 INFO Created combined group 'median_group_id' with 4 unique values
2025-07-28 17:14:14,402 INFO Created combined group 'median_group_id' with 4 unique values


In [41]:
original_median_regressor_test_result_df

Unnamed: 0,total_generation,total_generation_sqrd,surface_net_solar_radiation_kwh,wind_speed,month,hour,median_group_id,surface_net_solar_radiation_kwh_group,wind_speed_group,tons_co2,alpha1,alpha2,ME
0,211533.250000,4.474632e+10,2.076698,7.743866,6,6,2,0,1,82239.76420,0.623953,-6.883761e-07,0.332724
1,221341.333333,4.899199e+10,6.182488,4.490737,6,23,3,1,0,82649.01790,0.623381,-6.333639e-07,0.343001
2,199481.750000,3.979297e+10,2.144150,6.845677,6,6,2,0,1,80018.71225,0.623953,-6.883761e-07,0.349316
3,192567.083333,3.708208e+10,6.043013,3.112697,6,19,3,1,0,74628.74880,0.623381,-6.333639e-07,0.379451
4,203072.666667,4.123851e+10,0.202346,4.491776,6,2,1,0,0,81194.67905,0.814486,-1.259673e-06,0.302876
...,...,...,...,...,...,...,...,...,...,...,...,...,...
764635,179721.000000,3.229964e+10,4.510326,6.770185,5,10,4,1,1,61648.51620,0.639025,-7.296092e-07,0.376773
764636,199069.833333,3.962880e+10,4.695296,5.157878,5,11,3,1,0,51125.90030,0.623381,-6.333639e-07,0.371213
764637,210525.166667,4.432085e+10,4.903509,2.167075,5,20,3,1,0,72662.22250,0.623381,-6.333639e-07,0.356703
764638,191024.250000,3.649026e+10,0.503446,1.136288,5,3,1,0,0,64883.14425,0.814486,-1.259673e-06,0.333230


In [42]:
print(original_median_regressor.get_metrics(summarise=True))

   median_group_id        r2         rmse          mae      mape   n_obs
0                1  0.806505  3601.747729  1984.368615  3.731510  607540
1                2  0.764838  4101.291028  2182.926773  4.835096  559271
2                3  0.770935  4157.928892  2294.514117  4.870356  559265
3                4  0.755731  4036.804182  2316.440845  4.714747  607534


##### New Development

###### FUNCTIONS: Utilities - Logging & Scoring

In [None]:
def summarise_metrics_logs(train_logs: pd.DataFrame,
                           val_logs:   pd.DataFrame,
                           test_logs:  pd.DataFrame,
                           user_pipeline:   Pipeline,
                           x_columns:  list,
                           random_state: int = 12) -> pd.DataFrame:
    """
    Summarises the metrics and metadata from training, validation, and test logs into a single dataframe

    Parameters
    ----------
    train_logs : pd.DataFrame
        DataFrame containing training logs with columns ['group', 'r2', 'rmse', 'mae', 'mape', 'n_obs', 'model_id_hash', 'log_time'].
    val_logs : pd.DataFrame
        DataFrame containing validation logs with the same structure as train_logs.
    test_logs : pd.DataFrame
        DataFrame containing test logs with the same structure as train_logs.
    user_pipeline : Pipeline
        The user-defined pipeline that was used for training, which contains the final estimator.
    x_columns : list
        List of feature names used in the model.

    Returns
    -------
    pd.DataFrame
        A DataFrame summarising the metrics and metadata, including:
        - model_id_hash: Unique identifier for the model run.
        - log_time: Timestamp of the log.
        - model_name: Name of the final estimator in the pipeline.
        - pipeline_steps: List of steps in the pipeline.
        - pipeline_n_steps: Number of steps in the pipeline.
        - x_columns: List of feature names.
        - metrics_by_group: Nested dictionary containing metrics for each group in train, validation, and test logs.
        - {dataset}_r2, {dataset}_rmse, {dataset}_mae, {dataset}_mape: Average metrics for the each dataset split set.
    """

    model_id   = train_logs['model_id_hash'].iloc[0]
    log_time   = train_logs['log_time'].iloc[0]
    model_name = user_pipeline._final_estimator.__class__.__name__
    user_pipeline_steps      = list(user_pipeline.named_steps.keys())

    # Build nested per-group metrics
    nested = {}

    flat_params = user_pipeline.get_params(deep=True)
    params_json_str = json.dumps(flat_params, sort_keys=True, separators=(",", ":"), default=str)
    # Compute summary metrics
    summary = {
        'model_id_hash':  model_id,
        'random_state': random_state,
        'params_json':   params_json_str,
        'log_time':       log_time,
        'model_name':     model_name,
        'pipeline_steps': user_pipeline_steps,
        'pipeline_n_steps': len(user_pipeline_steps),
        'x_columns':      x_columns,
        'metrics_by_group': nested,
    }
    data_splits = {
        "train" : train_logs,
        "validation" : val_logs,
        "test" : test_logs
    }
    metrics_to_agg = ['r2', 'rmse', 'mae', 'mape', 'n_obs', 'mse']
    for metric in metrics_to_agg:
        for split_name, df in data_splits.items():
            summary[f'{metric}_{split_name}'] = mean_metric(df, metric)

    for split_name, df in data_splits.items():
        nested[split_name] = (
            df.set_index('group')[['r2','rmse','mae','mape','n_obs']]
              .to_dict(orient='index')
        )
    summary['metrics_by_group'] = nested

    return pd.DataFrame([summary])


In [None]:
def save_summary_to_csv(summary_df: pd.DataFrame,
                        csv_path: str = "marginal_emissions_log.csv",
                        force_overwrite: bool = False):
    """
    Save the summarized metrics to a CSV file (1 row per model run).
    If the file already exists, it will append the new summary.
    If force_overwrite is True, it will remove any existing rows with the same model_id_hash.
    Parameters
    ----------
    summary_df : pd.DataFrame
        DataFrame containing the summary metrics to save.
    csv_path : str, default="marginal_emissions_log.csv"
        Path to the CSV file where the summary will be saved.
    force_overwrite : bool, default=False
        If True, will overwrite any existing rows with the same model_id_hash.

    Returns
    -------
    None
        Prints confirmation message after saving the summary.
    """
    try:
        existing = pd.read_csv(csv_path)
        if force_overwrite:
            existing = existing[existing["model_id_hash"] != summary_df["model_id_hash"].iloc[0]]
        combined = pd.concat([existing, summary_df], ignore_index=True)
    except FileNotFoundError:
        combined = summary_df.copy()

    combined.to_csv(csv_path, index=False)
    print(f"[SAVE] Summary saved to {csv_path}")

###### FUNCTIONS: Hash Key

In [45]:
def make_config_key(config: dict) -> str:
    """Create a deterministic key for a given configuration dictionary.

    This key is generated by serializing the config dictionary to a JSON string
    with sorted keys and no spaces, then hashing the result.

    Parameters
    ----------
    config : dict
        The configuration dictionary to serialize and hash.

    Returns
    -------
    str
        The MD5 hash of the serialized configuration string.
    """
    # JSON‑dump with sorted keys, no spaces → deterministic string
    s = json.dumps(config, sort_keys=True, separators=(",",":"), default=str)
    return hashlib.md5(s.encode("utf-8")).hexdigest()

###### FUNCTIONS: Runners and Orchestrators

In [None]:
def evaluate_on_split(regression_model: GroupwiseRegressor,
    full_df: pd.DataFrame
) -> pd.DataFrame:
    """
    After pipeline.transform → full_df with group IDs & original y_var,
    compute per‑group r2/rmse/mae/n_obs using reg.group_models_.

    Parameters
    ----------
    reg : GroupwiseRegressor
        Fitted GroupwiseRegressor instance with group_models_ populated.
    full_df : pd.DataFrame
        DataFrame containing the original y_var and group_col.

    Returns
    -------
    pd.DataFrame
        DataFrame with group metrics: r2, rmse, mae, n_obs.

    """
    df = full_df.copy()
    metrics = {}

    for grp, model in regression_model.group_models_.items():
        grp_df = df[df[regression_model.group_col] == grp]
        if grp_df.empty:
            continue

        y_true = grp_df[regression_model.y_var]
        X_pred = grp_df[[c for c in grp_df.columns if c != regression_model.y_var]]

        y_pred = model.predict(X_pred)

        metrics[grp] = {
            "r2": r2_score(y_true, y_pred),
            "rmse": root_mean_squared_error(y_true, y_pred),
            "mae": mean_absolute_error(y_true, y_pred),
            "mape": mean_absolute_percentage_error(y_true, y_pred),
            "n_obs": len(grp_df),
        }

    mdf = pd.DataFrame.from_dict(metrics, orient="index")
    mdf.index.name = "group"
    return mdf.reset_index()

In [None]:
def run_regressor_model(
    user_pipeline: Pipeline,
    x_df: pd.DataFrame,
    y_df: pd.DataFrame,
    split_name: str,
    # log_csv: str = "metrics_log.csv",
    extra_info: dict  = None,
    return_model: bool = False,
    random_state: int = 12,
    ):
    """
    Run the pipeline on the given split, log metrics to CSV, and return the metrics Data
    frame.

    Parameters
    ----------
    pipeline : Pipeline
        The scikit-learn Pipeline to run.
    x_df : pd.DataFrame
        DataFrame containing the features.
    y_df : pd.DataFrame
        DataFrame containing the target variable.
    split_name : str
        Name of the split (e.g., "train", "val", "test").
    # log_csv : str, default="metrics_log.csv"
    #     Path to the CSV file where metrics will be logged.
    extra_info : dict, optional
        Additional information to include in the log (e.g., dataset version, model version).
    return_model : bool, default=False
        If True, return the fitted model along with the metrics DataFrame.

    Returns
    -------
    pd.DataFrame
        DataFrame containing the logged metrics for the split.
    Pipeline, optional
        The fitted model if `return_model` is True.
    """
    # Set the random seed for reproducibility
    np.random.seed(random_state)

    # change any uint cols to int
    for col in x_df.select_dtypes(include=['uint']).columns:
        x_df[col] = x_df[col].astype(int)

    # 1) Validate split_name
    if split_name not in ("train","validation","test"):
        raise ValueError(f"split_name must be one of train/validation/test, got {split_name!r}")

    # 2) Build signature
    config = {
      "pipeline_params": user_pipeline.get_params(deep=True),
      "x_columns"      : list(x_df.columns),
      "y_columns"      : list(y_df.name if hasattr(y_df, "name") else y_df.columns),
      "random_state": random_state,
      **(extra_info or {}),
    }
    config_key = make_config_key(config)

    flat_params = user_pipeline.get_params(deep=True)
    params_json_str = json.dumps(flat_params, sort_keys=True, separators=(",", ":"), default=str)

    # 6) Gather metrics
    if split_name == "train":
        out = user_pipeline.fit_transform(x_df, y_df)
        model = user_pipeline._final_estimator
        metrics_df = model.get_metrics(summarise=True).reset_index().rename(columns={'index':'group'})
    else:
        # Extract preprocessing and regressor
        preproc = user_pipeline[:-1]
        model = user_pipeline._final_estimator

        # Run transform
        x_transformed = preproc.transform(x_df)
        if not isinstance(x_transformed, pd.DataFrame):
            x_transformed = pd.DataFrame(
                x_transformed,
                index=x_df.index,
                columns=getattr(preproc, 'get_feature_names_out', lambda: [])()
            )

        # Build DataFrame
        transformed_df = pd.DataFrame(x_transformed, index=x_df.index)

        group_col = model.group_col
        if group_col not in transformed_df.columns:
            raise KeyError(
                f"Group column '{group_col}' is missing **after pipeline transform**. "
                "Ensure MultiMedianBinner is part of the pipeline and its output column "
                "matches the regressor's `group_col` parameter."
            )

        # Add target column
        transformed_df[model.y_var] = y_df.values

        # Compute metrics
        metrics_df = evaluate_on_split(model, transformed_df)

    # 7) Stamp metadata
    metrics_df["data_split"]     = split_name
    metrics_df["model_id_hash"] = config_key
    metrics_df["random_state"] = random_state
    metrics_df["pipeline_params_json"] = params_json_str
    metrics_df["log_time"] = datetime.now().isoformat()
    metrics_df["x_columns_used"] = ",".join(model.x_vars + model.fe_vars)
    for k, v in (extra_info or {}).items():
        metrics_df[k] = v
    x_cols_used = model.x_vars + model.fe_vars

    print(f"[LOG] {len(metrics_df)} rows for split={split_name}, model_id={config_key}, random_state={random_state}")
    return (metrics_df , x_cols_used , model) if return_model else (metrics_df, x_cols_used, None)


In [None]:
def regressor_orchestrator(
    user_pipeline: Pipeline,
    x_splits: dict,  # e.g. {"train": X_train, "validation": X_val, "test": X_test}
    y_splits: dict,  # e.g. {"train": y_train, ...}
    log_csv_path: str = "marginal_emissions_log.csv",
    extra_info: dict = None,
    force_run: bool = False,
    force_overwrite: bool = False,
    random_state: int = 12
) -> pd.DataFrame:
    """
    Orchestrate a model run. Skip if already logged. Return summary_df.
    """
    # Estimate model signature based on config
    config = {
        "pipeline_params": user_pipeline.get_params(deep=True),
        "x_columns"      : list(x_splits['train'].columns),
        "y_columns"      : list(y_splits['train'].name if hasattr(y_splits['train'], "name") else y_splits['train'].columns),
        "random_state": random_state,
        **(extra_info or {}),
    }
    model_key = make_config_key(config)

    if not force_run and not force_overwrite:
        # Check if the model is already logged
        try:
            existing = pd.read_csv(log_csv_path)
            if not force_run and model_key in existing["model_id_hash"].values and not force_overwrite:
                print(f"[SKIP] Model already logged (hash: {model_key})")
                return None
        except FileNotFoundError:
            pass

     # Run all splits
    logs = {}
    for split in ["train", "validation", "test"]:
        metrics_df, x_cols_used, _ = run_regressor_model(
            user_pipeline,
            x_df=x_splits[split],
            y_df=y_splits[split],
            split_name=split,
            extra_info=extra_info,
            return_model=False
        )
        logs[split] = metrics_df


    # Summarize
    summary_df = summarise_metrics_logs(logs["train"], logs["validation"], logs["test"], user_pipeline, x_cols_used)

    # Save
    save_summary_to_csv(summary_df, log_csv_path, force_overwrite=force_overwrite)

    return summary_df

###### FUNCTIONS - Grid Search

In [None]:
def run_grid_search(
    base_feature_pipeline: Pipeline,
    regressor_cls,
    regressor_kwargs: dict,
    grid_config: list[dict],
    x_splits: dict,
    y_splits: dict,
    log_path: str,
    global_extra_info: dict = None,
    force_run: bool = False,
    force_overwrite: bool = False,
    base_feature_pipeline_name: str = "BaseFeaturePipeline",
):
    """
    Run a grid search over binner configurations and log results.

    Parameters
    ----------
    base_feature_pipeline : Pipeline
        Pipeline with preprocessing steps (before binning and regression).
    regressor_cls : class
        The class of the regressor (e.g., GroupwiseRegressor).
    regressor_kwargs : dict
        Keyword arguments to initialize the regressor.
    grid_config : list of dicts
        Each dict should contain:
            - 'binner_class': class (e.g., MultiQuantileBinner)
            - 'binner_kwargs': dict of init args
            - 'label': str label for logging
    x_splits, y_splits : dict
        Train/val/test splits (must include 'train').
    log_path : str
        Where to save summary CSV.
    x_columns : list
        List of feature column names.
    global_extra_info : dict
        Any global metadata to tag each run with.
    force_run, force_overwrite : bool
        Passed through to regressor_orchestrator.
    """
    for config in grid_config:
        binner_class = config["binner_class"]
        binner_kwargs = config["binner_kwargs"]
        label = config.get("label", binner_class.__name__)

        binner = binner_class(**binner_kwargs)
        regressor_kwargs["random_state"] = regressor_kwargs.get("random_state", 12)
        regressor = regressor_cls(**regressor_kwargs)

        binner_name = binner_class.__name__
        regressor_name = regressor_cls.__name__
        full_pipeline = Pipeline([
            (base_feature_pipeline_name, base_feature_pipeline),
            (binner_name, binner),
            (regressor_name, regressor)
        ])

        print(f"\n[GRID] Running config: {label}")
        extra_info = {
            "binner_class": binner_class.__name__,
            "binner_params": binner_kwargs,
            "regressor_params": regressor_kwargs,
            "grid_label": label,
            **(global_extra_info or {})
        }

        summary_df = regressor_orchestrator(
            user_pipeline=full_pipeline,
            x_splits=x_splits,
            y_splits=y_splits,
            log_csv_path=log_path,
            extra_info=extra_info,
            force_run=force_run,
            force_overwrite=force_overwrite,
            random_state=regressor_kwargs.get("random_state", 12)
        )
        if summary_df is not None:
            print(f"[GRID] Logged: {label}")
        else:
            print(f"[GRID] Skipped: {label} (already logged)")

In [50]:
def all_nonempty_subsets(columns: list[str]) -> list[list[str]]:
    return [list(c) for i in range(1, len(columns)+1) for c in combinations(columns, i)]

def get_fe_vars(all_cols, x_vars):
    return [c for c in all_cols if c not in x_vars]


In [None]:
def build_x_fe_combinations_disjoint(candidate_x_vars: list[str],
                                      candidate_fe_vars: list[str],
                                      x_var_length: int=2) -> list[dict]:
    """
    Generate all disjoint non-empty combinations of x_vars and fe_vars.

    Parameters
    ----------
    candidate_x_vars : list of str
        Columns eligible to be used as predictors (x_vars).
    candidate_fe_vars : list of str
        Columns eligible to be used as fixed effects (fe_vars).

    Returns
    -------
    list of dicts
        Each dict has keys: {'x_vars': [...], 'fe_vars': [...]}
    """
    results = []
    x_subsets = all_nonempty_subsets(candidate_x_vars)
    fe_subsets = all_nonempty_subsets(candidate_fe_vars)

    for x_vars in x_subsets:
        if len(x_vars) != x_var_length:
            continue  # Skip anything not exactly x_var_length long

        for fe_vars in fe_subsets:
            if set(x_vars).isdisjoint(set(fe_vars)):
                results.append({"x_vars": x_vars, "fe_vars": fe_vars})

    return results


In [None]:
def build_grid_configs(
    candidate_binning_vars: list[str],
    candidate_bin_counts: list[int],
    candidate_x_vars: list[str],
    candidate_fe_vars: list[str],
    x_var_length: int = 2
):
    """
    Build grid search configurations by combining:
    - Binning variable combinations and bin counts
    - Disjoint combinations of x_vars and fe_vars

    Returns list of configs with:
    - bin_specs
    - x_vars
    - fe_vars
    - label
    """
    configs = []

    # 1. All non-empty combinations of binners
    for bin_vars in all_nonempty_subsets(candidate_binning_vars):
        for bin_count in candidate_bin_counts:
            bin_spec = {v: bin_count for v in bin_vars}

            # 2. x/fe combinations that don't overlap with binners
            x_fe_grid = build_x_fe_combinations_disjoint(
                candidate_x_vars,
                candidate_fe_vars,
                x_var_length=x_var_length
            )

            for combo in x_fe_grid:
                if set(combo["x_vars"]).isdisjoint(set(bin_vars)):
                    config = {
                        "x_vars": combo["x_vars"],
                        "fe_vars": combo["fe_vars"],
                        "bin_spec": bin_spec,
                        "label": f"bin_{bin_count}_{'-'.join(bin_vars)}__x_{'-'.join(combo['x_vars'])}__fe_{'-'.join(combo['fe_vars'])}"
                    }
                    configs.append(config)

    return configs

In [53]:
def build_median_binner_configs(candidate_binning_vars, candidate_x_vars, candidate_fe_vars,
                                 x_var_length=2, max_fe_len=3):
    configs = []
    for bin_vars in all_nonempty_subsets(candidate_binning_vars):
        x_fe_grid = build_x_fe_combinations_disjoint(candidate_x_vars, candidate_fe_vars,
                                                     x_var_length=x_var_length, max_fe_len=max_fe_len)
        for combo in x_fe_grid:
            if set(combo["x_vars"]).isdisjoint(set(bin_vars)):
                label = f"median_{'-'.join(bin_vars)}__x_{'-'.join(combo['x_vars'])}__fe_{'-'.join(combo['fe_vars'])}"
                configs.append({
                    "x_vars": combo["x_vars"],
                    "fe_vars": combo["fe_vars"],
                    "binning_vars": bin_vars,
                    "label": label
                })
    return configs

###### IMPLEMENTATION - SINGLE RUNNERS

In [None]:
# Feature Engineering Pipeline

feature_addition_pipeline = Pipeline([
    ("Add_Datetime_Features", DateTimeFeatureAdder(timestamp_col="timestamp")),
    ("Add_Original_Analysis_Features", AnalysisFeatureAdder(timestamp_col="timestamp", generation_col="total_generation", co2_col="tons_co2")),
])

# Binning Pipelines
original_multi_binner = MultiQuantileBinner(
    bin_specs={
        "surface_net_solar_radiation_kwh": 5,
        "wind_speed": 5,
    },
    group_col_name="original_quantile_group_id"
)

original_median_binner = MultiMedianBinner(
    variables=[
        "surface_net_solar_radiation_kwh",
        "wind_speed"
    ],
    group_col_name="median_group_id",
)

# REGRESSORS
original_multi_binner_regressor = GroupwiseRegressor(
    y_var="tons_co2",
    x_vars=["total_generation", "total_generation_sqrd"],
    fe_vars=["month", "hour"],
    group_col="original_quantile_group_id",
    min_group_size=20,
    track_metrics=True,
    verbose=True
)
original_median_regressor = GroupwiseRegressor(
    y_var="tons_co2",
    x_vars=["total_generation", "total_generation_sqrd"],
    fe_vars=["month", "hour"],
    group_col="median_group_id",
    min_group_size=20,
    track_metrics=True,
    verbose=True
)

# Pipelines
original_median_regressor_pipeline = Pipeline([
    ("Feature_Addition", feature_addition_pipeline),
    ("Multi_Median_Binner", original_median_binner),
    ("Groupwise_Regressor", original_median_regressor)
])

original_multi_binner_regressor_pipeline = Pipeline([
    ("Feature_Addition", feature_addition_pipeline),
    ("Multi_Quantile_Binner", original_multi_binner),
    ("Groupwise_Regressor", original_multi_binner_regressor)
])


In [55]:
# assuming full_pipeline = Pipeline([...,"regressor", reg])

train_pdf_x_all = train_pdf.drop(columns=["tons_co2"])
train_pdf_y = train_pdf["tons_co2"]

validation_pdf_x_all = validation_pdf.drop(columns=["tons_co2"])
validation_pdf_y = validation_pdf["tons_co2"]
test_pdf_x_all = test_pdf.drop(columns=["tons_co2"])
test_pdf_y = test_pdf["tons_co2"]


In [56]:
train_logs, x_cols_used_train, _ = run_regressor_model(original_median_regressor_pipeline, train_pdf_x_all, train_pdf_y, split_name="train")
val_logs, x_cols_used_val, _ = run_regressor_model(original_median_regressor_pipeline, validation_pdf_x_all, validation_pdf_y, split_name="validation")
test_logs, x_cols_used_test, _ = run_regressor_model(original_median_regressor_pipeline, test_pdf_x_all, test_pdf_y, split_name="test")

2025-07-28 17:14:20,681 INFO Fitted median for 'surface_net_solar_radiation_kwh': 2.8178
2025-07-28 17:14:20,681 INFO Fitted median for 'wind_speed': 5.3699
2025-07-28 17:14:20,926 INFO Created combined group 'median_group_id' with 4 unique values


[LOG] 4 rows for split=train, model_id=894d2bdd0405e5f14ad196757da92b91, random_state=12


2025-07-28 17:14:31,346 INFO Created combined group 'median_group_id' with 4 unique values


[LOG] 4 rows for split=validation, model_id=894d2bdd0405e5f14ad196757da92b91, random_state=12


2025-07-28 17:14:33,536 INFO Created combined group 'median_group_id' with 4 unique values


[LOG] 4 rows for split=test, model_id=894d2bdd0405e5f14ad196757da92b91, random_state=12


In [57]:
summarise_metrics_logs(train_logs, val_logs, test_logs, original_median_regressor_pipeline, x_cols_used_train)


Unnamed: 0,model_id_hash,random_state,params_json,log_time,model_name,pipeline_steps,pipeline_n_steps,x_columns,metrics_by_group,r2_train,...,mae_test,mape_train,mape_validation,mape_test,n_obs_train,n_obs_validation,n_obs_test,mse_train,mse_validation,mse_test
0,894d2bdd0405e5f14ad196757da92b91,12,"{""Feature_Addition"":""Pipeline(steps=[('Add_Dat...",2025-07-28T17:14:30.651676,GroupwiseRegressor,"[Feature_Addition, Multi_Median_Binner, Groupw...",3,"[total_generation, total_generation_sqrd, mont...","{'train': {0: {'r2': 0.8065053697213157, 'rmse...",0.774502,...,3526.598512,4.537927,3.844244,5.853099,583402.5,82023.75,191160.0,15844330.0,13271500.0,28101280.0


###### IMPLEMENTATION - Orchestrator

In [58]:
regressor_orchestrator(
    user_pipeline=original_median_regressor_pipeline,
    x_splits={
        "train": train_pdf_x_all,
        "validation": validation_pdf_x_all,
        "test": test_pdf_x_all
    },
    y_splits={
        "train": train_pdf_y,
        "validation": validation_pdf_y,
        "test": test_pdf_y
    }
    , log_csv_path="marginal_emissions_log.csv",
)

regressor_orchestrator(
    user_pipeline=original_multi_binner_regressor_pipeline,
    x_splits={
        "train": train_pdf_x_all,
        "validation": validation_pdf_x_all,
        "test": test_pdf_x_all
    },
    y_splits={
        "train": train_pdf_y,
        "validation": validation_pdf_y,
        "test": test_pdf_y
    }
    , log_csv_path="marginal_emissions_log.csv",
)

[SKIP] Model already logged (hash: 894d2bdd0405e5f14ad196757da92b91)
[SKIP] Model already logged (hash: 76f0a9257bb6368a06a600fb5668c6a9)


In [None]:

median_binner_v1 = MultiMedianBinner(
    variables=[
        "surface_net_solar_radiation_kwh",
        "wind_speed",
        "temperature",
    ],
    group_col_name="median_group_id",
)
# REGRESSORS
median_regressor_v1 = GroupwiseRegressor(
    y_var="tons_co2",
    x_vars=["total_generation", "total_generation_sqrd"],
    fe_vars=["month", "hour", "week_of_year"],
    group_col="median_group_id",
    min_group_size=20,
    track_metrics=True,
    verbose=True
)

# Pipelines
median_regressor_pipeline_v1 = Pipeline([
    ("Feature_Addition", feature_addition_pipeline),
    ("Multi_Median_Binner", median_binner_v1),
    ("Groupwise_Regressor", median_regressor_v1)
])


In [60]:
regressor_orchestrator(
    user_pipeline=median_regressor_pipeline_v1,
    x_splits={
        "train": train_pdf_x_all,
        "validation": validation_pdf_x_all,
        "test": test_pdf_x_all
    },
    y_splits={
        "train": train_pdf_y,
        "validation": validation_pdf_y,
        "test": test_pdf_y
    }
    , log_csv_path="marginal_emissions_log.csv",
    force_run=True,
    force_overwrite=True
)

2025-07-28 17:14:40,150 INFO Fitted median for 'surface_net_solar_radiation_kwh': 2.8178
2025-07-28 17:14:40,150 INFO Fitted median for 'wind_speed': 5.3699
2025-07-28 17:14:40,150 INFO Fitted median for 'temperature': 26.2531
2025-07-28 17:14:40,359 INFO Created combined group 'median_group_id' with 8 unique values


[LOG] 8 rows for split=train, model_id=a64a4739bae41a05c7b666c4e6f13489, random_state=12


2025-07-28 17:15:00,323 INFO Created combined group 'median_group_id' with 8 unique values


[LOG] 8 rows for split=validation, model_id=a64a4739bae41a05c7b666c4e6f13489, random_state=12


2025-07-28 17:15:03,470 INFO Created combined group 'median_group_id' with 8 unique values


[LOG] 8 rows for split=test, model_id=a64a4739bae41a05c7b666c4e6f13489, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv


Unnamed: 0,model_id_hash,random_state,params_json,log_time,model_name,pipeline_steps,pipeline_n_steps,x_columns,metrics_by_group,r2_train,...,mae_test,mape_train,mape_validation,mape_test,n_obs_train,n_obs_validation,n_obs_test,mse_train,mse_validation,mse_test
0,a64a4739bae41a05c7b666c4e6f13489,12,"{""Feature_Addition"":""Pipeline(steps=[('Add_Dat...",2025-07-28T17:14:59.574870,GroupwiseRegressor,"[Feature_Addition, Multi_Median_Binner, Groupw...",3,"[total_generation, total_generation_sqrd, mont...","{'train': {0: {'r2': 0.8254403191829682, 'rmse...",0.800277,...,3508.763327,4.166401,3.844666,5.824811,291701.25,41011.875,95580.0,14074530.0,13677890.0,28379600.0


###### Grid Search

In [61]:
multi_quantile_param_grid = build_grid_configs(
    candidate_binning_vars=["surface_net_solar_radiation_kwh", "wind_speed", "temperature", "precipitation_mm", "total_cloud_cover"],
    candidate_bin_counts=[3, 5, 10, 20, 50],
    candidate_x_vars=["total_generation", "total_generation_sqrd"],
    candidate_fe_vars=["month", "hour", "week_of_year", "day_of_week", "half_hour"]
)

grid_search_config = [
    {
        "binner_class": MultiQuantileBinner,
        "binner_kwargs": {"bin_specs": config["bin_spec"]},
        "label": config["label"],
        "x_vars": config["x_vars"],
        "fe_vars": config["fe_vars"],
    }
    for config in multi_quantile_param_grid
]

In [62]:
for config in grid_search_config:
    regressor_kwargs = {
        "y_var": "tons_co2",
        "x_vars": config.pop("x_vars"),
        "fe_vars": config.pop("fe_vars"),
        "group_col": "quantile_group_id",
        "min_group_size": 20,
        "track_metrics": True,
        "verbose": False,
        "random_state": 12,  # Ensure reproducibility
    }

    run_grid_search(
        base_feature_pipeline=feature_addition_pipeline,  # Replace with your pipeline (or `Pipeline([])` if none)
        regressor_cls=GroupwiseRegressor,
        regressor_kwargs=regressor_kwargs,
        grid_config=[config],  # Wrapped in list so you run one config at a time
        x_splits={
            "train": train_pdf_x_all,
            "validation": validation_pdf_x_all,
            "test": test_pdf_x_all,
        },
        y_splits={
            "train": train_pdf_y,
            "validation": validation_pdf_y,
            "test": test_pdf_y,
        },
        log_path="marginal_emissions_log.csv",
        global_extra_info={"model_type": "multi_binner"},
        force_run=False,
        force_overwrite=False,
        base_feature_pipeline_name="FeatureAdditionPipeline"
    )


[GRID] Running config: bin_3_surface_net_solar_radiation_kwh__x_total_generation-total_generation_sqrd__fe_month
[SKIP] Model already logged (hash: 0aa03ba44d921b2c12fdd68eb327542b)
[GRID] Skipped: bin_3_surface_net_solar_radiation_kwh__x_total_generation-total_generation_sqrd__fe_month (already logged)

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh__x_total_generation-total_generation_sqrd__fe_hour
[SKIP] Model already logged (hash: bd5b63cceff327f1ed6143315ad08152)
[GRID] Skipped: bin_3_surface_net_solar_radiation_kwh__x_total_generation-total_generation_sqrd__fe_hour (already logged)

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh__x_total_generation-total_generation_sqrd__fe_week_of_year
[SKIP] Model already logged (hash: cb1634b7d0913749cef56df589655f54)
[GRID] Skipped: bin_3_surface_net_solar_radiation_kwh__x_total_generation-total_generation_sqrd__fe_week_of_year (already logged)

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh__x_total

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 2389 rows for split=validation, model_id=778bbe21802a80f41a6dee51a6eb7dd0, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 2400 rows for split=test, model_id=778bbe21802a80f41a6dee51a6eb7dd0, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_50_surface_net_solar_radiation_kwh-wind_speed__x_total_generation-total_generation_sqrd__fe_month-week_of_year-day_of_week-half_hour

[GRID] Running config: bin_50_surface_net_solar_radiation_kwh-wind_speed__x_total_generation-total_generation_sqrd__fe_hour-week_of_year-day_of_week-half_hour
[LOG] 2450 rows for split=train, model_id=b99e56b765b3984b95dc5cf2e3c890b4, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 2389 rows for split=validation, model_id=b99e56b765b3984b95dc5cf2e3c890b4, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 2400 rows for split=test, model_id=b99e56b765b3984b95dc5cf2e3c890b4, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_50_surface_net_solar_radiation_kwh-wind_speed__x_total_generation-total_generation_sqrd__fe_hour-week_of_year-day_of_week-half_hour

[GRID] Running config: bin_50_surface_net_solar_radiation_kwh-wind_speed__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year-day_of_week-half_hour
[LOG] 2450 rows for split=train, model_id=6282e45b57d872b961a9f92bfc015233, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 2389 rows for split=validation, model_id=6282e45b57d872b961a9f92bfc015233, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 2400 rows for split=test, model_id=6282e45b57d872b961a9f92bfc015233, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_50_surface_net_solar_radiation_kwh-wind_speed__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year-day_of_week-half_hour

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month
[LOG] 11 rows for split=train, model_id=a2e83ec221510cc5f2e4afb4bf470abe, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=a2e83ec221510cc5f2e4afb4bf470abe, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=a2e83ec221510cc5f2e4afb4bf470abe, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour
[LOG] 11 rows for split=train, model_id=6a0b364956a5ea9a5f15ddd0bc25db72, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=6a0b364956a5ea9a5f15ddd0bc25db72, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=6a0b364956a5ea9a5f15ddd0bc25db72, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year
[LOG] 11 rows for split=train, model_id=21db27aeb5d04bdc1958351da44851fb, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=21db27aeb5d04bdc1958351da44851fb, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=21db27aeb5d04bdc1958351da44851fb, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_day_of_week
[LOG] 11 rows for split=train, model_id=ada6c8a709b3b8abab1943416d8cafce, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=ada6c8a709b3b8abab1943416d8cafce, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=ada6c8a709b3b8abab1943416d8cafce, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_day_of_week

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_half_hour
[LOG] 11 rows for split=train, model_id=4a2c08fd512224ecc42bcc0e216a01a3, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=4a2c08fd512224ecc42bcc0e216a01a3, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=4a2c08fd512224ecc42bcc0e216a01a3, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_half_hour

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour
[LOG] 11 rows for split=train, model_id=904bb2e9866da5407b852c1eb40a8a3f, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=904bb2e9866da5407b852c1eb40a8a3f, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=904bb2e9866da5407b852c1eb40a8a3f, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year
[LOG] 11 rows for split=train, model_id=fbae0d88867d6d74cbc8c2747a2a7752, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=fbae0d88867d6d74cbc8c2747a2a7752, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=fbae0d88867d6d74cbc8c2747a2a7752, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-day_of_week
[LOG] 11 rows for split=train, model_id=27ab3fc66d0c4a8974e8d6a57e97acd0, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=27ab3fc66d0c4a8974e8d6a57e97acd0, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=27ab3fc66d0c4a8974e8d6a57e97acd0, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-day_of_week

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-half_hour
[LOG] 11 rows for split=train, model_id=3874525a0d920c5cbc69fa1b85043fda, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=3874525a0d920c5cbc69fa1b85043fda, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=3874525a0d920c5cbc69fa1b85043fda, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-half_hour

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year
[LOG] 11 rows for split=train, model_id=0996f41c2940a04416ff66507499d76b, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=0996f41c2940a04416ff66507499d76b, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=0996f41c2940a04416ff66507499d76b, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-day_of_week
[LOG] 11 rows for split=train, model_id=c68d4d250616aebc2f95e43f5f1b4803, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=c68d4d250616aebc2f95e43f5f1b4803, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=c68d4d250616aebc2f95e43f5f1b4803, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-day_of_week

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-half_hour
[LOG] 11 rows for split=train, model_id=a4ab71e8f19bdaca08d36eddec609e43, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=a4ab71e8f19bdaca08d36eddec609e43, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=a4ab71e8f19bdaca08d36eddec609e43, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-half_hour

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year-day_of_week
[LOG] 11 rows for split=train, model_id=9a77e6f13e6f61139fff045a8d59056d, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=9a77e6f13e6f61139fff045a8d59056d, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=9a77e6f13e6f61139fff045a8d59056d, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year-day_of_week

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year-half_hour
[LOG] 11 rows for split=train, model_id=3815ad8fd1ec6a04bd464900f02798ad, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=3815ad8fd1ec6a04bd464900f02798ad, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=3815ad8fd1ec6a04bd464900f02798ad, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year-half_hour

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_day_of_week-half_hour
[LOG] 11 rows for split=train, model_id=73a9d0fb4ec00581e9496507a8c80d6e, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=73a9d0fb4ec00581e9496507a8c80d6e, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=73a9d0fb4ec00581e9496507a8c80d6e, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_day_of_week-half_hour

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year
[LOG] 11 rows for split=train, model_id=81572d7b670ff9fc2f844a615a1fc3ef, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=81572d7b670ff9fc2f844a615a1fc3ef, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=81572d7b670ff9fc2f844a615a1fc3ef, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-day_of_week
[LOG] 11 rows for split=train, model_id=e3d8772aae88f81c8ece75680fdfa553, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=e3d8772aae88f81c8ece75680fdfa553, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=e3d8772aae88f81c8ece75680fdfa553, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-day_of_week

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-half_hour
[LOG] 11 rows for split=train, model_id=89bd973a8511752cd3d1981b831f7c5a, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=89bd973a8511752cd3d1981b831f7c5a, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=89bd973a8511752cd3d1981b831f7c5a, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-half_hour

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year-day_of_week
[LOG] 11 rows for split=train, model_id=c120762ccfd8d1560898780d03d4ea42, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=c120762ccfd8d1560898780d03d4ea42, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=c120762ccfd8d1560898780d03d4ea42, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year-day_of_week

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year-half_hour
[LOG] 11 rows for split=train, model_id=7a552c52fec6366aa80faafce7f53bd3, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=7a552c52fec6366aa80faafce7f53bd3, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=7a552c52fec6366aa80faafce7f53bd3, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year-half_hour

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-day_of_week-half_hour
[LOG] 11 rows for split=train, model_id=a92819b458c2bc60c4edfc6ba6e06c17, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=a92819b458c2bc60c4edfc6ba6e06c17, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=a92819b458c2bc60c4edfc6ba6e06c17, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-day_of_week-half_hour

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year-day_of_week
[LOG] 11 rows for split=train, model_id=29e4705347a8ad2da77863bbe9c03c78, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=29e4705347a8ad2da77863bbe9c03c78, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=29e4705347a8ad2da77863bbe9c03c78, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year-day_of_week

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year-half_hour
[LOG] 11 rows for split=train, model_id=52af977991629ea7c71fdaccb34d1aec, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=52af977991629ea7c71fdaccb34d1aec, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=52af977991629ea7c71fdaccb34d1aec, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year-half_hour

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-day_of_week-half_hour
[LOG] 11 rows for split=train, model_id=0ddaadb3bf2ac306b3254c1e770efd43, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=0ddaadb3bf2ac306b3254c1e770efd43, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=0ddaadb3bf2ac306b3254c1e770efd43, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-day_of_week-half_hour

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year-day_of_week-half_hour
[LOG] 11 rows for split=train, model_id=53f26f53b04ea81fea46fb40c93ca64e, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=53f26f53b04ea81fea46fb40c93ca64e, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=53f26f53b04ea81fea46fb40c93ca64e, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year-day_of_week-half_hour

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year-day_of_week
[LOG] 11 rows for split=train, model_id=e514f0b6afdcfa1e4109966e7c635d75, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=e514f0b6afdcfa1e4109966e7c635d75, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=e514f0b6afdcfa1e4109966e7c635d75, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year-day_of_week

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year-half_hour
[LOG] 11 rows for split=train, model_id=317d7c6a026ad480b9ce3eed17743fb2, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=317d7c6a026ad480b9ce3eed17743fb2, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=317d7c6a026ad480b9ce3eed17743fb2, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year-half_hour

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-day_of_week-half_hour
[LOG] 11 rows for split=train, model_id=97f4b128229de12e3e0c93810267ee6b, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=97f4b128229de12e3e0c93810267ee6b, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=97f4b128229de12e3e0c93810267ee6b, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-day_of_week-half_hour

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year-day_of_week-half_hour
[LOG] 11 rows for split=train, model_id=cbdbf21217b23daeffc47a4fb56e1e1b, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=cbdbf21217b23daeffc47a4fb56e1e1b, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=cbdbf21217b23daeffc47a4fb56e1e1b, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year-day_of_week-half_hour

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year-day_of_week-half_hour
[LOG] 11 rows for split=train, model_id=c065574485d7c08a37f49bccbe026b4a, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=c065574485d7c08a37f49bccbe026b4a, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=c065574485d7c08a37f49bccbe026b4a, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year-day_of_week-half_hour

[GRID] Running config: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year-day_of_week-half_hour
[LOG] 11 rows for split=train, model_id=7a973196cfa695af542e3fa5aa0840e5, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=validation, model_id=7a973196cfa695af542e3fa5aa0840e5, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 11 rows for split=test, model_id=7a973196cfa695af542e3fa5aa0840e5, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_3_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year-day_of_week-half_hour

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month
[LOG] 28 rows for split=train, model_id=6f86be7f407d787438e7933cf07ec41c, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=6f86be7f407d787438e7933cf07ec41c, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=6f86be7f407d787438e7933cf07ec41c, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour
[LOG] 28 rows for split=train, model_id=91827169bc3bfccb739a4c3148812045, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=91827169bc3bfccb739a4c3148812045, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=91827169bc3bfccb739a4c3148812045, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year
[LOG] 28 rows for split=train, model_id=2f8e2c6afa4536cae1f8aacb13c91a88, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=2f8e2c6afa4536cae1f8aacb13c91a88, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=2f8e2c6afa4536cae1f8aacb13c91a88, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_day_of_week
[LOG] 28 rows for split=train, model_id=4d3d6abab0482340dc4c33f827eac68d, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=4d3d6abab0482340dc4c33f827eac68d, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=4d3d6abab0482340dc4c33f827eac68d, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_day_of_week

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_half_hour
[LOG] 28 rows for split=train, model_id=a76c20c448d30cfdbfe62dad7f63db02, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=a76c20c448d30cfdbfe62dad7f63db02, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=a76c20c448d30cfdbfe62dad7f63db02, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_half_hour

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour
[LOG] 28 rows for split=train, model_id=dbb8bd50d0dbec01df0943f181468fa8, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=dbb8bd50d0dbec01df0943f181468fa8, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=dbb8bd50d0dbec01df0943f181468fa8, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year
[LOG] 28 rows for split=train, model_id=dafec40d614b99fb5f239237a8f1d0a0, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=dafec40d614b99fb5f239237a8f1d0a0, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=dafec40d614b99fb5f239237a8f1d0a0, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-day_of_week
[LOG] 28 rows for split=train, model_id=3228cb91adc5885bde3ea51701854085, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=3228cb91adc5885bde3ea51701854085, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=3228cb91adc5885bde3ea51701854085, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-day_of_week

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-half_hour
[LOG] 28 rows for split=train, model_id=5cd1496ba67e31fafec742fca5cb1496, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=5cd1496ba67e31fafec742fca5cb1496, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=5cd1496ba67e31fafec742fca5cb1496, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-half_hour

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year
[LOG] 28 rows for split=train, model_id=c5a454426ae27f9e17e4f23d3366215c, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=c5a454426ae27f9e17e4f23d3366215c, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=c5a454426ae27f9e17e4f23d3366215c, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-day_of_week
[LOG] 28 rows for split=train, model_id=43d8e19b9a444dd3a17032fa621b726a, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=43d8e19b9a444dd3a17032fa621b726a, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=43d8e19b9a444dd3a17032fa621b726a, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-day_of_week

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-half_hour
[LOG] 28 rows for split=train, model_id=e82a31733e326babd3145baf9192e9a2, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=e82a31733e326babd3145baf9192e9a2, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=e82a31733e326babd3145baf9192e9a2, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-half_hour

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year-day_of_week
[LOG] 28 rows for split=train, model_id=6f562c73425d83d2e013985539b93bc0, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=6f562c73425d83d2e013985539b93bc0, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=6f562c73425d83d2e013985539b93bc0, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year-day_of_week

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year-half_hour
[LOG] 28 rows for split=train, model_id=be9e476a2b128bc2a9da38328b01ccaf, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=be9e476a2b128bc2a9da38328b01ccaf, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=be9e476a2b128bc2a9da38328b01ccaf, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year-half_hour

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_day_of_week-half_hour
[LOG] 28 rows for split=train, model_id=ed57be91aa9370baceaad07fe49604ad, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=ed57be91aa9370baceaad07fe49604ad, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=ed57be91aa9370baceaad07fe49604ad, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_day_of_week-half_hour

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year
[LOG] 28 rows for split=train, model_id=27114e7c7ab333cd49dc8ecadc4c3367, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=27114e7c7ab333cd49dc8ecadc4c3367, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=27114e7c7ab333cd49dc8ecadc4c3367, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-day_of_week
[LOG] 28 rows for split=train, model_id=4f4b6f13b4faa4c6e3431def5df44c1f, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=4f4b6f13b4faa4c6e3431def5df44c1f, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=4f4b6f13b4faa4c6e3431def5df44c1f, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-day_of_week

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-half_hour
[LOG] 28 rows for split=train, model_id=194d8705bdac120f22d1adc27aea3852, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=194d8705bdac120f22d1adc27aea3852, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=194d8705bdac120f22d1adc27aea3852, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-half_hour

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year-day_of_week
[LOG] 28 rows for split=train, model_id=22edd22ee1b9ba13382d5c404fe6a2a4, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=22edd22ee1b9ba13382d5c404fe6a2a4, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=22edd22ee1b9ba13382d5c404fe6a2a4, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year-day_of_week

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year-half_hour
[LOG] 28 rows for split=train, model_id=5731ea3f26e90be6b218e97ff0c6ce5d, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=5731ea3f26e90be6b218e97ff0c6ce5d, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=5731ea3f26e90be6b218e97ff0c6ce5d, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year-half_hour

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-day_of_week-half_hour
[LOG] 28 rows for split=train, model_id=0d4d6be0071da2b912077ac6276c2d6f, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=0d4d6be0071da2b912077ac6276c2d6f, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=0d4d6be0071da2b912077ac6276c2d6f, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-day_of_week-half_hour

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year-day_of_week
[LOG] 28 rows for split=train, model_id=3e01b5c3bbe1939a3073194cf966a519, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=3e01b5c3bbe1939a3073194cf966a519, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=3e01b5c3bbe1939a3073194cf966a519, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year-day_of_week

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year-half_hour
[LOG] 28 rows for split=train, model_id=41cb9b64c235ee625ae997db4d44cb35, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=41cb9b64c235ee625ae997db4d44cb35, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=41cb9b64c235ee625ae997db4d44cb35, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year-half_hour

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-day_of_week-half_hour
[LOG] 28 rows for split=train, model_id=0d55af730de75b948ab040cb900e6282, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=0d55af730de75b948ab040cb900e6282, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=0d55af730de75b948ab040cb900e6282, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-day_of_week-half_hour

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year-day_of_week-half_hour
[LOG] 28 rows for split=train, model_id=d77f3fb7f6fbf8ccc1c4c3e2f8b1b0c1, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=d77f3fb7f6fbf8ccc1c4c3e2f8b1b0c1, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=d77f3fb7f6fbf8ccc1c4c3e2f8b1b0c1, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year-day_of_week-half_hour

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year-day_of_week
[LOG] 28 rows for split=train, model_id=f695d035036cfe011796c5547546bc27, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=f695d035036cfe011796c5547546bc27, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=f695d035036cfe011796c5547546bc27, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year-day_of_week

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year-half_hour
[LOG] 28 rows for split=train, model_id=bc732f732f2d147d1d3aa7daf0955c4c, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=bc732f732f2d147d1d3aa7daf0955c4c, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=bc732f732f2d147d1d3aa7daf0955c4c, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year-half_hour

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-day_of_week-half_hour
[LOG] 28 rows for split=train, model_id=c60d8ba837ca3b02a0412c9ae4461a97, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=c60d8ba837ca3b02a0412c9ae4461a97, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=c60d8ba837ca3b02a0412c9ae4461a97, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-day_of_week-half_hour

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year-day_of_week-half_hour
[LOG] 28 rows for split=train, model_id=3c7b7f47423dece159e75501658aaf24, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=3c7b7f47423dece159e75501658aaf24, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=3c7b7f47423dece159e75501658aaf24, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year-day_of_week-half_hour

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year-day_of_week-half_hour
[LOG] 28 rows for split=train, model_id=01fc9aca5b4b400240c836643ce76067, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=01fc9aca5b4b400240c836643ce76067, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=01fc9aca5b4b400240c836643ce76067, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year-day_of_week-half_hour

[GRID] Running config: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year-day_of_week-half_hour
[LOG] 28 rows for split=train, model_id=4ffc2bc9cbafcd0a3c3a88c7dc249f8d, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=validation, model_id=4ffc2bc9cbafcd0a3c3a88c7dc249f8d, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 28 rows for split=test, model_id=4ffc2bc9cbafcd0a3c3a88c7dc249f8d, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_5_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year-day_of_week-half_hour

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month
[LOG] 104 rows for split=train, model_id=26e85214faac3c481d8a2b1adb7dd942, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=26e85214faac3c481d8a2b1adb7dd942, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=26e85214faac3c481d8a2b1adb7dd942, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour
[LOG] 104 rows for split=train, model_id=ac305aea29f921f5326ed9ac86a39145, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=ac305aea29f921f5326ed9ac86a39145, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=ac305aea29f921f5326ed9ac86a39145, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year
[LOG] 104 rows for split=train, model_id=ddaa9bab325899d06c2eb37f4952900a, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=ddaa9bab325899d06c2eb37f4952900a, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=ddaa9bab325899d06c2eb37f4952900a, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_day_of_week
[LOG] 104 rows for split=train, model_id=81f4a03a59767ad2e33d963695af8cf1, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=81f4a03a59767ad2e33d963695af8cf1, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=81f4a03a59767ad2e33d963695af8cf1, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_day_of_week

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_half_hour
[LOG] 104 rows for split=train, model_id=dd6a1b118fca235f804faa6db02e12e9, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=dd6a1b118fca235f804faa6db02e12e9, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=dd6a1b118fca235f804faa6db02e12e9, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_half_hour

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour
[LOG] 104 rows for split=train, model_id=d05fd76f9ced643cf520c187fc3b677b, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=d05fd76f9ced643cf520c187fc3b677b, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=d05fd76f9ced643cf520c187fc3b677b, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year
[LOG] 104 rows for split=train, model_id=b9b64fe5df5c01eeeeb0c306f49e8db8, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=b9b64fe5df5c01eeeeb0c306f49e8db8, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=b9b64fe5df5c01eeeeb0c306f49e8db8, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-day_of_week
[LOG] 104 rows for split=train, model_id=3ac5ea5f69e1f8b9fe8414ef7613627f, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=3ac5ea5f69e1f8b9fe8414ef7613627f, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=3ac5ea5f69e1f8b9fe8414ef7613627f, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-day_of_week

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-half_hour
[LOG] 104 rows for split=train, model_id=f01be333f5fa430788452a38a9685b08, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=f01be333f5fa430788452a38a9685b08, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=f01be333f5fa430788452a38a9685b08, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-half_hour

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year
[LOG] 104 rows for split=train, model_id=76ad260a14f21a91e6e6fd325515cc75, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=76ad260a14f21a91e6e6fd325515cc75, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=76ad260a14f21a91e6e6fd325515cc75, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-day_of_week
[LOG] 104 rows for split=train, model_id=0aca5a3a1f342a3f70da93865a1d0d7a, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=0aca5a3a1f342a3f70da93865a1d0d7a, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=0aca5a3a1f342a3f70da93865a1d0d7a, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-day_of_week

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-half_hour
[LOG] 104 rows for split=train, model_id=7c38ca17362565aad068b13e060562cf, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=7c38ca17362565aad068b13e060562cf, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=7c38ca17362565aad068b13e060562cf, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-half_hour

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year-day_of_week
[LOG] 104 rows for split=train, model_id=e798be32d7001c353eb22a77c22e9d38, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=e798be32d7001c353eb22a77c22e9d38, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=e798be32d7001c353eb22a77c22e9d38, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year-day_of_week

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year-half_hour
[LOG] 104 rows for split=train, model_id=ab438c1a87c53e11d32711cae443f73f, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=ab438c1a87c53e11d32711cae443f73f, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=ab438c1a87c53e11d32711cae443f73f, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_week_of_year-half_hour

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_day_of_week-half_hour
[LOG] 104 rows for split=train, model_id=492b34fc7aa78f09c09590aa97222519, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=492b34fc7aa78f09c09590aa97222519, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=492b34fc7aa78f09c09590aa97222519, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_day_of_week-half_hour

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year
[LOG] 104 rows for split=train, model_id=327d4f29aacf53b70da271dbe50b80d5, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=327d4f29aacf53b70da271dbe50b80d5, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=327d4f29aacf53b70da271dbe50b80d5, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-week_of_year

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-day_of_week
[LOG] 104 rows for split=train, model_id=9a36b8e07b220240c3b444afa4d77c37, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=9a36b8e07b220240c3b444afa4d77c37, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=9a36b8e07b220240c3b444afa4d77c37, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-day_of_week

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-half_hour
[LOG] 104 rows for split=train, model_id=292d0a6a1f8d62aaea9fab80d2d58800, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=292d0a6a1f8d62aaea9fab80d2d58800, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=292d0a6a1f8d62aaea9fab80d2d58800, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-hour-half_hour

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year-day_of_week
[LOG] 104 rows for split=train, model_id=66e4a0d795bdd020d99bbb6e02913024, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=66e4a0d795bdd020d99bbb6e02913024, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=66e4a0d795bdd020d99bbb6e02913024, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year-day_of_week

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year-half_hour
[LOG] 104 rows for split=train, model_id=1e86e4c7640490045f0c3a0470d9ee2b, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=1e86e4c7640490045f0c3a0470d9ee2b, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=1e86e4c7640490045f0c3a0470d9ee2b, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-week_of_year-half_hour

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-day_of_week-half_hour
[LOG] 104 rows for split=train, model_id=3681ed78c8811934dd6147eb1b5d4d78, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=3681ed78c8811934dd6147eb1b5d4d78, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=3681ed78c8811934dd6147eb1b5d4d78, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_month-day_of_week-half_hour

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year-day_of_week
[LOG] 104 rows for split=train, model_id=0c7406d72b17f4b1422a3986921863d7, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=0c7406d72b17f4b1422a3986921863d7, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=0c7406d72b17f4b1422a3986921863d7, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year-day_of_week

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year-half_hour
[LOG] 104 rows for split=train, model_id=a9ab013232a8b1887af94c7e94b440d2, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=validation, model_id=a9ab013232a8b1887af94c7e94b440d2, random_state=12


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[f"{var}_group"].fillna(1, inplace=True)  # or fallback to len(edges)-1


[LOG] 103 rows for split=test, model_id=a9ab013232a8b1887af94c7e94b440d2, random_state=12
[SAVE] Summary saved to marginal_emissions_log.csv
[GRID] Logged: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-week_of_year-half_hour

[GRID] Running config: bin_10_surface_net_solar_radiation_kwh-temperature__x_total_generation-total_generation_sqrd__fe_hour-day_of_week-half_hour


KeyboardInterrupt: 

In [None]:
candidate_binning_vars = ["surface_net_solar_radiation_kwh", "wind_speed", "temperature", "precipitation_mm", "total_cloud_cover"]
candidate_x_vars = ["total_generation", "total_generation_sqrd"]
candidate_fe_vars = ["month", "hour", "week_of_year", "day_of_week", "half_hour"]

multi_median_param_grid = build_median_binner_configs(
    candidate_binning_vars=candidate_binning_vars,
    candidate_x_vars=candidate_x_vars,
    candidate_fe_vars=candidate_fe_vars
)

grid_search_config = [
    {
        "binner_class": MultiMedianBinner,
        "label": config["label"],
        "x_vars": config["x_vars"],
        "fe_vars": config["fe_vars"],
    }
    for config in multi_median_param_grid
]

NEXT STEPS:


Figure out how to extract metrics (average RMSE, MAE, R2, and MSE, MAPE)
- Figure out how to log regults

go through my process of feature adding - corelaton and other analysis


-experiment with adding a few more variables to the analysis

Develop mew models
- Run once with just the plain data (as before
once with the new variables that I think would be good
)



NEXT - MY PROCESS
- ATTEMPT POlYNOMIAL WITH 3 and 4 terms

ATTEMPT OLS

ATTEMPT GAMS

ATTEMPT 



: 