# In [None]:
# Pipeline
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Callable, Tuple, Self
import time
import gc
import warnings
import os
import shutil
import pickle
from flaml import AutoML
import os
from glob import glob
import inspect
import logging
logger = logging.getLogger(__name__)        
import cloudpickle
import hashlib
import inspect
from typing import Any, Dict, Optional


# Module-level seed constant; presumably meant for reproducible runs, but no
# code in the visible part of this file reads it — TODO confirm where it is used.
SEED = 42
def fallback_latest_notebook():
    """Return the most recently modified ``*.ipynb`` file of the working directory.

    Only the current directory is searched (the glob pattern is not recursive).

    Returns:
        The notebook path with the newest modification time, or ``None`` when
        no notebook matches.
    """
    candidates = glob("*.ipynb")
    if candidates:
        # max() keeps the first entry among equal mtimes, exactly like taking
        # element 0 of a stable descending sort.
        return max(candidates, key=os.path.getmtime)
    return None



# Silence every FutureWarning for the rest of the process (import-time side effect).
warnings.filterwarnings('ignore', category=FutureWarning)


class InDiskCacheWrapper:
    """
    Wrap a pipeline step so that the result of ``execute`` is cached on disk.

    Each result is pickled to ``<cache_dir>/<step.name>/<sha256>.pkl``. The
    digest is computed from the arguments passed to ``execute`` and from the
    step's instance attributes, so changing either one produces a cache miss.
    """
    def __init__(self, step: "PipelineStep", cache_dir: str = ".cache", execute_params: Optional[Dict[str, Any]] = None):
        """
        Args:
            step (PipelineStep): Step whose ``execute`` results are cached.
            cache_dir (str): Root cache directory; a sub-directory named after
                the step is created inside it.
            execute_params (Optional[Dict[str, Any]]): Parameters of the wrapped
                ``execute`` method, handed back by ``get_execute_params``.
        """
        self.step = step
        self.cache_dir = os.path.join(cache_dir, step.name)
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.cache_dir, exist_ok=True)
        self._execute_params = execute_params or {}

    def execute(self, *args: Any, **kwargs: Any) -> Any:
        """
        Return the cached result for these inputs, computing and storing it on a miss.

        Args:
            *args, **kwargs: Forwarded unchanged to the wrapped step's ``execute``.

        Returns:
            Any: Whatever the wrapped step returns (loaded from disk on a hit).

        Raises:
            ValueError: If the arguments or the step attributes cannot be serialized.
        """
        # Bind the call to parameter names so positional and keyword spellings
        # of the same call map to the same cache key.
        bound = inspect.signature(self.step.execute).bind(*args, **kwargs)
        bound.apply_defaults()

        # The step's attributes take part in the key: a step configured
        # differently must not reuse another configuration's result.
        init_params = self.step.__dict__.copy()

        try:
            serialized = cloudpickle.dumps(bound.arguments)
            serialized += cloudpickle.dumps(init_params)
        except Exception as e:
            raise ValueError(f"Failed to serialize for cache: {e}") from e

        hash_key = hashlib.sha256(serialized).hexdigest()
        cache_file = os.path.join(self.cache_dir, f"{hash_key}.pkl")

        if os.path.exists(cache_file):
            print(f"Loading cached result for {self.step.name} from {cache_file}")
            with open(cache_file, "rb") as f:
                # NOTE: unpickling runs arbitrary code; the cache directory must be trusted.
                return pickle.load(f)

        print(f"Cache miss for {self.step.name}, executing step and saving result to {cache_file}")
        result = self.step.execute(*args, **kwargs)

        # Write to a temporary file and rename it into place, so a failed or
        # interrupted dump never leaves a truncated file that a later run
        # would try to load as a valid cache entry.
        tmp_file = f"{cache_file}.{os.getpid()}.tmp"
        try:
            with open(tmp_file, "wb") as f:
                pickle.dump(result, f)
            os.replace(tmp_file, cache_file)
        except BaseException:
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
            raise
        return result

    def get_execute_params(self) -> Dict[str, Any]:
        """
        Get the parameters of the wrapped step's ``execute`` method.
        """
        return self._execute_params
    
    @property
    def name(self) -> str:
        """
        Get the name of the wrapped step.
        """
        return self.step.name
    

class InMemoryCacheWrapper:
    """
    Wrap a pipeline step so that the result of ``execute`` is cached in memory.

    The cache is a class-level dict shared by every wrapper instance, so each
    key includes the identity of the wrapped step as well as its inputs.
    """
    # Shared by all wrappers: maps sha256 digest -> cached result.
    cache = {}
    
    def __init__(self, step: "PipelineStep", execute_params: Optional[Dict[str, Any]] = None):
        """
        Args:
            step (PipelineStep): Step whose ``execute`` results are cached.
            execute_params (Optional[Dict[str, Any]]): Parameters of the wrapped
                ``execute`` method, handed back by ``get_execute_params``.
        """
        self.step = step
        self._execute_params = execute_params or {}

    def execute(self, *args: Any, **kwargs: Any) -> Any:
        """
        Return the cached result for these inputs, computing and storing it on a miss.

        Args:
            *args, **kwargs: Forwarded unchanged to the wrapped step's ``execute``.

        Returns:
            Any: Whatever the wrapped step returns (the stored object on a hit).

        Raises:
            ValueError: If the arguments or the step attributes cannot be serialized.
        """
        # Bind the call to parameter names so positional and keyword spellings
        # of the same call map to the same cache key.
        bound = inspect.signature(self.step.execute).bind(*args, **kwargs)
        bound.apply_defaults()

        # Step attributes are serialized separately from the call arguments.
        # Merging them into bound.arguments is wrong twice over:
        # apply_defaults() rebuilds the mapping from the signature (dropping
        # every attribute that is not a parameter), and an attribute that
        # shares a parameter's name would replace the real argument.
        init_params = self.step.__dict__.copy()

        try:
            serialized = cloudpickle.dumps(bound.arguments)
            serialized += cloudpickle.dumps(init_params)
        except Exception as e:
            raise ValueError(f"Failed to serialize for cache: {e}") from e

        # The cache dict is shared across all steps, so the step's class and
        # name are part of the key to keep unrelated steps apart.
        step_type = type(self.step)
        step_id = f"{step_type.__module__}.{step_type.__qualname__}:{self.step.name}"
        hasher = hashlib.sha256(step_id.encode("utf-8"))
        hasher.update(serialized)
        hash_key = hasher.hexdigest()

        if hash_key in self.cache:
            print(f"Loading cached result for {self.step.name} from memory")
            return self.cache[hash_key]

        print(f"Cache miss for {self.step.name}, executing step and saving result in memory")
        result = self.step.execute(*args, **kwargs)
        self.cache[hash_key] = result
        return result

    def get_execute_params(self) -> Dict[str, Any]:
        """
        Get the parameters of the wrapped step's ``execute`` method.
        """
        return self._execute_params
    
    @property
    def name(self) -> str:
        """
        Get the name of the wrapped step.
        """
        return self.step.name
    

class CachedPipelineMixin:
    """
    Mixin adding opt-in result caching to a pipeline step.

    Both methods return a *wrapper* around the step, not the step itself. The
    wrapper exposes ``execute``, ``get_execute_params`` and ``name``, which are
    the members the pipeline uses when running a step.
    """
    def in_disk_cache(self, cache_dir: str = ".cache") -> "InDiskCacheWrapper":
        """
        Wrap this step so its results are cached on disk.

        Args:
            cache_dir (str): Directory where the cache will be stored.

        Returns:
            InDiskCacheWrapper: Caching wrapper around this step.
        """
        execute_params = self.get_execute_params()
        return InDiskCacheWrapper(self, cache_dir=cache_dir, execute_params=execute_params)
    
    def in_memory_cache(self) -> "InMemoryCacheWrapper":
        """
        Wrap this step so its results are cached in memory.

        Returns:
            InMemoryCacheWrapper: Caching wrapper around this step.
        """
        execute_params = self.get_execute_params()
        return InMemoryCacheWrapper(self, execute_params=execute_params)
    

class PipelineStep(ABC, CachedPipelineMixin):
    """
    Base class every pipeline step derives from.

    Subclasses implement ``execute``. The pipeline reads that method's
    parameter names to decide which stored artifacts to pass in.
    """
    def __init__(self, name: Optional[str] = None):
        """
        Initialize a pipeline step.

        Args:
            name (str): Identifier used when reporting on the step; the class
                name is used when no (or an empty) name is given.
        """
        self._name = name if name else type(self).__name__

    @property
    def name(self):
        """Name of the step."""
        return self._name

    @name.setter
    def name(self, value):
        self._name = value

    @abstractmethod
    def execute(self, *args: Any, **kwargs: Any) -> None:
        """
        Run the step.

        Concrete steps declare the artifacts they need as named parameters; a
        parameter called ``pipeline`` receives the pipeline instance itself.
        """

    def get_execute_params(self):
        """Return the parameters of this step's ``execute`` method (a name -> ``inspect.Parameter`` mapping)."""
        return inspect.signature(self.execute).parameters

    def save_artifact(self, pipeline: "Pipeline", artifact_name: str, artifact: Any) -> None:
        """
        Store an artifact produced by this step in the pipeline.

        Args:
            pipeline (Pipeline): The pipeline instance.
            artifact_name (str): Name under which the artifact is stored.
            artifact (Any): The object to store.
        """
        pipeline.save_artifact(artifact_name, artifact)

    def get_artifact(self, pipeline: "Pipeline", artifact_name: str, default=None, raise_not_found=True) -> Any:
        """
        Fetch a stored artifact from the pipeline.

        Args:
            pipeline (Pipeline): The pipeline instance.
            artifact_name (str): Name of the artifact to fetch.
            default: Value handed to the pipeline as the fallback result.
            raise_not_found (bool): Forwarded to the pipeline; asks it to raise
                when the artifact is missing.

        Returns:
            Any: Whatever ``pipeline.get_artifact`` returns.
        """
        return pipeline.get_artifact(
            artifact_name,
            default=default,
            raise_not_found=raise_not_found,
        )

    def del_artifact(self, pipeline: "Pipeline", artifact_name: str, soft=True) -> None:
        """
        Remove a stored artifact from the pipeline.

        Args:
            pipeline (Pipeline): The pipeline instance.
            artifact_name (str): Name of the artifact to remove.
            soft (bool): Forwarded to the pipeline; ``False`` asks it to force
                a garbage collection after the removal.
        """
        pipeline.del_artifact(artifact_name, soft=soft)
    


class Pipeline:
    """
    Run a sequence of steps and store the artifacts they produce.

    Artifacts are kept either in memory or, when ``optimize_arftifacts_memory``
    is true, as pickle files in a temporary directory private to this pipeline.
    """
    def __init__(self, steps: Optional[List["PipelineStep"]] = None, optimize_arftifacts_memory: bool = True, needs=None):
        """
        Initialize the pipeline.

        Args:
            steps (Optional[List[PipelineStep]]): Steps to run, in order.
            optimize_arftifacts_memory (bool): If True, artifacts are pickled to
                disk and ``self.artifacts`` holds their file paths; otherwise
                ``self.artifacts`` holds the objects themselves.
            needs: Stored as ``self.needs``; not read by this class.
        """
        self.steps: List["PipelineStep"] = steps if steps is not None else []
        self.artifacts: Dict[str, Any] = {}
        self.optimize_arftifacts_memory = optimize_arftifacts_memory
        self.needs = needs or []
        self.finished = False
        # Defined here so the attribute exists before clear() is first called.
        self.last_step = None
        # Created lazily on the first on-disk save.
        self._artifact_dir: Optional[str] = None

    def _get_artifact_dir(self) -> str:
        """Return this pipeline's private artifact directory, creating it if needed."""
        import tempfile

        if self._artifact_dir is None or not os.path.isdir(self._artifact_dir):
            self._artifact_dir = tempfile.mkdtemp(prefix="pipeline_artifacts_")
        return self._artifact_dir

    def add_step(self, step: "PipelineStep", position: Optional[int] = None) -> None:
        """
        Add a new step to the pipeline.

        Args:
            step (PipelineStep): The step to add.
            position (Optional[int]): Position where to insert the step. If None, appends to the end.
        """
        if position is not None:
            self.steps.insert(position, step)
        else:
            self.steps.append(step)

    def save_artifact(self, artifact_name: str, artifact: Any) -> None:
        """
        Store an artifact under the given name, replacing any previous one.

        Args:
            artifact_name (str): Name to identify the artifact.
            artifact (Any): The artifact to save.
        """
        if not self.optimize_arftifacts_memory:
            self.artifacts[artifact_name] = artifact
            return
        # Pickle to a per-pipeline directory instead of a shared location, so
        # two pipelines using the same artifact name cannot overwrite each other.
        artifact_path = os.path.join(self._get_artifact_dir(), artifact_name)
        with open(artifact_path, 'wb') as f:
            pickle.dump(artifact, f)
        self.artifacts[artifact_name] = artifact_path

    def get_artifact(self, artifact_name: str, default=None, raise_not_found=True) -> Any:
        """
        Retrieve a stored artifact.

        Args:
            artifact_name (str): Name of the artifact to retrieve.
            default: Value returned when the artifact is missing and
                ``raise_not_found`` is False.
            raise_not_found (bool): Whether a missing artifact raises.

        Returns:
            Any: The requested artifact, or ``default``.

        Raises:
            FileNotFoundError: If the artifact is missing and ``raise_not_found``
                is True. The same exception is used in both storage modes.
        """
        if not self.optimize_arftifacts_memory:
            # Membership test, not .get(): a stored None is a valid artifact.
            if artifact_name in self.artifacts:
                return self.artifacts[artifact_name]
        else:
            artifact_path = self.artifacts.get(artifact_name)
            if artifact_path and os.path.exists(artifact_path):
                with open(artifact_path, 'rb') as f:
                    return pickle.load(f)
        if raise_not_found:
            raise FileNotFoundError(f"Artifact {artifact_name} not found")
        return default
    
    def del_artifact(self, artifact_name: str, soft=True) -> None:
        """
        Delete a stored artifact and free the resources it holds.

        Args:
            artifact_name (str): Name of the artifact to delete.
            soft (bool): If False, a garbage collection is forced afterwards.

        Raises:
            KeyError: If no artifact with that name is stored.
        """
        stored = self.artifacts[artifact_name]
        del self.artifacts[artifact_name]
        # In on-disk mode also remove the pickle file, but only when it lives
        # in the directory this pipeline created.
        if (
            self.optimize_arftifacts_memory
            and self._artifact_dir is not None
            and isinstance(stored, str)
            and os.path.dirname(stored) == self._artifact_dir
        ):
            try:
                os.remove(stored)
            except FileNotFoundError:
                pass
        if not soft:
            # Force garbage collection if not soft delete
            gc.collect()
    
    
    def run(self, verbose: bool = True, last_step_callback: Optional[Callable] = None) -> None:
        """
        Execute all steps in order, storing whatever artifacts each one returns.

        Args:
            verbose (bool): Print progress and per-step timing.
            last_step_callback (Optional[Callable]): Accepted for interface
                compatibility; not used by this method.
        """        
        
        # A finished pipeline is not re-run until clear() resets it.
        if self.finished:
            if verbose:
                print("Pipeline has already finished. Skipping execution.")
            return
        
        for step in self.steps:
            if verbose:
                print(f"Executing step: {step.name}")
            start_time = time.time()
            params = self.__fill_params_from_step(step)
            artifacts_to_save = step.execute(**params)
            if artifacts_to_save is None:
                artifacts_to_save = {}
            self.__save_step_artifacts(artifacts_to_save)
            end_time = time.time()
            if verbose:
                print(f"Step {step.name} completed in {end_time - start_time:.2f} seconds")
        self.finished = True

    def __fill_params_from_step(self, step: "PipelineStep") -> Dict[str, Any]:
        """
        Build the keyword arguments for a step's ``execute`` call.

        Each parameter of ``execute`` is filled with the artifact of the same
        name. A parameter named ``pipeline`` receives this pipeline. Parameters
        with a default fall back to it when the artifact is missing;
        parameters without one are required.
        """
        var_kinds = (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD)
        params = {}
        for name, param in step.get_execute_params().items():
            if param.kind in var_kinds:
                # *args / **kwargs do not name an artifact.
                continue
            if name == 'pipeline':
                params[name] = self
            elif param.default is inspect.Parameter.empty:
                params[name] = self.get_artifact(name)
            else:
                params[name] = self.get_artifact(name, default=param.default, raise_not_found=False)
        return params

    

    def __save_step_artifacts(self, artifacts_to_save: Dict[str, Any]) -> None:
        """
        Save artifacts produced by a step to the pipeline.

        Args:
            artifacts_to_save (Dict[str, Any]): Artifacts to save.
        """

        for name, artifact in artifacts_to_save.items():
            self.save_artifact(name, artifact)



    def clear(self, collect_garbage: bool = False) -> None:
        """
        Drop all artifacts (including their files on disk) and allow a new run.

        Args:
            collect_garbage (bool): If True, force a garbage collection after
                releasing the artifacts.
        """
        if collect_garbage:
            del self.artifacts
            gc.collect()
        self.artifacts = {}
        if self._artifact_dir is not None:
            shutil.rmtree(self._artifact_dir, ignore_errors=True)
            self._artifact_dir = None
        self.last_step = None
        self.finished = False

# LoadDataFrameFromPickleStep
import pandas as pd
from typing import Optional
import pickle

class LoadDataFrameStep(PipelineStep):
    """
    Step that reads a parquet file and publishes it as the ``df`` artifact.
    """
    def __init__(self, path: str, name: Optional[str] = None):
        """
        Args:
            path (str): Location handed to ``pd.read_parquet``.
            name (Optional[str]): Step name forwarded to ``PipelineStep``.
        """
        super().__init__(name)
        self.path = path

    def execute(self) -> None:
        """
        Read the parquet file, dropping the ``periodo`` column when present.

        Returns ``{"df": DataFrame}`` (the ``None`` annotation is kept from the
        original signature).
        """
        loaded = pd.read_parquet(self.path).drop(columns=["periodo"], errors='ignore')
        return {"df": loaded}
    

class LoadDataFrameFromPickleStep(PipelineStep):
    """
    Step that reads a pickled object with pandas and publishes it as the ``df`` artifact.
    """
    def __init__(self, path: str, name: Optional[str] = None):
        """
        Args:
            path (str): Location handed to ``pd.read_pickle``.
            name (Optional[str]): Step name forwarded to ``PipelineStep``.
        """
        super().__init__(name)
        self.path = path

    def execute(self) -> None:
        """
        Load the pickle file.

        Returns ``{"df": <loaded object>}`` (the ``None`` annotation is kept
        from the original signature).
        """
        # NOTE: unpickling runs arbitrary code; the file must come from a trusted source.
        return {"df": pd.read_pickle(self.path)}
    
    
class LoadScalerStep(PipelineStep):
    """
    Step that unpickles an object from a file and publishes it under a chosen artifact name.
    """
    def __init__(self, artifact_name: str, file_name: str, name: Optional[str] = None):
        """
        Args:
            artifact_name (str): Key under which the loaded object is returned.
            file_name (str): Path of the pickle file to read.
            name (Optional[str]): Step name forwarded to ``PipelineStep``.
        """
        super().__init__(name)
        self.artifact_name = artifact_name
        self.file_name = file_name

    def execute(self):
        """
        Load a previously saved scaler from ``file_name``.

        Returns a one-entry dict mapping ``artifact_name`` to the loaded object.
        """
        # NOTE: unpickling runs arbitrary code; the file must come from a trusted source.
        with open(self.file_name, "rb") as handle:
            loaded = pickle.load(handle)
        return {self.artifact_name: loaded}

# SplitDataFrameStep
from typing import Optional, Dict
import pandas as pd

# NOTE: a class with this same name is defined again further down this file;
# that later definition replaces this one once the module has executed.
class SplitDataFrameStep(PipelineStep):
    """
    Split a stored DataFrame into train and test row indexes using its ``fecha`` column.
    """
    def __init__(
            self, 
            test_date="2019-12", 
            df="df", 
            gap=0,
            name: Optional[str] = None
        ):
        """
        Args:
            test_date: Value compared against ``fecha``; rows equal to it form
                the test set and rows below it are training candidates.
            df (str): Name of the pipeline artifact holding the DataFrame.
            gap (int): Number of months subtracted from the last training date
                to obtain the training cut-off.
            name (Optional[str]): Step name forwarded to ``PipelineStep``.
        """
        super().__init__(name)
        self.test_date = test_date
        self.df = df
        self.gap = gap 

    def execute(self, pipeline) -> Dict:
        """
        Compute the train/test indexes.

        Args:
            pipeline: Pipeline from which the DataFrame artifact is read.

        Returns:
            Dict: ``{"train_index": ..., "test_index": ...}`` holding the index
            of the selected training and test rows.
        """
        df = pipeline.get_artifact(self.df)
        fecha = df["fecha"]
        test_df = df[fecha == self.test_date]
        train_df = df[fecha < self.test_date]
        last_train_date = train_df["fecha"].max()
        if isinstance(last_train_date, pd.Period):
            last_train_date = last_train_date.to_timestamp()
        gap_date = pd.to_datetime(last_train_date) - pd.DateOffset(months=self.gap)
        # Convert the cut-off back to a Period with the column's frequency.
        # The isinstance check on the dtype replaces the deprecated
        # pd.api.types.is_period_dtype.
        if isinstance(fecha.dtype, pd.PeriodDtype):
            gap_date = pd.Period(gap_date, freq=fecha.dt.freq)
        # NOTE(review): the comparison is strict, so even with gap=0 the most
        # recent pre-test date is excluded from training — confirm this is intended.
        train_df = train_df[train_df["fecha"] < gap_date]
        return {
            "train_index": train_df.index,
            "test_index": test_df.index
        }


# NOTE: a class with this same name is defined again further down this file;
# that later definition replaces this one once the module has executed.
class PrepareXYStep(PipelineStep):
    """
    Build the feature/target matrices for the train and test rows.
    """
    def execute(self, df, train_index, test_index) -> None:
        """
        Split the columns into features and targets, then slice both row sets.

        A column is a target when its name contains ``"target"``. Every other
        column except ``fecha`` is a feature.

        Returns a dict with ``features``, ``targets``, ``X_train``,
        ``y_train``, ``X_test`` and ``y_test`` (the ``None`` annotation is kept
        from the original signature).
        """
        feature_cols, target_cols = [], []
        for column in df.columns:
            if "target" in column:
                target_cols.append(column)
            elif column != "fecha":
                feature_cols.append(column)

        train_rows = df.loc[train_index]
        test_rows = df.loc[test_index]
        return {
            "features": feature_cols,
            "targets": target_cols,
            "X_train": train_rows[feature_cols],
            "y_train": train_rows[target_cols],
            "X_test": test_rows[feature_cols],
            "y_test": test_rows[target_cols],
        }


# NOTE: a class with this same name is defined again further down this file;
# that later definition replaces this one once the module has executed.
class CreateTargetColumStep(PipelineStep):
    """
    Add a ``target`` column holding the value of ``target_col`` two rows ahead within each series.
    """
    def __init__(self, name: Optional[str] = None, target_col: str = 'tn'):
        """
        Args:
            name (Optional[str]): Step name forwarded to ``PipelineStep``.
            target_col (str): Column the target is derived from.
        """
        super().__init__(name)
        self.target_col = target_col

    def execute(self, df: pd.DataFrame) -> Dict:
        """
        Sort by product, customer and date, then shift ``target_col`` by -2 per (product, customer) group.

        Returns:
            Dict: The sorted DataFrame with the new column under ``"df"`` and
            the source column name under ``"target_col"``.
        """
        group_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(group_keys + ['fecha'])
        per_series = ordered.groupby(group_keys)[self.target_col]
        ordered['target'] = per_series.shift(-2)
        return {"df": ordered, "target_col": self.target_col}
    

# NOTE: a class with this same name is defined again further down this file;
# that later definition replaces this one once the module has executed.
class CreateMultiDiffTargetColumStep(PipelineStep):
    """
    Add two difference targets: the change one row ahead and the following row-to-row change.
    """
    def __init__(self, name: Optional[str] = None, target_col: str = 'tn'):
        """
        Args:
            name (Optional[str]): Step name forwarded to ``PipelineStep``.
            target_col (str): Column the targets are derived from.
        """
        super().__init__(name)
        self.target_col = target_col

    def execute(self, df: pd.DataFrame) -> Dict:
        """
        Sort by product, customer and date and compute ``target_1`` / ``target_2``.

        ``target_1`` is value(t+1) - value(t) and ``target_2`` is
        value(t+2) - value(t+1), both within each (product, customer) group.

        Returns:
            Dict: The sorted DataFrame (``"df"``), the source column name
            (``"target_col"``) and ``"needs_integration": True``.
        """
        group_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(group_keys + ['fecha'])
        per_series = ordered.groupby(group_keys)[self.target_col]
        one_ahead = per_series.shift(-1)
        two_ahead = per_series.shift(-2)
        ordered['target_1'] = one_ahead - ordered[self.target_col]
        ordered['target_2'] = two_ahead - one_ahead
        return {
            "df": ordered,
            "target_col": self.target_col,
            "needs_integration": True,
        }


# NOTE: a class with this same name is defined again further down this file;
# that later definition replaces this one once the module has executed.
class CreateTargetColumDiffStep(PipelineStep):
    """
    Add a ``target`` column holding the change of ``target_col`` two rows ahead within each series.
    """
    def __init__(self, name: Optional[str] = None, target_col: str = 'tn'):
        """
        Args:
            name (Optional[str]): Step name forwarded to ``PipelineStep``.
            target_col (str): Column the target is derived from.
        """
        super().__init__(name)
        self.target_col = target_col

    def execute(self, df: pd.DataFrame) -> Dict:
        """
        Replace any existing ``target`` column with value(t+2) - value(t) per (product, customer) group.

        The input DataFrame is left untouched; a new, sorted frame is returned.

        Returns:
            Dict: The new DataFrame (``"df"``), the source column name
            (``"target_col"``) and ``"needs_integration": True``.
        """
        # Non-mutating drop: an in-place drop would silently remove the column
        # from the caller's DataFrame as well.
        df = df.drop(columns=["target"], errors='ignore')
        df = df.sort_values(['product_id', 'customer_id', 'fecha'])
        df['target'] = df.groupby(['product_id', 'customer_id'])[self.target_col].shift(-2) - df[self.target_col]
        return {
            "df": df, 
            "target_col": self.target_col,
            "needs_integration": True,
        }
    

# NOTE: a class with this same name is defined again further down this file;
# that later definition replaces this one once the module has executed.
class PredictStep(PipelineStep):
    """
    Run ``model.predict`` on the feature columns of the test rows.
    """
    def execute(self, df, test_index, model, features) -> None:
        """
        Select the test rows and feature columns, then predict.

        Returns ``{"predictions": <model output>}`` (the ``None`` annotation is
        kept from the original signature).
        """
        test_rows = df.loc[test_index]
        return {"predictions": model.predict(test_rows[features])}


# NOTE: a class with this same name is defined again further down this file;
# that later definition replaces this one once the module has executed.
class IntegratePredictionsStep(PipelineStep):
    """
    Turn difference-based predictions and targets back into absolute values.
    """
    def execute(self, df, predictions, test_index, target_col, needs_integration=False) -> Dict:
        """
        Integrate predictions when the targets were built as differences.

        Without integration only ``y_test`` (the ``target`` column of the test
        rows) is returned. Otherwise the predicted differences are summed and
        added to the current ``target_col`` value, and the same is done with
        every column whose name contains ``"target"`` to rebuild ``y_test``.

        Returns:
            Dict: ``{"y_test": ...}`` or ``{"predictions": ..., "y_test": ...}``.
        """
        if needs_integration:
            # Collapse the predicted differences into one value per test row.
            if predictions.ndim == 1:
                predicted_delta = pd.Series(predictions, index=test_index, name='predictions')
            else:
                predicted_delta = predictions.sum(axis=1)
            current_value = df.loc[test_index, target_col]
            integrated = pd.Series(predicted_delta + current_value, index=test_index, name='predictions')

            target_like = [column for column in df.columns if 'target' in column]
            actual = df.loc[test_index, target_like].sum(axis=1) + current_value
            return {
                "predictions": integrated,
                "y_test": pd.DataFrame(actual, index=test_index, columns=["target"]),
            }
        return {
            "y_test": df.loc[test_index, ["target"]]
        }
    

## legacy code
# NOTE: a class with this same name is defined again further down this file;
# that later definition replaces this one once the module has executed.
class IntegratePredictionsStepOld(PipelineStep):
    """Legacy integration step that works on whole DataFrames instead of indexes."""
    def execute(self, pipeline, predict_set, predictions, target_col, test) -> Dict:
        """
        Add the base ``target_col`` values back onto predictions and test targets.

        ``predictions`` and ``test`` are modified in place and also returned.

        Args:
            pipeline: Pipeline from which the ``predict_set`` artifact is read.
            predict_set: Name of the artifact holding the rows that were predicted.
            predictions: Object with a ``"predictions"`` column.
            target_col: Column holding the base values.
            test: Test DataFrame with ``"target"`` and ``target_col`` columns.
        """
        base_frame = pipeline.get_artifact(predict_set)
        integrated = predictions["predictions"] + base_frame[target_col]
        predictions["predictions"] = integrated
        restored = test["target"] + test[target_col]
        test["target"] = restored
        return {
            "predictions": predictions,
            "test": test
        }



# NOTE: a class with this same name is defined again further down this file;
# that later definition replaces this one once the module has executed.
class SplitDataFrameStepOld(PipelineStep):
    """Legacy split that slices the DataFrame by the positions of its last dates."""
    def __init__(self, name: Optional[str] = None):
        super().__init__(name)

    def execute(self, df) -> None:
        """
        Split ``df`` by its distinct ``fecha`` values.

        The last date forms ``kaggle_pred``, the third-from-last ``test``, the
        fourth-from-last ``eval_data``, and everything before that ``train``.
        The second-from-last date is left out of every set.

        Returns a dict with those four frames (the ``None`` annotation is kept
        from the original signature).
        """
        fecha = df["fecha"]
        dates = sorted(fecha.unique())
        newest = dates[-1]
        test_date = dates[-3]  # one date is skipped between test and the newest date
        eval_date = dates[-4]

        return {
            "train": df[(fecha < eval_date)],
            "eval_data": df[fecha == eval_date],
            "test": df[fecha == test_date],
            "kaggle_pred": df[fecha == newest]
        }
    

# NOTE: a class with this same name is defined again further down this file;
# that later definition replaces this one once the module has executed.
class PrepareXYStepOld(PipelineStep):
    """Legacy step that builds every X/y combination from the four pre-split frames."""
    def __init__(self, name: Optional[str] = None):
        super().__init__(name)

    def execute(self, train, eval_data, test, kaggle_pred) -> None:
        """
        Build feature/target sets for training, evaluation, testing and final prediction.

        Features are all columns of ``train`` except ``fecha`` and ``target``.

        Returns a dict with the eleven ``X_*`` / ``y_*`` entries (the ``None``
        annotation is kept from the original signature).
        """
        target = 'target'
        excluded = ['fecha', 'target']
        features = [column for column in train.columns if column not in excluded]

        train_X, train_y = train[features], train[target]
        eval_X, eval_y = eval_data[features], eval_data[target]
        test_X, test_y = test[features], test[target]

        return {
            # train + eval, used to fit before scoring on test
            "X_train": pd.concat([train_X, eval_X]),
            "y_train": pd.concat([train_y, eval_y]),
            "X_train_alone": train_X,
            "y_train_alone": train_y,
            "X_eval": eval_X,
            "y_eval": eval_y,
            "X_test": test_X,
            "y_test": test_y,
            # train + eval + test, used for the final fit
            "X_train_final": pd.concat([train_X, eval_X, test_X]),
            "y_train_final": pd.concat([train_y, eval_y, test_y]),
            "X_kaggle": kaggle_pred[features]
        }
        

# CreateTargetColumStep
from typing import Optional, Dict
import pandas as pd

# NOTE: this redefines a class of the same name declared earlier in this file;
# being the later definition, it is the one in effect.
class SplitDataFrameStep(PipelineStep):
    """
    Split a stored DataFrame into train and test row indexes using its ``fecha`` column.
    """
    def __init__(
            self, 
            test_date="2019-12", 
            df="df", 
            gap=0,
            name: Optional[str] = None
        ):
        """
        Args:
            test_date: Value compared against ``fecha``; rows equal to it form
                the test set and rows below it are training candidates.
            df (str): Name of the pipeline artifact holding the DataFrame.
            gap (int): Number of months subtracted from the last training date
                to obtain the training cut-off.
            name (Optional[str]): Step name forwarded to ``PipelineStep``.
        """
        super().__init__(name)
        self.test_date = test_date
        self.df = df
        self.gap = gap 

    def execute(self, pipeline) -> Dict:
        """
        Compute the train/test indexes.

        Args:
            pipeline: Pipeline from which the DataFrame artifact is read.

        Returns:
            Dict: ``{"train_index": ..., "test_index": ...}`` holding the index
            of the selected training and test rows.
        """
        df = pipeline.get_artifact(self.df)
        fecha = df["fecha"]
        test_df = df[fecha == self.test_date]
        train_df = df[fecha < self.test_date]
        last_train_date = train_df["fecha"].max()
        if isinstance(last_train_date, pd.Period):
            last_train_date = last_train_date.to_timestamp()
        gap_date = pd.to_datetime(last_train_date) - pd.DateOffset(months=self.gap)
        # Convert the cut-off back to a Period with the column's frequency.
        # The isinstance check on the dtype replaces the deprecated
        # pd.api.types.is_period_dtype.
        if isinstance(fecha.dtype, pd.PeriodDtype):
            gap_date = pd.Period(gap_date, freq=fecha.dt.freq)
        # NOTE(review): the comparison is strict, so even with gap=0 the most
        # recent pre-test date is excluded from training — confirm this is intended.
        train_df = train_df[train_df["fecha"] < gap_date]
        return {
            "train_index": train_df.index,
            "test_index": test_df.index
        }


# NOTE: this redefines a class of the same name declared earlier in this file;
# being the later definition, it is the one in effect.
class PrepareXYStep(PipelineStep):
    """
    Build the feature/target matrices for the train and test rows.
    """
    def execute(self, df, train_index, test_index) -> None:
        """
        Split the columns into features and targets, then slice both row sets.

        A column is a target when its name contains ``"target"``. Every other
        column except ``fecha`` is a feature.

        Returns a dict with ``features``, ``targets``, ``X_train``,
        ``y_train``, ``X_test`` and ``y_test`` (the ``None`` annotation is kept
        from the original signature).
        """
        feature_cols, target_cols = [], []
        for column in df.columns:
            if "target" in column:
                target_cols.append(column)
            elif column != "fecha":
                feature_cols.append(column)

        train_rows = df.loc[train_index]
        test_rows = df.loc[test_index]
        return {
            "features": feature_cols,
            "targets": target_cols,
            "X_train": train_rows[feature_cols],
            "y_train": train_rows[target_cols],
            "X_test": test_rows[feature_cols],
            "y_test": test_rows[target_cols],
        }


# NOTE: this redefines a class of the same name declared earlier in this file;
# being the later definition, it is the one in effect.
class CreateTargetColumStep(PipelineStep):
    """
    Add a ``target`` column holding the value of ``target_col`` two rows ahead within each series.
    """
    def __init__(self, name: Optional[str] = None, target_col: str = 'tn'):
        """
        Args:
            name (Optional[str]): Step name forwarded to ``PipelineStep``.
            target_col (str): Column the target is derived from.
        """
        super().__init__(name)
        self.target_col = target_col

    def execute(self, df: pd.DataFrame) -> Dict:
        """
        Sort by product, customer and date, then shift ``target_col`` by -2 per (product, customer) group.

        Returns:
            Dict: The sorted DataFrame with the new column under ``"df"`` and
            the source column name under ``"target_col"``.
        """
        group_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(group_keys + ['fecha'])
        per_series = ordered.groupby(group_keys)[self.target_col]
        ordered['target'] = per_series.shift(-2)
        return {"df": ordered, "target_col": self.target_col}
    

# NOTE: this redefines a class of the same name declared earlier in this file;
# being the later definition, it is the one in effect.
class CreateMultiDiffTargetColumStep(PipelineStep):
    """
    Add two difference targets: the change one row ahead and the following row-to-row change.
    """
    def __init__(self, name: Optional[str] = None, target_col: str = 'tn'):
        """
        Args:
            name (Optional[str]): Step name forwarded to ``PipelineStep``.
            target_col (str): Column the targets are derived from.
        """
        super().__init__(name)
        self.target_col = target_col

    def execute(self, df: pd.DataFrame) -> Dict:
        """
        Sort by product, customer and date and compute ``target_1`` / ``target_2``.

        ``target_1`` is value(t+1) - value(t) and ``target_2`` is
        value(t+2) - value(t+1), both within each (product, customer) group.

        Returns:
            Dict: The sorted DataFrame (``"df"``), the source column name
            (``"target_col"``) and ``"needs_integration": True``.
        """
        group_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(group_keys + ['fecha'])
        per_series = ordered.groupby(group_keys)[self.target_col]
        one_ahead = per_series.shift(-1)
        two_ahead = per_series.shift(-2)
        ordered['target_1'] = one_ahead - ordered[self.target_col]
        ordered['target_2'] = two_ahead - one_ahead
        return {
            "df": ordered,
            "target_col": self.target_col,
            "needs_integration": True,
        }


# NOTE: this redefines a class of the same name declared earlier in this file;
# being the later definition, it is the one in effect.
class CreateTargetColumDiffStep(PipelineStep):
    """
    Add a ``target`` column holding the change of ``target_col`` two rows ahead within each series.
    """
    def __init__(self, name: Optional[str] = None, target_col: str = 'tn'):
        """
        Args:
            name (Optional[str]): Step name forwarded to ``PipelineStep``.
            target_col (str): Column the target is derived from.
        """
        super().__init__(name)
        self.target_col = target_col

    def execute(self, df: pd.DataFrame) -> Dict:
        """
        Replace any existing ``target`` column with value(t+2) - value(t) per (product, customer) group.

        The input DataFrame is left untouched; a new, sorted frame is returned.

        Returns:
            Dict: The new DataFrame (``"df"``), the source column name
            (``"target_col"``) and ``"needs_integration": True``.
        """
        # Non-mutating drop: an in-place drop would silently remove the column
        # from the caller's DataFrame as well.
        df = df.drop(columns=["target"], errors='ignore')
        df = df.sort_values(['product_id', 'customer_id', 'fecha'])
        df['target'] = df.groupby(['product_id', 'customer_id'])[self.target_col].shift(-2) - df[self.target_col]
        return {
            "df": df, 
            "target_col": self.target_col,
            "needs_integration": True,
        }
    

# NOTE: this redefines a class of the same name declared earlier in this file;
# being the later definition, it is the one in effect.
class PredictStep(PipelineStep):
    """
    Run ``model.predict`` on the feature columns of the test rows.
    """
    def execute(self, df, test_index, model, features) -> None:
        """
        Select the test rows and feature columns, then predict.

        Returns ``{"predictions": <model output>}`` (the ``None`` annotation is
        kept from the original signature).
        """
        test_rows = df.loc[test_index]
        return {"predictions": model.predict(test_rows[features])}


# NOTE: this redefines a class of the same name declared earlier in this file;
# being the later definition, it is the one in effect.
class IntegratePredictionsStep(PipelineStep):
    """
    Turn difference-based predictions and targets back into absolute values.
    """
    def execute(self, df, predictions, test_index, target_col, needs_integration=False) -> Dict:
        """
        Integrate predictions when the targets were built as differences.

        Without integration only ``y_test`` (the ``target`` column of the test
        rows) is returned. Otherwise the predicted differences are summed and
        added to the current ``target_col`` value, and the same is done with
        every column whose name contains ``"target"`` to rebuild ``y_test``.

        Returns:
            Dict: ``{"y_test": ...}`` or ``{"predictions": ..., "y_test": ...}``.
        """
        if needs_integration:
            # Collapse the predicted differences into one value per test row.
            if predictions.ndim == 1:
                predicted_delta = pd.Series(predictions, index=test_index, name='predictions')
            else:
                predicted_delta = predictions.sum(axis=1)
            current_value = df.loc[test_index, target_col]
            integrated = pd.Series(predicted_delta + current_value, index=test_index, name='predictions')

            target_like = [column for column in df.columns if 'target' in column]
            actual = df.loc[test_index, target_like].sum(axis=1) + current_value
            return {
                "predictions": integrated,
                "y_test": pd.DataFrame(actual, index=test_index, columns=["target"]),
            }
        return {
            "y_test": df.loc[test_index, ["target"]]
        }
    

## legacy code
# NOTE: this redefines a class of the same name declared earlier in this file;
# being the later definition, it is the one in effect.
class IntegratePredictionsStepOld(PipelineStep):
    """Legacy integration step that works on whole DataFrames instead of indexes."""
    def execute(self, pipeline, predict_set, predictions, target_col, test) -> Dict:
        """
        Add the base ``target_col`` values back onto predictions and test targets.

        ``predictions`` and ``test`` are modified in place and also returned.

        Args:
            pipeline: Pipeline from which the ``predict_set`` artifact is read.
            predict_set: Name of the artifact holding the rows that were predicted.
            predictions: Object with a ``"predictions"`` column.
            target_col: Column holding the base values.
            test: Test DataFrame with ``"target"`` and ``target_col`` columns.
        """
        base_frame = pipeline.get_artifact(predict_set)
        integrated = predictions["predictions"] + base_frame[target_col]
        predictions["predictions"] = integrated
        restored = test["target"] + test[target_col]
        test["target"] = restored
        return {
            "predictions": predictions,
            "test": test
        }



# NOTE: this redefines a class of the same name declared earlier in this file;
# being the later definition, it is the one in effect.
class SplitDataFrameStepOld(PipelineStep):
    """Legacy split that slices the DataFrame by the positions of its last dates."""
    def __init__(self, name: Optional[str] = None):
        super().__init__(name)

    def execute(self, df) -> None:
        """
        Split ``df`` by its distinct ``fecha`` values.

        The last date forms ``kaggle_pred``, the third-from-last ``test``, the
        fourth-from-last ``eval_data``, and everything before that ``train``.
        The second-from-last date is left out of every set.

        Returns a dict with those four frames (the ``None`` annotation is kept
        from the original signature).
        """
        fecha = df["fecha"]
        dates = sorted(fecha.unique())
        newest = dates[-1]
        test_date = dates[-3]  # one date is skipped between test and the newest date
        eval_date = dates[-4]

        return {
            "train": df[(fecha < eval_date)],
            "eval_data": df[fecha == eval_date],
            "test": df[fecha == test_date],
            "kaggle_pred": df[fecha == newest]
        }
    

# NOTE: this redefines a class of the same name declared earlier in this file;
# being the later definition, it is the one in effect.
class PrepareXYStepOld(PipelineStep):
    """Legacy step that builds every X/y combination from the four pre-split frames."""
    def __init__(self, name: Optional[str] = None):
        super().__init__(name)

    def execute(self, train, eval_data, test, kaggle_pred) -> None:
        """
        Build feature/target sets for training, evaluation, testing and final prediction.

        Features are all columns of ``train`` except ``fecha`` and ``target``.

        Returns a dict with the eleven ``X_*`` / ``y_*`` entries (the ``None``
        annotation is kept from the original signature).
        """
        target = 'target'
        excluded = ['fecha', 'target']
        features = [column for column in train.columns if column not in excluded]

        train_X, train_y = train[features], train[target]
        eval_X, eval_y = eval_data[features], eval_data[target]
        test_X, test_y = test[features], test[target]

        return {
            # train + eval, used to fit before scoring on test
            "X_train": pd.concat([train_X, eval_X]),
            "y_train": pd.concat([train_y, eval_y]),
            "X_train_alone": train_X,
            "y_train_alone": train_y,
            "X_eval": eval_X,
            "y_eval": eval_y,
            "X_test": test_X,
            "y_test": test_y,
            # train + eval + test, used for the final fit
            "X_train_final": pd.concat([train_X, eval_X, test_X]),
            "y_train_final": pd.concat([train_y, eval_y, test_y]),
            "X_kaggle": kaggle_pred[features]
        }
        

# ScaleFeatureStep
import pandas as pd
from abc import ABC, abstractmethod
from typing import Dict, Optional


class PipelineScaler(ABC):
    """
    Interface for scalers that operate on a single DataFrame column.

    Subclasses keep whatever they learn in ``scaler_data``, which starts out
    as ``None``.
    """

    def __init__(self, column: str):
        """
        Args:
            column (str): Name of the column this scaler works on.
        """
        self.scaler_data = None
        self.column = column

    @abstractmethod
    def fit(self, df: pd.DataFrame):
        """Learn the scaling parameters from ``df``."""

    @abstractmethod
    def transform(self, df: pd.DataFrame) -> pd.Series:
        """Return the scaled version of the column in ``df``."""

    @abstractmethod
    def fit_transform(self, df: pd.DataFrame) -> pd.Series:
        """Fit on ``df`` and return its scaled column."""

    @abstractmethod
    def inverse_transform(self, df: pd.DataFrame) -> pd.Series:
        """Map scaled values in ``df`` back to the original scale."""

# TODO: apply a log1p transformation if necessary
# TODO: debug this; for some reason it produces wrong results

class PipelineRobustScaler(PipelineScaler):
    """Scale ``self.column`` per (product_id, customer_id) series as (x - median) / IQR."""

    def fit(self, df: pd.DataFrame):
        """Compute the per-series median and interquartile range of ``self.column``.

        Returns:
            ``self``, so calls can be chained.
        """
        grouped = df.groupby(['product_id', 'customer_id'])[self.column]  # SeriesGroupBy
        # Built-in group quantiles instead of a Python lambda per group.
        iqr = grouped.quantile(0.75) - grouped.quantile(0.25)

        agg = pd.DataFrame({
            f'{self.column}_median_scaler': grouped.median(),
            f'{self.column}_iqr_scaler': iqr
        })
        print(agg.head())
        self.scaler_data = agg
        return self

    def _with_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        """Left-join the fitted statistics onto ``df`` and restore ``df``'s index."""
        if self.scaler_data is None:
            raise ValueError("Scaler has not been fitted yet.")
        merged = df.merge(self.scaler_data, on=['product_id', 'customer_id'], how='left')
        # merge() resets the index; a left join keeps the row order, so put it back.
        merged.index = df.index
        return merged

    def transform(self, df: pd.DataFrame) -> pd.Series:
        """Return the scaled column, named ``'<column>_scaled'``, indexed like ``df``.

        Values that cannot be scaled (zero IQR, series unseen during ``fit``)
        become 0; rows that were NaN in the input stay NaN.

        Raises:
            ValueError: if ``fit`` has not been called.
        """
        merged = self._with_stats(df)
        original_nans = df[self.column].isna()
        nan = float('nan')

        scaled = (
            merged[self.column] - merged[f'{self.column}_median_scaler']
        ) / merged[f'{self.column}_iqr_scaler']
        # Work on the Series and reassign: an in-place replace on ``df[col]``
        # is a chained assignment that does not reach the frame under
        # copy-on-write, and replacing with pd.NA turns a float column into object.
        scaled = scaled.replace([float('inf'), float('-inf')], nan).fillna(0)
        # Restore the NaNs that were already present in the input.
        scaled = scaled.mask(original_nans, nan)
        return scaled.rename(f"{self.column}_scaled")

    def fit_transform(self, df: pd.DataFrame) -> pd.Series:
        """Fit on ``df`` and return its scaled column."""
        return self.fit(df).transform(df)

    def inverse_transform(self, df: pd.DataFrame) -> pd.Series:
        """Map the values in ``df[self.column]`` back to the original scale.

        Raises:
            ValueError: if ``fit`` has not been called.
        """
        merged = self._with_stats(df)
        restored = (
            merged[self.column] * merged[f'{self.column}_iqr_scaler']
        ) + merged[f'{self.column}_median_scaler']
        return restored.rename(f"{self.column}")


class PipelineStandarScaler(PipelineScaler):
    """Scale ``self.column`` per (product_id, customer_id) series as (x - mean) / std."""

    def fit(self, df: pd.DataFrame):
        """Compute the per-series mean and standard deviation of ``self.column``.

        Returns:
            ``self``, so calls can be chained.
        """
        agg = df.groupby(['product_id', 'customer_id'])[self.column].agg(['mean', 'std']).rename(
            columns={'mean': f'{self.column}_mean_scaler', 'std': f'{self.column}_std_scaler'})
        self.scaler_data = agg
        return self

    def _with_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        """Left-join the fitted statistics onto ``df`` and restore ``df``'s index."""
        if self.scaler_data is None:
            raise ValueError("Scaler has not been fitted yet.")
        merged = df.merge(self.scaler_data, on=['product_id', 'customer_id'], how='left')
        # merge() resets the index; a left join keeps the row order, so put it back.
        merged.index = df.index
        return merged

    def transform(self, df: pd.DataFrame) -> pd.Series:
        """Return the scaled column, named ``'<column>_scaled'``, indexed like ``df``.

        Values that cannot be scaled (zero or undefined std, series unseen
        during ``fit``) become 0; rows that were NaN in the input stay NaN.

        Raises:
            ValueError: if ``fit`` has not been called.
        """
        merged = self._with_stats(df)
        original_nans = df[self.column].isna()
        nan = float('nan')

        scaled = (
            merged[self.column] - merged[f'{self.column}_mean_scaler']
        ) / merged[f'{self.column}_std_scaler']
        # Work on the Series and reassign: an in-place replace on ``df[col]``
        # is a chained assignment that does not reach the frame under
        # copy-on-write, and replacing with pd.NA turns a float column into object.
        scaled = scaled.replace([float('inf'), float('-inf')], nan).fillna(0)
        # Restore the NaNs that were already present in the input.
        scaled = scaled.mask(original_nans, nan)
        return scaled.rename(f"{self.column}_scaled")

    def fit_transform(self, df: pd.DataFrame) -> pd.Series:
        """Fit on ``df`` and return its scaled column."""
        return self.fit(df).transform(df)

    def inverse_transform(self, df: pd.DataFrame) -> pd.Series:
        """Map the values in ``df[self.column]`` back to the original scale.

        Raises:
            ValueError: if ``fit`` has not been called.
        """
        merged = self._with_stats(df)
        restored = (
            merged[self.column] * merged[f'{self.column}_std_scaler']
        ) + merged[f'{self.column}_mean_scaler']
        return restored.rename(f"{self.column}")
    

class PipelineMinMaxScaler(PipelineScaler):
    """Scale ``self.column`` per (product_id, customer_id) series by its maximum.

    The fitted minimum is forced to 0, so the transform reduces to ``x / max``.
    """

    def fit(self, df: pd.DataFrame):
        """Compute the per-series maximum of ``self.column`` (minimum pinned to 0).

        Returns:
            ``self``, so calls can be chained.
        """
        agg = df.groupby(['product_id', 'customer_id'])[self.column].agg(['min', 'max']).rename(
            columns={'min': f'{self.column}_min_scaler', 'max': f'{self.column}_max_scaler'})
        # Pin the minimum to 0 so every series shares the same lower bound.
        agg[f'{self.column}_min_scaler'] = 0
        self.scaler_data = agg
        return self

    def _with_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        """Left-join the fitted statistics onto ``df`` and restore ``df``'s index."""
        if self.scaler_data is None:
            raise ValueError("Scaler has not been fitted yet.")
        merged = df.merge(self.scaler_data, on=['product_id', 'customer_id'], how='left')
        # merge() resets the index; a left join keeps the row order, so put it back.
        merged.index = df.index
        return merged

    def transform(self, df: pd.DataFrame) -> pd.Series:
        """Return the scaled column, named ``'<column>_scaled'``, indexed like ``df``.

        Values that cannot be scaled (zero range, series unseen during
        ``fit``) become 0; rows that were NaN in the input stay NaN.

        Raises:
            ValueError: if ``fit`` has not been called.
        """
        merged = self._with_stats(df)
        original_nans = df[self.column].isna()
        nan = float('nan')

        low = merged[f'{self.column}_min_scaler']
        high = merged[f'{self.column}_max_scaler']
        scaled = (merged[self.column] - low) / (high - low)
        # Work on the Series and reassign: an in-place replace on ``df[col]``
        # is a chained assignment that does not reach the frame under
        # copy-on-write, and replacing with pd.NA turns a float column into object.
        scaled = scaled.replace([float('inf'), float('-inf')], nan).fillna(0)
        # Restore the NaNs that were already present in the input.
        scaled = scaled.mask(original_nans, nan)
        return scaled.rename(f"{self.column}_scaled")

    def fit_transform(self, df: pd.DataFrame) -> pd.Series:
        """Fit on ``df`` and return its scaled column."""
        return self.fit(df).transform(df)

    def inverse_transform(self, df: pd.DataFrame) -> pd.Series:
        """Map the values in ``df[self.column]`` back to the original scale.

        Raises:
            ValueError: if ``fit`` has not been called.
        """
        merged = self._with_stats(df)
        low = merged[f'{self.column}_min_scaler']
        high = merged[f'{self.column}_max_scaler']
        restored = (merged[self.column] * (high - low)) + low
        return restored.rename(f"{self.column}")
    

class ScaleFeatureStep(PipelineStep):
    """Fit one scaler per selected column and write the scaled values into ``df``.

    ``column`` is either a column name or, when ``regex`` is true, a regular
    expression selecting several columns. With ``override`` the scaled values
    replace the source column; otherwise they go to ``'<column>_scaled'``.
    """

    def __init__(self, column: str, regex=False, override=False, scaler=PipelineStandarScaler, name=None):
        super().__init__(name)
        self.column = column
        self.scaler_cls = scaler
        self.regex = regex
        self.override = override

    def _columns_to_scale(self, df: pd.DataFrame) -> list:
        """Resolve ``self.column`` into the concrete list of columns to scale."""
        if not self.regex:
            return [self.column]
        matched = df.filter(regex=self.column, axis=1).columns.tolist()
        print(f"Columns found matching regex '{self.column}': {matched}")
        if not matched:
            raise ValueError(f"No columns found matching regex '{self.column}'")
        return matched

    def execute(self, df: pd.DataFrame, train_index) -> Dict:
        """Scale the selected columns in place and return the fitted scalers.

        Returns:
            ``{"df": df, "scaler_<output column>": scaler, ...}``.

        Raises:
            ValueError: if ``regex`` is set and no column matches.
        """
        # NOTE(review): ``train_index`` is accepted but never used; each scaler
        # is fitted on every row of ``df`` -- confirm this is intended.
        fitted = {}
        for source in self._columns_to_scale(df):
            destination = source if self.override else f"{source}_scaled"
            needed = ["product_id", "customer_id", source]
            scaler = self.scaler_cls(column=source)
            scaler.fit(df[needed])
            df[destination] = scaler.transform(df[needed])
            fitted[f"scaler_{destination}"] = scaler
        return {"df": df, **fitted}
    

class InverseScalePredictionsStep(PipelineStep):
    """Undo the target scaling on both the predictions and ``df['target']``."""

    def execute(self, predictions, df, test_index, scaler_target=None) -> Dict:
        """Inverse-scale ``predictions`` and the ``target`` column with ``scaler_target``.

        Returns ``None`` without doing anything when no scaler is supplied.
        Otherwise returns the rescaled predictions (re-indexed to
        ``test_index``, NaN filled with 0) together with the updated ``df``.
        """
        if not scaler_target:
            return

        # Frame holding the predictions plus the series keys looked up in ``df``
        # by the predictions' own index.
        frame = pd.DataFrame(predictions, index=predictions.index)
        for key in ("product_id", "customer_id"):
            frame[key] = df[key]
        frame.columns = ["target", "product_id", "customer_id"]

        rescaled = pd.Series(scaler_target.inverse_transform(frame), name="predictions")
        rescaled.index = test_index
        rescaled = rescaled.fillna(0)

        # The stored target goes back to its original scale as well.
        df["target"] = scaler_target.inverse_transform(df[["target", "product_id", "customer_id"]])

        return {"predictions": rescaled, "df": df}

# ReduceMemoryUsageStep
from typing import Dict, Optional, List, Tuple
import pandas as pd
import numpy as np
from scipy.stats import linregress



class CreateSerieIdStep(PipelineStep):
    """Add a ``serieID`` column identifying each (product_id, customer_id) series."""

    def execute(self, df: pd.DataFrame) -> Dict:
        """Number every (product_id, customer_id) pair, starting at 1.

        Ids are assigned in sorted (product_id, customer_id) order and stored
        as ``uint32``. The previous implementation concatenated the two ids as
        strings, which could give two different pairs the same id (1 + "23"
        and 12 + "3") and could wrap around silently in the ``uint32`` cast.
        Numbering the groups avoids both; the id values change but their
        relative order is the same whenever the old ids were unambiguous.

        Returns:
            ``{"df": df}`` with the new column added in place.
        """
        group_number = df.groupby(['product_id', 'customer_id'], sort=True).ngroup()
        # ngroup() is 0-based; shift so the first series gets id 1.
        df['serieID'] = (group_number + 1).astype('uint32')
        return {"df": df}
    


class DropMinSerieMonthStep(PipelineStep):
    """Drop the series that have too few distinct ``mes`` values."""

    def __init__(self, name: Optional[str] = None, months: int = 3):
        super().__init__(name)
        self.months = months

    def execute(self, df: pd.DataFrame) -> Dict:
        """Keep only the rows whose ``serieID`` has at least ``self.months`` distinct ``mes`` values.

        Prints how many series were removed and returns ``{"df": filtered}``.
        """
        # NOTE(review): this counts distinct values of 'mes'; if 'mes' is a
        # calendar month number the count cannot exceed 12 -- confirm that the
        # series length is not meant to be measured on 'fecha' instead.
        months_per_series = df.groupby('serieID')['mes'].nunique()
        long_enough = months_per_series >= self.months
        kept_ids = months_per_series[long_enough].index

        filtered = df[df['serieID'].isin(kept_ids)]
        print(f"Number of series dropped : {len(months_per_series) - len(kept_ids)}")

        return {"df": filtered}
    

class FilterProductsIDStep(PipelineStep):
    """Restrict one or more DataFrame artifacts to the products listed in a file."""

    def __init__(self, product_file = "product_id_apredecir201912.txt", dfs=None, name: Optional[str] = None):
        """
        Args:
            product_file: Tab-separated file with a ``product_id`` column.
            dfs: Artifact names of the DataFrames to filter; defaults to ``["df"]``.
            name: Optional step name forwarded to the base class.
        """
        super().__init__(name)
        self.file = product_file
        # A fresh list per instance instead of a shared mutable default.
        self.dfs = dfs if dfs is not None else ["df"]

    def execute(self, pipeline: Pipeline) -> Dict[str, pd.DataFrame]:
        """Filter every configured artifact down to the product ids in ``self.file``.

        Returns:
            A dict mapping each artifact name to its filtered DataFrame.
        """
        if not self.dfs:
            return {}
        # The product list does not depend on the artifact: read it once.
        product_ids = pd.read_csv(self.file, sep="\t")["product_id"].tolist()
        converted_dfs = {}
        for df_key in self.dfs:
            df = pipeline.get_artifact(df_key)
            df = df[df["product_id"].isin(product_ids)]
            converted_dfs[df_key] = df
            print(f"Filtered DataFrame {df_key} shape: {df.shape}")
        return converted_dfs
    

class FilterProductForTestingStep(PipelineStep):
    """Shrink the data to a limited number of products, for quick test runs."""

    def __init__(self, total_products_ids: int = 100, name: Optional[str] = None, random=True):
        """
        Args:
            total_products_ids: Maximum number of distinct products to keep.
            name: Optional step name forwarded to the base class.
            random: Sample the products at random (default) instead of taking
                the first ones in order of appearance.
        """
        super().__init__(name)
        self.total_products_ids = total_products_ids
        self.random = random

    def execute(self, df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
        """Keep the rows of at most ``total_products_ids`` products.

        The frame is returned unchanged when it already has that many products
        or fewer. Random sampling uses NumPy's global random state.

        Returns:
            ``{"df": filtered}``.
        """
        unique_products = df['product_id'].unique()
        if len(unique_products) > self.total_products_ids:
            if self.random:
                products = np.random.choice(unique_products, size=self.total_products_ids, replace=False)
            else:
                products = unique_products[:self.total_products_ids]
            df = df[df['product_id'].isin(products)]
        print(f"Filtered DataFrame shape: {df.shape}")
        return {"df": df}
    

class CastDataTypesStep(PipelineStep):
    """Cast named columns to the dtypes given in a ``{column: dtype}`` mapping."""

    def __init__(self, dtypes: Dict[str, str], name: Optional[str] = None):
        super().__init__(name)
        self.dtypes = dtypes

    def execute(self, df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
        """Cast each configured column in place and print the frame summary.

        Returns:
            ``{"df": df}``.
        """
        for col, dtype in self.dtypes.items():
            df[col] = df[col].astype(dtype)
        # info() prints the summary itself and returns None, so wrapping it in
        # print() only added a stray "None" line.
        df.info()
        return {"df": df}
    

class ReduceMemoryUsageStep(PipelineStep):
    """Downcast numeric columns to the smallest candidate dtype that fits their range."""

    def execute(self, df):
        """Downcast float and integer columns in place and print the memory saved.

        Float columns try float16 then float32; integer columns try int8,
        int16 then int32. A dtype is chosen when the column's min and max lie
        strictly inside that dtype's limits; otherwise the column is left as is.

        Returns:
            ``{"df": df}``.
        """
        # NOTE(review): only the value range is checked, so the float16 and
        # float32 casts can lose precision -- confirm that is acceptable.
        initial_mem_usage = df.memory_usage().sum() / 1024**2
        float_candidates = (np.float16, np.float32)
        int_candidates = (np.int8, np.int16, np.int32)

        for name in df.columns:
            if not pd.api.types.is_numeric_dtype(df[name]):
                continue
            lowest = df[name].min()
            highest = df[name].max()
            if pd.api.types.is_float_dtype(df[name]):
                limits = [(kind, np.finfo(kind)) for kind in float_candidates]
            elif pd.api.types.is_integer_dtype(df[name]):
                limits = [(kind, np.iinfo(kind)) for kind in int_candidates]
            else:
                continue
            for kind, bounds in limits:
                if lowest > bounds.min and highest < bounds.max:
                    df[name] = df[name].astype(kind)
                    break

        final_mem_usage = df.memory_usage().sum() / 1024**2
        saved_pct = 100 * (initial_mem_usage - final_mem_usage) / initial_mem_usage
        print('--- Memory usage before: {:.2f} MB'.format(initial_mem_usage))
        print('--- Memory usage after: {:.2f} MB'.format(final_mem_usage))
        print('--- Decreased memory usage by {:.1f}%\n'.format(saved_pct))
        return {"df": df}

        
class ChangeDataTypesStep(PipelineStep):
    """Cast every column of a given dtype to another dtype.

    ``dtypes`` maps a source dtype (as accepted by ``select_dtypes``) to the
    dtype its columns should be converted to.
    """

    def __init__(self, dtypes: Dict[str, str], name: Optional[str] = None):
        super().__init__(name)
        self.dtypes = dtypes

    def execute(self, df) -> Dict[str, pd.DataFrame]:
        """Convert the matching columns in place and print the frame summary.

        Returns:
            ``{"df": df}``.
        """
        for original_dtype, dtype in self.dtypes.items():
            for col in df.select_dtypes(include=[original_dtype]).columns:
                df[col] = df[col].astype(dtype)
        # info() prints the summary itself and returns None, so wrapping it in
        # print() only added a stray "None" line.
        df.info()
        return {"df": df}
    

class FilterFirstDateStep(PipelineStep):
    """Drop every row dated before ``first_date``."""

    def __init__(self, first_date: str, name: Optional[str] = None):
        super().__init__(name)
        self.first_date = first_date

    def execute(self, df) -> Dict[str, pd.DataFrame]:
        """Keep the rows whose ``fecha`` is on or after ``self.first_date``.

        Returns:
            ``{"df": filtered}``.
        """
        df = df[df["fecha"] >= self.first_date]
        print(f"Filtered DataFrame shape: {df.shape}")
        return {"df": df}
    

class FeatureEngineeringLagStep(PipelineStep):
    """Add lagged copies of the given columns, computed within each series."""

    def __init__(self, lags: List[int], columns: List, name: Optional[str] = None):
        """
        Args:
            lags: Number of rows to shift by, one feature per value.
            columns: Columns to lag.
            name: Optional step name forwarded to the base class.
        """
        super().__init__(name)
        self.lags = lags
        self.columns = columns
        # The former ``self.all = all`` stored the builtin ``all`` function as
        # an attribute (there is no ``all`` parameter); it has been removed.

    def execute(self, df: pd.DataFrame) -> dict:
        """Create ``'<column>_lag_<lag>'`` for every column/lag combination.

        Rows are sorted by series and date first so the shift follows time
        order inside each (product_id, customer_id) group.

        Returns:
            ``{"df": sorted frame with the lag columns}``.
        """
        df = df.sort_values(by=['product_id', 'customer_id', 'fecha'])

        # Vectorised lags: shift inside each group.
        grouped = df.groupby(['product_id', 'customer_id'])
        for column in self.columns:
            for lag in self.lags:
                df[f"{column}_lag_{lag}"] = grouped[column].shift(lag)
        return {"df": df}
    

class RollingMeanFeatureStep(PipelineStep):
    """Add a per-series rolling mean for each configured column."""

    def __init__(self, window: int, columns: List[str], name: Optional[str] = None):
        super().__init__(name)
        self.window = window
        self.columns = columns

    def execute(self, df: pd.DataFrame) -> Dict:
        """Create ``'<column>_rolling_<window>'`` within each (product_id, customer_id) series."""
        series_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(by=series_keys + ['fecha'])
        per_series = ordered.groupby(series_keys)

        def windowed_mean(values):
            return values.rolling(self.window, min_periods=1).mean()

        for name in self.columns:
            ordered[f'{name}_rolling_{self.window}'] = per_series[name].transform(windowed_mean)
        return {"df": ordered}
    

class RollingStdFeatureStep(PipelineStep):
    """Add a per-series rolling standard deviation for each configured column."""

    def __init__(self, window: int, columns: List[str], name: Optional[str] = None):
        super().__init__(name)
        self.window = window
        self.columns = columns

    def execute(self, df: pd.DataFrame) -> Dict:
        """Create ``'<column>_rolling_std_<window>'`` within each (product_id, customer_id) series."""
        series_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(by=series_keys + ['fecha'])
        per_series = ordered.groupby(series_keys)

        def windowed_std(values):
            return values.rolling(self.window, min_periods=1).std()

        for name in self.columns:
            ordered[f'{name}_rolling_std_{self.window}'] = per_series[name].transform(windowed_std)
        return {"df": ordered}


class RollingSkewFeatureStep(PipelineStep):
    """Add a per-series rolling skewness for each configured column."""

    def __init__(self, window: int, columns: List[str], name: Optional[str] = None):
        super().__init__(name)
        self.window = window
        self.columns = columns

    def execute(self, df: pd.DataFrame) -> Dict:
        """Create ``'<column>_rolling_skew_<window>'`` within each (product_id, customer_id) series."""
        series_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(by=series_keys + ['fecha'])
        per_series = ordered.groupby(series_keys)

        def windowed_skew(values):
            return values.rolling(self.window, min_periods=1).skew()

        for name in self.columns:
            ordered[f'{name}_rolling_skew_{self.window}'] = per_series[name].transform(windowed_skew)
        return {"df": ordered}
    

class RollingKurtosisFeatureStep(PipelineStep):
    """Add a per-series rolling kurtosis for each configured column."""

    def __init__(self, window: int, columns: List[str], name: Optional[str] = None):
        super().__init__(name)
        self.window = window
        self.columns = columns

    def execute(self, df: pd.DataFrame) -> Dict:
        """Create ``'<column>_rolling_kurtosis_<window>'`` within each series.

        Returns:
            ``{"df": sorted frame with the new columns}``.
        """
        df = df.sort_values(by=['product_id', 'customer_id', 'fecha'])
        grouped = df.groupby(['product_id', 'customer_id'])
        for col in self.columns:
            # Rolling objects expose kurt(); there is no Rolling.kurtosis(), so
            # the previous call raised AttributeError.
            df[f'{col}_rolling_kurtosis_{self.window}'] = grouped[col].transform(
                lambda x: x.rolling(self.window, min_periods=1).kurt()
            )
        return {"df": df}
    

class RollingZscoreFeatureStep(PipelineStep):
    """Add a per-series rolling z-score for each configured column."""

    def __init__(self, window: int, columns: List[str], name: Optional[str] = None):
        super().__init__(name)
        self.window = window
        self.columns = columns

    def execute(self, df: pd.DataFrame) -> Dict:
        """Create ``'<column>_rolling_zscore_<window>'`` as (x - rolling mean) / (rolling std + 1e-6)."""
        series_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(by=series_keys + ['fecha'])
        per_series = ordered.groupby(series_keys)

        def windowed_mean(values):
            return values.rolling(self.window, min_periods=1).mean()

        def windowed_std(values):
            return values.rolling(self.window, min_periods=1).std()

        for name in self.columns:
            centre = per_series[name].transform(windowed_mean)
            spread = per_series[name].transform(windowed_std)
            # The small constant keeps the denominator away from zero.
            ordered[f'{name}_rolling_zscore_{self.window}'] = (ordered[name] - centre) / (spread + 1e-6)
        return {"df": ordered}
    

class RollingAutocorrelationFeatureStep(PipelineStep):
    """Add per-series rolling autocorrelations for each configured column and lag."""

    def __init__(self, window: int, columns: List[str], lags: List[int], name: Optional[str] = None):
        super().__init__(name)
        self.window = window
        self.columns = columns
        self.lags = lags

    def execute(self, df: pd.DataFrame) -> Dict:
        """Create ``'<column>_rolling_autocorr_<lag>_<window>'`` within each series."""
        series_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(by=series_keys + ['fecha'])
        per_series = ordered.groupby(series_keys)

        for name in self.columns:
            for lag in self.lags:

                def windowed_autocorr(values):
                    # raw=False hands each window over as a Series, which
                    # provides autocorr().
                    return values.rolling(self.window, min_periods=1).apply(
                        lambda chunk: chunk.autocorr(lag=lag), raw=False
                    )

                feature = f'{name}_rolling_autocorr_{lag}_{self.window}'
                ordered[feature] = per_series[name].transform(windowed_autocorr)
        return {"df": ordered}


class RollingMaxFeatureStep(PipelineStep):
    """Add a per-series rolling maximum for each configured column."""

    def __init__(self, window: int, columns: List[str], name: Optional[str] = None):
        super().__init__(name)
        self.window = window
        self.columns = columns

    def execute(self, df: pd.DataFrame) -> Dict:
        """Create ``'<column>_rolling_max_<window>'`` within each (product_id, customer_id) series."""
        series_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(by=series_keys + ['fecha'])
        per_series = ordered.groupby(series_keys)

        def windowed_max(values):
            return values.rolling(self.window, min_periods=1).max()

        for name in self.columns:
            ordered[f'{name}_rolling_max_{self.window}'] = per_series[name].transform(windowed_max)
        return {"df": ordered}
    

class RollingMinFeatureStep(PipelineStep):
    """Add a per-series rolling minimum for each configured column."""

    def __init__(self, window: int, columns: List[str], name: Optional[str] = None):
        super().__init__(name)
        self.window = window
        self.columns = columns

    def execute(self, df: pd.DataFrame) -> Dict:
        """Create ``'<column>_rolling_min_<window>'`` within each (product_id, customer_id) series."""
        series_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(by=series_keys + ['fecha'])
        per_series = ordered.groupby(series_keys)

        def windowed_min(values):
            return values.rolling(self.window, min_periods=1).min()

        for name in self.columns:
            ordered[f'{name}_rolling_min_{self.window}'] = per_series[name].transform(windowed_min)
        return {"df": ordered}


# NOTE(review): a class with this same name and behaviour is already defined
# earlier in this file; this later definition is the one that stays bound to
# the name. Consider removing one of the two.
class RollingStdFeatureStep(PipelineStep):
    """Add a per-series rolling standard deviation for each configured column."""

    def __init__(self, window: int, columns: List[str], name: Optional[str] = None):
        super().__init__(name)
        self.window = window
        self.columns = columns

    def execute(self, df: pd.DataFrame) -> Dict:
        """Create ``'<column>_rolling_std_<window>'`` within each (product_id, customer_id) series."""
        group_cols = ['product_id', 'customer_id']
        sorted_df = df.sort_values(by=group_cols + ['fecha'])
        by_series = sorted_df.groupby(group_cols)

        def rolling_std(values):
            return values.rolling(self.window, min_periods=1).std()

        for column in self.columns:
            sorted_df[f'{column}_rolling_std_{self.window}'] = by_series[column].transform(rolling_std)
        return {"df": sorted_df}
    

class ExponentialMovingAverageStep(PipelineStep):
    """Add a per-series exponential moving average for each configured column."""

    def __init__(self, span: int, columns: List[str], name: Optional[str] = None):
        super().__init__(name)
        self.span = span
        self.columns = columns

    def execute(self, df: pd.DataFrame) -> Dict:
        """Create ``'<column>_ema_<span>'`` within each (product_id, customer_id) series."""
        series_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(by=series_keys + ['fecha'])
        per_series = ordered.groupby(series_keys)

        def smoothed(values):
            return values.ewm(span=self.span, adjust=False).mean()

        for name in self.columns:
            ordered[f'{name}_ema_{self.span}'] = per_series[name].transform(smoothed)
        return {"df": ordered}
    

class TrendFeatureStep(PipelineStep):
    """Add a per-series rolling linear-trend slope for each configured column."""

    def __init__(self, window: int, columns: List[str], name: Optional[str] = None):
        super().__init__(name)
        self.window = window
        self.columns = columns

    def execute(self, df: pd.DataFrame) -> Dict:
        """Create ``'<column>_trend_<window>'``: the slope of a line fitted to each window."""
        series_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(by=series_keys + ['fecha'])

        def window_slope(chunk):
            # Regress the window's values on their position 0..len-1; the
            # first element of linregress' result is the slope.
            positions = np.arange(len(chunk))
            return linregress(positions, chunk)[0]

        def rolling_slope(values):
            return values.rolling(self.window).apply(window_slope, raw=False)

        per_series = ordered.groupby(series_keys)
        for name in self.columns:
            ordered[f'{name}_trend_{self.window}'] = per_series[name].transform(rolling_slope)
        return {"df": ordered}
    

class DiffFeatureStep(PipelineStep):
    """Add a per-series difference feature for each configured column."""

    def __init__(self, periods: int, columns: List[str], name: Optional[str] = None):
        super().__init__(name)
        self.periods = periods
        self.columns = columns

    def execute(self, df: pd.DataFrame) -> Dict:
        """Create ``'<column>_diff_<periods>'`` within each (product_id, customer_id) series."""
        series_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(by=series_keys + ['fecha'])
        per_series = ordered.groupby(series_keys)
        for name in self.columns:
            feature = f'{name}_diff_{self.periods}'
            ordered[feature] = per_series[name].diff(self.periods)
        return {"df": ordered}
    

class RollingMedianFeatureStep(PipelineStep):
    """Add a per-series rolling median for each configured column."""

    def __init__(self, window: int, columns: List[str], name: Optional[str] = None):
        super().__init__(name)
        self.window = window
        self.columns = columns

    def execute(self, df: pd.DataFrame) -> Dict:
        """Create ``'<column>_rolling_median_<window>'`` within each (product_id, customer_id) series."""
        series_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(by=series_keys + ['fecha'])
        per_series = ordered.groupby(series_keys)

        def windowed_median(values):
            return values.rolling(self.window, min_periods=1).median()

        for name in self.columns:
            ordered[f'{name}_rolling_median_{self.window}'] = per_series[name].transform(windowed_median)
        return {"df": ordered}
    

class CreateTotalCategoryStep(PipelineStep):
    """Add the total of a quantity column per (fecha, category) to every row."""

    def __init__(self, name: Optional[str] = None, cat: str = "cat1", tn: str = "tn"):
        super().__init__(name)
        self.cat = cat
        self.tn = tn

    def execute(self, df: pd.DataFrame) -> Dict:
        """Create ``'<tn>_<cat>_vendidas'``: the sum of ``tn`` over each (fecha, cat) group."""
        group_cols = ['fecha', self.cat]
        ordered = df.sort_values(group_cols)
        totals = ordered.groupby(group_cols)[self.tn].transform('sum')
        ordered[f"{self.tn}_{self.cat}_vendidas"] = totals
        return {"df": ordered}


class CreateWeightByCustomerStep(PipelineStep):
    """Add each customer's share of the total ``tn`` sold on every date."""

    def __init__(self, name: Optional[str] = None):
        super().__init__(name)

    def execute(self, df: pd.DataFrame) -> Dict:
        """Create ``tn_customer_vendidas``, ``tn_total_vendidas`` and ``customer_weight``.

        ``customer_weight`` is the customer's ``tn`` for the date divided by
        the total ``tn`` of that date.
        """
        # Sorting only gives a stable, readable row order.
        ordered = df.sort_values(['fecha', 'customer_id'])

        by_customer = ordered.groupby(['fecha', 'customer_id'])['tn'].transform('sum')
        ordered['tn_customer_vendidas'] = by_customer

        by_date = ordered.groupby('fecha')['tn'].transform('sum')
        ordered['tn_total_vendidas'] = by_date

        ordered['customer_weight'] = ordered['tn_customer_vendidas'] / ordered['tn_total_vendidas']
        return {"df": ordered}
    

class CreateWeightByProductStep(PipelineStep):
    """Add each product's share of the total ``tn`` sold on every date."""

    def __init__(self, name: Optional[str] = None):
        super().__init__(name)

    def execute(self, df: pd.DataFrame) -> Dict:
        """Create ``tn_product_vendidas``, ``tn_total_vendidas`` and ``product_weight``.

        ``product_weight`` is the product's ``tn`` for the date divided by the
        total ``tn`` of that date.
        """
        # Sorting only gives a stable, readable row order.
        ordered = df.sort_values(['fecha', 'product_id'])

        by_product = ordered.groupby(['fecha', 'product_id'])['tn'].transform('sum')
        ordered['tn_product_vendidas'] = by_product

        by_date = ordered.groupby('fecha')['tn'].transform('sum')
        ordered['tn_total_vendidas'] = by_date

        ordered['product_weight'] = ordered['tn_product_vendidas'] / ordered['tn_total_vendidas']
        return {"df": ordered}
    

class FeatureDivInteractionStep(PipelineStep):
    """Add ratio features for pairs of columns."""

    def __init__(self, columns: List[Tuple[str, str]], name: Optional[str] = None):
        """
        Args:
            columns: ``(numerator, denominator)`` column-name pairs.
            name: Optional step name forwarded to the base class.
        """
        super().__init__(name)
        self.columns = columns

    def execute(self, df) -> Dict[str, pd.DataFrame]:
        """Create ``'<col1>_div_<col2>'`` for every configured pair, in place.

        Returns:
            ``{"df": df}``.
        """
        for col1, col2 in self.columns:
            # The small constant avoids dividing by an exact zero.
            df[f"{col1}_div_{col2}"] = df[col1] / (df[col2] + 1e-6)
        return {"df": df}


class FeatureProdInteractionStep(PipelineStep):
    """Add product features for pairs of columns."""

    def __init__(self, columns: List[Tuple[str, str]], name: Optional[str] = None):
        """
        Args:
            columns: Pairs of column names to multiply together.
            name: Optional step name forwarded to the base class.
        """
        super().__init__(name)
        self.columns = columns

    def execute(self, df) -> Dict[str, pd.DataFrame]:
        """Create ``'<col1>_prod_<col2>'`` for every configured pair, in place.

        Returns:
            ``{"df": df}``.
        """
        for col1, col2 in self.columns:
            df[f"{col1}_prod_{col2}"] = df[col1] * df[col2]
        return {"df": df}
    

class DateRelatedFeaturesStep(PipelineStep):
    """Derive calendar features from the ``fecha`` column."""

    def __init__(self, name: Optional[str] = None):
        super().__init__(name)

    def execute(self, df) -> Dict[str, pd.DataFrame]:
        """Add ``year`` and ``mes`` (month number) columns in place.

        Returns:
            ``{"df": df}``.
        """
        fecha = df["fecha"].dt
        df["year"] = fecha.year
        df["mes"] = fecha.month
        return {"df": df}

# PrepareXYStep
from typing import Optional, Dict
import pandas as pd

class SplitDataFrameStep(PipelineStep):
    """Compute the train and test row indices of a DataFrame artifact by date."""

    def __init__(
            self,
            test_date="2019-12",
            df="df",
            gap=0,
            name: Optional[str] = None
        ):
        """
        Args:
            test_date: Value of ``fecha`` that defines the test rows.
            df: Name of the DataFrame artifact to split.
            gap: Number of months subtracted from the last training date to
                obtain the training cut-off.
            name: Optional step name forwarded to the base class.
        """
        super().__init__(name)
        self.test_date = test_date
        self.df = df
        self.gap = gap

    def execute(self, pipeline) -> Dict[str, pd.Index]:
        """Return the indices of the train and test rows.

        Test rows are those with ``fecha == test_date``. Train rows are those
        dated strictly before the cut-off, i.e. the last date preceding
        ``test_date`` minus ``gap`` months.

        Returns:
            ``{"train_index": ..., "test_index": ...}``.
        """
        df = pipeline.get_artifact(self.df)
        fecha = df["fecha"]
        test_df = df[fecha == self.test_date]
        train_df = df[fecha < self.test_date]

        last_train_date = train_df["fecha"].max()
        if isinstance(last_train_date, pd.Period):
            last_train_date = last_train_date.to_timestamp()
        gap_date = pd.to_datetime(last_train_date) - pd.DateOffset(months=self.gap)
        # Compare like with like: convert the cut-off back to a Period when
        # ``fecha`` is a period column. The dtype isinstance check replaces
        # the deprecated pd.api.types.is_period_dtype helper.
        if isinstance(fecha.dtype, pd.PeriodDtype):
            gap_date = pd.Period(gap_date, freq=fecha.dt.freq)
        # NOTE(review): the comparison is strict, so even with gap=0 the last
        # training date itself is excluded -- confirm this is intended.
        train_df = train_df[train_df["fecha"] < gap_date]
        return {
            "train_index": train_df.index,
            "test_index": test_df.index
        }


class PrepareXYStep(PipelineStep):
    """Split a DataFrame into feature/target matrices for the train and test rows."""

    def execute(self, df, train_index, test_index) -> Dict[str, object]:
        """Build ``X``/``y`` for the given row indices.

        Targets are every column whose name contains ``"target"``; features
        are every other column except ``"fecha"``.

        Returns:
            A dict with ``features``, ``targets`` (column-name lists) and the
            ``X_train``, ``y_train``, ``X_test``, ``y_test`` frames.
        """
        columns = df.columns
        features = [col for col in columns if col != "fecha" and "target" not in col]
        targets = [col for col in columns if "target" in col]
        # Select rows and columns in a single .loc call.
        return {
            "features": features,
            "targets": targets,
            "X_train": df.loc[train_index, features],
            "y_train": df.loc[train_index, targets],
            "X_test": df.loc[test_index, features],
            "y_test": df.loc[test_index, targets],
        }


class CreateTargetColumStep(PipelineStep):
    """Create the ``target`` column: the value of ``target_col`` two rows ahead in each series."""

    def __init__(self, name: Optional[str] = None, target_col: str = 'tn'):
        super().__init__(name)
        self.target_col = target_col

    def execute(self, df: pd.DataFrame) -> Dict:
        """Sort by series and date, then set ``target`` to ``target_col`` shifted by -2 per series."""
        series_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(series_keys + ['fecha'])
        future_value = ordered.groupby(series_keys)[self.target_col].shift(-2)
        ordered['target'] = future_value
        return {"df": ordered, "target_col": self.target_col}
    

class CreateMultiDiffTargetColumStep(PipelineStep):
    """Create two step-by-step difference targets, ``target_1`` and ``target_2``."""

    def __init__(self, name: Optional[str] = None, target_col: str = 'tn'):
        super().__init__(name)
        self.target_col = target_col

    def execute(self, df: pd.DataFrame) -> Dict:
        """Add ``target_1`` (next value minus current) and ``target_2`` (value two ahead minus next).

        Both shifts are taken within each (product_id, customer_id) series.
        The result also carries ``needs_integration=True``.
        """
        series_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(series_keys + ['fecha'])
        per_series = ordered.groupby(series_keys)[self.target_col]

        one_ahead = per_series.shift(-1)
        two_ahead = per_series.shift(-2)
        ordered['target_1'] = one_ahead - ordered[self.target_col]
        ordered['target_2'] = two_ahead - one_ahead

        result = {"df": ordered, "target_col": self.target_col}
        result["needs_integration"] = True
        return result


class CreateTargetColumDiffStep(PipelineStep):
    """Create ``target`` as the change in ``target_col`` two rows ahead in each series."""

    def __init__(self, name: Optional[str] = None, target_col: str = 'tn'):
        super().__init__(name)
        self.target_col = target_col

    def execute(self, df: pd.DataFrame) -> Dict:
        """Replace any existing ``target`` with (value two rows ahead - current value).

        The existing ``target`` column, if any, is dropped from the incoming
        frame in place. The result also carries ``needs_integration=True``.
        """
        df.drop(columns=["target"], inplace=True, errors='ignore')

        series_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(series_keys + ['fecha'])
        two_ahead = ordered.groupby(series_keys)[self.target_col].shift(-2)
        ordered['target'] = two_ahead - ordered[self.target_col]

        result = {"df": ordered, "target_col": self.target_col}
        result["needs_integration"] = True
        return result
    

class PredictStep(PipelineStep):
    """Pipeline step that runs a fitted model on the test rows."""

    def execute(self, df, test_index, model, features) -> Dict:
        """Predict for the rows of ``df`` labelled by ``test_index``.

        Args:
            df: Frame holding the feature columns.
            test_index: Row labels to predict for.
            model: Object exposing ``predict(X)``.
            features: Column labels passed to the model.

        Returns:
            ``{"predictions": ...}`` with whatever ``model.predict`` returns.
        """
        # Select rows and columns in one .loc call so no all-column
        # intermediate frame is materialised.
        X_predict = df.loc[test_index, features]
        return {"predictions": model.predict(X_predict)}


class IntegratePredictionsStep(PipelineStep):
    """Pipeline step that converts difference predictions back to levels."""

    def execute(self, df, predictions, test_index, target_col, needs_integration=False) -> Dict:
        """Add the current ``target_col`` level to predicted/observed deltas.

        When ``needs_integration`` is falsy only ``y_test`` (the ``target``
        column of the test rows) is returned. Otherwise both the integrated
        ``predictions`` and the integrated ``y_test`` are returned.
        """
        if not needs_integration:
            # Targets are already levels: just expose the observed values.
            return {"y_test": df.loc[test_index, ["target"]]}

        base_level = df.loc[test_index, target_col]

        # 1-D predictions are a single delta; otherwise the per-column deltas
        # are added up row by row.
        if predictions.ndim == 1:
            delta = pd.Series(predictions, index=test_index, name='predictions')
        else:
            delta = predictions.sum(axis=1)
        integrated = pd.Series(delta + base_level, index=test_index, name='predictions')

        # Every column whose name contains "target" is summed as a delta.
        # NOTE(review): sum(axis=1) skips NaN, so rows with missing targets
        # still get a numeric y_test - confirm this is intended.
        delta_columns = [name for name in df.columns if 'target' in name]
        observed = df.loc[test_index, delta_columns].sum(axis=1) + base_level
        y_test = pd.DataFrame(observed, index=test_index, columns=["target"])

        return {
            "predictions": integrated,
            "y_test": y_test
        }
    

## legacy code
class IntegratePredictionsStepOld(PipelineStep):
    """Legacy step that adds the base level back onto difference targets."""

    def execute(self, pipeline, predict_set, predictions, target_col, test) -> Dict:
        """Integrate predictions and observed targets for the test frame.

        The ``target_col`` values of the artifact named ``predict_set`` are
        added to ``predictions["predictions"]``, and ``test[target_col]`` is
        added to ``test["target"]``. Both inputs are modified in place and
        returned.
        """
        base_frame = pipeline.get_artifact(predict_set)
        integrated_predictions = predictions["predictions"] + base_frame[target_col]
        predictions["predictions"] = integrated_predictions
        integrated_target = test["target"] + test[target_col]
        test["target"] = integrated_target
        return {
            "predictions": predictions,
            "test": test
        }



class SplitDataFrameStepOld(PipelineStep):
    """Legacy step that splits ``df`` by the last distinct ``fecha`` values."""

    def __init__(self, name: Optional[str] = None):
        super().__init__(name)

    def execute(self, df) -> Dict:
        """Split ``df`` into train / eval / test / kaggle frames.

        Uses the sorted distinct ``fecha`` values: the latest one feeds
        ``kaggle_pred``, the third from the end feeds ``test``, the fourth
        from the end feeds ``eval_data`` and everything strictly earlier is
        ``train``. The second-to-last date is not used by any split (the
        original author's note: a gap is needed because the forecast is for
        month + 2).

        Raises:
            IndexError: if ``df`` has fewer than four distinct ``fecha`` values.
        """
        fecha = df["fecha"]
        dates = sorted(fecha.unique())
        kaggle_date, test_date, eval_date = dates[-1], dates[-3], dates[-4]
        return {
            "train": df[fecha < eval_date],
            "eval_data": df[fecha == eval_date],
            "test": df[fecha == test_date],
            "kaggle_pred": df[fecha == kaggle_date],
        }
    

class PrepareXYStepOld(PipelineStep):
    """Legacy step that builds the X / y matrices from pre-split frames."""

    def __init__(self, name: Optional[str] = None):
        super().__init__(name)

    def execute(self, train, eval_data, test, kaggle_pred) -> Dict:
        """Return feature/target matrices for every training stage.

        Features are all columns of ``train`` except ``fecha`` and
        ``target``. ``X_train``/``y_train`` stack train + eval, and
        ``X_train_final``/``y_train_final`` stack train + eval + test.
        """
        target = 'target'
        features = [col for col in train.columns if col not in ('fecha', target)]

        # Slice each frame once and reuse the pieces in the concatenations.
        X_train_alone, y_train_alone = train[features], train[target]
        X_eval, y_eval = eval_data[features], eval_data[target]
        X_test, y_test = test[features], test[target]

        return {
            "X_train": pd.concat([X_train_alone, X_eval]),
            "y_train": pd.concat([y_train_alone, y_eval]),
            "X_train_alone": X_train_alone,
            "y_train_alone": y_train_alone,
            "X_eval": X_eval,
            "y_eval": y_eval,
            "X_test": X_test,
            "y_test": y_test,
            "X_train_final": pd.concat([X_train_alone, X_eval, X_test]),
            "y_train_final": pd.concat([y_train_alone, y_eval, y_test]),
            "X_kaggle": kaggle_pred[features]
        }
        

# TrainModelStep
import numpy as np
import pandas as pd
import lightgbm as lgb
from typing import Dict, Optional
from sklearn.neural_network import MLPRegressor

# Baseline LightGBM parameters used as the default for the LGB* model wrappers.
LGB_DEFAULT_PARAMS = dict(
    objective="regression",
    boosting_type="gbdt",
    num_leaves=31,
    learning_rate=0.01,
    feature_fraction=0.7,
    bagging_fraction=0.7,
    bagging_freq=5,
    n_estimators=1500,
    verbose=-1,
)

class TotalErrorMetric:
    """Callable evaluation metric: absolute error of per-product totals.

    Predictions and labels are summed per ``product_id`` and the metric is
    sum(|labels - preds|) / sum(|labels|) over those per-product sums.
    """

    def __init__(self, df_eval):
        # Frame aligned row-by-row with the evaluation set; must contain
        # a "product_id" column.
        self.df_eval = df_eval

    def __call__(self, preds, train_data):
        """Return ``('total_error', value, False)``.

        ``train_data`` only needs a ``get_label()`` method. The trailing
        ``False`` is the third element of the returned tuple; presumably it is
        the "higher is better" flag of the LightGBM ``feval`` protocol, i.e.
        lower values are better - confirm against the LightGBM docs.
        """
        frame = self.df_eval.copy()
        frame['preds'] = preds
        frame['labels'] = train_data.get_label()
        # Collapse rows into one total per product before measuring the error.
        totals = frame.groupby("product_id")[['labels', 'preds']].sum()
        abs_gap = (totals['labels'] - totals['preds']).abs().sum()
        scale = totals['labels'].abs().sum()
        return 'total_error', abs_gap / scale, False
    

class CustomMetricAutoML:
    """Callable validation metric based on per-product totals.

    Predictions and labels are summed per ``product_id`` and the error is
    sum(|labels - preds|) / sum(|labels|) over those per-product sums, the
    same formula ``TotalErrorMetric`` uses.
    """

    def __init__(self, df_eval):
        self.df_eval = df_eval

    def __call__(self, X_val, y_val, estimator, *args, **kwargs):
        """Return ``(error, {"total_error": error})`` for the validation set.

        ``X_val`` must contain a ``product_id`` column and ``estimator`` must
        expose ``predict``. Extra positional/keyword arguments are ignored.
        """
        df_temp = X_val.copy()
        df_temp['preds'] = estimator.predict(X_val)
        df_temp['labels'] = y_val

        por_producto = df_temp.groupby("product_id").agg({'labels': 'sum', 'preds': 'sum'})

        abs_gap = np.sum(np.abs(por_producto['labels'] - por_producto['preds']))
        # Normalise by the absolute totals: with signed targets (e.g. deltas)
        # a plain sum can be negative or cancel out, flipping the error's sign.
        scale = np.sum(np.abs(por_producto['labels']))
        error = abs_gap / scale

        return error, {"total_error": error}

from sklearn.model_selection import KFold, cross_val_score
# import deepclone
from copy import deepcopy

class EnsambleKFoldWrapper:
    """Ensemble that trains one clone of ``model`` per K-fold training split.

    ``predict`` returns the element-wise mean of the clones' predictions.
    """

    def __init__(self, model, n_splits=5):
        self.model = model
        self.n_splits = n_splits
        self.models = []

    def fit(self, X, y, X_val, y_val):
        """Fit ``n_splits`` clones, each on the training part of one fold.

        The same ``X_val``/``y_val`` pair is forwarded to every clone; the
        held-out part of each fold is not used. Any models from a previous
        ``fit`` call are discarded so they cannot leak into the average.
        """
        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        fitted = []
        for train_index, _ in kf.split(X):
            # Deep-copy so every fold starts from the same unfitted template.
            model_clone = deepcopy(self.model)
            model_clone.fit(X.iloc[train_index], y.iloc[train_index], X_val, y_val)
            fitted.append(model_clone)
        # Replace (not extend) the ensemble only once all folds trained.
        self.models = fitted
        return self

    def predict(self, X):
        """Return the mean prediction of all fold models as a Series.

        Raises:
            ValueError: if no model has been trained.
        """
        if not self.models:
            raise ValueError("No models have been trained yet.")
        predictions = np.mean([model.predict(X) for model in self.models], axis=0)
        return pd.Series(predictions, index=X.index, name='predictions')


class XGBOOSTPipelineModel:
    """Single-target regression wrapper around ``xgboost.XGBRegressor``."""

    def __init__(self, params: Dict = None):
        self.params = params or {}
        self.model = None

    def set_params(self, **params):
        """Merge ``params`` into the stored constructor parameters."""
        self.params.update(params)

    def fit(self, X_train, y_train, X_eval, y_eval):
        """Train on rows with a non-null target and return ``self``.

        Rows whose target is NaN are dropped from both the training and the
        evaluation data; the evaluation set is omitted when nothing is left.

        Raises:
            ValueError: if ``y_train`` is a DataFrame with several columns.
        """
        from xgboost import XGBRegressor
        multi_target = isinstance(y_train, pd.DataFrame) and y_train.shape[1] > 1
        if multi_target:
            raise ValueError("y_train must be a Series for single-target regression.")
        self.model = XGBRegressor(**self.params, enable_categorical=True)

        y_eval = y_eval.dropna()
        eval_sets = [] if y_eval.empty else [(X_eval.loc[y_eval.index], y_eval)]

        y_train = y_train.dropna()
        self.model.fit(X_train.loc[y_train.index], y_train, eval_set=eval_sets, verbose=True)
        return self

    def predict(self, X):
        """Return predictions as a Series named ``predictions`` indexed like ``X``."""
        if self.model is None:
            raise ValueError("Model has not been trained yet.")
        raw = self.model.predict(X)
        return pd.Series(raw, index=X.index, name='predictions')


class LGBPipelineModel:
    """LightGBM model facade that picks single- or multi-target training."""

    def __init__(self, params: Dict = LGB_DEFAULT_PARAMS):
        # Copy the mapping: set_params() updates it in place, and the default
        # is the module-level LGB_DEFAULT_PARAMS dict, which would otherwise
        # be silently modified for every later model.
        self.params = dict(params or {})
        self.model = None

    def set_params(self, **params):
        """Merge ``params`` into this instance's LightGBM parameters."""
        self.params.update(params)

    def fit(self, X_train, y_train, X_eval, y_eval):
        """Train and return ``self``.

        A DataFrame ``y_train`` with more than one column trains one model
        per column; anything else trains a single model.
        """
        if isinstance(y_train, pd.DataFrame) and y_train.shape[1] > 1:
            self.model = LGBMultiTargetPipelineModel(self.params).fit(X_train, y_train, X_eval, y_eval)
        else:
            self.model = LGBPipelineSingleTargetModel(self.params).fit(X_train, y_train, X_eval, y_eval)
        return self

    def predict(self, X):
        """Delegate to the trained inner model.

        Raises:
            ValueError: if ``fit`` has not been called.
        """
        if self.model is None:
            raise ValueError("Model has not been trained yet.")
        return self.model.predict(X)

    def plot_importance(self, max_num_features=20):
        """Plot feature importance for the trained booster(s).

        Raises:
            ValueError: if ``fit`` has not been called.
        """
        if self.model is None:
            raise ValueError("Model has not been trained yet.")
        # The multi-target wrapper keeps its boosters in ``models`` and has
        # no ``model`` attribute, so plot each of them.
        boosters = getattr(self.model, "models", None)
        if boosters:
            for booster in boosters.values():
                lgb.plot_importance(booster, max_num_features=max_num_features)
            return
        lgb.plot_importance(self.model.model, max_num_features=max_num_features)

class LGBBase:
    """Shared helpers for the LightGBM model wrappers.

    Subclasses are expected to provide ``self.params``.
    """

    def set_params(self, **params):
        """Merge ``params`` into ``self.params`` in place."""
        self.params.update(params)

    def _make_datasets(self, X_train, y_train, X_eval, y_eval):
        """Build the training and (optional) evaluation ``lgb.Dataset``.

        Rows whose target is NaN are removed first. Returns
        ``(train_data, None)`` when no evaluation target is left.
        """
        # Keep only the training rows that actually have a target.
        y_train = y_train.dropna()
        X_train = X_train.loc[y_train.index]
        cat_features = [
            name for name in X_train.columns
            if X_train[name].dtype.name == 'category'
        ]
        train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_features)

        y_eval = y_eval.dropna()
        print(f"Validation set size: {len(y_eval)}")
        if y_eval.empty:
            # Nothing to validate against: skip the evaluation dataset.
            return train_data, None

        # Drop the local references to the training frames to free memory.
        del X_train, y_train
        X_eval = X_eval.loc[y_eval.index]
        print(f"X_eval first 5 rows:\n{X_eval.head()}")
        print(f"y_eval first 5 rows:\n{y_eval.head()}")
        eval_data = lgb.Dataset(
            X_eval,
            label=y_eval,
            reference=train_data,
            categorical_feature=cat_features,
        )
        return train_data, eval_data

    def _train_model(self, train_data, eval_data=None):
        """Run ``lgb.train`` with ``self.params`` and return the booster.

        When ``eval_data`` is given it is used as the validation set and
        scored with ``TotalErrorMetric`` built from ``eval_data.data``.
        """
        extra_kwargs = {}
        if eval_data is not None:
            extra_kwargs = {
                "valid_sets": [eval_data],
                "feval": TotalErrorMetric(eval_data.data)
            }
        return lgb.train(
            self.params,
            train_data,
            callbacks=[lgb.log_evaluation(100)],
            **extra_kwargs
        )
    

class LGBMultiTargetPipelineModel(LGBBase):
    """Trains one LightGBM model per column of a multi-column target."""

    def __init__(self, params: Dict = LGB_DEFAULT_PARAMS):
        # Copy so the inherited set_params() cannot mutate the caller's dict
        # or the shared module-level defaults.
        self.params = dict(params or {})
        self.models = {}

    def fit(self, X_train, y_train, X_eval, y_eval):
        """Fit one model per target column and return ``self``.

        Models from a previous ``fit`` are discarded, so ``predict`` never
        returns columns for targets that were not part of the latest fit.

        Raises:
            ValueError: if ``y_train`` is not a DataFrame.
        """
        if not isinstance(y_train, pd.DataFrame):
            raise ValueError("y_train must be a DataFrame for multi-target regression.")
        fitted = {}
        for target in y_train.columns:
            print(f"Training model for target: {target}")
            train_data, eval_data = self._make_datasets(X_train, y_train[target], X_eval, y_eval[target])
            fitted[target] = self._train_model(train_data, eval_data)
        self.models = fitted
        return self

    def predict(self, X):
        """Return a DataFrame with one prediction column per trained target.

        Raises:
            ValueError: if no model has been trained.
        """
        if not self.models:
            raise ValueError("No models have been trained yet.")
        predictions = {target: model.predict(X) for target, model in self.models.items()}
        return pd.DataFrame(predictions, index=X.index)


class LGBPipelineSingleTargetModel(LGBBase):
    """Single-target LightGBM regression wrapper."""

    def __init__(self, params: Dict = LGB_DEFAULT_PARAMS):
        # Copy so the inherited set_params() cannot mutate the caller's dict
        # or the shared module-level defaults.
        self.params = dict(params or {})
        self.model = None

    def fit(self, X_train, y_train, X_eval, y_eval):
        """Train the booster and return ``self``.

        Raises:
            ValueError: if ``y_train`` is a DataFrame with several columns.
        """
        if isinstance(y_train, pd.DataFrame) and y_train.shape[1] > 1:
            raise ValueError("y_train must be a Series for single-target regression.")
        train_data, eval_data = self._make_datasets(X_train, y_train, X_eval, y_eval)
        # Drop the local references to the raw frames before training to
        # free memory.
        del X_train, y_train, X_eval, y_eval
        self.model = self._train_model(train_data, eval_data)
        return self

    def predict(self, X):
        """Return predictions as a Series named ``predictions`` indexed like ``X``.

        Raises:
            ValueError: if ``fit`` has not been called.
        """
        if self.model is None:
            raise ValueError("Model has not been trained yet.")
        return pd.Series(self.model.predict(X), index=X.index, name='predictions')
    
    
class TrainModelStep(PipelineStep):
    """Pipeline step that instantiates, configures and fits a model."""

    def __init__(self, model_cls = LGBPipelineModel, name: Optional[str] = None, params=None, folds=0):
        """
        Args:
            model_cls: Zero-argument factory for an object exposing
                ``set_params(**kw)`` and ``fit(X, y, X_eval, y_eval)``.
            name: Optional step name forwarded to ``PipelineStep``.
            params: Model parameters; ``None`` or empty means "no overrides".
            folds: When greater than 1, train a K-fold ensemble with that
                many splits instead of a single model.
        """
        super().__init__(name)
        self.model_cls = model_cls
        # A private copy avoids sharing one dict between step instances.
        self.params = dict(params) if params else {}
        self.folds = folds  # Number of folds for cross-validation, if applicable

    def execute(self, X_test, y_test, X_train, y_train, params=None) -> Dict:
        """Fit the model and return ``{"model": fitted_model}``.

        ``X_test``/``y_test`` are passed to ``fit`` as the evaluation set.
        ``params`` overrides the parameters given at construction when it is
        non-empty.
        """
        params = params or self.params
        model = self.model_cls()
        model.set_params(**params)
        if self.folds > 1:
            model = EnsambleKFoldWrapper(model, n_splits=self.folds)
        model.fit(X_train, y_train, X_test, y_test)
        return {"model": model}
    

## LEGACY
class TrainModelLGBStep(PipelineStep):
    """Legacy step that trains a LightGBM booster from pipeline artifacts."""

    def __init__(self, params: Dict = LGB_DEFAULT_PARAMS, train_eval_sets = {}, name: Optional[str] = None):
        """
        Args:
            params: LightGBM parameters; a built-in fallback is used when empty.
            train_eval_sets: Maps the logical names ``X_train``, ``y_train``,
                ``X_eval``, ``y_eval`` and ``eval_data`` to artifact names.
                Missing entries default to an artifact of the same name.
            name: Optional step name forwarded to ``PipelineStep``.
        """
        super().__init__(name)
        if not params:
            params = {
                "objective": "regression",
                "boosting_type": "gbdt",
                "num_leaves": 31,
                "learning_rate": 0.05,
                "feature_fraction": 0.9,
                "bagging_fraction": 0.8,
                "bagging_freq": 5,
                "n_estimators": 1000,
                "verbose": -1
            }
        default_sets = {
            "X_train": "X_train",
            "y_train": "y_train",
            "X_eval": "X_eval",
            "y_eval": "y_eval",
            "eval_data": "eval_data",
        }
        # Copy so this step never aliases the shared default parameter dict.
        self.params = dict(params)
        # Overlay the caller's mapping on the defaults so a partial mapping
        # no longer raises KeyError in execute().
        self.train_eval_sets = {**default_sets, **(train_eval_sets or {})}

    def execute(self, pipeline: Pipeline, params=None) -> Dict:
        """Train the booster and return ``{"model": booster}``.

        ``params`` overrides the constructor parameters when non-empty. A
        ``weight`` column in ``X_train``/``X_eval``, when present, is passed
        as the sample weight of the corresponding dataset.
        """
        sets = self.train_eval_sets
        X_train = pipeline.get_artifact(sets["X_train"])
        y_train = pipeline.get_artifact(sets["y_train"])
        X_eval = pipeline.get_artifact(sets["X_eval"])
        y_eval = pipeline.get_artifact(sets["y_eval"])
        df_eval = pipeline.get_artifact(sets["eval_data"])

        cat_features = [col for col in X_train.columns if X_train[col].dtype.name == 'category']

        params = params or self.params
        weight = X_train['weight'] if 'weight' in X_train.columns else None
        weight_eval = X_eval['weight'] if 'weight' in X_eval.columns else None
        train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_features, weight=weight)
        eval_data = lgb.Dataset(X_eval, label=y_eval, reference=train_data, categorical_feature=cat_features, weight=weight_eval)
        custom_metric = CustomMetric(df_eval, product_id_col='product_id')
        callbacks = [
            lgb.log_evaluation(100),
        ]
        model = lgb.train(
            params,
            train_data,
            valid_sets=[eval_data],
            feval=custom_metric,
            callbacks=callbacks,
        )
        return {"model": model}
    

class MLPPipelineModel:
    """Single-target regression wrapper around sklearn's ``MLPRegressor``.

    Features are one-hot encoded with ``pd.get_dummies`` and missing values
    are replaced by 0 before they reach the network.
    """

    def __init__(self, params: Dict = None):
        default_params = {
            "hidden_layer_sizes": (256, 128, 64, 32),
            "activation": "relu",
            "solver": "adam",
            "alpha": 0.0001,
            "batch_size": "auto",
            "learning_rate": "adaptive",
            "learning_rate_init": 0.001,
            "max_iter": 500,
            "early_stopping": True,
            "random_state": 42,
            "verbose": True,
        }
        self.params = params or default_params
        self.model = None
        # Column layout seen during fit, after one-hot encoding.
        self.feature_columns = None

    def set_params(self, **params):
        """Merge ``params`` into the stored ``MLPRegressor`` parameters."""
        self.params.update(params)

    def _preprocess(self, X):
        """One-hot encode ``X`` and align it with the columns seen in ``fit``."""
        encoded = pd.get_dummies(X, drop_first=True)
        if self.feature_columns is not None:
            # Dummy columns absent from this batch are added as all-zero,
            # then the frame is restricted/reordered to the training layout.
            missing = [name for name in self.feature_columns if name not in encoded]
            for name in missing:
                encoded[name] = 0
            encoded = encoded[self.feature_columns]
        return encoded.fillna(0)

    def fit(self, X_train, y_train, X_eval=None, y_eval=None):
        """Train on rows with a non-null target and return ``self``.

        ``X_eval`` and ``y_eval`` are accepted for interface compatibility
        and are not used.

        Raises:
            ValueError: if ``y_train`` is a DataFrame with several columns.
        """
        multi_target = isinstance(y_train, pd.DataFrame) and y_train.shape[1] > 1
        if multi_target:
            raise ValueError("y_train debe ser una Serie para regresión single-target.")
        y_train = y_train.dropna()
        encoded = pd.get_dummies(X_train.loc[y_train.index], drop_first=True).fillna(0)
        self.feature_columns = encoded.columns.tolist()
        self.model = MLPRegressor(**self.params)
        self.model.fit(encoded, y_train)
        return self

    def predict(self, X):
        """Return predictions as a Series named ``predictions`` indexed like ``X``."""
        if self.model is None:
            raise ValueError("Model has not been entrenado aún.")
        raw = self.model.predict(self._preprocess(X))
        return pd.Series(raw, index=X.index, name='predictions')

# PredictStep
from typing import Optional, Dict
import pandas as pd

class SplitDataFrameStep(PipelineStep):
    """Pipeline step that computes train/test row indexes from ``fecha``."""

    def __init__(
            self,
            test_date="2019-12",
            df="df",
            gap=0,
            name: Optional[str] = None
        ):
        """
        Args:
            test_date: ``fecha`` value that selects the test rows.
            df: Name of the pipeline artifact holding the DataFrame.
            gap: Number of months subtracted from the last pre-test date to
                obtain the (exclusive) training cut-off.
            name: Optional step name forwarded to ``PipelineStep``.
        """
        super().__init__(name)
        self.test_date = test_date
        self.df = df
        self.gap = gap

    def execute(self, pipeline) -> Dict:
        """Return ``{"train_index": ..., "test_index": ...}``.

        Test rows are those whose ``fecha`` equals ``test_date``. Training
        rows are those strictly before the cut-off date.
        """
        df = pipeline.get_artifact(self.df)
        fecha = df["fecha"]
        test_df = df[fecha == self.test_date]
        train_df = df[fecha < self.test_date]

        last_train_date = train_df["fecha"].max()
        if isinstance(last_train_date, pd.Period):
            last_train_date = last_train_date.to_timestamp()
        gap_date = pd.to_datetime(last_train_date) - pd.DateOffset(months=self.gap)
        # Compare like with like: convert the cut-off back to a Period when
        # the column is period-typed. The isinstance check replaces the
        # deprecated pd.api.types.is_period_dtype.
        if isinstance(fecha.dtype, pd.PeriodDtype):
            gap_date = pd.Period(gap_date, freq=fecha.dt.freq)
        # NOTE(review): with gap=0 the cut-off equals the last pre-test date
        # and the strict "<" excludes that date from training - confirm this
        # is intended.
        train_df = train_df[train_df["fecha"] < gap_date]
        return {
            "train_index": train_df.index,
            "test_index": test_df.index
        }


class PrepareXYStep(PipelineStep):
    """Pipeline step that slices ``df`` into feature and target matrices."""

    def execute(self, df, train_index, test_index) -> Dict:
        """Return the feature/target names and the train/test X and y frames.

        Targets are all columns whose name contains ``"target"``; features
        are every other column except ``fecha``.
        """
        targets = [col for col in df.columns if "target" in col]
        features = [col for col in df.columns if col != "fecha" and "target" not in col]
        # Rows and columns are selected in a single .loc call so no
        # all-column intermediate frame is materialised.
        return {
            "features": features,
            "targets": targets,
            "X_train": df.loc[train_index, features],
            "y_train": df.loc[train_index, targets],
            "X_test": df.loc[test_index, features],
            "y_test": df.loc[test_index, targets],
        }


class CreateTargetColumStep(PipelineStep):
    """Pipeline step that derives the ``target`` column from ``target_col``.

    Rows are grouped by (product_id, customer_id) and ordered by ``fecha``;
    ``target`` is the value of ``target_col`` found two rows later in the
    same group.
    """

    def __init__(self, name: Optional[str] = None, target_col: str = 'tn'):
        super().__init__(name)
        self.target_col = target_col

    def execute(self, df: pd.DataFrame) -> Dict:
        """Return a sorted copy of ``df`` with the ``target`` column added.

        The last two rows of every group have no row two positions ahead,
        so their ``target`` is NaN.
        """
        series_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(series_keys + ['fecha'])
        source = ordered.groupby(series_keys)[self.target_col]
        ordered['target'] = source.shift(-2)
        return {"df": ordered, "target_col": self.target_col}
    

class CreateMultiDiffTargetColumStep(PipelineStep):
    """Pipeline step that builds two chained difference targets.

    Within each (product_id, customer_id) group ordered by ``fecha``:
    ``target_1`` is the next row's ``target_col`` minus the current one and
    ``target_2`` is the value two rows ahead minus the value one row ahead.
    """

    def __init__(self, name: Optional[str] = None, target_col: str = 'tn'):
        super().__init__(name)
        self.target_col = target_col

    def execute(self, df: pd.DataFrame) -> Dict:
        """Return a sorted copy of ``df`` with ``target_1`` and ``target_2``.

        The result also carries ``needs_integration=True`` so that later
        steps know the targets are differences rather than levels.
        """
        series_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(series_keys + ['fecha'])
        current = ordered[self.target_col]
        per_series = ordered.groupby(series_keys)[self.target_col]
        one_ahead = per_series.shift(-1)
        two_ahead = per_series.shift(-2)
        ordered['target_1'] = one_ahead - current
        ordered['target_2'] = two_ahead - one_ahead
        return {
            "df": ordered,
            "target_col": self.target_col,
            "needs_integration": True,
        }


class CreateTargetColumDiffStep(PipelineStep):
    """Pipeline step that stores the target as a difference from today.

    Within each (product_id, customer_id) group ordered by ``fecha``,
    ``target`` is the value of ``target_col`` two rows ahead minus the
    current value of ``target_col``.
    """

    def __init__(self, name: Optional[str] = None, target_col: str = 'tn'):
        super().__init__(name)
        self.target_col = target_col

    def execute(self, df: pd.DataFrame) -> Dict:
        """Return a sorted copy of ``df`` with a freshly computed ``target``.

        The input frame is left untouched: any pre-existing ``target`` column
        is removed from the sorted copy only, instead of being dropped in
        place from the caller's DataFrame.
        """
        series_keys = ['product_id', 'customer_id']
        ordered = df.sort_values(series_keys + ['fecha'])
        # Remove a stale target from the copy so the new one is appended last,
        # matching the column order produced by the previous in-place drop.
        if 'target' in ordered.columns:
            del ordered['target']
        two_ahead = ordered.groupby(series_keys)[self.target_col].shift(-2)
        ordered['target'] = two_ahead - ordered[self.target_col]
        return {
            "df": ordered,
            "target_col": self.target_col,
            "needs_integration": True,
        }
    

class PredictStep(PipelineStep):
    """Pipeline step that runs a fitted model on the test rows."""

    def execute(self, df, test_index, model, features) -> Dict:
        """Predict for the rows of ``df`` labelled by ``test_index``.

        Args:
            df: Frame holding the feature columns.
            test_index: Row labels to predict for.
            model: Object exposing ``predict(X)``.
            features: Column labels passed to the model.

        Returns:
            ``{"predictions": ...}`` with whatever ``model.predict`` returns.
        """
        # Select rows and columns in one .loc call so no all-column
        # intermediate frame is materialised.
        X_predict = df.loc[test_index, features]
        return {"predictions": model.predict(X_predict)}


class IntegratePredictionsStep(PipelineStep):
    """Pipeline step that converts difference predictions back to levels."""

    def execute(self, df, predictions, test_index, target_col, needs_integration=False) -> Dict:
        """Add the current ``target_col`` level to predicted/observed deltas.

        When ``needs_integration`` is falsy only ``y_test`` (the ``target``
        column of the test rows) is returned. Otherwise both the integrated
        ``predictions`` and the integrated ``y_test`` are returned.
        """
        if not needs_integration:
            # Targets are already levels: just expose the observed values.
            return {"y_test": df.loc[test_index, ["target"]]}

        base_level = df.loc[test_index, target_col]

        # 1-D predictions are a single delta; otherwise the per-column deltas
        # are added up row by row.
        if predictions.ndim == 1:
            delta = pd.Series(predictions, index=test_index, name='predictions')
        else:
            delta = predictions.sum(axis=1)
        integrated = pd.Series(delta + base_level, index=test_index, name='predictions')

        # Every column whose name contains "target" is summed as a delta.
        # NOTE(review): sum(axis=1) skips NaN, so rows with missing targets
        # still get a numeric y_test - confirm this is intended.
        delta_columns = [name for name in df.columns if 'target' in name]
        observed = df.loc[test_index, delta_columns].sum(axis=1) + base_level
        y_test = pd.DataFrame(observed, index=test_index, columns=["target"])

        return {
            "predictions": integrated,
            "y_test": y_test
        }
    

## legacy code
class IntegratePredictionsStepOld(PipelineStep):
    """Legacy step that adds the base level back onto difference targets."""

    def execute(self, pipeline, predict_set, predictions, target_col, test) -> Dict:
        """Integrate predictions and observed targets for the test frame.

        The ``target_col`` values of the artifact named ``predict_set`` are
        added to ``predictions["predictions"]``, and ``test[target_col]`` is
        added to ``test["target"]``. Both inputs are modified in place and
        returned.
        """
        base_frame = pipeline.get_artifact(predict_set)
        integrated_predictions = predictions["predictions"] + base_frame[target_col]
        predictions["predictions"] = integrated_predictions
        integrated_target = test["target"] + test[target_col]
        test["target"] = integrated_target
        return {
            "predictions": predictions,
            "test": test
        }



class SplitDataFrameStepOld(PipelineStep):
    """Legacy step that splits ``df`` by the last distinct ``fecha`` values."""

    def __init__(self, name: Optional[str] = None):
        super().__init__(name)

    def execute(self, df) -> Dict:
        """Split ``df`` into train / eval / test / kaggle frames.

        Uses the sorted distinct ``fecha`` values: the latest one feeds
        ``kaggle_pred``, the third from the end feeds ``test``, the fourth
        from the end feeds ``eval_data`` and everything strictly earlier is
        ``train``. The second-to-last date is not used by any split (the
        original author's note: a gap is needed because the forecast is for
        month + 2).

        Raises:
            IndexError: if ``df`` has fewer than four distinct ``fecha`` values.
        """
        fecha = df["fecha"]
        dates = sorted(fecha.unique())
        kaggle_date, test_date, eval_date = dates[-1], dates[-3], dates[-4]
        return {
            "train": df[fecha < eval_date],
            "eval_data": df[fecha == eval_date],
            "test": df[fecha == test_date],
            "kaggle_pred": df[fecha == kaggle_date],
        }
    

class PrepareXYStepOld(PipelineStep):
    """Legacy step that builds the X / y matrices from pre-split frames."""

    def __init__(self, name: Optional[str] = None):
        super().__init__(name)

    def execute(self, train, eval_data, test, kaggle_pred) -> Dict:
        """Return feature/target matrices for every training stage.

        Features are all columns of ``train`` except ``fecha`` and
        ``target``. ``X_train``/``y_train`` stack train + eval, and
        ``X_train_final``/``y_train_final`` stack train + eval + test.
        """
        target = 'target'
        features = [col for col in train.columns if col not in ('fecha', target)]

        # Slice each frame once and reuse the pieces in the concatenations.
        X_train_alone, y_train_alone = train[features], train[target]
        X_eval, y_eval = eval_data[features], eval_data[target]
        X_test, y_test = test[features], test[target]

        return {
            "X_train": pd.concat([X_train_alone, X_eval]),
            "y_train": pd.concat([y_train_alone, y_eval]),
            "X_train_alone": X_train_alone,
            "y_train_alone": y_train_alone,
            "X_eval": X_eval,
            "y_eval": y_eval,
            "X_test": X_test,
            "y_test": y_test,
            "X_train_final": pd.concat([X_train_alone, X_eval, X_test]),
            "y_train_final": pd.concat([y_train_alone, y_eval, y_test]),
            "X_kaggle": kaggle_pred[features]
        }
        

# InverseScalePredictionsStep
import pandas as pd
from abc import ABC, abstractmethod
from typing import Dict, Optional


class PipelineScaler(ABC):
    """Abstract interface for scalers that operate on one DataFrame column."""

    def __init__(self, column: str):
        # Name of the column to scale.
        self.column = column
        # Fitted statistics; stays None until a subclass's fit() stores them.
        self.scaler_data = None

    @abstractmethod
    def fit(self, df: pd.DataFrame):
        """Learn the scaling statistics from ``df``."""

    @abstractmethod
    def transform(self, df: pd.DataFrame) -> pd.Series:
        """Return the scaled values of ``self.column`` for ``df``."""

    @abstractmethod
    def fit_transform(self, df: pd.DataFrame) -> pd.Series:
        """Fit on ``df`` and return its scaled column."""

    @abstractmethod
    def inverse_transform(self, df: pd.DataFrame) -> pd.Series:
        """Map scaled values of ``self.column`` back to the original scale."""

# TODO: apply a log1p transformation if necessary
# TODO: debug; for some reason this gives wrong results

class PipelineRobustScaler(PipelineScaler):
    """Median / IQR scaler fitted per (product_id, customer_id) group."""

    def fit(self, df: pd.DataFrame):
        """Store each group's median and inter-quartile range of ``column``."""
        grouped = df.groupby(['product_id', 'customer_id'])[self.column]  # SeriesGroupBy
        median = grouped.median()
        q1 = grouped.apply(lambda x: x.quantile(0.25))
        q3 = grouped.apply(lambda x: x.quantile(0.75))
        iqr = q3 - q1

        agg = pd.DataFrame({
            f'{self.column}_median_scaler': median,
            f'{self.column}_iqr_scaler': iqr
        })
        print(agg.head())
        self.scaler_data = agg
        return self

    def transform(self, df: pd.DataFrame) -> pd.Series:
        """Return ``(column - median) / IQR`` as ``<column>_scaled``.

        Non-finite results (e.g. a zero IQR) and groups without fitted
        statistics become 0; rows whose original value was NaN stay NaN.
        The input frame is not modified.

        Raises:
            ValueError: if the scaler has not been fitted.
        """
        if self.scaler_data is None:
            raise ValueError("Scaler has not been fitted yet.")
        median_col = f'{self.column}_median_scaler'
        iqr_col = f'{self.column}_iqr_scaler'
        original_nans = df[self.column].isna()

        merged = df.merge(self.scaler_data, on=['product_id', 'customer_id'], how='left')
        # merge() resets the index; restore the caller's row labels.
        merged.index = df.index

        scaled = (merged[self.column] - merged[median_col]) / merged[iqr_col]
        # Work on the Series and reassign: a chained
        # ``df[col].replace(..., inplace=True)`` does not update ``df`` under
        # pandas Copy-on-Write, which left +/-inf in the output. Using a float
        # NaN (not pd.NA) keeps the result a float column.
        scaled = scaled.replace([float('inf'), float('-inf')], float('nan'))
        scaled = scaled.fillna(0)
        # Rows that were missing in the input must remain missing.
        scaled = scaled.where(~original_nans)
        scaled.name = f"{self.column}_scaled"
        return scaled

    def fit_transform(self, df: pd.DataFrame) -> pd.Series:
        """Fit on ``df`` and return its scaled column."""
        return self.fit(df).transform(df)

    def inverse_transform(self, df: pd.DataFrame) -> pd.Series:
        """Return ``column * IQR + median`` using the fitted statistics.

        Raises:
            ValueError: if the scaler has not been fitted.
        """
        if self.scaler_data is None:
            raise ValueError("Scaler has not been fitted yet.")

        merged = df.merge(self.scaler_data, on=['product_id', 'customer_id'], how='left')
        # merge() resets the index; restore the caller's row labels.
        merged.index = df.index
        restored = (
            merged[self.column] * merged[f'{self.column}_iqr_scaler']
            + merged[f'{self.column}_median_scaler']
        )
        restored.name = f"{self.column}"
        return restored


class PipelineStandarScaler(PipelineScaler):
    """Mean / standard-deviation scaler fitted per (product_id, customer_id) group."""

    def fit(self, df: pd.DataFrame):
        """Store each group's mean and standard deviation of ``column``."""
        agg = df.groupby(['product_id', 'customer_id'])[self.column].agg(['mean', 'std']).rename(
            columns={'mean': f'{self.column}_mean_scaler', 'std': f'{self.column}_std_scaler'})
        self.scaler_data = agg
        return self

    def transform(self, df: pd.DataFrame) -> pd.Series:
        """Return ``(column - mean) / std`` as ``<column>_scaled``.

        Non-finite results (e.g. a zero or undefined std) and groups without
        fitted statistics become 0; rows whose original value was NaN stay
        NaN. The input frame is not modified.

        Raises:
            ValueError: if the scaler has not been fitted.
        """
        if self.scaler_data is None:
            raise ValueError("Scaler has not been fitted yet.")
        mean_col = f'{self.column}_mean_scaler'
        std_col = f'{self.column}_std_scaler'
        original_nans = df[self.column].isna()

        merged = df.merge(self.scaler_data, on=['product_id', 'customer_id'], how='left')
        # merge() resets the index; restore the caller's row labels.
        merged.index = df.index

        scaled = (merged[self.column] - merged[mean_col]) / merged[std_col]
        # Work on the Series and reassign: a chained
        # ``df[col].replace(..., inplace=True)`` does not update ``df`` under
        # pandas Copy-on-Write, which left +/-inf in the output. Using a float
        # NaN (not pd.NA) keeps the result a float column.
        scaled = scaled.replace([float('inf'), float('-inf')], float('nan'))
        # Fill what could not be scaled, then restore the original gaps.
        scaled = scaled.fillna(0)
        scaled = scaled.where(~original_nans)
        scaled.name = f"{self.column}_scaled"
        return scaled

    def fit_transform(self, df: pd.DataFrame) -> pd.Series:
        """Fit on ``df`` and return its scaled column."""
        return self.fit(df).transform(df)

    def inverse_transform(self, df: pd.DataFrame) -> pd.Series:
        """Return ``column * std + mean`` using the fitted statistics.

        Raises:
            ValueError: if the scaler has not been fitted.
        """
        if self.scaler_data is None:
            raise ValueError("Scaler has not been fitted yet.")

        merged = df.merge(self.scaler_data, on=['product_id', 'customer_id'], how='left')
        # merge() resets the index; restore the caller's row labels.
        merged.index = df.index
        restored = (
            merged[self.column] * merged[f'{self.column}_std_scaler']
            + merged[f'{self.column}_mean_scaler']
        )
        restored.name = f"{self.column}"
        return restored
    

class PipelineMinMaxScaler(PipelineScaler):
    """Per-series min-max scaler keyed by ``product_id``/``customer_id``.

    ``fit`` stores the maximum of ``self.column`` for every series together
    with a minimum that is forced to 0, so ``transform`` effectively divides
    each value by the maximum of its series.
    """

    def fit(self, df: pd.DataFrame):
        """Compute the per-series scaling table and store it in ``self.scaler_data``.

        Args:
            df: Frame containing ``product_id``, ``customer_id`` and
                ``self.column``.

        Returns:
            ``self``, so calls can be chained.
        """
        min_col = f'{self.column}_min_scaler'
        max_col = f'{self.column}_max_scaler'
        agg = df.groupby(['product_id', 'customer_id'])[self.column].agg(['min', 'max']).rename(
            columns={'min': min_col, 'max': max_col})
        # The minimum is pinned to 0 so every series shares the same origin.
        agg[min_col] = 0
        self.scaler_data = agg
        return self

    def transform(self, df: pd.DataFrame) -> pd.Series:
        """Scale ``self.column`` as ``(value - min) / (max - min)`` per series.

        Args:
            df: Frame containing ``product_id``, ``customer_id`` and
                ``self.column``. It is not modified.

        Returns:
            A Series named ``f"{self.column}_scaled"`` carrying the index of
            ``df``. Rows whose input value was missing stay missing; every
            other non-finite result (zero range, statistics missing for the
            series) is set to 0.

        Raises:
            ValueError: If ``self.scaler_data`` is ``None``.
        """
        if self.scaler_data is None:
            raise ValueError("Scaler has not been fitted yet.")

        min_col = f'{self.column}_min_scaler'
        max_col = f'{self.column}_max_scaler'
        # Positional mask: stays aligned with the rows even when the index of
        # ``df`` has duplicates.
        original_nans = df[self.column].isna().to_numpy()

        # The merge does not keep the index of ``df``, so it is put back.
        merged = df.merge(self.scaler_data, on=['product_id', 'customer_id'], how='left')
        merged.index = df.index

        value_range = merged[max_col] - merged[min_col]
        scaled = (merged[self.column] - merged[min_col]) / value_range
        # Reassign the result instead of calling ``replace(..., inplace=True)``
        # on a column selection: that chained in-place call is not guaranteed
        # to write back to the frame. NaN (not ``pd.NA``) keeps the dtype float.
        scaled = scaled.replace([float('inf'), float('-inf')], float('nan')).fillna(0)
        # Only values that were missing in the input remain missing.
        scaled = scaled.where(~original_nans)
        return scaled.rename(f"{self.column}_scaled")

    def fit_transform(self, df: pd.DataFrame) -> pd.Series:
        """Fit on ``df`` and return the scaled column for that same frame."""
        return self.fit(df).transform(df)

    def inverse_transform(self, df: pd.DataFrame) -> pd.Series:
        """Map scaled values back as ``value * (max - min) + min`` per series.

        Args:
            df: Frame containing ``product_id``, ``customer_id`` and the scaled
                values under ``self.column``. It is not modified.

        Returns:
            A Series named ``self.column`` carrying the index of ``df``.

        Raises:
            ValueError: If ``self.scaler_data`` is ``None``.
        """
        if self.scaler_data is None:
            raise ValueError("Scaler has not been fitted yet.")

        value_col = f"{self.column}"
        min_col = f'{self.column}_min_scaler'
        max_col = f'{self.column}_max_scaler'

        merged = df.merge(self.scaler_data, on=['product_id', 'customer_id'], how='left')
        merged.index = df.index

        restored = merged[value_col] * (merged[max_col] - merged[min_col]) + merged[min_col]
        return restored.rename(value_col)
    

class ScaleFeatureStep(PipelineStep):
    """Pipeline step that scales one column, or every column matching a regex.

    Args:
        column: Column name, or a regular expression when ``regex`` is true.
        regex: Treat ``column`` as a pattern and scale every matching column.
        override: Write the scaled values over the source column instead of
            into a new ``<column>_scaled`` column.
        scaler: Scaler class instantiated once per column with ``column=...``.
        name: Optional step name forwarded to the base class.
    """

    def __init__(self, column: str, regex=False, override=False, scaler=PipelineStandarScaler, name = None,):
        super().__init__(name)
        self.column = column
        self.scaler_cls = scaler
        self.regex = regex
        self.override = override

    def execute(self, df: pd.DataFrame, train_index) -> Dict:
        """Fit one scaler per selected column and add the scaled values to ``df``.

        ``df`` is modified in place. The returned dict holds ``"df"`` plus one
        ``"scaler_<output column>"`` entry per fitted scaler.

        Raises:
            ValueError: If ``regex`` is set and no column matches the pattern.
        """
        # NOTE(review): ``train_index`` is accepted but never used; each scaler
        # is fitted on every row of ``df`` — confirm this is intended.
        if not self.regex:
            columns = [self.column]
        else:
            columns = df.filter(regex=self.column, axis=1).columns.tolist()
            print(f"Columns found matching regex '{self.column}': {columns}")
            if not columns:
                raise ValueError(f"No columns found matching regex '{self.column}'")

        id_cols = ["product_id", "customer_id"]
        result = {"df": df}
        for column in columns:
            scaler = self.scaler_cls(column=column)
            output_name = column if self.override else f"{column}_scaled"
            scaler.fit(df[[*id_cols, column]])
            df[output_name] = scaler.transform(df[[*id_cols, column]])
            result[f"scaler_{output_name}"] = scaler
        return result
    

class InverseScalePredictionsStep(PipelineStep):
    """Pipeline step that maps scaled predictions and targets back to original units."""

    def execute(self, predictions, df, test_index, scaler_target=None) -> Dict:
        """Inverse-scale ``predictions`` and ``df["target"]`` with ``scaler_target``.

        When no scaler is supplied the step does nothing and returns ``None``.
        Otherwise ``df["target"]`` is overwritten in place and a dict with the
        rescaled ``"predictions"`` (indexed by ``test_index``, missing values
        replaced by 0) and ``"df"`` is returned.
        """
        if not scaler_target:
            return None

        id_cols = ["product_id", "customer_id"]

        # Build a frame holding the predictions next to the series identifiers
        # taken from ``df``, named the way the scaler expects.
        scaled_frame = pd.DataFrame(predictions, index=predictions.index)
        for id_col in id_cols:
            scaled_frame[id_col] = df[id_col]
        scaled_frame.columns = ["target", *id_cols]

        restored = pd.Series(scaler_target.inverse_transform(scaled_frame), name="predictions")
        restored.index = test_index
        restored = restored.fillna(0)

        df["target"] = scaler_target.inverse_transform(df[["target", *id_cols]])

        return {"predictions": restored, "df": df}

# IntegratePredictionsStep
from typing import Optional, Dict
import pandas as pd

class SplitDataFrameStep(PipelineStep):
    """Pipeline step that derives train and test row indexes from the ``fecha`` column.

    Args:
        test_date: Value of ``fecha`` that identifies the test rows.
        df: Name of the pipeline artifact holding the frame to split.
        gap: Number of months subtracted from the last training date to get
            the (exclusive) upper bound of the training rows.
        name: Optional step name forwarded to the base class.
    """

    def __init__(
            self,
            test_date="2019-12",
            df="df",
            gap=0,
            name: Optional[str] = None
        ):
        super().__init__(name)
        self.test_date = test_date
        self.df = df
        self.gap = gap

    def execute(self, pipeline) -> Dict:
        """Return ``{"train_index": ..., "test_index": ...}`` for the stored frame.

        Test rows are those with ``fecha == test_date``. Training rows are
        those with ``fecha`` strictly before the cutoff, where the cutoff is
        the latest ``fecha`` preceding ``test_date`` minus ``gap`` months.
        """
        df = pipeline.get_artifact(self.df)
        fecha = df["fecha"]
        test_df = df[fecha == self.test_date]
        train_df = df[fecha < self.test_date]

        last_train_date = train_df["fecha"].max()
        if isinstance(last_train_date, pd.Period):
            last_train_date = last_train_date.to_timestamp()
        gap_date = pd.to_datetime(last_train_date) - pd.DateOffset(months=self.gap)

        # ``pd.api.types.is_period_dtype`` is deprecated; checking the dtype
        # class is the supported replacement.
        if isinstance(fecha.dtype, pd.PeriodDtype):
            # Compare like with like: convert the cutoff to the column's frequency.
            gap_date = pd.Period(gap_date, freq=fecha.dt.freq)

        # NOTE(review): the comparison is strict, so with ``gap=0`` the last
        # period before ``test_date`` is itself excluded from training —
        # confirm this is intended.
        train_df = train_df[train_df["fecha"] < gap_date]
        return {
            "train_index": train_df.index,
            "test_index": test_df.index
        }


class PrepareXYStep(PipelineStep):
    """Pipeline step that splits a frame into train/test feature and target frames."""

    def execute(self, df, train_index, test_index) -> Dict:
        """Build the X/y frames for the given row indexes.

        Every column whose name contains ``"target"`` is a target; every other
        column except ``"fecha"`` is a feature.

        Returns:
            Dict with ``features`` and ``targets`` (lists of column names) and
            the frames ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
        """
        columns = df.columns
        features = [col for col in columns if col != "fecha" and "target" not in col]
        targets = [col for col in columns if "target" in col]
        # Rows and columns are selected in a single ``.loc`` call so the
        # unselected columns are never materialised for the chosen rows.
        return {
            "features": features,
            "targets": targets,
            "X_train": df.loc[train_index, features],
            "y_train": df.loc[train_index, targets],
            "X_test": df.loc[test_index, features],
            "y_test": df.loc[test_index, targets],
        }


class CreateTargetColumStep(PipelineStep):
    """Pipeline step that adds a ``target`` column holding the value two rows ahead.

    Args:
        name: Optional step name forwarded to the base class.
        target_col: Column the target is derived from.
    """

    def __init__(self, name: Optional[str] = None, target_col: str = 'tn'):
        super().__init__(name)
        self.target_col = target_col

    def execute(self, df: pd.DataFrame) -> Dict:
        """Return a sorted copy of ``df`` with ``target`` added, plus ``target_col``.

        Within each (product_id, customer_id) series, ordered by ``fecha``,
        ``target`` is ``target_col`` shifted by -2 rows.
        """
        series_keys = ['product_id', 'customer_id']
        ordered = df.sort_values([*series_keys, 'fecha'])
        per_series = ordered.groupby(series_keys)[self.target_col]
        ordered['target'] = per_series.shift(-2)
        return {"df": ordered, "target_col": self.target_col}
    

class CreateMultiDiffTargetColumStep(PipelineStep):
    """Pipeline step that adds two step-by-step difference targets.

    Args:
        name: Optional step name forwarded to the base class.
        target_col: Column the targets are derived from.
    """

    def __init__(self, name: Optional[str] = None, target_col: str = 'tn'):
        super().__init__(name)
        self.target_col = target_col

    def execute(self, df: pd.DataFrame) -> Dict:
        """Return a sorted copy of ``df`` with ``target_1`` and ``target_2`` added.

        Within each (product_id, customer_id) series, ordered by ``fecha``:
        ``target_1`` is the next value minus the current one, and ``target_2``
        is the value two rows ahead minus the next one. The result also
        carries ``target_col`` and ``needs_integration=True``.
        """
        series_keys = ['product_id', 'customer_id']
        ordered = df.sort_values([*series_keys, 'fecha'])

        per_series = ordered.groupby(series_keys)[self.target_col]
        one_ahead = per_series.shift(-1)
        two_ahead = per_series.shift(-2)

        ordered['target_1'] = one_ahead - ordered[self.target_col]
        ordered['target_2'] = two_ahead - one_ahead
        return {
            "df": ordered,
            "target_col": self.target_col,
            "needs_integration": True,
        }


class CreateTargetColumDiffStep(PipelineStep):
    """Pipeline step that adds a ``target`` column holding the two-row-ahead difference.

    Args:
        name: Optional step name forwarded to the base class.
        target_col: Column the target is derived from.
    """

    def __init__(self, name: Optional[str] = None, target_col: str = 'tn'):
        super().__init__(name)
        self.target_col = target_col

    def execute(self, df: pd.DataFrame) -> Dict:
        """Return a sorted copy of ``df`` whose ``target`` is the two-row-ahead change.

        Within each (product_id, customer_id) series, ordered by ``fecha``,
        ``target`` is the value of ``target_col`` two rows ahead minus the
        current value. Any existing ``target`` column is replaced. The result
        also carries ``target_col`` and ``needs_integration=True``.
        """
        series_keys = ['product_id', 'customer_id']
        # Sort first: ``sort_values`` returns a new frame, so dropping a stale
        # ``target`` afterwards no longer mutates the frame passed in by the
        # caller, and no extra copy is needed.
        df = df.sort_values([*series_keys, 'fecha'])
        df.drop(columns=["target"], inplace=True, errors='ignore')
        df['target'] = df.groupby(series_keys)[self.target_col].shift(-2) - df[self.target_col]
        return {
            "df": df,
            "target_col": self.target_col,
            "needs_integration": True,
        }
    

class PredictStep(PipelineStep):
    """Pipeline step that runs the model on the test rows."""

    def execute(self, df, test_index, model, features) -> Dict:
        """Return ``{"predictions": model.predict(X)}`` for the test rows.

        ``X`` is ``df`` restricted to ``test_index`` and the ``features``
        columns, selected in a single ``.loc`` call so the remaining columns
        are never materialised for those rows.
        """
        X_predict = df.loc[test_index, features]
        return {"predictions": model.predict(X_predict)}


class IntegratePredictionsStep(PipelineStep):
    """Pipeline step that turns difference predictions back into level predictions."""

    def execute(self, df, predictions, test_index, target_col, needs_integration=False) -> Dict:
        """Add the base column back to the predicted and actual differences.

        Without ``needs_integration`` only ``y_test`` (the ``target`` column
        of the test rows) is returned. Otherwise the predictions (summed
        across columns when two-dimensional) and the sum of every column whose
        name contains ``"target"`` are each added to ``df[target_col]`` for
        the test rows, and both ``predictions`` and ``y_test`` are returned.
        """
        if not needs_integration:
            return {"y_test": df.loc[test_index, ["target"]]}

        # Collapse the predictions into a single delta per test row.
        predicted_delta = (
            pd.Series(predictions, index=test_index, name='predictions')
            if predictions.ndim == 1
            else predictions.sum(axis=1)
        )
        integrated = predicted_delta + df.loc[test_index, target_col]
        level_predictions = pd.Series(integrated, index=test_index, name='predictions')

        # The actual level is rebuilt the same way from the target columns.
        diff_columns = [col for col in df.columns if 'target' in col]
        actual_delta = df.loc[test_index, diff_columns].sum(axis=1)
        actual_level = actual_delta + df.loc[test_index, target_col]
        y_test = pd.DataFrame(actual_level, index=test_index, columns=["target"])

        return {
            "predictions": level_predictions,
            "y_test": y_test
        }
    

## legacy code
class IntegratePredictionsStepOld(PipelineStep):
    """Legacy step that adds the base column back to predictions and test targets."""

    def execute(self, pipeline, predict_set, predictions, target_col, test) -> Dict:
        """Integrate difference predictions into the prediction and test frames.

        ``predictions["predictions"]`` gets ``target_col`` of the
        ``predict_set`` artifact added to it, and ``test["target"]`` gets
        ``test[target_col]`` added. Both frames are modified in place and
        returned under ``"predictions"`` and ``"test"``.
        """
        base_frame = pipeline.get_artifact(predict_set)
        integrated = predictions["predictions"] + base_frame[target_col]
        predictions["predictions"] = integrated

        actual = test["target"] + test[target_col]
        test["target"] = actual

        return {"predictions": predictions, "test": test}



class SplitDataFrameStepOld(PipelineStep):
    """Legacy step that splits a frame by position among its sorted ``fecha`` values."""

    def __init__(self, name: Optional[str] = None):
        super().__init__(name)

    def execute(self, df) -> None:
        """Return the train / eval / test / kaggle frames as a dict.

        With the distinct ``fecha`` values sorted ascending: the last one
        selects ``kaggle_pred``, the third from last selects ``test``, the
        fourth from last selects ``eval_data``, and everything before that is
        ``train``. (The ``-> None`` annotation is kept from the original
        signature even though a dict is returned.)
        """
        fecha = df["fecha"]
        ordered_dates = sorted(fecha.unique())

        kaggle_date = ordered_dates[-1]
        # The test date is two positions before the last one, leaving a gap.
        test_date = ordered_dates[-3]
        eval_date = ordered_dates[-4]

        return {
            "train": df[fecha < eval_date],
            "eval_data": df[fecha == eval_date],
            "test": df[fecha == test_date],
            "kaggle_pred": df[fecha == kaggle_date],
        }
    

class PrepareXYStepOld(PipelineStep):
    """Legacy step that builds the X/y frames for every stage of the old workflow."""

    def __init__(self, name: Optional[str] = None):
        super().__init__(name)

    def execute(self, train, eval_data, test, kaggle_pred) -> None:
        """Return a dict of feature frames and target series.

        Features are every column of ``train`` except ``fecha`` and
        ``target``. ``*_train`` concatenates train and eval, ``*_train_alone``
        is train only, and ``*_train_final`` concatenates train, eval and
        test. (The ``-> None`` annotation is kept from the original signature
        even though a dict is returned.)
        """
        target = 'target'
        excluded = ['fecha', 'target']
        features = [col for col in train.columns if col not in excluded]

        def stacked(frames, selection):
            # Concatenate the same selection taken from each frame, in order.
            return pd.concat([frame[selection] for frame in frames])

        train_eval = [train, eval_data]
        train_eval_test = [train, eval_data, test]

        return {
            "X_train": stacked(train_eval, features),
            "y_train": stacked(train_eval, target),
            "X_train_alone": train[features],
            "y_train_alone": train[target],
            "X_eval": eval_data[features],
            "y_eval": eval_data[target],
            "X_test": test[features],
            "y_test": test[target],
            "X_train_final": stacked(train_eval_test, features),
            "y_train_final": stacked(train_eval_test, target),
            "X_kaggle": kaggle_pred[features]
        }
        

# EvaluatePredictionsSteps
import pandas as pd
import numpy as np
import lightgbm as lgb
from xgboost import XGBRegressor
import xgboost as xgb
import os
import pickle
from typing import Optional
import datetime
import matplotlib.pyplot as plt

class EvaluatePredictionsSteps(PipelineStep):
    """Pipeline step that scores predictions aggregated per product.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.
    """

    def execute(self, df, y_test, predictions, test_index) -> None:
        """Compute and print the aggregated test error.

        Targets and predictions are summed per ``product_id``; the error is
        the sum of absolute per-product differences divided by the sum of the
        real values. Returns a dict with ``eval_df`` (per product),
        ``eval_df_total`` (per test row) and ``total_error``. (The ``-> None``
        annotation is kept from the original signature.)
        """
        eval_df_total = pd.DataFrame({
            "product_id": df.loc[test_index, "product_id"],
            "customer_id": df.loc[test_index, "customer_id"],
            "target": y_test["target"].values,
            "predictions": predictions.values
        })

        per_product = eval_df_total.groupby(["product_id"])
        eval_df = per_product.agg({"target": "sum", "predictions": "sum"}).reset_index()
        eval_df['tn_real'] = eval_df['target']
        eval_df['tn_pred'] = eval_df['predictions']

        abs_error = np.abs(eval_df['tn_real'] - eval_df['tn_pred'])
        total_error = np.sum(abs_error) / np.sum(eval_df['tn_real'])
        print(f"Error en test: {total_error:.4f}")
        print("\nTop 5 productos con mayor error absoluto:")
        eval_df['error_absoluto'] = abs_error
        print(eval_df.sort_values('error_absoluto', ascending=False).head())

        return {
            "eval_df": eval_df,
            "eval_df_total": eval_df_total,
            "total_error": total_error
        }


class PlotFeatureImportanceStep(PipelineStep):
    """Pipeline step that plots the model's feature importances.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.
    """

    def execute(self, model) -> None:
        """Call ``model.plot_importance`` limited to 20 features; returns ``None``."""
        # The unused empty DataFrame the original allocated here was dropped.
        model.plot_importance(max_num_features=20)


class KaggleSubmissionStep(PipelineStep):
    """Pipeline step that aggregates predictions into a per-product submission.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.
    """

    def execute(self, df, test_index, predictions) -> None:
        """Return ``{"submission": frame}`` with columns ``product_id`` and ``tn``.

        ``tn`` is the sum of the predictions of all test rows of a product.
        (The ``-> None`` annotation is kept from the original signature.)
        """
        per_row = pd.DataFrame({
            "product_id": df.loc[test_index, "product_id"],
            "customer_id": df.loc[test_index, "customer_id"],
            "predictions": predictions.values
        })
        per_product = per_row.groupby("product_id")["predictions"].sum()
        submission = per_product.rename("tn").reset_index()
        return {"submission": submission}
    

class SaveSubmissionStep(PipelineStep):
    """Pipeline step that writes the submission CSV into a timestamped folder.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.

    Args:
        exp_name: Experiment name used in the folder and file names.
        name: Optional step name forwarded to the base class.
    """

    def __init__(self, exp_name: str, name: Optional[str] = None):
        super().__init__(name)
        self.exp_name = exp_name

    def execute(self, submission, total_error) -> None:
        """Write ``submission`` to ``experiments/<now>_<exp_name>/`` without the index.

        The file name embeds the experiment name and ``total_error``.
        """
        timestamp = str(datetime.datetime.now())
        exp_dir = f"experiments/{timestamp}_{self.exp_name}"
        os.makedirs(exp_dir, exist_ok=True)

        csv_name = f"submission_{self.exp_name}_{total_error}.csv"
        submission.to_csv(os.path.join(exp_dir, csv_name), index=False)
        

class SaveExperimentStep(PipelineStep):
    """Pipeline step that persists the model, test error and submission.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.

    Args:
        exp_name: Experiment name; files go under ``experiments/<exp_name>``.
        save_dataframes: Also dump every DataFrame artifact as CSV.
        name: Optional step name forwarded to the base class.
    """

    def __init__(self, exp_name: str, save_dataframes=False, name = None):
        super().__init__(name)
        self.exp_name = exp_name
        self.save_dataframes = save_dataframes

    def execute(self, pipeline) -> None:
        """Write the experiment outputs and drop the submission artifact.

        Saves ``model.pkl`` (pickle), ``total_error.txt`` and the submission
        CSV, deletes the ``submission`` artifact from the pipeline, and, when
        ``save_dataframes`` is set, writes every remaining DataFrame artifact
        to ``<artifact name>.csv``.
        """
        exp_dir = f"experiments/{self.exp_name}"
        os.makedirs(exp_dir, exist_ok=True)

        def in_exp_dir(file_name):
            # Path of a file inside the experiment directory.
            return os.path.join(exp_dir, file_name)

        model = pipeline.get_artifact("model")
        with open(in_exp_dir("model.pkl"), "wb") as model_file:
            pickle.dump(model, model_file)

        total_error = pipeline.get_artifact("total_error")
        with open(in_exp_dir("total_error.txt"), "w") as error_file:
            error_file.write(str(total_error))

        submission = pipeline.get_artifact("submission")
        submission.to_csv(in_exp_dir(f"submission_{self.exp_name}_{total_error:.4f}.csv"), index=False)

        # Only the submission artifact is removed from the pipeline.
        pipeline.del_artifact("submission")

        if not self.save_dataframes:
            return
        for artifact_name, artifact in pipeline.artifacts.items():
            if isinstance(artifact, pd.DataFrame):
                artifact.to_csv(in_exp_dir(f"{artifact_name}.csv"), index=False)
        # Copying the source notebook into the experiment directory is disabled.


class SaveDataFrameStep(PipelineStep):
    def __init__(self, df_name: str, file_name: str, ext = "pickle", name: Optional[str] = None):
        super().__init__(name)
        self.df_name = df_name
        self.file_name = file_name
        self.ext = ext

    def execute(self, pipeline) -> None:
        df = pipeline.get_artifact(self.df_name)
        if self.ext == "pickle":
            df.to_pickle(self.file_name)
        elif self.ext == "parquet":
            df.to_parquet(f"{self.file_name}.parquet", index=False)
        elif self.ext == "csv":
            df.to_csv(f"{self.file_name}.csv", index=False)
        else:
            raise ValueError(f"Unsupported file extension: {self.ext}")


class SaveScalerStep(PipelineStep):
    """Pipeline step that pickles a scaler artifact to a file.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.

    Args:
        scaler_name: Name of the artifact to save.
        file_name: Path of the pickle file to write.
        name: Optional step name forwarded to the base class.
    """

    def __init__(self, scaler_name: str, file_name: str, name: Optional[str] = None):
        super().__init__(name)
        self.scaler_name = scaler_name
        self.file_name = file_name

    def execute(self, pipeline) -> None:
        """Fetch the artifact and pickle it to ``self.file_name``."""
        # The artifact is fetched before the file is opened, so a failed
        # lookup does not leave an empty file behind.
        scaler = pipeline.get_artifact(self.scaler_name)
        with open(self.file_name, "wb") as out_file:
            pickle.Pickler(out_file).dump(scaler)

# PlotFeatureImportanceStep
import pandas as pd
import numpy as np
import lightgbm as lgb
from xgboost import XGBRegressor
import xgboost as xgb
import os
import pickle
from typing import Optional
import datetime
import matplotlib.pyplot as plt

class EvaluatePredictionsSteps(PipelineStep):
    """Pipeline step that scores predictions aggregated per product.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.
    """

    def execute(self, df, y_test, predictions, test_index) -> None:
        """Compute and print the aggregated test error.

        Targets and predictions are summed per ``product_id``; the error is
        the sum of absolute per-product differences divided by the sum of the
        real values. Returns a dict with ``eval_df`` (per product),
        ``eval_df_total`` (per test row) and ``total_error``. (The ``-> None``
        annotation is kept from the original signature.)
        """
        eval_df_total = pd.DataFrame({
            "product_id": df.loc[test_index, "product_id"],
            "customer_id": df.loc[test_index, "customer_id"],
            "target": y_test["target"].values,
            "predictions": predictions.values
        })

        per_product = eval_df_total.groupby(["product_id"])
        eval_df = per_product.agg({"target": "sum", "predictions": "sum"}).reset_index()
        eval_df['tn_real'] = eval_df['target']
        eval_df['tn_pred'] = eval_df['predictions']

        abs_error = np.abs(eval_df['tn_real'] - eval_df['tn_pred'])
        total_error = np.sum(abs_error) / np.sum(eval_df['tn_real'])
        print(f"Error en test: {total_error:.4f}")
        print("\nTop 5 productos con mayor error absoluto:")
        eval_df['error_absoluto'] = abs_error
        print(eval_df.sort_values('error_absoluto', ascending=False).head())

        return {
            "eval_df": eval_df,
            "eval_df_total": eval_df_total,
            "total_error": total_error
        }


class PlotFeatureImportanceStep(PipelineStep):
    """Pipeline step that plots the model's feature importances.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.
    """

    def execute(self, model) -> None:
        """Call ``model.plot_importance`` limited to 20 features; returns ``None``."""
        # The unused empty DataFrame the original allocated here was dropped.
        model.plot_importance(max_num_features=20)


class KaggleSubmissionStep(PipelineStep):
    """Pipeline step that aggregates predictions into a per-product submission.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.
    """

    def execute(self, df, test_index, predictions) -> None:
        """Return ``{"submission": frame}`` with columns ``product_id`` and ``tn``.

        ``tn`` is the sum of the predictions of all test rows of a product.
        (The ``-> None`` annotation is kept from the original signature.)
        """
        per_row = pd.DataFrame({
            "product_id": df.loc[test_index, "product_id"],
            "customer_id": df.loc[test_index, "customer_id"],
            "predictions": predictions.values
        })
        per_product = per_row.groupby("product_id")["predictions"].sum()
        submission = per_product.rename("tn").reset_index()
        return {"submission": submission}
    

class SaveSubmissionStep(PipelineStep):
    """Pipeline step that writes the submission CSV into a timestamped folder.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.

    Args:
        exp_name: Experiment name used in the folder and file names.
        name: Optional step name forwarded to the base class.
    """

    def __init__(self, exp_name: str, name: Optional[str] = None):
        super().__init__(name)
        self.exp_name = exp_name

    def execute(self, submission, total_error) -> None:
        """Write ``submission`` to ``experiments/<now>_<exp_name>/`` without the index.

        The file name embeds the experiment name and ``total_error``.
        """
        timestamp = str(datetime.datetime.now())
        exp_dir = f"experiments/{timestamp}_{self.exp_name}"
        os.makedirs(exp_dir, exist_ok=True)

        csv_name = f"submission_{self.exp_name}_{total_error}.csv"
        submission.to_csv(os.path.join(exp_dir, csv_name), index=False)
        

class SaveExperimentStep(PipelineStep):
    """Pipeline step that persists the model, test error and submission.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.

    Args:
        exp_name: Experiment name; files go under ``experiments/<exp_name>``.
        save_dataframes: Also dump every DataFrame artifact as CSV.
        name: Optional step name forwarded to the base class.
    """

    def __init__(self, exp_name: str, save_dataframes=False, name = None):
        super().__init__(name)
        self.exp_name = exp_name
        self.save_dataframes = save_dataframes

    def execute(self, pipeline) -> None:
        """Write the experiment outputs and drop the submission artifact.

        Saves ``model.pkl`` (pickle), ``total_error.txt`` and the submission
        CSV, deletes the ``submission`` artifact from the pipeline, and, when
        ``save_dataframes`` is set, writes every remaining DataFrame artifact
        to ``<artifact name>.csv``.
        """
        exp_dir = f"experiments/{self.exp_name}"
        os.makedirs(exp_dir, exist_ok=True)

        def in_exp_dir(file_name):
            # Path of a file inside the experiment directory.
            return os.path.join(exp_dir, file_name)

        model = pipeline.get_artifact("model")
        with open(in_exp_dir("model.pkl"), "wb") as model_file:
            pickle.dump(model, model_file)

        total_error = pipeline.get_artifact("total_error")
        with open(in_exp_dir("total_error.txt"), "w") as error_file:
            error_file.write(str(total_error))

        submission = pipeline.get_artifact("submission")
        submission.to_csv(in_exp_dir(f"submission_{self.exp_name}_{total_error:.4f}.csv"), index=False)

        # Only the submission artifact is removed from the pipeline.
        pipeline.del_artifact("submission")

        if not self.save_dataframes:
            return
        for artifact_name, artifact in pipeline.artifacts.items():
            if isinstance(artifact, pd.DataFrame):
                artifact.to_csv(in_exp_dir(f"{artifact_name}.csv"), index=False)
        # Copying the source notebook into the experiment directory is disabled.


class SaveDataFrameStep(PipelineStep):
    def __init__(self, df_name: str, file_name: str, ext = "pickle", name: Optional[str] = None):
        super().__init__(name)
        self.df_name = df_name
        self.file_name = file_name
        self.ext = ext

    def execute(self, pipeline) -> None:
        df = pipeline.get_artifact(self.df_name)
        if self.ext == "pickle":
            df.to_pickle(self.file_name)
        elif self.ext == "parquet":
            df.to_parquet(f"{self.file_name}.parquet", index=False)
        elif self.ext == "csv":
            df.to_csv(f"{self.file_name}.csv", index=False)
        else:
            raise ValueError(f"Unsupported file extension: {self.ext}")


class SaveScalerStep(PipelineStep):
    """Pipeline step that pickles a scaler artifact to a file.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.

    Args:
        scaler_name: Name of the artifact to save.
        file_name: Path of the pickle file to write.
        name: Optional step name forwarded to the base class.
    """

    def __init__(self, scaler_name: str, file_name: str, name: Optional[str] = None):
        super().__init__(name)
        self.scaler_name = scaler_name
        self.file_name = file_name

    def execute(self, pipeline) -> None:
        """Fetch the artifact and pickle it to ``self.file_name``."""
        # The artifact is fetched before the file is opened, so a failed
        # lookup does not leave an empty file behind.
        scaler = pipeline.get_artifact(self.scaler_name)
        with open(self.file_name, "wb") as out_file:
            pickle.Pickler(out_file).dump(scaler)

# KaggleSubmissionStep
import pandas as pd
import numpy as np
import lightgbm as lgb
from xgboost import XGBRegressor
import xgboost as xgb
import os
import pickle
from typing import Optional
import datetime
import matplotlib.pyplot as plt

class EvaluatePredictionsSteps(PipelineStep):
    """Pipeline step that scores predictions aggregated per product.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.
    """

    def execute(self, df, y_test, predictions, test_index) -> None:
        """Compute and print the aggregated test error.

        Targets and predictions are summed per ``product_id``; the error is
        the sum of absolute per-product differences divided by the sum of the
        real values. Returns a dict with ``eval_df`` (per product),
        ``eval_df_total`` (per test row) and ``total_error``. (The ``-> None``
        annotation is kept from the original signature.)
        """
        eval_df_total = pd.DataFrame({
            "product_id": df.loc[test_index, "product_id"],
            "customer_id": df.loc[test_index, "customer_id"],
            "target": y_test["target"].values,
            "predictions": predictions.values
        })

        per_product = eval_df_total.groupby(["product_id"])
        eval_df = per_product.agg({"target": "sum", "predictions": "sum"}).reset_index()
        eval_df['tn_real'] = eval_df['target']
        eval_df['tn_pred'] = eval_df['predictions']

        abs_error = np.abs(eval_df['tn_real'] - eval_df['tn_pred'])
        total_error = np.sum(abs_error) / np.sum(eval_df['tn_real'])
        print(f"Error en test: {total_error:.4f}")
        print("\nTop 5 productos con mayor error absoluto:")
        eval_df['error_absoluto'] = abs_error
        print(eval_df.sort_values('error_absoluto', ascending=False).head())

        return {
            "eval_df": eval_df,
            "eval_df_total": eval_df_total,
            "total_error": total_error
        }


class PlotFeatureImportanceStep(PipelineStep):
    """Pipeline step that plots the model's feature importances.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.
    """

    def execute(self, model) -> None:
        """Call ``model.plot_importance`` limited to 20 features; returns ``None``."""
        # The unused empty DataFrame the original allocated here was dropped.
        model.plot_importance(max_num_features=20)


class KaggleSubmissionStep(PipelineStep):
    """Pipeline step that aggregates predictions into a per-product submission.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.
    """

    def execute(self, df, test_index, predictions) -> None:
        """Return ``{"submission": frame}`` with columns ``product_id`` and ``tn``.

        ``tn`` is the sum of the predictions of all test rows of a product.
        (The ``-> None`` annotation is kept from the original signature.)
        """
        per_row = pd.DataFrame({
            "product_id": df.loc[test_index, "product_id"],
            "customer_id": df.loc[test_index, "customer_id"],
            "predictions": predictions.values
        })
        per_product = per_row.groupby("product_id")["predictions"].sum()
        submission = per_product.rename("tn").reset_index()
        return {"submission": submission}
    

class SaveSubmissionStep(PipelineStep):
    """Pipeline step that writes the submission CSV into a timestamped folder.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.

    Args:
        exp_name: Experiment name used in the folder and file names.
        name: Optional step name forwarded to the base class.
    """

    def __init__(self, exp_name: str, name: Optional[str] = None):
        super().__init__(name)
        self.exp_name = exp_name

    def execute(self, submission, total_error) -> None:
        """Write ``submission`` to ``experiments/<now>_<exp_name>/`` without the index.

        The file name embeds the experiment name and ``total_error``.
        """
        timestamp = str(datetime.datetime.now())
        exp_dir = f"experiments/{timestamp}_{self.exp_name}"
        os.makedirs(exp_dir, exist_ok=True)

        csv_name = f"submission_{self.exp_name}_{total_error}.csv"
        submission.to_csv(os.path.join(exp_dir, csv_name), index=False)
        

class SaveExperimentStep(PipelineStep):
    """Pipeline step that persists the model, test error and submission.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.

    Args:
        exp_name: Experiment name; files go under ``experiments/<exp_name>``.
        save_dataframes: Also dump every DataFrame artifact as CSV.
        name: Optional step name forwarded to the base class.
    """

    def __init__(self, exp_name: str, save_dataframes=False, name = None):
        super().__init__(name)
        self.exp_name = exp_name
        self.save_dataframes = save_dataframes

    def execute(self, pipeline) -> None:
        """Write the experiment outputs and drop the submission artifact.

        Saves ``model.pkl`` (pickle), ``total_error.txt`` and the submission
        CSV, deletes the ``submission`` artifact from the pipeline, and, when
        ``save_dataframes`` is set, writes every remaining DataFrame artifact
        to ``<artifact name>.csv``.
        """
        exp_dir = f"experiments/{self.exp_name}"
        os.makedirs(exp_dir, exist_ok=True)

        def in_exp_dir(file_name):
            # Path of a file inside the experiment directory.
            return os.path.join(exp_dir, file_name)

        model = pipeline.get_artifact("model")
        with open(in_exp_dir("model.pkl"), "wb") as model_file:
            pickle.dump(model, model_file)

        total_error = pipeline.get_artifact("total_error")
        with open(in_exp_dir("total_error.txt"), "w") as error_file:
            error_file.write(str(total_error))

        submission = pipeline.get_artifact("submission")
        submission.to_csv(in_exp_dir(f"submission_{self.exp_name}_{total_error:.4f}.csv"), index=False)

        # Only the submission artifact is removed from the pipeline.
        pipeline.del_artifact("submission")

        if not self.save_dataframes:
            return
        for artifact_name, artifact in pipeline.artifacts.items():
            if isinstance(artifact, pd.DataFrame):
                artifact.to_csv(in_exp_dir(f"{artifact_name}.csv"), index=False)
        # Copying the source notebook into the experiment directory is disabled.


class SaveDataFrameStep(PipelineStep):
    def __init__(self, df_name: str, file_name: str, ext = "pickle", name: Optional[str] = None):
        super().__init__(name)
        self.df_name = df_name
        self.file_name = file_name
        self.ext = ext

    def execute(self, pipeline) -> None:
        df = pipeline.get_artifact(self.df_name)
        if self.ext == "pickle":
            df.to_pickle(self.file_name)
        elif self.ext == "parquet":
            df.to_parquet(f"{self.file_name}.parquet", index=False)
        elif self.ext == "csv":
            df.to_csv(f"{self.file_name}.csv", index=False)
        else:
            raise ValueError(f"Unsupported file extension: {self.ext}")


class SaveScalerStep(PipelineStep):
    """Pipeline step that pickles a scaler artifact to a file.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.

    Args:
        scaler_name: Name of the artifact to save.
        file_name: Path of the pickle file to write.
        name: Optional step name forwarded to the base class.
    """

    def __init__(self, scaler_name: str, file_name: str, name: Optional[str] = None):
        super().__init__(name)
        self.scaler_name = scaler_name
        self.file_name = file_name

    def execute(self, pipeline) -> None:
        """Fetch the artifact and pickle it to ``self.file_name``."""
        # The artifact is fetched before the file is opened, so a failed
        # lookup does not leave an empty file behind.
        scaler = pipeline.get_artifact(self.scaler_name)
        with open(self.file_name, "wb") as out_file:
            pickle.Pickler(out_file).dump(scaler)

# SaveSubmissionStep
import pandas as pd
import numpy as np
import lightgbm as lgb
from xgboost import XGBRegressor
import xgboost as xgb
import os
import pickle
from typing import Optional
import datetime
import matplotlib.pyplot as plt

class EvaluatePredictionsSteps(PipelineStep):
    """Pipeline step that scores predictions aggregated per product.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.
    """

    def execute(self, df, y_test, predictions, test_index) -> None:
        """Compute and print the aggregated test error.

        Targets and predictions are summed per ``product_id``; the error is
        the sum of absolute per-product differences divided by the sum of the
        real values. Returns a dict with ``eval_df`` (per product),
        ``eval_df_total`` (per test row) and ``total_error``. (The ``-> None``
        annotation is kept from the original signature.)
        """
        eval_df_total = pd.DataFrame({
            "product_id": df.loc[test_index, "product_id"],
            "customer_id": df.loc[test_index, "customer_id"],
            "target": y_test["target"].values,
            "predictions": predictions.values
        })

        per_product = eval_df_total.groupby(["product_id"])
        eval_df = per_product.agg({"target": "sum", "predictions": "sum"}).reset_index()
        eval_df['tn_real'] = eval_df['target']
        eval_df['tn_pred'] = eval_df['predictions']

        abs_error = np.abs(eval_df['tn_real'] - eval_df['tn_pred'])
        total_error = np.sum(abs_error) / np.sum(eval_df['tn_real'])
        print(f"Error en test: {total_error:.4f}")
        print("\nTop 5 productos con mayor error absoluto:")
        eval_df['error_absoluto'] = abs_error
        print(eval_df.sort_values('error_absoluto', ascending=False).head())

        return {
            "eval_df": eval_df,
            "eval_df_total": eval_df_total,
            "total_error": total_error
        }


class PlotFeatureImportanceStep(PipelineStep):
    """Pipeline step that plots the model's feature importances.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.
    """

    def execute(self, model) -> None:
        """Call ``model.plot_importance`` limited to 20 features; returns ``None``."""
        # The unused empty DataFrame the original allocated here was dropped.
        model.plot_importance(max_num_features=20)


class KaggleSubmissionStep(PipelineStep):
    """Pipeline step that aggregates predictions into a per-product submission.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.
    """

    def execute(self, df, test_index, predictions) -> None:
        """Return ``{"submission": frame}`` with columns ``product_id`` and ``tn``.

        ``tn`` is the sum of the predictions of all test rows of a product.
        (The ``-> None`` annotation is kept from the original signature.)
        """
        per_row = pd.DataFrame({
            "product_id": df.loc[test_index, "product_id"],
            "customer_id": df.loc[test_index, "customer_id"],
            "predictions": predictions.values
        })
        per_product = per_row.groupby("product_id")["predictions"].sum()
        submission = per_product.rename("tn").reset_index()
        return {"submission": submission}
    

class SaveSubmissionStep(PipelineStep):
    """Pipeline step that writes the submission CSV into a timestamped folder.

    NOTE(review): identical definitions of this class appear several times in
    this file; the last one executed is the one bound to the name.

    Args:
        exp_name: Experiment name used in the folder and file names.
        name: Optional step name forwarded to the base class.
    """

    def __init__(self, exp_name: str, name: Optional[str] = None):
        super().__init__(name)
        self.exp_name = exp_name

    def execute(self, submission, total_error) -> None:
        """Write ``submission`` to ``experiments/<now>_<exp_name>/`` without the index.

        The file name embeds the experiment name and ``total_error``.
        """
        timestamp = str(datetime.datetime.now())
        exp_dir = f"experiments/{timestamp}_{self.exp_name}"
        os.makedirs(exp_dir, exist_ok=True)

        csv_name = f"submission_{self.exp_name}_{total_error}.csv"
        submission.to_csv(os.path.join(exp_dir, csv_name), index=False)
        

class SaveExperimentStep(PipelineStep):
    """Persist the model, test error and submission of a pipeline run."""

    def __init__(self, exp_name: str, save_dataframes=False, name = None):
        """Configure where and what to save.

        Args:
            exp_name: Experiment label; output goes to ``experiments/<exp_name>``.
            save_dataframes: When true, every DataFrame artifact still held by
                the pipeline is also dumped as CSV.
            name: Optional step name forwarded to ``PipelineStep``.
        """
        super().__init__(name)
        self.exp_name = exp_name
        self.save_dataframes = save_dataframes

    def execute(self, pipeline) -> None:
        """Write the run's artifacts to the experiment directory.

        Reads the ``model``, ``total_error`` and ``submission`` artifacts from
        ``pipeline`` and removes ``submission`` from it afterwards.

        Args:
            pipeline: Object exposing ``get_artifact``, ``del_artifact`` and an
                ``artifacts`` mapping.
        """
        out_dir = f"experiments/{self.exp_name}"
        os.makedirs(out_dir, exist_ok=True)

        # Trained model, pickled as-is.
        trained_model = pipeline.get_artifact("model")
        with open(os.path.join(out_dir, "model.pkl"), "wb") as model_file:
            pickle.dump(trained_model, model_file)

        # Total test error, stored as plain text.
        total_error = pipeline.get_artifact("total_error")
        with open(os.path.join(out_dir, "total_error.txt"), "w") as error_file:
            error_file.write(str(total_error))

        # Submission CSV; the error (4 decimals) is part of the file name.
        submission = pipeline.get_artifact("submission")
        submission_name = f"submission_{self.exp_name}_{total_error:.4f}.csv"
        submission.to_csv(os.path.join(out_dir, submission_name), index=False)

        # Only the submission is dropped from the artifacts; the model and the
        # error stay registered in the pipeline.
        pipeline.del_artifact("submission")

        if not self.save_dataframes:
            return

        # Dump every remaining DataFrame artifact as CSV.
        for artifact_name, artifact in pipeline.artifacts.items():
            if isinstance(artifact, pd.DataFrame):
                csv_path = os.path.join(out_dir, f"{artifact_name}.csv")
                artifact.to_csv(csv_path, index=False)

        # Copying the notebook into the experiment folder is currently disabled:
        #notebook_path = fallback_latest_notebook()
        #shutil.copy(notebook_path, os.path.join(exp_dir, f"notebook_{self.exp_name}.ipynb"))


class SaveDataFrameStep(PipelineStep):
    """Write one DataFrame artifact to disk as pickle, parquet or CSV."""

    def __init__(self, df_name: str, file_name: str, ext = "pickle", name: Optional[str] = None):
        """Configure which artifact to save and where.

        Args:
            df_name: Name of the artifact to fetch from the pipeline.
            file_name: Output path. Used verbatim for ``"pickle"``; the
                ``.parquet`` / ``.csv`` suffix is appended for the other formats.
            ext: One of ``"pickle"``, ``"parquet"`` or ``"csv"``.
            name: Optional step name forwarded to ``PipelineStep``.
        """
        super().__init__(name)
        self.df_name = df_name
        self.file_name = file_name
        self.ext = ext

    def execute(self, pipeline) -> None:
        """Fetch the artifact and serialise it in the configured format.

        Raises:
            ValueError: If ``ext`` is not one of the supported formats.
        """
        frame = pipeline.get_artifact(self.df_name)
        fmt = self.ext

        if fmt == "pickle":
            frame.to_pickle(self.file_name)
            return
        if fmt == "parquet":
            frame.to_parquet(f"{self.file_name}.parquet", index=False)
            return
        if fmt == "csv":
            frame.to_csv(f"{self.file_name}.csv", index=False)
            return

        raise ValueError(f"Unsupported file extension: {self.ext}")


class SaveScalerStep(PipelineStep):
    """Pickle a scaler artifact of the pipeline to a file."""

    def __init__(self, scaler_name: str, file_name: str, name: Optional[str] = None):
        """Configure which artifact to pickle and the destination path.

        Args:
            scaler_name: Name of the artifact to fetch from the pipeline.
            file_name: Path of the pickle file to write.
            name: Optional step name forwarded to ``PipelineStep``.
        """
        super().__init__(name)
        self.scaler_name = scaler_name
        self.file_name = file_name

    def execute(self, pipeline) -> None:
        """Fetch the scaler artifact and dump it with ``pickle``."""
        # The artifact is resolved before the file is opened, so a missing
        # artifact does not leave an empty file behind.
        fitted_scaler = pipeline.get_artifact(self.scaler_name)
        with open(self.file_name, "wb") as out_file:
            pickle.dump(fitted_scaler, out_file)

# PipelineMinMaxScaler
import pandas as pd
from abc import ABC, abstractmethod
from typing import Dict, Optional


class PipelineScaler(ABC):
    """Abstract interface for scalers that operate on a single column.

    Attributes are set in ``__init__``:
      * ``column``: name of the column the scaler works on.
      * ``scaler_data``: fitted state; ``None`` until a subclass sets it.
    """

    def __init__(self, column: str):
        self.scaler_data = None
        self.column = column

    @abstractmethod
    def fit(self, df: pd.DataFrame):
        """Learn the scaling parameters from ``df``."""

    @abstractmethod
    def transform(self, df: pd.DataFrame) -> pd.Series:
        """Return the scaled version of the column in ``df``."""

    @abstractmethod
    def fit_transform(self, df: pd.DataFrame) -> pd.Series:
        """Fit on ``df`` and return its scaled column."""

    @abstractmethod
    def inverse_transform(self, df: pd.DataFrame) -> pd.Series:
        """Map a scaled column in ``df`` back to the original scale."""

# TODO: apply a log1p transformation if needed
# TODO: debug -- for some reason the results come out wrong

class PipelineRobustScaler(PipelineScaler):
    """Robust scaler fitted per ``(product_id, customer_id)`` group.

    Scales a column as ``(x - median) / IQR`` where the median and the
    inter-quartile range are computed independently for every group.
    """

    def fit(self, df: pd.DataFrame):
        """Compute the per-group median and IQR of ``self.column``.

        Args:
            df: Frame with ``product_id``, ``customer_id`` and ``self.column``.

        Returns:
            ``self``, to allow chaining.
        """
        grouped = df.groupby(['product_id', 'customer_id'])[self.column]
        median = grouped.median()
        # Built-in group quantiles instead of ``apply`` with a Python lambda.
        iqr = grouped.quantile(0.75) - grouped.quantile(0.25)

        agg = pd.DataFrame({
            f'{self.column}_median_scaler': median,
            f'{self.column}_iqr_scaler': iqr,
        })
        print(agg.head())
        self.scaler_data = agg
        return self

    def transform(self, df: pd.DataFrame) -> pd.Series:
        """Scale ``self.column`` with the fitted per-group statistics.

        Rows whose scaled value is undefined (zero IQR, or a group unseen
        during ``fit``) get ``0``; rows that were NaN in the input stay NaN.

        Args:
            df: Frame with ``product_id``, ``customer_id`` and ``self.column``.

        Returns:
            Series named ``"<column>_scaled"`` carrying the index of ``df``.

        Raises:
            ValueError: If the scaler has not been fitted.
        """
        if self.scaler_data is None:
            raise ValueError("Scaler has not been fitted yet.")

        median_col = f'{self.column}_median_scaler'
        iqr_col = f'{self.column}_iqr_scaler'

        original_nans = df[self.column].isna()
        merged = df.merge(self.scaler_data, on=['product_id', 'customer_id'], how='left')
        # The merge resets the index; restore the caller's one.
        merged.index = df.index

        scaled = (merged[self.column] - merged[median_col]) / merged[iqr_col]
        # Work on the Series and re-assign: ``df[col].replace(..., inplace=True)``
        # acts on a temporary under pandas Copy-on-Write and changes nothing.
        # NaN (not ``pd.NA``) keeps the result a plain float column.
        scaled = scaled.replace([float('inf'), float('-inf')], float('nan')).fillna(0)
        # Values missing in the input must stay missing in the output.
        scaled = scaled.where(~original_nans)
        return scaled.rename(f"{self.column}_scaled")

    def fit_transform(self, df: pd.DataFrame) -> pd.Series:
        """Fit on ``df`` and return its scaled column."""
        return self.fit(df).transform(df)

    def inverse_transform(self, df: pd.DataFrame) -> pd.Series:
        """Undo the scaling: ``x * IQR + median`` per group.

        Args:
            df: Frame with ``product_id``, ``customer_id`` and the scaled
                values stored under ``self.column``.

        Returns:
            Series named ``self.column`` carrying the index of ``df``.

        Raises:
            ValueError: If the scaler has not been fitted.
        """
        if self.scaler_data is None:
            raise ValueError("Scaler has not been fitted yet.")

        median_col = f'{self.column}_median_scaler'
        iqr_col = f'{self.column}_iqr_scaler'

        merged = df.merge(self.scaler_data, on=['product_id', 'customer_id'], how='left')
        # The merge resets the index; restore the caller's one.
        merged.index = df.index

        restored = merged[self.column] * merged[iqr_col] + merged[median_col]
        return restored.rename(self.column)


class PipelineStandarScaler(PipelineScaler):
    """Standard (z-score) scaler fitted per ``(product_id, customer_id)`` group.

    Scales a column as ``(x - mean) / std`` with the statistics computed
    independently for every group.
    """

    def fit(self, df: pd.DataFrame):
        """Compute the per-group mean and standard deviation of ``self.column``.

        Args:
            df: Frame with ``product_id``, ``customer_id`` and ``self.column``.

        Returns:
            ``self``, to allow chaining.
        """
        agg = df.groupby(['product_id', 'customer_id'])[self.column].agg(['mean', 'std']).rename(
            columns={'mean': f'{self.column}_mean_scaler', 'std': f'{self.column}_std_scaler'})
        self.scaler_data = agg
        return self

    def transform(self, df: pd.DataFrame) -> pd.Series:
        """Scale ``self.column`` with the fitted per-group statistics.

        Rows whose scaled value is undefined (zero or missing std, or a group
        unseen during ``fit``) get ``0``; rows that were NaN in the input
        stay NaN.

        Args:
            df: Frame with ``product_id``, ``customer_id`` and ``self.column``.

        Returns:
            Series named ``"<column>_scaled"`` carrying the index of ``df``.

        Raises:
            ValueError: If the scaler has not been fitted.
        """
        if self.scaler_data is None:
            raise ValueError("Scaler has not been fitted yet.")

        mean_col = f'{self.column}_mean_scaler'
        std_col = f'{self.column}_std_scaler'

        original_nans = df[self.column].isna()
        merged = df.merge(self.scaler_data, on=['product_id', 'customer_id'], how='left')
        # The merge resets the index; restore the caller's one.
        merged.index = df.index

        scaled = (merged[self.column] - merged[mean_col]) / merged[std_col]
        # Work on the Series and re-assign: ``df[col].replace(..., inplace=True)``
        # acts on a temporary under pandas Copy-on-Write and changes nothing.
        # NaN (not ``pd.NA``) keeps the result a plain float column.
        scaled = scaled.replace([float('inf'), float('-inf')], float('nan')).fillna(0)
        # Values missing in the input must stay missing in the output.
        scaled = scaled.where(~original_nans)
        return scaled.rename(f"{self.column}_scaled")

    def fit_transform(self, df: pd.DataFrame) -> pd.Series:
        """Fit on ``df`` and return its scaled column."""
        return self.fit(df).transform(df)

    def inverse_transform(self, df: pd.DataFrame) -> pd.Series:
        """Undo the scaling: ``x * std + mean`` per group.

        Args:
            df: Frame with ``product_id``, ``customer_id`` and the scaled
                values stored under ``self.column``.

        Returns:
            Series named ``self.column`` carrying the index of ``df``.

        Raises:
            ValueError: If the scaler has not been fitted.
        """
        if self.scaler_data is None:
            raise ValueError("Scaler has not been fitted yet.")

        mean_col = f'{self.column}_mean_scaler'
        std_col = f'{self.column}_std_scaler'

        merged = df.merge(self.scaler_data, on=['product_id', 'customer_id'], how='left')
        # The merge resets the index; restore the caller's one.
        merged.index = df.index

        # NOTE(review): groups whose fitted std is NaN (e.g. presumably
        # single-observation groups) come back as NaN here, while ``transform``
        # maps them to 0 -- confirm whether the mean should be returned instead.
        restored = merged[self.column] * merged[std_col] + merged[mean_col]
        return restored.rename(self.column)
    

class PipelineMinMaxScaler(PipelineScaler):
    """Min-max scaler fitted per ``(product_id, customer_id)`` group.

    The minimum is forced to ``0`` for every group, so the transformation is
    effectively ``x / max`` of the group.
    """

    def fit(self, df: pd.DataFrame):
        """Compute the per-group maximum of ``self.column`` (minimum fixed to 0).

        Args:
            df: Frame with ``product_id``, ``customer_id`` and ``self.column``.

        Returns:
            ``self``, to allow chaining.
        """
        agg = df.groupby(['product_id', 'customer_id'])[self.column].agg(['min', 'max']).rename(
            columns={'min': f'{self.column}_min_scaler', 'max': f'{self.column}_max_scaler'})
        # The minimum is pinned to 0 so every series is scaled from the same origin.
        agg[f'{self.column}_min_scaler'] = 0
        self.scaler_data = agg
        return self

    def transform(self, df: pd.DataFrame) -> pd.Series:
        """Scale ``self.column`` with the fitted per-group range.

        Rows whose scaled value is undefined (zero range, or a group unseen
        during ``fit``) get ``0``; rows that were NaN in the input stay NaN.

        Args:
            df: Frame with ``product_id``, ``customer_id`` and ``self.column``.

        Returns:
            Series named ``"<column>_scaled"`` carrying the index of ``df``.

        Raises:
            ValueError: If the scaler has not been fitted.
        """
        if self.scaler_data is None:
            raise ValueError("Scaler has not been fitted yet.")

        min_col = f'{self.column}_min_scaler'
        max_col = f'{self.column}_max_scaler'

        original_nans = df[self.column].isna()
        merged = df.merge(self.scaler_data, on=['product_id', 'customer_id'], how='left')
        # The merge resets the index; restore the caller's one.
        merged.index = df.index

        value_range = merged[max_col] - merged[min_col]
        scaled = (merged[self.column] - merged[min_col]) / value_range
        # Work on the Series and re-assign: ``df[col].replace(..., inplace=True)``
        # acts on a temporary under pandas Copy-on-Write and changes nothing.
        # NaN (not ``pd.NA``) keeps the result a plain float column.
        scaled = scaled.replace([float('inf'), float('-inf')], float('nan')).fillna(0)
        # Values missing in the input must stay missing in the output.
        scaled = scaled.where(~original_nans)
        return scaled.rename(f"{self.column}_scaled")

    def fit_transform(self, df: pd.DataFrame) -> pd.Series:
        """Fit on ``df`` and return its scaled column."""
        return self.fit(df).transform(df)

    def inverse_transform(self, df: pd.DataFrame) -> pd.Series:
        """Undo the scaling: ``x * (max - min) + min`` per group.

        Args:
            df: Frame with ``product_id``, ``customer_id`` and the scaled
                values stored under ``self.column``.

        Returns:
            Series named ``self.column`` carrying the index of ``df``.

        Raises:
            ValueError: If the scaler has not been fitted.
        """
        if self.scaler_data is None:
            raise ValueError("Scaler has not been fitted yet.")

        min_col = f'{self.column}_min_scaler'
        max_col = f'{self.column}_max_scaler'

        merged = df.merge(self.scaler_data, on=['product_id', 'customer_id'], how='left')
        # The merge resets the index; restore the caller's one.
        merged.index = df.index

        value_range = merged[max_col] - merged[min_col]
        restored = merged[self.column] * value_range + merged[min_col]
        return restored.rename(self.column)
    

class ScaleFeatureStep(PipelineStep):
    """Scale one column, or every column matching a regex, per product/customer."""

    def __init__(self, column: str, regex=False, override=False, scaler=PipelineStandarScaler, name = None,):
        """Configure the columns to scale and the scaler to use.

        Args:
            column: Column name, or a regular expression when ``regex`` is true.
            regex: Treat ``column`` as a regex selecting several columns.
            override: Write the scaled values over the source column instead of
                into a new ``"<column>_scaled"`` column.
            scaler: Scaler class; instantiated once per column with
                ``column=<name>``.
            name: Optional step name forwarded to ``PipelineStep``.
        """
        super().__init__(name)
        self.column = column
        self.scaler_cls = scaler
        self.regex = regex
        self.override = override

    def execute(self, df: pd.DataFrame, train_index) -> Dict:
        """Fit one scaler per selected column and add the scaled values to ``df``.

        Args:
            df: Frame with ``product_id``, ``customer_id`` and the target columns;
                it is modified in place.
            train_index: Accepted but not used by this implementation.

        Returns:
            Dict with ``"df"`` plus one ``"scaler_<output column>"`` entry per
            fitted scaler.

        Raises:
            ValueError: If ``regex`` is true and no column matches.
        """
        # NOTE(review): ``train_index`` is unused, so each scaler is fitted on
        # every row of ``df`` -- confirm whether fitting on the training rows
        # only was intended.
        if not self.regex:
            selected = [self.column]
        else:
            # Resolve the regex once, before any "<col>_scaled" column is added.
            selected = df.filter(regex=self.column, axis=1).columns.tolist()
            print(f"Columns found matching regex '{self.column}': {selected}")
            if not selected:
                raise ValueError(f"No columns found matching regex '{self.column}'")

        result = {"df": df}
        for col in selected:
            fitted = self.scaler_cls(column=col)
            out_col = col if self.override else f"{col}_scaled"
            fitted.fit(df[["product_id", "customer_id", col]])
            df[out_col] = fitted.transform(df[["product_id", "customer_id", col]])
            result[f"scaler_{out_col}"] = fitted
        return result
    

class InverseScalePredictionsStep(PipelineStep):
    """Bring predictions and the ``target`` column back to the original scale."""

    def execute(self, predictions, df, test_index, scaler_target=None) -> Dict:
        """Apply ``scaler_target.inverse_transform`` to predictions and target.

        Args:
            predictions: Indexed predictions (its ``.index`` is used to look up
                ``product_id`` / ``customer_id`` in ``df``).
            df: Frame with ``target``, ``product_id`` and ``customer_id``; its
                ``target`` column is overwritten in place.
            test_index: Index assigned to the returned predictions.
            scaler_target: Fitted scaler for the target; when falsy the step
                does nothing.

        Returns:
            ``None`` when no scaler is given, otherwise a dict with the
            un-scaled ``"predictions"`` (NaN replaced by 0) and the updated ``"df"``.
        """
        if not scaler_target:
            return None

        # Attach the grouping keys to the predictions; the assignment aligns
        # on the predictions' index.
        keyed = pd.DataFrame(predictions, index=predictions.index)
        keyed["product_id"] = df["product_id"]
        keyed["customer_id"] = df["customer_id"]
        # The scaler looks the values up under the name "target".
        keyed.columns = ["target", "product_id", "customer_id"]

        restored = pd.Series(scaler_target.inverse_transform(keyed), name="predictions")
        restored.index = test_index
        restored = restored.fillna(0)

        target_frame = df[["target", "product_id", "customer_id"]]
        df["target"] = scaler_target.inverse_transform(target_frame)

        return {"predictions": restored, "df": df}

In [None]:

# Single pipeline that runs the same load/scale/train/predict sequence twice:
# first with test_date="2019-10" followed by an evaluation step, then with
# test_date="2019-12" followed by the submission steps.
model_test_grande = Pipeline(
    steps=[
        # --- First run: test_date="2019-10", includes EvaluatePredictionsSteps ---
        LoadDataFrameFromPickleStep(path="df_fe_big.pickle"),
        SplitDataFrameStep(df="df", test_date="2019-10", gap=1),
        CreateTargetColumStep(target_col="tn"),
        # Adds a "<col>_scaled" column for every column matching ".*tn.*"
        # (default scaler of ScaleFeatureStep).
        ScaleFeatureStep(column=".*tn.*", override=False, regex=True),  
        # Overwrites "target" in place; the fitted scaler is returned under the
        # key "scaler_target".
        ScaleFeatureStep(column="target", override=True, scaler=PipelineMinMaxScaler),
        ReduceMemoryUsageStep(),
        PrepareXYStep(),
        TrainModelStep(folds=0, params={"max_bin":1024, 'num_leaves': 31, "n_estimators": 700, "learning_rate": 0.01}),
        PredictStep(),
        # Takes a `scaler_target` argument -- presumably the artifact produced by
        # the target ScaleFeatureStep above; verify against Pipeline's wiring.
        InverseScalePredictionsStep(),
        IntegratePredictionsStep(),
        EvaluatePredictionsSteps(),
        PlotFeatureImportanceStep(),
        # --- Second run: test_date="2019-12", same steps without evaluation ---
        LoadDataFrameFromPickleStep(path="df_fe_big.pickle"),
        SplitDataFrameStep(df="df", test_date="2019-12", gap=1),
        CreateTargetColumStep(target_col="tn"),
        ScaleFeatureStep(column=".*tn.*", override=False, regex=True),
        ScaleFeatureStep(column="target", override=True, scaler=PipelineMinMaxScaler),
        ReduceMemoryUsageStep(),

        PrepareXYStep(),
        TrainModelStep(folds=0, params={"max_bin":1024, 'num_leaves': 31, "n_estimators": 700, "learning_rate": 0.01}),
        PredictStep(),
        InverseScalePredictionsStep(),
        IntegratePredictionsStep(),
        PlotFeatureImportanceStep(),
        KaggleSubmissionStep(),
        # SaveSubmissionStep.execute needs `total_error`; no evaluation step runs
        # in this second pass, so it presumably comes from the first run -- TODO confirm.
        SaveSubmissionStep(exp_name="test_new_pipeline_150features"),
    ]
)