In [None]:
%load_ext autoreload
%autoreload 2

# wSAA

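> Weighted sample average approximation (wSAA) estimators built on random forests (scikit-learn and LightGBM), plus the featureless SAA baseline.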

In [None]:
#| default_exp wSAA

In [None]:
#| hide
from nbdev.showdoc import *

# from nbdev.qmd import *

## Packages

In [None]:
#| export
from __future__ import annotations
from fastcore.docments import *
from fastcore.test import *
from fastcore.utils import *

import pandas as pd
import numpy as np
import copy
from collections import defaultdict

from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.base import MetaEstimatorMixin
from lightgbm.sklearn import LGBMModel
from dddex.baseClasses import BaseWeightsBasedEstimator
from dddex.utils import restructureWeightsDataList, restructureWeightsDataList_multivariate

## wSAA - Random Forest

In [None]:
#| export 

class RandomForestWSAA(RandomForestRegressor, BaseWeightsBasedEstimator):
    """Random forest regressor that additionally exposes sample weights.

    After fitting, the leaf index of every training sample in every tree is cached.
    For a new sample, a training sample receives positive weight in a tree if both
    fall into the same leaf; the per-tree weights are averaged over all trees.
    """
    
    def fit(self, 
            X: np.ndarray, # Feature matrix
            y: np.ndarray, # Target values
            **kwargs):
        """Fit the forest and cache the training targets and training leaf indices.

        Returns `self`, following the scikit-learn estimator convention, so calls
        can be chained.
        """

        super().fit(X = X, 
                    y = y, 
                    **kwargs)
        
        # Both attributes are read again by `getWeights`.
        self.yTrain = y
        self.leafIndicesTrain = self.apply(X)
        
        return self
    
    #---
    
    def getWeights(self, 
                   X: np.ndarray, # Feature matrix for which conditional density estimates are computed.
                   # Specifies structure of the returned density estimates. One of: 
                   # 'all', 'onlyPositiveWeights', 'summarized', 'cumDistribution', 'cumDistributionSummarized'
                   outputType: str='onlyPositiveWeights', 
                   # Optional. List with length X.shape[0]. Values are multiplied to the estimated 
                   # density of each sample for scaling purposes.
                   scalingList: list=None, 
                   ) -> list: # List whose elements are the conditional density estimates for the samples specified by `X`.
        """Compute the weights of the training samples for every row of `X`.

        The raw (weights, indices) pairs are handed to `restructureWeightsDataList`
        (or its multivariate counterpart if `yTrain` has more than one column), which
        shapes the output according to `outputType`.

        Raises `ValueError` if `yTrain` is multivariate and `outputType` is not one of
        'all', 'onlyPositiveWeights', 'summarized'.
        """
        
        leafIndicesTest = self.apply(X)
        
        weightsDataList = list()

        for leafIndices in leafIndicesTest:
            # 1 where a training sample shares the leaf of the current sample, else 0.
            leafComparisonMatrix = (self.leafIndicesTrain == leafIndices) * 1
            nObsInSameLeaf = np.sum(leafComparisonMatrix, axis = 0)

            # Guard for leaf indices that arrive as a 1-D array (a single tree): in that
            # case there is nothing to average over.
            # NOTE(review): presumably `apply` always returns a 2-D array here - confirm.
            if leafComparisonMatrix.ndim == 1:
                weights = leafComparisonMatrix / nObsInSameLeaf
            else:
                # Normalise per tree, then average the per-tree weights over all trees.
                weights = np.mean(leafComparisonMatrix / nObsInSameLeaf, axis = 1)

            weightsPosIndex = np.where(weights > 0)[0]

            weightsDataList.append((weights[weightsPosIndex], weightsPosIndex))

        #---

        yTrain = self.yTrain
        isMultivariate = len(yTrain.shape) > 1 and yTrain.shape[1] > 1
        
        if isMultivariate:

            if outputType not in ['all', 'onlyPositiveWeights', 'summarized']:
                raise ValueError("outputType must be one of 'all', 'onlyPositiveWeights', 'summarized' for multivariate y.")
            
            return restructureWeightsDataList_multivariate(weightsDataList = weightsDataList, 
                                                           outputType = outputType, 
                                                           y = yTrain, 
                                                           scalingList = scalingList,
                                                           equalWeights = False)
        
        # A 2-D target with a single column is a univariate problem. Previously this case
        # skipped the restructuring step entirely, so `outputType` and `scalingList` were ignored.
        # NOTE(review): the column is flattened on the assumption that
        # `restructureWeightsDataList` expects a 1-D target - confirm.
        if len(yTrain.shape) > 1:
            yTrain = np.ravel(yTrain)
        
        return restructureWeightsDataList(weightsDataList = weightsDataList, 
                                          outputType = outputType, 
                                          y = yTrain, 
                                          scalingList = scalingList,
                                          equalWeights = False)
    
    #---
    
    def predict(self: BaseWeightsBasedEstimator, 
                X: np.ndarray, # Feature matrix for which conditional quantiles are computed.
                probs: list, # Probabilities for which quantiles are computed.
                outputAsDf: bool=True, # Determines output. Either a dataframe with probs as columns or a dict with probs as keys.
                # Optional. List with length X.shape[0]. Values are multiplied to the predictions
                # of each sample to rescale values.
                scalingList: list=None, 
                ): 
        """Quantile prediction; delegates to `BaseWeightsBasedEstimator.predict`.

        `outputAsDf` is accepted for interface compatibility but is not forwarded.
        """
        
        # Call the weights-based implementation explicitly instead of navigating the MRO
        # with `super(MetaEstimatorMixin, self)`, which depends on scikit-learn's class hierarchy.
        return BaseWeightsBasedEstimator.predict(self,
                                                 X = X,
                                                 probs = probs, 
                                                 scalingList = scalingList)
    
    #---
    
    def pointPredict(self,
                     X: np.ndarray, # Feature Matrix
                     **kwargs):
        """Original `predict` method to generate point forecasts"""
        
        return super().predict(X = X,
                               **kwargs)


## wSAA - Random Forest2

In [None]:
# #| export 

# # We attempt here to speed up the computation of the weights by interpreting every single
# # tree as a lookup table. This way we don't have to compare the leaf-Indices arrays of each
# # training sample and each test sample.
# # Unfortunately, despite the fact that this strategy works very well for a single tree,
# # it doesn't work for the whole forest because the structure of the output of the lookup 
# # tables per tree makes it difficult to aggregate the received weights per tree 
# # over all trees.

# class RandomForestWSAA2(RandomForestRegressor, BaseWeightsBasedEstimator):
    
#     def fit(self, 
#             X: np.ndarray, # Feature matrix
#             y: np.ndarray, # Target values
#             **kwargs):

#         super().fit(X = X, 
#                     y = y, 
#                     **kwargs)
        
#         self.yTrain = y
        
#         leafIndices = self.apply(X)

#         indicesPerBinPerTree = list()

#         for indexTree in range(self.n_estimators):
#             leafIndicesPerTree = leafIndices[:, indexTree]

#             indicesPerBin = defaultdict(list)

#             for index, leafIndex in enumerate(leafIndicesPerTree):
#                 indicesPerBin[leafIndex].append(index)

#             indicesPerBinPerTree.append(indicesPerBin)
        
#         self.indicesPerBinPerTree = indicesPerBinPerTree

        
    
#     #---
    
#     def getWeights(self, 
#                    X: np.ndarray, # Feature matrix for which conditional density estimates are computed.
#                    # Specifies structure of the returned density estimates. One of: 
#                    # 'all', 'onlyPositiveWeights', 'summarized', 'cumDistribution', 'cumDistributionSummarized'
#                    outputType: str='onlyPositiveWeights', 
#                    # Optional. List with length X.shape[0]. Values are multiplied to the estimated 
#                    # density of each sample for scaling purposes.
#                    scalingList: list=None, 
#                    ) -> list: # List whose elements are the conditional density estimates for the samples specified by `X`.
        
#         __doc__ = BaseWeightsBasedEstimator.getWeights.__doc__
        
#         #---
        
#         leafIndicesPerTree = self.apply(X)
        
#         weightsDataList = list()

#         for leafIndices in leafIndicesPerTree:
            
#             weights = np.zeros(self.yTrain.shape[0])

#             for indexTree in range(len(leafIndices)):
#                 indicesPosWeight = self.indicesPerBinPerTree[indexTree][leafIndices[indexTree]]

#                 weightsNew = np.zeros(self.yTrain.shape[0])
#                 np.put(weightsNew, indicesPosWeight, 1 / len(indicesPosWeight))
                
#                 weights = weights + weightsNew

#             weights = weights / len(leafIndices)

#             weightsPosIndex = np.where(weights > 0)[0]

#             weightsDataList.append((weights[weightsPosIndex], weightsPosIndex))

#         #---

#         # Check if self.yTrain is a 2D array with more than one column.
#         if len(self.yTrain.shape) > 1:
#             if self.yTrain.shape[1] > 1:

#                 if not outputType in ['all', 'onlyPositiveWeights', 'summarized']:
#                     raise ValueError("outputType must be one of 'all', 'onlyPositiveWeights', 'summarized' for multivariate y.")
                
#                 weightsDataList = restructureWeightsDataList_multivariate(weightsDataList = weightsDataList, 
#                                                                         outputType = outputType, 
#                                                                         y = self.yTrain, 
#                                                                         scalingList = scalingList,
#                                                                         equalWeights = False) 
            
#         else:
#             weightsDataList = restructureWeightsDataList(weightsDataList = weightsDataList, 
#                                                         outputType = outputType, 
#                                                         y = self.yTrain, 
#                                                         scalingList = scalingList,
#                                                         equalWeights = False)
            
        
                

#         return weightsDataList
    
#     #---
    
#     def predict(self: BaseWeightsBasedEstimator, 
#                 X: np.ndarray, # Feature matrix for which conditional quantiles are computed.
#                 probs: list, # Probabilities for which quantiles are computed.
#                 outputAsDf: bool=True, # Determines output. Either a dataframe with probs as columns or a dict with probs as keys.
#                 # Optional. List with length X.shape[0]. Values are multiplied to the predictions
#                 # of each sample to rescale values.
#                 scalingList: list=None, 
#                 ): 
        
#         __doc__ = BaseWeightsBasedEstimator.predict.__doc__
        
#         return super(MetaEstimatorMixin, self).predict(X = X,
#                                                        probs = probs, 
#                                                        scalingList = scalingList)
    
#     #---
    
#     def pointPredict(self,
#                      X: np.ndarray, # Feature Matrix
#                      **kwargs):
#         """Original `predict` method to generate point forecasts"""
        
#         return super().predict(X = X,
#                                **kwargs)


## wSAA - Random Forest LightGBM

In [None]:
#| export 

class RandomForestWSAA_LGBM(LGBMRegressor, BaseWeightsBasedEstimator):
    """LightGBM regressor that additionally exposes sample weights.

    After fitting, the leaf index of every training sample in every tree is cached.
    For a new sample, a training sample receives positive weight in a tree if both
    fall into the same leaf; the per-tree weights are averaged over all trees.
    """
    
    def fit(self, 
            X: np.ndarray, # Feature matrix
            y: np.ndarray, # Target values
            **kwargs):
        """Fit the model and cache the training targets and training leaf indices.

        Returns `self`, following the scikit-learn estimator convention, so calls
        can be chained.
        """

        super().fit(X = X, 
                    y = y, 
                    **kwargs)
        
        # Both attributes are read again by `getWeights`.
        self.yTrain = y
        self.leafIndicesTrain = self.pointPredict(X, pred_leaf = True)
        
        return self
    
    #---
    
    def getWeights(self, 
                   X: np.ndarray, # Feature matrix for which conditional density estimates are computed.
                   # Specifies structure of the returned density estimates. One of: 
                   # 'all', 'onlyPositiveWeights', 'summarized', 'cumDistribution', 'cumDistributionSummarized'
                   outputType: str='onlyPositiveWeights', 
                   # Optional. List with length X.shape[0]. Values are multiplied to the estimated 
                   # density of each sample for scaling purposes.
                   scalingList: list=None, 
                   ) -> list: # List whose elements are the conditional density estimates for the samples specified by `X`.
        """Compute the weights of the training samples for every row of `X`.

        The raw (weights, indices) pairs are handed to `restructureWeightsDataList`,
        which shapes the output according to `outputType`.
        """
        
        leafIndicesTest = self.pointPredict(X, pred_leaf = True)
        
        weightsDataList = list()

        for leafIndices in leafIndicesTest:
            # 1 where a training sample shares the leaf of the current sample, else 0.
            leafComparisonMatrix = (self.leafIndicesTrain == leafIndices) * 1
            nObsInSameLeaf = np.sum(leafComparisonMatrix, axis = 0)

            # It can happen that the model ends up with a single tree only (per the original
            # note: e.g. when min_child_samples is too high). The leaf indices are then 1-D
            # and there is nothing to average over.
            if leafComparisonMatrix.ndim == 1:
                weights = leafComparisonMatrix / nObsInSameLeaf
            else:
                # Normalise per tree, then average the per-tree weights over all trees.
                weights = np.mean(leafComparisonMatrix / nObsInSameLeaf, axis = 1)

            weightsPosIndex = np.where(weights > 0)[0]

            weightsDataList.append((weights[weightsPosIndex], weightsPosIndex))

        #---

        return restructureWeightsDataList(weightsDataList = weightsDataList, 
                                          outputType = outputType, 
                                          y = self.yTrain, 
                                          scalingList = scalingList,
                                          equalWeights = False)
    
    #---
    
    def predict(self: BaseWeightsBasedEstimator, 
                X: np.ndarray, # Feature matrix for which conditional quantiles are computed.
                probs: list, # Probabilities for which quantiles are computed.
                outputAsDf: bool=True, # Determines output. Either a dataframe with probs as columns or a dict with probs as keys.
                # Optional. List with length X.shape[0]. Values are multiplied to the predictions
                # of each sample to rescale values.
                scalingList: list=None, 
                ): 
        """Quantile prediction; delegates to `BaseWeightsBasedEstimator.predict`.

        `outputAsDf` is accepted for interface compatibility but is not forwarded.
        """
        
        # Call the weights-based implementation explicitly instead of navigating the MRO
        # with `super(LGBMModel, self)`, which depends on LightGBM's class hierarchy.
        return BaseWeightsBasedEstimator.predict(self,
                                                 X = X,
                                                 probs = probs, 
                                                 scalingList = scalingList)
    
    #---
    
    def pointPredict(self,
                     X: np.ndarray, # Feature Matrix
                     **kwargs):
        """Original `predict` method to generate point forecasts"""
        
        return super().predict(X = X,
                               **kwargs)


## SAA

In [None]:
#| export

class SampleAverageApproximation(BaseWeightsBasedEstimator):
    """SAA is a featureless approach that assumes the density of the target variable is given
    by assigning equal probability to each historical observation of said target variable."""
    
    def __init__(self):
        
        self.yTrain = None
        
    #---
        
    def __str__(self):
        return "SAA()"
    __repr__ = __str__ 
    
    #---
    
    def fit(self: SampleAverageApproximation, 
            y: np.ndarray, # Target values which form the estimated density function based on the SAA algorithm.
            ):
        """Store the training targets. Returns `self` so calls can be chained."""
        
        self.yTrain = y
        
        return self
    
    #---
    
    def getWeights(self, 
                   X: np.ndarray=None, # Feature matrix for which conditional density estimates are computed.
                   # Specifies structure of the returned density estimates. One of: 
                   # 'all', 'onlyPositiveWeights', 'summarized', 'cumDistribution', 'cumDistributionSummarized'
                   outputType: str='onlyPositiveWeights', 
                   # Optional. List with length X.shape[0]. Values are multiplied to the estimated 
                   # density of each sample for scaling purposes.
                   scalingList: list=None, 
                   ) -> list: # List whose elements are the conditional density estimates for the samples specified by `X`.
        """Return equal weights (1 / number of training observations) for every training sample.

        `X` is only used for its number of rows. If `X` is None, the result describes
        a single observation.

        Raises `ValueError` if the estimator has not been fitted.
        """
        
        if self.yTrain is None:
            raise ValueError("This SampleAverageApproximation instance is not fitted yet. Call 'fit' first.")
        
        nTrain = len(self.yTrain)
        
        # If no per-sample treatment is necessary, the weights data is computed for a single
        # observation only and simply duplicated afterwards.
        # NOTE(review): in this case `scalingList` is still forwarded alongside a single
        # entry (as before) - confirm that `restructureWeightsDataList` tolerates that.
        shareSingleEstimate = (X is None) or (scalingList is None) or (outputType == 'onlyPositiveWeights')
        nEstimates = 1 if shareSingleEstimate else X.shape[0]
        
        weightsDataList = [(np.repeat(1 / nTrain, nTrain), np.arange(nTrain)) for _ in range(nEstimates)]
        
        weightsDataList = restructureWeightsDataList(weightsDataList = weightsDataList, 
                                                     outputType = outputType, 
                                                     y = self.yTrain,
                                                     scalingList = scalingList,
                                                     equalWeights = True)
        
        if shareSingleEstimate and X is not None:
            # List repetition: all rows share the same underlying objects.
            weightsDataList = weightsDataList * X.shape[0]

        return weightsDataList
    
    #---
    
    def predict(self: SampleAverageApproximation, 
                X: np.ndarray, # Feature matrix for which conditional quantiles are computed.
                probs: list, # Probabilities for which quantiles are computed.
                # Optional. List with length X.shape[0]. Values are multiplied to the predictions
                # of each sample to rescale values.
                scalingList: list=None, 
                ) -> pd.DataFrame: # One row per row of `X` (a single row if `X` is None), one column per entry of `probs`.
        
        """
        Predict p-quantiles based on a reweighting of the empirical distribution function.
        In comparison to all other weights-based approaches, SAA only needs to compute
        the quantile predictions for one observation and then simply duplicate them.

        Raises `ValueError` if `probs` can't be interpreted as a 1-dimensional numeric
        array or contains values outside of [0, 1].
        """
        
        # CHECKS
        # A scalar (including numpy scalars) becomes an array of length one.
        try:
            probs = np.atleast_1d(np.asarray(probs))
        except (TypeError, ValueError):
            raise ValueError("Can't convert `probs` to 1-dimensional array.")
        
        if probs.ndim != 1 or probs.dtype.kind not in 'biuf':
            raise ValueError("Can't convert `probs` to 1-dimensional array.")
                 
        if np.any((probs > 1) | (probs < 0)):
            raise ValueError("The values specified via 'probs' must lie between 0 and 1!")
        
        #---

        # NOTE(review): the parameter documentation of `getWeights` lists 'cumDistribution',
        # while 'cumulativeDistribution' is passed here - confirm which name
        # `restructureWeightsDataList` actually accepts.
        distributionData = self.getWeights(X = None,
                                           outputType = 'cumulativeDistribution',
                                           scalingList = None)        

        # A tolerance term of 10^-8 is subtracted from prob to account for rounding errors due to numerical precision.
        quantileIndices = np.searchsorted(a = distributionData[0][0], v = probs - 10**-8, side = 'left')
        quantiles = distributionData[0][1][quantileIndices]
        
        # The quantiles are identical for every sample, so the single row is simply repeated.
        # Building the frame from one tiled array also works for zero rows.
        nRows = 1 if X is None else X.shape[0]
        quantilesDf = pd.DataFrame(np.tile(np.asarray(quantiles), (nRows, 1)), columns = probs)
        
        if scalingList is not None:
            # Row i is multiplied by scalingList[i].
            quantilesDf = quantilesDf.mul(np.asarray(scalingList), axis = 0)
        
        return quantilesDf
    

In [None]:
#| hide
# Run nbdev's export step, which writes the `#| export` cells to the library modules.
import nbdev; nbdev.nbdev_export()

# Test Code

In [None]:
# #| hide

# from lightgbm import LGBMRegressor
# import lightgbm as lgb
# from dddex.loadData import *
# from datasetsDynamic.loadDataYaz import loadDataYaz
# import ipdb
# import inspect
# from sklearn.base import RegressorMixin
# from sklearn.ensemble import RandomForestRegressor


# data, XTrain, yTrain, XTest, yTest = loadDataBakery()

In [None]:
# #| hide

# data, XTrain, yTrain, XTest, yTest = loadDataYaz(testDays = 14,
#                                                  daysToCut = 0,
#                                                  normalizeDemand = True,
#                                                  unstacked = True,
#                                                  returnXY = True)

# # RF = RandomForestRegressor(n_estimators = 10, n_jobs = 1, max_depth = 3)
# # RF.fit(X = XTrain, y = yTrain)

# # Duplicate XTrain and yTrain m times
# m = 100
# XTrain = np.vstack([XTrain for i in range(m)])
# yTrain = np.vstack([yTrain for i in range(m)])

# print(XTrain.shape)
# print(yTrain.shape)

# # Add gaussian to XTrain and yTrain
# XTrain = XTrain + np.random.normal(0, 0.1, XTrain.shape)
# yTrain = yTrain + np.random.normal(0, 0.1, yTrain.shape)

(72300, 227)
(72300, 7)


In [None]:
# %%time
# RFWSAA = RandomForestWSAA(n_estimators = 10, n_jobs = 1, max_depth = 4)
# RFWSAA.fit(X = XTrain, y = yTrain)

CPU times: user 51.7 s, sys: 0 ns, total: 51.7 s
Wall time: 51.7 s


In [None]:
# %%time
# RFWSAA2 = RandomForestWSAA2(n_estimators = 10, max_depth = 4, n_jobs = 1)
# RFWSAA2.fit(X = XTrain, y = yTrain)

CPU times: user 51.7 s, sys: 0 ns, total: 51.7 s
Wall time: 51.7 s


In [None]:
# n = 10000

In [None]:
# %%time
# weights = RFWSAA.getWeights(X = XTrain[:n])

CPU times: user 15.4 s, sys: 2.92 s, total: 18.3 s
Wall time: 18.3 s


In [None]:
# %%time
# weights2 = RFWSAA2.getWeights(X = XTrain[:n])

CPU times: user 1min 20s, sys: 1.44 s, total: 1min 21s
Wall time: 1min 21s


In [None]:
# RF.apply(XTrain).shape

# indicesPerBinPerTree = list()

# for indexTree, tree in enumerate(RF.estimators_):
#     leafIndicesTrain = tree.apply(XTrain)

#     indicesPerBin = defaultdict(list)

#     for index, leafIndex in enumerate(leafIndicesTrain):
#         indicesPerBin[leafIndex].append(index)

#     indicesPerBinPerTree.append(indicesPerBin)
    
    


In [None]:
# #| hide

# RF = RandomForestWSAA_LGBM(max_depth = 2,
#                            n_estimators = 10,
#                            n_jobs = 1,
#                            boosting_type = 'rf',
#                            subsample_freq = 1,
#                            subsample = 0.9)                           