In [1]:
%load_ext autoreload
%autoreload 2

# Level-Set Based Kernel Density Estimation
> Defining the classes `LevelSetKDEx` and `LevelSetKDEx_kNN` which turn any point predictor into a conditional kernel density estimator.

In [2]:
#| default_exp levelSetKDEx

In [3]:
#| hide
from nbdev.showdoc import *
# from nbdev.qmd import *

In [4]:
#| export
from __future__ import annotations
from fastcore.docments import *
from fastcore.test import *
from fastcore.utils import *

import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors 
from collections import defaultdict, Counter
from joblib import Parallel, delayed, dump, load

from dddex.basePredictor import BasePredictor, restructureWeightsDataList
from dddex.wSAA import SAA

In the following we define the classes `LevelSetKDEx` and `LevelSetKDEx_kNN` where KDE is short for 'Kernel Density Estimator' and the 'x' is supposed to signal that both classes can be defined based on any arbitrary point predictor. The name 'LevelSet' stems from the fact that every approach presented in this notebook interprets the values of the point forecasts as a similarity measure between samples. The point predictor is specified by the argument `estimator`, must have a `.predict()`-method and should have been trained beforehand. 

Both classes `LevelSetKDEx` and `LevelSetKDEx_kNN` fulfill the same task: By first running `.fit(XTrain, yTrain)` and then calling `.getWeights(XTest)`, they both output an estimation of the conditional density of every sample specified by 'XTest'. The basic idea for both approaches is also identical: Suppose we have a single test sample at hand. At first, we compare the value of the point prediction of this sample with the values of the point predictions of the training samples, computed via `estimator.predict(XTest)` and `estimator.predict(XTrain)`, respectively. Based on this comparison, we select 'binSize'-many training samples that we deem the most similar to the test sample at hand. The concrete way we select the training samples constitutes the only difference between `LevelSetKDEx` and `LevelSetKDEx_kNN`. Finally, the empirical distribution of the y-values of these training samples then acts as our estimation of the conditional distribution.

Further details on how both approaches work can be found below.

## Level-Set Approach based on Bin Building

In [5]:
#| export

class LevelSetKDEx(BasePredictor):
    """
    `LevelSetKDEx` turns any point predictor that has a .predict-method 
    into an estimator of the conditional density of the underlying distribution.
    
    During `fit`, the point forecasts of the training samples are grouped into 
    bins via `generateBins`. In `getWeights`, every new sample is assigned to the 
    bin whose value range contains the point forecast of the new sample, and all 
    training samples of that bin receive the same weight `1 / (size of the bin)`.
    
    NOTE: 'estimator' is only used via its .predict-method and is never (re-)fitted 
    by this class.
    """
    
    def __init__(self, 
                 estimator, # (Fitted) object with a .predict-method.
                 binSize: int | None = None # Size of the bins created to group the training samples.
                 ):
        
        if not (hasattr(estimator, 'predict') and callable(estimator.predict)):
            raise ValueError("'estimator' has to have a 'predict'-method!")
            
        if not (isinstance(binSize, (int, np.integer)) or binSize is None):
            raise ValueError("'binSize' has to be integer (or None if it is supposed to be tuned)!")
        
        self.estimator = estimator
        self.binSize = binSize
        
        # The following attributes are set by 'fit'.
        self.y = None
        self.yPred = None
        self.indicesPerBin = None
        self.lowerBoundPerBin = None
        
    #---
    
    def __str__(self):
        return f"LevelSetKDEx(estimator = {self.estimator}, binSize = {self.binSize})"
    __repr__ = __str__      
    
    #---
    
    def fit(self, 
            X: np.ndarray, # Feature matrix used by 'estimator' to predict 'y'.
            y: np.ndarray, # 1-dimensional target variable corresponding to the features 'X'.
            ):
        """
        Computes the point forecasts of 'X' and groups the training samples into bins
        based on these forecasts.
        
        Raises a ValueError if 'binSize' has not been specified or if 'binSize' is bigger 
        than the number of training samples.
        """
        
        # 'binSize = None' is a valid constructor argument (e.g. if the bin size is supposed to
        # be tuned), but the bins can't be built without a concrete value.
        if self.binSize is None:
            raise ValueError("'binSize' has to be specified before calling 'fit'!")
        
        # IMPORTANT: In case 'y' is given as a pandas.Series, we can potentially run into indexing 
        # problems later on. The conversion also makes sure that 'y' has a 'shape'-attribute.
        y = np.array(y)

        if self.binSize > y.shape[0]:
            raise ValueError("'binSize' mustn't be bigger than the size of 'y'!")
        
        # The bins are built via positional indexing, so the predictions have to be a plain array.
        yPred = np.asarray(self.estimator.predict(X))
        
        indicesPerBin, lowerBoundPerBin = generateBins(binSize = self.binSize,
                                                       yPred = yPred)

        self.y = y
        self.yPred = yPred
        self.indicesPerBin = indicesPerBin
        self.lowerBoundPerBin = lowerBoundPerBin
        
    #---
    
    def getWeights(self, 
                   X: np.ndarray, # Feature matrix of samples for which conditional density estimates are computed.
                   outputType: 'all' | # Specifies structure of output.
                               'onlyPositiveWeights' | 
                               'summarized' | 
                               'cumulativeDistribution' | 
                               'cumulativeDistributionSummarized' = 'onlyPositiveWeights', 
                   scalingList: list | np.ndarray | None = None, # List or array with same size as self.y containing floats being multiplied with self.y.
                   ):
        """
        Computes the weights of the training samples for every sample given by 'X'.
        The result is the output of `restructureWeightsDataList`, whose structure is 
        determined by 'outputType'.
        
        Raises a ValueError if 'fit' has not been called beforehand.
        """
        
        if self.lowerBoundPerBin is None or self.indicesPerBin is None:
            raise ValueError("'fit' has to be called before calling 'getWeights'!")
        
        # 'lowerBoundPerBin' is sorted in ascending order and its first entry is -inf. With
        # side = 'right', the insertion position minus 1 is therefore the index of the last
        # bin whose lower bound is <= the prediction.
        binPerPred = np.searchsorted(a = self.lowerBoundPerBin, v = self.estimator.predict(X), side = 'right') - 1
        neighborsList = [self.indicesPerBin[binIndex] for binIndex in binPerPred]
        
        # Every training sample of the selected bin gets the same weight.
        weightsDataList = [(np.repeat(1 / len(neighbors), len(neighbors)), np.array(neighbors)) for neighbors in neighborsList]
        
        weightsDataList = restructureWeightsDataList(weightsDataList = weightsDataList, 
                                                     outputType = outputType, 
                                                     y = self.y,
                                                     scalingList = scalingList,
                                                     equalWeights = True)
        
        return weightsDataList
    

In [6]:
# show_doc(LevelSetKDEx)

In [7]:
# show_doc(LevelSetKDEx.fit)

In [8]:
# show_doc(LevelSetKDEx.getWeights)

#### Generate Bins

In [9]:
#| export

def generateBins(binSize: int, # Size of the bins of values being grouped together.
                 yPred: np.ndarray, # 1-dimensional array of predicted values.
                 ):
    """
    Used to generate the bin-structure induced by the Level-Set-Forecaster algorithm.
    
    The predictions are sorted in ascending order and grouped into consecutive bins.
    A bin is closed as soon as it contains at least 'binSize'-many predictions AND the 
    next prediction is strictly bigger than the current one, so identical predictions 
    always end up in the same bin. If fewer than 'binSize'-many predictions are left, 
    they are all added to the current bin.
    
    Returns a tuple `(indicesPerBin, lowerBoundPerBin)`:
    - `indicesPerBin`: dict mapping each bin index to an array with the positions 
      (with respect to 'yPred') of the predictions belonging to this bin.
    - `lowerBoundPerBin`: pandas.Series (index named 'binIndex') with the lower bound of 
      each bin. The first bound is -inf, every other bound is the midpoint between the 
      biggest prediction of the previous bin and the smallest prediction of the bin itself.
    
    Raises a ValueError if 'binSize' is smaller than 1 or if 'yPred' is not 1-dimensional.
    """
    
    if binSize < 1:
        raise ValueError("'binSize' has to be a positive integer!")
    
    # Positional indexing is required below. A pandas.Series with a non-default index would
    # be indexed by label instead, so we always work on a plain array.
    yPred = np.asarray(yPred)
    
    if yPred.ndim != 1:
        raise ValueError("'yPred' has to be 1-dimensional!")
    
    nSamples = len(yPred)
    predIndicesSort = np.argsort(yPred)
    yPredSorted = yPred[predIndicesSort]

    currentBinSize = 0
    binIndex = 0
    indicesPerBin = defaultdict(list)
    lowerBoundPerBin = dict()
    
    if nSamples > 0:
        # '-np.inf' instead of 'np.NINF', because the latter alias has been removed in NumPy 2.0.
        lowerBoundPerBin[binIndex] = -np.inf
    
    for i in range(nSamples):
        
        currentBinSize += 1
        trainIndicesLeft = nSamples - (i + 1)

        indicesPerBin[binIndex].append(predIndicesSort[i])
        
        # Not enough predictions left to fill another bin: the remaining ones join the current bin.
        # As 'binSize >= 1', this is always triggered for the last prediction at the latest,
        # which is why 'yPredSorted[i+1]' below can't be out of bounds.
        if trainIndicesLeft < binSize:
            indicesPerBin[binIndex].extend(predIndicesSort[i + 1:])
            break

        # Only close the bin between two distinct values to keep ties together.
        if currentBinSize >= binSize and yPredSorted[i] < yPredSorted[i+1]:
            lowerBoundPerBin[binIndex + 1] = (yPredSorted[i] + yPredSorted[i+1]) / 2
            binIndex += 1
            currentBinSize = 0
           
    indicesPerBin = {binIndex: np.array(indices) for binIndex, indices in indicesPerBin.items()}
    
    lowerBoundPerBin = pd.Series(lowerBoundPerBin)
    lowerBoundPerBin.index.name = 'binIndex'
    
    return indicesPerBin, lowerBoundPerBin

## Level-Set Approach based on kNN

In [10]:
#| export

class LevelSetKDEx_kNN(BasePredictor):
    """
    `LevelSetKDEx_kNN` turns any point predictor that has a .predict-method 
    into an estimator of the conditional density of the underlying distribution.
    The basic idea of each level-set based approach is to interpret the point forecast
    generated by the underlying point predictor as a similarity measure of samples.
    In the case of the `LevelSetKDEx_kNN` defined here, for every new sample
    'binSize'-many training samples are computed whose point forecast is closest
    to the point forecast of the new sample.
    The resulting empirical distribution of these 'nearest' training samples is 
    viewed as our estimation of the conditional distribution of the new sample 
    at hand.
    
    NOTE 1: The `LevelSetKDEx_kNN` class can only be applied to estimators that 
    have been fitted already.
    
    NOTE 2: The current implementation reshapes the point forecasts to a single 
    column, so 'estimator.predict' is expected to return 1-dimensional predictions.
    """
    
    def __init__(self, 
                 estimator, # Object with a .predict-method (fitted).
                 binSize: int | None = None, # Size of the neighbors considered to compute conditional density.
                 ):
        
        if not (hasattr(estimator, 'predict') and callable(estimator.predict)):
            raise ValueError("'estimator' has to have a 'predict'-method!")
            
        # 'None' is the declared default of 'binSize' and is accepted here (just like in 
        # `LevelSetKDEx`); a concrete value is only required once 'fit' is called.
        if not (isinstance(binSize, (int, np.integer)) or binSize is None):
            raise ValueError("'binSize' has to be integer (or None if it is supposed to be tuned)!")
        
        self.estimator = estimator
        self.binSize = binSize
        
        # The following attributes are set by 'fit'.
        self.y = None
        self.yPred = None
        self.nearestNeighborsOnPreds = None
        
    #---
    
    def __str__(self):
        return f"LevelSetKDEx_kNN(estimator = {self.estimator}, binSize = {self.binSize})"
    __repr__ = __str__   
    
    #---
    
    def fit(self, 
            X: np.ndarray, # Feature matrix used by 'estimator' to predict 'y'.
            y: np.ndarray, # Target variable corresponding to features 'X'.
            ):
        """
        Computes the point forecasts of 'X' and fits a nearest-neighbors search structure 
        on these forecasts.
        
        Raises a ValueError if 'binSize' has not been specified or if 'binSize' is bigger 
        than the number of training samples.
        """
        
        if self.binSize is None:
            raise ValueError("'binSize' has to be specified before calling 'fit'!")
        
        # IMPORTANT: In case 'y' is given as a pandas.Series, we can potentially run into indexing 
        # problems later on. The conversion also makes sure that 'y' has a 'shape'-attribute.
        y = np.array(y)

        if self.binSize > y.shape[0]:
            raise ValueError("'binSize' mustn't be bigger than the size of 'y'!")

        yPred = np.asarray(self.estimator.predict(X))
        
        # The neighbors search expects a 2-dimensional input, so the predictions become a single column.
        yPred_reshaped = np.reshape(yPred, (len(yPred), 1))

        nn = NearestNeighbors(algorithm = 'kd_tree')
        nn.fit(X = yPred_reshaped)

        #---

        self.y = y
        self.yPred = yPred
        self.nearestNeighborsOnPreds = nn
        
    #---
    
    @staticmethod
    def _inverseDistanceWeights(distances, neighbors):
        "Returns (weights, neighbors) with weights proportional to the inverse distances."
        
        distances = np.asarray(distances, dtype = float)
        neighbors = np.asarray(neighbors)
        
        # A distance of zero would yield an infinite inverse distance and NaN weights after
        # normalization. In that case only the exact matches are kept and weighted equally.
        exactMatches = distances == 0
        
        if exactMatches.any():
            neighbors = neighbors[exactMatches]
            return np.repeat(1 / len(neighbors), len(neighbors)), neighbors
        
        inverseDistances = 1 / distances
        
        return inverseDistances / inverseDistances.sum(), neighbors
    
    #---
    
    def getWeights(self, 
                   X: np.ndarray, # Feature matrix of samples for which conditional density estimates are computed.
                   weightsByDistance = False, # If True, neighbors are weighted by the inverse distance of their point forecast.
                   outputType: 'all' | # Specifies structure of output.
                               'onlyPositiveWeights' | 
                               'summarized' | 
                               'cumulativeDistribution' | 
                               'cumulativeDistributionSummarized' = 'onlyPositiveWeights', 
                   scalingList: list | np.ndarray | None = None, # List or array with same size as self.y containing floats being multiplied with self.y.
                   ):
        """
        Computes the weights of the training samples for every sample given by 'X'.
        The result is the output of `restructureWeightsDataList`, whose structure is 
        determined by 'outputType'.
        
        Raises a ValueError if 'fit' has not been called beforehand.
        """
        
        if self.nearestNeighborsOnPreds is None:
            raise ValueError("'fit' has to be called before calling 'getWeights'!")

        nn = self.nearestNeighborsOnPreds
        nTrain = len(self.yPred)

        #---

        yPred = np.asarray(self.estimator.predict(X))
        yPred_reshaped = np.reshape(yPred, (len(yPred), 1))

        # One additional neighbor is requested in order to detect ties at the border of the
        # neighborhood. 'binSize' is allowed to be equal to the training size, in which case
        # there is no additional neighbor that could be requested.
        nNeighbors = min(self.binSize + 1, nTrain)
        
        distancesMatrix, neighborsMatrix = nn.kneighbors(X = yPred_reshaped, 
                                                         n_neighbors = nNeighbors)

        #---

        neighborsList = list(neighborsMatrix[:, 0:self.binSize])
        distancesList = list(distancesMatrix[:, 0:self.binSize])
        
        if nNeighbors > self.binSize:
            
            # If the 'binSize'-th and the ('binSize' + 1)-th neighbor are equally far away, the
            # selection among the tied training samples would be arbitrary. For these samples
            # ALL training samples within the distance of the 'binSize'-th neighbor are used.
            distanceCheck = np.where(distancesMatrix[:, self.binSize - 1] == distancesMatrix[:, self.binSize])
            indicesToMod = distanceCheck[0]

            for index in indicesToMod:
                distanceExtremePoint = np.absolute(yPred[index] - self.yPred[neighborsMatrix[index, self.binSize-1]])

                # The distances are retrieved together with the neighbors, because the result of
                # 'radius_neighbors' is not ordered like the result of 'kneighbors'. Keeping both
                # together guarantees that every weight belongs to the correct training sample.
                distancesByRadius, neighborsByRadius = nn.radius_neighbors(X = yPred_reshaped[index:index + 1], 
                                                                           radius = distanceExtremePoint, 
                                                                           return_distance = True)
                neighborsList[index] = neighborsByRadius[0]
                distancesList[index] = distancesByRadius[0]

        #---
        
        if weightsByDistance:
            weightsDataList = [self._inverseDistanceWeights(distances = distancesList[i], 
                                                            neighbors = neighborsList[i]) 
                               for i in range(len(neighborsList))]
            equalWeights = False
            
        else:
            weightsDataList = [(np.repeat(1 / len(neighbors), len(neighbors)), np.array(neighbors)) for neighbors in neighborsList]
            equalWeights = True

        weightsDataList = restructureWeightsDataList(weightsDataList = weightsDataList, 
                                                     outputType = outputType, 
                                                     y = self.y,
                                                     scalingList = scalingList,
                                                     equalWeights = equalWeights)

        return weightsDataList
      

In [11]:
# show_doc(LevelSetKDEx_kNN)

In [12]:
# show_doc(LevelSetKDEx_kNN.fit)

In [13]:
# show_doc(LevelSetKDEx_kNN.getWeights)

## Bin-Size CV

In [14]:
#| export

class binSizeCV:
    """
    Container for the settings and results of a cross-validation over the 'binSize' 
    parameter of `LevelSetKDEx` or `LevelSetKDEx_kNN`.
    
    The constructor only validates and stores its arguments. The result attributes 
    (`best_binSize`, `best_binSize_perProb`, `best_estimatorLSx`, `cv_results`, 
    `cv_results_raw`) are initialized with None.
    """

    def __init__(self,
                 estimator, # Point predictor with a .predict-method (must not be a LevelSetKDEx-object itself).
                 cvFolds, # Cross-validation splits, specified like 'cv' for cross-validation in sklearn.
                 LSF_type: 'LSF' | 'LSF_kNN', # Determines which LSF-class is used during cross-validation.
                 binSizeGrid: list | np.ndarray = [4, 7, 10, 15, 20, 30, 40, 50, 60, 70, 80, 
                                                   100, 125, 150, 200, 250, 300, 350, 400, 450, 500, 600, 700, 800, 900,
                                                   1000, 1250, 1500, 1750, 2000, 2500, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000], # Candidate values (int) for binSize.
                 probs: list | np.ndarray = [i / 100 for i in range(1, 100, 1)], # Floats between 0 and 1: the p-quantiles predicted to evaluate the performance of the LSF.
                 refitPerProb: bool = False, # True: one fitted LSF per p-quantile (each with its best binSize). False: a single LSF that is best over all probs.
                 n_jobs: int | None = None, # Number of folds evaluated in parallel.
                 ):
        
        # Argument validation: every violated requirement raises immediately.
        
        if isinstance(estimator, (LevelSetKDEx, LevelSetKDEx_kNN)):
            raise ValueError("'estimator' has to be a point predictor and not a LevelSetKDEx-Object!")   
        
        hasPredictMethod = hasattr(estimator, 'predict') and callable(estimator.predict)
        if not hasPredictMethod:
            raise ValueError("'estimator' has to have a 'predict'-method!")
            
        if LSF_type is None or LSF_type not in ["LSF", "LSF_kNN"]:
            raise ValueError("LSF_type must be specified and must either be 'LSF' or 'LSF_kNN'!")
        
        probsTooBig = np.any(np.array(probs) > 1)
        if probsTooBig or np.any(np.array(probs) < 0): 
            raise ValueError("probs must only contain numbers between 0 and 1!")
        
        #---
        
        # Settings
        self.estimator = estimator
        self.LSF_type = LSF_type
        self.probs = probs
        self.binSizeGrid = binSizeGrid        
        self.cvFolds = cvFolds
        self.refitPerProb = refitPerProb
        self.n_jobs = n_jobs
        
        # Results (not available yet)
        self.best_binSize = None
        self.best_binSize_perProb = None
        self.best_estimatorLSx = None
        self.cv_results = None
        self.cv_results_raw = None
        

In [15]:
# show_doc(binSizeCV)

In [16]:
#| export

@patch
def fit(self: binSizeCV, 
        X, 
        y):
    """
    Evaluates every bin size of 'binSizeGrid' on every fold of 'cvFolds' and refits the 
    level-set estimator with the best bin size(s) on the complete data 'X' and 'y'.
    
    Sets the attributes `cv_results_raw` (one cost-ratio dataframe per fold), `cv_results` 
    (mean over all folds), `best_binSize`, `best_binSize_perProb` and `best_estimatorLSx`. 
    If 'refitPerProb' is True, `best_estimatorLSx` is a dict with one fitted estimator per 
    prob, otherwise it is a single fitted estimator.
    
    NOTE: 'binSizeGrid' is reduced to the bin sizes that fit into the smallest training fold.
    
    Raises a ValueError if 'cvFolds' is empty or if no bin size of 'binSizeGrid' is small 
    enough for the training folds.
    """
    
    # Making sure that X and y are arrays to ensure correct subsetting via implicit indices.
    X = np.array(X)
    y = np.array(y)
    
    # The folds are needed twice (train sizes and the evaluation itself), so one-shot 
    # iterables like generators have to be materialized first.
    cvFolds = list(self.cvFolds)
    self.cvFolds = cvFolds
    
    if len(cvFolds) == 0:
        raise ValueError("'cvFolds' must contain at least one fold!")
    
    # Every bin size has to be applicable on every fold, hence the smallest training fold is decisive.
    nSmallestTrainSample = min(len(cvFold[0]) for cvFold in cvFolds)
    self.binSizeGrid = [binSize for binSize in self.binSizeGrid if binSize <= nSmallestTrainSample]
    
    if len(self.binSizeGrid) == 0:
        raise ValueError("'binSizeGrid' doesn't contain any bin size that is small enough for the training folds!")
    
    scoresPerFold = Parallel(n_jobs = self.n_jobs)(delayed(scoresForFold)(cvFold = cvFold,
                                                                          binSizeGrid = self.binSizeGrid,
                                                                          probs = self.probs,
                                                                          estimator = self.estimator,
                                                                          LSF_type = self.LSF_type,
                                                                          y = y,
                                                                          X = X) for cvFold in cvFolds)    

    self.cv_results_raw = scoresPerFold
    meanCostsDf = sum(scoresPerFold) / len(scoresPerFold)
    self.cv_results = meanCostsDf
    
    #---

    # Rows of 'meanCostsDf' correspond to bin sizes, columns correspond to probs.
    binSizeBestOverall = meanCostsDf.mean(axis = 1).idxmin()
    self.best_binSize = binSizeBestOverall

    binSizeBestPerProb = meanCostsDf.idxmin(axis = 0)
    self.best_binSize_perProb = binSizeBestPerProb

    #---
    
    def fitLSF(binSize):
        "Creates the LSF-object specified by 'LSF_type' and fits it on the complete data."
        
        LSFClass = LevelSetKDEx if self.LSF_type == 'LSF' else LevelSetKDEx_kNN
        LSF = LSFClass(estimator = self.estimator, 
                       binSize = binSize)
        LSF.fit(X = X, y = y)
        
        return LSF
    
    #---

    if self.refitPerProb:
        
        # Probs that share the same best bin size also share the same fitted object.
        LSFDict = {binSize: fitLSF(binSize) for binSize in binSizeBestPerProb.unique()}

        self.best_estimatorLSx = {prob: LSFDict[binSizeBestPerProb.loc[prob]] 
                                  for prob in binSizeBestPerProb.index}

    else:
        self.best_estimatorLSx = fitLSF(binSizeBestOverall)

In [17]:
# show_doc(binSizeCV.fit)

#### Scores for Single Fold

In [18]:
#| export

def scoresForFold(cvFold, binSizeGrid, probs, estimator, LSF_type, y, X):
    """
    Evaluates the newsvendor performance of all bin sizes of 'binSizeGrid' for one 
    specific fold.
    
    'cvFold' has to contain the training indices at position 0 and the test indices at 
    position 1. A copy of 'estimator' is fitted on the training data of the fold. For every 
    bin size, the LSF-object specified by 'LSF_type' ('LSF': `LevelSetKDEx`, otherwise 
    `LevelSetKDEx_kNN`) is fitted on the training data and used to predict the quantiles 
    given by 'probs' for the test data. The costs of these predictions are set in relation 
    to the costs of `SAA` via `getCostRatio`.
    
    Returns a dataframe of cost ratios with one row per bin size and one column per prob.
    """
    
    from copy import deepcopy
    
    indicesTrain = cvFold[0]
    indicesTest = cvFold[1]
    
    yTrainFold = y[indicesTrain]
    XTrainFold = X[indicesTrain]
    
    yTestFold = y[indicesTest]
    XTestFold = X[indicesTest]
    
    # The point predictor is refitted on a copy. Fitting the passed object directly would 
    # overwrite the caller's estimator with a model that has only seen the training data 
    # of this fold whenever the folds are evaluated within the calling process.
    estimator = deepcopy(estimator)
    estimator.fit(X = XTrainFold, y = yTrainFold)
    
    #---
       
    SAA_fold = SAA()
    SAA_fold.fit(y = yTrainFold)
    
    # By setting 'X = None', the SAA results are only computed for a single observation (they are independent of X anyway).
    # In order to receive the final dataframe of SAA results, we simply duplicate this single row as many times as needed.
    quantilesDictSAAOneOb = SAA_fold.predictQ(X = None, probs = probs, outputAsDf = False)
    quantilesDictSAA = {prob: np.repeat(quantile, len(XTestFold)) for prob, quantile in quantilesDictSAAOneOb.items()}
    
    #---
                                                   
    costRatiosPerBinSize = dict()

    for binSize in binSizeGrid:
        
        if LSF_type == 'LSF':
            estimatorLSF = LevelSetKDEx(estimator = estimator,
                                        binSize = binSize)
        else:
            estimatorLSF = LevelSetKDEx_kNN(estimator = estimator,
                                            binSize = binSize)
        
        estimatorLSF.fit(X = XTrainFold,
                         y = yTrainFold)
        
        quantilesDict = estimatorLSF.predictQ(X = XTestFold,
                                              probs = probs,
                                              outputAsDf = False)
        
        #---

        costRatiosPerBinSize[binSize] = {prob: getCostRatio(decisions = quantilesDict[prob], 
                                                            decisionsSAA = quantilesDictSAA[prob], 
                                                            yTest = yTestFold, 
                                                            prob = prob) 
                                         for prob in probs}
    
    #---
    
    costRatioDf = pd.DataFrame.from_dict(costRatiosPerBinSize, orient = 'index')
    
    return costRatioDf

##### Get Cost Ratio

In [19]:
#| export

def getCostRatio(decisions, decisionsSAA, yTest, prob):
    """
    Computes the ratio between the newsvendor costs of 'decisions' and the newsvendor 
    costs of 'decisionsSAA' with respect to the realized values 'yTest'.
    
    The newsvendor costs of a single decision are `prob * (y - decision)` if `y > decision`
    and `(1 - prob) * (decision - y)` otherwise. The costs are summed over all samples.
    
    'decisions', 'decisionsSAA' and 'yTest' can be any array-like objects of the same 
    length (lists, arrays, pandas.Series). They are always matched by position.
    
    Returns `cost / costSAA` if `costSAA > 0`. Otherwise 0 is returned if the costs of 
    'decisions' are 0 as well and 1 if they are not.
    """
    
    # Flat arrays guarantee positional matching, independent of any pandas index.
    yTest = np.ravel(yTest)
    
    def newsvendorCost(decisionsToEvaluate):
        "Summed newsvendor costs of the given decisions."
        
        decisionsToEvaluate = np.ravel(decisionsToEvaluate)
        
        underageCosts = prob * (yTest - decisionsToEvaluate)
        overageCosts = (1 - prob) * (decisionsToEvaluate - yTest)
        
        return np.where(yTest > decisionsToEvaluate, underageCosts, overageCosts).sum()
    
    # Newsvendor Costs of our model
    cost = newsvendorCost(decisions)
    
    # Newsvendor Costs of SAA
    costSAA = newsvendorCost(decisionsSAA)
    
    #---
    
    # We have to capture the special case of costSAA == 0, because then we can't compute the 
    # Cost-Ratio using the actual definition.
    if costSAA > 0:
        return cost / costSAA
    
    return 0 if cost == 0 else 1

In [20]:
## Bin-Size CV 2

In [21]:
# #| export

# class binSizeCV2:

#     def __init__(self,
#                  estimator, # Object with a .predict-method (fitted).
#                  paramGrid = None,
#                  binSizeGrid: list | np.ndarray = [4, 7, 10, 15, 20, 30, 40, 50, 60, 70, 80, 
#                                                    100, 125, 150, 200, 250, 300, 350, 400, 450, 500, 600, 700, 800, 900,
#                                                    1000, 1250, 1500, 1750, 2000, 2500, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000], # binSize (int) values being evaluated.         
#                  cvFolds, # Specifies cross-validation-splits. Identical to 'cv' used for cross-validation in sklearn.
#                  LSF_type: 'LSF' | 'LSF_kNN', # Specifies which LSF-Object we work with during cross-validation.       
#                  probs: list | np.ndarray = [i / 100 for i in range(1, 100, 1)], # list or array of floats between 0 and 1. p-quantiles being predicted to evaluate performance of LSF.
#                  refitPerProb: bool = False, # If True, for each p-quantile a fitted LSF with best binSize to predict it is returned. Otherwise only one LSF is returned that is best over all probs.
#                  n_jobs: int | None = None, # number of folds being computed in parallel.
#                  ):
        
#         # CHECKS
        
#         if isinstance(estimator, (LevelSetKDEx, LevelSetKDEx_kNN)):
#             raise ValueError("'estimator' has to be a point predictor and not a LevelSetKDEx-Object!")   
#         elif not (hasattr(estimator, 'predict') and callable(estimator.predict)):
#             raise ValueError("'estimator' has to have a 'predict'-method!")
#         else:
#             self.estimator = estimator
            
#         if LSF_type is None or not LSF_type in ["LSF", "LSF_kNN"]:
#             raise ValueError("LSF_type must be specified and must either be 'LSF' or 'LSF_kNN'!")
#         else:
#             self.LSF_type = LSF_type
            
#         if np.any(np.array(probs) > 1) or np.any(np.array(probs) < 0): 
#             raise ValueError("probs must only contain numbers between 0 and 1!")
#         else:
#             self.probs = probs
        
#         #---
        
#         self.binSizeGrid = binSizeGrid        
#         self.cvFolds = cvFolds
#         self.refitPerProb = refitPerProb
#         self.n_jobs = n_jobs
        
#         self.best_binSize = None
#         self.best_binSize_perProb = None
#         self.best_estimatorLSx = None
#         self.cv_results = None
#         self.cv_results_raw = None
        

In [22]:
# #| export

# @patch
# def fit(self: binSizeCV2, 
#         X, 
#         y):
    
#     scoresPerFold = Parallel(n_jobs = self.n_jobs)(delayed(scoresForFold)(cvFold = cvFold,
#                                                                           binSizeGrid = self.binSizeGrid,
#                                                                           probs = self.probs,
#                                                                           estimator = self.estimator,
#                                                                           LSF_type = self.LSF_type,
#                                                                           y = y,
#                                                                           X = X) for cvFold in cvFolds)    

#     self.cv_results_raw = scoresPerFold

#     #---

#     nvCostsMatrix = scoresPerFold[0]

#     for i in range(1, len(scoresPerFold)):
#         nvCostsMatrix = nvCostsMatrix + scoresPerFold[i]

#     nvCostsMatrix = nvCostsMatrix / len(cvFolds)

#     self.cv_results = nvCostsMatrix

#     #---

#     meanCostsDf = nvCostsMatrix.mean(axis = 1)
#     binSizeBestOverall = meanCostsDf.index[np.argmax(meanCostsDf)]
#     self.best_binSize = binSizeBestOverall

#     binSizeBestPerProb = nvCostsMatrix.idxmax(axis = 0)
#     self.best_binSize_perProb = binSizeBestPerProb

#     #---

#     if self.refitPerProb:

#         LSFDict = dict()
#         for binSize in binSizeBestPerProb.unique():

#             if self.LSF_type == 'LSF':
#                 LSF = LevelSetKDEx(estimator = self.estimator, 
#                                          binSize = binSize)
#             else:
#                 LSF = LevelSetKDEx_kNN(estimator = self.estimator, 
#                                              binSize = binSize)

#             LSF.fit(X = X, y = y)
#             LSFDict[binSize] = LSF

#         self.best_estimatorLSx = {prob: LSFDict[binSizeBestPerProb.loc[prob]] 
#                                   for prob in binSizeBestPerProb.index}

#     else:
#         if self.LSF_type == 'LSF':
#             LSF = LevelSetKDEx(estimator = self.estimator, 
#                                      binSize = binSizeBestOverall)
#         else:
#             LSF = LevelSetKDEx_kNN(estimator = self.estimator, 
#                                          binSize = binSizeBestOverall)

#         LSF.fit(X = X, y = y)

#         self.best_estimatorLSx = LSF

In [23]:
#| hide
import nbdev; nbdev.nbdev_export()

# Test Code

In [25]:
# Data used by the test cells below.
# NOTE(review): `loadDataYaz` is defined in `dddex.loadData`; the meaning of 'testDays' and 
# 'daysToCut' can't be verified from this notebook - confirm against its documentation.
from lightgbm import LGBMRegressor
from dddex.loadData import *

data, XTrain, yTrain, XTest, yTest = loadDataYaz(testDays = 14, 
                                                 returnXY = True,
                                                 daysToCut = 0)

In [29]:
# Point predictor that is passed as 'estimator' to the level-set estimator below.
# It is fitted here, because `LevelSetKDEx_kNN` only calls its .predict-method.
LGBM = LGBMRegressor(boosting_type = 'gbdt',
                     n_jobs = 1)

LGBM.fit(X = XTrain, y = yTrain)

LGBMRegressor(n_jobs=1)

In [30]:
# Level-set estimator on top of the fitted point predictor, using the 100 training samples
# with the closest point forecast as neighborhood of each new sample.
LS_KDEx_kNN = LevelSetKDEx_kNN(estimator = LGBM, binSize = 100)
LS_KDEx_kNN.fit(XTrain, yTrain)

In [39]:
# Quantile predictions with 'weightsByDistance = True'.
# NOTE(review): `predictQ` is inherited from `BasePredictor`; presumably it forwards 
# 'weightsByDistance' to `getWeights` - confirm. The error output stored below this cell 
# looks stale (it does not match this line) and should be regenerated by re-running the cell.
test = LS_KDEx_kNN.predictQ(X = XTest, weightsByDistance = True, outputAsDf = True)

SyntaxError: positional argument follows keyword argument (4268062944.py, line 1)

In [34]:
test

Unnamed: 0,0.1,0.5,0.9
0,0.040000,0.040000,0.120000
1,0.090909,0.160000,0.235294
2,0.042254,0.240000,0.240000
3,0.058824,0.200000,0.240000
4,0.040000,0.120000,0.160000
...,...,...,...
93,0.170455,0.235294,0.295455
94,0.215909,0.240000,0.340909
95,0.176471,0.233333,0.304878
96,0.322581,0.352941,0.440860


In [35]:
# Same quantile predictions with 'weightsByDistance = False' for comparison with 'test'.
test2 = LS_KDEx_kNN.predictQ(X = XTest, weightsByDistance = False, outputAsDf = True)

In [36]:
test2

Unnamed: 0,0.1,0.5,0.9
0,0.000000,0.080000,0.120000
1,0.080000,0.160000,0.240000
2,0.028169,0.080000,0.160000
3,0.040000,0.120000,0.200000
4,0.040000,0.117647,0.160000
...,...,...,...
93,0.154930,0.231707,0.294118
94,0.176471,0.267606,0.340909
95,0.176471,0.247312,0.340909
96,0.295455,0.366197,0.500000
