In [1]:
#| hide
import numpy as np
import pandas as pd
from math import isclose

from lightgbm import LGBMRegressor
from dddex.levelSetKDEx import *
from dddex.wSAA import *
from dddex.loadData import *
from dddex.utils import *
from fastcore.test import *

## SAA

In [2]:
# Sample Average Approximation object; the following cells fit it on target
# values only and inspect the weights returned by getWeights().
SAA = SampleAverageApproximation()

In [13]:
# Fit on 100 distinct target values (only y is handed to fit).
SAA.fit(y=np.arange(100))

# Without X a single entry is expected: uniform weights of 1/100 together
# with the indices 0..99 of all training observations.
weightsData = SAA.getWeights(outputType='onlyPositiveWeights')

test_eq(len(weightsData), 1)
test_eq(weightsData[0][0], np.full(100, 0.01))
test_eq(weightsData[0][1], np.arange(100))

# With 10 rows in X one entry per row is expected, and every entry has to
# match the first one (weights as well as indices).
weightsData = SAA.getWeights(X=np.identity(10))
test_eq(len(weightsData), 10)

for i in range(10):
    assert all(np.array_equal(weightsData[i][part], weightsData[0][part]) for part in (0, 1))

In [19]:
# Fit on 20 target values in which every value 0..9 appears twice.
SAA.fit(y=np.repeat(np.arange(10), 2))

# Duplicated target values must not be merged for 'onlyPositiveWeights':
# 20 weights of 1/20 and the indices 0..19 are expected.
weightsData = SAA.getWeights(outputType='onlyPositiveWeights')

test_eq(len(weightsData), 1)
test_eq(weightsData[0][0], np.full(20, 0.05))
test_eq(weightsData[0][1], np.arange(20))

# With 10 rows in X one entry per row is expected, and every entry has to
# match the first one (weights as well as indices).
weightsData = SAA.getWeights(X=np.identity(10))
test_eq(len(weightsData), 10)

for i in range(10):
    assert all(np.array_equal(weightsData[i][part], weightsData[0][part]) for part in (0, 1))

## Loading Yaz Data

In [None]:
#| hide
testDays = 182
# NOTE: within the visible cells daysToCut is only referenced by commented-out code.
daysToCut = 300

# Load the Yaz data without cutting any days and split it into train / test arrays.
data, XTrain, yTrain, XTest, yTest = loadDataYaz(
    testDays=testDays,
    daysToCut=0,
    returnXY=True,
)

# Scaling values of all rows labelled 'test'
scalingList = data.loc[data['label'].eq('test'), 'scalingValue'].tolist()

# data2, XTrain2, yTrain2, XTest2, yTest2 = loadDataYaz(testDays = testDays,
#                                                       daysToCut = daysToCut,
#                                                       returnXY = True)

# data3 = loadDataYaz(testDays = testDays,
#                     daysToCut = 300,
#                     returnXY = False)

In [None]:
#| hide
# Test split: exactly `testDays` rows per id, for features and targets alike
assert XTest.shape[0] == yTest.shape[0] == len(data['id'].unique()) * testDays

# Train split: all remaining rows of `data`
assert XTrain.shape[0] == yTrain.shape[0] == data.shape[0] - len(data['id'].unique()) * testDays

#---

# assert XTest2.shape[0] == len(data2['id'].unique()) * testDays
# assert yTest2.shape[0] == len(data2['id'].unique()) * testDays

# assert data2.shape[0] == data.shape[0] - len(data['id'].unique()) * daysToCut
# assert XTrain2.shape[0] == data.shape[0] - len(data['id'].unique()) * (daysToCut + testDays)
# assert yTrain2.shape[0] == data.shape[0] - len(data['id'].unique()) * (daysToCut + testDays)

# #---

# test_eq(data2, data3)

## Grouped Time Series Split

In [None]:
#| hide 

nIDs = len(data['id'].unique())
kFolds = 4
testLength = 7
timeFeature = 'dayIndex'
groupFeature = 'id'

cvFolds = groupedTimeSeriesSplit(
    data=data,
    kFolds=kFolds,
    testLength=testLength,
    groupFeature=groupFeature,
    timeFeature=timeFeature,
)

# One (train indices, test indices) pair per fold
test_eq(len(cvFolds), kFolds)

for i, fold in enumerate(cvFolds):

    # Each test window holds `testLength` rows per id
    test_eq(len(fold[1]), testLength * nIDs)

    # The train part holds every row except the current and all later test windows
    test_eq(len(fold[0]), (data.shape[0] - nIDs * testLength * (kFolds - i - 1)) - len(fold[1]))

    # Train and test indices must not overlap
    assert set(fold[0]).isdisjoint(fold[1])

    dataTrainToCheck = data.iloc[fold[0]]
    dataTestToCheck = data.iloc[fold[1]]

    # Per group, the latest train time stamp has to lie before the earliest test time stamp
    timeMaxGroupTrain = dataTrainToCheck.groupby(groupFeature)[timeFeature].max()
    timeMinGroupTest = dataTestToCheck.groupby(groupFeature)[timeFeature].min()

    assert (timeMaxGroupTrain < timeMinGroupTest).all()

In [None]:
#| hide
# Point estimator that is handed to the LevelSetKDEx objects in the cells below;
# it is fitted once here on the training split.
LGBM = LGBMRegressor(max_depth = 4, n_jobs = 1)
LGBM.fit(X = XTrain, y = yTrain)

LGBMRegressor(max_depth=4, n_jobs=1)

## Bin Size Cross-Validation

### LSx Bin-Building

#### Normal Weights, no refit

In [None]:
# Point predictions of the fitted LightGBM model
# (not referenced again within the visible cells).
yTrainPred = LGBM.predict(XTrain)
yTestPred = LGBM.predict(XTest)

# Level-set estimator with standard (non distance-based) weights
LSKDEx = LevelSetKDEx(estimator=LGBM, weightsByDistance=False)

kFolds = 2
probs = [0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999]
binSizeGrid = [1, 100, 1000, 10000]

# Folds are built on the training rows only
dataTrain = data.loc[data['label'] == 'train']

cvFolds = groupedTimeSeriesSplit(
    data=dataTrain,
    kFolds=kFolds,
    testLength=28,
    groupFeature='id',
    timeFeature='dayIndex',
)

# Bin-size cross-validation with a single refit (refitPerProb = False)
CV = binSizeCV(
    estimatorLSx=LSKDEx,
    cvFolds=cvFolds,
    binSizeGrid=binSizeGrid,
    probs=probs,
    refitPerProb=False,
    n_jobs=None,
)

CV.fit(X=XTrain, y=yTrain)

In [None]:
# Constructor arguments have to be stored unchanged
test_eq(CV.cvFolds, cvFolds)
test_eq(CV.probs, probs)
assert not CV.refitPerProb

#---

# Stored binSize-Grid: no stored bin size may exceed the number of training
# observations of the first fold
binSizesFiltered = [binSize for binSize in CV.binSizeGrid if binSize <= len(cvFolds[0][0])]
test_eq(CV.binSizeGrid, binSizesFiltered)

#---

# Estimator the same? Note: No __eq__ method is implemented for LGBMRegressor.
# LGBM == LGBM2 consequently only returns True when both names are referring to
# the exact same object in memory, which shouldn't be the case here because we
# create a deepcopy of LGBM when initiating LS_KDEx. The point estimators
# should be copies of each other though.
assert CV.estimatorLSx.estimator is not LGBM
test_eq(CV.estimatorLSx.estimator.predict(XTrain), LGBM.predict(XTrain))
test_eq(CV.estimatorLSx.estimator.predict(XTest), LGBM.predict(XTest))

#---

# CV results raw: one cost matrix (bin sizes x probs) per fold
assert isinstance(CV.cv_results_raw, list)
test_eq(len(CV.cv_results_raw), kFolds)

for resDf in CV.cv_results_raw:
    test_eq(resDf.shape, (len(binSizesFiltered), len(probs)))
    test_eq(resDf.index, binSizesFiltered)
    test_eq(resDf.columns, probs)
    assert np.all(resDf >= 0)

#---

# CV results aggregated: element-wise mean of the per-fold cost matrices
meanCostMatrix = sum(CV.cv_results_raw) / kFolds

assert np.allclose(CV.cv_results, meanCostMatrix)

test_eq(CV.cv_results.shape, (len(binSizesFiltered), len(probs)))
test_eq(CV.cv_results.index, binSizesFiltered)
test_eq(CV.cv_results.columns, probs)
# Check the aggregated matrix itself (not the loop variable left over from the raw results)
assert np.all(CV.cv_results >= 0)

#---

# best bin size: lowest cost averaged over all probs
averageCostsPerBin = CV.cv_results.mean(axis = 1)
bestBinSizeTest = CV.cv_results.index[np.argmin(averageCostsPerBin)]
test_eq(CV.bestBinSize, bestBinSizeTest)
assert CV.bestBinSize in binSizeGrid

#---

# best bin size per prob
test_eq(len(CV.bestBinSize_perProb), len(probs))
# all(...) is required here: asserting the bare list would always pass because
# a non-empty list is truthy regardless of its content
assert all(binSize in binSizeGrid for binSize in np.asarray(CV.bestBinSize_perProb))

bestBinSizePerProbTest = CV.cv_results.idxmin(axis = 0)
test_eq(CV.bestBinSize_perProb, bestBinSizePerProbTest)

#---

# refitted LSx object: a single estimator using the overall best bin size
assert isinstance(CV.bestEstimatorLSx, LevelSetKDEx)
test_eq(CV.bestEstimatorLSx.binSize, bestBinSizeTest)

# Estimator the same? (Note: No __eq__ method is implemented for LGBMRegressor.
# LGBM == LGBM2 consequently only returns True when both names are referring to
# the exact same object in memory, which is not (!!) the case here)
assert CV.bestEstimatorLSx.estimator is not LGBM
test_eq(CV.bestEstimatorLSx.estimator.predict(XTrain), LGBM.predict(XTrain))
test_eq(CV.bestEstimatorLSx.estimator.predict(XTest), LGBM.predict(XTest))

In [None]:
# Minimal cross-validation run with default settings, used to look at the
# relation between the estimator stored in the CV object and the original LGBM.
test = LevelSetKDEx(estimator = LGBM)
testCV = binSizeCV(estimatorLSx = test,
                   cvFolds = cvFolds,
                   binSizeGrid = [10, 100])

testCV.fit(XTrain, yTrain)

# Bare expression: its value is displayed as the cell output. As noted in the
# neighbouring cells, LGBMRegressor implements no __eq__, so this is an identity
# comparison.
testCV.estimatorLSx.estimator == LGBM

False

#### Normal Weights, refit

In [None]:
# Level-set estimator with standard (non distance-based) weights
LSKDEx = LevelSetKDEx(estimator=LGBM, weightsByDistance=False)

kFolds = 2
probs = [0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999]
binSizeGrid = [1, 100, 1000, 10000]

# Folds are built on the training rows only
dataTrain = data.loc[data['label'] == 'train']

cvFolds = groupedTimeSeriesSplit(
    data=dataTrain,
    kFolds=kFolds,
    testLength=28,
    groupFeature='id',
    timeFeature='dayIndex',
)

# Bin-size cross-validation with one refit per probability (refitPerProb = True)
CV = binSizeCV(
    estimatorLSx=LSKDEx,
    cvFolds=cvFolds,
    binSizeGrid=binSizeGrid,
    probs=probs,
    refitPerProb=True,
    n_jobs=None,
)

CV.fit(X=XTrain, y=yTrain)

In [None]:
# Constructor arguments have to be stored unchanged
test_eq(CV.cvFolds, cvFolds)
test_eq(CV.probs, probs)
assert CV.refitPerProb

#---

# Stored binSize-Grid: no stored bin size may exceed the number of training
# observations of the first fold
binSizesFiltered = [binSize for binSize in CV.binSizeGrid if binSize <= len(cvFolds[0][0])]
test_eq(CV.binSizeGrid, binSizesFiltered)

#---

# Estimator the same? Note: No __eq__ method is implemented for LGBMRegressor.
# LGBM == LGBM2 consequently only returns True when both names are referring to
# the exact same object in memory, which shouldn't be the case here because we
# create a deepcopy of LGBM when initiating LS_KDEx. The point estimators
# should be copies of each other though.
assert CV.estimatorLSx.estimator is not LGBM
test_eq(CV.estimatorLSx.estimator.predict(XTrain), LGBM.predict(XTrain))
test_eq(CV.estimatorLSx.estimator.predict(XTest), LGBM.predict(XTest))

#---

# CV Results Raw: one cost matrix (bin sizes x probs) per fold
assert isinstance(CV.cv_results_raw, list)
test_eq(len(CV.cv_results_raw), kFolds)

for resDf in CV.cv_results_raw:
    test_eq(resDf.shape, (len(binSizesFiltered), len(probs)))
    test_eq(resDf.index, binSizesFiltered)
    test_eq(resDf.columns, probs)
    assert np.all(resDf >= 0)

#---

# CV Results Aggregated: element-wise mean of the per-fold cost matrices
meanCostMatrix = sum(CV.cv_results_raw) / kFolds

assert np.allclose(CV.cv_results, meanCostMatrix)

test_eq(CV.cv_results.shape, (len(binSizesFiltered), len(probs)))
test_eq(CV.cv_results.index, binSizesFiltered)
test_eq(CV.cv_results.columns, probs)
# Check the aggregated matrix itself (not the loop variable left over from the raw results)
assert np.all(CV.cv_results >= 0)

#---

# Best bin size: lowest cost averaged over all probs
averageCostsPerBin = CV.cv_results.mean(axis = 1)
bestBinSizeTest = CV.cv_results.index[np.argmin(averageCostsPerBin)]
test_eq(CV.bestBinSize, bestBinSizeTest)
assert CV.bestBinSize in binSizeGrid

#---

# Best bin size per prob
test_eq(len(CV.bestBinSize_perProb), len(probs))
# all(...) is required here: asserting the bare list would always pass because
# a non-empty list is truthy regardless of its content
assert all(binSize in binSizeGrid for binSize in np.asarray(CV.bestBinSize_perProb))

bestBinSizePerProbTest = CV.cv_results.idxmin(axis = 0)
test_eq(CV.bestBinSize_perProb, bestBinSizePerProbTest)

#---

# refitted LSx objects: one estimator per prob, each using the best bin size of its prob
assert isinstance(CV.bestEstimatorLSx, dict)
test_eq(list(CV.bestEstimatorLSx.keys()), probs)

for prob, LSx in CV.bestEstimatorLSx.items():
    assert isinstance(LSx, LevelSetKDEx)
    test_eq(LSx.binSize, bestBinSizePerProbTest.loc[prob])

    # Estimator the same? (Note: No __eq__ method is implemented for LGBMRegressor.
    # LGBM == LGBM2 consequently only returns True when both names are referring to
    # the exact same object in memory, which is not (!!) the case here)
    assert LSx.estimator is not LGBM
    test_eq(LSx.estimator.predict(XTrain), LGBM.predict(XTrain))
    test_eq(LSx.estimator.predict(XTest), LGBM.predict(XTest))
    

#### Distance Weights, no refit

In [None]:
# Level-set estimator with distance-based weights
LSKDEx = LevelSetKDEx(estimator=LGBM, weightsByDistance=True)

kFolds = 2
probs = [0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999]
binSizeGrid = [1, 100, 1000, 10000]
# NOTE: LSF_type is not referenced anywhere else in the visible cells.
LSF_type = 'LSF'

# Folds are built on the training rows only
dataTrain = data.loc[data['label'] == 'train']

cvFolds = groupedTimeSeriesSplit(
    data=dataTrain,
    kFolds=kFolds,
    testLength=28,
    groupFeature='id',
    timeFeature='dayIndex',
)

# Cross-validation with distance-based weights
CVDistance = binSizeCV(
    estimatorLSx=LSKDEx,
    cvFolds=cvFolds,
    binSizeGrid=binSizeGrid,
    probs=probs,
    refitPerProb=False,
    n_jobs=None,
)

CVDistance.fit(X=XTrain, y=yTrain)

#---

# Same configuration with standard weights, used as reference for the comparison
# of the CV results in the next cell
LSKDEx.set_params(weightsByDistance=False)

CVStandard = binSizeCV(
    estimatorLSx=LSKDEx,
    cvFolds=cvFolds,
    binSizeGrid=binSizeGrid,
    probs=probs,
    refitPerProb=False,
    n_jobs=None,
)

CVStandard.fit(X=XTrain, y=yTrain)

In [None]:
# Constructor arguments have to be stored unchanged
test_eq(CVDistance.cvFolds, cvFolds)
test_eq(CVDistance.probs, probs)
assert not CVDistance.refitPerProb

#---

# Check if CV Results are different compared to the standard case of generating weights
# This is supposed to check whether we really used the attribute 'weightsByDistance'
# in the predict function inside scorePerFold
assert not np.allclose(CVDistance.cv_results, CVStandard.cv_results)

#---

# Stored binSize-Grid: no stored bin size may exceed the number of training
# observations of the first fold
binSizesFiltered = [binSize for binSize in CVDistance.binSizeGrid if binSize <= len(cvFolds[0][0])]
test_eq(CVDistance.binSizeGrid, binSizesFiltered)

#---

# Estimator the same? Note: No __eq__ method is implemented for LGBMRegressor.
# LGBM == LGBM2 consequently only returns True when both names are referring to
# the exact same object in memory, which shouldn't be the case here because we
# create a deepcopy of LGBM when initiating LS_KDEx. The point estimators
# should be copies of each other though.
# The check has to run on CVDistance (the object created for this section),
# not on a `CV` object left over from an earlier cell.
assert CVDistance.estimatorLSx.estimator is not LGBM
test_eq(CVDistance.estimatorLSx.estimator.predict(XTrain), LGBM.predict(XTrain))
test_eq(CVDistance.estimatorLSx.estimator.predict(XTest), LGBM.predict(XTest))

#---

# CV Results Raw: one cost matrix (bin sizes x probs) per fold
assert isinstance(CVDistance.cv_results_raw, list)
test_eq(len(CVDistance.cv_results_raw), kFolds)

for resDf in CVDistance.cv_results_raw:
    test_eq(resDf.shape, (len(binSizesFiltered), len(probs)))
    test_eq(resDf.index, binSizesFiltered)
    test_eq(resDf.columns, probs)
    assert np.all(resDf >= 0)

#---

# CV Results Aggregated: element-wise mean of the per-fold cost matrices
meanCostMatrix = sum(CVDistance.cv_results_raw) / kFolds

assert np.allclose(CVDistance.cv_results, meanCostMatrix)

test_eq(CVDistance.cv_results.shape, (len(binSizesFiltered), len(probs)))
test_eq(CVDistance.cv_results.index, binSizesFiltered)
test_eq(CVDistance.cv_results.columns, probs)
# Check the aggregated matrix itself (not the loop variable left over from the raw results)
assert np.all(CVDistance.cv_results >= 0)

#---

# Best bin size: lowest cost averaged over all probs
averageCostsPerBin = CVDistance.cv_results.mean(axis = 1)
bestBinSizeTest = CVDistance.cv_results.index[np.argmin(averageCostsPerBin)]
test_eq(CVDistance.bestBinSize, bestBinSizeTest)
assert CVDistance.bestBinSize in binSizeGrid

#---

# Best bin size per prob
test_eq(len(CVDistance.bestBinSize_perProb), len(probs))
# all(...) is required here: asserting the bare list would always pass because
# a non-empty list is truthy regardless of its content
assert all(binSize in binSizeGrid for binSize in np.asarray(CVDistance.bestBinSize_perProb))

bestBinSizePerProbTest = CVDistance.cv_results.idxmin(axis = 0)
test_eq(CVDistance.bestBinSize_perProb, bestBinSizePerProbTest)

#---

# refitted LSx object: a single estimator using the overall best bin size
assert isinstance(CVDistance.bestEstimatorLSx, LevelSetKDEx)
test_eq(CVDistance.bestEstimatorLSx.binSize, bestBinSizeTest)

# Estimator the same? (Note: No __eq__ method is implemented for LGBMRegressor.
# LGBM == LGBM2 consequently only returns True when both names are referring to
# the exact same object in memory, which is not (!!) the case here)
assert CVDistance.bestEstimatorLSx.estimator is not LGBM
test_eq(CVDistance.bestEstimatorLSx.estimator.predict(XTrain), LGBM.predict(XTrain))
test_eq(CVDistance.bestEstimatorLSx.estimator.predict(XTest), LGBM.predict(XTest))
    

#### Distance Weights, refit

In [None]:
# Level-set estimator with distance-based weights
LSKDEx = LevelSetKDEx(estimator=LGBM, weightsByDistance=True)

kFolds = 2
probs = [0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999]
binSizeGrid = [1, 100, 1000, 10000]
# NOTE: LSF_type is not referenced anywhere else in the visible cells.
LSF_type = 'LSF'

# Folds are built on the training rows only
dataTrain = data.loc[data['label'] == 'train']

cvFolds = groupedTimeSeriesSplit(
    data=dataTrain,
    kFolds=kFolds,
    testLength=28,
    groupFeature='id',
    timeFeature='dayIndex',
)

# Cross-validation with distance-based weights and one refit per probability
CVDistance = binSizeCV(
    estimatorLSx=LSKDEx,
    cvFolds=cvFolds,
    binSizeGrid=binSizeGrid,
    probs=probs,
    refitPerProb=True,
    n_jobs=None,
)

CVDistance.fit(X=XTrain, y=yTrain)

#---

# Same configuration with standard weights, used as reference for the comparison
# of the CV results in the next cell
LSKDEx.set_params(weightsByDistance=False)

CVStandard = binSizeCV(
    estimatorLSx=LSKDEx,
    cvFolds=cvFolds,
    binSizeGrid=binSizeGrid,
    probs=probs,
    refitPerProb=True,
    n_jobs=None,
)

CVStandard.fit(X=XTrain, y=yTrain)

In [None]:
# Constructor arguments have to be stored unchanged
test_eq(CVDistance.cvFolds, cvFolds)
test_eq(CVDistance.probs, probs)
assert CVDistance.refitPerProb

#---

# Check if CV Results are different compared to the standard case of generating weights
# This is supposed to check whether we really used the attribute 'weightsByDistance'
# in the predict function inside scorePerFold
assert not np.allclose(CVDistance.cv_results, CVStandard.cv_results)

#---

# Stored binSize-Grid: no stored bin size may exceed the number of training
# observations of the first fold
binSizesFiltered = [binSize for binSize in CVDistance.binSizeGrid if binSize <= len(cvFolds[0][0])]
test_eq(CVDistance.binSizeGrid, binSizesFiltered)

#---

# Estimator the same? Note: No __eq__ method is implemented for LGBMRegressor.
# LGBM == LGBM2 consequently only returns True when both names are referring to
# the exact same object in memory, which shouldn't be the case here because we
# create a deepcopy of LGBM when initiating LS_KDEx. The point estimators
# should be copies of each other though.
# The check has to run on CVDistance (the object created for this section),
# not on a `CV` object left over from an earlier cell.
assert CVDistance.estimatorLSx.estimator is not LGBM
test_eq(CVDistance.estimatorLSx.estimator.predict(XTrain), LGBM.predict(XTrain))
test_eq(CVDistance.estimatorLSx.estimator.predict(XTest), LGBM.predict(XTest))

#---

# CV Results Raw: one cost matrix (bin sizes x probs) per fold
assert isinstance(CVDistance.cv_results_raw, list)
test_eq(len(CVDistance.cv_results_raw), kFolds)

for resDf in CVDistance.cv_results_raw:
    test_eq(resDf.shape, (len(binSizesFiltered), len(probs)))
    test_eq(resDf.index, binSizesFiltered)
    test_eq(resDf.columns, probs)
    assert np.all(resDf >= 0)

#---

# CV Results Aggregated: element-wise mean of the per-fold cost matrices
meanCostMatrix = sum(CVDistance.cv_results_raw) / kFolds

assert np.allclose(CVDistance.cv_results, meanCostMatrix)

test_eq(CVDistance.cv_results.shape, (len(binSizesFiltered), len(probs)))
test_eq(CVDistance.cv_results.index, binSizesFiltered)
test_eq(CVDistance.cv_results.columns, probs)
# Check the aggregated matrix itself (not the loop variable left over from the raw results)
assert np.all(CVDistance.cv_results >= 0)

#---

# Best bin size: lowest cost averaged over all probs
averageCostsPerBin = CVDistance.cv_results.mean(axis = 1)
bestBinSizeTest = CVDistance.cv_results.index[np.argmin(averageCostsPerBin)]
test_eq(CVDistance.bestBinSize, bestBinSizeTest)
assert CVDistance.bestBinSize in binSizeGrid

#---

# Best bin size per prob
test_eq(len(CVDistance.bestBinSize_perProb), len(probs))
# all(...) is required here: asserting the bare list would always pass because
# a non-empty list is truthy regardless of its content
assert all(binSize in binSizeGrid for binSize in np.asarray(CVDistance.bestBinSize_perProb))

bestBinSizePerProbTest = CVDistance.cv_results.idxmin(axis = 0)
test_eq(CVDistance.bestBinSize_perProb, bestBinSizePerProbTest)

#---

# refitted LSx objects: one estimator per prob, each using the best bin size of its prob
assert isinstance(CVDistance.bestEstimatorLSx, dict)
test_eq(list(CVDistance.bestEstimatorLSx.keys()), probs)

for prob, LSx in CVDistance.bestEstimatorLSx.items():
    assert isinstance(LSx, LevelSetKDEx)
    test_eq(LSx.binSize, bestBinSizePerProbTest.loc[prob])

    # Estimator the same? (Note: No __eq__ method is implemented for LGBMRegressor.
    # LGBM == LGBM2 consequently only returns True when both names are referring to
    # the exact same object in memory, which is not (!!) the case here)
    assert LSx.estimator is not LGBM
    test_eq(LSx.estimator.predict(XTrain), LGBM.predict(XTrain))
    test_eq(LSx.estimator.predict(XTest), LGBM.predict(XTest))
    

### LSx kNN

#### Normal Weights, refit

In [None]:
# kNN variant of the level-set estimator with standard (non distance-based) weights
LSKDEx_kNN = LevelSetKDEx_kNN(estimator=LGBM, weightsByDistance=False)

kFolds = 3
probs = [0.005, 0.2, 0.4, 0.6, 0.78, 0.99999]
binSizeGrid = [1, 100, 1000, 10000, 2000000]

# Folds are built on the training rows only
dataTrain = data.loc[data['label'] == 'train']

cvFolds = groupedTimeSeriesSplit(
    data=dataTrain,
    kFolds=kFolds,
    testLength=7,
    groupFeature='id',
    timeFeature='dayIndex',
)

# Cross-validation with one refit per probability, run with n_jobs = 2
CV = binSizeCV(
    estimatorLSx=LSKDEx_kNN,
    cvFolds=cvFolds,
    binSizeGrid=binSizeGrid,
    probs=probs,
    refitPerProb=True,
    n_jobs=2,
)

CV.fit(X=XTrain, y=yTrain)

In [None]:
# Constructor arguments have to be stored unchanged
test_eq(CV.cvFolds, cvFolds)
test_eq(CV.probs, probs)
assert CV.refitPerProb

#---

# Stored binSize-Grid: no stored bin size may exceed the number of training
# observations of the first fold
binSizesFiltered = [binSize for binSize in CV.binSizeGrid if binSize <= len(cvFolds[0][0])]
test_eq(CV.binSizeGrid, binSizesFiltered)

#---

# Estimator the same? Note: No __eq__ method is implemented for LGBMRegressor.
# LGBM == LGBM2 consequently only returns True when both names are referring to
# the exact same object in memory, which shouldn't be the case here because we
# create a deepcopy of LGBM when initiating LS_KDEx. The point estimators
# should be copies of each other though.
assert CV.estimatorLSx.estimator is not LGBM
test_eq(CV.estimatorLSx.estimator.predict(XTrain), LGBM.predict(XTrain))
test_eq(CV.estimatorLSx.estimator.predict(XTest), LGBM.predict(XTest))

#---

# CV Results Raw: one cost matrix (bin sizes x probs) per fold
assert isinstance(CV.cv_results_raw, list)
test_eq(len(CV.cv_results_raw), kFolds)

for resDf in CV.cv_results_raw:
    test_eq(resDf.shape, (len(binSizesFiltered), len(probs)))
    test_eq(resDf.index, binSizesFiltered)
    test_eq(resDf.columns, probs)
    assert np.all(resDf >= 0)

#---

# CV Results Aggregated: element-wise mean of the per-fold cost matrices
meanCostMatrix = sum(CV.cv_results_raw) / kFolds

assert np.allclose(CV.cv_results, meanCostMatrix)

test_eq(CV.cv_results.shape, (len(binSizesFiltered), len(probs)))
test_eq(CV.cv_results.index, binSizesFiltered)
test_eq(CV.cv_results.columns, probs)
# Check the aggregated matrix itself (not the loop variable left over from the raw results)
assert np.all(CV.cv_results >= 0)

#---

# Best bin size: lowest cost averaged over all probs
averageCostsPerBin = CV.cv_results.mean(axis = 1)
bestBinSizeTest = CV.cv_results.index[np.argmin(averageCostsPerBin)]
test_eq(CV.bestBinSize, bestBinSizeTest)
assert CV.bestBinSize in binSizeGrid

#---

# Best bin size per prob
test_eq(len(CV.bestBinSize_perProb), len(probs))
# all(...) is required here: asserting the bare list would always pass because
# a non-empty list is truthy regardless of its content
assert all(binSize in binSizeGrid for binSize in np.asarray(CV.bestBinSize_perProb))

bestBinSizePerProbTest = CV.cv_results.idxmin(axis = 0)
test_eq(CV.bestBinSize_perProb, bestBinSizePerProbTest)

#---

# refitted LSx objects: one estimator per prob, each using the best bin size of its prob
assert isinstance(CV.bestEstimatorLSx, dict)
test_eq(list(CV.bestEstimatorLSx.keys()), probs)

for prob, LSx in CV.bestEstimatorLSx.items():
    assert isinstance(LSx, LevelSetKDEx_kNN)
    test_eq(LSx.binSize, bestBinSizePerProbTest.loc[prob])

    # Estimator the same? (Note: No __eq__ method is implemented for LGBMRegressor.
    # LGBM == LGBM2 consequently only returns True when both names are referring to
    # the exact same object in memory, which is not (!!) the case here)
    assert LSx.estimator is not LGBM
    test_eq(LSx.estimator.predict(XTrain), LGBM.predict(XTrain))
    test_eq(LSx.estimator.predict(XTest), LGBM.predict(XTest))
    

#### Distance Weights, refit

In [None]:
# kNN variant of the level-set estimator with distance-based weights
LSKDEx_kNN = LevelSetKDEx_kNN(estimator=LGBM, weightsByDistance=True)

kFolds = 3
probs = [0.005, 0.2, 0.4, 0.6, 0.78, 0.99999]
binSizeGrid = [1, 100, 1000, 10000, 2000000]

# Folds are built on the training rows only
dataTrain = data.loc[data['label'] == 'train']

cvFolds = groupedTimeSeriesSplit(
    data=dataTrain,
    kFolds=kFolds,
    testLength=7,
    groupFeature='id',
    timeFeature='dayIndex',
)

# Cross-validation with distance-based weights, run with n_jobs = 2
CVDistance = binSizeCV(
    estimatorLSx=LSKDEx_kNN,
    cvFolds=cvFolds,
    binSizeGrid=binSizeGrid,
    probs=probs,
    refitPerProb=True,
    n_jobs=2,
)

CVDistance.fit(X=XTrain, y=yTrain)

#---

# Same configuration with standard weights (and n_jobs = None), used as
# reference for the comparison of the CV results in the next cell
LSKDEx_kNN.set_params(weightsByDistance=False)

CVStandard = binSizeCV(
    estimatorLSx=LSKDEx_kNN,
    cvFolds=cvFolds,
    binSizeGrid=binSizeGrid,
    probs=probs,
    refitPerProb=True,
    n_jobs=None,
)

CVStandard.fit(X=XTrain, y=yTrain)

In [None]:
# Constructor arguments have to be stored unchanged
test_eq(CVDistance.cvFolds, cvFolds)
test_eq(CVDistance.probs, probs)
assert CVDistance.refitPerProb

#---

# Check if CV Results are different compared to the standard case of generating weights
# This is supposed to check whether we really used the attribute 'weightsByDistance'
# in the predict function inside scorePerFold
assert not np.allclose(CVDistance.cv_results, CVStandard.cv_results)

#---

# Stored binSize-Grid: no stored bin size may exceed the number of training
# observations of the first fold
binSizesFiltered = [binSize for binSize in CVDistance.binSizeGrid if binSize <= len(cvFolds[0][0])]
test_eq(CVDistance.binSizeGrid, binSizesFiltered)

#---

# Estimator the same? Note: No __eq__ method is implemented for LGBMRegressor.
# LGBM == LGBM2 consequently only returns True when both names are referring to
# the exact same object in memory, which shouldn't be the case here because we
# create a deepcopy of LGBM when initiating LS_KDEx. The point estimators
# should be copies of each other though.
# The check has to run on CVDistance (the object created for this section),
# not on a `CV` object left over from an earlier cell.
assert CVDistance.estimatorLSx.estimator is not LGBM
test_eq(CVDistance.estimatorLSx.estimator.predict(XTrain), LGBM.predict(XTrain))
test_eq(CVDistance.estimatorLSx.estimator.predict(XTest), LGBM.predict(XTest))

#---

# CV Results Raw: one cost matrix (bin sizes x probs) per fold
assert isinstance(CVDistance.cv_results_raw, list)
test_eq(len(CVDistance.cv_results_raw), kFolds)

for resDf in CVDistance.cv_results_raw:
    test_eq(resDf.shape, (len(binSizesFiltered), len(probs)))
    test_eq(resDf.index, binSizesFiltered)
    test_eq(resDf.columns, probs)
    assert np.all(resDf >= 0)

#---

# CV Results Aggregated: element-wise mean of the per-fold cost matrices
meanCostMatrix = sum(CVDistance.cv_results_raw) / kFolds

assert np.allclose(CVDistance.cv_results, meanCostMatrix)

test_eq(CVDistance.cv_results.shape, (len(binSizesFiltered), len(probs)))
test_eq(CVDistance.cv_results.index, binSizesFiltered)
test_eq(CVDistance.cv_results.columns, probs)
# Check the aggregated matrix itself (not the loop variable left over from the raw results)
assert np.all(CVDistance.cv_results >= 0)

#---

# Best bin size: lowest cost averaged over all probs
averageCostsPerBin = CVDistance.cv_results.mean(axis = 1)
bestBinSizeTest = CVDistance.cv_results.index[np.argmin(averageCostsPerBin)]
test_eq(CVDistance.bestBinSize, bestBinSizeTest)
assert CVDistance.bestBinSize in binSizeGrid

#---

# Best bin size per prob
test_eq(len(CVDistance.bestBinSize_perProb), len(probs))
# all(...) is required here: asserting the bare list would always pass because
# a non-empty list is truthy regardless of its content
assert all(binSize in binSizeGrid for binSize in np.asarray(CVDistance.bestBinSize_perProb))

bestBinSizePerProbTest = CVDistance.cv_results.idxmin(axis = 0)
test_eq(CVDistance.bestBinSize_perProb, bestBinSizePerProbTest)

#---

# refitted LSx objects: one estimator per prob, each using the best bin size of its prob
assert isinstance(CVDistance.bestEstimatorLSx, dict)
test_eq(list(CVDistance.bestEstimatorLSx.keys()), probs)

for prob, LSx in CVDistance.bestEstimatorLSx.items():
    assert isinstance(LSx, LevelSetKDEx_kNN)
    test_eq(LSx.binSize, bestBinSizePerProbTest.loc[prob])

    # Estimator the same? (Note: No __eq__ method is implemented for LGBMRegressor.
    # LGBM == LGBM2 consequently only returns True when both names are referring to
    # the exact same object in memory, which is not (!!) the case here)
    assert LSx.estimator is not LGBM
    test_eq(LSx.estimator.predict(XTrain), LGBM.predict(XTrain))
    test_eq(LSx.estimator.predict(XTest), LGBM.predict(XTest))
    

## LS_KDEx

In [None]:
#| hide
LS_KDEx = LevelSetKDEx(estimator=LGBM, binSize=100)
LS_KDEx.fit(XTrain, yTrain)

# fit() has to store the targets unchanged and the point predictions of the
# given estimator for the training data
test_eq(LS_KDEx.yTrain, yTrain)
test_eq(LS_KDEx.yTrainPred, LGBM.predict(XTrain))

# The fitted flag has to be set after fit()
assert LS_KDEx.fitted

### Standard Attributes

In [None]:
# The bins have to partition the training indices: flattening all bins must
# give every index 0..n-1 exactly once (complete and free of duplicates).
indicesList = [index for values in LS_KDEx.indicesPerBin.values() for index in values]

test_eq(set(indicesList), set(range(XTrain.shape[0])))
test_eq(len(indicesList), XTrain.shape[0])

### Lower Bounds

In [None]:
# Lower-bound structure has to be correct: all train predictions of a bin lie
# at or above the bin's lower bound and strictly below the lower bound of the
# next bin (the bin with the highest index has no upper neighbour).
yPred = LS_KDEx.yTrainPred
indicesPerBin = LS_KDEx.indicesPerBin
lowerBoundPerBin = LS_KDEx.lowerBoundPerBin

# Computed once instead of rebuilding the key list in every iteration
maxBinIndex = max(indicesPerBin.keys())

for binIndex, indices in indicesPerBin.items():

    minValue = yPred[indices].min()
    maxValue = yPred[indices].max()

    assert minValue >= lowerBoundPerBin.loc[binIndex]

    if binIndex < maxBinIndex:
        assert maxValue < lowerBoundPerBin.loc[binIndex + 1]

### getWeights

#### Standard Settings

In [None]:
# Weights-Output Test

LS_KDEx = LevelSetKDEx(estimator=LGBM, binSize=100)
LS_KDEx.fit(XTrain, yTrain)

#---

# Reference computation: assign every test prediction to the last bin whose
# lower bound does not exceed it and collect the train indices of that bin.
indicesPerBin = LS_KDEx.indicesPerBin
lowerBoundPerBin = LS_KDEx.lowerBoundPerBin

yPredTrain = LGBM.predict(XTrain)
yPredTest = LGBM.predict(XTest)

binPerPred = np.searchsorted(a=lowerBoundPerBin, v=yPredTest, side='right') - 1
indicesPerPred = [indicesPerBin[binIndex] for binIndex in binPerPred]

#---

# outputType 'all': one weight per training observation for every test row
weightsAll = LS_KDEx.getWeights(X = XTest, outputType = 'all')

# Check if every bin contains at least 100 observations
# (vectorised count instead of the element-wise builtin sum)
binSizesReal = [np.count_nonzero(weightsAll[i] > 0) for i in range(XTest.shape[0])]
assert (np.array(binSizesReal) >= 100).all()

test_eq(len(weightsAll), XTest.shape[0])

for i in range(len(weightsAll)):
    weights = weightsAll[i]

    # Weights are non-negative and sum up to 1
    assert np.all(weights >= 0)
    assert isclose(weights.sum(), 1)

    # Exactly the observations of the assigned bin carry positive weight
    test_eq(np.where(weights > 0)[0], np.sort(indicesPerPred[i]))

#---

# outputType 'onlyPositiveWeights': per test row the positive weights (entry 0)
# and the train indices they belong to (entry 1)
weightsOnlyPos = LS_KDEx.getWeights(X=XTest, outputType='onlyPositiveWeights')

# Check if every bin contains at least 100 observations
binSizesReal = [len(weightsOnlyPos[i][1]) for i in range(XTest.shape[0])]
assert np.all(np.array(binSizesReal) >= 100)

test_eq(len(weightsOnlyPos), XTest.shape[0])

for i in range(len(weightsOnlyPos)):
    weights, indices = weightsOnlyPos[i][0], weightsOnlyPos[i][1]

    assert (weights > 0).all()
    assert isclose(weights.sum(), 1)

    test_eq(indices, indicesPerPred[i])

    # A bin may only hold more than 100 observations if it contains the
    # highest train prediction or if the predictions at positions 99 and 100
    # of the bin are tied
    if len(indices) > 100:
        checkLastBin = yPredTrain[indices].max() == yPredTrain.max()
        checkBinExtension = yPredTrain[indices[100]] == yPredTrain[indices[99]]
        assert checkLastBin or checkBinExtension
        
#---

# outputType 'summarized': per test row the weights (entry 0) of the distinct
# target values (entry 1) found in the assigned bin
weightsSummarized = LS_KDEx.getWeights(X=XTest, outputType='summarized')

test_eq(len(weightsSummarized), XTest.shape[0])

for i in range(len(weightsSummarized)):
    weights, values = weightsSummarized[i][0], weightsSummarized[i][1]

    assert (weights > 0).all()
    assert isclose(weights.sum(), 1)

    # No value may appear twice, and the values have to be exactly the
    # distinct targets of the bin
    test_eq(len(values), len(np.unique(values)))
    test_eq(set(yTrain[indicesPerPred[i]]), set(values))
    
#---

# outputType 'cumulativeDistribution': per test row the cumulative
# probabilities (entry 0) and the sorted target values of the bin (entry 1)
weightsCumDistr = LS_KDEx.getWeights(X=XTest, outputType='cumulativeDistribution')

# Check if every bin contains at least 100 observations
binSizesReal = [len(weightsCumDistr[i][1]) for i in range(XTest.shape[0])]
assert np.all(np.array(binSizesReal) >= 100)

test_eq(len(weightsCumDistr), XTest.shape[0])

for i in range(len(weightsCumDistr)):
    cumProb, values = weightsCumDistr[i][0], weightsCumDistr[i][1]

    # Positive, sorted, ending at 1 and increasing in equal steps
    assert (cumProb > 0).all()
    assert isclose(cumProb.max(), 1)
    test_eq(cumProb, np.sort(cumProb))
    assert np.allclose(np.diff(cumProb), np.diff(cumProb)[0])

    test_eq(values, np.sort(yTrain[indicesPerPred[i]]))

#---

# outputType 'cumulativeDistributionSummarized': per test row the cumulative
# probabilities (entry 0) of the distinct target values of the bin (entry 1)
weightsDistrSummarized = LS_KDEx.getWeights(X=XTest, outputType='cumulativeDistributionSummarized')

test_eq(len(weightsDistrSummarized), XTest.shape[0])

for i in range(len(weightsDistrSummarized)):
    cumProb, values = weightsDistrSummarized[i][0], weightsDistrSummarized[i][1]

    # Positive, sorted and ending at 1
    assert (cumProb > 0).all()
    assert isclose(cumProb.max(), 1)
    test_eq(cumProb, np.sort(cumProb))

    # No value may appear twice, and the values have to be exactly the
    # distinct targets of the bin
    test_eq(len(values), len(np.unique(values)))
    test_eq(set(values), set(np.sort(yTrain[indicesPerPred[i]])))
    

#### Distance Based Weights

In [None]:
# Weights-Output Test (weightsByDistance = True)

# Modifying XTrain to enforce test-predictions being identical to train predictions:
# the first two test observations are prepended to the training data, so they occupy
# the training indices 0 and 1 of XTrainMod / yTrainMod.
XTrainMod = np.concatenate([XTest[0:2, :], XTrain], axis = 0)
yTrainMod = np.concatenate([yTest[0:2], yTrain], axis = 0)

LS_KDEx = LevelSetKDEx(estimator = LGBM, binSize = 100, weightsByDistance = True)
LS_KDEx.fit(XTrainMod, yTrainMod)

#---

# Rebuild by hand which training indices share a bin with every test prediction:
# a prediction belongs to the last bin whose lower bound is not above it.
indicesPerBin = LS_KDEx.indicesPerBin
lowerBoundPerBin = LS_KDEx.lowerBoundPerBin
yPredTrain = LGBM.predict(XTrainMod)
yPredTest = LGBM.predict(XTest)
binPerPred = np.searchsorted(a = lowerBoundPerBin, v = yPredTest, side = 'right') - 1
indicesPerPred = [indicesPerBin[binIndex] for binIndex in binPerPred]

# Absolute distance between each test prediction and the train predictions of its bin
predDistances = [np.abs(yPredTrain[indicesPerPred[i]] - yPredTest[i]) for i in range(XTest.shape[0])]

#---

weightsList = LS_KDEx.getWeights(X = XTest, outputType = 'onlyPositiveWeights')

# Every test observation must either receive at least 100 positive weights or,
# if it receives fewer, all of its weights have to be equal.
binSizesReal = [len(weightsList[i][1]) for i in range(XTest.shape[0])]

for i in range(len(binSizesReal)):
    if binSizesReal[i] < 100:
        assert np.allclose(weightsList[i][0], 1 / len(weightsList[i][0]))

# Because of our above modification of XTrain and yTrain, for the first and second test observation
# the special case applies where the test prediction is identical to at least 1 train prediction.
# Each observation is compared against its OWN number of positive weights.
assert np.allclose(weightsList[0][0], 1 / len(weightsList[0][0]))
assert np.allclose(weightsList[1][0], 1 / len(weightsList[1][0]))

# The prepended copies (training indices 0 and 1) must be among the weighted observations
assert 0 in weightsList[0][1]
assert 1 in weightsList[1][1]

#---

# outputType = 'all': one weight vector per test observation, indexed by training observation
weightsAll = LS_KDEx.getWeights(X = XTest, outputType = 'all')

test_eq(len(weightsAll), XTest.shape[0])

for i in range(len(weightsAll)):
    neighbors = indicesPerPred[i]
    distances = predDistances[i]
    weights = weightsAll[i]

    assert all(weights >= 0)
    assert isclose(weights.sum(), 1)

    predDistanceCloseZero = np.isclose(distances, 0)

    if np.any(predDistanceCloseZero):
        # Special case: only the train observations whose prediction coincides with the
        # test prediction receive weight, and they share it equally.
        neighborsPredDistanceZero = neighbors[np.where(predDistanceCloseZero)[0]]
        assert np.allclose(weights[neighborsPredDistanceZero], 1 / sum(predDistanceCloseZero))
        test_eq(np.sort(neighborsPredDistanceZero), np.sort(np.where(weights > 0)[0]))

    else:
        # Regular case: weights are the normalized inverse prediction distances.
        # The comparison result has to be asserted, otherwise the check is a no-op.
        inverseDistances = 1 / distances
        assert np.allclose(np.sort(weights[neighbors]), np.sort(inverseDistances / sum(inverseDistances)))
        test_eq(np.sort(neighbors), np.sort(np.where(weights > 0)[0]))

#---

# outputType = 'onlyPositiveWeights': per test observation a pair (weights, training indices)
weightsOnlyPos = LS_KDEx.getWeights(X = XTest, outputType = 'onlyPositiveWeights')

test_eq(len(weightsOnlyPos), XTest.shape[0])

for i, entry in enumerate(weightsOnlyPos):
    weights, indices = entry[0], entry[1]
    neighbors = indicesPerPred[i]
    distances = predDistances[i]

    assert np.all(weights > 0)
    assert isclose(weights.sum(), 1)

    predDistanceCloseZero = np.isclose(distances, 0)

    if predDistanceCloseZero.any():
        # Only the zero-distance neighbors are weighted, all of them equally
        neighborsPredDistanceZero = neighbors[predDistanceCloseZero]
        assert np.allclose(weights, 1 / sum(predDistanceCloseZero))
        test_eq(np.sort(neighborsPredDistanceZero), np.sort(indices))

    else:
        # Normalized inverse distances, in the same order as the bin's indices
        inverseDistances = 1 / distances
        assert np.allclose(weights, inverseDistances / sum(inverseDistances))
        test_eq(np.sort(neighbors), np.sort(indices))

    # More than 100 weighted observations are only acceptable for the last bin
    # or when the 100th and 101st train prediction are tied.
    if len(indices) > 100:
        checkLastBin = yPredTrain.max() == yPredTrain[indices].max()
        checkBinExtension = yPredTrain[indices[99]] == yPredTrain[indices[100]]
        assert checkLastBin or checkBinExtension

#---

# outputType = 'summarized': per test observation a pair (weights, unique y-values)
weightsSummarized = LS_KDEx.getWeights(X = XTest, outputType = 'summarized')

test_eq(len(weightsSummarized), XTest.shape[0])

for i, entry in enumerate(weightsSummarized):
    weights, values = entry[0], entry[1]
    neighbors = indicesPerPred[i]
    distances = predDistances[i]

    assert np.all(weights > 0)
    assert isclose(weights.sum(), 1)

    predDistanceCloseZero = np.isclose(distances, 0)

    # With zero-distance neighbors only their y-values may appear, otherwise the whole bin's
    if predDistanceCloseZero.any():
        neighborsPredDistanceZero = neighbors[predDistanceCloseZero]
        valuesByHand = yTrainMod[neighborsPredDistanceZero]
    else:
        valuesByHand = yTrainMod[neighbors]

    test_eq(len(values), len(np.unique(values)))
    test_eq(set(valuesByHand), set(values))
    
#---

# outputType = 'cumulativeDistribution': per test observation a pair
# (cumulated probabilities, sorted y-values)
weightsCumDistr = LS_KDEx.getWeights(X = XTest, outputType = 'cumulativeDistribution')

test_eq(len(weightsCumDistr), XTest.shape[0])

for i, entry in enumerate(weightsCumDistr):
    cumProb, values = entry[0], entry[1]
    neighbors = indicesPerPred[i]
    distances = predDistances[i]

    assert np.all(cumProb > 0)
    assert isclose(cumProb.max(), 1)
    test_eq(cumProb, np.sort(cumProb))

    predDistanceCloseZero = np.isclose(distances, 0)

    if predDistanceCloseZero.any():
        # Equal weights on the zero-distance neighbors -> equidistant cumulated probabilities
        nCloseZero = sum(predDistanceCloseZero)
        neighborsPredDistanceZero = neighbors[predDistanceCloseZero]
        valuesByHand = yTrainMod[neighborsPredDistanceZero]
        assert np.allclose(cumProb, np.cumsum(np.full(nCloseZero, 1 / nCloseZero)))

    else:
        # The y-values are deliberately grabbed via 'weightsOnlyPos[i][1]': the original note states
        # that getWeights does the same. Grabbing them differently can change the sort order among
        # identical y-values, which changes the cumulated probabilities at exactly those positions.
        # That is irrelevant for using the distribution function, but it matters for the exact
        # comparison done here.
        inverseDistances = 1 / distances
        normalizedWeights = inverseDistances / sum(inverseDistances)
        valuesByHand = yTrainMod[weightsOnlyPos[i][1]]
        valuesByHandIndicesSort = np.argsort(valuesByHand)
        assert np.allclose(cumProb, np.cumsum(normalizedWeights[valuesByHandIndicesSort]))

    test_eq(values, np.sort(values))
    test_eq(np.sort(valuesByHand), values)

#---

# outputType = 'cumulativeDistributionSummarized': per test observation a pair
# (cumulated probabilities, sorted unique y-values)
weightsDistrSummarized = LS_KDEx.getWeights(X = XTest, outputType = 'cumulativeDistributionSummarized')

test_eq(len(weightsDistrSummarized), XTest.shape[0])

for i in range(len(weightsDistrSummarized)):
    neighbors = indicesPerPred[i]
    distances = predDistances[i]
    cumProb = weightsDistrSummarized[i][0]
    values = weightsDistrSummarized[i][1]

    assert all(cumProb > 0)
    assert isclose(cumProb.max(), 1)
    test_eq(cumProb, np.sort(cumProb))

    predDistanceCloseZero = np.isclose(distances, 0)

    # Only the set of y-values is compared here, so no inverse distances are needed:
    # with zero-distance neighbors only their y-values may appear, otherwise the whole bin's.
    if np.any(predDistanceCloseZero):
        neighborsPredDistanceZero = neighbors[np.where(predDistanceCloseZero)[0]]
        valuesByHand = yTrainMod[neighborsPredDistanceZero]

    else:
        valuesByHand = yTrainMod[neighbors]

    test_eq(len(values), len(np.unique(values)))
    test_eq(values, np.sort(values))
    test_eq(set(valuesByHand), set(values))
    

#### ScalingList

In [None]:
# Testing scalingList: the returned y-values of test observation i are expected
# to be the bin's training y-values multiplied by scalingList[i].

LS_KDEx = LevelSetKDEx(estimator = LGBM, binSize = 100)
LS_KDEx.fit(XTrain, yTrain)

#---

# Training indices sharing a bin with each test prediction, rebuilt by hand
indicesPerBin = LS_KDEx.indicesPerBin
lowerBoundPerBin = LS_KDEx.lowerBoundPerBin
yPredTest = LGBM.predict(XTest)
binPerPred = np.searchsorted(a = lowerBoundPerBin, v = yPredTest, side = 'right') - 1
indicesPerPred = [indicesPerBin[binIndex] for binIndex in binPerPred]

#---

weightsSummarized = LS_KDEx.getWeights(X = XTest, outputType = 'summarized', scalingList = scalingList)

test_eq(len(weightsSummarized), XTest.shape[0])

for i, entry in enumerate(weightsSummarized):
    weights, values = entry[0], entry[1]
    scaledValuesByHand = yTrain[indicesPerPred[i]] * scalingList[i]

    test_eq(len(values), len(np.unique(values)))
    test_eq(set(scaledValuesByHand), set(values))

#---

weightsCumDistr = LS_KDEx.getWeights(X = XTest, outputType = 'cumulativeDistribution', scalingList = scalingList)

test_eq(len(weightsCumDistr), XTest.shape[0])

for i, entry in enumerate(weightsCumDistr):
    cumProb, values = entry[0], entry[1]
    sortedValuesByHand = np.sort(yTrain[indicesPerPred[i]])

    test_eq(values, sortedValuesByHand * scalingList[i])

#---

weightsDistrSummarized = LS_KDEx.getWeights(X = XTest, outputType = 'cumulativeDistributionSummarized', scalingList = scalingList)

test_eq(len(weightsDistrSummarized), XTest.shape[0])

for i, entry in enumerate(weightsDistrSummarized):
    cumProb, values = entry[0], entry[1]
    sortedValuesByHand = np.sort(yTrain[indicesPerPred[i]])

    test_eq(len(values), len(np.unique(values)))
    test_eq(set(values), set(sortedValuesByHand * scalingList[i]))
    

#### predict

In [None]:
# Testing predict-method: quantile predictions are compared with quantiles
# computed by hand from the training y-values of each test prediction's bin.
LS_KDEx = LevelSetKDEx(estimator = LGBM, binSize = 100)
LS_KDEx.fit(XTrain, yTrain)

#---

lowerBoundPerBin = LS_KDEx.lowerBoundPerBin
indicesPerBin = LS_KDEx.indicesPerBin
yPredTest = LGBM.predict(XTest)
binPerPred = np.searchsorted(a = lowerBoundPerBin, v = yPredTest, side = 'right') - 1
indicesPerPred = [indicesPerBin[binIndex] for binIndex in binPerPred]
yTrainPerPred = [yTrain[indices] for indices in indicesPerPred]

#---

probs = [0.001, 0.5, 0.999]
quantileDict = LS_KDEx.predict(X = XTest, probs = probs, outputAsDf = False, scalingList = None)
quantileDf = LS_KDEx.predict(X = XTest, probs = probs, outputAsDf = True, scalingList = None)

# Dict output and DataFrame output have to carry the same content
test_eq(pd.DataFrame(quantileDict), quantileDf)
test_eq(list(quantileDict.keys()), probs)

for i in range(quantileDf.shape[0]):
    yTrainInBin = yTrainPerPred[i]

    # Quantiles must be non-decreasing in the probability
    assert np.all(np.diff(quantileDf.iloc[i, :]) >= 0)

    # The extreme probabilities are expected to hit the bin's minimum and maximum
    test_eq(yTrainInBin.min(), quantileDf.loc[i, 0.001])
    test_eq(yTrainInBin.max(), quantileDf.loc[i, 0.999])

    medianByHand = np.quantile(a = yTrainInBin, q = 0.5, method = 'inverted_cdf')
    test_eq(medianByHand, quantileDf.loc[i, 0.5])

### Set and Get Parameters

In [None]:
#| hide
LS_KDEx = LevelSetKDEx(estimator = LGBM, binSize = 100, weightsByDistance = False)
LS_KDEx.fit(XTrain, yTrain)

#---

# get_params has to expose all constructor arguments
assert {'binSize', 'estimator', 'weightsByDistance'} <= set(LS_KDEx.get_params())

#---

# Swapping in an estimator with a huge min_child_samples: after refitting,
# all point predictions are expected to collapse to a single value.
LS_KDEx.set_params(estimator = LGBMRegressor(min_child_samples = 10000))
LS_KDEx.fit(X = XTrain, y = yTrain)
pointPredsTest = LS_KDEx.estimator.predict(XTest)
test_eq(np.unique(pointPredsTest).size, 1)

# Back to the original estimator with binSize = 3000: all median predictions
# are expected to be identical.
LS_KDEx.set_params(estimator = LGBM, binSize = 3000)
LS_KDEx.fit(X = XTrain, y = yTrain)
medianPredsTest = LS_KDEx.predict(XTest, probs = [0.5]).values.flatten()
test_eq(np.unique(medianPredsTest).size, 1)

### Point Estimator Fit and Predict

In [None]:
# Scenario 1: 'fit' has to fit the wrapped point estimator
LGBM2 = LGBMRegressor(n_jobs = 1, max_depth = 2)
LS_KDEx = LevelSetKDEx(estimator = LGBM2, binSize = 100, weightsByDistance = False)

assert not ('fitted_' in dir(LS_KDEx.estimator))
LS_KDEx.fit(XTrain, yTrain)
assert LS_KDEx.estimator.fitted_

# Check whether original estimator has been unintentionally modified
assert not ('fitted_' in dir(LGBM2))

#---

# Scenario 2: 'refitPointEstimator' has to fit the wrapped point estimator as well
LGBM2 = LGBMRegressor(n_jobs = 1, max_depth = 2)
LS_KDEx = LevelSetKDEx(estimator = LGBM2, binSize = 100, weightsByDistance = False)

assert not ('fitted_' in dir(LS_KDEx.estimator))
LS_KDEx.refitPointEstimator(XTrain, yTrain)
assert LS_KDEx.estimator.fitted_

# Check whether original estimator has been unintentionally modified
assert not ('fitted_' in dir(LGBM2))

#---

# Scenario 3: estimator replaced via 'set_params' before 'refitPointEstimator';
# with min_child_samples = 10000 all point predictions are expected to be identical.
LGBM2 = LGBMRegressor(n_jobs = 1, max_depth = 2)
LS_KDEx = LevelSetKDEx(estimator = LGBM2, binSize = 100, weightsByDistance = False)
LS_KDEx.set_params(estimator = LGBMRegressor(min_child_samples = 10000))

assert not ('fitted_' in dir(LS_KDEx.estimator))
LS_KDEx.refitPointEstimator(XTrain, yTrain)
assert LS_KDEx.estimator.fitted_
test_eq(np.unique(LS_KDEx.estimator.predict(XTest)).size, 1)
test_eq(np.unique(LS_KDEx.pointPredict(XTest)).size, 1)

# Check whether original estimator has been unintentionally modified
assert not ('fitted_' in dir(LGBM2))

## LS_KDEx_kNN

### Standard Attributes

In [None]:
LS_KDEx_kNN = LevelSetKDEx_kNN(estimator = LGBM, binSize = 100)
LS_KDEx_kNN.fit(XTrain, yTrain)

# The stored targets and train predictions must match the inputs / the estimator's output
yPredTrainByHand = LGBM.predict(XTrain)
test_eq(LS_KDEx_kNN.yTrain, yTrain)
test_eq(LS_KDEx_kNN.yTrainPred, yPredTrainByHand)

# The fitted-flag has to be set after calling fit
assert LS_KDEx_kNN.fitted 

### getWeights

#### Standard Settings

In [None]:
# Weights-Output Test (weightsByDistance = False)
LS_KDEx_kNN = LevelSetKDEx_kNN(estimator = LGBM, binSize = 100, weightsByDistance = False)
LS_KDEx_kNN.fit(XTrain, yTrain)

nn = LS_KDEx_kNN.nearestNeighborsOnPreds
yPredTest = LGBM.predict(XTest)
# Column vector of predictions for 'kneighbors'. The ndarray method is used because the
# 'newshape' keyword of np.reshape is deprecated in recent NumPy releases.
yPredTest_reshaped = yPredTest.reshape(-1, 1)

weightsList = LS_KDEx_kNN.getWeights(X = XTest, 
                                     outputType = 'onlyPositiveWeights')

# Number of positively weighted train observations per test observation. Querying the
# largest of them as n_neighbors lets the checks further down slice the matrices per row.
binSizesReal = [len(weightsList[i][1]) for i in range(XTest.shape[0])]
distancesMatrix, neighborsMatrix = nn.kneighbors(X = yPredTest_reshaped, n_neighbors = max(binSizesReal))

# Check if all bins contain at least 100 observations
assert np.all((np.array(binSizesReal) >= 100))

#---

# outputType = 'all': one weight vector per test observation, indexed by training observation
weightsAll = LS_KDEx_kNN.getWeights(X = XTest, outputType = 'all')

test_eq(len(weightsAll), XTest.shape[0])

for i, weights in enumerate(weightsAll):
    positiveMask = weights > 0
    neighborsByHand = neighborsMatrix[i, 0:binSizesReal[i]]

    assert np.all(weights >= 0)
    assert isclose(weights.sum(), 1)
    # Without distance weighting all positive weights are equal
    assert np.allclose(weights[positiveMask], 1 / binSizesReal[i])

    test_eq(set(neighborsByHand), set(np.where(positiveMask)[0]))

#---

# outputType = 'onlyPositiveWeights': per test observation a pair (weights, training indices)
weightsOnlyPos = LS_KDEx_kNN.getWeights(X = XTest, outputType = 'onlyPositiveWeights')

test_eq(len(weightsOnlyPos), XTest.shape[0])

for i, entry in enumerate(weightsOnlyPos):
    weights, indices = entry[0], entry[1]
    neighborsByHand = neighborsMatrix[i, 0:binSizesReal[i]]

    assert np.all(weights > 0)
    assert isclose(weights.sum(), 1)
    assert np.allclose(weights, 1 / binSizesReal[i])

    test_eq(set(neighborsByHand), set(indices))

    # More than 100 neighbors are only acceptable if all neighbors from the
    # 100th onwards are at the same distance (ties).
    if len(indices) > 100:
        tiedDistances = distancesMatrix[i, 99:binSizesReal[i]]
        assert np.allclose(np.diff(tiedDistances), 0)
    
#---

# outputType = 'summarized': per test observation a pair (weights, unique y-values)
weightsSummarized = LS_KDEx_kNN.getWeights(X = XTest, outputType = 'summarized')

test_eq(len(weightsSummarized), XTest.shape[0])

for i, entry in enumerate(weightsSummarized):
    weights, values = entry[0], entry[1]
    valuesByHand = yTrain[neighborsMatrix[i, 0:binSizesReal[i]]]

    assert np.all(weights > 0)
    assert isclose(weights.sum(), 1)
    # Every unique value carries at least the weight of a single neighbor
    assert np.all(weights >= 1 / binSizesReal[i])

    test_eq(len(values), len(np.unique(values)))
    test_eq(set(valuesByHand), set(values))

#---

# outputType = 'cumulativeDistribution': per test observation a pair
# (cumulated probabilities, sorted y-values)
weightsCumDistr = LS_KDEx_kNN.getWeights(X = XTest, outputType = 'cumulativeDistribution')

test_eq(len(weightsCumDistr), XTest.shape[0])

for i, entry in enumerate(weightsCumDistr):
    cumProb, values = entry[0], entry[1]
    valuesByHand = yTrain[neighborsMatrix[i, 0:binSizesReal[i]]]

    assert np.all(cumProb > 0)
    assert isclose(cumProb.max(), 1)
    test_eq(cumProb, np.sort(cumProb))
    # Equal weights -> equidistant steps of the cumulated probabilities
    probSteps = np.diff(cumProb)
    assert np.allclose(probSteps, probSteps[0])

    test_eq(values, np.sort(values))
    test_eq(np.sort(valuesByHand), values)

#---

# outputType = 'cumulativeDistributionSummarized': per test observation a pair
# (cumulated probabilities, sorted unique y-values)
weightsDistrSummarized = LS_KDEx_kNN.getWeights(X = XTest, outputType = 'cumulativeDistributionSummarized')

test_eq(len(weightsDistrSummarized), XTest.shape[0])

for i, entry in enumerate(weightsDistrSummarized):
    cumProb, values = entry[0], entry[1]
    valuesByHand = yTrain[neighborsMatrix[i, 0:binSizesReal[i]]]

    assert np.all(cumProb > 0)
    assert isclose(cumProb.max(), 1)
    test_eq(cumProb, np.sort(cumProb))

    test_eq(len(values), len(np.unique(values)))
    test_eq(values, np.sort(values))
    test_eq(set(valuesByHand), set(values))
    

#### Distance Based Weights

In [None]:
# Weights-Output Test (weightsByDistance = True)

# Modifying XTrain to enforce test-predictions being identical to train predictions:
# the first two test observations are prepended to the training data, so they occupy
# the training indices 0 and 1 of XTrainMod / yTrainMod.
XTrainMod = np.concatenate([XTest[0:2, :], XTrain], axis = 0)
yTrainMod = np.concatenate([yTest[0:2], yTrain], axis = 0)

LS_KDEx_kNN = LevelSetKDEx_kNN(estimator = LGBM, binSize = 100, weightsByDistance = True)
LS_KDEx_kNN.fit(XTrainMod, yTrainMod)

#---

nn = LS_KDEx_kNN.nearestNeighborsOnPreds
yPredTest = LGBM.predict(XTest)
# Column vector of predictions for 'kneighbors'. The ndarray method is used because the
# 'newshape' keyword of np.reshape is deprecated in recent NumPy releases.
yPredTest_reshaped = yPredTest.reshape(-1, 1)

weightsList = LS_KDEx_kNN.getWeights(X = XTest,  
                                     outputType = 'onlyPositiveWeights')

#---

# Number of positively weighted train observations per test observation. Querying the
# largest of them as n_neighbors lets the checks further down slice the matrices per row.
binSizesReal = [len(weightsList[i][1]) for i in range(XTest.shape[0])]
distancesMatrix, neighborsMatrix = nn.kneighbors(X = yPredTest_reshaped, n_neighbors = max(binSizesReal))

# Every test observation must either receive at least 100 positive weights or,
# if it receives fewer, all of its weights have to be equal.
for i in range(len(binSizesReal)):
    if binSizesReal[i] < 100:
        assert np.allclose(weightsList[i][0], 1 / len(weightsList[i][0]))

# Because of our above modification of XTrain and yTrain, for the first and second test observation
# the special case applies where the test prediction is identical to at least 1 train prediction.
# Each observation is compared against its OWN number of positive weights.
assert np.allclose(weightsList[0][0], 1 / len(weightsList[0][0]))
assert np.allclose(weightsList[1][0], 1 / len(weightsList[1][0]))

# The prepended copies (training indices 0 and 1) must be among the weighted observations
assert 0 in weightsList[0][1]
assert 1 in weightsList[1][1]

#---

# outputType = 'all': one weight vector per test observation, indexed by training observation
weightsAll = LS_KDEx_kNN.getWeights(X = XTest, outputType = 'all')

test_eq(len(weightsAll), XTest.shape[0])

for i in range(len(weightsAll)):
    neighbors = neighborsMatrix[i, 0:binSizesReal[i]]
    distances = distancesMatrix[i, 0:binSizesReal[i]]
    weights = weightsAll[i]

    assert all(weights >= 0)
    assert isclose(weights.sum(), 1)

    predDistanceCloseZero = np.isclose(distances, 0)

    if np.any(predDistanceCloseZero):
        # Special case: only the train observations whose prediction coincides with the
        # test prediction receive weight, and they share it equally.
        neighborsPredDistanceZero = neighbors[np.where(predDistanceCloseZero)[0]]
        assert np.allclose(weights[neighborsPredDistanceZero], 1 / sum(predDistanceCloseZero))
        test_eq(np.sort(neighborsPredDistanceZero), np.sort(np.where(weights > 0)[0]))

    else:
        # Regular case: weights are the normalized inverse prediction distances.
        # The comparison result has to be asserted, otherwise the check is a no-op.
        inverseDistances = 1 / distances
        assert np.allclose(np.sort(weights[neighbors]), np.sort(inverseDistances / sum(inverseDistances)))
        test_eq(np.sort(neighbors), np.sort(np.where(weights > 0)[0]))

#---

# outputType = 'onlyPositiveWeights': per test observation a pair (weights, training indices)
weightsOnlyPos = LS_KDEx_kNN.getWeights(X = XTest, outputType = 'onlyPositiveWeights')

test_eq(len(weightsOnlyPos), XTest.shape[0])

for i, entry in enumerate(weightsOnlyPos):
    weights, indices = entry[0], entry[1]
    neighbors = neighborsMatrix[i, 0:binSizesReal[i]]
    distances = distancesMatrix[i, 0:binSizesReal[i]]

    assert np.all(weights > 0)
    assert isclose(weights.sum(), 1)

    predDistanceCloseZero = np.isclose(distances, 0)

    if predDistanceCloseZero.any():
        # Only the zero-distance neighbors are weighted, all of them equally
        neighborsPredDistanceZero = neighbors[predDistanceCloseZero]
        assert np.allclose(weights, 1 / sum(predDistanceCloseZero))
        test_eq(np.sort(neighborsPredDistanceZero), np.sort(indices))

    else:
        # Normalized inverse distances, in the same order as the neighbors
        inverseDistances = 1 / distances
        assert np.allclose(weights, inverseDistances / sum(inverseDistances))
        test_eq(np.sort(neighbors), np.sort(indices))

    # More than 100 neighbors are only acceptable if all neighbors from the
    # 100th onwards are at the same distance (ties).
    if len(indices) > 100:
        tiedDistances = distancesMatrix[i, 99:binSizesReal[i]]
        assert np.allclose(np.diff(tiedDistances), 0)

#---

# outputType = 'summarized': per test observation a pair (weights, unique y-values)
weightsSummarized = LS_KDEx_kNN.getWeights(X = XTest, outputType = 'summarized')

test_eq(len(weightsSummarized), XTest.shape[0])

for i, entry in enumerate(weightsSummarized):
    weights, values = entry[0], entry[1]
    neighbors = neighborsMatrix[i, 0:binSizesReal[i]]
    distances = distancesMatrix[i, 0:binSizesReal[i]]

    assert np.all(weights > 0)
    assert isclose(weights.sum(), 1)

    predDistanceCloseZero = np.isclose(distances, 0)

    # With zero-distance neighbors only their y-values may appear, otherwise all neighbors'
    if predDistanceCloseZero.any():
        neighborsPredDistanceZero = neighbors[predDistanceCloseZero]
        valuesByHand = yTrainMod[neighborsPredDistanceZero]
    else:
        valuesByHand = yTrainMod[neighborsMatrix[i, 0:binSizesReal[i]]]

    test_eq(len(values), len(np.unique(values)))
    test_eq(set(valuesByHand), set(values))
    
#---

# outputType = 'cumulativeDistribution': per test observation a pair
# (cumulated probabilities, sorted y-values)
weightsCumDistr = LS_KDEx_kNN.getWeights(X = XTest, outputType = 'cumulativeDistribution')

test_eq(len(weightsCumDistr), XTest.shape[0])

for i, entry in enumerate(weightsCumDistr):
    cumProb, values = entry[0], entry[1]
    neighbors = neighborsMatrix[i, 0:binSizesReal[i]]
    distances = distancesMatrix[i, 0:binSizesReal[i]]

    assert np.all(cumProb > 0)
    assert isclose(cumProb.max(), 1)
    test_eq(cumProb, np.sort(cumProb))

    predDistanceCloseZero = np.isclose(distances, 0)

    if predDistanceCloseZero.any():
        # Equal weights on the zero-distance neighbors -> equidistant cumulated probabilities
        nCloseZero = sum(predDistanceCloseZero)
        neighborsPredDistanceZero = neighbors[predDistanceCloseZero]
        valuesByHand = yTrainMod[neighborsPredDistanceZero]
        assert np.allclose(cumProb, np.cumsum(np.full(nCloseZero, 1 / nCloseZero)))

    else:
        # The y-values are deliberately grabbed via 'weightsOnlyPos[i][1]': the original note states
        # that getWeights does the same. Grabbing them differently (e.g. via neighborsMatrix) can
        # change the sort order among identical y-values, which changes the cumulated probabilities
        # at exactly those positions. That is irrelevant for using the distribution function, but
        # it matters for the exact comparison done here.
        inverseDistances = 1 / distances
        normalizedWeights = inverseDistances / sum(inverseDistances)
        valuesByHand = yTrainMod[weightsOnlyPos[i][1]]
        valuesByHandIndicesSort = np.argsort(valuesByHand)
        assert np.allclose(cumProb, np.cumsum(normalizedWeights[valuesByHandIndicesSort]))

    test_eq(values, np.sort(values))
    test_eq(np.sort(valuesByHand), values)

#---

# outputType = 'cumulativeDistributionSummarized': per test observation a pair
# (cumulated probabilities, sorted unique y-values)
weightsDistrSummarized = LS_KDEx_kNN.getWeights(X = XTest, outputType = 'cumulativeDistributionSummarized')

test_eq(len(weightsDistrSummarized), XTest.shape[0])

for i in range(len(weightsDistrSummarized)):
    neighbors = neighborsMatrix[i, 0:binSizesReal[i]]
    distances = distancesMatrix[i, 0:binSizesReal[i]]
    cumProb = weightsDistrSummarized[i][0]
    values = weightsDistrSummarized[i][1]

    assert all(cumProb > 0)
    assert isclose(cumProb.max(), 1)
    test_eq(cumProb, np.sort(cumProb))

    predDistanceCloseZero = np.isclose(distances, 0)

    # Only the set of y-values is compared here, so no inverse distances are needed:
    # with zero-distance neighbors only their y-values may appear, otherwise all neighbors'.
    if np.any(predDistanceCloseZero):
        neighborsPredDistanceZero = neighbors[np.where(predDistanceCloseZero)[0]]
        valuesByHand = yTrainMod[neighborsPredDistanceZero]

    else:
        valuesByHand = yTrainMod[neighbors]

    test_eq(len(values), len(np.unique(values)))
    test_eq(values, np.sort(values))
    test_eq(set(valuesByHand), set(values))
    

#### Artificially Big Bins

In [None]:
# Enforcing bins with size bigger than binSize
binSize = 10

LS_KDEx_kNN = LevelSetKDEx_kNN(estimator = LGBM, binSize = binSize)

# Every training observation is stacked binSize + 1 times, so each train prediction
# occurs at least binSize + 1 times. This is done to provoke neighborhoods with more
# than binSize observations.
XTrainDuplicated = np.concatenate([XTrain] * (binSize + 1), axis = 0)
yTrainDuplicated = np.concatenate([yTrain] * (binSize + 1), axis = 0)

LS_KDEx_kNN.fit(XTrainDuplicated, yTrainDuplicated)

#---

nn = LS_KDEx_kNN.nearestNeighborsOnPreds
yPredTest = LGBM.predict(XTest)
# Column vector of predictions for 'kneighbors'. The ndarray method is used because the
# 'newshape' keyword of np.reshape is deprecated in recent NumPy releases.
yPredTest_reshaped = yPredTest.reshape(-1, 1)

# One neighbor more than binSize, to compare the binSize-th with the (binSize + 1)-th distance
distancesMatrix, neighborsMatrix = nn.kneighbors(X = yPredTest_reshaped, n_neighbors = binSize + 1)

#---

weightsOnlyPos = LS_KDEx_kNN.getWeights(X = XTest, outputType = 'onlyPositiveWeights')

test_eq(len(weightsOnlyPos), XTest.shape[0])

for i in range(len(weightsOnlyPos)):
    weights = weightsOnlyPos[i][0]
    indices = weightsOnlyPos[i][1]

    assert all(weights > 0)
    assert isclose(weights.sum(), 1)

    # The binSize nearest neighbors always have to be part of the weighted observations
    assert set(neighborsMatrix[i, 0:binSize]) <= set(indices)

    # A neighborhood bigger than binSize is only justified by a tie between the
    # binSize-th and the (binSize + 1)-th nearest neighbor. The threshold has to be
    # binSize itself (10 here); a hard-coded 100 would skip nearly every such case.
    if len(indices) > binSize:
        test_eq(distancesMatrix[i, binSize-1], distancesMatrix[i, binSize])

#### Bins with only 1 Unique Value

In [None]:
# Enforcing bins with only one unique value
binSize = 10

LS_KDEx_kNN = LevelSetKDEx_kNN(estimator = LGBM, binSize = binSize)

# Every training observation is stacked binSize times, so each (features, target)
# pair occurs at least binSize times in the duplicated training data.
XTrainDuplicated = np.concatenate([XTrain for _copy in range(binSize)], axis = 0)
yTrainDuplicated = np.concatenate([yTrain for _copy in range(binSize)], axis = 0)

LS_KDEx_kNN.fit(XTrainDuplicated, yTrainDuplicated)

#---

# NOTE: despite its name, 'weightsOnlyPos' holds the 'summarized' output here,
# i.e. per test observation a pair (weights, unique y-values).
weightsOnlyPos = LS_KDEx_kNN.getWeights(X = XTest, outputType = 'summarized')

test_eq(len(weightsOnlyPos), XTest.shape[0])

for i, entry in enumerate(weightsOnlyPos):
    weights, values = entry[0], entry[1]

    assert np.all(weights > 0)
    assert isclose(weights.sum(), 1)

    # Exactly one unique y-value is expected per test observation
    test_eq(len(values), 1)

#### ScalingList

In [None]:
# Testing scalingList: the returned y-values of test observation i are expected to
# contain the y-values of its binSize nearest neighbors multiplied by scalingList[i].
binSize = 20
LS_KDEx_kNN = LevelSetKDEx_kNN(estimator = LGBM, binSize = binSize)

#---

LS_KDEx_kNN.fit(XTrain, yTrain)
nn = LS_KDEx_kNN.nearestNeighborsOnPreds
yPredTest = LGBM.predict(XTest)
# Column vector of predictions for 'kneighbors'. The ndarray method is used because the
# 'newshape' keyword of np.reshape is deprecated in recent NumPy releases.
yPredTest_reshaped = yPredTest.reshape(-1, 1)

distancesMatrix, neighborsMatrix = nn.kneighbors(X = yPredTest_reshaped, n_neighbors = binSize)

#---

weightsSummarized = LS_KDEx_kNN.getWeights(X = XTest, outputType = 'summarized', scalingList = scalingList)

test_eq(len(weightsSummarized), XTest.shape[0])

for i in range(len(weightsSummarized)):
    weights = weightsSummarized[i][0]
    values = weightsSummarized[i][1]

    test_eq(len(values), len(np.unique(values)))
    # Subset instead of equality: only the binSize nearest neighbors are rebuilt by hand,
    # while the returned values may stem from more observations.
    assert set(yTrain[neighborsMatrix[i, :]] * scalingList[i]) <= set(values)

#---

weightsCumDistr = LS_KDEx_kNN.getWeights(X = XTest, outputType = 'cumulativeDistribution', scalingList = scalingList)

test_eq(len(weightsCumDistr), XTest.shape[0])

for i in range(len(weightsCumDistr)):
    cumProb = weightsCumDistr[i][0]
    values = weightsCumDistr[i][1]

    test_eq(values, np.sort(values))
    assert set(yTrain[neighborsMatrix[i, :]] * scalingList[i]) <= set(values)

#---

weightsDistrSummarized = LS_KDEx_kNN.getWeights(X = XTest, outputType = 'cumulativeDistributionSummarized', scalingList = scalingList)

test_eq(len(weightsDistrSummarized), XTest.shape[0])

for i in range(len(weightsDistrSummarized)):
    cumProb = weightsDistrSummarized[i][0]
    values = weightsDistrSummarized[i][1]

    test_eq(len(values), len(np.unique(values)))
    test_eq(values, np.sort(values))
    assert set(yTrain[neighborsMatrix[i, :]] * scalingList[i]) <= set(values)
    

### predict

In [None]:
# Testing predict-method: quantile predictions are compared with quantiles computed
# by hand from the training y-values of each test observation's neighbors.
binSize = 15

LS_KDEx_kNN = LevelSetKDEx_kNN(estimator = LGBM, binSize = binSize)

LS_KDEx_kNN.fit(XTrain, yTrain)
nn = LS_KDEx_kNN.nearestNeighborsOnPreds
yPredTest = LGBM.predict(XTest)
# Column vector of predictions for 'kneighbors'. The ndarray method is used because the
# 'newshape' keyword of np.reshape is deprecated in recent NumPy releases.
yPredTest_reshaped = yPredTest.reshape(-1, 1)

# Number of positively weighted train observations per test observation. Querying the
# largest of them as n_neighbors lets the loop below slice the neighbor matrix per row.
weightsList = LS_KDEx_kNN.getWeights(X = XTest, outputType = 'onlyPositiveWeights')
binSizesReal = [len(weightsList[i][1]) for i in range(XTest.shape[0])]
distancesMatrix, neighborsMatrix = nn.kneighbors(X = yPredTest_reshaped, n_neighbors = max(binSizesReal))

#---

probs = [0.001, 0.5, 0.999]
quantileDict = LS_KDEx_kNN.predict(X = XTest, probs = probs, outputAsDf = False, scalingList = None)
quantileDf = LS_KDEx_kNN.predict(X = XTest, probs = probs, outputAsDf = True, scalingList = None)

# Dict output and DataFrame output have to carry the same content
test_eq(pd.DataFrame(quantileDict), quantileDf)
test_eq(list(quantileDict.keys()), probs)

for i in range(quantileDf.shape[0]):

    # Quantiles must be non-decreasing in the probability
    assert((np.diff(quantileDf.iloc[i,:]) >= 0).all())

    binSizeReal = binSizesReal[i]
    valuesByHand = yTrain[neighborsMatrix[i, 0:binSizeReal]]

    # The extreme probabilities are expected to hit the neighborhood's minimum and maximum
    test_eq(valuesByHand.min(), quantileDf.loc[i, 0.001])
    test_eq(valuesByHand.max(), quantileDf.loc[i, 0.999])
    test_eq(np.quantile(a = valuesByHand, q = 0.5, method = 'inverted_cdf'), quantileDf.loc[i, 0.5])

### Set and Get Parameters

In [None]:
#| hide
LS_KDEx_kNN = LevelSetKDEx_kNN(estimator = LGBM, binSize = 100, weightsByDistance = False)
LS_KDEx_kNN.fit(XTrain, yTrain)

#---

# get_params has to expose all constructor arguments
assert {'binSize', 'estimator', 'weightsByDistance'} <= set(LS_KDEx_kNN.get_params())

#---

# Swapping in an estimator with a huge min_child_samples: after refitting,
# all point predictions are expected to collapse to a single value.
LS_KDEx_kNN.set_params(estimator = LGBMRegressor(min_child_samples = 10000))
LS_KDEx_kNN.fit(X = XTrain, y = yTrain)
pointPredsTest = LS_KDEx_kNN.estimator.predict(XTest)
test_eq(np.unique(pointPredsTest).size, 1)

# Back to the original estimator with binSize one below the number of training
# observations: all median predictions are expected to be identical.
LS_KDEx_kNN.set_params(estimator = LGBM, binSize = yTrain.shape[0] - 1)
LS_KDEx_kNN.fit(X = XTrain, y = yTrain)
medianPredsTest = LS_KDEx_kNN.predict(XTest, probs = [0.5]).values.flatten()
test_eq(np.unique(medianPredsTest).size, 1)

### Point Estimator Fit and Predict

In [None]:
# Scenario 1: 'fit' has to fit the wrapped point estimator
LGBM2 = LGBMRegressor(n_jobs = 1, max_depth = 2)
LS_KDEx_kNN = LevelSetKDEx_kNN(estimator = LGBM2, binSize = 100, weightsByDistance = False)

assert not ('fitted_' in dir(LS_KDEx_kNN.estimator))
LS_KDEx_kNN.fit(XTrain, yTrain)
assert LS_KDEx_kNN.estimator.fitted_

# Check whether original estimator has been unintentionally modified
assert not ('fitted_' in dir(LGBM2))

#---

# Scenario 2: 'refitPointEstimator' has to fit the wrapped point estimator as well
LGBM2 = LGBMRegressor(n_jobs = 1, max_depth = 2)
LS_KDEx_kNN = LevelSetKDEx_kNN(estimator = LGBM2, binSize = 100, weightsByDistance = False)

assert not ('fitted_' in dir(LS_KDEx_kNN.estimator))
LS_KDEx_kNN.refitPointEstimator(XTrain, yTrain)
assert LS_KDEx_kNN.estimator.fitted_

# Check whether original estimator has been unintentionally modified
assert not ('fitted_' in dir(LGBM2))

#---

# Scenario 3: estimator replaced via 'set_params' before 'refitPointEstimator';
# with min_child_samples = 10000 all point predictions are expected to be identical.
LGBM2 = LGBMRegressor(n_jobs = 1, max_depth = 2)
LS_KDEx_kNN = LevelSetKDEx_kNN(estimator = LGBM2, binSize = 100, weightsByDistance = False)
LS_KDEx_kNN.set_params(estimator = LGBMRegressor(min_child_samples = 10000))

assert not ('fitted_' in dir(LS_KDEx_kNN.estimator))
LS_KDEx_kNN.refitPointEstimator(XTrain, yTrain)
assert LS_KDEx_kNN.estimator.fitted_
test_eq(np.unique(LS_KDEx_kNN.estimator.predict(XTest)).size, 1)
test_eq(np.unique(LS_KDEx_kNN.pointPredict(XTest)).size, 1)

# Check whether original estimator has been unintentionally modified
assert not ('fitted_' in dir(LGBM2))

## Generate Bins

In [None]:
# Testing various artificial inputs of 'generateBins'

# Case 1: 100 distinct predictions 0, ..., 99 with binSize = 10
yPred = np.arange(100)
indicesPerBin, lowerBoundPerBin = generateBins(binSize = 10, yPred = yPred)

test_eq(list(indicesPerBin.keys()), list(range(10)))

# Expected content: bin b holds the ten consecutive indices 10*b, ..., 10*b + 9
indicesPerBinTest = {binIndex: np.arange(10 * binIndex, 10 * (binIndex + 1)) for binIndex in range(10)}

indicesTracker = list()
for i in range(len(indicesPerBin)):
    test_eq(set(indicesPerBin[i]), set(indicesPerBinTest[i]))
    test_eq(len(indicesPerBin[i]), len(np.unique(indicesPerBin[i])))

    indicesTracker.extend(indicesPerBin[i].tolist())

# The bins have to partition the indices: no index twice, no index missing
test_eq(len(indicesTracker), len(np.unique(indicesTracker)))
test_eq(np.sort(indicesTracker), np.arange(len(yPred)))

# '-np.inf' instead of the alias 'np.NINF', which was removed in NumPy 2.0
lowerBoundPerBinTest = [-np.inf, 9.5, 19.5, 29.5, 39.5, 49.5, 59.5, 69.5, 79.5, 89.5]
test_eq(list(lowerBoundPerBin), lowerBoundPerBinTest)
test_eq(list(lowerBoundPerBin.index), list(range(10)))

#---

# Case 2: every prediction value occurs twice (indices k and k + 100), binSize = 10.
# Expected: twenty bins, each holding both indices of five consecutive values.
yPred = np.append(np.arange(100), np.arange(100))
indicesPerBin, lowerBoundPerBin = generateBins(binSize = 10, yPred = yPred)

test_eq(list(indicesPerBin.keys()), list(range(20)))

indicesPerBinTest = {0: np.array([   0, 100,   1, 101,   2, 102,   3, 103,   4, 104]),
                     1: np.array([   5, 105,   6, 106,   7, 107,   8, 108,   9, 109]),
                     2: np.array([  10, 110,  11, 111,  12, 112,  13, 113,  14, 114]),
                     3: np.array([  15, 115,  16, 116,  17, 117,  18, 118,  19, 119]),
                     4: np.array([  20, 120,  21, 121,  22, 122,  23, 123,  24, 124]),
                     5: np.array([  25, 125,  26, 126,  27, 127,  28, 128,  29, 129]),
                     6: np.array([  30, 130,  31, 131,  32, 132,  33, 133,  34, 134]),
                     7: np.array([  35, 135,  36, 136,  37, 137,  38, 138,  39, 139]),
                     8: np.array([  40, 140,  41, 141,  42, 142,  43, 143,  44, 144]),
                     9: np.array([  45, 145,  46, 146,  47, 147,  48, 148,  49, 149]),
                     10: np.array([ 50, 150,  51, 151,  52, 152,  53, 153,  54, 154]),
                     11: np.array([ 55, 155,  56, 156,  57, 157,  58, 158,  59, 159]),
                     12: np.array([ 60, 160,  61, 161,  62, 162,  63, 163,  64, 164]),
                     13: np.array([ 65, 165,  66, 166,  67, 167,  68, 168,  69, 169]),
                     14: np.array([ 70, 170,  71, 171,  72, 172,  73, 173,  74, 174]),
                     15: np.array([ 75, 175,  76, 176,  77, 177,  78, 178,  79, 179]),
                     16: np.array([ 80, 180,  81, 181,  82, 182,  83, 183,  84, 184]),
                     17: np.array([ 85, 185,  86, 186,  87, 187,  88, 188,  89, 189]),
                     18: np.array([ 90, 190,  91, 191,  92, 192,  93, 193,  94, 194]),
                     19: np.array([ 95, 195,  96, 196,  97, 197,  98, 198,  99, 199])}

# Each bin must contain exactly the expected indices (order is irrelevant)
# and must not list any index twice.
indicesTracker = []
for i in range(len(indicesPerBin)):
    test_eq(set(indicesPerBin[i]), set(indicesPerBinTest[i]))
    test_eq(len(indicesPerBin[i]), len(np.unique(indicesPerBin[i])))

    indicesTracker.extend(indicesPerBin[i].tolist())

# Across all bins, every index of yPred must show up exactly once.
test_eq(len(indicesTracker), len(np.unique(indicesTracker)))
test_eq(np.sort(indicesTracker), np.arange(len(yPred)))

# Expected lower bounds: no finite bound for the first bin, then 4.5, 9.5, ..., 94.5.
# NOTE: '-np.inf' replaces the alias 'np.NINF', which was removed in NumPy 2.0.
lowerBoundPerBinTest = [-np.inf] + list(np.arange(4.5, 99.5, 5))
test_eq(list(lowerBoundPerBin), lowerBoundPerBinTest)
test_eq(list(lowerBoundPerBin.index), list(range(20)))

#---

# Check if creation of last bin works correctly
# Case 3: ten values that each occur twice (indices k and k + 10), binSize = 5.
# Expected: three bins holding 6, 6 and 8 indices, i.e. the last bin is the largest.
yPred = np.append(np.arange(10), np.arange(10))
indicesPerBin, lowerBoundPerBin = generateBins(binSize = 5, yPred = yPred)

test_eq(list(indicesPerBin.keys()), list(range(3)))

indicesPerBinTest = {0: np.array([ 0, 10,  1, 11,  2, 12]),
                     1: np.array([ 3, 13,  4, 14,  5, 15]),
                     2: np.array([ 6, 16,  7, 17,  8, 18,  9, 19])}

# Each bin must contain exactly the expected indices (order is irrelevant)
# and must not list any index twice.
indicesTracker = []
for i in range(len(indicesPerBin)):
    test_eq(set(indicesPerBin[i]), set(indicesPerBinTest[i]))
    test_eq(len(indicesPerBin[i]), len(np.unique(indicesPerBin[i])))

    indicesTracker.extend(indicesPerBin[i].tolist())

# Across all bins, every index of yPred must show up exactly once.
test_eq(len(indicesTracker), len(np.unique(indicesTracker)))
test_eq(np.sort(indicesTracker), np.arange(len(yPred)))

# The first bin is expected to have no finite lower bound.
# NOTE: '-np.inf' replaces the alias 'np.NINF', which was removed in NumPy 2.0.
lowerBoundPerBinTest = [-np.inf, 2.5, 5.5]
test_eq(list(lowerBoundPerBin), lowerBoundPerBinTest)
test_eq(list(lowerBoundPerBin.index), list(range(3)))

#---

# Case 4: yPred contains only a single unique value (100 times the value 1), binSize = 5.
# Expected: one single bin that holds all 100 indices.
yPred = np.repeat(1, 100)
indicesPerBin, lowerBoundPerBin = generateBins(binSize = 5, yPred = yPred)

test_eq(list(indicesPerBin.keys()), [0])

indicesPerBinTest = {0: np.arange(100)}

# The bin must contain exactly the expected indices (order is irrelevant)
# and must not list any index twice.
indicesTracker = []
for i in range(len(indicesPerBin)):
    test_eq(set(indicesPerBin[i]), set(indicesPerBinTest[i]))
    test_eq(len(indicesPerBin[i]), len(np.unique(indicesPerBin[i])))

    indicesTracker.extend(indicesPerBin[i].tolist())

# Every index of yPred must show up exactly once.
test_eq(len(indicesTracker), len(np.unique(indicesTracker)))
test_eq(np.sort(indicesTracker), np.arange(len(yPred)))

# The only bin is expected to have no finite lower bound.
# NOTE: '-np.inf' replaces the alias 'np.NINF', which was removed in NumPy 2.0.
lowerBoundPerBinTest = [-np.inf]
test_eq(list(lowerBoundPerBin), lowerBoundPerBinTest)
test_eq(list(lowerBoundPerBin.index), [0])

#---

# binSize > len(yPred)
# Case 5: only 10 predictions but binSize = 100.
# Expected: one single bin that holds all 10 indices.
yPred = np.arange(10)
indicesPerBin, lowerBoundPerBin = generateBins(binSize = 100, yPred = yPred)

test_eq(list(indicesPerBin.keys()), [0])

indicesPerBinTest = {0: np.arange(10)}

# The bin must contain exactly the expected indices (order is irrelevant)
# and must not list any index twice.
indicesTracker = []
for i in range(len(indicesPerBin)):
    test_eq(set(indicesPerBin[i]), set(indicesPerBinTest[i]))
    test_eq(len(indicesPerBin[i]), len(np.unique(indicesPerBin[i])))

    indicesTracker.extend(indicesPerBin[i].tolist())

# Every index of yPred must show up exactly once.
test_eq(len(indicesTracker), len(np.unique(indicesTracker)))
test_eq(np.sort(indicesTracker), np.arange(len(yPred)))

# The only bin is expected to have no finite lower bound.
# NOTE: '-np.inf' replaces the alias 'np.NINF', which was removed in NumPy 2.0.
lowerBoundPerBinTest = [-np.inf]
test_eq(list(lowerBoundPerBin), lowerBoundPerBinTest)
test_eq(list(lowerBoundPerBin.index), [0])

In [None]:
# # LevelSetKDEx.getWeights() and LevelSetKDEx_kNN.getWeights()
# for i in range(len(neighborsList)):
#     if len(neighborsList[i]) < self.binSize:
#         ipdb.set_trace()

In [None]:
# # generateBins
# indices = np.array([])
# for k in range(len(indicesPerBin.keys())):
#     indices = np.append(indices, indicesPerBin[k])

# if len(indices) != len(yPred):
#     ipdb.set_trace()

# predCheck = np.array([pred in binPerPred.keys() for pred in yPred])
# keyCheck = np.array([key in yPred for key in binPerPred.keys()])

# if (all(predCheck) & all(keyCheck)) is False:
#     ipdb.set_trace()

In [None]:
# # LevelSetKDEx.getWeights()
# check = [i for i in range(len(weightsDataList)) if len(weightsDataList[i][1]) > 100]
# check2 = [i for i in range(len(weightsDataList)) if len(weightsDataList[i][1]) > 100 and binPerPred[i] != self.lowerBoundPerBin.index.max()]