# Packages

In [2]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd
from numpy.random import randint
import time
# from pykdtree.kdtree import KDTree
from collections import defaultdict, Counter

In [None]:
np.repeat(1/5, 5)

In [11]:
# One million random integers drawn from [0, 100000), so every value is
# expected to repeat many times; used as input for the timing cells below.
a = randint(low = 0, high = 100000, size = 1000000)
# %timeit np.unique(a)

In [6]:
def uniqueValues(a):
    """Group the positions of the elements of `a` by their value.

    Parameters
    ----------
    a : iterable
        Iterable of hashable elements.

    Returns
    -------
    defaultdict(list)
        Maps every distinct element to the list of positions (in ascending
        order) at which it occurs in `a`.
    """
    positionsByValue = defaultdict(list)

    position = 0
    for element in a:
        positionsByValue[element].append(position)
        position += 1

    return positionsByValue

In [None]:
def uniqueValues2(a):
    """Count how often every distinct element of `a` occurs.

    Parameters
    ----------
    a : iterable
        Iterable of hashable elements.

    Returns
    -------
    collections.Counter
        Maps every distinct element to its number of occurrences.
    """
    return Counter(a)

In [12]:
%timeit uniqueValues(a)

1 s ± 111 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## ScaNN

In [None]:
import numpy as np
import h5py
import os
import requests
import tempfile
import time

import scann

In [None]:
with tempfile.TemporaryDirectory() as tmp:
    # Download the GloVe-100 (angular) benchmark file; the whole payload is
    # held in memory before it is written out.
    response = requests.get("http://ann-benchmarks.com/glove-100-angular.hdf5")
    # Fail right away on an HTTP error instead of saving an error page as
    # "glove.hdf5" and hitting an obscure HDF5 parsing failure afterwards.
    response.raise_for_status()
    loc = os.path.join(tmp, "glove.hdf5")
    with open(loc, 'wb') as f:
        f.write(response.content)

    # NOTE(review): the file handle is opened inside the temporary directory
    # and is still used by later cells after the directory has been cleaned
    # up; presumably this relies on the open handle keeping the data
    # readable — confirm on the target platform.
    glove_h5py = h5py.File(loc, "r")

In [None]:
list(glove_h5py.keys())

In [None]:
# 'train' is used below as the database to index, 'test' as the query set.
dataset = glove_h5py['train']
queries = glove_h5py['test']
print(dataset.shape)
print(queries.shape)

In [None]:
# Divide every row by its Euclidean norm so that all rows have unit length;
# the dot product used by the searcher then equals cosine similarity.
normalized_dataset = dataset / np.linalg.norm(dataset, axis=1)[:, np.newaxis]
# configure ScaNN as a tree - asymmetric hash hybrid with reordering
# anisotropic quantization as described in the paper; see README

# use scann.scann_ops.build() to instead create a TensorFlow-compatible searcher
# NOTE(review): the builder's second argument (100) is presumably the number
# of neighbors returned per query — confirm against the ScaNN documentation.
searcher = scann.scann_ops_pybind.builder(normalized_dataset, 100, "dot_product").tree(
    num_leaves=2000, num_leaves_to_search=100, training_sample_size=250000).score_ah(
    2, anisotropic_quantization_threshold=0.2).reorder(100).build()

In [None]:
def compute_recall(neighbors, true_neighbors):
    """Return the share of ground-truth neighbors that were retrieved.

    Rows of `true_neighbors` and `neighbors` are paired in order. For every
    pair the number of ids found in both rows is counted (np.intersect1d,
    i.e. duplicates count once), and the sum over all pairs is divided by
    the total number of entries in `true_neighbors`.

    Parameters
    ----------
    neighbors : 2-D array of retrieved neighbor ids, one row per query.
    true_neighbors : 2-D numpy array of ground-truth ids, one row per query.

    Returns
    -------
    float
        Number of matched ids divided by `true_neighbors.size`.
    """
    hits = sum(
        len(np.intersect1d(expected, found))
        for expected, found in zip(true_neighbors, neighbors)
    )
    return hits / true_neighbors.size

In [None]:
# this will search the top 100 of the 2000 leaves, and compute
# the exact dot products of the top 100 candidates from asymmetric
# hashing to get the final neighbors (the searcher was built with 100 as
# its neighbor count, not 10).
start = time.time()
neighbors, distances = searcher.search_batched(queries)
end = time.time()

# we are given top 100 neighbors in the ground truth; all 100 are used here,
# matching the neighbor count the searcher was built with
print("Recall:", compute_recall(neighbors, glove_h5py['neighbors'][:, :100]))
print("Time:", end - start)

In [None]:
# Repeat the batched search, overriding leaves_to_search with 500 for this
# call only, and report recall and wall-clock time again for comparison.
start = time.time()
neighbors, distances = searcher.search_batched(queries, leaves_to_search=500)
end = time.time()

print("Recall:", compute_recall(neighbors, glove_h5py['neighbors'][:, :100]))
print("Time:", end - start)

In [None]:
neighbors

# Multivariate Predictions Test

In [2]:
# Rebind `a` to 1000 rows of 4 random integers, each drawn from [0, 10000).
a = np.array([randint(low = 0, high = 10000, size = 4) for i in range(1000)])
# a = np.concatenate([a, np.array([np.arange(2)] * 10)], axis = 0)
a.shape

(1000, 4)

## Clustering

In [3]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment
import numpy as np
from faiss import Kmeans

def get_even_clusters(X, cluster_size):
    """Assign the rows of X to k-means clusters of at most `cluster_size` rows.

    The centroids are fitted with faiss k-means using
    ceil(len(X) / cluster_size) clusters. Every centroid is then repeated
    `cluster_size` times so that each cluster offers that many "slots", and
    the rows of X are matched one-to-one to slots by minimising the total
    row-to-centroid distance (linear sum assignment).

    The wall-clock seconds needed for training, for the distance matrix and
    for the assignment are printed, in that order.

    Parameters
    ----------
    X : 2-D array, one row per sample.
        NOTE(review): presumably has to be float32 for faiss (the caller
        casts with astype(np.float32)) — confirm.
    cluster_size : int
        Maximum number of rows per cluster.

    Returns
    -------
    numpy.ndarray
        1-D array with the cluster index assigned to every row of X.
    """
    n_clusters = int(np.ceil(len(X)/cluster_size))
    kmeans = Kmeans(d = X.shape[1], k = n_clusters)

    start = time.time()
    kmeans.train(X)
    print(time.time() - start)

    # Repeat every centroid cluster_size times: one column ("slot") per
    # admissible member of the cluster.
    centers = kmeans.centroids
    centers = centers.reshape(-1, 1, X.shape[-1]).repeat(cluster_size, 1).reshape(-1, X.shape[-1])

    start = time.time()
    distance_matrix = cdist(X, centers)
    print(time.time() - start)

    start = time.time()
    # Integer-dividing the chosen slot column by cluster_size maps the slot
    # back to the index of the cluster it belongs to.
    clusters = linear_sum_assignment(distance_matrix)[1]//cluster_size
    print(time.time() - start)

    # Return the per-row cluster labels. The original returned the repeated
    # centroid matrix and silently discarded `clusters`.
    return clusters

In [5]:
# Cast to float32 before clustering (presumably because faiss expects
# float32 input — confirm).
X = a.astype(np.float32)
cluster_size = 100

# `time` is used inside get_even_clusters; ipdb is not referenced in the
# visible cells.
import time
import ipdb

res = get_even_clusters(X, cluster_size)


5.873585939407349
0.007401227951049805
0.10688900947570801


In [None]:
Counter(res)

In [None]:
X = a.astype(np.float32)
# `n_clusters` was never defined at notebook scope (it is only a local of
# get_even_clusters), so derive it here with the same rule: one cluster per
# `cluster_size` rows, rounded up.
n_clusters = int(np.ceil(len(X) / cluster_size))
kmeans = KMeans(n_clusters)

kmeans.fit(X)

In [None]:
# Centroids fitted by scikit-learn's KMeans, one row per cluster.
centers = kmeans.cluster_centers_

In [None]:
# Repeat every centroid cluster_size times so that each cluster contributes
# cluster_size "slot" columns to the distance matrix.
centers = centers.reshape(-1, 1, X.shape[-1]).repeat(cluster_size, 1).reshape(-1, X.shape[-1])
distance_matrix = cdist(X, centers)

In [None]:
# Match rows to slots; integer-dividing the chosen column index by
# cluster_size maps a slot back to its cluster.
clusters = linear_sum_assignment(distance_matrix)[1]//cluster_size

In [None]:
from faiss import Kmeans

In [None]:
# n_init and max_iter are only referenced by the commented-out constructor
# below; the active call passes neither and hard-codes k = 100.
n_init = 10
max_iter = 300
# kmeans = faiss.Kmeans(d = X.shape[1], k = n_clusters, niter = max_iter, nredo = n_init)
kmeans = Kmeans(d = X.shape[1], k = 100)
kmeans.train(X.astype(np.float32))

In [None]:
# Assign every row of X to one of the trained centroids.
# NOTE(review): presumably returns a (distances, labels) pair, since element 1
# is the one counted below — confirm against the faiss documentation.
res = kmeans.assign(X.astype(np.float32))

In [None]:
kmeans.centroids

In [None]:
from collections import Counter
# Tally how many rows share each value of res[1].
Counter(res[1])

## kNN

In [None]:
# NOTE(review): KDTree is not imported in the visible cells — the pykdtree
# import at the top of the notebook is commented out — so this cell raises
# NameError unless that import is re-enabled.
kd = KDTree(data_pts = a)

In [None]:
# Time a self-query of `a` asking for 2000 neighbours of every point.
start = time.time()
dist, idx = kd.query(a, k = 2000)
print(time.time() - start)

In [None]:
idx

In [None]:
# scikit-learn nearest-neighbour index using the 'kd_tree' algorithm.
nn = NearestNeighbors(algorithm = 'kd_tree')

In [None]:
nn.fit(a)

In [None]:
# Time a self-query that returns neighbour indices only (no distances).
start = time.time()
idx2 = nn.kneighbors(a, n_neighbors = 10000, return_distance = False)
print(time.time() - start)

In [None]:
# NOTE(review): idx was requested with k = 2000 but idx2 with
# n_neighbors = 10000; if both come back as (n_points, k) arrays their shapes
# differ and np.array_equal is False regardless of content — presumably the
# two neighbour counts should match.
np.array_equal(idx, idx2)

In [None]:
def bad_append(new_item, a_list=[]):
    """Append `new_item` to `a_list` in place and return that same list.

    The default `a_list=[]` is created once, when the function is defined,
    so all calls that omit `a_list` mutate and return one shared list.
    Left unchanged on purpose: the name and the `good_append` counterpart
    suggest this is a demonstration of that pitfall.
    """
    a_list.append(new_item)
    return a_list

In [None]:
# Both calls pass a fresh explicit list, so the shared default list of
# bad_append is not involved; each call returns [1, 2, 'one'].
bad_append('one', a_list = [1, 2])
print(bad_append('one', a_list = [1, 2]))

In [None]:
def good_append(new_item, a_list=None):
    """Append `new_item` to `a_list` in place and return that list.

    When `a_list` is omitted (or None), a brand-new list is created on every
    call, so nothing is shared between calls.
    """
    target = [] if a_list is None else a_list
    target.append(new_item)
    return target

In [None]:
print(good_append('one'))

In [None]:
import numpy as np
import pandas as pd
import os
from random import sample

from lightgbm import LGBMRegressor
from dddex.levelSetKDEx import LevelSetKDEx, binSizeCV
from dddex.utils import groupedTimeSeriesSplit

In [None]:
# NOTE(review): machine-specific absolute path; this cell only runs where
# that file exists.
path = ('/home/kagu/yaz_trainingData/dataYaz.csv')
data = pd.read_csv(path)

In [None]:
# Columns that are identifiers, split markers or the target, i.e. not features.
nonFeatureColumns = ['dayIndex', 'label', 'id', 'demand']

# Split on the 'label' column; only the training part gets a fresh index.
dataTrain = data.loc[data['label'] == 'train'].reset_index(drop = True)
dataTest = data.loc[data['label'] == 'test']

# Feature matrices: everything except the non-feature columns.
XTrain = dataTrain.drop(columns = nonFeatureColumns)
XTest = dataTest.drop(columns = nonFeatureColumns)

# Target: the 'demand' column of each part.
yTrain, yTest = dataTrain['demand'], dataTest['demand']

In [None]:
# LightGBM regressor restricted to one job (n_jobs = 1), fitted on the
# training split.
LGBM = LGBMRegressor(n_jobs = 1)
LGBM.fit(X = XTrain, y = yTrain)

In [None]:
# Folds built from dataTrain using 'id' as group and 'dayIndex' as time
# feature; presumably 2 folds with a test window of length 28 each — confirm
# against dddex.utils.groupedTimeSeriesSplit.
cvFolds = groupedTimeSeriesSplit(data = dataTrain, kFolds = 2, testLength = 28, groupFeature = 'id', timeFeature = 'dayIndex')

In [None]:
# Cross-validation object over the candidate bin sizes 100 and 200, built
# around the LGBM estimator and the folds from above.
res = binSizeCV(estimator = LGBM,
                binSizeGrid = [100, 200],
                cv = cvFolds,
                LSF_type = 'LSF')

In [None]:
res.fit(X = XTrain, y = yTrain)

# Cross Validation

In [1]:
import numpy as np
import pandas as pd

from dddex.crossValidation import *
from dddex.levelSetKDEx_univariate import *
from dddex.loadData import loadDataYaz

from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

import time

In [2]:
data, XTrain, yTrain, XTest, yTest = loadDataYaz()

In [3]:
# Unfitted LightGBM regressor (n_jobs = 1) wrapped by a LevelSetKDEx model.
LGBM = LGBMRegressor(n_jobs = 1)
LSKDEx = LevelSetKDEx(estimator = LGBM)

In [4]:
# Candidate settings for the LevelSetKDEx model: ten bin sizes, each with
# and without weightsByDistance.
paramGridLSx = {'binSize': [20, 50, 100, 150, 200, 250, 400, 600, 800, 1000],
                'weightsByDistance': [False, True]}

# Single-point grid for the underlying estimator.
paramGridEstimator = {'max_depth': [4],
                      'n_estimators': [100]}

In [9]:
# Rows labelled 'train' only (the original index is kept here).
dataTrain = data[data.label == 'train']

# Folds using 'id' as group and 'dayIndex' as time feature; presumably
# 5 folds with a test window of length 28 each — confirm against dddex.
cvFolds = groupedTimeSeriesSplit(data = dataTrain, 
                                 kFolds = 5, 
                                 testLength = 28, 
                                 groupFeature = 'id', 
                                 timeFeature = 'dayIndex')

In [13]:
# Time the combined cross-validation over both the LSx grid and the
# estimator grid; the elapsed wall-clock seconds are printed at the end.
start = time.time()

# probs: 0.01, 0.02, ... in steps of 0.01, nominally stopping before 0.1.
# n_jobs is set to the number of folds.
CV = CrossValidationLSx_combined(estimatorLSx = LSKDEx,
                                 cvFolds = cvFolds,
                                 parameterGridLSx = paramGridLSx,
                                 parameterGridEstimator = paramGridEstimator,
                                 probs = np.arange(0.01, 0.1, 0.01),
                                 refitPerProb = False,
                                 n_jobs = len(cvFolds))

CV.fit(XTrain, yTrain)

print(time.time() - start)

3.997749090194702


In [11]:
# Time a plain scikit-learn grid search of the LGBM estimator on the same
# folds; `CV` is rebound, and no final refit is done (refit = False).
start = time.time()

CV = GridSearchCV(estimator = LGBM,
                  cv = cvFolds,
                  param_grid = paramGridEstimator,
                  scoring = 'neg_mean_squared_error',
                  refit = False,
                  n_jobs = len(cvFolds),
                  return_train_score = True,
                  verbose = 0)

CV.fit(XTrain, yTrain)

print(time.time() - start)

0.36907124519348145


In [14]:
# Time the quantile cross-validation of the LevelSetKDEx model over the LSx
# grid only (no estimator grid is passed here); `CV` is rebound again.
start = time.time()

# probs: 0.01, 0.02, ... in steps of 0.01, nominally stopping before 0.1.
# n_jobs is set to the number of folds.
CV = QuantileCrossValidation(quantileEstimator = LSKDEx,
                             cvFolds = cvFolds,
                             parameterGrid = paramGridLSx,
                             probs = np.arange(0.01, 0.1, 0.01),
                             refitPerProb = False,
                             n_jobs = len(cvFolds))

CV.fit(XTrain, yTrain)

print(time.time() - start)

2.616056203842163
