In [194]:
import os
HOME = os.getcwd()
print('HOME: ',HOME)

import time
import math
import json
import random
import pandas as pd
import sys
import lxml
import sklearn as sk
import numpy as np

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import HDBSCAN, DBSCAN
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import plotly.graph_objects as go

from scipy.sparse import csr_matrix, issparse, lil_matrix, coo_matrix

from tqdm import tqdm
from pandarallel import pandarallel

from numba import njit, prange, jit
from numba_progress import ProgressBar

import networkx as nx

HOME:  /Users/ericsuardi/Desktop/DataMiningProject23-24


In [195]:
# STANDARD_FILE = 'standard_small_order_printClustroids.json'
# ACTUAL_FILE = 'actual_small_order_printClustroids.json'

# STANDARD_FILE = 'standard_small.json'
# ACTUAL_FILE = 'actual_small.json'

STANDARD_FILE = 'standard_medium_last.json'
ACTUAL_FILE = 'actual_medium_last.json'

# STANDARD_FILE = 'standard_big_new_3.json'
# ACTUAL_FILE = 'actual_big_new_3.json'

K_SHINGLES = 3

In [196]:
# load standard and actual data
print("\nReading standard data...")
with open(os.path.join('data',STANDARD_FILE)) as f:
    standard = json.load(f)

print("\nReading actual data...")
with open(os.path.join('data', ACTUAL_FILE)) as f:
    actual = json.load(f)

# load the data into a dataframe
print("\nCreating standard dataframe...")
dfStandard = pd.DataFrame(standard)
print("\nCreating actual dataframe...")
dfActual = pd.DataFrame(actual)

# print head of the dataframes
print(dfStandard.head())
print(dfActual.head())

# get the unique cities and items of the standard data
cities = []
items = []
longestRoute = 0
shortestRoute = np.inf
maxItemQuantity = 0

standardRefIds = []
for index, s in dfStandard.iterrows():
    #print(s)
    idS = s['id']
    route = s['route']
    standardRefIds.append(int(idS[1]))
    for trip in route:
        cities.append(trip['from']) 
        items.extend(trip['merchandise'].keys())
        maxItemQuantity = max(maxItemQuantity, max(trip['merchandise'].values()))
    if len(route) > 0:
        cities.append(route[-1]['to'])
        
    if len(route) > longestRoute:
        longestRoute = len(route)
        
    if len(route) < shortestRoute:
        shortestRoute = len(route)
print("\nFinished preparing standard data")

actualRefStandardIds = []
for index, s in dfActual.iterrows():
    #print(s)
    idS = s['id']
    route = s['route']
    idStandard = s['sroute']
    actualRefStandardIds.append(int(idStandard[1]))
    for trip in route:
        cities.append(trip['from'])
        items.extend(trip['merchandise'].keys())
        maxItemQuantity = max(maxItemQuantity, max(trip['merchandise'].values()))
        
    if len(route) > 0:
        cities.append(route[-1]['to'])
        
    if len(route) > longestRoute:
        longestRoute = len(route)
    
    if len(route) < shortestRoute:
        shortestRoute = len(route)
print("\nFinished preparing actual data")


# find the unique cities and items
uniqueCities = sorted(list(set(cities)))
#uniqueCities.insert(0, 'NULL')          # add NULL city, for padding vectors with different lengths (trips in routes)
uniqueItems = sorted(list(set(items)))

print("\nSorted cities and items")

if shortestRoute < 2:
    K_SHINGLES = 2

threeShingles = []

for i, c1 in enumerate(uniqueCities):
    for j, c2 in enumerate(uniqueCities):
        if i == j:
            continue
        for k, c3 in enumerate(uniqueCities):
            if j == k or i == k:
                continue
            threeShingles.append([c1, c2, c3])
            
permutations = math.perm(len(uniqueCities), K_SHINGLES)

print("\nComputed all possible three-shingles")

print("\nUnique cities: ", uniqueCities)
print("Unique items: ", uniqueItems)

print("\nNumber of cities: ", len(uniqueCities))
print("Number of items: ", len(uniqueItems))

print("\nLongest route: ", longestRoute)
print("Shortest route: ", shortestRoute)

print("\nMax item quantity: ", maxItemQuantity)

print("\nNumber of three-shingles: ", len(threeShingles))

print(f"\n{K_SHINGLES}-shingles: ", math.perm(len(uniqueCities), K_SHINGLES))
print(f"{K_SHINGLES}-shingles: ", math.comb(len(uniqueCities), K_SHINGLES))

print(f"\n\033[92mK-Shingles used: {K_SHINGLES} \033[0m")



Reading standard data...

Reading actual data...

Creating standard dataframe...

Creating actual dataframe...
   id                                              route
0  s0  [{'from': 'L’Aquila', 'to': 'Reggio Calabria',...
1  s1  [{'from': 'Trento', 'to': 'Fano', 'merchandise...
2  s2  [{'from': 'Foligno', 'to': 'Siena', 'merchandi...
3  s3  [{'from': 'Pozzuoli', 'to': 'Marano di Napoli'...
4  s4  [{'from': 'Vigevano', 'to': 'Piacenza', 'merch...
   id driver sroute                                              route
0  a0    J_3     s0  [{'from': 'L’Aquila', 'to': 'Aprilia', 'mercha...
1  a1    N_2     s0  [{'from': 'L’Aquila', 'to': 'Crotone', 'mercha...
2  a2    K_0     s0  [{'from': 'L’Aquila', 'to': 'Reggio Calabria',...
3  a3    T_3     s0  [{'from': 'L’Aquila', 'to': 'Aprilia', 'mercha...
4  a4    N_2     s0  [{'from': 'L’Aquila', 'to': 'Reggio Calabria',...

Finished preparing standard data

Finished preparing actual data

Sorted cities and items

Computed all possible three-

In [197]:
def hashShingles(shingles, n):
    # hash shingles
    string = "" 
    for shingle in shingles:
        string += str(shingle) + "," # [45, 4, 8] -> "45,4,8,"
    
    return hash(string) #% n

def createShingles(df, k, uniqueCities, uniqueItems, longestRoute, maxItemQuantity, permutations):
    # create shingles for each route
    shingles = []
    for index, s in df.iterrows():
        idS = s['id']
        route = s['route']
        shingle = [index]
        citiesInRoute = [] # napoli roma milano teramo bergamo [10,4,5,48,12] [10,4,5] [4,5,48] [5,48,12]
        merchandiseInRoute = np.zeros(len(uniqueItems))
        for trip in route:
            citiesInRoute.append(uniqueCities.index(trip['from']))
            #merchandiseInRoute += np.array(list(trip['merchandise'].values()))
            for item, n in trip['merchandise'].items():
                merchandiseInRoute[uniqueItems.index(item)] += n
        if len(route) > 0:
            citiesInRoute.append(uniqueCities.index(route[-1]['to']))
        if len(route) > 0:
            merchandiseInRoute = merchandiseInRoute / (maxItemQuantity*len(route))
        
        hashedShingles = []
        for i in range(len(citiesInRoute)-k+1):
            # Q: is it correct to set the modulo for the hash function to the number of permutations?
            # A: yes, because we want to have a unique hash for each shingle
            # Q: would it be better to use a different hash function?
            # A: yes, because the modulo function is not a good hash function
            hashedShingles.append(hashShingles(citiesInRoute[i:i+k], permutations) )
        
        shingle.append(np.array(hashedShingles))
        
        shingle.append(merchandiseInRoute) # quantity hot encoding
        
        shingles.append(shingle)
        
    return shingles # [ index, [shingles], [merchandise] ]

def create_shingles(s, k, uniqueCities, uniqueItems, longestRoute, maxItemQuantity, permutations):
    idS = s['id']
    route = s['route']
    shingle = [s.name]
    citiesInRoute = [] 
    merchandiseInRoute = np.zeros(len(uniqueItems))
    for trip in route:
        citiesInRoute.append(uniqueCities.index(trip['from']))
        for item, n in trip['merchandise'].items():
            merchandiseInRoute[uniqueItems.index(item)] += n
    if len(route) > 0:
        citiesInRoute.append(uniqueCities.index(route[-1]['to']))
    if len(route) > 0:
        merchandiseInRoute = merchandiseInRoute / (maxItemQuantity*len(route))
    
    hashedShingles = []
    for i in range(len(citiesInRoute)-k+1):
        hashedShingles.append(hashShingles(citiesInRoute[i:i+k], permutations))
    
    shingle.append(np.array(hashedShingles))
    shingle.append(merchandiseInRoute)
    
    return shingle

In [198]:
#standardSets = createShingles(dfStandard, k=K_SHINGLES, uniqueCities=uniqueCities, uniqueItems=uniqueItems, longestRoute=longestRoute, maxItemQuantity=maxItemQuantity, permutations=permutations)
#actualSets = createShingles(dfActual, k=K_SHINGLES, uniqueCities=uniqueCities, uniqueItems=uniqueItems, longestRoute=longestRoute, maxItemQuantity=maxItemQuantity, permutations=permutations)
pandarallel.initialize(progress_bar=True)
standardSets = dfStandard.parallel_apply(lambda s: create_shingles(s, K_SHINGLES, uniqueCities, uniqueItems, longestRoute, maxItemQuantity, permutations), axis=1)
standardSets = standardSets.tolist()
actualSets = dfActual.parallel_apply(lambda s: create_shingles(s, K_SHINGLES, uniqueCities, uniqueItems, longestRoute, maxItemQuantity, permutations), axis=1)
actualSets = actualSets.tolist()

print("\nstandardSets", len(standardSets), "shape first element", standardSets[0][1].shape, standardSets[0])
print("\nactualSets", len(actualSets),  "shape first element", standardSets[0][1].shape, actualSets[0])

print("\nstandardSets:", len(standardSets))
print("actualSets:", len(actualSets))

assert len(standardSets[0]) == 3, "The length of the standard set is not equal to 3 (index, shingles, merchandise)"
assert len(standardSets[0][2]) == len(uniqueItems), "The length of the merchandise vector is not equal to the number of unique items"

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1), Label(value='0 / 1'))), HBox(c…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1000), Label(value='0 / 1000'))), …


standardSets 10 shape first element (23,) [0, array([ 3015349886759599057,  5667943752935670122,  -975069169134304565,
        4181403540013195089,  2003091227518130313,  2095647629475917822,
        8115180469225632122, -4632020617563048725, -6402140529104296560,
       -3383203552763716844, -6460109321257473213,  3046262796170709702,
        3908292898155949648, -6263252204500378165, -8493274267372164994,
        2637501355856866144, -8134254359422451332,   293076404814396321,
       -7559131630636892171, -9207353961969821625,   695030124823582918,
        -102385618719880047,  1715032645242683811]), array([0.30166667, 0.37375   , 0.21875   , 0.25083333, 0.3125    ,
       0.27083333, 0.28666667, 0.20916667, 0.15625   , 0.28583333,
       0.25166667, 0.21833333, 0.18125   , 0.27208333, 0.17333333,
       0.23416667, 0.24083333, 0.19166667, 0.21416667, 0.22583333])]

actualSets 10000 shape first element (23,) [0, array([ 3950172739369341265,  -975069169134304565,  4181403540013195089

## Clustering

In [199]:


def jaccard_similarity_matrix(matrix):
    if 1.0 - np.count_nonzero(matrix) / matrix.size > 0.5:
        print("matrix jaccard is sparse")
        
        matrixCSR = csr_matrix(matrix)
        intersection = np.dot(matrixCSR, matrixCSR.T)
        intersection = intersection.todense()
        print("intersection", intersection.shape, type(intersection))
        #print("intersection", intersection.toarray(), type(intersection.toarray()))
        row_sums = matrix.sum(axis=1)
        print("row_sums", row_sums.shape)
        union = row_sums[:, None] + row_sums - intersection
        print("union", union.shape)
        union = np.where(union == 0, 1, union)  # avoid division by zero
        print("union", union.shape)
        jaccard_similarity = intersection / union
        print("jaccard_similarity", jaccard_similarity.shape, type(jaccard_similarity))
    else:
        print("matrix jaccard is not sparse")
        
        intersection = np.dot(matrix, matrix.T)
        print("intersection", intersection.shape)
        #print("intersection", intersection.toarray(), type(intersection.toarray()))
        row_sums = matrix.sum(axis=1)
        print("row_sums", row_sums.shape)
        union = row_sums[:, None] + row_sums - intersection
        print("union", union.shape)
        union = np.where(union == 0, 1, union)
        jaccard_similarity = intersection / union
        print("jaccard_similarity", jaccard_similarity.shape)
    print("jaccard_similarity contains nan", np.isnan(jaccard_similarity).any())   
    return jaccard_similarity

def jaccard_similarity_minhash(matrix):
    similarity_matrix = np.full((matrix.shape[0], matrix.shape[0]), np.inf)
    num_permutations = matrix.shape[1]
    for i in tqdm(range(matrix.shape[0])):
        for j in range(i + 1, matrix.shape[0]):
            similarity_matrix[i, j] = np.count_nonzero(matrix[i, :] == matrix[j, :]) / num_permutations
            similarity_matrix[j, i] = similarity_matrix[i, j]
                    
    # Create a full symmetric matrix from the upper triangular part
    #similarity_matrix = np.triu(similarity_matrix) + np.triu(similarity_matrix, 1).T
    np.fill_diagonal(similarity_matrix, 1)
    print("similarity_matrix", similarity_matrix.shape, similarity_matrix[0])
    return similarity_matrix

# def minhash(matrix, permutations):
#     minhash_matrix = np.full((matrix.shape[0], permutations), np.inf)
#     coeff_range = permutations * 100
#     index_range = permutations * 100

#     # Generate the hash functions
#     hash_functions = [lambda x, a=a, b=b: (a * x + b) % matrix.shape[1] for a, b in zip(random.sample(range(coeff_range), permutations), random.sample(range(index_range), permutations))]
    
#     for i in tqdm(range(matrix.shape[0]), desc="minhashing", miniters=1000):
#         indices = np.where(matrix[i] == 1)[0]
#         for k in range(permutations):
#             hashed_indices = np.array([hash_functions[k](j) for j in indices])
#             #print("hashed_indices", hashed_indices.shape, hashed_indices)
#             minhash_matrix[i, k] = np.min(hashed_indices, initial=matrix.shape[1]+1)
    
#     return minhash_matrix

def hash_function_hash_code(num_of_hashes,n_col,next_prime):
  
    #coeffA = np.array(pick_random_coefficients(num_of_hashes,max_column_length)).reshape((num_of_hashes,1))
    #coeffB = np.array(pick_random_coefficients(num_of_hashes,max_column_length)).reshape((num_of_hashes,1))

    coeffA = np.array(random.sample(range(0,n_col*100),num_of_hashes)).reshape((num_of_hashes,1))
    coeffB = np.array(random.sample(range(0,n_col*100),num_of_hashes)).reshape((num_of_hashes,1))

    x = np.arange(n_col).reshape((1,n_col))

    hash_code = (np.matmul(coeffA,x) + coeffB) % next_prime # (num_of_hashes,n_col) so how each column index is permuted

    return hash_code

def minhash(u,num_of_hashes):
    (n_row, n_col) = u.shape
    next_prime = n_col
    hash_code = hash_function_hash_code(num_of_hashes,n_col,next_prime)

    signature_array = np.empty(shape = (n_row,num_of_hashes))

    #t2 = time.time()

    for row in tqdm(range(n_row), desc="minhashing"):
        #print("row", row)
        ones_index = np.where(u[row,:]==1)[0]
        #if len(ones_index) == 0:
        signature_array[row,:] = np.zeros((1,num_of_hashes))
            #continue
        corresponding_hashes = hash_code[:,ones_index]
        #print("ones_index", ones_index.shape, ones_index)
        #print("corresponding_hashes", corresponding_hashes.shape, corresponding_hashes)
        row_signature = np.amin(corresponding_hashes,axis=1).reshape((1,num_of_hashes))

        signature_array[row,:] = row_signature

    return signature_array

def minhash_matrices(matrix1,matrix2,num_of_hashes):
    (n_row, n_col) = matrix1.shape
    next_prime = n_col
    hash_code = hash_function_hash_code(num_of_hashes,n_col,next_prime)

    signature1_array = np.empty(shape = (n_row,num_of_hashes))
    signature2_array = np.empty(shape = (matrix2.shape[0],num_of_hashes))

    #t2 = time.time()

    for row in tqdm(range(n_row), desc="minhashing"):
        #print("row", row)
        ones_index = np.where(matrix1[row,:]==1)[0]
        #if len(ones_index) == 0:
        signature1_array[row,:] = np.zeros((1,num_of_hashes))

            #continue
        corresponding_hashes = hash_code[:,ones_index]
        #print("ones_index", ones_index.shape, ones_index)
        #print("corresponding_hashes", corresponding_hashes.shape, corresponding_hashes)
        row_signature = np.amin(corresponding_hashes,axis=1).reshape((1,num_of_hashes))

        signature1_array[row,:] = row_signature

    for row in tqdm(range(matrix2.shape[0]), desc="minhashing second matrix"):
        #print("row", row)
        ones_index = np.where(matrix2[row,:]==1)[0]
        #if len(ones_index) == 0:
        signature2_array[row,:] = np.zeros((1,num_of_hashes))

            #continue
        corresponding_hashes = hash_code[:,ones_index]
        #print("ones_index", ones_index.shape, ones_index)
        #print("corresponding_hashes", corresponding_hashes.shape, corresponding_hashes)
        row_signature = np.amin(corresponding_hashes,axis=1).reshape((1,num_of_hashes))

        signature2_array[row,:] = row_signature

    return signature1_array, signature2_array

def find_band_and_row_values(columns, threshold):
    previous_b = 1
    previous_r = columns
    for b in range(1, columns + 1):
        if columns % b == 0:
            r = columns // b
            if (1 / b) ** (1 / r)  <= threshold:
                if np.abs((1 / previous_b) ** (1 / previous_r) - threshold) < np.abs((1 / b) ** (1 / r) - threshold):
                    return previous_b, previous_r
                return b, r
    return columns, 1

def lsh(minhash_matrix, thresh_user=0.2):
    # Initialize the signature matrix
    columns = minhash_matrix.shape[1]
    
    # Generate the hash functions
   # hash_functions = [lambda x, a=a, b=b: (a * x + b) % minhash_matrix.shape[1] for a, b in zip(random.sample(range(1000), bands), random.sample(range(1000), bands))]
    hash_function = lambda x: hash(",".join([str(x[i]) for i in range(len(x))]))
    
    # b = bands
    # r = columns//bands
    b, r = find_band_and_row_values(columns, thresh_user)
    # If columns is not divisible by bands
    if columns % b != 0:
        # Find the closest number that makes it divisible
        while columns % b != 0:
            b -= 1
        r = columns // b
    #bands = b
        
    print("final bands", b)
    signature_matrix = np.full((minhash_matrix.shape[0], b), np.inf)
    
    # if threshold is 0.8,
    threshold = (1 / b) ** (1 / r) 
    print("lsh threshold", threshold)
    
    # For each band
    print("Computing hash values of bands...")
    hash_values = np.apply_along_axis(lambda x: hash_function(x) % minhash_matrix.shape[0], 1, minhash_matrix.reshape(-1, r))
    # Reshape the hash values to match the signature matrix
    hash_values = hash_values.reshape(minhash_matrix.shape[0], b)
    # Update the signature matrix
    signature_matrix = hash_values
            
    # find candidate pairs
    print("Finding candidate pairs...")
    candidate_pairs = []
    for i in tqdm(range(signature_matrix.shape[0])):
        # Compute the similarity of the current row with all following rows
        similarities = np.sum(signature_matrix[i+1:, :] == signature_matrix[i, :], axis=1) / b
        # Find the indices of the rows that have a similarity greater than or equal to the threshold
        indices = np.nonzero(similarities >= threshold)[0]
        # Add the pairs to the candidate pairs
        candidate_pairs.extend((i, i+1+index) for index in indices)
    
    return np.array(candidate_pairs)

def lsh_two_matrices(minhash_matrix1, minhash_matrix2, thresh_user=0.2):
    # Initialize the signature matrix
    columns = minhash_matrix1.shape[1]
    
    # Generate the hash functions
    # hash_functions = [lambda x, a=a, b=b: (a * x + b) % minhash_matrix.shape[1] for a, b in zip(random.sample(range(1000), bands), random.sample(range(1000), bands))]
    # hash_function = lambda x: hash(",".join([str(x[i]) for i in range(len(x))]))
    
    def hash_function(x):
        # print("x",x)
        var = hash(",".join([str(x[i]) for i in range(len(x))]))
        # print ("str x ", (",".join([(x[i]) for i in range(len(x))])))
        # print ("var", var)
        return var % minhash_matrix1.shape[0]


    # b = bands
    # r = columns//bands
    b, r = find_band_and_row_values(columns, thresh_user)
    # If columns is not divisible by bands
    if columns % b != 0:
        # Find the closest number that makes it divisible
        while columns % b != 0:
            b -= 1
        r = columns // b
        
    print("final bands", b)
    signature_matrix1 = np.full((minhash_matrix1.shape[0], b), np.inf)
    signature_matrix2 = np.full((minhash_matrix2.shape[0], b), np.inf)
    

    threshold = (1 / b) ** (1 / r) 
    print("lsh threshold", threshold)
    
    # # For each band
    # print("Computing hash values of bands...")
    # hash_values1 = np.apply_along_axis(lambda x: hash_function(x) % minhash_matrix1.shape[0], 1, minhash_matrix1.reshape(-1, r))
    # print("hash_values1", hash_values1.shape, hash_values1)
    # hash_values2 = np.apply_along_axis(lambda x: hash_function(x) % minhash_matrix2.shape[0], 1, minhash_matrix2.reshape(-1, r))
    # print("hash_values2", hash_values2.shape, hash_values2)

    print("minhash_matrix1.reshape(-1, r).shape",minhash_matrix1.reshape(-1, r).shape)

    # For each band
    print("Computing hash values of bands...")
    hash_values1 = np.apply_along_axis(hash_function, 1, minhash_matrix1.reshape(-1, r))
    # print("hash_values1", hash_values1.shape, hash_values1)
    hash_values2 = np.apply_along_axis(hash_function, 1, minhash_matrix2.reshape(-1, r))
    # print("hash_values2", hash_values2.shape, hash_values2)


    # Reshape the hash values to match the signature matrix
    hash_values1 = hash_values1.reshape(minhash_matrix1.shape[0], b)
    # print("hash_values1", hash_values1.shape, hash_values1)
    hash_values2 = hash_values2.reshape(minhash_matrix2.shape[0], b)
    # print("hash_values2", hash_values2.shape, hash_values2) 
    # Update the signature matrix
    signature_matrix1 = hash_values1
    signature_matrix2 = hash_values2
    
    
    # find candidate pairs
    print("Finding candidate pairs...")
    # similarities_actual=[]
    # candidate_pairs = np.empty((minhash_matrix1.shape[0], 2))

    data=[]
    rows=[]
    cols=[]

    for i in tqdm(range(signature_matrix1.shape[0])):
        # Compute the similarity of the current row with all following rows
        similarities = np.sum(signature_matrix2 == signature_matrix1[i, :], axis=1) / b
        # print("similarities", similarities.shape, similarities)
        # Find the indices of the rows that have a similarity greater than or equal to the threshold
        indices = np.nonzero(similarities >= threshold)[0]
        # print("indices", indices.shape, indices)

        # print("similarities[indices] ",similarities[indices])

        data.extend(similarities[indices])
        # print("data", data)
        rows.extend([i]*len(indices))
        # print("rows", rows)
        cols.extend(indices)
        # print("cols", cols)
        # indexMax = np.argmax(similarities)
        # simMax = similarities[indexMax]
        # # Add the pairs to the candidate pairs
        # #candidate_pairs.extend((i, i+1+index) for index in indices)
        # candidate_pairs[i] = [indexMax, simMax]
        # similarities_actual.append(similarities)

        

    # # Create data array for COO matrix
    # data = np.concatenate([subset_sim_matrix[indices_i, indices_j], subset_sim_matrix[indices_i, indices_j]])
    
    # # Create row and column index arrays for COO matrix
    # rows = np.concatenate([indices_i_mapped, indices_j_mapped])
    # cols = np.concatenate([indices_j_mapped, indices_i_mapped])
    # print("data", data)
    # print("rows", rows)
    # print("cols", cols)

    similarity_matrix = coo_matrix((data, (rows, cols)), shape=(minhash_matrix1.shape[0], minhash_matrix2.shape[0])).tocsr()

    return similarity_matrix

@jit(cache=True)
def compute_distance_pairs_merch(sim_matrix, matrix1, matrix1Merch, matrix2, matrix2Merch, progress_proxy):
    n = sim_matrix.shape[0]
    m = sim_matrix.shape[1]
    squareMatrix = np.full((matrix1Merch.shape[0], matrix2Merch.shape[1]), 2)
    # print("sim_matrix", sim_matrix.shape)    
    # print(numba.typeof(sim_matrix))
    # print(numba.typeof(matrix1))
    # print(numba.typeof(matrix1Merch))
    # print(numba.typeof(matrix2))
    # print(numba.typeof(matrix2Merch))
    # print(numba.typeof(progress_proxy))

    

    for i in prange(n):
        subset1 = matrix1[i].reshape(1, -1) #replicate_row(subset_matrix, i) 
        # print("subset1", subset1.shape)
        subset2 = matrix2[sim_matrix[i].nonzero()[1]]
        # print("subset2", subset2.shape)
        min_matrix = np.minimum(subset1, subset2)
        sum_min_matrix = np.sum(min_matrix, axis=-1)
        
        max_matrix = np.maximum(subset1, subset2)
        sum_max_matrix = np.sum(max_matrix, axis=-1)

        route_distance = (np.divide(sum_min_matrix, sum_max_matrix))
        #print("route_distance", route_distance.shape)

        subset1Merch = matrix1Merch[i].reshape(1, -1) #replicate_row(subset_matrixMerch, i)
        subset2Merch = matrix2Merch[sim_matrix[i].nonzero()[1]]
        #normsSubset2Merch = np.sqrt(np.sum(np.power(subset2Merch, squareMatrix), axis=1))
        # print("subset1Merch", subset1Merch.shape)
        # print("subset2Merch", subset2Merch.shape)
        
        # COSINE
        #distMerch = 1 - (((subset1Merch * subset2Merch).sum(axis=1) / (np.sqrt(np.sum(np.power(subset1Merch, squareMatrix),axis=1)) * normsSubset2Merch)) + 1) / 2
        
        # JACCARD
        min_matrixMerch = np.minimum(subset1Merch, subset2Merch)
        sum_min_matrixMerch = np.sum(min_matrixMerch, axis=-1)

        max_matrixMerch = np.maximum(subset1Merch, subset2Merch)
        sum_max_matrixMerch = np.sum(max_matrixMerch, axis=-1)

        merch_distance = (np.divide(sum_min_matrixMerch, sum_max_matrixMerch))
        #print("merch_distance", merch_distance.shape)
        
        # L2
        # merch_distance = np.sqrt(np.sum(np.square(subset1Merch - subset2Merch), axis=-1))



        # mean
        sim_matrix[i,sim_matrix[i].nonzero()[1]] = (0.5) * route_distance + (0.5) * merch_distance
        
        # product
        #sim_matrix[i,sim_matrix[i].nonzero()[1]] = route_distance * merch_distance
        
        # weighted product
        # weightsRoutes = np.full(sim_matrix[i].nonzero()[1].shape[0], 0.8)
        # weightsMerch = np.full(sim_matrix[i].nonzero()[1].shape[0], 0.2)
        # sim_matrix[i,sim_matrix[i].nonzero()[1]] = np.power(route_distance, weightsRoutes) * np.power(merch_distance, weightsMerch)
        
        progress_proxy.update(1)
    
    return sim_matrix


def similarity_minhash_lsh_two_matrices_and_merch(matrix1, matrix1Merch, matrix2, matrix2Merch, thresh_user=0.2):
    
    similarity_matrix = lsh_two_matrices(matrix1,matrix2, thresh_user=thresh_user)
    # print("similarity_matrix", similarity_matrix.shape, similarity_matrix)

    # uniqueRowsSet = set([i for i, j in pairs] + [j for i, j in pairs]) # (1,2) (1,4) (1,5)
    # neverSeen = set([i for i in range(matrix1.shape[0])]) - uniqueRowsSet
    

    # sortedUniqueRowsSet = sorted(list(uniqueRowsSet))
    # print("sortedUniqueRowsSet", sortedUniqueRowsSet)

    # subset_matrix1 = matrix1[sortedUniqueRowsSet]
    # subset_matrix1Merch = matrix1Merch[sortedUniqueRowsSet]
    # print("subset_matrix1", subset_matrix1.shape, subset_matrix1[0])
    # print("subset_matrix1Merch", subset_matrix1Merch.shape, subset_matrix1Merch[0])

    # subset_matrix2 = matrix2[sortedUniqueRowsSet]
    # subset_matrix2Merch = matrix2Merch[sortedUniqueRowsSet]
    # print("subset_matrix2", subset_matrix2.shape, subset_matrix2[0])
    # print("subset_matrix2Merch", subset_matrix2Merch.shape, subset_matrix2Merch[0])

    # subset_similarity_matrix = np.full((subset_matrix1.shape[0], subset_matrix2.shape[0]), np.inf)
        
    print("Computing distance  on subset matrix...")
    with ProgressBar(total=matrix1.shape[0]) as progress:
        similarity_matrix = compute_distance_pairs_merch(similarity_matrix, matrix1, matrix1Merch, matrix2, matrix2Merch, progress)
        
    return similarity_matrix
        

# def jaccard_similarity_minhash_lsh(matrix, bands=10):
#     #similarity_matrix = csr_matrix((matrix.shape[0], matrix.shape[0]), dtype=np.float64)
#     similarity_matrix = lil_matrix((matrix.shape[0], matrix.shape[0]), dtype=np.float64)
#     pairs = lsh(matrix, bands=bands, columns=matrix.shape[1])
#     uniqueRows = np.unique([i for i, j in pairs] + [j for i, j in pairs])
#     uniqueRowsSet = set([i for i, j in pairs] + [j for i, j in pairs])
#     print("uniqueRows numpy", len(uniqueRows))
#     print("uniqueRows set", len(uniqueRowsSet))
#     print("num of pairs", len(pairs))
#     print("num unique i", len(set([i for i, j in pairs])))
#     print("num unique j", len(set([j for i, j in pairs])))
#     print("num unique rows", len(uniqueRows))
#     map_i = {i: index for i, index in enumerate(uniqueRows)}
    
#     subset_matrix = matrix[list(uniqueRows)]
    
#     print("Computing jaccard similarity on subset matrix...")
#     print("subset matrix", subset_matrix.shape)
#     subset1 = subset_matrix[:, None, :]
#     subset2 = subset_matrix[None, :, :]
#     min_matrix = np.minimum(subset1, subset2) # (10, 100) -> (10, 1, 100) -> (1, 10, 100) -> (10, 10, 100)
#     sum_min_matrix = np.sum(min_matrix, axis=-1)
#     print("sum_min_matrix", sum_min_matrix.shape)
    
#     max_matrix = np.maximum(subset1, subset2)
#     sum_max_matrix = np.sum(max_matrix, axis=-1)
#     print("sum_max_matrix", sum_max_matrix.shape)
    
#     jaccard_similarity_matrix =  sum_min_matrix / sum_max_matrix
    
#     # map back to original matrix
#     print("Mapping back to original matrix...")
#     # for i, j in tqdm(pairs):
#     #     similarity_matrix[map_i[i], map_i[j]] = jaccard_similarity_matrix[i, j]
#     #     similarity_matrix[map_i[j], map_i[i]] = similarity_matrix[map_i[i], map_i[j]]
        
#     for i in tqdm(range(jaccard_similarity_matrix.shape[0])):
#         for j in range(i + 1, jaccard_similarity_matrix.shape[0]):
#             similarity_matrix[map_i[i], map_i[j]] = jaccard_similarity_matrix[i, j]
#             similarity_matrix[map_i[j], map_i[i]] = similarity_matrix[map_i[i], map_i[j]]
    
    
#     # for i, j in tqdm(pairs, desc="lsh sim"):
#     #     similarity_matrix[i, j] = np.count_nonzero(matrix[i, :] == matrix[j, :]) / matrix.shape[1]
#     #     similarity_matrix[j, i] = similarity_matrix[i, j]
    
#     similarity_matrix.setdiag(1)
    
#     return similarity_matrix

@njit(cache=True,)
def replicate_row(matrix, i):
    result = np.empty((matrix.shape[0], matrix.shape[1]))
    for j in range(matrix.shape[0]):
        result[j] = matrix[i]
    return result

@njit(cache=True, nogil=True, parallel=True)
def compute_subset_similarity_matrix(subset_matrix, progress_proxy):
    n = subset_matrix.shape[0]
    m = subset_matrix.shape[1]
    subset_matrix = subset_matrix.astype(np.int64)
    subset_similarity_matrix = np.zeros((n, n))
    subset2 = subset_matrix
    for i in prange(n):
        subset1 = subset_matrix[i].reshape(1, -1) #replicate_row(subset_matrix, i)    
        min_matrix = np.minimum(subset1, subset2)
        sum_min_matrix = np.sum(min_matrix, axis=-1)
        
        max_matrix = np.maximum(subset1, subset2)
        sum_max_matrix = np.sum(max_matrix, axis=-1)
        
        subset_similarity_matrix[i] = 1 - (np.divide(sum_min_matrix, sum_max_matrix)).T
        progress_proxy.update(1)
    return subset_similarity_matrix

@njit(cache=True, nogil=True, parallel=False)
def compute_subset_similarity_matrix_only_pairs(matrix, matrixMerch, pairs, progress_proxy):
    n = matrix.shape[0]
    m = matrix.shape[1]
    similarity_pairs = np.zeros(len(pairs))
    for i in prange(len(pairs)):
        subset1 = matrix[pairs[i][0]] #replicate_row(subset_matrix, i)  
        subset2 = matrix[pairs[i][1]]
        subset1Merch = matrixMerch[pairs[i][0]]
        subset2Merch = matrixMerch[pairs[i][1]]
        
        min_matrix = np.minimum(subset1, subset2)
        sum_min_matrix = np.sum(min_matrix, axis=-1)
        
        max_matrix = np.maximum(subset1, subset2)
        sum_max_matrix = np.sum(max_matrix, axis=-1)
        
        distMerch = 1 - np.abs(np.dot(subset1Merch, subset2Merch) / (np.linalg.norm(subset1Merch) * np.linalg.norm(subset2Merch)), 1)
        
        
        
        similarity_pairs[i] = (1 - (sum_min_matrix / sum_max_matrix)) * distMerch
        # if similarity_pairs[i] >= 1:
        #     print("similarity_pairs[i]", similarity_pairs[i])
        #     print("dist merch", distMerch, "cosine ", np.abs(np.dot(subset1Merch, subset2Merch) / (np.linalg.norm(subset1Merch) * np.linalg.norm(subset2Merch))))
        #     print("dist routes", (1 - (sum_min_matrix / sum_max_matrix)))
        #     print("prod", (1 - (sum_min_matrix / sum_max_matrix)) * distMerch)
        progress_proxy.update(1)
    return similarity_pairs

@njit(cache=True, nogil=True, parallel=True)
def compute_subset_similarity_matrix_and_merch(matrix, matrixMerch, progress_proxy):
    n = matrix.shape[0]
    n1 = matrix.shape[1]
    m = matrixMerch.shape[1]
    similarity_pairs = np.zeros((n,n))
    subset2 = matrix
    subset2Merch = matrixMerch
    squareMatrix = np.full((n, m), 2)
    routeWeights = np.full(n, 0.8)
    merchWeights = np.full(n, 0.2)
    print("n", n, "m", m)
    print("matrix merch", matrixMerch.shape)
    print("matrix square", squareMatrix.shape)
    print("routeWeights", routeWeights.shape)
    print("merchWeights", merchWeights.shape)
    normsSubset2Merch = np.sqrt(np.sum(np.power(subset2Merch, squareMatrix), axis=1))
    for i in prange(n):
        subset1 = matrix[i].reshape(1, -1) #replicate_row(subset_matrix, i)  
        subset1Merch = matrixMerch[i].reshape(1, -1)
        #print("subset1", subset1.shape)
        #print("subset2", subset2.shape)
        
        min_matrix = np.minimum(subset1, subset2)
        sum_min_matrix = np.sum(min_matrix, axis=-1)
        
        max_matrix = np.maximum(subset1, subset2)
        sum_max_matrix = np.sum(max_matrix, axis=-1)
        
        #print("sum_min_matrix", sum_min_matrix.shape)
        
        #print("merch1", subset1Merch.shape)
        #print("merch2", subset2Merch.shape)
        
        #distMerch = 1 - np.abs(np.dot(subset1Merch, subset2Merch.T) / (np.linalg.norm(subset1Merch) * np.linalg.norm(subset2Merch)))
        
        # COSINE
        #distMerch = 1 - (((subset1Merch * subset2Merch).sum(axis=1) / (np.sqrt(np.sum(np.power(subset1Merch, squareMatrix),axis=1)) * normsSubset2Merch)) + 1) / 2
        
        # JACCARD
        min_matrix_merch = np.minimum(subset1Merch, subset2Merch)
        sum_min_matrix_merch = np.sum(min_matrix_merch, axis=-1)
        
        max_matrixMerch = np.maximum(subset1Merch, subset2Merch)
        sum_max_matrixMerch = np.sum(max_matrixMerch, axis=-1)
        distMerch = 1 - (sum_min_matrix_merch / sum_max_matrixMerch)
        
        # L2
        #distMerch = np.sqrt(np.sum(np.power(subset1Merch - subset2Merch, squareMatrix), axis=1))
        
        routeDistance = 1 - (sum_min_matrix / sum_max_matrix)
        # if i == 0 or i == n-1:
        #     print("i", i, "distMerch", distMerch.shape, distMerch)
        #     print("i", i, "sum_min_matrix", (1 - (sum_min_matrix / sum_max_matrix)).shape, (1 - (sum_min_matrix / sum_max_matrix)))
        #     print("i", i, "prod", ((1 - (sum_min_matrix / sum_max_matrix)) * distMerch).shape, ((1 - (sum_min_matrix / sum_max_matrix)) * distMerch))
        #print(i, (1 - (sum_min_matrix / sum_max_matrix)) * distMerch)
        
        # MEAN
        similarity_pairs[i] = (routeDistance + distMerch)/2
        # PRODUCT
        #similarity_pairs[i] = routeDistance * distMerch
        # WEIGHTED PRODUCT
        #similarity_pairs[i] = np.power(routeDistance, routeWeights) * np.power(distMerch, merchWeights)
        
        # if np.isnan(similarity_pairs[i]).any():
        #     np.set_printoptions(threshold=10000)
        #     # print("similarity_pairs[i]", similarity_pairs[i])
        #     print("dist merch", distMerch)
        #     # print("dist routes", routeDistance)
        #     # print("pow1", np.power(routeDistance, routeWeights))
        #     # print("pow2", np.power(distMerch, merchWeights))
        #     # print("prod", np.power(routeDistance, routeWeights) * np.power(distMerch, merchWeights))
        #     # print("powers", routeWeights, merchWeights)
        #     print("BROKEN")
        #     return
        #similarity_pairs[i] = (1 - (sum_min_matrix / sum_max_matrix)) * (distMerch)
        #similarity_pairs[i] = 1 - (sum_min_matrix / sum_max_matrix + distMerch) / 2
        # if similarity_pairs[i] >= 1:
        #     print("similarity_pairs[i]", similarity_pairs[i])
        #     print("dist merch", distMerch, "cosine ", np.abs(np.dot(subset1Merch, subset2Merch) / (np.linalg.norm(subset1Merch) * np.linalg.norm(subset2Merch))))
        #     print("dist routes", (1 - (sum_min_matrix / sum_max_matrix)))
        #     print("prod", (1 - (sum_min_matrix / sum_max_matrix)) * distMerch)
        progress_proxy.update(1)
    return similarity_pairs

@njit(cache=True, nogil=True, parallel=True)
def compute_subset_similarity_matrices_and_merch(subset_matrix1, subset_matrix2, subset_matrix_merch1, subset_matrix_merch2, progress_proxy):
    n = subset_matrix1.shape[0]
    m = subset_matrix2.shape[0]
    mn = subset_matrix_merch1.shape[0]
    mn1 = subset_matrix_merch1.shape[1]
    mm = subset_matrix_merch2.shape[0]
    #subset_matrix = subset_matrix.astype(np.int64)
    subset_similarity_matrix = np.zeros((n, m))
    subset2 = subset_matrix2
    squareMatrix = np.full((mn, mn1), 2)
    for i in prange(n):
        subset1 = subset_matrix1[i].reshape(1, -1) #replicate_row(subset_matrix, i)    
        min_matrix = np.minimum(subset1, subset2)
        sum_min_matrix = np.sum(min_matrix, axis=-1)
        
        max_matrix = np.maximum(subset1, subset2)
        sum_max_matrix = np.sum(max_matrix, axis=-1)
        
        # COSINE
        #distMerch = 1 - (((subset1Merch * subset2Merch).sum(axis=1) / (np.sqrt(np.sum(np.power(subset1Merch, squareMatrix),axis=1)) * normsSubset2Merch)) + 1) / 2
        
        # JACCARD
        # min_matrix_merch = np.minimum(subset1Merch, subset2Merch)
        # sum_min_matrix_merch = np.sum(min_matrix_merch, axis=-1)
        
        # max_matrixMerch = np.maximum(subset1Merch, subset2Merch)
        # sum_max_matrixMerch = np.sum(max_matrixMerch, axis=-1)
        # distMerch = 1 - (sum_min_matrix_merch / sum_max_matrixMerch)
        
        # L2
        distMerch = np.sqrt(np.sum(np.power(subset_matrix_merch1 - subset_matrix_merch2, squareMatrix), axis=1))
        
        routeDistance = 1 - (sum_min_matrix / sum_max_matrix)
        # if i == 0 or i == n-1:
        #     print("i", i, "distMerch", distMerch.shape, distMerch)
        #     print("i", i, "sum_min_matrix", (1 - (sum_min_matrix / sum_max_matrix)).shape, (1 - (sum_min_matrix / sum_max_matrix)))
        #     print("i", i, "prod", ((1 - (sum_min_matrix / sum_max_matrix)) * distMerch).shape, ((1 - (sum_min_matrix / sum_max_matrix)) * distMerch))
        #print(i, (1 - (sum_min_matrix / sum_max_matrix)) * distMerch)
        subset_similarity_matrix[i] = (routeDistance + distMerch)/2
        
        
        progress_proxy.update(1)
    return subset_similarity_matrix

def jaccard_similarity_minhash_lsh_route_merch(matrix, matrixMerch, thresh_user=0.2):
    #similarity_matrix = csr_matrix((matrix.shape[0], matrix.shape[0]), dtype=np.float64)
    #similarity_matrix = lil_matrix((matrix.shape[0], matrix.shape[0]), dtype=np.float64)
    pairs = lsh(matrix, thresh_user=thresh_user)
    #uniqueRows = np.unique([i for i, j in pairs] + [j for i, j in pairs])
    uniqueRowsSet = set([i for i, j in pairs] + [j for i, j in pairs]) # (1,2) (1,4) (1,5)
    neverSeen = set([i for i in range(matrix.shape[0])]) - uniqueRowsSet
    print("neverSeen", neverSeen)
    #print("uniqueRows numpy", len(uniqueRows))
    print("num of subset of rows to check similarity:", len(uniqueRowsSet))
    #print(" num of pairs", len(uniqueRowsSet)*(len(uniqueRowsSet)-1)/2)
    print(" num of pairs", len(pairs))
    print(" instead of", matrix.shape[0]*(matrix.shape[0]-1)/2)
    print("improved by", (1 - len(pairs) / (matrix.shape[0]*(matrix.shape[0]-1)/2)) *100, "%")
    #print("num of pairs", len(pairs))
    #print("num unique i", len(set([i for i, j in pairs])))
    #print("num unique j", len(set([j for i, j in pairs])))
    #print("num unique rows", len(uniqueRows))
    #map_i = {i: index for i, index in enumerate(uniqueRowsSet)}
    #map_i_array = np.array([map_i[i] for i in range(len(map_i))])
    
    #subset_matrix = matrix[list(uniqueRowsSet)]
    
    #subset_similarity_matrix = np.full((subset_matrix.shape[0], subset_matrix.shape[0]), np.inf)
    
    # compute connected components on pairs
    # print("Computing connected components...")
    # G = nx.Graph()
    # G.add_edges_from(pairs)
    # connected_components = nx.connected_components(G)
    # #print("num connected components", ))
    # islands = []
    # for i, cc in enumerate(connected_components):
    #     islands.append(list(cc))
        
    # islands.append(islands[0])
    
    # minimum linkage
    #pairs = np.concatenate([pairs, np.array([[i, j] for i, j in zip(islands[:-1], islands[1:])])])
    
    # complete linkage
    #pairs = np.concatenate([pairs, np.array([[i, j] for i in islands for j in islands if i != j])])
    
    # pairwise linkage with every element of islands with every other element of all the other islands
    #pairs = np.array([[i, j] for island1 in islands for island2 in islands if island1 != island2 for i in island1 for j in island2])
    
    # G = nx.Graph()
    # G.add_edges_from(pairs)
    # connected_components = nx.connected_components(G)
    # print("num connected components after", len(list(connected_components)))
    
        
    print("Computing jaccard similarity on subset matrix...")
    #print("subset matrix", subset_matrix.shape)

    # with ProgressBar(total=len(pairs)) as progress:
    #     distance_pairs = compute_subset_similarity_matrix_only_pairs(matrix, matrixMerch, pairs, progress)
    
    sortedUniqueRowsSet = sorted(list(uniqueRowsSet))
    subset_matrix = matrix[sortedUniqueRowsSet]
    subset_matrixMerch = matrixMerch[sortedUniqueRowsSet]
    print("subset_matrix", subset_matrix.shape, subset_matrix[0])
    print("subset_matrixMerch", subset_matrixMerch.shape, subset_matrixMerch[0])
    with ProgressBar(total=len(sortedUniqueRowsSet)) as progress:
        subset_sim_matrix = compute_subset_similarity_matrix_and_merch(subset_matrix, subset_matrixMerch, progress)
    print("subset_sim_matrix", subset_sim_matrix.shape, subset_sim_matrix[0])
    print("subset_sim_matrix contains nan", np.isnan(subset_sim_matrix).any())
    print("nan indices", len(np.argwhere(np.isnan(subset_sim_matrix))), np.argwhere(np.isnan(subset_sim_matrix)))
    
    # if len(neverSeen) > 0:
    #     for i, n in enumerate(neverSeen):
    #         distance_pairs = np.concatenate([distance_pairs, [1]*(matrix.shape[0]-1-i)])
        
    #     pairs = np.concatenate([pairs, np.array([[i, j] for i,n  in enumerate(neverSeen) for j in range(i, matrix.shape[0]) if i != j])])
    #print("pairs", pairs.shape, pairs[-10:])
    # map back to original matrix
    print("Mapping back to original matrix...")
    
    lenMatrixNoNeverSeen = matrix.shape[0] - len(neverSeen)
    
    # remove never seen rows and map indices
    map_indices = {}
    sortedNeverSeen = sorted(list(neverSeen))
    counter = 0
    for i in range(matrix.shape[0]):
        if i in sortedNeverSeen:
            continue
        map_indices[i] = counter
        counter += 1
        
    print("map_indices", map_indices)
    map_indices_back = {v: k for k, v in map_indices.items()}
    
    # Create data array for COO matrix
    # indices_i, indices_j = np.array(pairs).T   # 9400    
    # # map indices to new indices
    # indices_i = np.array([map_indices[i] for i in indices_i])
    # indices_j = np.array([map_indices[j] for j in indices_j])
    
    # data = np.concatenate([distance_pairs, distance_pairs])

    # # Create row and column index arrays for COO matrix
    # rows = np.concatenate([indices_i, indices_j])
    # cols = np.concatenate([indices_j, indices_i])

    # # Create COO matrix
    # similarity_matrix = coo_matrix((data, (rows, cols)), shape=(lenMatrixNoNeverSeen, lenMatrixNoNeverSeen))
    
    # indices_i, indices_j = np.triu_indices(subset_sim_matrix.shape[0], k=1)
    # print("indices_i", indices_i.shape, indices_i)
    # print("indices_j", indices_j.shape, indices_j)
    # # map indices to new indices
    # indices_i_mapped = np.array([map_indices_back[i] for i in indices_i])
    # indices_j_mapped = np.array([map_indices_back[j] for j in indices_j])
    
    # # create coo matrix
    
    # # Create data array for COO matrix
    # data = np.concatenate([subset_sim_matrix[indices_i, indices_j], subset_sim_matrix[indices_i, indices_j]])
    
    # # Create row and column index arrays for COO matrix
    # rows = np.concatenate([indices_i_mapped, indices_j_mapped])
    # cols = np.concatenate([indices_j_mapped, indices_i_mapped])
    
    # # Create COO matrix
    # similarity_matrix = coo_matrix((data, (rows, cols)), shape=(lenMatrixNoNeverSeen, lenMatrixNoNeverSeen))
    
    
    
    #similarity_matrix = similarity_matrix.tocsr()
    # Update the similarity matrix
    #similarity_matrix[map_i_array[indices_i], map_i_array[indices_j]] = subset_similarity_matrix[indices_i, indices_j]
    #similarity_matrix[map_i_array[indices_j], map_i_array[indices_i]] = subset_similarity_matrix[indices_i, indices_j]
    
    # for i, j in tqdm(pairs, desc="lsh sim"):
    #     similarity_matrix[i, j] = np.count_nonzero(matrix[i, :] == matrix[j, :]) / matrix.shape[1]
    #     similarity_matrix[j, i] = similarity_matrix[i, j]
    
    #similarity_matrix.setdiag(1)
    subset_sim_matrix = csr_matrix(subset_sim_matrix)
    
    return subset_sim_matrix, map_indices_back

def jaccard_similarity_minhash_lsh_two_matrices_and_merch(matrix1, matrix2, matrixMerch1, matrixMerch2, thresh_user=0.2):
    pairs = lsh_two_matrices(matrix1, matrix2, thresh_user=thresh_user)
    print("pairs", pairs.shape)
    uniqueRowsSet = set([i for i, j in pairs] + [j for i, j in pairs]) # (1,2) (1,4) (1,5)
    neverSeen = set([i for i in range(matrix1.shape[0])]) - uniqueRowsSet
    print("neverSeen", neverSeen)
    print("num of subset of rows to check similarity:", len(uniqueRowsSet))
    print(" num of pairs", len(pairs))
    print(" instead of", matrix1.shape[0]*(matrix1.shape[0]-1)/2)
    print("improved by", (1 - len(pairs) / (matrix1.shape[0]*(matrix1.shape[0]-1)/2)) *100, "%")
    
    sortedUniqueRowsSet = sorted(list(uniqueRowsSet))
    print("sortedUniqueRowsSet", sortedUniqueRowsSet)
    subset_matrix1 = matrix1[sortedUniqueRowsSet]
    subset_matrix2 = matrix2[sortedUniqueRowsSet]
    
    subset_matrixMerch1 = matrixMerch1[sortedUniqueRowsSet]
    subset_matrixMerch2 = matrixMerch2[sortedUniqueRowsSet]
    
    print("subset_matrix1", subset_matrix1.shape, subset_matrix1[0])
    print("subset_matrix2", subset_matrix2.shape, subset_matrix2[0])
    
    print("subset_matrixMerch1", subset_matrixMerch1.shape, subset_matrixMerch1[0])
    print("subset_matrixMerch2", subset_matrixMerch2.shape, subset_matrixMerch2[0])
    
    with ProgressBar(total=len(sortedUniqueRowsSet)) as progress:
        subset_sim_matrix = compute_subset_similarity_matrices(subset_matrix1, subset_matrix2, progress)
    print("subset_sim_matrix", subset_sim_matrix.shape, subset_sim_matrix[0])
    print("subset_sim_matrix contains nan", np.isnan(subset_sim_matrix).any())
    print("nan indices", len(np.argwhere(np.isnan(subset_sim_matrix))), np.argwhere(np.isnan(subset_sim_matrix)))
    
    # if len(neverSeen) > 0:
    #     for i, n in enumerate(neverSeen):
    #         distance_pairs = np.concatenate([distance_pairs, [1]*(matrix.shape[0]-1-i)])
    
    # map back to original matrix
    print("Mapping back to original matrix...")
    
    lenMatrixNoNeverSeen = matrix1.shape[0] - len(neverSeen)
    
    # remove never seen rows and map indices
    map_indices = {}
    sortedNeverSeen = sorted(list(neverSeen))
    counter = 0
    for i in range(matrix1.shape[0]):
        if i in sortedNeverSeen:
            continue
        map_indices[i] = counter
        counter += 1
        
    print("map_indices", map_indices)
    map_indices_back = {v: k for k, v in map_indices.items()}
    


# def jaccard_similarity_minhash_lsh(matrix, thresh_user=0.2):
#     #similarity_matrix = csr_matrix((matrix.shape[0], matrix.shape[0]), dtype=np.float64)
#     #similarity_matrix = lil_matrix((matrix.shape[0], matrix.shape[0]), dtype=np.float64)
#     pairs = lsh(matrix, thresh_user=thresh_user)
#     #uniqueRows = np.unique([i for i, j in pairs] + [j for i, j in pairs])
#     uniqueRowsSet = set([i for i, j in pairs] + [j for i, j in pairs]) # (1,2) (1,4) (1,5)
#     #print("uniqueRows numpy", len(uniqueRows))
#     print("num of subset of rows to check similarity:", len(uniqueRowsSet))
#     #print(" num of pairs", len(uniqueRowsSet)*(len(uniqueRowsSet)-1)/2)
#     print(" num of pairs", len(pairs))
#     print(" instead of", matrix.shape[0]*(matrix.shape[0]-1)/2)
#     print("improved by", len(pairs) / (matrix.shape[0]*(matrix.shape[0]-1)/2)*100, "%")
#     #print("num of pairs", len(pairs))
#     #print("num unique i", len(set([i for i, j in pairs])))
#     #print("num unique j", len(set([j for i, j in pairs])))
#     #print("num unique rows", len(uniqueRows))
#     map_i = {i: index for i, index in enumerate(uniqueRowsSet)}
#     map_i_array = np.array([map_i[i] for i in range(len(map_i))])
    
#     subset_matrix = matrix[list(uniqueRowsSet)]
    
#     subset_similarity_matrix = np.full((subset_matrix.shape[0], subset_matrix.shape[0]), np.inf)
    
#     print("Computing jaccard similarity on subset matrix...")
#     print("subset matrix", subset_matrix.shape)
    
#     # subset1 = subset_matrix[:, None, :]
#     # subset2 = subset_matrix[None, :, :]
#     # min_matrix = np.minimum(subset1, subset2) # (10, 100) -> (10, 1, 100) -> (1, 10, 100) -> (10, 10, 100)
#     # sum_min_matrix = np.sum(min_matrix, axis=-1)
#     # print("sum_min_matrix", sum_min_matrix.shape)
    
#     # max_matrix = np.maximum(subset1, subset2)
#     # sum_max_matrix = np.sum(max_matrix, axis=-1)
#     # print("sum_max_matrix", sum_max_matrix.shape)
    
#     #subset_similarity_matrix =  sum_min_matrix / sum_max_matrix
#     #subset_similarity_matrix = np.divide(sum_min_matrix, sum_max_matrix, out=np.zeros_like(sum_min_matrix), where=(sum_max_matrix != 0))
    
#     # for i in tqdm(range(subset_matrix.shape[0])):
#     #     subset1 = np.vstack([subset_matrix[i]] * subset_matrix.shape[0])
#     #     subset2 = subset_matrix
#     #     min_matrix = np.minimum(subset1, subset2) # (10, 100) -> (10, 1, 100) -> (1, 10, 100) -> (10, 10, 100)
#     #     sum_min_matrix = np.sum(min_matrix, axis=-1)
#     #     #print("sum_min_matrix", sum_min_matrix.shape)
        
#     #     max_matrix = np.maximum(subset1, subset2)
#     #     sum_max_matrix = np.sum(max_matrix, axis=-1)
#     #     #print("sum_max_matrix", sum_max_matrix.shape)
        
#     #     subset_similarity_matrix[i] =  (sum_min_matrix / sum_max_matrix).T
#     with ProgressBar(total=subset_matrix.shape[0]) as progress:
#         subset_similarity_matrix = compute_subset_similarity_matrix(subset_matrix, progress)
    
#     # map back to original matrix
#     print("Mapping back to original matrix...")
#     # Create arrays of indices
#     # Create data array for COO matrix
#     indices_i, indices_j = np.triu_indices(subset_similarity_matrix.shape[0], k=1)
#     data = np.concatenate([subset_similarity_matrix[indices_i, indices_j], subset_similarity_matrix[indices_i, indices_j]])

#     # Create row and column index arrays for COO matrix
#     rows = np.concatenate([map_i_array[indices_i], map_i_array[indices_j]])
#     cols = np.concatenate([map_i_array[indices_j], map_i_array[indices_i]])

#     # Create COO matrix
#     similarity_matrix = coo_matrix((data, (rows, cols)), shape=(matrix.shape[0], matrix.shape[0]))
    
#     # indices_i, indices_j = np.triu_indices(subset_similarity_matrix.shape[0], k=1)
#     # similarity_matrix = similarity_matrix.tocsr()
#     # # Update the similarity matrix
#     # similarity_matrix[map_i_array[indices_i], map_i_array[indices_j]] = subset_similarity_matrix[indices_i, indices_j]
#     # similarity_matrix[map_i_array[indices_j], map_i_array[indices_i]] = subset_similarity_matrix[indices_i, indices_j]
    
#     # for i, j in tqdm(pairs, desc="lsh sim"):
#     #     similarity_matrix[i, j] = np.count_nonzero(matrix[i, :] == matrix[j, :]) / matrix.shape[1]
#     #     similarity_matrix[j, i] = similarity_matrix[i, j]
    
#     #similarity_matrix.setdiag(1)
#     similarity_matrix = similarity_matrix.tocsr()
    
#     return similarity_matrix

def jaccard_similarity_minhash_lsh_two_matrices(matrix1, matrix2, thresh_user=0.2):
    #similarity_matrix = csr_matrix((matrix.shape[0], matrix.shape[0]), dtype=np.float64)
    #similarity_matrix = lil_matrix((matrix1.shape[0], matrix2.shape[0]), dtype=np.float64)
    pairs = lsh(matrix1, thresh_user=thresh_user)
    #uniqueRows = np.unique([i for i, j in pairs] + [j for i, j in pairs])
    uniqueRowsSet = set([i for i, j in pairs] + [j for i, j in pairs])
    #print("uniqueRows numpy", len(uniqueRows))
    print("num of subset of rows to check similarity:", len(uniqueRowsSet))
    print(" num of pairs", len(uniqueRowsSet)*(len(uniqueRowsSet)-1)/2)
    print(" instead of", matrix1.shape[0]*(matrix1.shape[0]-1)/2)
    print("improved by", (1 - len(pairs)*(len(pairs)-1)/2 / (matrix1.shape[0]*(matrix1.shape[0]-1)/2))*100, "%")
    #print("num of pairs", len(pairs))
    #print("num unique i", len(set([i for i, j in pairs])))
    #print("num unique j", len(set([j for i, j in pairs])))
    #print("num unique rows", len(uniqueRows))
    map_i = {i: index for i, index in enumerate(uniqueRowsSet)}
    map_i_array = np.array([map_i[i] for i in range(len(map_i))])
    
    subset_matrix1 = matrix1[list(uniqueRowsSet)]
    subset_matrix2 = matrix2[list(uniqueRowsSet)]
    
    subset_similarity_matrix = np.full((subset_matrix1.shape[0], subset_matrix2.shape[0]), np.inf)
    
    print("Computing jaccard similarity on subset matrix...")
    print("subset matrix", subset_matrix1.shape)

    with ProgressBar(total=subset_matrix1.shape[0]) as progress:
        subset_similarity_matrix = compute_subset_similarity_matrix(subset_matrix1, progress)
    
    # map back to original matrix
    print("Mapping back to original matrix...")
    # Create arrays of indices
    # Create data array for COO matrix
    indices_i, indices_j = np.triu_indices(subset_similarity_matrix.shape[0], k=1)
    data = np.concatenate([subset_similarity_matrix[indices_i, indices_j], subset_similarity_matrix[indices_i, indices_j]])
    
    # Create row and column index arrays for COO matrix
    rows = np.concatenate([map_i_array[indices_i], map_i_array[indices_j]])
    cols = np.concatenate([map_i_array[indices_j], map_i_array[indices_i]])
    
    # Create COO matrix
    similarity_matrix = coo_matrix((data, (rows, cols)), shape=(matrix1.shape[0], matrix2.shape[0]))
    similarity_matrix.setdiag(0)
    
    similarity_matrix = similarity_matrix.tocsr()
    
    return similarity_matrix
    

def jaccard_similarity_two_matrices(matrix1, matrix2):
    #intersection = np.dot(matrix, matrix.T)
    intersection = np.dot(matrix1, matrix2.T)
    row_sums1 = matrix1.sum(axis=1)
    row_sums2 = matrix2.sum(axis=1)
    union = row_sums1[:, None] + row_sums2 - intersection
    union = np.where(union == 0, 1, union)  # avoid division by zero
    jaccard_similarity = intersection / union
    return jaccard_similarity

def jaccard_similarity_matrix_merch(matrix):
    print("matrix", matrix.shape)
    min_matrix = np.minimum(matrix[:, None, :], matrix[None, :, :]) # (10, 100) -> (10, 1, 100) -> (1, 10, 100) -> (10, 10, 100)
    sum_min_matrix = np.sum(min_matrix, axis=-1)
    print("sum_min_matrix", sum_min_matrix.shape)
    
    max_matrix = np.maximum(matrix[:, None, :], matrix[None, :, :])
    sum_max_matrix = np.sum(max_matrix, axis=-1)
    print("sum_max_matrix", sum_max_matrix.shape)
    
    jaccard_similarity = sum_min_matrix / sum_max_matrix
    return jaccard_similarity

    # current wrong method
    # 1 2 3    1 4 7    1 2 3  6     6 6 6
    # 4 5 6    2 5 8    2 5 6  13    6 15 15
    # 7 8 9    3 6 9    3 6 9        6 15 24
    
    
    # right method
    # 1 2 3    1 2 3    1 2 3
    # 1 2 3    4 5 6    1 2 3
    # 1 2 3    7 8 9    1 2 3
    
    # 4 5 6    1 2 3    1 2 3
    # 4 5 6    4 5 6    4 5 6
    # 4 5 6    7 8 9    4 5 6
    
    # 7 8 9    1 2 3    1 2 3
    # 7 8 9    4 5 6    4 5 6
    # 7 8 9    7 8 9    7 8 9
    
    
    #                   6 6 6
    #                   6 15 15
    #                   6 15 24
    
def similarity_matrix_merch(matrix):
    if 1.0 - np.count_nonzero(matrix) / matrix.size > 0.5:
        print("merch matrix is sparse")
    matrix = csr_matrix(matrix)
    print("matrix merch shape", matrix.shape)
    simMatrix = cosine_similarity(matrix, dense_output=False)
    
    return simMatrix

def create_binary_matrix(routeSets):
    uniqueShingles = list(set(shingle for route in routeSets for shingle in route[1]))
    print("uniqueShingles", len(uniqueShingles))

    # Create a dictionary that maps each shingle to its index
    shingle_to_index = {shingle: index for index, shingle in enumerate(uniqueShingles)}
    print("shingle_to_index", len(shingle_to_index))

    binaryMatrix = np.zeros((len(routeSets), len(uniqueShingles)), dtype=int)

    for i, route in enumerate(routeSets):
        #print("i", i)
        # Get the indices of the shingles in this route
        indices = [shingle_to_index[shingle] for shingle in route[1]]
        # Use advanced indexing to set the corresponding elements in the binary matrix to 1
        binaryMatrix[i, indices] = 1

    return binaryMatrix

def create_binary_matrix_minhash(matrix):
    numUnique = np.unique(matrix)
    binaryMatrix = np.zeros((matrix.shape[0], len(numUnique)), dtype=int)
    for i, route in enumerate(matrix):
        indices = np.where(route == 1)[0]
        binaryMatrix[i, indices] = 1
    
    return binaryMatrix

def create_binary_matrices(routeSet1, routeSet2):
    # create binary matrix where each row represents a route
    uniqueShinglesBoth = list(set([shingle for route in routeSet1 for shingle in route[1]] + [shingle for route in routeSet2 for shingle in route[1]]))
    binaryMatrix1 = np.zeros((len(routeSet1), len(uniqueShinglesBoth)))
    binaryMatrix2 = np.zeros((len(routeSet2), len(uniqueShinglesBoth)))
    for i, route in enumerate(routeSet1):
        for shingle in route[1]:
            binaryMatrix1[i][uniqueShinglesBoth.index(shingle)] = 1
            
    for i, route in enumerate(routeSet2):
        for shingle in route[1]:
            binaryMatrix2[i][uniqueShinglesBoth.index(shingle)] = 1
    return binaryMatrix1, binaryMatrix2

def find_num_hashes_minhash(matrix):
    if matrix.shape[1] < 150:
        num_hash_functions = matrix.shape[1]
    elif matrix.shape[1] < 500:
        num_hash_functions = matrix.shape[1]//2
    elif matrix.shape[1] < 1000:
        num_hash_functions = matrix.shape[1]//10
    elif matrix.shape[1] < 10_000:
        num_hash_functions = 150
    elif matrix.shape[1] < 100_000:
        num_hash_functions = 250
    else:
        num_hash_functions = 300
    return num_hash_functions


# convert routes and merchandise to binary matrices
# binary matrix where each row represents a route
print("Creating route binary matrix...")
route_matrix, route_matrix_standard = create_binary_matrices(actualSets, standardSets)
print("\nroute_matrix actual", route_matrix.shape, route_matrix[0])
print("\nroute_matrix standard", route_matrix_standard.shape, route_matrix_standard[0])

print("Minhashing route matrix...")    
num_hash_functions = find_num_hashes_minhash(route_matrix)
#route_matrix = minhash(route_matrix, num_hash_functions if num_hash_functions % 2 == 0 else num_hash_functions + 1)
route_matrix, route_matrix_standard = minhash_matrices(route_matrix, route_matrix_standard, num_hash_functions if num_hash_functions % 2 == 0 else num_hash_functions + 1)
print("\nroute_matrix minhash", route_matrix.shape, route_matrix[0])
# binary matrix where each row represents merchandise

print("Creating merchandise binary matrix...")
merch_matrix = np.array([s[2] for s in actualSets])

print("\nmerch_matrix", merch_matrix.shape, merch_matrix)
print("merch_matrix contains nan", np.isnan(merch_matrix).any())

# compute Jaccard similarity for each matrix
# print("Computing Jaccard similarity route matrix...")
# route_similarity = jaccard_similarity_minhash_lsh(route_matrix, thresh_user=0.4)
# #route_similarity = jaccard_similarity_matrix(route_matrix)
# print("\nroute_similarity", type(route_similarity), route_similarity.shape,route_similarity[0, 0], route_similarity[0])
# #merch_similarity = jaccard_similarity_matrix_merch(merch_matrix)
# print("Computing Jaccard similarity merchandise matrix...")
# #merch_similarity = similarity_matrix_merch(merch_matrix)
# merch_similarity_lsh = jaccard_similarity_minhash_lsh(merch_matrix, thresh_user=0.4)
# print("\nmerch_similarity", type(merch_similarity_lsh), merch_similarity_lsh.shape, merch_similarity_lsh[0])

print("Computing Jaccard similarity route matrix...")
actualSetsDistances, map_indices_back = jaccard_similarity_minhash_lsh_route_merch(route_matrix, merch_matrix, thresh_user=0.0)
#route_similarity = jaccard_similarity_matrix(route_matrix)
print("\nactualSetsDistances", type(actualSetsDistances), actualSetsDistances.shape,actualSetsDistances[0, 0], actualSetsDistances[0])
print("map indices back", map_indices_back)


# # compute final Jaccard distance
# print("Multiplying Jaccard similarities...")
# actualSetsDistances = (route_similarity.multiply(merch_similarity_lsh))
# actualSetsDistances = np.nan_to_num(actualSetsDistances, nan=0)
#actualSetsDistances = 1 - actualSetsDistances
#print("\nactualSetsDistances", actualSetsDistances.shape, actualSetsDistances[0, 0], actualSetsDistances[0])

# Essentials for Task 2
print("\n\nTASK 2 ESSENTIALS\n\n")

# standardToActualSetsDistances = None
#route_matrix_standard = create_binary_matrix(standardSets)
print("Minhashing standard route matrix...")
print("\nroute_matrix_standard", route_matrix_standard.shape, route_matrix_standard[0])
#route_matrix_standard = minhash(route_matrix_standard, num_hash_functions if num_hash_functions % 2 == 0 else num_hash_functions + 1)
print("\nroute_matrix_standard minhash", route_matrix_standard.shape, route_matrix_standard[0])

merch_matrix_standard = np.array([s[2] for s in standardSets])

route_similarity_standard_to_actual = similarity_minhash_lsh_two_matrices_and_merch(route_matrix, merch_matrix, route_matrix_standard, merch_matrix_standard, thresh_user=0.0)
print("\nroute_similarity_standard_to_actual", route_similarity_standard_to_actual.shape, route_similarity_standard_to_actual[0])

#merch_similarity_lsh_standard_to_actual = jaccard_similarity_minhash_lsh_two_matrices(merch_matrix, merch_matrix_standard, thresh_user=0.4)





Creating route binary matrix...



The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.




route_matrix actual (10000, 6067) [0. 0. 0. ... 0. 0. 0.]

route_matrix standard (10, 6067) [0. 0. 0. ... 0. 0. 0.]
Minhashing route matrix...


minhashing: 100%|██████████| 10000/10000 [00:00<00:00, 80130.83it/s]
minhashing second matrix: 100%|██████████| 10/10 [00:00<00:00, 14747.90it/s]


route_matrix minhash (10000, 150) [ 322.  887.  584.  446.  113.   23.  528.  202.  394. 1273.  342.  454.
  248.   43.  142.   69.  239.  482.  733.  102.  359.    0.  137.  723.
  879.  268.  192.   35.  225.  337.  556.  191.  128.  293.  331.  845.
  200.  468.  325.   65.  310.  253.  460.  168.  141.  634.  128.  234.
  586.  688.  521.  240.   55.   54.  243.  374. 1631.  130.  147.  419.
  362.  177.  239.  225.  357.  342.  164.  442.  504.  102.  221.  252.
   78.  218.   14.   87.  167. 1510.  396.   10.  955.  677.  487.  110.
  142.  138.  273.  219.   30.  315.  897.   25.  345.   18.  213.  190.
  258.  473.  528.   12.   15.  112.  727.  229.  114.  269.  129.   59.
  225.   72.   33.  502.  525.   89.   92.   74.  114.  204.   45.  172.
    2.  290.   94.   60.   91.   67.  510.  405.   94.  525.  294.  406.
  258.  115.  486.  688.   20.  165.  480.  512.  829.   18.   74.  534.
  107.  253.  322.  754.  177.   84.]
Creating merchandise binary matrix...

merch_matrix




Finding candidate pairs...


100%|██████████| 10000/10000 [00:05<00:00, 1751.77it/s]


neverSeen set()
num of subset of rows to check similarity: 10000
 num of pairs 5462188
 instead of 49995000.0
improved by 89.07453145314531 %
Computing jaccard similarity on subset matrix...
subset_matrix (10000, 150) [ 322.  887.  584.  446.  113.   23.  528.  202.  394. 1273.  342.  454.
  248.   43.  142.   69.  239.  482.  733.  102.  359.    0.  137.  723.
  879.  268.  192.   35.  225.  337.  556.  191.  128.  293.  331.  845.
  200.  468.  325.   65.  310.  253.  460.  168.  141.  634.  128.  234.
  586.  688.  521.  240.   55.   54.  243.  374. 1631.  130.  147.  419.
  362.  177.  239.  225.  357.  342.  164.  442.  504.  102.  221.  252.
   78.  218.   14.   87.  167. 1510.  396.   10.  955.  677.  487.  110.
  142.  138.  273.  219.   30.  315.  897.   25.  345.   18.  213.  190.
  258.  473.  528.   12.   15.  112.  727.  229.  114.  269.  129.   59.
  225.   72.   33.  502.  525.   89.   92.   74.  114.  204.   45.  172.
    2.  290.   94.   60.   91.   67.  510.  405.   9

  0%|          | 0/10000 [00:00<?, ?it/s]

n 10000 m 20
matrix merch (10000, 20)
matrix square (10000, 20)
routeWeights (10000,)
merchWeights (10000,)
subset_sim_matrix (10000, 10000) [0.         0.36440132 0.31005786 ... 0.55026892 0.55519078 0.54625914]
subset_sim_matrix contains nan False
nan indices 0 []
Mapping back to original matrix...
map_indices {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 67, 68: 68, 69: 69, 70: 70, 71: 71, 72: 72, 73: 73, 74: 74, 75: 75, 76: 76, 77: 77, 78: 78, 79: 79, 80: 80, 81: 81, 82: 82, 83: 83, 84: 84, 85: 85, 86: 86, 87: 87, 8

100%|██████████| 10000/10000 [00:00<00:00, 156543.60it/s]

Computing distance  on subset matrix...





  0%|          | 0/10000 [00:00<?, ?it/s]



Compilation is falling back to object mode WITH looplifting enabled because Function "compute_distance_pairs_merch" failed type inference due to: non-precise type pyobject
During: typing of argument at /var/folders/7g/qgw0d2v55szdww4y4_lzltt80000gn/T/ipykernel_11659/2885281117.py (311)

File "../../../../var/folders/7g/qgw0d2v55szdww4y4_lzltt80000gn/T/ipykernel_11659/2885281117.py", line 311:
<source missing, REPL/exec in use?>



Function "compute_distance_pairs_merch" was compiled in object mode without forceobj=True.

File "../../../../var/folders/7g/qgw0d2v55szdww4y4_lzltt80000gn/T/ipykernel_11659/2885281117.py", line 311:
<source missing, REPL/exec in use?>




Fall-back from the nopython compilation path to the object mode compilation path has been detected. This is deprecated behaviour that will be removed in Numba 0.59.0.

For more information visit https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit


route_distance (2,)
merch_distance (2,)
route_distance (2,)
merch_distance (2,)
route_distance (1,)
merch_distance (1,)
route_distance (1,)
merch_distance (1,)
route_distance (1,)
merch_distance (1,)
route_distance (1,)
merch_distance (1,)
route_distance (2,)
merch_distance (2,)
route_distance (1,)
merch_distance (1,)
route_distance (1,)
merch_distance (1,)
route_distance (1,)
merch_distance (1,)
route_distance (1,)
merch_distance (1,)
route_distance (1,)
merch_distance (1,)
route_distance (2,)
merch_distance (2,)
route_distance (2,)
merch_distance (2,)
route_distance (1,)
merch_distance (1,)
route_distance (1,)
merch_distance (1,)
route_distance (1,)
merch_distance (1,)
route_distance (1,)
merch_distance (1,)
route_distance (1,)
merch_distance (1,)
route_distance (2,)
merch_distance (2,)
route_distance (1,)
merch_distance (1,)
route_distance (1,)
merch_distance (1,)
route_distance (1,)
merch_distance (1,)
route_distance (1,)
merch_distance (1,)
route_distance (2,)
merch_distance (2,)


In [200]:
# def jaccard_distance_routes(route1, route2):
#     id1 = route1[0]
#     id2 = route2[0]
#     r1 = set(route1[1])
#     r2 = set(route2[1])
#     merch1 = route1[2]
#     merch2 = route2[2]
    
#     intersection = len(list(r1.intersection(r2)))
#     union = (len(r1) + len(r2)) - intersection
#     jaccard_similarity = float(intersection) / union if union != 0 else 0
    
#     intersectionMerch = np.sum(np.minimum(merch1, merch2))
#     unionMerch = np.sum(np.maximum(merch1, merch2))
#     jaccard_similarity_merch = float(intersectionMerch) / unionMerch if unionMerch != 0 else 0
    
#     return 1 - (jaccard_similarity + jaccard_similarity_merch) / 2

# forward_expansion = len(actualSets) // len(standardSets)
# # precompute the distances between the elements of the actual sets
# actualSetsDistances = np.zeros((len(actualSets), len(actualSets)))
# for i in range(len(actualSets)):
#     for j in range(len(actualSets)):
#         actualSetsDistances[i,j] = jaccard_distance_routes(actualSets[i], actualSets[j])
#print("actualSetsDistances: ", actualSetsDistances.shape, actualSetsDistances[0])

In [201]:
# HDBSCAN clustering
# Compute HDBSCAN

# import gridsearchcv

#grid = GridSearchCV(hdbscan, scoring=hdbscan.probabilities_, , verbose=1)
# min_cluster_size = [10, 15, 20, 25, 30, 35, 40, 45, 50]
# max_cluster_size = [55, 60, 65, 75, 80, 85, 90, 95, 100]
forward_expansion = len(actualSets) // len(standardSets) #! TODO: change to mean
print("forward_expansion", forward_expansion)
from hdbscan import HDBSCAN as hdbsc

#actualSetsDistances = actualSetsDistances.astype(np.float64)

# find rows with only zeros and set them to 1 of csr matrix
#actualSetsDistances[actualSetsDistances.getnnz(1)==0, :] = 1
#print("get nnz", np.where((actualSetsDistances.getnnz(1)==1)==True)[0])
#print("get nnz of 2 values", (actualSetsDistances==0))
#print(actualSetsDistances[:, 9816])
#print(actualSetsDistances[9816, :])
#actualSetsDistances[:, actualSetsDistances.getnnz(1)==0] = 1

#actualSetsDistances.setdiag(0)
#actualSetsDistances = actualSetsDistances.toarray()
print("actualSetsDistances", actualSetsDistances.shape, actualSetsDistances)


print("type(actualSetsDistances)", type(actualSetsDistances), actualSetsDistances.dtype, actualSetsDistances.shape, actualSetsDistances.count_nonzero(), min(actualSetsDistances.getnnz(axis=-1)), np.unique(actualSetsDistances.data))
#actualSetsDistances = np.array(actualSetsDistances)

#print("get nnz of 2 values", np.where((actualSetsDistances==2))[0])


print("Computing HDBSCAN...")
hdb = HDBSCAN(min_cluster_size=forward_expansion//3, max_cluster_size=forward_expansion, metric="precomputed", store_centers=None,allow_single_cluster=False).fit(actualSetsDistances.copy())
#hdb = DBSCAN(eps=0.5, min_samples=forward_expansion//3, metric="precomputed").fit(actualSetsDistances.copy())

labels_HDBSCAN = hdb.labels_
print("num clusters found: ", len(set(labels_HDBSCAN)))
print("biggest cluster: ", max(labels_HDBSCAN, key=list(labels_HDBSCAN).count), " num elements: ", list(labels_HDBSCAN).count(max(labels_HDBSCAN, key=list(labels_HDBSCAN).count)))

# Create a color map that maps each unique label to a color
unique_labels = np.unique(labels_HDBSCAN)
#unique_labels = unique_labels[unique_labels != -1]
print("unique_labels: ", len(unique_labels), unique_labels, "standard sets len", len(standardSets))

#centroids = hdb.centroids_
#medoids = hdb.medoids_

#print("centroids: ", centroids.shape)
#print("medoids: ", medoids.shape)

# find the medoids using the clusters found by HDBSCAN
print("actualSetsDistances: ", actualSetsDistances.shape, actualSetsDistances[0])
medoidsIndices = []
cluster_mean_distances = []
for cluster in unique_labels:
    #print("cluster: ", cluster)
    if cluster in [-1, -2, -3]:
        continue
    cluster_elements = np.where(labels_HDBSCAN == cluster)[0]
    #print("cluster_elements: ", cluster_elements.shape, cluster_elements)
    #print("cluster_elements: ", actualSetsDistances[cluster_elements].shape)
    cluster_distances = actualSetsDistances[cluster_elements][:,cluster_elements]
    #print("cluster_distances: ", cluster_distances.shape, cluster_distances)
    #print("real distance", actualSetsDistances[cluster_elements[0], cluster_elements[0]])
    cluster_distances_sum = np.sum(cluster_distances, axis=1)
    cluster_distances_mean = np.mean(cluster_distances, axis=1)
    cluster_mean_distances.append(np.min(cluster_distances_mean))
    #print("cluster min mean distance: ", np.min(cluster_distances_mean))
    medoid = cluster_elements[np.argmin(cluster_distances_sum)]
    #medoidMean = cluster_elements[np.argmin(cluster_distances_mean)]
    #print("medoidMean", medoidMean, "medoid", medoid)
    #print("medoid", medoid)
    medoidsIndices.append(medoid)
medoidsIndices = np.array(medoidsIndices)

print("medoidsIndices: ", medoidsIndices.shape, medoidsIndices)
print("cluster_mean_distances: ", len(cluster_mean_distances), cluster_mean_distances)


forward_expansion 1000
actualSetsDistances (10000, 10000)   (0, 1)	0.36440132225935473
  (0, 2)	0.31005785764525856
  (0, 3)	0.33103320080274373
  (0, 4)	0.270682177418315
  (0, 5)	0.3210083874878585
  (0, 6)	0.3074955512423982
  (0, 7)	0.30345081661652434
  (0, 8)	0.3085444242189584
  (0, 9)	0.285301316381609
  (0, 10)	0.30006572253191277
  (0, 11)	0.2627845682973626
  (0, 12)	0.30362315233480697
  (0, 13)	0.28597039725813045
  (0, 14)	0.350031961824082
  (0, 15)	0.3268862313927416
  (0, 16)	0.23116100580998827
  (0, 17)	0.30183968348115625
  (0, 18)	0.31908304554161
  (0, 19)	0.2663551025197918
  (0, 20)	0.34747783884821765
  (0, 21)	0.2836902741464003
  (0, 22)	0.36038226365324344
  (0, 23)	0.2892571394327176
  (0, 24)	0.30985700617293715
  (0, 25)	0.28744912819561846
  :	:
  (9999, 9974)	0.24395880605071846
  (9999, 9975)	0.04448529411764718
  (9999, 9976)	0.25513726146747095
  (9999, 9977)	0.18334778201498708
  (9999, 9978)	0.16063473073916856
  (9999, 9979)	0.2939263970139034
  (

In [202]:
# completeSets = actualSets + standardSets
# print("completeSets: ", len(completeSets))
# route_matrix_with_standard = create_binary_matrix(completeSets)
# # binary matrix where each row represents merchandise
# merch_matrix_with_standard = np.array([s[2] for s in completeSets])

# # compute Jaccard similarity for each matrix
# route_similarity_with_standard = jaccard_similarity_matrix(route_matrix_with_standard)
# merch_similarity_with_standard = similarity_matrix_merch(merch_matrix_with_standard).toarray()
# print("route_similarity_with_standard", route_similarity_with_standard.shape, route_similarity_with_standard[0])
# print("merch_similarity_with_standard", merch_similarity_with_standard.shape, merch_similarity_with_standard[0])
# completeSetsDistances = np.multiply(route_similarity_with_standard, merch_similarity_with_standard)
# print("completeSetsDistances", type(completeSetsDistances), completeSetsDistances.shape, completeSetsDistances[0])
# completeSetsDistances = np.nan_to_num(completeSetsDistances, nan=0)
# completeSetsDistances = 1 - completeSetsDistances
# completeSetsDistances = np.array(completeSetsDistances)
# print("completeSetsDistances", completeSetsDistances.shape, completeSetsDistances[0])

matricesActualAndStandard = np.vstack([route_matrix, route_matrix_standard])
print("matricesActualAndStandard", matricesActualAndStandard.shape, matricesActualAndStandard[0])

perplexity = 30 if len(matricesActualAndStandard) > 30 else len(matricesActualAndStandard) - 1
completeSetTSNE = TSNE(n_components=3, perplexity=perplexity, n_iter=1000, verbose=1).fit_transform(matricesActualAndStandard)


matricesActualAndStandard (10010, 150) [ 322.  887.  584.  446.  113.   23.  528.  202.  394. 1273.  342.  454.
  248.   43.  142.   69.  239.  482.  733.  102.  359.    0.  137.  723.
  879.  268.  192.   35.  225.  337.  556.  191.  128.  293.  331.  845.
  200.  468.  325.   65.  310.  253.  460.  168.  141.  634.  128.  234.
  586.  688.  521.  240.   55.   54.  243.  374. 1631.  130.  147.  419.
  362.  177.  239.  225.  357.  342.  164.  442.  504.  102.  221.  252.
   78.  218.   14.   87.  167. 1510.  396.   10.  955.  677.  487.  110.
  142.  138.  273.  219.   30.  315.  897.   25.  345.   18.  213.  190.
  258.  473.  528.   12.   15.  112.  727.  229.  114.  269.  129.   59.
  225.   72.   33.  502.  525.   89.   92.   74.  114.  204.   45.  172.
    2.  290.   94.   60.   91.   67.  510.  405.   94.  525.  294.  406.
  258.  115.  486.  688.   20.  165.  480.  512.  829.   18.   74.  534.
  107.  253.  322.  754.  177.   84.]
[t-SNE] Computing 91 nearest neighbors...
[t-SN

In [203]:


# reorder the labels to have colors matching the cluster results, using medoids which are closer to the standard vectors
medoidSets = [actualSets[i] for i in medoidsIndices]
print("medoidSets: ", len(medoidSets))
print("medoid indices: ", medoidsIndices.shape, medoidsIndices)

num_clusters_unique = unique_labels[unique_labels >= 0]
#print("num_clusters_unique: ", len(num_clusters_unique), num_clusters_unique)

assert len(medoidSets) == len(num_clusters_unique), "The number of medoids is not equal to the number of unique labels"   

if len(medoidSets) == 0:
    print("No clustroids found")
else:

    #route_matrix_standard, route_matrix_medoids = create_binary_matrices(standardSets, medoidSets)

    #simMatrixMixed = jaccard_similarity_two_matrices(route_matrix_medoids, route_matrix_standard)
    simMatrixMixed = route_similarity_standard_to_actual[medoidsIndices]
    print("simMatrixMixed: ", type(simMatrixMixed), simMatrixMixed.shape, simMatrixMixed[0])
    #print("route_matrix_standard: ", route_matrix_standard.shape)

    #print("route_matrix_actual: ", route_matrix_medoids.shape)

    #print("simMatrixMixed: ", simMatrixMixed.shape, simMatrixMixed[0])


    CAN_BE_ORDERED = False
    # get the closest standard vector for each medoid using simMatrixMixed

    # argmax = np.argmax(simMatrixMixed, axis=1) # get the index of the closest standard vector for each medoid
    # maxValues = np.max(simMatrixMixed, axis=1) # get the value of the closest standard vector for each medoid
    argmax = simMatrixMixed.argmax(axis=1) # get the index of the closest standard vector for each medoid
    maxValues = simMatrixMixed.max(axis=1).toarray() # get the value of the closest standard vector for each medoid
    #argmax = np.where(maxValues > 0.0, argmax, -1) # if the closest standard vector is not similar enough, set it to -1
    # argmax = np.asarray(argmax).ravel() #np.array([medoidsIndices[i] for i in argmax]) # map the index to the actual index in the completeSets
    # maxValues = np.asarray(maxValues).ravel()
    argmax = np.array(argmax).flatten()
    maxValues = maxValues.flatten()
    # maxValues = np.array(maxValues)
    #argmax = np.where(maxValues > 0.0, argmax, -1) # if the closest standard vector is not similar enough, set it to -1
    print("argmax: ", argmax.shape, type(argmax), argmax)
    print("maxval: ", maxValues.shape, type(maxValues), maxValues)
    

    if len(set(argmax)) == len(medoidsIndices): # if the argmax are all different, then the medoids can be reordered
        print("argmax is correct, can be reordered")
        CAN_BE_ORDERED = True
        # reorder medoidsIndices
        #print("argmax: ", argmax.shape, argmax)
        #print("medoidsIndices: ", medoidsIndices.shape, medoidsIndices)
        
        
        # Create an array of zeros with the same shape as the original array
        # result = np.zeros_like(medoidsIndices)
        # argsort = np.argsort(argmax)
        # print("argsort: ", argsort.shape, argsort)
        # # Fill the result array using the permutation indices
        # result[argsort] = medoidsIndices
        # medoidsIndicesReordered = result # reorder medoidsIndices
        # result = np.zeros_like(medoidsIndices)
        # unique_labels_filtered = unique_labels[unique_labels >= 0]
        # result[argsort] = unique_labels_filtered
        unique_labels_reordered = argmax   # 4, 2, 5, 10, ... -> 10, 4, 5, 2, ...
        
        #medoidsIndicesReordered = medoidsIndices[argmax] # reorder medoidsIndices
        #print("medoidsIndices: ", medoidsIndicesReordered.shape, medoidsIndicesReordered)
        #unique_labels = unique_labels[argmax]


    # for i in range(len(standardSets)):
    #     #distances = np.linalg.norm(medoids - standardVectors[i], axis=1)
    #     distancesCosine = []
    #     for j in range(len(medoidsIndices)):
    #         distancesCosine.append(cosine(actualSets[j], standardSets[i]))
    #     closest_medoid = np.argmin(distancesCosine)
        
    #     if closest_medoid not in unique_labels_reordered:
    #         unique_labels_reordered.append(closest_medoid)
    #     else:
    #         print("closest_medoid already in unique_labels_reordered")
    #         can_be_reordered = False
    #         break

    if not CAN_BE_ORDERED:
        #unique_labels_reordered = unique_labels

        #unique_labels = unique_labels_reordered
        #unique_labels = np.unique(labels_HDBSCAN)
        
        print("unique_labels: ", len(unique_labels), unique_labels)
    else:
        print("unique_labels_reordered: ", len(unique_labels_reordered), unique_labels_reordered)
        #unique_labels = unique_labels_reordered
        # compare if distances between clustroids and standard vectors are smaller than distances between standard vectors and other actual vectors

        #distancesClustroids = simMatrixMixed[np.arange(len(simMatrixMixed)), argmax]
        
    distancesClustroids = []
    distancesStandardVectors = []
    distancesStandardVectors2 = []
    # for i, clustroid in enumerate(medoidsIndices):
    #     distMedCluster = completeSetsDistances[:len(actualSets), :len(actualSets)][clustroid, labels_HDBSCAN == i]
    #     #print(labels_HDBSCAN)
    #     #print(labels_HDBSCAN == argmax[i])
    #     print("distMedCluster: ", np.sum(distMedCluster), len(distMedCluster), distMedCluster)
    #     # make distMedCluster a single list, no np.sum since they are inhomogeneous lists
    #     #print()
    #     #print(actualRefStandardIds)
    #     distStdCluster = completeSetsDistances[len(actualSets):, :len(actualSets)][i, np.where(np.array(actualRefStandardIds) == i)[0]]
    #     print("distStdCluster: ", np.sum(distStdCluster), len(distStdCluster), distStdCluster)
        
    #     distancesClustroids.append(np.mean(distMedCluster))
    #     distancesStandardVectors2.append(np.mean(distStdCluster))
    actualRefStandardIdsNumpy = np.array(actualRefStandardIds)
    for i, stdID in enumerate(standardRefIds):
        #distStdCluster = completeSetsDistances[len(actualSets):, :len(actualSets)][i, np.where(actualRefStandardIdsNumpy == stdID)[0]]
        #distSimCluster = completeSetsSimilarities[len(actualSets):, :len(actualSets)][i, np.where(actualRefStandardIdsNumpy == stdID)[0]]
        #print("stdID: ", stdID)
        #print("actualRefStandardIds", actualRefStandardIds)
        #print("where", len(np.where(actualRefStandardIdsNumpy == stdID)[0]), np.where(actualRefStandardIdsNumpy == stdID)[0])
        distSimCluster = route_similarity_standard_to_actual[np.where(actualRefStandardIdsNumpy == stdID)[0], i]
        distStdCluster = 1 - distSimCluster.toarray()
        distStdCluster = distStdCluster[distStdCluster != 1]
        print("distSimCluster: ", np.mean(distStdCluster), len(distStdCluster), distStdCluster)
        meanDist = np.mean(distStdCluster)
        if np.isnan(meanDist):
            distancesStandardVectors.append(1)
        else:
            distancesStandardVectors.append(np.mean(distStdCluster))
    
    
    mean_similarity_clustroids = np.mean(cluster_mean_distances)
    std_dev_similarity_clustroids = np.std(cluster_mean_distances)
    
    mean_similarity_standard_vectors = np.mean(distancesStandardVectors)
    std_dev_similarity_standard_vectors = np.std(distancesStandardVectors)
    
    cv_clustroids = std_dev_similarity_clustroids / mean_similarity_clustroids
    cv_standard_vectors = std_dev_similarity_standard_vectors / mean_similarity_standard_vectors
    
    # print in green if the improvement is positive, in red if it is negative
    print("\n\033[94mMean similarity from vectors of the same cluster to:")
    print("         clustroids: ", mean_similarity_clustroids)
    #print("         first     : ", np.mean(distancesClustroids))
    print("   standard vectors: ", mean_similarity_standard_vectors)
    #print("first                ", np.mean(distancesStandardVectors2))
    
    
    print("\nStd dev similarity from vectors of the same cluster to:")
    print("         clustroids: ", std_dev_similarity_clustroids)
    print("   standard vectors: ", std_dev_similarity_standard_vectors)
    
    print("\nCoefficient of variation from vectors of the same cluster to:")
    print("         clustroids: ", cv_clustroids)
    print("   standard vectors: ", cv_standard_vectors)
    print("\033[0m")
    
    #ratio = np.mean(distancesClustroids) / np.mean(distancesStandardVectors)
    ratio = mean_similarity_clustroids / mean_similarity_standard_vectors
    percentage = (1 - ratio) * 100
    
    print("\033[93mMean:\033[0m")
    if percentage > 0:
        # print in green if the improvement is positive, in red if it is negative
        print("   Improvement: \033[92m{:.2f}% \033[0m".format(percentage))
    else:
        print("   Decline: \033[91m{:.2f}% \033[0m".format(-percentage))
        
    
    ratio = std_dev_similarity_clustroids / std_dev_similarity_standard_vectors
    percentage = (1 - ratio) * 100
    print("\033[93m\nStd dev:\033[0m")
    if percentage > 0:
        # print in green if the improvement is positive, in red if it is negative
        print("   Improvement: \033[92m{:.2f}% \033[0m".format(percentage))
    else:
        print("   Decline: \033[91m{:.2f}% \033[0m".format(-percentage))
        
    ratio = cv_clustroids / cv_standard_vectors
    percentage = (1 - ratio) * 100
    print("\033[93m\nCoefficient of Variation:\033[0m")
    if percentage > 0:
        # print in green if the improvement is positive, in red if it is negative
        print("   Improvement: \033[92m{:.2f}% \033[0m".format(percentage))
    else:
        print("   Decline: \033[91m{:.2f}% \033[0m".format(-percentage))
        





medoidSets:  10
medoid indices:  (10,) [1003 8562 2014 7092 9791 3067 4461 6668  255 5007]
simMatrixMixed:  <class 'scipy.sparse._csr.csr_matrix'> (10, 10)   (0, 1)	1.0
argmax:  (10,) <class 'numpy.ndarray'> [1 8 2 7 9 3 4 6 0 5]
maxval:  (10,) <class 'numpy.ndarray'> [1.         0.9980315  0.9853236  0.98643362 0.99248215 0.98576939
 0.99262405 0.98106547 0.98177763 0.98506489]
argmax is correct, can be reordered
unique_labels_reordered:  10 [1 8 2 7 9 3 4 6 0 5]
distSimCluster:  0.1795278024219779 1000 [0.22009717 0.27819519 0.18273395 0.29683132 0.12260835 0.27191856
 0.2077543  0.19480691 0.25817923 0.1685508  0.17888617 0.19469823
 0.25574443 0.18647629 0.27967369 0.30226336 0.03973704 0.25303185
 0.2717694  0.16812818 0.27644336 0.15199743 0.31565741 0.21929064
 0.24036385 0.31077008 0.24664769 0.22577938 0.26411102 0.07266508
 0.11533092 0.19958917 0.05139377 0.20685225 0.1173293  0.1235789
 0.10485547 0.16350118 0.27337921 0.21033477 0.22882086 0.24746483
 0.14011415 0.09245424

In [204]:
max_len = max(len(standardSets), len(num_clusters_unique))
colors = plt.cm.jet(np.linspace(0, 1, max_len))
# print("colors", colors.shape, colors)

# print("medoid labels", [labels_HDBSCAN[i] for i in medoidsIndices])
# print("argmax", argmax)

if len(medoidSets) > 0 and CAN_BE_ORDERED:
    print("medoids added to plots and reordered")
    color_map = dict(zip(range(max_len), colors[unique_labels_reordered]))
    #rint("color_map", color_map)
else:
    color_map = dict(zip(range(max_len), colors))   # 0=red, 1=blue, 2=green, 3=yellow, 4=purple, 5=lightblue, 6=lightgreen, 7=lightyellow, 8=lightpurple
    
marker_colors = [color_map[label] if label > -1 else np.array([0,0,0,1]) for label in labels_HDBSCAN]
marker_colors_medoids = [color_map[label] if label > -1 else np.array([0,0,0,1]) for label in labels_HDBSCAN[medoidsIndices]]
#marker_colors_medoids = [color_map[label] if label > -1 else np.array([0,0,0,1]) for label in unique_labels]


# Create a trace for each type (centroids data)
traceStandard = go.Scatter3d(
    x=completeSetTSNE[len(actualSets):,0],
    y=completeSetTSNE[len(actualSets):,1],
    z=completeSetTSNE[len(actualSets):,2],
    mode='markers',
    marker=dict(
        size=7,
        color=colors,                # set color to an array/list of desired values
        opacity=1,
        symbol='diamond'
    )
)

if len(medoidSets) > 0:
    medoidsElements = completeSetTSNE[medoidsIndices] # medoidsIndices = [921, 123]
    traceMedoids = go.Scatter3d(
        x=medoidsElements[:,0],
        y=medoidsElements[:,1],
        z=medoidsElements[:,2],
        mode='markers',
        marker=dict(
            size=7,
            color=marker_colors_medoids,                # set color to an array/list of desired values
            opacity=1,
            symbol='cross'
        )
    )

# Create a trace for each type (centroids data)
traceActual = go.Scatter3d(
    x=completeSetTSNE[:len(actualSets),0],
    y=completeSetTSNE[:len(actualSets),1],
    z=completeSetTSNE[:len(actualSets),2],
    mode='markers',
    marker=dict(
        size=7,
        color=marker_colors,                # set color to an array/list of desired values
        opacity=0.1,
        symbol='circle'
    )
)

# Plot
if len(medoidSets) > 0:
    data = [traceStandard, traceActual, traceMedoids]
else:
    data = [traceStandard, traceActual]

layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)

fig = go.Figure(data=data, layout=layout)
fig.show()

medoids added to plots and reordered


## Ground Truth

In [205]:

colors_true = plt.cm.jet(np.linspace(0, 1, len(standardSets)))
color_map_true = dict(zip(range(max_len), colors))   # 0=red, 1=blue, 2=green, 3=yellow, 4=purple, 5=lightblue, 6=lightgreen, 7=lightyellow, 8=lightpurple
# marker colors for each point with the same color as the cluster it belongs to in the original data
marker_colors_true = [color_map_true[label] for label in actualRefStandardIds]


# Create a trace for each type (centroids data)
traceStandard_true = go.Scatter3d(
    x=completeSetTSNE[len(actualSets):,0],
    y=completeSetTSNE[len(actualSets):,1],
    z=completeSetTSNE[len(actualSets):,2],
    mode='markers',
    marker=dict(
        size=7,
        color=colors_true,                # set color to an array/list of desired values
        opacity=1,
        symbol='diamond'
    )
)

# Create a trace for each type (centroids data)
traceActual_true = go.Scatter3d(
    x=completeSetTSNE[:len(actualSets),0],
    y=completeSetTSNE[:len(actualSets),1],
    z=completeSetTSNE[:len(actualSets),2],
    mode='markers',
    marker=dict(
        size=7,
        color=marker_colors_true,                # set color to an array/list of desired values
        opacity=0.1,
        symbol='circle'
    )
)

# Plot
data = [traceStandard_true, traceActual_true]

layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)

fig = go.Figure(data=data, layout=layout)
fig.show()

## Output recommended standard routes

In [206]:
# Save the medoids to a file
with open(os.path.join("results", 'recStandard.json'), 'w', encoding="utf-8") as f:
    recStandard = []
    for i, index in enumerate(medoidsIndices):
        recRoute = {"id": "s" + str(i)}
        recRoute["route"] = dfActual.iloc[actualSets[map_indices_back[index]][0]]["route"]
        recStandard.append(recRoute)
    json.dump(recStandard, f, ensure_ascii=False, indent=2)

: 

## t-SNE of centroids of clusters and actual routes

## t-SNE of clustroids, standard, and actual routes