In [1]:
import os
HOME = os.getcwd()
print('HOME: ',HOME)

import time
import math
import json
import random
import pandas as pd
import sys
import lxml
import sklearn as sk
import numpy as np

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import HDBSCAN, DBSCAN
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import plotly.graph_objects as go

from scipy.sparse import csr_matrix, issparse, lil_matrix, coo_matrix

from tqdm import tqdm
from pandarallel import pandarallel

from numba import njit, prange, jit
from numba_progress import ProgressBar

HOME:  /Users/ericsuardi/Desktop/DataMiningProject23-24


In [2]:
# STANDARD_FILE = 'standard_small_order_printClustroids.json'
# ACTUAL_FILE = 'actual_small_order_printClustroids.json'

# STANDARD_FILE = 'standard_small.json'
# ACTUAL_FILE = 'actual_small.json'

STANDARD_FILE = 'standard_medium_new.json'
ACTUAL_FILE = 'actual_medium_new.json'

# STANDARD_FILE = 'standard_big_new_3.json'
# ACTUAL_FILE = 'actual_big_new_3.json'

K_SHINGLES = 3

In [3]:
# load standard and actual data
print("\nReading standard data...")
with open(os.path.join('data',STANDARD_FILE)) as f:
    standard = json.load(f)

print("\nReading actual data...")
with open(os.path.join('data', ACTUAL_FILE)) as f:
    actual = json.load(f)

# load the data into a dataframe
print("\nCreating standard dataframe...")
dfStandard = pd.DataFrame(standard)
print("\nCreating actual dataframe...")
dfActual = pd.DataFrame(actual)

# print head of the dataframes
print(dfStandard.head())
print(dfActual.head())

# get the unique cities and items of the standard data
cities = []
items = []
longestRoute = 0
shortestRoute = np.inf
maxItemQuantity = 0

standardRefIds = []
for index, s in dfStandard.iterrows():
    #print(s)
    idS = s['id']
    route = s['route']
    standardRefIds.append(int(idS[1]))
    for trip in route:
        cities.append(trip['from']) 
        items.extend(trip['merchandise'].keys())
        maxItemQuantity = max(maxItemQuantity, max(trip['merchandise'].values()))
    if len(route) > 0:
        cities.append(route[-1]['to'])
        
    if len(route) > longestRoute:
        longestRoute = len(route)
        
    if len(route) < shortestRoute:
        shortestRoute = len(route)
print("\nFinished preparing standard data")

actualRefStandardIds = []
for index, s in dfActual.iterrows():
    #print(s)
    idS = s['id']
    route = s['route']
    idStandard = s['sroute']
    actualRefStandardIds.append(int(idStandard[1]))
    for trip in route:
        cities.append(trip['from'])
        items.extend(trip['merchandise'].keys())
        maxItemQuantity = max(maxItemQuantity, max(trip['merchandise'].values()))
        
    if len(route) > 0:
        cities.append(route[-1]['to'])
        
    if len(route) > longestRoute:
        longestRoute = len(route)
    
    if len(route) < shortestRoute:
        shortestRoute = len(route)
print("\nFinished preparing actual data")


# find the unique cities and items
uniqueCities = sorted(list(set(cities)))
#uniqueCities.insert(0, 'NULL')          # add NULL city, for padding vectors with different lengths (trips in routes)
uniqueItems = sorted(list(set(items)))

print("\nSorted cities and items")

if shortestRoute < 2:
    K_SHINGLES = 2

threeShingles = []

for i, c1 in enumerate(uniqueCities):
    for j, c2 in enumerate(uniqueCities):
        if i == j:
            continue
        for k, c3 in enumerate(uniqueCities):
            if j == k or i == k:
                continue
            threeShingles.append([c1, c2, c3])
            
permutations = math.perm(len(uniqueCities), K_SHINGLES)

print("\nComputed all possible three-shingles")

print("\nUnique cities: ", uniqueCities)
print("Unique items: ", uniqueItems)

print("\nNumber of cities: ", len(uniqueCities))
print("Number of items: ", len(uniqueItems))

print("\nLongest route: ", longestRoute)
print("Shortest route: ", shortestRoute)

print("\nMax item quantity: ", maxItemQuantity)

print("\nNumber of three-shingles: ", len(threeShingles))

print(f"\n{K_SHINGLES}-shingles: ", math.perm(len(uniqueCities), K_SHINGLES))
print(f"{K_SHINGLES}-shingles: ", math.comb(len(uniqueCities), K_SHINGLES))

print(f"\n\033[92mK-Shingles used: {K_SHINGLES} \033[0m")



Reading standard data...

Reading actual data...

Creating standard dataframe...

Creating actual dataframe...
   id                                              route
0  s0  [{'from': 'Teramo', 'to': 'Varese', 'merchandi...
1  s1  [{'from': 'Trapani', 'to': 'Carrara', 'merchan...
2  s2  [{'from': 'Sanremo', 'to': 'Carrara', 'merchan...
3  s3  [{'from': 'Tivoli', 'to': 'Legnano', 'merchand...
4  s4  [{'from': 'Carrara', 'to': 'Bagheria', 'mercha...
   id driver sroute                                              route
0  a0    E_3     s0  [{'from': 'Teramo', 'to': 'Varese', 'merchandi...
1  a1    M_2     s0  [{'from': 'Teramo', 'to': 'Varese', 'merchandi...
2  a2    X_1     s0  [{'from': 'Teramo', 'to': 'Varese', 'merchandi...
3  a3    C_0     s0  [{'from': 'Varese', 'to': 'Sanremo', 'merchand...
4  a4    Z_2     s0  [{'from': 'Teramo', 'to': 'Varese', 'merchandi...

Finished preparing standard data

Finished preparing actual data

Sorted cities and items

Computed all possible three-

In [4]:
def hashShingles(shingles, n):
    # hash shingles
    string = "" 
    for shingle in shingles:
        string += str(shingle) + "," # [45, 4, 8] -> "45,4,8,"
    
    return hash(string) #% n

def createShingles(df, k, uniqueCities, uniqueItems, longestRoute, maxItemQuantity, permutations):
    # create shingles for each route
    shingles = []
    for index, s in df.iterrows():
        idS = s['id']
        route = s['route']
        shingle = [index]
        citiesInRoute = [] # napoli roma milano teramo bergamo [10,4,5,48,12] [10,4,5] [4,5,48] [5,48,12]
        merchandiseInRoute = np.zeros(len(uniqueItems))
        for trip in route:
            citiesInRoute.append(uniqueCities.index(trip['from']))
            #merchandiseInRoute += np.array(list(trip['merchandise'].values()))
            for item, n in trip['merchandise'].items():
                merchandiseInRoute[uniqueItems.index(item)] += n
        if len(route) > 0:
            citiesInRoute.append(uniqueCities.index(route[-1]['to']))
        if len(route) > 0:
            merchandiseInRoute = merchandiseInRoute / (maxItemQuantity*len(route))
        
        hashedShingles = []
        for i in range(len(citiesInRoute)-k+1):
            # Q: is it correct to set the modulo for the hash function to the number of permutations?
            # A: yes, because we want to have a unique hash for each shingle
            # Q: would it be better to use a different hash function?
            # A: yes, because the modulo function is not a good hash function
            hashedShingles.append(hashShingles(citiesInRoute[i:i+k], permutations) )
        
        shingle.append(np.array(hashedShingles))
        
        shingle.append(merchandiseInRoute) # quantity hot encoding
        
        shingles.append(shingle)
        
    return shingles # [ index, [shingles], [merchandise] ]

def create_shingles(s, k, uniqueCities, uniqueItems, longestRoute, maxItemQuantity, permutations):
    idS = s['id']
    route = s['route']
    shingle = [s.name]
    citiesInRoute = [] 
    merchandiseInRoute = np.zeros(len(uniqueItems))
    for trip in route:
        citiesInRoute.append(uniqueCities.index(trip['from']))
        for item, n in trip['merchandise'].items():
            merchandiseInRoute[uniqueItems.index(item)] += n
    if len(route) > 0:
        citiesInRoute.append(uniqueCities.index(route[-1]['to']))
    if len(route) > 0:
        merchandiseInRoute = merchandiseInRoute / (maxItemQuantity*len(route))
    
    hashedShingles = []
    for i in range(len(citiesInRoute)-k+1):
        hashedShingles.append(hashShingles(citiesInRoute[i:i+k], permutations))
    
    shingle.append(np.array(hashedShingles))
    shingle.append(merchandiseInRoute)
    
    return shingle

In [5]:
#standardSets = createShingles(dfStandard, k=K_SHINGLES, uniqueCities=uniqueCities, uniqueItems=uniqueItems, longestRoute=longestRoute, maxItemQuantity=maxItemQuantity, permutations=permutations)
#actualSets = createShingles(dfActual, k=K_SHINGLES, uniqueCities=uniqueCities, uniqueItems=uniqueItems, longestRoute=longestRoute, maxItemQuantity=maxItemQuantity, permutations=permutations)
pandarallel.initialize(progress_bar=True)
standardSets = dfStandard.parallel_apply(lambda s: create_shingles(s, K_SHINGLES, uniqueCities, uniqueItems, longestRoute, maxItemQuantity, permutations), axis=1)
standardSets = standardSets.tolist()
actualSets = dfActual.parallel_apply(lambda s: create_shingles(s, K_SHINGLES, uniqueCities, uniqueItems, longestRoute, maxItemQuantity, permutations), axis=1)
actualSets = actualSets.tolist()

print("\nstandardSets", len(standardSets), "shape first element", standardSets[0][1].shape, standardSets[0])
print("\nactualSets", len(actualSets),  "shape first element", standardSets[0][1].shape, actualSets[0])

print("\nstandardSets:", len(standardSets))
print("actualSets:", len(actualSets))

assert len(standardSets[0]) == 3, "The length of the standard set is not equal to 3 (index, shingles, merchandise)"
assert len(standardSets[0][2]) == len(uniqueItems), "The length of the merchandise vector is not equal to the number of unique items"

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1), Label(value='0 / 1'))), HBox(c…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1000), Label(value='0 / 1000'))), …


standardSets 10 shape first element (24,) [0, array([-9133844285354990545,  2007260701692332267,   473299742115883028,
       -9049105497237273710, -2906541211342000488,   549967497434303552,
         -62482470410976084,  7217578552460934769, -3153700959833826289,
       -6401398808551733404, -7232661585650623993,  3443255954191577051,
       -8170518462156080237, -7874806150623613130,  2669509377566525361,
        7620645728264910417,  4507047029219194625, -8354696303991687970,
        8171294021517294207,  1205663920577848515,  2478535583236255693,
        6099714892374476518,  -830195267355793480,   104569130869102531]), array([0.25541667, 0.29333333, 0.23166667, 0.28458333, 0.17416667,
       0.26916667, 0.34125   , 0.25833333, 0.32125   , 0.28291667])]

actualSets 10000 shape first element (24,) [0, array([-9133844285354990545,  2007260701692332267,   473299742115883028,
       -9049105497237273710, -2906541211342000488, -7422322349753820843,
       -8192202672046642217,   -62482

## Clustering

In [6]:


def jaccard_similarity_matrix(matrix):
    if 1.0 - np.count_nonzero(matrix) / matrix.size > 0.5:
        print("matrix jaccard is sparse")
        
        matrixCSR = csr_matrix(matrix)
        intersection = np.dot(matrixCSR, matrixCSR.T)
        intersection = intersection.todense()
        print("intersection", intersection.shape, type(intersection))
        #print("intersection", intersection.toarray(), type(intersection.toarray()))
        row_sums = matrix.sum(axis=1)
        print("row_sums", row_sums.shape)
        union = row_sums[:, None] + row_sums - intersection
        print("union", union.shape)
        union = np.where(union == 0, 1, union)  # avoid division by zero
        print("union", union.shape)
        jaccard_similarity = intersection / union
        print("jaccard_similarity", jaccard_similarity.shape, type(jaccard_similarity))
    else:
        print("matrix jaccard is not sparse")
        
        intersection = np.dot(matrix, matrix.T)
        print("intersection", intersection.shape)
        #print("intersection", intersection.toarray(), type(intersection.toarray()))
        row_sums = matrix.sum(axis=1)
        print("row_sums", row_sums.shape)
        union = row_sums[:, None] + row_sums - intersection
        print("union", union.shape)
        union = np.where(union == 0, 1, union)
        jaccard_similarity = intersection / union
        print("jaccard_similarity", jaccard_similarity.shape)
    print("jaccard_similarity contains nan", np.isnan(jaccard_similarity).any())   
    return jaccard_similarity

def jaccard_similarity_minhash(matrix):
    similarity_matrix = np.full((matrix.shape[0], matrix.shape[0]), np.inf)
    num_permutations = matrix.shape[1]
    for i in tqdm(range(matrix.shape[0])):
        for j in range(i + 1, matrix.shape[0]):
            similarity_matrix[i, j] = np.count_nonzero(matrix[i, :] == matrix[j, :]) / num_permutations
            similarity_matrix[j, i] = similarity_matrix[i, j]
                    
    # Create a full symmetric matrix from the upper triangular part
    #similarity_matrix = np.triu(similarity_matrix) + np.triu(similarity_matrix, 1).T
    np.fill_diagonal(similarity_matrix, 1)
    print("similarity_matrix", similarity_matrix.shape, similarity_matrix[0])
    return similarity_matrix

# def minhash(matrix, permutations):
#     minhash_matrix = np.full((matrix.shape[0], permutations), np.inf)
#     coeff_range = permutations * 100
#     index_range = permutations * 100

#     # Generate the hash functions
#     hash_functions = [lambda x, a=a, b=b: (a * x + b) % matrix.shape[1] for a, b in zip(random.sample(range(coeff_range), permutations), random.sample(range(index_range), permutations))]
    
#     for i in tqdm(range(matrix.shape[0]), desc="minhashing", miniters=1000):
#         indices = np.where(matrix[i] == 1)[0]
#         for k in range(permutations):
#             hashed_indices = np.array([hash_functions[k](j) for j in indices])
#             #print("hashed_indices", hashed_indices.shape, hashed_indices)
#             minhash_matrix[i, k] = np.min(hashed_indices, initial=matrix.shape[1]+1)
    
#     return minhash_matrix

def hash_function_hash_code(num_of_hashes,n_col,next_prime):
  
    #coeffA = np.array(pick_random_coefficients(num_of_hashes,max_column_length)).reshape((num_of_hashes,1))
    #coeffB = np.array(pick_random_coefficients(num_of_hashes,max_column_length)).reshape((num_of_hashes,1))

    coeffA = np.array(random.sample(range(0,n_col*100),num_of_hashes)).reshape((num_of_hashes,1))
    coeffB = np.array(random.sample(range(0,n_col*100),num_of_hashes)).reshape((num_of_hashes,1))

    x = np.arange(n_col).reshape((1,n_col))

    hash_code = (np.matmul(coeffA,x) + coeffB) % next_prime # (num_of_hashes,n_col) so how each column index is permuted

    return hash_code

def minhash(u,num_of_hashes):
    (n_row, n_col) = u.shape
    next_prime = n_col
    hash_code = hash_function_hash_code(num_of_hashes,n_col,next_prime)

    signature_array = np.empty(shape = (n_row,num_of_hashes))

    #t2 = time.time()

    for row in tqdm(range(n_row), desc="minhashing"):
        #print("row", row)
        ones_index = np.where(u[row,:]==1)[0]
        #if len(ones_index) == 0:
        signature_array[row,:] = np.zeros((1,num_of_hashes))
            #continue
        corresponding_hashes = hash_code[:,ones_index]
        #print("ones_index", ones_index.shape, ones_index)
        #print("corresponding_hashes", corresponding_hashes.shape, corresponding_hashes)
        row_signature = np.amin(corresponding_hashes,axis=1).reshape((1,num_of_hashes))

        signature_array[row,:] = row_signature

    return signature_array

def find_band_and_row_values(columns, threshold):
    previous_b = 1
    previous_r = columns
    for b in range(1, columns + 1):
        if columns % b == 0:
            r = columns // b
            if (1 / b) ** (1 / r)  <= threshold:
                if np.abs((1 / previous_b) ** (1 / previous_r) - threshold) < np.abs((1 / b) ** (1 / r) - threshold):
                    return previous_b, previous_r
                return b, r
    return columns, 1

def lsh(minhash_matrix, thresh_user=0.2):
    # Initialize the signature matrix
    columns = minhash_matrix.shape[1]
    
    # Generate the hash functions
   # hash_functions = [lambda x, a=a, b=b: (a * x + b) % minhash_matrix.shape[1] for a, b in zip(random.sample(range(1000), bands), random.sample(range(1000), bands))]
    hash_function = lambda x: hash(",".join([str(x[i]) for i in range(len(x))]))
    
    # b = bands
    # r = columns//bands
    b, r = find_band_and_row_values(columns, thresh_user)
    # If columns is not divisible by bands
    if columns % b != 0:
        # Find the closest number that makes it divisible
        while columns % b != 0:
            b -= 1
        r = columns // b
    #bands = b
        
    print("final bands", b)
    signature_matrix = np.full((minhash_matrix.shape[0], b), np.inf)
    
    # if threshold is 0.8,
    threshold = (1 / b) ** (1 / r) 
    print("lsh threshold", threshold)
    
    # For each band
    print("Computing hash values of bands...")
    hash_values = np.apply_along_axis(lambda x: hash_function(x) % minhash_matrix.shape[0], 1, minhash_matrix.reshape(-1, r))
    # Reshape the hash values to match the signature matrix
    hash_values = hash_values.reshape(minhash_matrix.shape[0], b)
    # Update the signature matrix
    signature_matrix = hash_values
            
    # find candidate pairs
    print("Finding candidate pairs...")
    candidate_pairs = []
    for i in tqdm(range(signature_matrix.shape[0])):
        # Compute the similarity of the current row with all following rows
        similarities = np.sum(signature_matrix[i+1:, :] == signature_matrix[i, :], axis=1) / b
        # Find the indices of the rows that have a similarity greater than or equal to the threshold
        indices = np.nonzero(similarities >= threshold)[0]
        # Add the pairs to the candidate pairs
        candidate_pairs.extend((i, i+1+index) for index in indices)
    
    return np.array(candidate_pairs)

def lsh_two_matrices(minhash_matrix1, minhash_matrix2, thresh_user=0.2):
    # Initialize the signature matrix
    columns = minhash_matrix1.shape[1]
    
    # Generate the hash functions
    # hash_functions = [lambda x, a=a, b=b: (a * x + b) % minhash_matrix.shape[1] for a, b in zip(random.sample(range(1000), bands), random.sample(range(1000), bands))]
    hash_function = lambda x: hash(",".join([str(x[i]) for i in range(len(x))]))
    
    # b = bands
    # r = columns//bands
    b, r = find_band_and_row_values(columns, thresh_user)
    # If columns is not divisible by bands
    if columns % b != 0:
        # Find the closest number that makes it divisible
        while columns % b != 0:
            b -= 1
        r = columns // b
    #bands = b
        
    print("final bands", b)
    signature_matrix1 = np.full((minhash_matrix1.shape[0], b), np.inf)
    signature_matrix2 = np.full((minhash_matrix2.shape[0], b), np.inf)
    
    # if threshold is 0.8,
    threshold = (1 / b) ** (1 / r) 
    print("lsh threshold", threshold)
    
    # For each band
    print("Computing hash values of bands...")
    hash_values1 = np.apply_along_axis(lambda x: hash_function(x) % minhash_matrix1.shape[0], 1, minhash_matrix1.reshape(-1, r))
    hash_values2 = np.apply_along_axis(lambda x: hash_function(x) % minhash_matrix2.shape[0], 1, minhash_matrix2.reshape(-1, r))
    # Reshape the hash values to match the signature matrix
    hash_values1 = hash_values1.reshape(minhash_matrix1.shape[0], b)
    hash_values2 = hash_values2.reshape(minhash_matrix2.shape[0], b)
    # Update the signature matrix
    signature_matrix1 = hash_values1
    signature_matrix2 = hash_values2
            
    # find candidate pairs
    print("Finding candidate pairs...")
    candidate_pairs = np.empty((minhash_matrix1.shape[0], 2))
    for i in tqdm(range(signature_matrix1.shape[0])):
        # Compute the similarity of the current row with all following rows
        similarities = np.sum(signature_matrix2 == signature_matrix1[i, :], axis=1) / b
        # Find the indices of the rows that have a similarity greater than or equal to the threshold
        #indices = np.nonzero(similarities >= threshold)[0]
        indexMax = np.argmax(similarities)
        simMax = similarities[indexMax]
        # Add the pairs to the candidate pairs
        #candidate_pairs.extend((i, i+1+index) for index in indices)
        candidate_pairs[i] = [indexMax, simMax]
        
    return candidate_pairs
        

# def jaccard_similarity_minhash_lsh(matrix, bands=10):
#     #similarity_matrix = csr_matrix((matrix.shape[0], matrix.shape[0]), dtype=np.float64)
#     similarity_matrix = lil_matrix((matrix.shape[0], matrix.shape[0]), dtype=np.float64)
#     pairs = lsh(matrix, bands=bands, columns=matrix.shape[1])
#     uniqueRows = np.unique([i for i, j in pairs] + [j for i, j in pairs])
#     uniqueRowsSet = set([i for i, j in pairs] + [j for i, j in pairs])
#     print("uniqueRows numpy", len(uniqueRows))
#     print("uniqueRows set", len(uniqueRowsSet))
#     print("num of pairs", len(pairs))
#     print("num unique i", len(set([i for i, j in pairs])))
#     print("num unique j", len(set([j for i, j in pairs])))
#     print("num unique rows", len(uniqueRows))
#     map_i = {i: index for i, index in enumerate(uniqueRows)}
    
#     subset_matrix = matrix[list(uniqueRows)]
    
#     print("Computing jaccard similarity on subset matrix...")
#     print("subset matrix", subset_matrix.shape)
#     subset1 = subset_matrix[:, None, :]
#     subset2 = subset_matrix[None, :, :]
#     min_matrix = np.minimum(subset1, subset2) # (10, 100) -> (10, 1, 100) -> (1, 10, 100) -> (10, 10, 100)
#     sum_min_matrix = np.sum(min_matrix, axis=-1)
#     print("sum_min_matrix", sum_min_matrix.shape)
    
#     max_matrix = np.maximum(subset1, subset2)
#     sum_max_matrix = np.sum(max_matrix, axis=-1)
#     print("sum_max_matrix", sum_max_matrix.shape)
    
#     jaccard_similarity_matrix =  sum_min_matrix / sum_max_matrix
    
#     # map back to original matrix
#     print("Mapping back to original matrix...")
#     # for i, j in tqdm(pairs):
#     #     similarity_matrix[map_i[i], map_i[j]] = jaccard_similarity_matrix[i, j]
#     #     similarity_matrix[map_i[j], map_i[i]] = similarity_matrix[map_i[i], map_i[j]]
        
#     for i in tqdm(range(jaccard_similarity_matrix.shape[0])):
#         for j in range(i + 1, jaccard_similarity_matrix.shape[0]):
#             similarity_matrix[map_i[i], map_i[j]] = jaccard_similarity_matrix[i, j]
#             similarity_matrix[map_i[j], map_i[i]] = similarity_matrix[map_i[i], map_i[j]]
    
    
#     # for i, j in tqdm(pairs, desc="lsh sim"):
#     #     similarity_matrix[i, j] = np.count_nonzero(matrix[i, :] == matrix[j, :]) / matrix.shape[1]
#     #     similarity_matrix[j, i] = similarity_matrix[i, j]
    
#     similarity_matrix.setdiag(1)
    
#     return similarity_matrix

@njit(cache=True,)
def replicate_row(matrix, i):
    result = np.empty((matrix.shape[0], matrix.shape[1]))
    for j in range(matrix.shape[0]):
        result[j] = matrix[i]
    return result

@njit(cache=True, nogil=True, parallel=True)
def compute_subset_similarity_matrix(subset_matrix, progress_proxy):
    n = subset_matrix.shape[0]
    m = subset_matrix.shape[1]
    subset_matrix = subset_matrix.astype(np.int64)
    subset_similarity_matrix = np.zeros((n, n))
    subset2 = subset_matrix
    for i in prange(n):
        subset1 = subset_matrix[i].reshape(1, -1) #replicate_row(subset_matrix, i)    
        min_matrix = np.minimum(subset1, subset2)
        sum_min_matrix = np.sum(min_matrix, axis=-1)
        
        max_matrix = np.maximum(subset1, subset2)
        sum_max_matrix = np.sum(max_matrix, axis=-1)
        
        subset_similarity_matrix[i] = 1 - (np.divide(sum_min_matrix, sum_max_matrix)).T
        progress_proxy.update(1)
    return subset_similarity_matrix

@jit(cache=True, nogil=True, parallel=True)
def compute_subset_similarity_matrix_only_pairs(matrix, matrixMerch, pairs, progress_proxy):
    n = matrix.shape[0]
    m = matrix.shape[1]
    similarity_pairs = np.zeros(len(pairs))
    for i in prange(len(pairs)):
        subset1 = matrix[pairs[i][0]] #replicate_row(subset_matrix, i)  
        subset2 = matrix[pairs[i][1]]
        subset1Merch = matrixMerch[pairs[i][0]]
        subset2Merch = matrixMerch[pairs[i][1]]
        
        min_matrix = np.minimum(subset1, subset2)
        sum_min_matrix = np.sum(min_matrix, axis=-1)
        
        max_matrix = np.maximum(subset1, subset2)
        sum_max_matrix = np.sum(max_matrix, axis=-1)
        
        distMerch = 1 - np.abs(np.dot(subset1Merch, subset2Merch) / (np.linalg.norm(subset1Merch) * np.linalg.norm(subset2Merch)))
        
        
        
        similarity_pairs[i] = min((1 - (sum_min_matrix / sum_max_matrix)) * distMerch, 1.0)
        if similarity_pairs[i] == 2:
            print("similarity_pairs[i]", similarity_pairs[i])
            print("dist merch", distMerch, "cosine ", np.abs(np.dot(subset1Merch, subset2Merch) / (np.linalg.norm(subset1Merch) * np.linalg.norm(subset2Merch))))
            print("dist routes", (1 - (sum_min_matrix / sum_max_matrix)))
        progress_proxy.update(1)
    return similarity_pairs

@njit(cache=True, nogil=True, parallel=True)
def compute_subset_similarity_matrices(subset_matrix1, subset_matrix2, progress_proxy):
    n = subset_matrix1.shape[0]
    m = subset_matrix2.shape[0]
    #subset_matrix = subset_matrix.astype(np.int64)
    subset_similarity_matrix = np.zeros((n, m))
    subset2 = subset_matrix2
    for i in prange(n):
        subset1 = subset_matrix1[i].reshape(1, -1) #replicate_row(subset_matrix, i)    
        min_matrix = np.minimum(subset1, subset2)
        sum_min_matrix = np.sum(min_matrix, axis=-1)
        
        max_matrix = np.maximum(subset1, subset2)
        sum_max_matrix = np.sum(max_matrix, axis=-1)
        
        subset_similarity_matrix[i] = 1 - (np.divide(sum_min_matrix, sum_max_matrix)).T
        progress_proxy.update(1)
    return subset_similarity_matrix

def jaccard_similarity_minhash_lsh_route_merch(matrix, matrixMerch, thresh_user=0.2):
    #similarity_matrix = csr_matrix((matrix.shape[0], matrix.shape[0]), dtype=np.float64)
    #similarity_matrix = lil_matrix((matrix.shape[0], matrix.shape[0]), dtype=np.float64)
    pairs = lsh(matrix, thresh_user=thresh_user)
    #uniqueRows = np.unique([i for i, j in pairs] + [j for i, j in pairs])
    uniqueRowsSet = set([i for i, j in pairs] + [j for i, j in pairs]) # (1,2) (1,4) (1,5)
    neverSeen = set([i for i in range(matrix.shape[0])]) - uniqueRowsSet
    print("neverSeen", neverSeen)
    #print("uniqueRows numpy", len(uniqueRows))
    print("num of subset of rows to check similarity:", len(uniqueRowsSet))
    #print(" num of pairs", len(uniqueRowsSet)*(len(uniqueRowsSet)-1)/2)
    print(" num of pairs", len(pairs))
    print(" instead of", matrix.shape[0]*(matrix.shape[0]-1)/2)
    print("improved by", len(pairs) / (matrix.shape[0]*(matrix.shape[0]-1)/2)*100, "%")
    #print("num of pairs", len(pairs))
    #print("num unique i", len(set([i for i, j in pairs])))
    #print("num unique j", len(set([j for i, j in pairs])))
    #print("num unique rows", len(uniqueRows))
    #map_i = {i: index for i, index in enumerate(uniqueRowsSet)}
    #map_i_array = np.array([map_i[i] for i in range(len(map_i))])
    
    #subset_matrix = matrix[list(uniqueRowsSet)]
    
    #subset_similarity_matrix = np.full((subset_matrix.shape[0], subset_matrix.shape[0]), np.inf)
    
    print("Computing jaccard similarity on subset matrix...")
    #print("subset matrix", subset_matrix.shape)

    with ProgressBar(total=len(pairs)) as progress:
        distance_pairs = compute_subset_similarity_matrix_only_pairs(matrix, matrixMerch, pairs, progress)
        
    if len(neverSeen) > 0:
        for i, n in enumerate(neverSeen):
            distance_pairs = np.concatenate([distance_pairs, [1]*(matrix.shape[0]-1-i)])
        
        pairs = np.concatenate([pairs, np.array([[i, j] for i,n  in enumerate(neverSeen) for j in range(i, matrix.shape[0]) if i != j])])
    print("pairs", pairs.shape, pairs[-10:])
    # map back to original matrix
    print("Mapping back to original matrix...")
    # Create arrays of indices
    # Create data array for COO matrix
    indices_i, indices_j = np.array(pairs).T
    data = np.concatenate([distance_pairs, distance_pairs])

    # Create row and column index arrays for COO matrix
    rows = np.concatenate([indices_i, indices_j])
    cols = np.concatenate([indices_j, indices_i])

    # Create COO matrix
    similarity_matrix = coo_matrix((data, (rows, cols)), shape=(matrix.shape[0], matrix.shape[0]))
    
    # indices_i, indices_j = np.triu_indices(subset_similarity_matrix.shape[0], k=1)
    # similarity_matrix = similarity_matrix.tocsr()
    # # Update the similarity matrix
    # similarity_matrix[map_i_array[indices_i], map_i_array[indices_j]] = subset_similarity_matrix[indices_i, indices_j]
    # similarity_matrix[map_i_array[indices_j], map_i_array[indices_i]] = subset_similarity_matrix[indices_i, indices_j]
    
    # for i, j in tqdm(pairs, desc="lsh sim"):
    #     similarity_matrix[i, j] = np.count_nonzero(matrix[i, :] == matrix[j, :]) / matrix.shape[1]
    #     similarity_matrix[j, i] = similarity_matrix[i, j]
    
    #similarity_matrix.setdiag(1)
    similarity_matrix = similarity_matrix.tocsr()
    
    return similarity_matrix


# def jaccard_similarity_minhash_lsh(matrix, thresh_user=0.2):
#     #similarity_matrix = csr_matrix((matrix.shape[0], matrix.shape[0]), dtype=np.float64)
#     #similarity_matrix = lil_matrix((matrix.shape[0], matrix.shape[0]), dtype=np.float64)
#     pairs = lsh(matrix, thresh_user=thresh_user)
#     #uniqueRows = np.unique([i for i, j in pairs] + [j for i, j in pairs])
#     uniqueRowsSet = set([i for i, j in pairs] + [j for i, j in pairs]) # (1,2) (1,4) (1,5)
#     #print("uniqueRows numpy", len(uniqueRows))
#     print("num of subset of rows to check similarity:", len(uniqueRowsSet))
#     #print(" num of pairs", len(uniqueRowsSet)*(len(uniqueRowsSet)-1)/2)
#     print(" num of pairs", len(pairs))
#     print(" instead of", matrix.shape[0]*(matrix.shape[0]-1)/2)
#     print("improved by", len(pairs) / (matrix.shape[0]*(matrix.shape[0]-1)/2)*100, "%")
#     #print("num of pairs", len(pairs))
#     #print("num unique i", len(set([i for i, j in pairs])))
#     #print("num unique j", len(set([j for i, j in pairs])))
#     #print("num unique rows", len(uniqueRows))
#     map_i = {i: index for i, index in enumerate(uniqueRowsSet)}
#     map_i_array = np.array([map_i[i] for i in range(len(map_i))])
    
#     subset_matrix = matrix[list(uniqueRowsSet)]
    
#     subset_similarity_matrix = np.full((subset_matrix.shape[0], subset_matrix.shape[0]), np.inf)
    
#     print("Computing jaccard similarity on subset matrix...")
#     print("subset matrix", subset_matrix.shape)
    
#     # subset1 = subset_matrix[:, None, :]
#     # subset2 = subset_matrix[None, :, :]
#     # min_matrix = np.minimum(subset1, subset2) # (10, 100) -> (10, 1, 100) -> (1, 10, 100) -> (10, 10, 100)
#     # sum_min_matrix = np.sum(min_matrix, axis=-1)
#     # print("sum_min_matrix", sum_min_matrix.shape)
    
#     # max_matrix = np.maximum(subset1, subset2)
#     # sum_max_matrix = np.sum(max_matrix, axis=-1)
#     # print("sum_max_matrix", sum_max_matrix.shape)
    
#     #subset_similarity_matrix =  sum_min_matrix / sum_max_matrix
#     #subset_similarity_matrix = np.divide(sum_min_matrix, sum_max_matrix, out=np.zeros_like(sum_min_matrix), where=(sum_max_matrix != 0))
    
#     # for i in tqdm(range(subset_matrix.shape[0])):
#     #     subset1 = np.vstack([subset_matrix[i]] * subset_matrix.shape[0])
#     #     subset2 = subset_matrix
#     #     min_matrix = np.minimum(subset1, subset2) # (10, 100) -> (10, 1, 100) -> (1, 10, 100) -> (10, 10, 100)
#     #     sum_min_matrix = np.sum(min_matrix, axis=-1)
#     #     #print("sum_min_matrix", sum_min_matrix.shape)
        
#     #     max_matrix = np.maximum(subset1, subset2)
#     #     sum_max_matrix = np.sum(max_matrix, axis=-1)
#     #     #print("sum_max_matrix", sum_max_matrix.shape)
        
#     #     subset_similarity_matrix[i] =  (sum_min_matrix / sum_max_matrix).T
#     with ProgressBar(total=subset_matrix.shape[0]) as progress:
#         subset_similarity_matrix = compute_subset_similarity_matrix(subset_matrix, progress)
    
#     # map back to original matrix
#     print("Mapping back to original matrix...")
#     # Create arrays of indices
#     # Create data array for COO matrix
#     indices_i, indices_j = np.triu_indices(subset_similarity_matrix.shape[0], k=1)
#     data = np.concatenate([subset_similarity_matrix[indices_i, indices_j], subset_similarity_matrix[indices_i, indices_j]])

#     # Create row and column index arrays for COO matrix
#     rows = np.concatenate([map_i_array[indices_i], map_i_array[indices_j]])
#     cols = np.concatenate([map_i_array[indices_j], map_i_array[indices_i]])

#     # Create COO matrix
#     similarity_matrix = coo_matrix((data, (rows, cols)), shape=(matrix.shape[0], matrix.shape[0]))
    
#     # indices_i, indices_j = np.triu_indices(subset_similarity_matrix.shape[0], k=1)
#     # similarity_matrix = similarity_matrix.tocsr()
#     # # Update the similarity matrix
#     # similarity_matrix[map_i_array[indices_i], map_i_array[indices_j]] = subset_similarity_matrix[indices_i, indices_j]
#     # similarity_matrix[map_i_array[indices_j], map_i_array[indices_i]] = subset_similarity_matrix[indices_i, indices_j]
    
#     # for i, j in tqdm(pairs, desc="lsh sim"):
#     #     similarity_matrix[i, j] = np.count_nonzero(matrix[i, :] == matrix[j, :]) / matrix.shape[1]
#     #     similarity_matrix[j, i] = similarity_matrix[i, j]
    
#     #similarity_matrix.setdiag(1)
#     similarity_matrix = similarity_matrix.tocsr()
    
#     return similarity_matrix

def jaccard_similarity_minhash_lsh_two_matrices(matrix1, matrix2, thresh_user=0.2):
    #similarity_matrix = csr_matrix((matrix.shape[0], matrix.shape[0]), dtype=np.float64)
    #similarity_matrix = lil_matrix((matrix1.shape[0], matrix2.shape[0]), dtype=np.float64)
    pairs = lsh(matrix1, thresh_user=thresh_user)
    #uniqueRows = np.unique([i for i, j in pairs] + [j for i, j in pairs])
    uniqueRowsSet = set([i for i, j in pairs] + [j for i, j in pairs])
    #print("uniqueRows numpy", len(uniqueRows))
    print("num of subset of rows to check similarity:", len(uniqueRowsSet))
    print(" num of pairs", len(uniqueRowsSet)*(len(uniqueRowsSet)-1)/2)
    print(" instead of", matrix1.shape[0]*(matrix1.shape[0]-1)/2)
    print("improved by", (1-len(uniqueRowsSet)*(len(uniqueRowsSet)-1)/2/(matrix1.shape[0]*(matrix1.shape[0]-1)/2))*100, "%")
    #print("num of pairs", len(pairs))
    #print("num unique i", len(set([i for i, j in pairs])))
    #print("num unique j", len(set([j for i, j in pairs])))
    #print("num unique rows", len(uniqueRows))
    map_i = {i: index for i, index in enumerate(uniqueRowsSet)}
    map_i_array = np.array([map_i[i] for i in range(len(map_i))])
    
    subset_matrix1 = matrix1[list(uniqueRowsSet)]
    subset_matrix2 = matrix2[list(uniqueRowsSet)]
    
    subset_similarity_matrix = np.full((subset_matrix1.shape[0], subset_matrix2.shape[0]), np.inf)
    
    print("Computing jaccard similarity on subset matrix...")
    print("subset matrix", subset_matrix1.shape)

    with ProgressBar(total=subset_matrix1.shape[0]) as progress:
        subset_similarity_matrix = compute_subset_similarity_matrix(subset_matrix1, progress)
    
    # map back to original matrix
    print("Mapping back to original matrix...")
    # Create arrays of indices
    # Create data array for COO matrix
    indices_i, indices_j = np.triu_indices(subset_similarity_matrix.shape[0], k=1)
    data = np.concatenate([subset_similarity_matrix[indices_i, indices_j], subset_similarity_matrix[indices_i, indices_j]])
    
    # Create row and column index arrays for COO matrix
    rows = np.concatenate([map_i_array[indices_i], map_i_array[indices_j]])
    cols = np.concatenate([map_i_array[indices_j], map_i_array[indices_i]])
    
    # Create COO matrix
    similarity_matrix = coo_matrix((data, (rows, cols)), shape=(matrix1.shape[0], matrix2.shape[0]))
    similarity_matrix.setdiag(0)
    
    similarity_matrix = similarity_matrix.tocsr()
    
    return similarity_matrix
    

def jaccard_similarity_two_matrices(matrix1, matrix2):
    #intersection = np.dot(matrix, matrix.T)
    intersection = np.dot(matrix1, matrix2.T)
    row_sums1 = matrix1.sum(axis=1)
    row_sums2 = matrix2.sum(axis=1)
    union = row_sums1[:, None] + row_sums2 - intersection
    union = np.where(union == 0, 1, union)  # avoid division by zero
    jaccard_similarity = intersection / union
    return jaccard_similarity

def jaccard_similarity_matrix_merch(matrix):
    print("matrix", matrix.shape)
    min_matrix = np.minimum(matrix[:, None, :], matrix[None, :, :]) # (10, 100) -> (10, 1, 100) -> (1, 10, 100) -> (10, 10, 100)
    sum_min_matrix = np.sum(min_matrix, axis=-1)
    print("sum_min_matrix", sum_min_matrix.shape)
    
    max_matrix = np.maximum(matrix[:, None, :], matrix[None, :, :])
    sum_max_matrix = np.sum(max_matrix, axis=-1)
    print("sum_max_matrix", sum_max_matrix.shape)
    
    jaccard_similarity = sum_min_matrix / sum_max_matrix
    return jaccard_similarity

    # current wrong method
    # 1 2 3    1 4 7    1 2 3  6     6 6 6
    # 4 5 6    2 5 8    2 5 6  13    6 15 15
    # 7 8 9    3 6 9    3 6 9        6 15 24
    
    
    # right method
    # 1 2 3    1 2 3    1 2 3
    # 1 2 3    4 5 6    1 2 3
    # 1 2 3    7 8 9    1 2 3
    
    # 4 5 6    1 2 3    1 2 3
    # 4 5 6    4 5 6    4 5 6
    # 4 5 6    7 8 9    4 5 6
    
    # 7 8 9    1 2 3    1 2 3
    # 7 8 9    4 5 6    4 5 6
    # 7 8 9    7 8 9    7 8 9
    
    
    #                   6 6 6
    #                   6 15 15
    #                   6 15 24
    
def similarity_matrix_merch(matrix):
    if 1.0 - np.count_nonzero(matrix) / matrix.size > 0.5:
        print("merch matrix is sparse")
    matrix = csr_matrix(matrix)
    print("matrix merch shape", matrix.shape)
    simMatrix = cosine_similarity(matrix, dense_output=False)
    
    return simMatrix

def create_binary_matrix(routeSets):
    uniqueShingles = list(set(shingle for route in routeSets for shingle in route[1]))
    print("uniqueShingles", len(uniqueShingles))

    # Create a dictionary that maps each shingle to its index
    shingle_to_index = {shingle: index for index, shingle in enumerate(uniqueShingles)}
    print("shingle_to_index", len(shingle_to_index))

    binaryMatrix = np.zeros((len(routeSets), len(uniqueShingles)), dtype=int)

    for i, route in enumerate(routeSets):
        #print("i", i)
        # Get the indices of the shingles in this route
        indices = [shingle_to_index[shingle] for shingle in route[1]]
        # Use advanced indexing to set the corresponding elements in the binary matrix to 1
        binaryMatrix[i, indices] = 1

    return binaryMatrix

def create_binary_matrix_minhash(matrix):
    numUnique = np.unique(matrix)
    binaryMatrix = np.zeros((matrix.shape[0], len(numUnique)), dtype=int)
    for i, route in enumerate(matrix):
        indices = np.where(route == 1)[0]
        binaryMatrix[i, indices] = 1
    
    return binaryMatrix

def create_binary_matrices(routeSet1, routeSet2):
    # create binary matrix where each row represents a route
    uniqueShinglesBoth = list(set([shingle for route in routeSet1 for shingle in route[1]] + [shingle for route in routeSet2 for shingle in route[1]]))
    binaryMatrix1 = np.zeros((len(routeSet1), len(uniqueShinglesBoth)))
    binaryMatrix2 = np.zeros((len(routeSet2), len(uniqueShinglesBoth)))
    for i, route in enumerate(routeSet1):
        for shingle in route[1]:
            binaryMatrix1[i][uniqueShinglesBoth.index(shingle)] = 1
            
    for i, route in enumerate(routeSet2):
        for shingle in route[1]:
            binaryMatrix2[i][uniqueShinglesBoth.index(shingle)] = 1
    return binaryMatrix1, binaryMatrix2

def find_num_hashes_minhash(matrix):
    if matrix.shape[1] < 1000:
        num_hash_functions = matrix.shape[1]//10
    elif matrix.shape[1] < 10_000:
        num_hash_functions = 150
    elif matrix.shape[1] < 100_000:
        num_hash_functions = 250
    else:
        num_hash_functions = 300
    return num_hash_functions


# convert routes and merchandise to binary matrices
# binary matrix where each row represents a route
print("Creating route binary matrix...")
route_matrix, route_matrix_standard = create_binary_matrices(actualSets, standardSets)
print("\nroute_matrix actual", route_matrix.shape, route_matrix[0])
print("\nroute_matrix standard", route_matrix_standard.shape, route_matrix_standard[0])

print("Minhashing route matrix...")    
num_hash_functions = find_num_hashes_minhash(route_matrix)
route_matrix = minhash(route_matrix, num_hash_functions if num_hash_functions % 2 == 0 else num_hash_functions + 1)
print("\nroute_matrix minhash", route_matrix.shape, route_matrix[0])
# binary matrix where each row represents merchandise

print("Creating merchandise binary matrix...")
merch_matrix = np.array([s[2] for s in actualSets])

print("\nmerch_matrix", merch_matrix.shape, merch_matrix)
print("merch_matrix contains nan", np.isnan(merch_matrix).any())

# compute Jaccard similarity for each matrix
# print("Computing Jaccard similarity route matrix...")
# route_similarity = jaccard_similarity_minhash_lsh(route_matrix, thresh_user=0.4)
# #route_similarity = jaccard_similarity_matrix(route_matrix)
# print("\nroute_similarity", type(route_similarity), route_similarity.shape,route_similarity[0, 0], route_similarity[0])
# #merch_similarity = jaccard_similarity_matrix_merch(merch_matrix)
# print("Computing Jaccard similarity merchandise matrix...")
# #merch_similarity = similarity_matrix_merch(merch_matrix)
# merch_similarity_lsh = jaccard_similarity_minhash_lsh(merch_matrix, thresh_user=0.4)
# print("\nmerch_similarity", type(merch_similarity_lsh), merch_similarity_lsh.shape, merch_similarity_lsh[0])

print("Computing Jaccard similarity route matrix...")
actualSetsDistances = jaccard_similarity_minhash_lsh_route_merch(route_matrix, merch_matrix, thresh_user=0.7)
#route_similarity = jaccard_similarity_matrix(route_matrix)
print("\nactualSetsDistances", type(actualSetsDistances), actualSetsDistances.shape,actualSetsDistances[0, 0], actualSetsDistances[0])


# # compute final Jaccard distance
# print("Multiplying Jaccard similarities...")
# actualSetsDistances = (route_similarity.multiply(merch_similarity_lsh))
# actualSetsDistances = np.nan_to_num(actualSetsDistances, nan=0)
#actualSetsDistances = 1 - actualSetsDistances
#print("\nactualSetsDistances", actualSetsDistances.shape, actualSetsDistances[0, 0], actualSetsDistances[0])

# Essentials for Task 2

# standardToActualSetsDistances = None
# #route_matrix_standard = create_binary_matrix(standardSets)
# print("\nroute_matrix_standard", route_matrix_standard.shape, route_matrix_standard[0])
# route_matrix_standard = minhash(route_matrix_standard, num_hash_functions if num_hash_functions % 2 == 0 else num_hash_functions + 1)
# print("\nroute_matrix_standard minhash", route_matrix_standard.shape, route_matrix_standard[0])

# merch_matrix_standard = np.array([s[2] for s in standardSets])

# route_similarity_standard_to_actual = jaccard_similarity_minhash_lsh_two_matrices(route_matrix, route_matrix_standard, thresh_user=0.0)
# print("\nroute_similarity_standard_to_actual", route_similarity_standard_to_actual.shape, route_similarity_standard_to_actual[0])

# merch_similarity_lsh_standard_to_actual = jaccard_similarity_minhash_lsh_two_matrices(merch_matrix, merch_matrix_standard, thresh_user=0.0)





  @jit(cache=True, nogil=True, parallel=True)


Creating route binary matrix...

route_matrix actual (10000, 870) [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 

minhashing: 100%|██████████| 10000/10000 [00:00<00:00, 134654.66it/s]


route_matrix minhash (10000, 88) [ 26.  29.   5.  14.   6.   4.   8.  16.  36.  25.  65.  47.  69.   4.
  94.  33.  58.  22. 102.  28.  85.  52.  50.   7.   3.   7.  38.  58.
  53.  21.  16.   9. 111.  14.  21.  29.   0.   7.  10. 122.  23.  54.
  47.  12.  27.   1.  12.  17.   7.  11.  35.  20.   3.   4.  44. 112.
 136. 125.  34.  17.  14.  83.  46.  13.   3.  50.  21.   6.   0.  27.
   0.   9.  25. 104.  40.  60.  22.  22.  51.  40.  25.  18.  23.  92.
  13.  67.  19.  53.]
Creating merchandise binary matrix...

merch_matrix (10000, 10) [[0.26958333 0.28291667 0.22333333 ... 0.24583333 0.31041667 0.25041667]
 [0.2675     0.32208333 0.25125    ... 0.25833333 0.32125    0.28291667]
 [0.2208     0.3316     0.214      ... 0.2488     0.3228     0.2544    ]
 ...
 [0.15       0.105      0.29       ... 0.845      0.155      0.105     ]
 [0.6        0.07       0.48333333 ... 0.56333333 0.31666667 0.07      ]
 [0.14666667 0.07       0.29       ... 0.56333333 0.31666667 0.07      ]]
merch_matr




Finding candidate pairs...


100%|██████████| 10000/10000 [00:01<00:00, 7138.56it/s]


neverSeen {8192, 8196, 5, 6, 8197, 8, 8200, 10, 11, 8201, 8202, 8203, 8207, 8208, 17, 18, 19, 21, 25, 27, 8219, 29, 8220, 33, 8227, 36, 37, 38, 8231, 8232, 47, 8239, 8241, 8242, 51, 8243, 8245, 8246, 58, 8253, 8254, 67, 69, 74, 8269, 78, 79, 82, 84, 8277, 86, 87, 88, 89, 90, 8281, 92, 93, 8283, 8284, 98, 99, 105, 8297, 8299, 110, 8304, 119, 8311, 8312, 8315, 127, 128, 132, 8324, 8327, 8329, 8330, 143, 8338, 8340, 8346, 8349, 159, 162, 165, 166, 168, 8361, 8367, 177, 8370, 181, 184, 185, 186, 188, 190, 191, 8384, 194, 199, 8391, 203, 8398, 207, 208, 8399, 210, 211, 8403, 8405, 8406, 216, 222, 224, 225, 237, 8430, 239, 8431, 241, 8432, 8433, 8435, 245, 8438, 252, 8447, 8449, 260, 261, 8452, 8457, 266, 267, 268, 271, 272, 273, 274, 278, 285, 289, 290, 291, 8483, 8485, 294, 295, 8488, 8489, 304, 305, 306, 307, 8497, 8500, 8502, 8503, 8504, 8505, 8506, 315, 8507, 319, 321, 323, 8516, 325, 8518, 8521, 332, 8524, 334, 8527, 336, 8528, 8533, 343, 8539, 8540, 8541, 351, 353, 356, 8549, 8550, 35

  0%|          | 0/716180 [00:00<?, ?it/s]

pairs (20768989, 2) [[2260 9990]
 [2260 9991]
 [2260 9992]
 [2260 9993]
 [2260 9994]
 [2260 9995]
 [2260 9996]
 [2260 9997]
 [2260 9998]
 [2260 9999]]
Mapping back to original matrix...

actualSetsDistances <class 'scipy.sparse._csr.csr_matrix'> (10000, 10000) 0.0   (0, 1)	1.0
  (0, 2)	1.0
  (0, 3)	1.0
  (0, 4)	1.0
  (0, 5)	1.0
  (0, 6)	1.0
  (0, 7)	1.0
  (0, 8)	1.0
  (0, 9)	1.0
  (0, 10)	1.0
  (0, 11)	1.0
  (0, 12)	1.0
  (0, 13)	1.0
  (0, 14)	1.0
  (0, 15)	1.0
  (0, 16)	1.0
  (0, 17)	1.0
  (0, 18)	1.0
  (0, 19)	1.0
  (0, 20)	1.0
  (0, 21)	1.0
  (0, 22)	1.0
  (0, 23)	1.0
  (0, 24)	1.0
  (0, 25)	1.0
  :	:
  (0, 9975)	1.0
  (0, 9976)	1.0
  (0, 9977)	1.0
  (0, 9978)	1.0
  (0, 9979)	1.0
  (0, 9980)	1.0
  (0, 9981)	1.0
  (0, 9982)	1.0
  (0, 9983)	1.0
  (0, 9984)	1.0
  (0, 9985)	1.0
  (0, 9986)	1.0
  (0, 9987)	1.0
  (0, 9988)	1.0
  (0, 9989)	1.0
  (0, 9990)	1.0
  (0, 9991)	1.0
  (0, 9992)	1.0
  (0, 9993)	1.0
  (0, 9994)	1.0
  (0, 9995)	1.0
  (0, 9996)	1.0
  (0, 9997)	1.0
  (0, 9998)	1.0
  (0

In [7]:
#np.mean(np.mean(np.abs(actualSetsDistances - test), axis=1))

In [8]:
# def jaccard_distance_routes(route1, route2):
#     id1 = route1[0]
#     id2 = route2[0]
#     r1 = set(route1[1])
#     r2 = set(route2[1])
#     merch1 = route1[2]
#     merch2 = route2[2]
    
#     intersection = len(list(r1.intersection(r2)))
#     union = (len(r1) + len(r2)) - intersection
#     jaccard_similarity = float(intersection) / union if union != 0 else 0
    
#     intersectionMerch = np.sum(np.minimum(merch1, merch2))
#     unionMerch = np.sum(np.maximum(merch1, merch2))
#     jaccard_similarity_merch = float(intersectionMerch) / unionMerch if unionMerch != 0 else 0
    
#     return 1 - (jaccard_similarity + jaccard_similarity_merch) / 2

# forward_expansion = len(actualSets) // len(standardSets)
# # precompute the distances between the elements of the actual sets
# actualSetsDistances = np.zeros((len(actualSets), len(actualSets)))
# for i in range(len(actualSets)):
#     for j in range(len(actualSets)):
#         actualSetsDistances[i,j] = jaccard_distance_routes(actualSets[i], actualSets[j])
#print("actualSetsDistances: ", actualSetsDistances.shape, actualSetsDistances[0])

In [7]:
# HDBSCAN clustering
# Compute HDBSCAN

# import gridsearchcv

#grid = GridSearchCV(hdbscan, scoring=hdbscan.probabilities_, , verbose=1)
# min_cluster_size = [10, 15, 20, 25, 30, 35, 40, 45, 50]
# max_cluster_size = [55, 60, 65, 75, 80, 85, 90, 95, 100]
forward_expansion = len(actualSets) // len(standardSets) #! TODO: change to mean
print("forward_expansion", forward_expansion)
# from hdbscan import HDBSCAN as hdbsc

#actualSetsDistances = actualSetsDistances.astype(np.float64)

# find rows with only zeros and set them to 1 of csr matrix
#actualSetsDistances[actualSetsDistances.getnnz(1)==0, :] = 1
print("get nnz", np.where((actualSetsDistances.getnnz(1)==0)==True)[0])
print("get nnz of 2 values", (actualSetsDistances==2))
print(actualSetsDistances[:, 9816])
#actualSetsDistances[:, actualSetsDistances.getnnz(1)==0] = 1

#actualSetsDistances.setdiag(0)


print("type(actualSetsDistances)", type(actualSetsDistances), actualSetsDistances.dtype, actualSetsDistances.shape, actualSetsDistances.count_nonzero(), min(actualSetsDistances.getnnz(axis=-1)), np.unique(actualSetsDistances.data))
#actualSetsDistances = np.array(actualSetsDistances)

#print("get nnz of 2 values", np.where((actualSetsDistances==2))[0])


hdb = HDBSCAN(min_cluster_size=forward_expansion//3, max_cluster_size=forward_expansion, metric="precomputed", store_centers=None,allow_single_cluster=False, metric_params={"max_distance": 0} ).fit(actualSetsDistances.copy())
#hdb = DBSCAN(eps=0.5, min_samples=40, metric="precomputed").fit(actualSetsDistances.copy())

labels_HDBSCAN = hdb.labels_
print("num clusters found: ", len(set(labels_HDBSCAN)))
print("biggest cluster: ", max(labels_HDBSCAN, key=list(labels_HDBSCAN).count), " num elements: ", list(labels_HDBSCAN).count(max(labels_HDBSCAN, key=list(labels_HDBSCAN).count)))

# Create a color map that maps each unique label to a color
unique_labels = np.unique(labels_HDBSCAN)
#unique_labels = unique_labels[unique_labels != -1]
print("unique_labels: ", len(unique_labels), unique_labels, "standard sets len", len(standardSets))

#centroids = hdb.centroids_
#medoids = hdb.medoids_

#print("centroids: ", centroids.shape)
#print("medoids: ", medoids.shape)

# find the medoids using the clusters found by HDBSCAN
print("actualSetsDistances: ", actualSetsDistances.shape, actualSetsDistances[0])
medoidsIndices = []
cluster_mean_distances = []
for cluster in unique_labels:
    #print("cluster: ", cluster)
    if cluster in [-1, -2, -3]:
        continue
    cluster_elements = np.where(labels_HDBSCAN == cluster)[0]
    #print("cluster_elements: ", cluster_elements.shape, cluster_elements)
    #print("cluster_elements: ", actualSetsDistances[cluster_elements].shape)
    cluster_distances = actualSetsDistances[cluster_elements][:,cluster_elements]
    #print("cluster_distances: ", cluster_distances.shape, cluster_distances)
    #print("real distance", actualSetsDistances[cluster_elements[0], cluster_elements[0]])
    cluster_distances_sum = np.sum(cluster_distances, axis=1)
    cluster_distances_mean = np.mean(cluster_distances, axis=1)
    cluster_mean_distances.append(np.min(cluster_distances_mean))
    #print("cluster min mean distance: ", np.min(cluster_distances_mean))
    medoid = cluster_elements[np.argmin(cluster_distances_sum)]
    #medoidMean = cluster_elements[np.argmin(cluster_distances_mean)]
    #print("medoidMean", medoidMean, "medoid", medoid)
    #print("medoid", medoid)
    medoidsIndices.append(medoid)
medoidsIndices = np.array(medoidsIndices)

print("medoidsIndices: ", medoidsIndices.shape, medoidsIndices)
print("cluster_mean_distances: ", len(cluster_mean_distances), cluster_mean_distances)


forward_expansion 1000
get nnz []
get nnz of 2 values 
  (0, 0)	1.0
  (1, 0)	1.0
  (2, 0)	1.0
  (3, 0)	1.0
  (4, 0)	1.0
  (5, 0)	1.0
  (6, 0)	1.0
  (7, 0)	1.0
  (8, 0)	1.0
  (9, 0)	1.0
  (10, 0)	1.0
  (11, 0)	1.0
  (12, 0)	1.0
  (13, 0)	1.0
  (14, 0)	1.0
  (15, 0)	1.0
  (16, 0)	1.0
  (17, 0)	1.0
  (18, 0)	1.0
  (19, 0)	1.0
  (20, 0)	1.0
  (21, 0)	1.0
  (22, 0)	1.0
  (23, 0)	1.0
  (24, 0)	1.0
  :	:
  (2236, 0)	1.0
  (2237, 0)	1.0
  (2238, 0)	1.0
  (2239, 0)	1.0
  (2240, 0)	1.0
  (2241, 0)	1.0
  (2242, 0)	1.0
  (2243, 0)	1.0
  (2244, 0)	1.0
  (2245, 0)	1.0
  (2246, 0)	1.0
  (2247, 0)	1.0
  (2248, 0)	1.0
  (2249, 0)	1.0
  (2250, 0)	1.0
  (2251, 0)	1.0
  (2252, 0)	1.0
  (2253, 0)	1.0
  (2254, 0)	1.0
  (2255, 0)	1.0
  (2256, 0)	1.0
  (2257, 0)	1.0
  (2258, 0)	1.0
  (2259, 0)	1.0
  (2260, 0)	1.0
type(actualSetsDistances) <class 'scipy.sparse._csr.csr_matrix'> float64 (10000, 10000) 40591320 2261 [0.00000000e+00 3.27538374e-06 5.15742154e-06 ... 1.00832836e+00
 1.00838206e+00 1.01112835e+00]


: 

In [43]:
completeSets = actualSets + standardSets
print("completeSets: ", len(completeSets))
route_matrix_with_standard = create_binary_matrix(completeSets)
# binary matrix where each row represents merchandise
merch_matrix_with_standard = np.array([s[2] for s in completeSets])

# compute Jaccard similarity for each matrix
route_similarity_with_standard = jaccard_similarity_matrix(route_matrix_with_standard)
merch_similarity_with_standard = similarity_matrix_merch(merch_matrix_with_standard).toarray()
print("route_similarity_with_standard", route_similarity_with_standard.shape, route_similarity_with_standard[0])
print("merch_similarity_with_standard", merch_similarity_with_standard.shape, merch_similarity_with_standard[0])
completeSetsDistances = np.multiply(route_similarity_with_standard, merch_similarity_with_standard)
print("completeSetsDistances", type(completeSetsDistances), completeSetsDistances.shape, completeSetsDistances[0])
completeSetsDistances = np.nan_to_num(completeSetsDistances, nan=0)
completeSetsDistances = 1 - completeSetsDistances
completeSetsDistances = np.array(completeSetsDistances)
print("completeSetsDistances", completeSetsDistances.shape, completeSetsDistances[0])

perplexity = 50 if len(completeSetsDistances) > 50 else len(completeSetsDistances) - 1
completeSetTSNE = TSNE(n_components=3, perplexity=perplexity, n_iter=1000, verbose=1).fit_transform(completeSetsDistances)

completeSets:  10010
uniqueShingles 870
shingle_to_index 870
matrix jaccard is sparse
intersection (10010, 10010) <class 'numpy.matrix'>
row_sums (10010,)
union (10010, 10010)
union (10010, 10010)
jaccard_similarity (10010, 10010) <class 'numpy.matrix'>
jaccard_similarity contains nan False
matrix merch shape (10010, 10)
route_similarity_with_standard (10010, 10010) [[1.         0.77777778 0.68965517 ... 0.         0.         0.        ]]
merch_similarity_with_standard (10010, 10010) [1.         0.99657303 0.99287173 ... 0.95588298 0.92375341 0.82649171]
completeSetsDistances <class 'numpy.matrix'> (10010, 10010) [[1.         0.77511236 0.68473912 ... 0.         0.         0.        ]]
completeSetsDistances (10010, 10010) [1.11022302e-16 2.24887642e-01 3.15260876e-01 ... 1.00000000e+00
 1.00000000e+00 1.00000000e+00]
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 10010 samples in 0.020s...
[t-SNE] Computed neighbors for 10010 samples in 8.303s...
[t-SNE] Computed conditiona

In [44]:


# reorder the labels to have colors matching the cluster results, using medoids which are closer to the standard vectors
medoidSets = [actualSets[i] for i in medoidsIndices]
print("medoidSets: ", len(medoidSets))

num_clusters_unique = unique_labels[unique_labels >= 0]
#print("num_clusters_unique: ", len(num_clusters_unique), num_clusters_unique)

assert len(medoidSets) == len(num_clusters_unique), "The number of medoids is not equal to the number of unique labels"   

if len(medoidSets) == 0:
    print("No clustroids found")
else:

    route_matrix_standard, route_matrix_medoids = create_binary_matrices(standardSets, medoidSets)

    simMatrixMixed = jaccard_similarity_two_matrices(route_matrix_medoids, route_matrix_standard)

    #print("route_matrix_standard: ", route_matrix_standard.shape)

    #print("route_matrix_actual: ", route_matrix_medoids.shape)

    #print("simMatrixMixed: ", simMatrixMixed.shape, simMatrixMixed[0])


    CAN_BE_ORDERED = False
    # get the closest standard vector for each medoid using simMatrixMixed

    argmax = np.argmax(simMatrixMixed, axis=1) # get the index of the closest standard vector for each medoid
    print("argmax: ", argmax.shape, argmax)

    if len(set(argmax)) == len(medoidsIndices): # if the argmax are all different, then the medoids can be reordered
        print("argmax is correct, can be reordered")
        CAN_BE_ORDERED = True
        # reorder medoidsIndices
        #print("argmax: ", argmax.shape, argmax)
        #print("medoidsIndices: ", medoidsIndices.shape, medoidsIndices)
        
        
        # Create an array of zeros with the same shape as the original array
        # result = np.zeros_like(medoidsIndices)
        # argsort = np.argsort(argmax)
        # print("argsort: ", argsort.shape, argsort)
        # # Fill the result array using the permutation indices
        # result[argsort] = medoidsIndices
        # medoidsIndicesReordered = result # reorder medoidsIndices
        # result = np.zeros_like(medoidsIndices)
        # unique_labels_filtered = unique_labels[unique_labels >= 0]
        # result[argsort] = unique_labels_filtered
        unique_labels_reordered = argmax   # 4, 2, 5, 10, ... -> 10, 4, 5, 2, ...
        
        #medoidsIndicesReordered = medoidsIndices[argmax] # reorder medoidsIndices
        #print("medoidsIndices: ", medoidsIndicesReordered.shape, medoidsIndicesReordered)
        #unique_labels = unique_labels[argmax]


    # for i in range(len(standardSets)):
    #     #distances = np.linalg.norm(medoids - standardVectors[i], axis=1)
    #     distancesCosine = []
    #     for j in range(len(medoidsIndices)):
    #         distancesCosine.append(cosine(actualSets[j], standardSets[i]))
    #     closest_medoid = np.argmin(distancesCosine)
        
    #     if closest_medoid not in unique_labels_reordered:
    #         unique_labels_reordered.append(closest_medoid)
    #     else:
    #         print("closest_medoid already in unique_labels_reordered")
    #         can_be_reordered = False
    #         break

    if not CAN_BE_ORDERED:
        #unique_labels_reordered = unique_labels

        #unique_labels = unique_labels_reordered
        #unique_labels = np.unique(labels_HDBSCAN)
        
        print("unique_labels: ", len(unique_labels), unique_labels)
    else:
        print("unique_labels_reordered: ", len(unique_labels_reordered), unique_labels_reordered)
        #unique_labels = unique_labels_reordered
        # compare if distances between clustroids and standard vectors are smaller than distances between standard vectors and other actual vectors

        #distancesClustroids = simMatrixMixed[np.arange(len(simMatrixMixed)), argmax]
        
    distancesClustroids = []
    distancesStandardVectors = []
    distancesStandardVectors2 = []
    # for i, clustroid in enumerate(medoidsIndices):
    #     distMedCluster = completeSetsDistances[:len(actualSets), :len(actualSets)][clustroid, labels_HDBSCAN == i]
    #     #print(labels_HDBSCAN)
    #     #print(labels_HDBSCAN == argmax[i])
    #     print("distMedCluster: ", np.sum(distMedCluster), len(distMedCluster), distMedCluster)
    #     # make distMedCluster a single list, no np.sum since they are inhomogeneous lists
    #     #print()
    #     #print(actualRefStandardIds)
    #     distStdCluster = completeSetsDistances[len(actualSets):, :len(actualSets)][i, np.where(np.array(actualRefStandardIds) == i)[0]]
    #     print("distStdCluster: ", np.sum(distStdCluster), len(distStdCluster), distStdCluster)
        
    #     distancesClustroids.append(np.mean(distMedCluster))
    #     distancesStandardVectors2.append(np.mean(distStdCluster))
    actualRefStandardIdsNumpy = np.array(actualRefStandardIds)
    for i, stdID in enumerate(standardRefIds):
        distStdCluster = completeSetsDistances[len(actualSets):, :len(actualSets)][i, np.where(actualRefStandardIdsNumpy == stdID)[0]]
        #print("distStdCluster: ", np.mean(distStdCluster), len(distStdCluster), distStdCluster)
        distancesStandardVectors.append(np.mean(distStdCluster))
    
    mean_distance_clustroids = np.mean(cluster_mean_distances)
    std_dev_distance_clustroids = np.std(cluster_mean_distances)

    mean_distance_standard_vectors = np.mean(distancesStandardVectors)
    std_dev_distance_standard_vectors = np.std(distancesStandardVectors)

    cv_clustroids = std_dev_distance_clustroids / mean_distance_clustroids
    cv_standard_vectors = std_dev_distance_standard_vectors / mean_distance_standard_vectors
    # print in green if the improvement is positive, in red if it is negative
    print("\n\033[94mMean distance from vectors of the same cluster to:")
    print("         clustroids: ", mean_distance_clustroids)
    #print("         first     : ", np.mean(distancesClustroids))
    print("   standard vectors: ", mean_distance_standard_vectors)
    #print("first                ", np.mean(distancesStandardVectors2))
    

    print("\nStd dev distance from vectors of the same cluster to:")
    print("         clustroids: ", std_dev_distance_clustroids)
    print("   standard vectors: ", std_dev_distance_standard_vectors)

    print("\nCoefficient of variation from vectors of the same cluster to:")
    print("         clustroids: ", cv_clustroids)
    print("   standard vectors: ", cv_standard_vectors)
    print("\033[0m")

    #ratio = np.mean(distancesClustroids) / np.mean(distancesStandardVectors)
    ratio = mean_distance_clustroids / mean_distance_standard_vectors
    percentage = (1 - ratio) * 100

    print("\033[93mMean:\033[0m")
    if percentage > 0:
        # print in green if the improvement is positive, in red if it is negative
        print("   Improvement: \033[92m{:.2f}% \033[0m".format(percentage))
    else:
        print("   Decline: \033[91m{:.2f}% \033[0m".format(-percentage))
        
    
    ratio = std_dev_distance_clustroids / std_dev_distance_standard_vectors
    percentage = (1 - ratio) * 100
    print("\033[93m\nStd dev:\033[0m")
    if percentage > 0:
        # print in green if the improvement is positive, in red if it is negative
        print("   Improvement: \033[92m{:.2f}% \033[0m".format(percentage))
    else:
        print("   Decline: \033[91m{:.2f}% \033[0m".format(-percentage))
        
    ratio = cv_clustroids / cv_standard_vectors
    percentage = (1 - ratio) * 100
    print("\033[93m\nTotal\033[0m")
    if percentage > 0:
        # print in green if the improvement is positive, in red if it is negative
        print("   Improvement: \033[92m{:.2f}% \033[0m".format(percentage))
    else:
        print("   Decline: \033[91m{:.2f}% \033[0m".format(-percentage))
    
    



# orderedClustroids = np.zeros_like(actualSetsDistances[medoidsIndices].copy())
# order = []
# for i, clustroid in enumerate(medoidsIndices):
#     distances = []
#     for j, standardVector in enumerate(standardSets):
#         distances.append(cosine(clustroid, standardVector))
        
#     distances = np.array(distances)
#     orderedClustroids[np.argmin(distances)] = clustroid
#     order.append(np.argmin(distances))
    
# print("order: ", order)

# if len(set(order)) == len(clustroids):
#     print("order is correct")
#     CAN_BE_ORDERED = True
#     clustroids = orderedClustroids
#     print("clustroids: ", clustroids.shape)
#     # reorder clustroidsIndices
#     clustroidsIndices = np.array(clustroidsIndices)[order]


#     # compare if distances between clustroids and standard vectors are smaller than distances between standard vectors and other standard vectors

#     distancesClustroids = []
#     distancesStandardVectors = []
#     for i, clustroid in enumerate(clustroids):
#         distancesClustroids.append(np.sum([cosine(clustroid, v) for v in actualClusters[i]]))
#         distancesStandardVectors.append(np.sum([cosine(standardVectors[i], v) for v in actualClusters[i]]))
        
#     print("distancesClustroids: ", distancesClustroids)
#     print("distancesStandardVectors: ", distancesStandardVectors)

#     print("\nMean distance from vectors of the same cluster to:")
#     print("         clustroids: ", np.mean(distancesClustroids))
#     print("   standard vectors: ", np.mean(distancesStandardVectors))

#     print("\nStd dev distance from vectors of the same cluster to:")
#     print("         clustroids: ", np.std(distancesClustroids))
#     print("   standard vectors: ", np.std(distancesStandardVectors))

#     print("\n improvement: ", 1 - np.mean(distancesClustroids) / np.mean(distancesStandardVectors), "\n")
# else:
#     print("NO ORDER IS POSSIBLE")
#     CAN_BE_ORDERED = False





medoidSets:  9
argmax:  (9,) [7 8 9 1 3 4 5 2 6]
argmax is correct, can be reordered
unique_labels_reordered:  9 [7 8 9 1 3 4 5 2 6]

[94mMean distance from vectors of the same cluster to:
         clustroids:  0.046595524441730346
   standard vectors:  0.2270523140306892

Std dev distance from vectors of the same cluster to:
         clustroids:  0.07013255910254937
   standard vectors:  0.02106142415809807

Coefficient of variation from vectors of the same cluster to:
         clustroids:  1.5051350949006501
   standard vectors:  0.09276022685790083
[0m
[93mMean:[0m
   Improvement: [92m79.48% [0m
[93m
Std dev:[0m
   Decline: [91m232.99% [0m
[93m
Total[0m
   Decline: [91m1522.61% [0m


In [45]:
max_len = max(len(standardSets), len(num_clusters_unique))
colors = plt.cm.jet(np.linspace(0, 1, max_len))
# print("colors", colors.shape, colors)

# print("medoid labels", [labels_HDBSCAN[i] for i in medoidsIndices])
# print("argmax", argmax)

if len(medoidSets) > 0 and CAN_BE_ORDERED:
    print("medoids added to plots and reordered")
    color_map = dict(zip(range(max_len), colors[unique_labels_reordered]))
    #rint("color_map", color_map)
else:
    color_map = dict(zip(range(max_len), colors))   # 0=red, 1=blue, 2=green, 3=yellow, 4=purple, 5=lightblue, 6=lightgreen, 7=lightyellow, 8=lightpurple
    
marker_colors = [color_map[label] if label > -1 else np.array([0,0,0,1]) for label in labels_HDBSCAN]
marker_colors_medoids = [color_map[label] if label > -1 else np.array([0,0,0,1]) for label in labels_HDBSCAN[medoidsIndices]]
#marker_colors_medoids = [color_map[label] if label > -1 else np.array([0,0,0,1]) for label in unique_labels]


# Create a trace for each type (centroids data)
traceStandard = go.Scatter3d(
    x=completeSetTSNE[len(actualSets):,0],
    y=completeSetTSNE[len(actualSets):,1],
    z=completeSetTSNE[len(actualSets):,2],
    mode='markers',
    marker=dict(
        size=7,
        color=colors,                # set color to an array/list of desired values
        opacity=1,
        symbol='diamond'
    )
)

if len(medoidSets) > 0:
    medoidsElements = completeSetTSNE[medoidsIndices] # medoidsIndices = [921, 123]
    traceMedoids = go.Scatter3d(
        x=medoidsElements[:,0],
        y=medoidsElements[:,1],
        z=medoidsElements[:,2],
        mode='markers',
        marker=dict(
            size=7,
            color=marker_colors_medoids,                # set color to an array/list of desired values
            opacity=1,
            symbol='cross'
        )
    )

# Create a trace for each type (centroids data)
traceActual = go.Scatter3d(
    x=completeSetTSNE[:len(actualSets),0],
    y=completeSetTSNE[:len(actualSets),1],
    z=completeSetTSNE[:len(actualSets),2],
    mode='markers',
    marker=dict(
        size=7,
        color=marker_colors,                # set color to an array/list of desired values
        opacity=0.1,
        symbol='circle'
    )
)

# Plot
if len(medoidSets) > 0:
    data = [traceStandard, traceActual, traceMedoids]
else:
    data = [traceStandard, traceActual]

layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)

fig = go.Figure(data=data, layout=layout)
fig.show()

medoids added to plots and reordered


## Ground Truth

In [44]:

colors_true = plt.cm.jet(np.linspace(0, 1, len(standardSets)))
color_map_true = dict(zip(range(max_len), colors))   # 0=red, 1=blue, 2=green, 3=yellow, 4=purple, 5=lightblue, 6=lightgreen, 7=lightyellow, 8=lightpurple
# marker colors for each point with the same color as the cluster it belongs to in the original data
marker_colors_true = [color_map_true[label] for label in actualRefStandardIds]


# Create a trace for each type (centroids data)
traceStandard_true = go.Scatter3d(
    x=completeSetTSNE[len(actualSets):,0],
    y=completeSetTSNE[len(actualSets):,1],
    z=completeSetTSNE[len(actualSets):,2],
    mode='markers',
    marker=dict(
        size=7,
        color=colors_true,                # set color to an array/list of desired values
        opacity=1,
        symbol='diamond'
    )
)

# Create a trace for each type (centroids data)
traceActual_true = go.Scatter3d(
    x=completeSetTSNE[:len(actualSets),0],
    y=completeSetTSNE[:len(actualSets),1],
    z=completeSetTSNE[:len(actualSets),2],
    mode='markers',
    marker=dict(
        size=7,
        color=marker_colors_true,                # set color to an array/list of desired values
        opacity=0.1,
        symbol='circle'
    )
)

# Plot
data = [traceStandard_true, traceActual_true]

layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)

fig = go.Figure(data=data, layout=layout)
fig.show()

## Output recommended standard routes

In [47]:
# Save the medoids to a file
with open(os.path.join("results", 'recStandard.json'), 'w', encoding="utf-8") as f:
    recStandard = []
    for i, index in enumerate(medoidsIndices):
        recRoute = {"id": "s" + str(i)}
        recRoute["route"] = dfActual.iloc[actualSets[index][0]]["route"]
        recStandard.append(recRoute)
    json.dump(recStandard, f, ensure_ascii=False, indent=2)

## t-SNE of centroids of clusters and actual routes

## t-SNE of clustroids, standard, and actual routes