# TASK 2
for each driver, creates a list of standard routes in that order so that the higher in the list a standard route is, the least the diversion of the driver will be, and 
the output of the program is: 

a file called driver.json that has for each driver, the 5 standard routes routes that if the driver does them, it minimizes the diversion. You can test this by considering as pool of standard routes those that originally the company has and also those that you recommend in the recStandard.json. The file driver.json has the following syntax:
[
	{driver:C, routes:[s10, s20, s2, s6, s10}}, 
	{driver:A, routes:[s1, s2, s22, s61, s102]}, 
….
]


In [155]:
import os
HOME = os.getcwd()
print('HOME: ',HOME)

import time
import math
import json
import random
import pandas as pd
import sys
import lxml
import sklearn as sk
import numpy as np

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import HDBSCAN, DBSCAN
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import plotly.graph_objects as go

from scipy.sparse import csr_matrix, issparse, lil_matrix, coo_matrix

from tqdm import tqdm
from pandarallel import pandarallel

from numba import njit, prange, jit
from numba_progress import ProgressBar

import networkx as nx

HOME:  c:\Users\matti\Desktop\CODE\DataMiningProject23-24


In [156]:
STANDARD_FILENAME = "standard_big.json"
ACTUAL_FILENAME = "actual_big.json"


K_SHINGLES = 2
ALPHA = 0.7 #TODO: not needed maybe

In [157]:
# load standard and actual data
print("\nReading standard data...")
with open(os.path.join('data',STANDARD_FILENAME)) as f:
    standard = json.load(f)

print("\nReading actual data...")
with open(os.path.join('data', ACTUAL_FILENAME)) as f:
    actual = json.load(f)

# load the data into a dataframe
print("\nCreating standard dataframe...")
dfStandard = pd.DataFrame(standard)
print("\nCreating actual dataframe...")
dfActual = pd.DataFrame(actual)

# print head of the dataframes
print(dfStandard.head())
print(dfActual.head())

# get the unique cities and items of the standard data
cities = []
items = []
drivers = []
longestRoute = 0
shortestRoute = np.inf
maxItemQuantity = 0

standardRefIds = []
for index, s in dfStandard.iterrows():
    #print(s)
    idS = s['id']
    route = s['route']
    standardRefIds.append(int(idS[1]))
    for trip in route:
        cities.append(trip['from']) 
        items.extend(trip['merchandise'].keys())
        maxItemQuantity = max(maxItemQuantity, max(trip['merchandise'].values()))
    if len(route) > 0:
        cities.append(route[-1]['to'])
        
    if len(route) > longestRoute:
        longestRoute = len(route)
        
    if len(route) < shortestRoute:
        shortestRoute = len(route)
print("\nFinished preparing standard data")

actualRefStandardIds = []
for index, s in dfActual.iterrows():
    #print(s)
    idS = s['id']
    route = s['route']
    idStandard = s['sroute']
    drivers.append(s['driver'])
    actualRefStandardIds.append(int(idStandard[1]))
    for trip in route:
        cities.append(trip['from'])
        items.extend(trip['merchandise'].keys())
        maxItemQuantity = max(maxItemQuantity, max(trip['merchandise'].values()))
        
    if len(route) > 0:
        cities.append(route[-1]['to'])
        
    if len(route) > longestRoute:
        longestRoute = len(route)
    
    if len(route) < shortestRoute:
        shortestRoute = len(route)
print("\nFinished preparing actual data")

# find the unique cities and items
uniqueCities = sorted(list(set(cities)))
#uniqueCities.insert(0, 'NULL')          # add NULL city, for padding vectors with different lengths (trips in routes)
uniqueItems = sorted(list(set(items)))
uniqueDrivers = sorted(list(set(drivers)))

if shortestRoute < 2:
    K_SHINGLES = 2

threeShingles = []

for i, c1 in enumerate(uniqueCities):
    for j, c2 in enumerate(uniqueCities):
        if i == j:
            continue
        for k, c3 in enumerate(uniqueCities):
            if j == k or i == k:
                continue
            threeShingles.append([c1, c2, c3])
            
permutations = math.perm(len(uniqueCities), K_SHINGLES)

print("\nUnique cities: ", uniqueCities)
print("Unique items: ", uniqueItems)
print("Unique drivers: ", uniqueDrivers)

standardIds = dfStandard['id'].tolist()
print("standardIds: ", standardIds)

print("\nNumber of cities: ", len(uniqueCities))
print("Number of items: ", len(uniqueItems))

print("\nLongest route: ", longestRoute)
print("Shortest route: ", shortestRoute)

print("\nMax item quantity: ", maxItemQuantity)

print("\nNumber of three-shingles: ", len(threeShingles))

print(f"\n{K_SHINGLES}-shingles: ", math.perm(len(uniqueCities), K_SHINGLES))
print(f"{K_SHINGLES}-shingles: ", math.comb(len(uniqueCities), K_SHINGLES))

print(f"\n\033[92mK-Shingles used: {K_SHINGLES} \033[0m")



Reading standard data...

Reading actual data...

Creating standard dataframe...

Creating actual dataframe...
   id                                              route
0  s0  [{'from': 'Marsala', 'to': 'Fiumicino', 'merch...
1  s1  [{'from': 'Reggio Emilia', 'to': 'Busto Arsizi...
2  s2  [{'from': 'Faenza', 'to': 'Sanremo', 'merchand...
3  s3  [{'from': 'Novara', 'to': 'Milan', 'merchandis...
4  s4  [{'from': 'Avellino', 'to': 'Fiumicino', 'merc...
   id driver sroute                                              route
0  a0    I_2     s0  [{'from': 'Marsala', 'to': 'Aversa', 'merchand...
1  a1    T_3     s0  [{'from': 'Marsala', 'to': 'Fiumicino', 'merch...
2  a2    I_2     s0  [{'from': 'Marsala', 'to': 'Pisa', 'merchandis...
3  a3    R_2     s0  [{'from': 'Marsala', 'to': 'Fiumicino', 'merch...
4  a4    J_3     s0  [{'from': 'Marsala', 'to': 'Fiumicino', 'merch...

Finished preparing standard data

Finished preparing actual data

Unique cities:  ['Altamura', 'Ancona', 'Andria', 'Anz

In [158]:
def hashShingles(shingles, n):
    # hash shingles
    string = "" 
    for shingle in shingles:
        string += str(shingle) + "," # [45, 4, 8] -> "45,4,8,"
    
    return hash(string) #% n

def createShingles(df, k, uniqueCities, uniqueItems, longestRoute, maxItemQuantity, permutations):
    # create shingles for each route
    shingles = []
    for index, s in df.iterrows():
        idS = s['id']
        route = s['route']
        shingle = [index]
        citiesInRoute = [] # napoli roma milano teramo bergamo [10,4,5,48,12] [10,4,5] [4,5,48] [5,48,12]
        merchandiseInRoute = np.zeros(len(uniqueItems))
        for trip in route:
            citiesInRoute.append(uniqueCities.index(trip['from']))
            #merchandiseInRoute += np.array(list(trip['merchandise'].values()))
            for item, n in trip['merchandise'].items():
                merchandiseInRoute[uniqueItems.index(item)] += n
        if len(route) > 0:
            citiesInRoute.append(uniqueCities.index(route[-1]['to']))
        if len(route) > 0:
            merchandiseInRoute = merchandiseInRoute / (maxItemQuantity*len(route))
        
        hashedShingles = []
        for i in range(len(citiesInRoute)-k+1):
            # Q: is it correct to set the modulo for the hash function to the number of permutations?
            # A: yes, because we want to have a unique hash for each shingle
            # Q: would it be better to use a different hash function?
            # A: yes, because the modulo function is not a good hash function
            hashedShingles.append(hashShingles(citiesInRoute[i:i+k], permutations) )
        
        shingle.append(np.array(hashedShingles))
        
        shingle.append(merchandiseInRoute) # quantity hot encoding
        
        shingles.append(shingle)
        
    return shingles # [ index, [shingles], [merchandise] ]

def create_shingles(s, k, uniqueCities, uniqueItems, longestRoute, maxItemQuantity, permutations):
    idS = s['id']
    route = s['route']
    shingle = [s.name]
    citiesInRoute = [] 
    merchandiseInRoute = np.zeros(len(uniqueItems))
    for trip in route:
        citiesInRoute.append(uniqueCities.index(trip['from']))
        for item, n in trip['merchandise'].items():
            merchandiseInRoute[uniqueItems.index(item)] += n
    if len(route) > 0:
        citiesInRoute.append(uniqueCities.index(route[-1]['to']))
    if len(route) > 0:
        merchandiseInRoute = merchandiseInRoute / (maxItemQuantity*len(route))
    
    hashedShingles = []
    for i in range(len(citiesInRoute)-k+1):
        hashedShingles.append(hashShingles(citiesInRoute[i:i+k], permutations))
    
    shingle.append(np.array(hashedShingles))
    shingle.append(merchandiseInRoute)
    
    return shingle

In [159]:
def create_shingles_selfcontained(s, k, uniqueCities, uniqueItems, longestRoute, maxItemQuantity, permutations):
        import numpy as np
        def hash_shingles(shingles):
            # hash shingles
            string = ""
            for shingle in shingles:
                string += str(shingle) + ","
            return hash(string)

        idS = s['id']
        route = s['route']
        shingle = [s.name]
        citiesInRoute = []
        merchandiseInRoute = np.zeros(len(uniqueItems))

        for trip in route:
            citiesInRoute.append(uniqueCities.index(trip['from']))
            for item, n in trip['merchandise'].items():
                merchandiseInRoute[uniqueItems.index(item)] += n

        if len(route) > 0:
            citiesInRoute.append(uniqueCities.index(route[-1]['to']))

        if len(route) > 0:
            merchandiseInRoute = merchandiseInRoute / (maxItemQuantity * len(route))

        hashedShingles = []

        for i in range(len(citiesInRoute) - k + 1):
            hashedShingles.append(hash_shingles(citiesInRoute[i:i + k]))

        shingle.append(np.array(hashedShingles))
        shingle.append(merchandiseInRoute)
        return shingle

In [160]:
standardSets = createShingles(dfStandard, k=K_SHINGLES, uniqueCities=uniqueCities, uniqueItems=uniqueItems, longestRoute=longestRoute, maxItemQuantity=maxItemQuantity, permutations=permutations)
actualSets = createShingles(dfActual, k=K_SHINGLES, uniqueCities=uniqueCities, uniqueItems=uniqueItems, longestRoute=longestRoute, maxItemQuantity=maxItemQuantity, permutations=permutations)

# pandarallel.initialize(progress_bar=True)

# standardSets = dfStandard.parallel_apply(lambda s: create_shingles_selfcontained(s, K_SHINGLES, uniqueCities, uniqueItems, longestRoute, maxItemQuantity, permutations), axis=1)
# standardSets = standardSets.tolist()
# actualSets = dfActual.parallel_apply(lambda s: create_shingles_selfcontained(s, K_SHINGLES, uniqueCities, uniqueItems, longestRoute, maxItemQuantity, permutations), axis=1)
# actualSets = actualSets.tolist()

print("\nstandardSets", len(standardSets), "shape first element", standardSets[0][1].shape, standardSets[0])
print("\nactualSets", len(actualSets),  "shape first element", standardSets[0][1].shape, actualSets[0])

print("\nstandardSets:", len(standardSets))
print("actualSets:", len(actualSets))

assert len(standardSets[0]) == 3, "The length of the standard set is not equal to 3 (index, shingles, merchandise)"
assert len(standardSets[0][2]) == len(uniqueItems), "The length of the merchandise vector is not equal to the number of unique items"


standardSets 100 shape first element (14,) [0, array([-3525746341207040858, -7665491940657465360, -7313224059357137743,
        8641309246532801639,  2355252672371967007,  7053823274113899311,
        7633385113691996908, -4346088795957614939,  -979465566118392045,
        8963986723228981468, -6191692091795537175, -2409031165579898302,
       -5931454765039319399,  7672105062331123626], dtype=int64), array([0.16857143, 0.19428571, 0.17428571, 0.10928571, 0.38285714,
       0.29357143, 0.37857143, 0.41571429, 0.27928571, 0.34571429,
       0.15928571, 0.27642857, 0.27428571, 0.38714286, 0.25214286,
       0.325     , 0.24357143, 0.245     , 0.19642857, 0.18785714,
       0.31714286, 0.29      , 0.31642857, 0.16214286, 0.26214286,
       0.16071429, 0.21571429, 0.22357143, 0.24714286, 0.13714286,
       0.215     , 0.21928571, 0.135     , 0.25857143, 0.13      ,
       0.27785714, 0.23785714, 0.43357143, 0.33142857, 0.21785714,
       0.36928571, 0.28      , 0.22142857, 0.29428571, 0.3

### FUNCTIONS

In [161]:
def jaccard_similarity_matrix(matrix):
    intersection = np.dot(matrix, matrix.T)
    row_sums = matrix.sum(axis=1)
    union = row_sums[:, None] + row_sums - intersection
    union = np.where(union == 0, 1, union)  # avoid division by zero
    jaccard_similarity = intersection / union
    return jaccard_similarity

def jaccard_similarity_two_matrices(matrix1, matrix2):
    #intersection = np.dot(matrix, matrix.T)
    intersection = np.dot(matrix1, matrix2.T)
    row_sums1 = matrix1.sum(axis=1)
    row_sums2 = matrix2.sum(axis=1)
    union = row_sums1[:, None] + row_sums2 - intersection
    union = np.where(union == 0, 1, union)  # avoid division by zero
    jaccard_similarity = intersection / union
    return jaccard_similarity

In [162]:
import numpy as np
from scipy.sparse import csr_matrix

def jaccard_similarity_sparse(matrix1, matrix2):
    # Convert dense matrices to sparse matrices (CSR format)
    sparse_matrix1 = csr_matrix(matrix1)
    sparse_matrix2 = csr_matrix(matrix2)

    # Matrix multiplication in CSR format
    intersection = sparse_matrix1.dot(sparse_matrix2.T).toarray()

    # Row sums using CSR format
    row_sums1 = sparse_matrix1.sum(axis=1).A.ravel()
    row_sums2 = sparse_matrix2.sum(axis=1).A.ravel()

    # Calculate union using the correct formula
    union = row_sums1 + row_sums2 - intersection

    # Avoid division by zero
    union = np.where(union == 0, 1, union)

    # Jaccard similarity
    jaccard_similarity = intersection / union
    return jaccard_similarity


In [163]:
def jaccard_similarity_matrix_merch(matrix):
    print("matrix", matrix.shape)
    min_matrix = np.minimum(matrix[:, None, :], matrix[None, :, :])
    sum_min_matrix = np.sum(min_matrix, axis=-1)
    print("sum_min_matrix", sum_min_matrix.shape)
    
    max_matrix = np.maximum(matrix[:, None, :], matrix[None, :, :])
    sum_max_matrix = np.sum(max_matrix, axis=-1)
    print("sum_max_matrix", sum_max_matrix.shape)
    
    jaccard_similarity = sum_min_matrix / sum_max_matrix
    return jaccard_similarity

def jaccard_similarity_matrices_merch(matrix1, matrix2):
    print("matrix1", matrix1.shape)
    print("matrix2", matrix2.shape)
    
    min_matrix = np.minimum(matrix1[:, None, :], matrix2[None, :, :])
    sum_min_matrix = np.sum(min_matrix, axis=-1)
    print("sum_min_matrix", sum_min_matrix.shape)
    
    max_matrix = np.maximum(matrix1[:, None, :], matrix2[None, :, :])
    sum_max_matrix = np.sum(max_matrix, axis=-1)
    print("sum_max_matrix", sum_max_matrix.shape)
    
    jaccard_similarity = sum_min_matrix / sum_max_matrix
    return jaccard_similarity


def create_binary_matrix(routeSets):
    # create binary matrix where each row represents a route
    uniqueShingles = list(set([shingle for route in routeSets for shingle in route[1]]))
    binaryMatrix = np.zeros((len(routeSets), len(uniqueShingles)))
    for i, route in enumerate(routeSets):
        for shingle in route[1]:
            binaryMatrix[i][uniqueShingles.index(shingle)] = 1
    return binaryMatrix



In [164]:
def hash_function_hash_code(num_of_hashes,n_col,next_prime):
  
    #coeffA = np.array(pick_random_coefficients(num_of_hashes,max_column_length)).reshape((num_of_hashes,1))
    #coeffB = np.array(pick_random_coefficients(num_of_hashes,max_column_length)).reshape((num_of_hashes,1))

    coeffA = np.array(random.sample(range(0,n_col*100),num_of_hashes)).reshape((num_of_hashes,1))
    coeffB = np.array(random.sample(range(0,n_col*100),num_of_hashes)).reshape((num_of_hashes,1))

    x = np.arange(n_col).reshape((1,n_col))

    hash_code = (np.matmul(coeffA,x) + coeffB) % next_prime # (num_of_hashes,n_col) so how each column index is permuted

    return hash_code

def minhash(u,num_of_hashes):
    (n_row, n_col) = u.shape
    next_prime = n_col
    hash_code = hash_function_hash_code(num_of_hashes,n_col,next_prime)

    signature_array = np.empty(shape = (n_row,num_of_hashes))

    #t2 = time.time()
    for row in tqdm(range(n_row), desc="minhashing"):
        #print("row", row)
        ones_index = np.where(u[row,:]==1)[0]
        #if len(ones_index) == 0:
        signature_array[row,:] = np.zeros((1,num_of_hashes))
            #continue
        corresponding_hashes = hash_code[:,ones_index]
        #print("ones_index", ones_index.shape, ones_index)
        #print("corresponding_hashes", corresponding_hashes.shape, corresponding_hashes)
        row_signature = np.amin(corresponding_hashes,axis=1).reshape((1,num_of_hashes))

        signature_array[row,:] = row_signature

    return signature_array

## NEW FUNCTIONS FOR TASK 2

In [165]:
def create_binary_matrices(routeSet1, routeSet2):
    # create binary matrix where each row represents a route
    uniqueShinglesBoth = list(set([shingle for route in routeSet1 for shingle in route[1]] + [shingle for route in routeSet2 for shingle in route[1]]))
    binaryMatrix1 = np.zeros((len(routeSet1), len(uniqueShinglesBoth)))
    binaryMatrix2 = np.zeros((len(routeSet2), len(uniqueShinglesBoth)))
    for i, route in enumerate(routeSet1):
        for shingle in route[1]:
            binaryMatrix1[i][uniqueShinglesBoth.index(shingle)] = 1
            
    for i, route in enumerate(routeSet2):
        for shingle in route[1]:
            binaryMatrix2[i][uniqueShinglesBoth.index(shingle)] = 1
    return binaryMatrix1, binaryMatrix2

def find_num_hashes_minhash(matrix):
    if matrix.shape[1]<150:
        num_hash_functions = matrix.shape[1]
    elif matrix.shape[1]<500:
        num_hash_functions = matrix.shape[1]//2
    elif matrix.shape[1] < 1000:
        num_hash_functions = matrix.shape[1]//10
    elif matrix.shape[1] < 10_000:
        num_hash_functions = 150
    elif matrix.shape[1] < 100_000:
        num_hash_functions = 250
    else:
        num_hash_functions = 300
    return num_hash_functions

In [166]:
def hash_function_hash_code(num_of_hashes,n_col,next_prime):

    coeffA = np.array(random.sample(range(0,n_col*100),num_of_hashes)).reshape((num_of_hashes,1))
    coeffB = np.array(random.sample(range(0,n_col*100),num_of_hashes)).reshape((num_of_hashes,1))

    x = np.arange(n_col).reshape((1,n_col))

    hash_code = (np.matmul(coeffA,x) + coeffB) % next_prime # (num_of_hashes,n_col) so how each column index is permuted

    return hash_code

# def minhash(u,num_of_hashes):
#     (n_row, n_col) = u.shape
#     next_prime = n_col
#     hash_code = hash_function_hash_code(num_of_hashes,n_col,next_prime)

#     signature_array = np.empty(shape = (n_row,num_of_hashes))

#     #t2 = time.time()

#     for row in tqdm(range(n_row), desc="minhashing"):
#         #print("row", row)
#         ones_index = np.where(u[row,:]==1)[0]
#         #if len(ones_index) == 0:
#         signature_array[row,:] = np.zeros((1,num_of_hashes))
#             #continue
#         corresponding_hashes = hash_code[:,ones_index]
#         #print("ones_index", ones_index.shape, ones_index)
#         #print("corresponding_hashes", corresponding_hashes.shape, corresponding_hashes)
#         row_signature = np.amin(corresponding_hashes,axis=1).reshape((1,num_of_hashes))

#         signature_array[row,:] = row_signature

#     return signature_array

def minhash_matrices(matrix1,matrix2,num_of_hashes):
    (n_row, n_col) = matrix1.shape
    next_prime = n_col
    hash_code = hash_function_hash_code(num_of_hashes,n_col,next_prime)

    signature1_array = np.empty(shape = (n_row,num_of_hashes))
    signature2_array = np.empty(shape = (matrix2.shape[0],num_of_hashes))

    #t2 = time.time()

    for row in tqdm(range(n_row), desc="minhashing"):
        #print("row", row)
        ones_index = np.where(matrix1[row,:]==1)[0]
        #if len(ones_index) == 0:
        signature1_array[row,:] = np.zeros((1,num_of_hashes))

            #continue
        corresponding_hashes = hash_code[:,ones_index]
        #print("ones_index", ones_index.shape, ones_index)
        #print("corresponding_hashes", corresponding_hashes.shape, corresponding_hashes)
        row_signature = np.amin(corresponding_hashes,axis=1).reshape((1,num_of_hashes))

        signature1_array[row,:] = row_signature

    for row in tqdm(range(matrix2.shape[0]), desc="minhashing"):
        #print("row", row)
        ones_index = np.where(matrix2[row,:]==1)[0]
        #if len(ones_index) == 0:
        signature2_array[row,:] = np.zeros((1,num_of_hashes))

            #continue
        corresponding_hashes = hash_code[:,ones_index]
        #print("ones_index", ones_index.shape, ones_index)
        #print("corresponding_hashes", corresponding_hashes.shape, corresponding_hashes)
        row_signature = np.amin(corresponding_hashes,axis=1).reshape((1,num_of_hashes))

        signature2_array[row,:] = row_signature

    return signature1_array, signature2_array

def find_band_and_row_values(columns, threshold):
    previous_b = 1
    previous_r = columns
    for b in range(1, columns + 1):
        if columns % b == 0:
            r = columns // b
            if (1 / b) ** (1 / r)  <= threshold:
                if np.abs((1 / previous_b) ** (1 / previous_r) - threshold) < np.abs((1 / b) ** (1 / r) - threshold):
                    return previous_b, previous_r
                return b, r
    return columns, 1

In [167]:
def jaccard_similarity_minhash_lsh_route_merch(matrix, matrixMerch, thresh_user=0.2):
    #similarity_matrix = csr_matrix((matrix.shape[0], matrix.shape[0]), dtype=np.float64)
    #similarity_matrix = lil_matrix((matrix.shape[0], matrix.shape[0]), dtype=np.float64)
    pairs = lsh(matrix, thresh_user=thresh_user)
    #uniqueRows = np.unique([i for i, j in pairs] + [j for i, j in pairs])
    uniqueRowsSet = set([i for i, j in pairs] + [j for i, j in pairs]) # (1,2) (1,4) (1,5)
    neverSeen = set([i for i in range(matrix.shape[0])]) - uniqueRowsSet
    print("neverSeen", neverSeen)
    #print("uniqueRows numpy", len(uniqueRows))
    print("num of subset of rows to check similarity:", len(uniqueRowsSet))
    #print(" num of pairs", len(uniqueRowsSet)*(len(uniqueRowsSet)-1)/2)
    print(" num of pairs", len(pairs))
    print(" instead of", matrix.shape[0]*(matrix.shape[0]-1)/2)
    print("improved by", (1 - len(pairs) / (matrix.shape[0]*(matrix.shape[0]-1)/2)) *100, "%")
    
        
    print("Computing jaccard similarity on subset matrix...")
    #print("subset matrix", subset_matrix.shape)

    # with ProgressBar(total=len(pairs)) as progress:
    #     distance_pairs = compute_subset_similarity_matrix_only_pairs(matrix, matrixMerch, pairs, progress)
    
    sortedUniqueRowsSet = sorted(list(uniqueRowsSet))
    subset_matrix = matrix[sortedUniqueRowsSet]
    subset_matrixMerch = matrixMerch[sortedUniqueRowsSet]
    print("subset_matrix", subset_matrix.shape, subset_matrix[0])
    print("subset_matrixMerch", subset_matrixMerch.shape, subset_matrixMerch[0])
    with ProgressBar(total=len(sortedUniqueRowsSet)) as progress:
        subset_sim_matrix = compute_subset_similarity_matrix_and_merch(subset_matrix, subset_matrixMerch, progress)
    print("subset_sim_matrix", subset_sim_matrix.shape, subset_sim_matrix[0])
    print("subset_sim_matrix contains nan", np.isnan(subset_sim_matrix).any())
    print("nan indices", len(np.argwhere(np.isnan(subset_sim_matrix))), np.argwhere(np.isnan(subset_sim_matrix)))
    
    # if len(neverSeen) > 0:
    #     for i, n in enumerate(neverSeen):
    #         distance_pairs = np.concatenate([distance_pairs, [1]*(matrix.shape[0]-1-i)])
        
    #     pairs = np.concatenate([pairs, np.array([[i, j] for i,n  in enumerate(neverSeen) for j in range(i, matrix.shape[0]) if i != j])])
    #print("pairs", pairs.shape, pairs[-10:])
    # map back to original matrix
    print("Mapping back to original matrix...")
    
    lenMatrixNoNeverSeen = matrix.shape[0] - len(neverSeen)
    
    # remove never seen rows and map indices
    map_indices = {}
    sortedNeverSeen = sorted(list(neverSeen))
    counter = 0
    for i in range(matrix.shape[0]):
        if i in sortedNeverSeen:
            continue
        map_indices[i] = counter
        counter += 1
        
    print("map_indices", map_indices)
    map_indices_back = {v: k for k, v in map_indices.items()}
    
  
    subset_sim_matrix = csr_matrix(subset_sim_matrix)
    
    return subset_sim_matrix, map_indices_back

In [168]:
def lsh(minhash_matrix, thresh_user=0.2):
    # Initialize the signature matrix
    columns = minhash_matrix.shape[1]
    
    # Generate the hash functions
   # hash_functions = [lambda x, a=a, b=b: (a * x + b) % minhash_matrix.shape[1] for a, b in zip(random.sample(range(1000), bands), random.sample(range(1000), bands))]
    hash_function = lambda x: hash(",".join([str(x[i]) for i in range(len(x))]))
    
    # b = bands
    # r = columns//bands
    b, r = find_band_and_row_values(columns, thresh_user)
    # If columns is not divisible by bands
    if columns % b != 0:
        # Find the closest number that makes it divisible
        while columns % b != 0:
            b -= 1
        r = columns // b
    #bands = b
        
    print("final bands", b)
    signature_matrix = np.full((minhash_matrix.shape[0], b), np.inf)
    
    # if threshold is 0.8,
    threshold = (1 / b) ** (1 / r) 
    print("lsh threshold", threshold)
    
    # For each band
    print("Computing hash values of bands...")
    hash_values = np.apply_along_axis(lambda x: hash_function(x) % minhash_matrix.shape[0], 1, minhash_matrix.reshape(-1, r))
    # Reshape the hash values to match the signature matrix
    hash_values = hash_values.reshape(minhash_matrix.shape[0], b)
    # Update the signature matrix
    signature_matrix = hash_values
            
    # find candidate pairs
    print("Finding candidate pairs...")
    candidate_pairs = []
    for i in tqdm(range(signature_matrix.shape[0])):
        # Compute the similarity of the current row with all following rows
        similarities = np.sum(signature_matrix[i+1:, :] == signature_matrix[i, :], axis=1) / b
        # Find the indices of the rows that have a similarity greater than or equal to the threshold
        indices = np.nonzero(similarities >= threshold)[0]
        # Add the pairs to the candidate pairs
        candidate_pairs.extend((i, i+1+index) for index in indices)
    
    return np.array(candidate_pairs)

def lsh_two_matrices(minhash_matrix1, minhash_matrix2, thresh_user=0.2):
    # Initialize the signature matrix
    columns = minhash_matrix1.shape[1]
    
    # Generate the hash functions
    # hash_functions = [lambda x, a=a, b=b: (a * x + b) % minhash_matrix.shape[1] for a, b in zip(random.sample(range(1000), bands), random.sample(range(1000), bands))]
    # hash_function = lambda x: hash(",".join([str(x[i]) for i in range(len(x))]))
    
    def hash_function(x):
        # print("x",x)
        var = hash(",".join([str(x[i]) for i in range(len(x))]))
        # print ("str x ", (",".join([(x[i]) for i in range(len(x))])))
        # print ("var", var)
        return var % minhash_matrix1.shape[0]


    # b = bands
    # r = columns//bands
    b, r = find_band_and_row_values(columns, thresh_user)
    # If columns is not divisible by bands
    if columns % b != 0:
        # Find the closest number that makes it divisible
        while columns % b != 0:
            b -= 1
        r = columns // b
        
    print("final bands", b)
    signature_matrix1 = np.full((minhash_matrix1.shape[0], b), np.inf)
    signature_matrix2 = np.full((minhash_matrix2.shape[0], b), np.inf)
    

    threshold = (1 / b) ** (1 / r) 
    print("lsh threshold", threshold)
    
    # # For each band
    # print("Computing hash values of bands...")
    # hash_values1 = np.apply_along_axis(lambda x: hash_function(x) % minhash_matrix1.shape[0], 1, minhash_matrix1.reshape(-1, r))
    # print("hash_values1", hash_values1.shape, hash_values1)
    # hash_values2 = np.apply_along_axis(lambda x: hash_function(x) % minhash_matrix2.shape[0], 1, minhash_matrix2.reshape(-1, r))
    # print("hash_values2", hash_values2.shape, hash_values2)

    print("minhash_matrix1.reshape(-1, r).shape",minhash_matrix1.reshape(-1, r).shape)

    # For each band
    print("Computing hash values of bands...")
    hash_values1 = np.apply_along_axis(hash_function, 1, minhash_matrix1.reshape(-1, r))
    # print("hash_values1", hash_values1.shape, hash_values1)
    hash_values2 = np.apply_along_axis(hash_function, 1, minhash_matrix2.reshape(-1, r))
    # print("hash_values2", hash_values2.shape, hash_values2)


    # Reshape the hash values to match the signature matrix
    hash_values1 = hash_values1.reshape(minhash_matrix1.shape[0], b)
    # print("hash_values1", hash_values1.shape, hash_values1)
    hash_values2 = hash_values2.reshape(minhash_matrix2.shape[0], b)
    # print("hash_values2", hash_values2.shape, hash_values2) 
    # Update the signature matrix
    signature_matrix1 = hash_values1
    signature_matrix2 = hash_values2
    
    
    # find candidate pairs
    print("Finding candidate pairs...")
    # similarities_actual=[]
    # candidate_pairs = np.empty((minhash_matrix1.shape[0], 2))

    data=[]
    rows=[]
    cols=[]

    for i in tqdm(range(signature_matrix1.shape[0])):
        # Compute the similarity of the current row with all following rows
        similarities = np.sum(signature_matrix2 == signature_matrix1[i, :], axis=1) / b
        # print("similarities", similarities.shape, similarities)
        # Find the indices of the rows that have a similarity greater than or equal to the threshold
        indices = np.nonzero(similarities >= threshold)[0]
        # print("indices", indices.shape, indices)

        # print("similarities[indices] ",similarities[indices])

        data.extend(similarities[indices])
        # print("data", data)
        rows.extend([i]*len(indices))
        # print("rows", rows)
        cols.extend(indices)
        # print("cols", cols)
        # indexMax = np.argmax(similarities)
        # simMax = similarities[indexMax]
        # # Add the pairs to the candidate pairs
        # #candidate_pairs.extend((i, i+1+index) for index in indices)
        # candidate_pairs[i] = [indexMax, simMax]
        # similarities_actual.append(similarities)

        

    # # Create data array for COO matrix
    # data = np.concatenate([subset_sim_matrix[indices_i, indices_j], subset_sim_matrix[indices_i, indices_j]])
    
    # # Create row and column index arrays for COO matrix
    # rows = np.concatenate([indices_i_mapped, indices_j_mapped])
    # cols = np.concatenate([indices_j_mapped, indices_i_mapped])
    # print("data", data)
    # print("rows", rows)
    # print("cols", cols)

    similarity_matrix = coo_matrix((data, (rows, cols)), shape=(minhash_matrix1.shape[0], minhash_matrix2.shape[0])).tocsr()

    return similarity_matrix





In [169]:
a = np.array([[1],[2],[3]])
b = np.array([[1],[2],[2]])
print("a==b", a[:]==b[:])
similarities = np.sum(b == a, axis=1)/3
print("similarities", similarities)
threshold = 0.1
indices = np.nonzero(similarities >= threshold)[0]
print("indices", indices)

a==b [[ True]
 [ True]
 [False]]
similarities [0.33333333 0.33333333 0.        ]
indices [0 1]


In [170]:
@jit(cache=True, nogil=True, parallel=True)
def compute_distance_pairs_Merch(sim_matrix, matrix1, matrix1Merch, matrix2, matrix2Merch, progress_proxy):
    n = sim_matrix.shape[0]
    m = sim_matrix.shape[1]
    
    # print("sim_matrix", sim_matrix.shape)    
    # print(numba.typeof(sim_matrix))
    # print(numba.typeof(matrix1))
    # print(numba.typeof(matrix1Merch))
    # print(numba.typeof(matrix2))
    # print(numba.typeof(matrix2Merch))
    # print(numba.typeof(progress_proxy))


    for i in prange(n):
        subset1 = matrix1[i].reshape(1, -1) #replicate_row(subset_matrix, i) 
        # print("subset1", subset1.shape)
        subset2 = matrix2[sim_matrix[i].nonzero()[1]]
        # print("subset2", subset2.shape)
        min_matrix = np.minimum(subset1, subset2)
        sum_min_matrix = np.sum(min_matrix, axis=-1)
        
        max_matrix = np.maximum(subset1, subset2)
        sum_max_matrix = np.sum(max_matrix, axis=-1)

        route_distance = (np.divide(sum_min_matrix, sum_max_matrix))
        # print("route_distance", route_distance.shape)

        subset1Merch = matrix1Merch[i].reshape(1, -1) #replicate_row(subset_matrixMerch, i)
        subset2Merch = matrix2Merch[sim_matrix[i].nonzero()[1]]
        # print("subset1Merch", subset1Merch.shape)
        # print("subset2Merch", subset2Merch.shape)

        min_matrixMerch = np.minimum(subset1Merch, subset2Merch)
        sum_min_matrixMerch = np.sum(min_matrixMerch, axis=-1)

        max_matrixMerch = np.maximum(subset1Merch, subset2Merch)
        sum_max_matrixMerch = np.sum(max_matrixMerch, axis=-1)

        merch_distance = (np.divide(sum_min_matrixMerch, sum_max_matrixMerch))
        # print("merch_distance", merch_distance.shape)



        sim_matrix[i,sim_matrix[i].nonzero()[1]] = (0.8) * route_distance + (0.2) * merch_distance

        progress_proxy.update(1)
    
    return sim_matrix


def similarity_minhash_lsh_two_matrices(matrix1, matrix1Merch, matrix2, matrix2Merch, thresh_user=0.2):
    
    similarity_matrix = lsh_two_matrices(matrix1,matrix2, thresh_user=thresh_user)
    # print("similarity_matrix", similarity_matrix.shape, similarity_matrix[0])
    # print("similarity_matrix", similarity_matrix.shape, similarity_matrix[1])
    # print("similarity_matrix", similarity_matrix.shape, similarity_matrix[101])
    # print("similarity_matrix[0]", similarity_matrix[0,0])
    # print("similarity_matrix[9,9]", similarity_matrix[9,9])

    # uniqueRowsSet = set([i for i, j in pairs] + [j for i, j in pairs]) # (1,2) (1,4) (1,5)
    # neverSeen = set([i for i in range(matrix1.shape[0])]) - uniqueRowsSet
    

    # sortedUniqueRowsSet = sorted(list(uniqueRowsSet))
    # print("sortedUniqueRowsSet", sortedUniqueRowsSet)

    # subset_matrix1 = matrix1[sortedUniqueRowsSet]
    # subset_matrix1Merch = matrix1Merch[sortedUniqueRowsSet]
    # print("subset_matrix1", subset_matrix1.shape, subset_matrix1[0])
    # print("subset_matrix1Merch", subset_matrix1Merch.shape, subset_matrix1Merch[0])

    # subset_matrix2 = matrix2[sortedUniqueRowsSet]
    # subset_matrix2Merch = matrix2Merch[sortedUniqueRowsSet]
    # print("subset_matrix2", subset_matrix2.shape, subset_matrix2[0])
    # print("subset_matrix2Merch", subset_matrix2Merch.shape, subset_matrix2Merch[0])

    # subset_similarity_matrix = np.full((subset_matrix1.shape[0], subset_matrix2.shape[0]), np.inf)
        
    print("Computing distance  on subset matrix...")
    with ProgressBar(total=matrix1.shape[0]) as progress:
        similarity_matrix = compute_distance_pairs_Merch(similarity_matrix, matrix1, matrix1Merch, matrix2, matrix2Merch, progress)
    
    print("similarity_matrix", similarity_matrix.shape, similarity_matrix[0])
    print("similarity_matrix", similarity_matrix.shape, similarity_matrix[1])
    print("similarity_matrix", similarity_matrix.shape, similarity_matrix[101])

    return similarity_matrix

  @jit(cache=True, nogil=True, parallel=True)


## task 2

In [171]:
# convert routes and merchandise to binary matrices

print("Creating route binary matrix...")
route_matrix, route_matrix_standard = create_binary_matrices(actualSets, standardSets)
print("\nroute_matrix actual", route_matrix.shape, route_matrix[0])
print("\nroute_matrix standard", route_matrix_standard.shape, route_matrix_standard[0])
num_hash_functions = find_num_hashes_minhash(route_matrix)

print("Minhashing route matrix...")    
route_matrix, route_matrix_standard = minhash_matrices(route_matrix, route_matrix_standard, num_hash_functions if num_hash_functions % 2 == 0 else num_hash_functions + 1)
print("\nroute_matrix minhash", route_matrix.shape, route_matrix[0])
print("\nroute_matrix_standard minhash", route_matrix_standard.shape, route_matrix_standard[0])


print("\nACTUAL")

print("\nCreating merchandise binary matrix...")
merch_matrix = np.array([s[2] for s in actualSets])
print("\nmerch_matrix", merch_matrix.shape, merch_matrix[0])
print("merch_matrix contains nan", np.isnan(merch_matrix).any())

print("\nSTANDARD")

print("\nCreating merchandise binary matrix...")
merch_matrix_standard = np.array([s[2] for s in standardSets])
print("\nmerch_matrix_standard", merch_matrix.shape, merch_matrix[0])
print("merch_matrix_standard contains nan", np.isnan(merch_matrix).any())

# Essentials for Task 2
# standardToActualSetsDistances = None
print("Computing distance route matrix actual to standard...")

# route_matrix_distance_actual_standard, map_indices_back = distance_minhash_lsh_two_matrices(route_matrix, merch_matrix, route_matrix_standard, merch_matrix_standard, thresh_user=0.0)
# print("\nroute_similarity_standard_to_actual", route_similarity_standard_to_actual.shape, route_similarity_standard_to_actual[0])

# merch_similarity_lsh_standard_to_actual = jaccard_similarity_minhash_lsh_two_matrices(merch_matrix, merch_matrix_standard, thresh_user=0.0)



Creating route binary matrix...

route_matrix actual (100000, 2450) [0. 0. 0. ... 0. 0. 0.]

route_matrix standard (100, 2450) [0. 0. 0. ... 0. 0. 0.]
Minhashing route matrix...


minhashing: 100%|██████████| 100000/100000 [00:03<00:00, 25804.19it/s]
minhashing: 100%|██████████| 100/100 [00:00<00:00, 20052.13it/s]



route_matrix minhash (100000, 150) [271. 220. 105.  41.  20. 234. 185. 179. 202. 103.  51.  27. 235. 137.
 595. 204. 143. 313. 284.  35.  82. 399. 259.  91. 105. 129.   2.  32.
  19. 127. 239. 189. 100. 124. 102.   3. 294. 213. 224.  42. 433.  75.
 306.  12.  83. 168. 145. 122.   4. 163.  47.  87.  16. 446.   9.  17.
 173.  71.  49. 422.   6.  40.  18. 252.  37. 105. 192.  75.  65. 102.
 103.  77.  57. 104. 227. 222.  81.  19.   7.  55.  71. 452. 236.  96.
 193. 101. 114. 181.  22.   1. 160.  72. 258.  30. 125. 171.  52.  85.
  65. 191.  18. 105.   6.   5.   9. 207.  50.  25.  35.   7.  71. 235.
 315.  99. 231. 221.  54. 349. 353.  67.  74. 124. 124. 116.  75. 616.
 248. 391. 232.  71. 297.  67.   0. 512.  10. 366. 735. 106.  36. 184.
  44.  49. 107. 118.   6.  51.  28.  72. 161. 111.]

route_matrix_standard minhash (100, 150) [ 49. 220. 105.  41.  20. 234. 185.  66. 202. 103.  51.  27. 235. 137.
 595.  92. 143. 313. 159.  35.  38. 448. 259.  87. 105. 129.   2.  32.
  19. 489. 239. 18

In [173]:
route_matrix_similarity_actual_standard = similarity_minhash_lsh_two_matrices(route_matrix, merch_matrix, route_matrix_standard, merch_matrix_standard, thresh_user=0.3)

final bands 50
lsh threshold 0.2714417616594907
minhash_matrix1.reshape(-1, r).shape (5000000, 3)
Computing hash values of bands...
Finding candidate pairs...


100%|██████████| 100000/100000 [00:04<00:00, 24334.50it/s]


Computing distance  on subset matrix...


  0%|          | 0/100000 [00:00<?, ?it/s]

similarity_matrix (100000, 100)   (0, 0)	0.743368461497973
similarity_matrix (100000, 100)   (0, 0)	0.8540209154717022
similarity_matrix (100000, 100) 


In [None]:
print("\nroute_matrix_distance_actual_standard", route_matrix_similarity_actual_standard[99:101])

max_value = np.max(route_matrix_similarity_actual_standard, axis=1).toarray()
max_value = np.where(max_value == 0, np.nan, max_value)
max_value_index = np.argmax(route_matrix_similarity_actual_standard, axis=1)
max_value_index = np.where(max_value == 0, -1, max_value_index)

print("max_value_index", max_value_index.shape, max_value_index[90:110])
print("max_value", max_value.shape, max_value[95])


route_matrix_distance_actual_standard   (0, 0)	0.7943723123588063
  (0, 6)	0.39613156552462486
  (0, 8)	0.40712183477695896
  (0, 13)	0.38354297957292505
  (0, 20)	0.3663551152642981
  (0, 26)	0.4309464312930923
  (0, 48)	0.391317898655325
  (0, 55)	0.39689413448374367
  (0, 61)	0.39148177081194824
  (0, 68)	0.3812110799451795
  (0, 71)	0.3726452071224856
  (0, 92)	0.39643882219327
  (0, 94)	0.37931964131836315
  (0, 98)	0.40809749488041847
  (1, 1)	0.8406808355055525
  (1, 10)	0.4226370058855883
  (1, 24)	0.3427458699151874
  (1, 26)	0.4173460254082969
  (1, 30)	0.37054010071932186
  (1, 55)	0.37095017875611724
  (1, 59)	0.4133797264672371
  (1, 66)	0.412147634798986
  (1, 74)	0.41635970557553015
  (1, 79)	0.3847846639911346
  (1, 80)	0.37926216480462926
  (1, 89)	0.40944330885767644
  (1, 92)	0.37988493561911374
max_value_index (10000, 1) [[0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]]
max_value (10000, 1) [0.79219129]


In [None]:
print("Index of the biggest value for each row:", max_value_index[850])
print("Value of the biggest value for each row:", max_value[850])


Index of the biggest value for each row: [8]
Value of the biggest value for each row: [0.77611916]


In [None]:

print("len(max_value_index)", len(max_value_index))
print("len(max_value)", len(max_value))

# [index for index, s in dfActual.iterrows() if s['driver'] == driver]
# uniqueDrivers
driver_indices = {}

for i, s in dfActual.iterrows():
    driver = s['driver']
    
    # Check if the driver is already in the dictionary
    if driver in driver_indices:
        # If yes, append the index to the existing array
        driver_indices[driver].append(i)
    else:
        # If not, create a new array with the current index
        driver_indices[driver] = [i]

print("driver_indices", driver_indices)
print("len(driver_indices)", len(driver_indices))

len(max_value_index) 10000
len(max_value) 10000
driver_indices {'H_0': [0, 12, 23, 175, 184, 461, 514, 523, 575, 665, 950, 1078, 1113, 1116, 1206, 1217, 1244, 1497, 1558, 1674, 1726, 1747, 1839, 1853, 1861, 1877, 1990, 2048, 2148, 2214, 2270, 2296, 2323, 2403, 2432, 2462, 2814, 2862, 2918, 3245, 3326, 3462, 3543, 3706, 3889, 4081, 4119, 4126, 4178, 4230, 4332, 4477, 4555, 4632, 4677, 4693, 4747, 5008, 5073, 5086, 5120, 5245, 5317, 5394, 5485, 5516, 5659, 5750, 5857, 5858, 6109, 6262, 6340, 6500, 6544, 6572, 6653, 6678, 6731, 6946, 7053, 7168, 7247, 7363, 7374, 7626, 7905, 7942, 8150, 8252, 8270, 8297, 8343, 8488, 8588, 8599, 8698, 8715, 9041, 9061, 9082, 9104, 9222, 9227, 9373, 9481, 9653, 9805, 9853, 9962, 9988, 9999], 'J_0': [1, 86, 418, 531, 607, 663, 937, 1096, 1104, 1510, 1587, 1725, 1776, 1921, 2015, 2377, 2408, 2429, 2610, 2614, 2733, 2755, 2815, 2929, 3188, 3235, 3264, 3281, 3335, 3452, 3472, 3492, 3519, 3544, 3658, 3668, 3685, 3690, 3806, 3876, 4088, 4098, 4148, 4208, 4354, 43

In [None]:
# Create a dictionary of all drivers' routes
drivers_routes = {}

for driver in uniqueDrivers:
    print("driver", driver)

    driver_standard_index = np.array(max_value_index[driver_indices[driver]])
    driver_max_value = np.array(max_value[driver_indices[driver]])

    # Assuming driver_standard_index is a NumPy array
    unique_values_index = np.unique(driver_standard_index[~np.isnan(driver_standard_index)]).astype(int)
    # print("unique_values_index", unique_values_index)

    # Calculate the mean for each unique value
    # means = [np.mean(driver_max_value[driver_standard_index == idx]) for idx in unique_values_index if idx != np.nan]
    # print("means", means)

    weighted_sums = [np.sum(driver_max_value[driver_standard_index == idx]) * np.count_nonzero(driver_standard_index == idx) for idx in unique_values_index if not np.isnan(idx)]
    # print("weighted_sums", weighted_sums)

    best_route_Ids = []
    # Print the results for each driver
    for idx, mean in zip(unique_values_index, weighted_sums):
        # print(f"Driver: {driver}, Unique Value: {idx}, Mean: {mean}")
        best_route_Ids.append([standardIds[idx], mean])

    # Sort the routes by their mean
    best_route_Ids.sort(key=lambda x: x[1], reverse=True)
    # print("best_route_Ids", best_route_Ids)
    
    # Keep the top 5 routes
    top_5_routes = best_route_Ids[:5]

    # Update the driver's routes in the dictionary
    drivers_routes[driver] = {'driver': driver, 'routes': [id for id,value in top_5_routes]}

# Convert the dictionary to a list for JSON serialization
result_list = list(drivers_routes.values())

# Write the result to driver.json
with open(os.path.join('results', 'driver.json'), 'w') as outfile:
    json.dump(result_list, outfile, ensure_ascii=False ,indent=2)

print("JSON driver data has been written to results/driver.json")

driver A_0
driver A_1
driver A_2
driver A_3
driver B_0
driver B_1
driver B_2
driver B_3
driver C_0
driver C_1
driver C_2
driver C_3
driver D_0
driver D_1
driver D_2
driver D_3
driver E_0
driver E_1
driver E_2
driver E_3
driver F_0
driver F_1
driver F_2
driver F_3
driver G_0
driver G_1
driver G_2
driver G_3
driver H_0
driver H_1
driver H_2
driver H_3
driver I_0
driver I_1
driver I_2
driver I_3
driver J_0
driver J_1
driver J_2
driver J_3
driver K_0
driver K_1
driver K_2
driver K_3
driver L_0
driver L_1
driver L_2
driver L_3
driver M_0
driver M_1
driver M_2
driver M_3
driver N_0
driver N_1
driver N_2
driver N_3
driver O_0
driver O_1
driver O_2
driver O_3
driver P_0
driver P_1
driver P_2
driver P_3
driver Q_0
driver Q_1
driver Q_2
driver Q_3
driver R_0
driver R_1
driver R_2
driver R_3
driver S_0
driver S_1
driver S_2
driver S_3
driver T_0
driver T_1
driver T_2
driver T_3
driver U_0
driver U_1
driver U_2
driver U_3
driver V_0
driver V_1
driver V_2
driver V_3
driver W_0
driver W_1
driver W_2

## STANDARD ROUTES x STANDARD ROUTES

In [None]:
x+=lol

NameError: name 'x' is not defined

In [None]:
@njit(cache=True, nogil=True, parallel=True)
def compute_subset_similarity_matrix_and_merch(matrix, matrixMerch, progress_proxy):
    n = matrix.shape[0]
    n1 = matrix.shape[1]
    m = matrixMerch.shape[1]
    similarity_pairs = np.zeros((n,n))
    subset2 = matrix
    subset2Merch = matrixMerch
    squareMatrix = np.full((n, m), 2)
    routeWeights = np.full(n, 0.9)
    merchWeights = np.full(n, 0.1)
    print("n", n, "m", m)
    print("matrix merch", matrixMerch.shape)
    print("matrix square", squareMatrix.shape)
    print("routeWeights", routeWeights.shape)
    print("merchWeights", merchWeights.shape)
    normsSubset2Merch = np.sqrt(np.sum(np.power(subset2Merch, squareMatrix), axis=1))
    for i in prange(n):
        subset1 = matrix[i].reshape(1, -1) #replicate_row(subset_matrix, i)  
        subset1Merch = matrixMerch[i].reshape(1, -1)
        #print("subset1", subset1.shape)
        #print("subset2", subset2.shape)
        
        min_matrix = np.minimum(subset1, subset2)
        sum_min_matrix = np.sum(min_matrix, axis=-1)
        
        max_matrix = np.maximum(subset1, subset2)
        sum_max_matrix = np.sum(max_matrix, axis=-1)
        
        #print("sum_min_matrix", sum_min_matrix.shape)
        
        #print("merch1", subset1Merch.shape)
        #print("merch2", subset2Merch.shape)
        
        #distMerch = 1 - np.abs(np.dot(subset1Merch, subset2Merch.T) / (np.linalg.norm(subset1Merch) * np.linalg.norm(subset2Merch)))
        #distMerch = 1 - (((subset1Merch * subset2Merch).sum(axis=1) / (np.sqrt(np.sum(np.power(subset1Merch, squareMatrix),axis=1)) * normsSubset2Merch)) + 1) / 2
        
        
        min_matrix_merch = np.minimum(subset1Merch, subset2Merch)
        sum_min_matrix_merch = np.sum(min_matrix_merch, axis=-1)
        
        max_matrixMerch = np.maximum(subset1Merch, subset2Merch)
        sum_max_matrixMerch = np.sum(max_matrixMerch, axis=-1)
        distMerch = 1 - (sum_min_matrix_merch / sum_max_matrixMerch)
        
        # if i == 0 or i == n-1:
        #     print("i", i, "distMerch", distMerch.shape, distMerch)
        #     print("i", i, "sum_min_matrix", (1 - (sum_min_matrix / sum_max_matrix)).shape, (1 - (sum_min_matrix / sum_max_matrix)))
        #     print("i", i, "prod", ((1 - (sum_min_matrix / sum_max_matrix)) * distMerch).shape, ((1 - (sum_min_matrix / sum_max_matrix)) * distMerch))
        #print(i, (1 - (sum_min_matrix / sum_max_matrix)) * distMerch)
        #similarity_pairs[i] = ((1 - (sum_min_matrix / sum_max_matrix)) + distMerch)/2
        routeDistance = 1 - (sum_min_matrix / sum_max_matrix)
        similarity_pairs[i] = np.power(routeDistance, routeWeights) * np.power(distMerch, merchWeights)
        # if np.isnan(similarity_pairs[i]).any():
        #     np.set_printoptions(threshold=10000)
        #     # print("similarity_pairs[i]", similarity_pairs[i])
        #     print("dist merch", distMerch)
        #     # print("dist routes", routeDistance)
        #     # print("pow1", np.power(routeDistance, routeWeights))
        #     # print("pow2", np.power(distMerch, merchWeights))
        #     # print("prod", np.power(routeDistance, routeWeights) * np.power(distMerch, merchWeights))
        #     # print("powers", routeWeights, merchWeights)
        #     print("BROKEN")
        #     return
        #similarity_pairs[i] = (1 - (sum_min_matrix / sum_max_matrix)) * (distMerch)
        #similarity_pairs[i] = 1 - (sum_min_matrix / sum_max_matrix + distMerch) / 2
        # if similarity_pairs[i] >= 1:
        #     print("similarity_pairs[i]", similarity_pairs[i])
        #     print("dist merch", distMerch, "cosine ", np.abs(np.dot(subset1Merch, subset2Merch) / (np.linalg.norm(subset1Merch) * np.linalg.norm(subset2Merch))))
        #     print("dist routes", (1 - (sum_min_matrix / sum_max_matrix)))
        #     print("prod", (1 - (sum_min_matrix / sum_max_matrix)) * distMerch)
        progress_proxy.update(1)
    return similarity_pairs

In [None]:
def jaccard_similarity_minhash_lsh_route_merch(matrix, matrixMerch, thresh_user=0.2):
    #similarity_matrix = csr_matrix((matrix.shape[0], matrix.shape[0]), dtype=np.float64)
    #similarity_matrix = lil_matrix((matrix.shape[0], matrix.shape[0]), dtype=np.float64)
    pairs = lsh(matrix, thresh_user=thresh_user)
    #uniqueRows = np.unique([i for i, j in pairs] + [j for i, j in pairs])
    uniqueRowsSet = set([i for i, j in pairs] + [j for i, j in pairs]) # (1,2) (1,4) (1,5)
    neverSeen = set([i for i in range(matrix.shape[0])]) - uniqueRowsSet
    print("neverSeen", neverSeen)
    #print("uniqueRows numpy", len(uniqueRows))
    print("num of subset of rows to check similarity:", len(uniqueRowsSet))
    #print(" num of pairs", len(uniqueRowsSet)*(len(uniqueRowsSet)-1)/2)
    print(" num of pairs", len(pairs))
    print(" instead of", matrix.shape[0]*(matrix.shape[0]-1)/2)
    print("improved by", (1 - len(pairs) / (matrix.shape[0]*(matrix.shape[0]-1)/2)) *100, "%")
    #print("num of pairs", len(pairs))
  
        
    print("Computing jaccard similarity on subset matrix...")
    
    sortedUniqueRowsSet = sorted(list(uniqueRowsSet))
    subset_matrix = matrix[sortedUniqueRowsSet]
    subset_matrixMerch = matrixMerch[sortedUniqueRowsSet]
    print("subset_matrix", subset_matrix.shape, subset_matrix[0])
    print("subset_matrixMerch", subset_matrixMerch.shape, subset_matrixMerch[0])
    with ProgressBar(total=len(sortedUniqueRowsSet)) as progress:
        subset_sim_matrix = compute_subset_similarity_matrix_and_merch(subset_matrix, subset_matrixMerch, progress)
    print("subset_sim_matrix", subset_sim_matrix.shape, subset_sim_matrix[0])
    print("subset_sim_matrix contains nan", np.isnan(subset_sim_matrix).any())
    print("nan indices", len(np.argwhere(np.isnan(subset_sim_matrix))), np.argwhere(np.isnan(subset_sim_matrix)))
    
 
    print("Mapping back to original matrix...")
    
    lenMatrixNoNeverSeen = matrix.shape[0] - len(neverSeen)
    
    # remove never seen rows and map indices
    map_indices = {}
    sortedNeverSeen = sorted(list(neverSeen))
    counter = 0
    for i in range(matrix.shape[0]):
        if i in sortedNeverSeen:
            continue
        map_indices[i] = counter
        counter += 1
        
    print("map_indices", map_indices)
    map_indices_back = {v: k for k, v in map_indices.items()}
    
    
    #similarity_matrix.setdiag(1)
    subset_sim_matrix = csr_matrix(subset_sim_matrix)
    
    return subset_sim_matrix, map_indices_back


In [None]:
def jaccard_similarity_matrix_merch(matrix):
    print("matrix", matrix.shape)
    min_matrix = np.minimum(matrix[:, None, :], matrix[None, :, :]) # (10, 100) -> (10, 1, 100) -> (1, 10, 100) -> (10, 10, 100)
    sum_min_matrix = np.sum(min_matrix, axis=-1)
    print("sum_min_matrix", sum_min_matrix.shape)
    
    max_matrix = np.maximum(matrix[:, None, :], matrix[None, :, :])
    sum_max_matrix = np.sum(max_matrix, axis=-1)
    print("sum_max_matrix", sum_max_matrix.shape)
    
    jaccard_similarity = sum_min_matrix / sum_max_matrix
    return jaccard_similarity

In [None]:
# convert routes and merchandise to binary matrices
# binary matrix where each row represents a route
print("Creating route binary matrix...")
route_matrix, route_matrix_standard = create_binary_matrices(actualSets, standardSets)
print("\nroute_matrix standard", route_matrix_standard.shape, route_matrix_standard[0])

print("Minhashing route matrix...")    
num_hash_functions = find_num_hashes_minhash(route_matrix_standard)
route_matrix_standard = minhash(route_matrix_standard, num_hash_functions if num_hash_functions % 2 == 0 else num_hash_functions + 1)
print("\nroute_matrix minhash", route_matrix_standard.shape, route_matrix_standard[0])
# binary matrix where each row represents merchandise

print("Creating merchandise binary matrix...")
merch_matrix = np.array([s[2] for s in standardSets])

print("\nmerch_matrix", merch_matrix.shape, merch_matrix)
print("merch_matrix contains nan", np.isnan(merch_matrix).any())


print("Computing Jaccard similarity route matrix...")
actualSetsDistances, map_indices_back = jaccard_similarity_minhash_lsh_route_merch(route_matrix_standard, merch_matrix, thresh_user=0.4)
#route_similarity = jaccard_similarity_matrix(route_matrix)
print("\nactualSetsDistances", type(actualSetsDistances), actualSetsDistances.shape,actualSetsDistances[0, 0], actualSetsDistances[0])
print("map indices back", map_indices_back)

Creating route binary matrix...

route_matrix standard (10, 3098) [0. 0. 0. ... 0. 0. 0.]
Minhashing route matrix...


minhashing: 100%|██████████| 10/10 [00:00<00:00, 1254.43it/s]



route_matrix minhash (10, 150) [162. 209.   4.  60.  89.  58. 248.   1.  85. 490.  55.  66. 156. 560.
  18.  33. 168.   9.  57.  33.  77.  44.  90. 365.   2.  52.  49. 224.
  11.  18. 471. 136.  29. 239.  84.  22. 127. 146. 128. 117.   6.   6.
 180.  61.  68.  42.   1.  42. 145.   4. 279. 206.  74. 309.  29. 181.
  16. 116.  16. 183.   2. 239. 267.  57. 108. 337.  41. 271.  67.  12.
  14.  70. 205. 129. 221. 143.  35. 208.  58.  46.  51. 101. 121.  66.
  70. 198.  50.  48.  61. 374.  17.  34. 177. 122.  30. 119.  22.  26.
 100.  44.   8.  50. 147.  46. 163. 152.  29.  34.  59.  18. 132.  60.
 215.  99.  65. 102.  15.  35.  69.  92.  65.  36. 416.  44.  58. 166.
 140.  71. 535. 248.  93.  20.  71. 124.  70. 158. 137.  71. 118. 166.
   6. 100.  23. 188. 202.  94. 173.  26. 112.  44.]
Creating merchandise binary matrix...

merch_matrix (10, 50) [[0.24172414 0.34586207 0.24965517 0.28275862 0.33482759 0.21206897
  0.19551724 0.20827586 0.26206897 0.22586207 0.23413793 0.22827586
  0.21758

100%|██████████| 10/10 [00:00<00:00, 10027.02it/s]

neverSeen {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
num of subset of rows to check similarity: 0
 num of pairs 0
 instead of 45.0
improved by 100.0 %
Computing jaccard similarity on subset matrix...





IndexError: index 0 is out of bounds for axis 0 with size 0