for each driver, creates a list of standard routes in that order so that the higher in the list a standard route is, the least the diversion of the driver will be, and 
the output of the program is: 

a file called driver.json that has for each driver, the 5 standard routes routes that if the driver does them, it minimizes the diversion. You can test this by considering as pool of standard routes those that originally the company has and also those that you recommend in the recStandard.json. The file driver.json has the following syntax:
[
	{driver:C, routes:[s10, s20, s2, s6, s10}}, 
	{driver:A, routes:[s1, s2, s22, s61, s102]}, 
….
]


In [8]:
import os
HOME = os.getcwd()
print('HOME: ',HOME)

import time
import math
import json
import random
import pandas as pd
import sys
import lxml
import sklearn as sk
import numpy as np

# from sklearn.manifold import TSNE
# from sklearn.cluster import KMeans
# from sklearn.cluster import HDBSCAN, DBSCAN
# from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import plotly.graph_objects as go

from scipy.sparse import csr_matrix, issparse, lil_matrix

from tqdm import tqdm
from pandarallel import pandarallel

from numba import njit, prange
from numba_progress import ProgressBar

HOME:  c:\Users\matti\Desktop\CODE\DataMiningProject23-24


In [9]:
STANDARD_FILE = 'standard_big_new_2.json'
ACTUAL_FILE = 'actual_big_new_2.json'


K_SHINGLES = 3
ALPHA = 0.7

In [10]:
# load standard and actual data
print("\nReading standard data...")
with open(os.path.join('data',STANDARD_FILE)) as f:
    standard = json.load(f)

print("\nReading actual data...")
with open(os.path.join('data', ACTUAL_FILE)) as f:
    actual = json.load(f)

# load the data into a dataframe
print("\nCreating standard dataframe...")
dfStandard = pd.DataFrame(standard)
print("\nCreating actual dataframe...")
dfActual = pd.DataFrame(actual)

# print head of the dataframes
print(dfStandard.head())
print(dfActual.head())

# get the unique cities and items of the standard data
cities = []
items = []
drivers = []
longestRoute = 0
shortestRoute = np.inf
maxItemQuantity = 0

standardRefIds = []
for index, s in dfStandard.iterrows():
    #print(s)
    idS = s['id']
    route = s['route']
    standardRefIds.append(int(idS[1]))
    for trip in route:
        cities.append(trip['from']) 
        items.extend(trip['merchandise'].keys())
        maxItemQuantity = max(maxItemQuantity, max(trip['merchandise'].values()))
    if len(route) > 0:
        cities.append(route[-1]['to'])
        
    if len(route) > longestRoute:
        longestRoute = len(route)
        
    if len(route) < shortestRoute:
        shortestRoute = len(route)
print("\nFinished preparing standard data")

actualRefStandardIds = []
for index, s in dfActual.iterrows():
    #print(s)
    idS = s['id']
    route = s['route']
    idStandard = s['sroute']
    drivers.append(s['driver'])
    actualRefStandardIds.append(int(idStandard[1]))
    for trip in route:
        cities.append(trip['from'])
        items.extend(trip['merchandise'].keys())
        maxItemQuantity = max(maxItemQuantity, max(trip['merchandise'].values()))
        
    if len(route) > 0:
        cities.append(route[-1]['to'])
        
    if len(route) > longestRoute:
        longestRoute = len(route)
    
    if len(route) < shortestRoute:
        shortestRoute = len(route)
print("\nFinished preparing actual data")

# find the unique cities and items
uniqueCities = sorted(list(set(cities)))
#uniqueCities.insert(0, 'NULL')          # add NULL city, for padding vectors with different lengths (trips in routes)
uniqueItems = sorted(list(set(items)))
uniqueDrivers = sorted(list(set(drivers)))

if shortestRoute < 2:
    K_SHINGLES = 2

threeShingles = []

for i, c1 in enumerate(uniqueCities):
    for j, c2 in enumerate(uniqueCities):
        if i == j:
            continue
        for k, c3 in enumerate(uniqueCities):
            if j == k or i == k:
                continue
            threeShingles.append([c1, c2, c3])
            
permutations = math.perm(len(uniqueCities), K_SHINGLES)

print("\nUnique cities: ", uniqueCities)
print("Unique items: ", uniqueItems)
print("Unique drivers: ", uniqueDrivers)

standardIds = dfStandard['id'].tolist()
print("standardIds: ", standardIds)

print("\nNumber of cities: ", len(uniqueCities))
print("Number of items: ", len(uniqueItems))

print("\nLongest route: ", longestRoute)
print("Shortest route: ", shortestRoute)

print("\nMax item quantity: ", maxItemQuantity)

print("\nNumber of three-shingles: ", len(threeShingles))

print(f"\n{K_SHINGLES}-shingles: ", math.perm(len(uniqueCities), K_SHINGLES))
print(f"{K_SHINGLES}-shingles: ", math.comb(len(uniqueCities), K_SHINGLES))

print(f"\n\033[92mK-Shingles used: {K_SHINGLES} \033[0m")



Reading standard data...

Reading actual data...

Creating standard dataframe...

Creating actual dataframe...
   id                                              route
0  s0  [{'from': 'Caltanissetta', 'to': 'Piacenza', '...
1  s1  [{'from': 'Rome', 'to': 'Cerignola', 'merchand...
2  s2  [{'from': 'Massa', 'to': 'Treviso', 'merchandi...
3  s3  [{'from': 'Cerignola', 'to': 'Perugia', 'merch...
4  s4  [{'from': 'Massa', 'to': 'Rome', 'merchandise'...
   id driver sroute                                              route
0  a0      D     s0  [{'from': 'Massa', 'to': 'Rome', 'merchandise'...
1  a1      H     s0  [{'from': 'Massa', 'to': 'Rome', 'merchandise'...
2  a2      I     s0  [{'from': 'Massa', 'to': 'Rome', 'merchandise'...
3  a3      E     s0  [{'from': 'Massa', 'to': 'Rome', 'merchandise'...
4  a4      J     s0  [{'from': 'Massa', 'to': 'Rome', 'merchandise'...

Finished preparing standard data

Finished preparing actual data

Unique cities:  ['Caltanissetta', 'Cerignola', 'Foggi

In [11]:
def hashShingles(shingles, n):
    # hash shingles
    string = "" 
    for shingle in shingles:
        string += str(shingle) + "," # [45, 4, 8] -> "45,4,8,"
    
    return hash(string) #% n

def createShingles(df, k, uniqueCities, uniqueItems, longestRoute, maxItemQuantity, permutations):
    # create shingles for each route
    shingles = []
    for index, s in df.iterrows():
        idS = s['id']
        route = s['route']
        shingle = [index]
        citiesInRoute = [] # napoli roma milano teramo bergamo [10,4,5,48,12] [10,4,5] [4,5,48] [5,48,12]
        merchandiseInRoute = np.zeros(len(uniqueItems))
        for trip in route:
            citiesInRoute.append(uniqueCities.index(trip['from']))
            #merchandiseInRoute += np.array(list(trip['merchandise'].values()))
            for item, n in trip['merchandise'].items():
                merchandiseInRoute[uniqueItems.index(item)] += n
        if len(route) > 0:
            citiesInRoute.append(uniqueCities.index(route[-1]['to']))
        if len(route) > 0:
            merchandiseInRoute = merchandiseInRoute / (maxItemQuantity*len(route))
        
        hashedShingles = []
        for i in range(len(citiesInRoute)-k+1):
            # Q: is it correct to set the modulo for the hash function to the number of permutations?
            # A: yes, because we want to have a unique hash for each shingle
            # Q: would it be better to use a different hash function?
            # A: yes, because the modulo function is not a good hash function
            hashedShingles.append(hashShingles(citiesInRoute[i:i+k], permutations) )
        
        shingle.append(np.array(hashedShingles))
        
        shingle.append(merchandiseInRoute) # quantity hot encoding
        
        shingles.append(shingle)
        
    return shingles # [ index, [shingles], [merchandise] ]

def create_shingles(s, k, uniqueCities, uniqueItems, longestRoute, maxItemQuantity, permutations):
    idS = s['id']
    route = s['route']
    shingle = [s.name]
    citiesInRoute = [] 
    merchandiseInRoute = np.zeros(len(uniqueItems))
    for trip in route:
        citiesInRoute.append(uniqueCities.index(trip['from']))
        for item, n in trip['merchandise'].items():
            merchandiseInRoute[uniqueItems.index(item)] += n
    if len(route) > 0:
        citiesInRoute.append(uniqueCities.index(route[-1]['to']))
    if len(route) > 0:
        merchandiseInRoute = merchandiseInRoute / (maxItemQuantity*len(route))
    
    hashedShingles = []
    for i in range(len(citiesInRoute)-k+1):
        hashedShingles.append(hashShingles(citiesInRoute[i:i+k], permutations))
    
    shingle.append(np.array(hashedShingles))
    shingle.append(merchandiseInRoute)
    
    return shingle

In [12]:
standardSets = createShingles(dfStandard, k=K_SHINGLES, uniqueCities=uniqueCities, uniqueItems=uniqueItems, longestRoute=longestRoute, maxItemQuantity=maxItemQuantity, permutations=permutations)
actualSets = createShingles(dfActual, k=K_SHINGLES, uniqueCities=uniqueCities, uniqueItems=uniqueItems, longestRoute=longestRoute, maxItemQuantity=maxItemQuantity, permutations=permutations)
#pandarallel.initialize(progress_bar=True)
# standardSets = dfStandard.parallel_apply(lambda s: create_shingles(s, K_SHINGLES, uniqueCities, uniqueItems, longestRoute, maxItemQuantity, permutations), axis=1)
# standardSets = standardSets.tolist()
# actualSets = dfActual.parallel_apply(lambda s: create_shingles(s, K_SHINGLES, uniqueCities, uniqueItems, longestRoute, maxItemQuantity, permutations), axis=1)
# actualSets = actualSets.tolist()

print("\nstandardSets", len(standardSets), "shape first element", standardSets[0][1].shape, standardSets[0])
print("\nactualSets", len(actualSets),  "shape first element", standardSets[0][1].shape, actualSets[0])

print("\nstandardSets:", len(standardSets))
print("actualSets:", len(actualSets))

assert len(standardSets[0]) == 3, "The length of the standard set is not equal to 3 (index, shingles, merchandise)"
assert len(standardSets[0][2]) == len(uniqueItems), "The length of the merchandise vector is not equal to the number of unique items"


standardSets 10 shape first element (2,) [0, array([-6682426417645961081,  6371620475143737465], dtype=int64), array([0.3 , 0.15, 0.2 , 0.2 , 0.4 , 0.4 , 0.4 , 0.45, 0.1 , 0.2 ])]

actualSets 100 shape first element (2,) [0, array([ 4210742651727736179,   -60461073256499310,  4449988028164795064,
        1103541041183974940, -6870364706471386894, -9215655811683404505,
       -3808473816662730513], dtype=int64), array([0.48571429, 0.35714286, 0.18571429, 0.35714286, 0.5       ,
       0.17142857, 0.21428571, 0.38571429, 0.48571429, 0.35714286])]

standardSets: 10
actualSets: 100


### FUNCTIONS

In [13]:
def jaccard_similarity_matrix(matrix):
    intersection = np.dot(matrix, matrix.T)
    row_sums = matrix.sum(axis=1)
    union = row_sums[:, None] + row_sums - intersection
    union = np.where(union == 0, 1, union)  # avoid division by zero
    jaccard_similarity = intersection / union
    return jaccard_similarity

def jaccard_similarity_two_matrices(matrix1, matrix2):
    #intersection = np.dot(matrix, matrix.T)
    intersection = np.dot(matrix1, matrix2.T)
    row_sums1 = matrix1.sum(axis=1)
    row_sums2 = matrix2.sum(axis=1)
    union = row_sums1[:, None] + row_sums2 - intersection
    union = np.where(union == 0, 1, union)  # avoid division by zero
    jaccard_similarity = intersection / union
    return jaccard_similarity

In [14]:
import numpy as np
from scipy.sparse import csr_matrix

def jaccard_similarity_sparse(matrix1, matrix2):
    # Convert dense matrices to sparse matrices (CSR format)
    sparse_matrix1 = csr_matrix(matrix1)
    sparse_matrix2 = csr_matrix(matrix2)

    # Matrix multiplication in CSR format
    intersection = sparse_matrix1.dot(sparse_matrix2.T).toarray()

    # Row sums using CSR format
    row_sums1 = sparse_matrix1.sum(axis=1).A.ravel()
    row_sums2 = sparse_matrix2.sum(axis=1).A.ravel()

    # Calculate union using the correct formula
    union = row_sums1 + row_sums2 - intersection

    # Avoid division by zero
    union = np.where(union == 0, 1, union)

    # Jaccard similarity
    jaccard_similarity = intersection / union
    return jaccard_similarity


In [15]:
def jaccard_similarity_matrix_merch(matrix):
    print("matrix", matrix.shape)
    min_matrix = np.minimum(matrix[:, None, :], matrix[None, :, :])
    sum_min_matrix = np.sum(min_matrix, axis=-1)
    print("sum_min_matrix", sum_min_matrix.shape)
    
    max_matrix = np.maximum(matrix[:, None, :], matrix[None, :, :])
    sum_max_matrix = np.sum(max_matrix, axis=-1)
    print("sum_max_matrix", sum_max_matrix.shape)
    
    jaccard_similarity = sum_min_matrix / sum_max_matrix
    return jaccard_similarity

def jaccard_similarity_matrices_merch(matrix1, matrix2):
    print("matrix1", matrix1.shape)
    print("matrix2", matrix2.shape)
    
    min_matrix = np.minimum(matrix1[:, None, :], matrix2[None, :, :])
    sum_min_matrix = np.sum(min_matrix, axis=-1)
    print("sum_min_matrix", sum_min_matrix.shape)
    
    max_matrix = np.maximum(matrix1[:, None, :], matrix2[None, :, :])
    sum_max_matrix = np.sum(max_matrix, axis=-1)
    print("sum_max_matrix", sum_max_matrix.shape)
    
    jaccard_similarity = sum_min_matrix / sum_max_matrix
    return jaccard_similarity


def create_binary_matrix(routeSets):
    # create binary matrix where each row represents a route
    uniqueShingles = list(set([shingle for route in routeSets for shingle in route[1]]))
    binaryMatrix = np.zeros((len(routeSets), len(uniqueShingles)))
    for i, route in enumerate(routeSets):
        for shingle in route[1]:
            binaryMatrix[i][uniqueShingles.index(shingle)] = 1
    return binaryMatrix



In [16]:
def create_binary_matrices(routeSet1, routeSet2):
    # create binary matrix where each row represents a route
    uniqueShinglesBoth = list(set([shingle for route in routeSet1 for shingle in route[1]] + [shingle for route in routeSet2 for shingle in route[1]]))
    binaryMatrix1 = np.zeros((len(routeSet1), len(uniqueShinglesBoth)))
    binaryMatrix2 = np.zeros((len(routeSet2), len(uniqueShinglesBoth)))
    for i, route in enumerate(routeSet1):
        for shingle in route[1]:
            binaryMatrix1[i][uniqueShinglesBoth.index(shingle)] = 1
            
    for i, route in enumerate(routeSet2):
        for shingle in route[1]:
            binaryMatrix2[i][uniqueShinglesBoth.index(shingle)] = 1
    return binaryMatrix1, binaryMatrix2

In [17]:
def find_band_and_row_values(columns, threshold):
    previous_b = 1
    previous_r = columns
    for b in range(1, columns + 1):
        if columns % b == 0:
            r = columns // b
            if (1 / b) ** (1 / r)  <= threshold:
                if np.abs((1 / previous_b) ** (1 / previous_r) - threshold) < np.abs((1 / b) ** (1 / r) - threshold):
                    return previous_b, previous_r
                return b, r
    return columns, 1

In [43]:
def lsh_two_matrices(minhash_matrix1, minhash_matrix2, thresh_user=0.2):
    # Initialize the signature matrix
    columns = minhash_matrix1.shape[1]
    
    # Generate the hash functions
    # hash_functions = [lambda x, a=a, b=b: (a * x + b) % minhash_matrix.shape[1] for a, b in zip(random.sample(range(1000), bands), random.sample(range(1000), bands))]
    hash_function = lambda x: hash(",".join([str(x[i]) for i in range(len(x))]))
    
    # b = bands
    # r = columns//bands
    b, r = find_band_and_row_values(columns, thresh_user)
    # If columns is not divisible by bands
    if columns % b != 0:
        # Find the closest number that makes it divisible
        while columns % b != 0:
            b -= 1
        r = columns // b
    #bands = b
        
    print("final bands", b)
    signature_matrix1 = np.full((minhash_matrix1.shape[0], b), np.inf)
    signature_matrix2 = np.full((minhash_matrix2.shape[0], b), np.inf)
    
    # if threshold is 0.8,
    threshold = (1 / b) ** (1 / r) 
    print("lsh threshold", threshold)
    
    # For each band
    print("Computing hash values of bands...")
    hash_values1 = np.apply_along_axis(lambda x: hash_function(x) % minhash_matrix1.shape[0], 1, minhash_matrix1.reshape(-1, r))
    hash_values2 = np.apply_along_axis(lambda x: hash_function(x) % minhash_matrix2.shape[0], 1, minhash_matrix2.reshape(-1, r))
    # Reshape the hash values to match the signature matrix
    hash_values1 = hash_values1.reshape(minhash_matrix1.shape[0], b)
    hash_values2 = hash_values2.reshape(minhash_matrix2.shape[0], b)
    # Update the signature matrix
    signature_matrix1 = hash_values1
    signature_matrix2 = hash_values2
            
    # find candidate pairs
    print("Finding candidate pairs...")
    candidate_pairs = np.empty((minhash_matrix1.shape[0], 2))
    for i in tqdm(range(signature_matrix1.shape[0])):
        # Compute the similarity of the current row with all following rows
        similarities = np.sum(signature_matrix2 == signature_matrix1[i, :], axis=1) / b
        # Find the indices of the rows that have a similarity greater than or equal to the threshold
        #indices = np.nonzero(similarities >= threshold)[0]
        indexMax = np.argmax(similarities)
        simMax = similarities[indexMax]
        # Add the pairs to the candidate pairs
        #candidate_pairs.extend((i, i+1+index) for index in indices)
        candidate_pairs[i] = [indexMax, simMax]
        
    return candidate_pairs


In [19]:
def hash_function_hash_code(num_of_hashes,n_col,next_prime):
  
    #coeffA = np.array(pick_random_coefficients(num_of_hashes,max_column_length)).reshape((num_of_hashes,1))
    #coeffB = np.array(pick_random_coefficients(num_of_hashes,max_column_length)).reshape((num_of_hashes,1))

    coeffA = np.array(random.sample(range(0,n_col*100),num_of_hashes)).reshape((num_of_hashes,1))
    coeffB = np.array(random.sample(range(0,n_col*100),num_of_hashes)).reshape((num_of_hashes,1))

    x = np.arange(n_col).reshape((1,n_col))

    hash_code = (np.matmul(coeffA,x) + coeffB) % next_prime # (num_of_hashes,n_col) so how each column index is permuted

    return hash_code

def minhash(u,num_of_hashes):
    (n_row, n_col) = u.shape
    next_prime = n_col
    hash_code = hash_function_hash_code(num_of_hashes,n_col,next_prime)

    signature_array = np.empty(shape = (n_row,num_of_hashes))

    #t2 = time.time()
    for row in tqdm(range(n_row), desc="minhashing"):
        #print("row", row)
        ones_index = np.where(u[row,:]==1)[0]
        #if len(ones_index) == 0:
        signature_array[row,:] = np.zeros((1,num_of_hashes))
            #continue
        corresponding_hashes = hash_code[:,ones_index]
        #print("ones_index", ones_index.shape, ones_index)
        #print("corresponding_hashes", corresponding_hashes.shape, corresponding_hashes)
        row_signature = np.amin(corresponding_hashes,axis=1).reshape((1,num_of_hashes))

        signature_array[row,:] = row_signature

    return signature_array

In [20]:
def find_num_hashes_minhash(matrix):
    if matrix.shape[1] < 1000:
        num_hash_functions = matrix.shape[1]//10
    elif matrix.shape[1] < 10_000:
        num_hash_functions = 150
    elif matrix.shape[1] < 100_000:
        num_hash_functions = 250
    else:
        num_hash_functions = 300
    return num_hash_functions

## FIRST IDEA:

In [60]:
# convert routes and merchandise to binary matrices
# binary matrix where each row represents a route
print("Creating  binary matrices...")
route_actual_matrix, route_standard_matrix = create_binary_matrices(actualSets, standardSets)# [actual, vector] , [standard, vector] 
print("\nroute_matrix actual", route_actual_matrix.shape, route_actual_matrix[0])
print("\nroute_matrix standard", route_standard_matrix.shape, route_standard_matrix[0])

print("\nMinhashing binary matrices...\n")    
num_hash_functions = find_num_hashes_minhash(route_actual_matrix)
num_hash_functions = 100

route_actual_matrix = minhash(route_actual_matrix, num_hash_functions if num_hash_functions % 2 == 0 else num_hash_functions + 1) # [actual, vector/num_hash_functions] 
print("\nroute_actual_matrix minhash\n", route_actual_matrix.shape, route_actual_matrix[0])
route_standard_matrix = minhash(route_standard_matrix, num_hash_functions if num_hash_functions % 2 == 0 else num_hash_functions + 1) # [standard, vector/num_hash_functions]
print("\nroute_matrix_standard minhash", route_standard_matrix.shape, route_standard_matrix[0])


print("\nComputing Jaccard similarity...")
matrix_sim_route_actual_standard = lsh_two_matrices(route_actual_matrix, route_standard_matrix,thresh_user=0.2) #[actual, [standard_index, similarity]]

print("matrix_sim_actual_standard", matrix_sim_route_actual_standard.shape)
print("matrix_sim_actual_standard[0]", matrix_sim_route_actual_standard)


# # 
# merch_actual_matrix = np.array([s[2] for s in actualSets])
# merch_standard_matrix = np.array([s[2] for s in standardSets])

# # compute Jaccard similarity for merchandise
# matrix_sim_merch_actual_standard = jaccard_similarity_matrices_merch(merch_actual_matrix, merch_standard_matrix)



# SetsSimilarities = matrix_sim_route_actual_standard**ALPHA + matrix_sim_merch_actual_standard**(1-ALPHA)
# # SetsSimilarities=np.nan_to_num(SetsSimilarities, nan=0)

# print("SetsSimilarities", SetsSimilarities.shape)
# print("SetsSimilarities[0]", SetsSimilarities[0])

Creating  binary matrices...

route_matrix actual (100, 81) [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]

route_matrix standard (10, 81) [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]

Minhashing binary matrices...



minhashing: 100%|██████████| 100/100 [00:00<00:00, 11136.11it/s]



route_actual_matrix minhash
 (100, 100) [ 5.  5.  0. 23.  4.  3. 10.  1. 17.  9. 32.  4.  3.  5.  6. 18. 14.  6.
 26. 18.  4.  8. 20. 15. 17. 10.  1. 18.  8. 17. 10.  6.  2.  6.  4. 39.
 19.  1.  8. 39. 13. 10. 11.  8. 15.  0. 15.  5.  4.  8. 13.  2.  0.  1.
  0.  0.  6.  9.  0. 20. 13.  6. 30.  1.  2.  3.  2.  2.  2. 10.  3.  0.
  1.  4. 15.  8. 23. 19. 13.  5.  5. 18. 21. 31.  7.  7. 10.  0. 15.  2.
 15. 13.  8.  9. 12.  6. 26.  5.  6. 19.]


minhashing: 100%|██████████| 10/10 [00:00<00:00, 10051.05it/s]



route_matrix_standard minhash (10, 100) [43.  8. 18. 47. 56. 26. 53. 51. 34. 15.  9. 43.  6.  4. 47. 70. 62.  3.
 48.  1.  3. 18. 18. 57. 18. 22.  0. 11. 30. 20. 16. 23.  3. 51. 41. 49.
 36. 19. 17. 25.  3. 47. 27. 31. 45. 30. 66. 28. 20. 36.  4. 46. 20. 51.
 63.  1. 10. 50. 13. 26. 13. 38.  5.  9. 17.  7. 17.  2. 25. 14. 21. 28.
  8.  9. 24. 45. 44. 26. 32.  6.  7.  8. 15. 39. 20. 10.  5. 36. 22. 60.
 55. 43. 43. 12.  7.  8.  1. 39. 52. 39.]

Computing Jaccard similarity...
final bands 50
lsh threshold 0.1414213562373095
Computing hash values of bands...
Finding candidate pairs...


100%|██████████| 100/100 [00:00<00:00, 33423.41it/s]

matrix_sim_actual_standard (100, 2)
matrix_sim_actual_standard[0] [[7.   0.02]
 [1.   0.04]
 [1.   0.04]
 [1.   0.04]
 [1.   0.04]
 [8.   0.04]
 [1.   0.04]
 [1.   0.04]
 [1.   0.04]
 [8.   0.04]
 [9.   0.02]
 [2.   0.02]
 [3.   0.02]
 [6.   0.02]
 [0.   0.  ]
 [9.   0.02]
 [2.   0.02]
 [2.   0.02]
 [9.   0.02]
 [0.   0.  ]
 [7.   0.04]
 [7.   0.04]
 [7.   0.04]
 [7.   0.04]
 [7.   0.04]
 [7.   0.04]
 [7.   0.04]
 [7.   0.06]
 [7.   0.04]
 [7.   0.04]
 [7.   0.04]
 [3.   0.04]
 [1.   0.04]
 [7.   0.04]
 [7.   0.04]
 [1.   0.02]
 [1.   0.02]
 [1.   0.04]
 [7.   0.04]
 [1.   0.02]
 [5.   0.02]
 [5.   0.02]
 [5.   0.02]
 [5.   0.02]
 [0.   0.02]
 [0.   0.02]
 [0.   0.02]
 [0.   0.02]
 [0.   0.02]
 [0.   0.02]
 [1.   0.02]
 [1.   0.02]
 [1.   0.02]
 [1.   0.02]
 [1.   0.02]
 [1.   0.02]
 [1.   0.02]
 [1.   0.02]
 [1.   0.02]
 [1.   0.02]
 [5.   0.04]
 [2.   0.04]
 [1.   0.04]
 [1.   0.04]
 [1.   0.04]
 [1.   0.04]
 [1.   0.04]
 [1.   0.04]
 [1.   0.04]
 [1.   0.04]
 [2.   0.02]
 [2.   0.02




In [None]:
max_value_index = np.argmax(SetsSimilarities, axis=1)
max_value = np.max(SetsSimilarities, axis=1)

# print("Index of the biggest value for each row:", max_value_index)
# print("Value of the biggest value for each row:", max_value)

# print("len(max_value_index)", len(max_value_index))
# print("len(max_value)", len(max_value))

# [index for index, s in dfActual.iterrows() if s['driver'] == driver]
# uniqueDrivers
driver_indices = {}

for i, s in dfActual.iterrows():
    driver = s['driver']
    
    # Check if the driver is already in the dictionary
    if driver in driver_indices:
        # If yes, append the index to the existing array
        driver_indices[driver].append(i)
    else:
        # If not, create a new array with the current index
        driver_indices[driver] = [i]

print("driver_indices", driver_indices)
print("len(driver_indices)", len(driver_indices))

driver_indices {'E': [0, 7, 14, 25, 34, 40, 46, 49, 50, 64, 78, 88, 98, 122, 123, 126, 127, 129, 149, 163, 180, 188, 199, 200, 203, 225, 226, 265, 274, 292, 306, 329, 342, 345, 354, 361, 370, 375, 384, 394, 419, 420, 422, 431, 438, 439, 440, 443, 444, 447, 454, 456, 476, 493, 505, 516, 527, 532, 535, 550, 553, 561, 564, 566, 588, 614, 637, 660, 661, 663, 675, 680, 688, 717, 723, 729, 735, 751, 760, 763, 780, 789, 791, 810, 820, 824, 841, 861, 865, 868, 877, 896, 901, 910, 921, 923, 946, 953, 954, 965, 973, 974, 991, 996, 999], 'G': [1, 17, 20, 24, 28, 35, 48, 59, 94, 95, 100, 116, 125, 130, 132, 138, 157, 164, 170, 182, 196, 240, 249, 255, 282, 304, 308, 320, 324, 330, 337, 373, 391, 398, 409, 414, 416, 426, 436, 460, 463, 497, 506, 530, 531, 552, 568, 571, 577, 595, 602, 618, 635, 638, 645, 649, 658, 659, 673, 678, 693, 726, 727, 747, 758, 764, 784, 800, 808, 811, 822, 828, 831, 843, 845, 870, 871, 883, 885, 905, 906, 929, 933, 935, 947, 968, 975, 976, 982, 989], 'A': [2, 8, 15, 22, 2

In [None]:
# Create a dictionary of all drivers' routes
drivers_routes = {}

for driver in uniqueDrivers:
    print("driver", driver)
    # print("driver_indices[driver]", driver_indices[driver])
    # print("len(driver_indices[driver])", len(driver_indices[driver]))

    driver_standard_index = max_value_index[driver_indices[driver]]
    driver_max_value = max_value[driver_indices[driver]]
    # print("driver_max_value_index", driver_standard_index)

    # Assuming driver_max_value_index is a NumPy array
    unique_values_index = np.unique(driver_standard_index)
    # print("unique_values", unique_values_index)


    # Calculate the mean for each unique value
    means = [np.mean(driver_max_value[driver_standard_index == idx]) for idx in unique_values_index]
    # means = [np.mean(np.array(driver_route_values)[np.where(np.array(driver_standard[driver])==idx)[0]]) for idx in unique_values_index]
    # print("means", means)

    best_route_Ids = []
    # Print the results for each driver
    for idx, mean in zip(unique_values_index, means):
        print(f"Driver: {driver}, Unique Value: {idx}, Mean: {mean}")
        best_route_Ids.append([standardIds[idx], mean])

    # print("best_route_Ids", best_route_Ids)
    # Sort the routes by their mean
    best_route_Ids.sort(key=lambda x: x[1], reverse=True)
    # print("best_route_Ids", best_route_Ids)
    
    # Keep the top 5 routes
    top_5_routes = best_route_Ids[:5]
    # print("top_5_routes", top_5_routes)

    # Update the driver's routes in the dictionary
    drivers_routes[driver] = {'driver': driver, 'routes': [id for id,value in top_5_routes]}

# Convert the dictionary to a list for JSON serialization
result_list = list(drivers_routes.values())

# Write the result to driver.json
with open(os.path.join('results', 'driver.json'), 'w') as outfile:
    json.dump(result_list, outfile, ensure_ascii=False ,indent=2)

print(f"JSON driver data has been written to data/driver.json")

driver A
Driver: A, Unique Value: 0, Mean: 0.8657383692406707
Driver: A, Unique Value: 1, Mean: 0.8414914356286038
Driver: A, Unique Value: 2, Mean: 0.7834869241457741
Driver: A, Unique Value: 3, Mean: 0.8619251834690733
Driver: A, Unique Value: 4, Mean: 0.6281142247140746
Driver: A, Unique Value: 5, Mean: 0.8458879422336785
Driver: A, Unique Value: 6, Mean: 0.7348764797589156
Driver: A, Unique Value: 7, Mean: 0.7227157102224463
Driver: A, Unique Value: 8, Mean: 0.8767143470077671
Driver: A, Unique Value: 9, Mean: 0.7034413540109373
driver B
Driver: B, Unique Value: 0, Mean: 0.8240522009885515
Driver: B, Unique Value: 1, Mean: 0.6973449239322511
Driver: B, Unique Value: 2, Mean: 0.7497878061411593
Driver: B, Unique Value: 3, Mean: 0.7446710739987588
Driver: B, Unique Value: 4, Mean: 0.8794052033505697
Driver: B, Unique Value: 5, Mean: 0.9252906368224032
Driver: B, Unique Value: 6, Mean: 0.7648631374338595
Driver: B, Unique Value: 7, Mean: 0.6627251760074426
Driver: B, Unique Value: 8, 

## ERIC's IDEA

In [None]:
# binary matrix where each row represents a route
route_actual_matrix, route_standard_matrix = create_binary_matrices(actualSets,standardSets)

# compute Jaccard similarity for routes
matrix_sim__route_actual_standard = jaccard_similarity_two_matrices(route_actual_matrix, route_standard_matrix)

print("matrix_sim_actual_standard", matrix_sim__route_actual_standard.shape)
print("matrix_sim_actual_standard[0]", matrix_sim__route_actual_standard[1])

# 
merch_actual_matrix = np.array([s[2] for s in actualSets])
merch_standard_matrix = np.array([s[2] for s in standardSets])

# compute Jaccard similarity for merchandise
matrix_sim_merch_actual_standard = jaccard_similarity_matrices_merch(merch_actual_matrix, merch_standard_matrix)



SetsSimilarities = (matrix_sim__route_actual_standard + matrix_sim_merch_actual_standard) / 2
SetsSimilarities=np.nan_to_num(SetsSimilarities, nan=0)

print("SetsSimilarities", SetsSimilarities.shape)
print("SetsSimilarities[0]", SetsSimilarities[0])

matrix_sim_actual_standard (1000, 10)
matrix_sim_actual_standard[0] [0.71590909 0.         0.         0.         0.         0.
 0.         0.         0.         0.        ]
matrix1 (1000, 100)
matrix2 (10, 100)
sum_min_matrix (1000, 10)
sum_max_matrix (1000, 10)
SetsSimilarities (1000, 10)
SetsSimilarities[0] [0.89598255 0.42668496 0.36567354 0.3842105  0.35124729 0.41024127
 0.4234152  0.39357334 0.39831321 0.39539493]


In [None]:
# max_value_index = np.argmax(SetsSimilarities, axis=1)
# max_value = np.max(SetsSimilarities, axis=1)
# "sroute": "s0",

driver_indices = {}
driver_standard = {}

for i, s in dfActual.iterrows():
    driver = s['driver']
    route = s['sroute']
    
    # Check if the driver is already in the dictionary
    if driver in driver_indices:
        # If yes, append the index to the existing array
        driver_indices[driver].append(i)
        driver_standard[driver].append(route)
    else:
        # If not, create a new array with the current index
        driver_indices[driver] = [i]
        driver_standard[driver] = [route]

print("driver_indices", driver_indices)
print("len(driver_indices)", len(driver_indices))

print("driver_standard", driver_standard)
print("len(driver_standard)", len(driver_standard))

driver_indices {'E': [0, 7, 14, 25, 34, 40, 46, 49, 50, 64, 78, 88, 98, 122, 123, 126, 127, 129, 149, 163, 180, 188, 199, 200, 203, 225, 226, 265, 274, 292, 306, 329, 342, 345, 354, 361, 370, 375, 384, 394, 419, 420, 422, 431, 438, 439, 440, 443, 444, 447, 454, 456, 476, 493, 505, 516, 527, 532, 535, 550, 553, 561, 564, 566, 588, 614, 637, 660, 661, 663, 675, 680, 688, 717, 723, 729, 735, 751, 760, 763, 780, 789, 791, 810, 820, 824, 841, 861, 865, 868, 877, 896, 901, 910, 921, 923, 946, 953, 954, 965, 973, 974, 991, 996, 999], 'G': [1, 17, 20, 24, 28, 35, 48, 59, 94, 95, 100, 116, 125, 130, 132, 138, 157, 164, 170, 182, 196, 240, 249, 255, 282, 304, 308, 320, 324, 330, 337, 373, 391, 398, 409, 414, 416, 426, 436, 460, 463, 497, 506, 530, 531, 552, 568, 571, 577, 595, 602, 618, 635, 638, 645, 649, 658, 659, 673, 678, 693, 726, 727, 747, 758, 764, 784, 800, 808, 811, 822, 828, 831, 843, 845, 870, 871, 883, 885, 905, 906, 929, 933, 935, 947, 968, 975, 976, 982, 989], 'A': [2, 8, 15, 22, 2

In [None]:
# Create a dictionary of all drivers' routes
drivers_routes = {}

for driver in uniqueDrivers:
    print("driver", driver)
    print("driver_indices[driver]", driver_indices[driver])
    # print("len(driver_indices[driver])", len(driver_indices[driver]))
    print("driver_standard[driver]", driver_standard[driver])
    # print("len(driver_standard[driver])", len(driver_standard[driver]))

    driver_route_values = []
    
    for index, standard in zip(driver_indices[driver], driver_standard[driver]):
        driver_standard_value = SetsSimilarities[index][standardIds.index(standard)]
        driver_route_values.append(driver_standard_value)

    print("driver_route", driver_route_values)

    # Assuming driver_max_value_index is a NumPy array
    unique_values_index = np.unique(driver_standard[driver])
    print("unique_values", unique_values_index)
    
    # # Calculate the mean for each unique value
    means = [np.mean(np.array(driver_route_values)[np.where(np.array(driver_standard[driver])==idx)[0]]) for idx in unique_values_index]
    print("means", means)

    best_route_Ids = []
    # Print the results for each driver
    for idx, mean in zip(unique_values_index, means):
        print(f"Driver: {driver}, Unique Value: {idx}, Mean: {mean}")
        best_route_Ids.append([idx, mean])

    # print("best_route_Ids", best_route_Ids)
    # Sort the routes by their mean
    best_route_Ids.sort(key=lambda x: x[1], reverse=True)
    # print("best_route_Ids", best_route_Ids)
    
    # Keep the top 5 routes
    top_5_routes = best_route_Ids[:5]
    # print("top_5_routes", top_5_routes)

    # Update the driver's routes in the dictionary
    drivers_routes[driver] = {'driver': driver, 'routes': [id for id,value in top_5_routes]}

# Convert the dictionary to a list for JSON serialization
result_list = list(drivers_routes.values())

# Write the result to driver.json
with open(os.path.join('results', 'driver2.json'), 'w') as outfile:
    json.dump(result_list, outfile, ensure_ascii=False, indent=2)

print(f"JSON driver data has been written to data/driver.json")

driver A
driver_indices[driver] [2, 8, 15, 22, 29, 39, 66, 80, 83, 89, 92, 105, 120, 134, 140, 158, 161, 165, 172, 174, 176, 202, 208, 211, 231, 241, 250, 253, 256, 257, 260, 271, 272, 278, 299, 305, 309, 316, 317, 353, 362, 367, 374, 401, 404, 430, 449, 455, 462, 471, 478, 480, 496, 512, 518, 520, 523, 526, 545, 555, 556, 579, 583, 594, 606, 607, 608, 617, 619, 651, 652, 653, 656, 672, 679, 691, 716, 732, 736, 748, 759, 782, 795, 804, 821, 826, 832, 834, 836, 844, 854, 864, 881, 891, 912, 913, 917, 920, 927, 936, 942, 943, 948, 967, 979, 980, 985, 998]
driver_standard[driver] ['s0', 's0', 's0', 's0', 's0', 's0', 's0', 's0', 's0', 's0', 's0', 's1', 's1', 's1', 's1', 's1', 's1', 's1', 's1', 's1', 's1', 's2', 's2', 's2', 's2', 's2', 's2', 's2', 's2', 's2', 's2', 's2', 's2', 's2', 's2', 's3', 's3', 's3', 's3', 's3', 's3', 's3', 's3', 's4', 's4', 's4', 's4', 's4', 's4', 's4', 's4', 's4', 's4', 's5', 's5', 's5', 's5', 's5', 's5', 's5', 's5', 's5', 's5', 's5', 's6', 's6', 's6', 's6', 's6', '

In [None]:
# driver_standard[driver]=='s0'

teesttt = np.array(driver_route_values)
np.mean(teesttt[(np.where(np.array(driver_standard['B'])=='s0')[0])])

0.6708515533827281

In [None]:
arrya_test=[0.8136104580151244, 0.7065116279069767, 0.7862841497880358, 0.808658256880734, 0.7468018667215702, 0.9802943969610637, 0.7415419829998469, 0.7698524898254347, 0.7021086927968885, 0.9703038674033149]

In [None]:
print("len(arrya_test)", len(arrya_test))
mean_test = np.mean(arrya_test)
print("mean_test", mean_test)

len(arrya_test) 10
mean_test 0.8025967789298989
