for each driver, creates a list of standard routes in that order so that the higher in the list a standard route is, the least the diversion of the driver will be, and 
the output of the program is: 

a file called driver.json that has for each driver, the 5 standard routes routes that if the driver does them, it minimizes the diversion. You can test this by considering as pool of standard routes those that originally the company has and also those that you recommend in the recStandard.json. The file driver.json has the following syntax:
[
	{driver:C, routes:[s10, s20, s2, s6, s10}}, 
	{driver:A, routes:[s1, s2, s22, s61, s102]}, 
….
]


In [68]:
import os
HOME = os.getcwd()
print('HOME: ',HOME)

import math
import json
import random
import pandas as pd
import sys
import lxml
import sklearn as sk
import numpy as np

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import HDBSCAN
from scipy.spatial.distance import cosine

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import plotly.graph_objects as go


HOME:  c:\Users\matti\Desktop\CODE\DataMiningProject23-24


In [69]:
STANDARD_FILE = 'standard_medium.json'
ACTUAL_FILE = 'actual_medium.json'

In [70]:
# load standard and actual data
with open(os.path.join('data',STANDARD_FILE)) as f:
    standard = json.load(f)

with open(os.path.join('data', ACTUAL_FILE)) as f:
    actual = json.load(f)

# load the data into a dataframe
dfStandard = pd.DataFrame(standard)
dfActual = pd.DataFrame(actual)

# print head of the dataframes
print(dfStandard.head())
print(dfActual.head())

# get the unique cities and items of the standard data
cities = []
items = []
drivers = []
longestRoute = 0
shortestRoute = np.inf
maxItemQuantity = 0
for index, s in dfStandard.iterrows():
    #print(s)
    idS = s['id']
    route = s['route']
    for trip in route:
        cities.append(trip['from']) 
        items.extend(trip['merchandise'].keys())
        maxItemQuantity = max(maxItemQuantity, max(trip['merchandise'].values()))
    if len(route) > 0:
        cities.append(route[-1]['to'])
        
    if len(route) > longestRoute:
        longestRoute = len(route)
        
    if len(route) < shortestRoute:
        shortestRoute = len(route)

for index, s in dfActual.iterrows():
    #print(s)
    idS = s['id']
    route = s['route']
    drivers.append(s['driver'])
    for trip in route:
        cities.append(trip['from'])
        items.extend(trip['merchandise'].keys())
        maxItemQuantity = max(maxItemQuantity, max(trip['merchandise'].values()))
        
    if len(route) > 0:
        cities.append(route[-1]['to'])
        
    if len(route) > longestRoute:
        longestRoute = len(route)
    
    if len(route) < shortestRoute:
        shortestRoute = len(route)

# find the unique cities and items
uniqueCities = sorted(list(set(cities)))
#uniqueCities.insert(0, 'NULL')          # add NULL city, for padding vectors with different lengths (trips in routes)
uniqueItems = sorted(list(set(items)))
uniqueDrivers = sorted(list(set(drivers)))

threeShingles = []

for i, c1 in enumerate(uniqueCities):
    for j, c2 in enumerate(uniqueCities):
        if i == j:
            continue
        for k, c3 in enumerate(uniqueCities):
            if j == k or i == k:
                continue
            threeShingles.append([c1, c2, c3])
            
permutations = math.perm(len(uniqueCities), 3)

print("Unique cities: ", uniqueCities)
print("Unique items: ", uniqueItems)
print("Unique drivers: ", uniqueDrivers)

print("Number of cities: ", len(uniqueCities))
print("Number of items: ", len(uniqueItems))
print("Number of drivers: ", len(uniqueDrivers))

print("\nLongest route: ", longestRoute)
print("Shortest route: ", shortestRoute)

print("\nMax item quantity: ", maxItemQuantity)

print("\nNumber of three-shingles: ", len(threeShingles))

print("\nThree-shingles: ", math.perm(len(uniqueCities), 3))
print("Three-shingles: ", math.comb(len(uniqueCities), 3))


standardIds = dfStandard['id'].tolist()
print("\nstandardIds: ", standardIds)
print("standardIds shape: ", np.array(standardIds).shape)
print("standardIds type: ", type(standardIds))


   id                                              route
0  s0  [{'from': 'Velletri', 'to': 'Bolzano-Bozen', '...
1  s1  [{'from': 'Parma', 'to': 'Anzio', 'merchandise...
2  s2  [{'from': 'Savona', 'to': 'Sassari', 'merchand...
3  s3  [{'from': 'Avellino', 'to': 'Moncalieri', 'mer...
4  s4  [{'from': 'Varese', 'to': 'Ravenna', 'merchand...
   id driver sroute                                              route
0  a0      F     s0  [{'from': 'Velletri', 'to': 'Bolzano-Bozen', '...
1  a1      E     s0  [{'from': 'Velletri', 'to': 'Bolzano-Bozen', '...
2  a2      D     s0  [{'from': 'Velletri', 'to': 'Bolzano-Bozen', '...
3  a3      A     s0  [{'from': 'Velletri', 'to': 'Bolzano-Bozen', '...
4  a4      D     s0  [{'from': 'Velletri', 'to': 'Asti', 'merchandi...
Unique cities:  ['Acerra', 'Ancona', 'Andria', 'Anzio', 'Asti', 'Avellino', 'Bisceglie', 'Bolzano-Bozen', 'Brescia', 'Carpi', 'Carrara', 'Catania', "Cava de' Tirreni", 'Cesena', 'Crotone', 'Gallarate', 'Grosseto', 'Lamezia Terme', '

In [71]:
def hashShingles(shingles, n):
    # hash shingles
    string = "" 
    for shingle in shingles:
        string += str(shingle) + "," # [45, 4, 8] -> "45,4,8,"
    
    return hash(string) #% n

def createShingles(df, k, uniqueCities, uniqueItems, longestRoute, maxItemQuantity, permutations):
    # create shingles for each route
    shingles = []
    for index, s in df.iterrows():
        idS = s['id']
        route = s['route']
        shingle = [index]
        citiesInRoute = [] # napoli roma milano teramo bergamo [10,4,5,48,12] [10,4,5] [4,5,48] [5,48,12]
        merchandiseInRoute = np.zeros(len(uniqueItems))
        for trip in route:
            citiesInRoute.append(uniqueCities.index(trip['from']))
            #merchandiseInRoute += np.array(list(trip['merchandise'].values()))
            for item, n in trip['merchandise'].items():
                merchandiseInRoute[uniqueItems.index(item)] += n
        if len(route) > 0:
            citiesInRoute.append(uniqueCities.index(route[-1]['to']))
        merchandiseInRoute = np.array(merchandiseInRoute) / (maxItemQuantity*len(route))
        
        hashedShingles = []
        for i in range(len(citiesInRoute)-k+1):
            # Q: is it correct to set the modulo for the hash function to the number of permutations?
            # A: yes, because we want to have a unique hash for each shingle
            # Q: would it be better to use a different hash function?
            # A: yes, because the modulo function is not a good hash function
            hashedShingles.append(hashShingles(citiesInRoute[i:i+k], permutations) )
        
        shingle.append(np.array(hashedShingles))
        
        shingle.append(merchandiseInRoute) # quantity hot encoding
        
        shingles.append(shingle)
        
    return shingles # [ index, [shingles], [merchandise] ]

In [72]:
standardSets = createShingles(dfStandard, k=3, uniqueCities=uniqueCities, uniqueItems=uniqueItems, longestRoute=longestRoute, maxItemQuantity=maxItemQuantity, permutations=permutations)
actualSets = createShingles(dfActual, k=3, uniqueCities=uniqueCities, uniqueItems=uniqueItems, longestRoute=longestRoute, maxItemQuantity=maxItemQuantity, permutations=permutations)

print("\nstandardSets", len(standardSets), "shape first element", standardSets[0][1].shape, standardSets[0])
print("\nactualSets", len(actualSets),  "shape first element", standardSets[0][1].shape, actualSets[0])

print("\nstandardSets:", len(standardSets))
print("actualSets:", len(actualSets))




assert len(standardSets[0]) == 3, "The length of the standard set is not equal to 3 (index, shingles, merchandise)"
assert len(standardSets[0][2]) == len(uniqueItems), "The length of the merchandise vector is not equal to the number of unique items"


standardSets 100 shape first element (39,) [0, array([-5869878232918110230,  5786580813146119747, -6055155626649329242,
        2336233510608621390, -1391445244834571428,  7411181989319450967,
       -6484768109089413921, -4864053067261812944, -7192692404044199867,
        4496214598380810472, -3910955356469902341, -6017651266501587447,
       -6594288062891706050,  -505467053753939884,  5483391440369257321,
       -1006001304773827846, -1534349821768088923, -1185767421347266877,
        7305943749240911221,  4659684731836094208, -2865891476083387661,
        1918518921692267490, -2957242224482532459, -6840622752454886010,
       -8272195487789062828,  8149538799014640058, -1370546119268022447,
        8824185091986805822, -2512402976817644600,  8315552236129821774,
        5394045379085790514, -1155786090034328550,  6589297883968820293,
       -7278328804880998021, -2220735306815704689, -7171796015360940435,
        7807102811461180758, -1269748780829999000, -4212108734806183374],
  

  merchandiseInRoute = np.array(merchandiseInRoute) / (maxItemQuantity*len(route))


In [73]:
def jaccard_similarity_matrix(matrix):
    intersection = np.dot(matrix, matrix.T)
    row_sums = matrix.sum(axis=1)
    union = row_sums[:, None] + row_sums - intersection
    union = np.where(union == 0, 1, union)  # avoid division by zero
    jaccard_similarity = intersection / union
    return jaccard_similarity

def jaccard_similarity_two_matrices(matrix1, matrix2):
    #intersection = np.dot(matrix, matrix.T)
    intersection = np.dot(matrix1, matrix2.T)
    row_sums1 = matrix1.sum(axis=1)
    row_sums2 = matrix2.sum(axis=1)
    union = row_sums1[:, None] + row_sums2 - intersection
    union = np.where(union == 0, 1, union)  # avoid division by zero
    jaccard_similarity = intersection / union
    return jaccard_similarity

In [74]:
def jaccard_similarity_matrix_merch(matrix):
    print("matrix", matrix.shape)
    min_matrix = np.minimum(matrix[:, None, :], matrix[None, :, :])
    sum_min_matrix = np.sum(min_matrix, axis=-1)
    print("sum_min_matrix", sum_min_matrix.shape)
    
    max_matrix = np.maximum(matrix[:, None, :], matrix[None, :, :])
    sum_max_matrix = np.sum(max_matrix, axis=-1)
    print("sum_max_matrix", sum_max_matrix.shape)
    
    jaccard_similarity = sum_min_matrix / sum_max_matrix
    return jaccard_similarity

def jaccard_similarity_matrices_merch(matrix1, matrix2):
    print("matrix1", matrix1.shape)
    print("matrix2", matrix2.shape)
    
    min_matrix = np.minimum(matrix1[:, None, :], matrix2[None, :, :])
    sum_min_matrix = np.sum(min_matrix, axis=-1)
    print("sum_min_matrix", sum_min_matrix.shape)
    
    max_matrix = np.maximum(matrix1[:, None, :], matrix2[None, :, :])
    sum_max_matrix = np.sum(max_matrix, axis=-1)
    print("sum_max_matrix", sum_max_matrix.shape)
    
    jaccard_similarity = sum_min_matrix / sum_max_matrix
    return jaccard_similarity


def create_binary_matrix(routeSets):
    # create binary matrix where each row represents a route
    uniqueShingles = list(set([shingle for route in routeSets for shingle in route[1]]))
    binaryMatrix = np.zeros((len(routeSets), len(uniqueShingles)))
    for i, route in enumerate(routeSets):
        for shingle in route[1]:
            binaryMatrix[i][uniqueShingles.index(shingle)] = 1
    return binaryMatrix

def create_binary_matrices(routeSet1, routeSet2):
    # create binary matrix where each row represents a route
    uniqueShinglesBoth = list(set([shingle for route in routeSet1 for shingle in route[1]] + [shingle for route in routeSet2 for shingle in route[1]]))
    binaryMatrix1 = np.zeros((len(routeSet1), len(uniqueShinglesBoth)))
    binaryMatrix2 = np.zeros((len(routeSet2), len(uniqueShinglesBoth)))
    for i, route in enumerate(routeSet1):
        for shingle in route[1]:
            binaryMatrix1[i][uniqueShinglesBoth.index(shingle)] = 1
            
    for i, route in enumerate(routeSet2):
        for shingle in route[1]:
            binaryMatrix2[i][uniqueShinglesBoth.index(shingle)] = 1
    return binaryMatrix1, binaryMatrix2

In [75]:
# binary matrix where each row represents a route
route_actual_matrix, route_standard_matrix = create_binary_matrices(actualSets,standardSets)

# compute Jaccard similarity for routes
matrix_sim__route_actual_standard = jaccard_similarity_two_matrices(route_actual_matrix, route_standard_matrix)

print("matrix_sim_actual_standard", matrix_sim__route_actual_standard.shape)
print("matrix_sim_actual_standard[0]", matrix_sim__route_actual_standard[1])


# 
merch_actual_matrix = np.array([s[2] for s in actualSets])
merch_standard_matrix = np.array([s[2] for s in standardSets])

# compute Jaccard similarity for merchandise
matrix_sim_merch_actual_standard = jaccard_similarity_matrices_merch(merch_actual_matrix, merch_standard_matrix)



SetsSimilarities = (matrix_sim__route_actual_standard + matrix_sim_merch_actual_standard) / 2
SetsSimilarities=np.nan_to_num(SetsSimilarities, nan=0)

print("SetsSimilarities", SetsSimilarities.shape)
print("SetsSimilarities[0]", SetsSimilarities[0])

matrix_sim_actual_standard (2500, 100)
matrix_sim_actual_standard[0] [0.80952381 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0

In [76]:
max_value_index = np.argmax(SetsSimilarities, axis=1)
max_value = np.max(SetsSimilarities, axis=1)

# print("Index of the biggest value for each row:", max_value_index)
# print("Value of the biggest value for each row:", max_value)

# print("len(max_value_index)", len(max_value_index))
# print("len(max_value)", len(max_value))

# [index for index, s in dfActual.iterrows() if s['driver'] == driver]
# uniqueDrivers
driver_indices = {}

for i, s in dfActual.iterrows():
    driver = s['driver']
    
    # Check if the driver is already in the dictionary
    if driver in driver_indices:
        # If yes, append the index to the existing array
        driver_indices[driver].append(i)
    else:
        # If not, create a new array with the current index
        driver_indices[driver] = [i]

print("driver_indices", driver_indices)
print("len(driver_indices)", len(driver_indices))

driver_indices {'F': [0, 23, 45, 49, 53, 54, 62, 70, 82, 85, 101, 111, 113, 146, 155, 158, 170, 182, 185, 192, 208, 213, 217, 220, 250, 258, 291, 298, 306, 318, 328, 331, 342, 350, 353, 371, 436, 439, 453, 473, 478, 481, 484, 495, 496, 500, 504, 510, 511, 525, 546, 553, 565, 574, 576, 586, 591, 596, 608, 616, 621, 625, 628, 638, 649, 650, 657, 664, 678, 680, 682, 684, 690, 694, 711, 714, 715, 723, 724, 732, 736, 760, 786, 807, 809, 811, 812, 817, 824, 826, 839, 840, 860, 867, 869, 881, 905, 907, 911, 914, 946, 948, 950, 963, 973, 983, 991, 995, 1016, 1031, 1032, 1046, 1055, 1057, 1074, 1091, 1095, 1102, 1133, 1178, 1179, 1187, 1188, 1208, 1217, 1218, 1224, 1231, 1237, 1238, 1245, 1249, 1255, 1257, 1260, 1263, 1278, 1282, 1291, 1311, 1316, 1320, 1348, 1362, 1374, 1375, 1382, 1395, 1404, 1409, 1411, 1414, 1420, 1427, 1428, 1429, 1434, 1445, 1456, 1457, 1466, 1497, 1508, 1518, 1534, 1542, 1543, 1545, 1550, 1553, 1558, 1571, 1584, 1591, 1594, 1596, 1605, 1612, 1636, 1670, 1693, 1702, 1706,

In [78]:
# Create a dictionary of all drivers' routes
drivers_routes = {}

for driver in uniqueDrivers:
    print("driver", driver)
    # print("driver_indices[driver]", driver_indices[driver])
    # print("len(driver_indices[driver])", len(driver_indices[driver]))

    driver_standard_index = max_value_index[driver_indices[driver]]
    driver_max_value = max_value[driver_indices[driver]]
    # print("driver_max_value_index", driver_standard_index)

    # Assuming driver_max_value_index is a NumPy array
    unique_values_index = np.unique(driver_standard_index)
    # print("unique_values", unique_values_index)


    # Calculate the mean for each unique value
    means = [np.mean(driver_max_value[driver_standard_index == idx]) for idx in unique_values_index]
    # print("means", means)

    best_route_Ids = []
    # Print the results for each driver
    for idx, mean in zip(unique_values_index, means):
        # print(f"Driver: {driver}, Unique Value: {idx}, Mean: {mean}")
        best_route_Ids.append([standardIds[idx], mean])

    # print("best_route_Ids", best_route_Ids)
    # Sort the routes by their mean
    best_route_Ids.sort(key=lambda x: x[1], reverse=True)
    # print("best_route_Ids", best_route_Ids)
    
    # Keep the top 5 routes
    top_5_routes = best_route_Ids[:5]
    # print("top_5_routes", top_5_routes)

    # Update the driver's routes in the dictionary
    drivers_routes[driver] = {'driver': driver, 'routes': [id for id,value in top_5_routes]}

# Convert the dictionary to a list for JSON serialization
result_list = list(drivers_routes.values())

# Write the result to driver.json
with open(os.path.join('results', 'driver.json'), 'w') as outfile:
    json.dump(result_list, outfile, indent=2)

print(f"JSON driver data has been written to data/driver.json")

driver A
driver B
driver C
driver D
driver E
driver F
driver G
driver H
driver I
driver J
JSON driver data has been written to data/driver.json


## ERIC's IDEA