For each driver, the program creates a list of standard routes, ordered so that the higher a standard route appears in the list, the smaller the driver's diversion will be.
The output of the program is:

a file called driver.json that contains, for each driver, the 5 standard routes that minimize the diversion when that driver drives them. You can test this by taking as the pool of standard routes both those that the company originally has and those that you recommend in recStandard.json. The file driver.json has the following syntax:
[
	{driver:C, routes:[s10, s20, s2, s6, s10]}, 
	{driver:A, routes:[s1, s2, s22, s61, s102]}, 
….
]


In [1]:
import os
HOME = os.getcwd()
print('HOME: ',HOME)

import json
import random
import pandas as pd
import sys
import lxml
import sklearn as sk
import numpy as np

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import plotly.graph_objects as go


HOME:  c:\Users\matti\Desktop\CODE\DataMiningProject23-24


In [2]:
# File names of the JSON datasets holding the standard routes and the actual
# (driven) routes.
STANDARD_FILE = 'standard_big.json'
ACTUAL_FILE = 'actual_big.json'

In [3]:
# load standard and actual data
with open(os.path.join('data', STANDARD_FILE)) as f:
    standard = json.load(f)

with open(os.path.join('data', ACTUAL_FILE)) as f:
    actual = json.load(f)


# load the data into a dataframe
dfStandard = pd.DataFrame(standard)
dfActual = pd.DataFrame(actual)


# print head of the dataframes
print(dfStandard.head())
print(dfActual.head())


# Collect the vocabulary (cities, items, drivers) and the route-length range
# over BOTH datasets, so the same encoding can be applied to standard and
# actual routes.
cities = []
items = []
drivers = []
longestRoute = 0
shortestRoute = np.inf

# Only the actual routes carry a 'driver' column, hence the flag.
for df, hasDriver in ((dfStandard, False), (dfActual, True)):
    for index, s in df.iterrows():
        route = s['route']
        if hasDriver:
            drivers.append(s['driver'])

        for trip in route:
            # Record both endpoints of every trip: the feature extraction
            # looks up trip['from'] AND trip['to'] for each trip, so a route
            # whose legs are not contiguous must not leave a 'to' city out of
            # the vocabulary.
            cities.append(trip['from'])
            cities.append(trip['to'])
            items.extend(trip['merchandise'].keys())

        # Route length is measured in number of trips.
        longestRoute = max(longestRoute, len(route))
        shortestRoute = min(shortestRoute, len(route))


# find the unique cities and items
# (sorted so that the position of a value is reproducible between runs)
uniqueCities = sorted(set(cities))
uniqueItems = sorted(set(items))
uniqueDrivers = sorted(set(drivers))

print("Unique cities: ", uniqueCities)
print("Unique items: ", uniqueItems)
print("Unique drivers: ", uniqueDrivers)

print("Number of cities: ", len(uniqueCities))
print("Number of items: ", len(uniqueItems))
print("Number of drivers: ", len(uniqueDrivers))

print("Longest route: ", longestRoute)
print("Shortest route: ", shortestRoute)


   id                                              route
0  s0  [{'from': 'Turin', 'to': 'Cinisello Balsamo', ...
1  s1  [{'from': 'Cava de' Tirreni', 'to': 'Coriglian...
2  s2  [{'from': 'Pordenone', 'to': 'Savona', 'mercha...
3  s3  [{'from': 'L’Aquila', 'to': 'Andria', 'merchan...
4  s4  [{'from': 'Andria', 'to': 'Cava de' Tirreni', ...
   id driver sroute                                              route
0  a0      J     s0  [{'from': 'Turin', 'to': 'Cinisello Balsamo', ...
1  a1      I     s0  [{'from': 'Turin', 'to': 'Cinisello Balsamo', ...
2  a2      C     s0  [{'from': 'Turin', 'to': 'Cinisello Balsamo', ...
3  a3      H     s0  [{'from': 'Turin', 'to': 'Cinisello Balsamo', ...
4  a4      H     s0  [{'from': 'Turin', 'to': 'Cinisello Balsamo', ...
Unique cities:  ['Afragola', 'Agrigento', 'Ancona', 'Andria', 'Anzio', 'Arezzo', 'Asti', 'Barletta', 'Bitonto', 'Bolzano-Bozen', 'Brescia', 'Busto Arsizio', 'Carpi', 'Caserta', 'Catanzaro', "Cava de' Tirreni", 'Cerignola', 'Cinisell

In [4]:
def extractFeatureVector(df, uniqueCities, uniqueItems, longestRoute):
    """Encode every route of *df* as a flat, zero-padded numeric vector.

    Each trip contributes ``2 + len(uniqueItems)`` values, in this order:
    the position of ``trip['from']`` in *uniqueCities*, the position of
    ``trip['to']`` in *uniqueCities*, and then one quantity per entry of
    *uniqueItems* (0 for merchandise the trip does not carry).

    Args:
        df: DataFrame with a ``'route'`` column; each cell is a list of trips,
            a trip being a dict with ``'from'``, ``'to'`` and ``'merchandise'``
            (mapping item name -> quantity).
        uniqueCities: sequence of city names; a city is encoded by its
            (first) position in this sequence.
        uniqueItems: sequence of item names; defines the order of the
            quantity slots.
        longestRoute: number of trips every vector is padded to.

    Returns:
        A list with one list per row of *df*. City positions are Python ints,
        quantities and padding are NumPy floats. Routes longer than
        *longestRoute* are NOT truncated, so their vectors are longer.

    Raises:
        ValueError: if a trip references a city or item that is missing from
            *uniqueCities* / *uniqueItems*.

    NOTE: padding uses 0, which is also the code of the first city, so the
    encoding alone cannot tell "padding" from "first city".
    """
    # Build the name -> position maps once instead of running a linear
    # list.index() search for every city and item of every trip.
    # setdefault keeps the FIRST position of a repeated name, like list.index.
    cityIndex = {}
    for position, city in enumerate(uniqueCities):
        cityIndex.setdefault(city, position)
    itemIndex = {}
    for position, item in enumerate(uniqueItems):
        itemIndex.setdefault(item, position)

    numItems = len(uniqueItems)
    targetLength = longestRoute * (2 + numItems)

    featureVectors = []
    for route in df['route']:
        featureVector = []
        for trip in route:
            quantities = np.zeros(numItems)
            try:
                # add the city of departure, then the city of arrival
                featureVector.append(cityIndex[trip['from']])
                featureVector.append(cityIndex[trip['to']])
                # add the items
                for item, n in trip['merchandise'].items():
                    quantities[itemIndex[item]] = n
            except KeyError as err:
                # Keep the exception type list.index() used to raise.
                raise ValueError(f"{err.args[0]!r} is not in list") from None
            featureVector.extend(quantities)

        # pad the feature vector with zeros up to the common length
        missing = targetLength - len(featureVector)
        if missing > 0:
            featureVector.extend(np.zeros(missing))
        featureVectors.append(featureVector)
    return featureVectors

In [5]:
# Encode the standard and the actual routes as fixed-length vectors and report
# a few sanity statistics about the result.


def _describe(label, preview, value):
    """Print *preview*, then the array shape and the Python type of *value*."""
    print(f"{label}: ", preview)
    print(f"{label} shape: ", np.array(value).shape)
    print(f"{label} type: ", type(value))


standardVectors = extractFeatureVector(dfStandard, uniqueCities, uniqueItems, longestRoute)
_describe("standardVectors", standardVectors[0], standardVectors)

actualVectors = extractFeatureVector(dfActual, uniqueCities, uniqueItems, longestRoute)
_describe("actualVectors", actualVectors[0], actualVectors)

print('\n\n')

# Ids of the standard routes, and for every actual route the id of the
# standard route stored in its 'sroute' column.
standardIds = dfStandard['id'].tolist()
_describe("standardIds", standardIds, standardIds)

actualIds = dfActual['sroute'].tolist()
_describe("actualIds", actualIds, actualIds)


# Despite the names, these are the min/max lengths of the encoded vectors
# (number of features), not the number of trips in a route.
vectorLengths = [len(vector) for vector in standardVectors + actualVectors]
minRouteLength = np.min(vectorLengths)
maxRouteLength = np.max(vectorLengths)
print(standardVectors)
print("Min route length: ", minRouteLength)
print("Max route length: ", maxRouteLength)

# From here on both collections are 2-D float arrays.
standardVectors = np.array(standardVectors, dtype=float)
actualVectors = np.array(actualVectors, dtype=float)


# shape: (numRoutes, longestRoute * (2 + number of unique items)),
# the 2 being the 'from' and 'to' city codes of each trip
print("standard shape", standardVectors.shape)
print("actual shape", actualVectors.shape)


expectedWidth = longestRoute * (2 + len(uniqueItems))
assert standardVectors.shape[1] == actualVectors.shape[1], "The number of features is not the same between standard and actual data"
assert standardVectors.shape[1] == expectedWidth, "The number of features does not match the expected number, possibly due to a different longest route"

standardVectors:  [45, 17, 1.0, 29.0, 0.0, 12.0, 2.0, 28.0, 22.0, 30.0, 5.0, 0.0, 17, 24, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.0, 0.0, 0.0, 24, 3, 14.0, 15.0, 1.0, 15.0, 0.0, 0.0, 23.0, 23.0, 16.0, 0.0, 3, 31, 11.0, 13.0, 17.0, 27.0, 4.0, 5.0, 12.0, 0.0, 30.0, 0.0, 31, 19, 10.0, 0.0, 12.0, 30.0, 0.0, 23.0, 0.0, 0.0, 0.0, 20.0, 19, 29, 0.0, 12.0, 0.0, 11.0, 0.0, 19.0, 21.0, 21.0, 29.0, 13.0, 29, 47, 24.0, 13.0, 3.0, 21.0, 24.0, 26.0, 4.0, 0.0, 1.0, 5.0, 47, 1, 0.0, 19.0, 6.0, 0.0, 14.0, 27.0, 30.0, 25.0, 18.0, 17.0, 1, 41, 3.0, 27.0, 12.0, 0.0, 3.0, 0.0, 14.0, 19.0, 29.0, 26.0, 41, 34, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 6.0, 23.0, 0.0, 34, 35, 0.0, 0.0, 24.0, 17.0, 0.0, 30.0, 29.0, 0.0, 14.0, 19.0, 35, 36, 20.0, 12.0, 3.0, 0.0, 0.0, 11.0, 0.0, 0.0, 0.0, 0.0, 36, 10, 27.0, 11.0, 13.0, 0.0, 5.0, 30.0, 1.0, 0.0, 6.0, 0.0, 10, 46, 0.0, 0.0, 0.0, 0.0, 0.0, 12.0, 0.0, 0.0, 0.0, 0.0, 46, 14, 4.0, 30.0, 14.0, 27.0, 13.0, 21.0, 4.0, 29.0, 10.0, 9.0, 14, 20, 0.0, 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

In [9]:
def get_unique(seq, idfun=None):
    """Return the items of *seq* without duplicates, keeping first-seen order.

    Two items count as duplicates when ``idfun`` maps them to the same
    marker; when *idfun* is omitted the item itself is the marker. Markers
    must be hashable.
    """
    key_of = idfun if idfun is not None else (lambda value: value)

    known_markers = set()
    unique_items = []
    for element in seq:
        marker = key_of(element)
        if marker not in known_markers:
            known_markers.add(marker)
            unique_items.append(element)
    return unique_items

In [11]:
def f7(seq):
    """Return a list of the distinct (hashable) items of *seq* in first-seen order."""
    # dict keys are unique and keep insertion order, so this drops every
    # repeated item while preserving the position of its first occurrence.
    return list(dict.fromkeys(seq))

In [30]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# For every driver: match each route the driver actually drove to its most
# similar standard route (cosine similarity of the encoded vectors), rank those
# matches by similarity and keep the 5 best distinct standard routes.

# Create a dictionary of all drivers' routes
drivers_routes = {}

# Driver of every actual route, aligned row by row with actualVectors.
actual_drivers = dfActual['driver'].to_numpy()

for driver in uniqueDrivers:
    print(f'driver {driver}')

    # Row positions of the actual routes of this driver (one vectorised
    # comparison instead of re-scanning the dataframe with iterrows()).
    index_array = np.flatnonzero(actual_drivers == driver)
    if index_array.size == 0:
        # Nothing to rank for a driver without recorded routes.
        drivers_routes[driver] = {'driver': driver, 'routes': []}
        continue

    # One call gives the whole (driver routes x standard routes) similarity
    # matrix; each row is one actual route.
    sims = cosine_similarity(actualVectors[index_array], standardVectors)

    # Best-matching standard route of every actual route, and its similarity.
    best_route_index = sims.argmax(axis=1)
    best_sims = sims[np.arange(index_array.size), best_route_index]

    # Sort the matches by similarity in descending order; the stable sort keeps
    # equally similar matches in their original order.
    order = np.argsort(-best_sims, kind='stable')

    # Keep each standard route once, at the rank of its best match.
    best_route_Ids = f7(standardIds[position] for position in best_route_index[order])

    # Keep the top 5 routes
    top_5_routes = best_route_Ids[:5]

    # Update the driver's routes in the dictionary
    drivers_routes[driver] = {'driver': driver, 'routes': list(top_5_routes)}


# Convert the dictionary to a list for JSON serialization
result_list = list(drivers_routes.values())

# Write the result to driver.json (creating the output directory if needed)
output_dir = 'results'
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, 'driver.json')
with open(output_path, 'w') as outfile:
    json.dump(result_list, outfile, indent=2)

# Report the path that was really written.
print(f"JSON driver data has been written to {output_path}")


driver A
driver B
driver C
driver D
driver E
driver F
driver G
driver H
driver I
driver J
JSON driver data has been written to data/driver.json


In [27]:
# NOTE(review): scratch check. list(zip(*best_route_Ids))[0] collects the first
# component of every entry, so this line expects best_route_Ids to hold
# (route id, similarity) pairs; confirm the variable is still in that form
# before re-running this cell.
res = f7(list(zip(*best_route_Ids))[0])
# Number of distinct route ids found.
len(res)



100

| name | # drivers | actual | standard | recommended | time (s) | notes |
|------|-----------|--------|----------|-------------|----------|-------|
| small | 10 | (1000, 588) | (10, 588) | (10, 588) | 7.5 | |
| big | 10 | (20000, 588) | (100, 588) | (100, 588) | 1400 | 140 s per driver |
| Row 3, Col 1 | Row 3, Col 2 | Row 3, Col 3 | | | | |




distance between every actual route and the nearby standard routes (for each driver)
top 5 sorted by distance
fitness function for the top 5


test how it scales
test at scale with 1000 drivers


**remove duplicates before and after in the final results**