for each driver, creates a list of standard routes ordered so that the higher a standard route is in the list, the smaller the driver's diversion will be, and 
the output of the program is: 

a file called driver.json that has, for each driver, the 5 standard routes that minimize the diversion if the driver does them. You can test this by considering as the pool of standard routes those that the company originally has and also those that you recommend in recStandard.json. The file driver.json has the following syntax:
[
	{driver:C, routes:[s10, s20, s2, s6, s10]}, 
	{driver:A, routes:[s1, s2, s22, s61, s102]}, 
….
]


In [1]:
import os
# Remember the directory the notebook was started from and echo it for reference.
HOME = os.getcwd()
print('HOME: ', HOME)

import json
import random
import pandas as pd
import sys
import lxml
import sklearn as sk
import numpy as np

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import plotly.graph_objects as go


HOME:  c:\Users\matti\Desktop\CODE\DataMiningProject23-24


In [2]:
# Names of the input datasets; they are joined with the 'data' directory when loaded.
ACTUAL_FILE = 'actual_medium.json'      # routes actually driven by the drivers
STANDARD_FILE = 'standard_medium.json'  # standard routes defined by the company

In [5]:
# Load the standard and actual route datasets from the 'data' directory.
# The encoding is passed explicitly: JSON text is UTF-8, whereas open() would
# otherwise decode with the platform's locale encoding.
with open(os.path.join('data', STANDARD_FILE), encoding='utf-8') as f:
    standard = json.load(f)

with open(os.path.join('data', ACTUAL_FILE), encoding='utf-8') as f:
    actual = json.load(f)

# load the data into a dataframe
dfStandard = pd.DataFrame(standard)
dfActual = pd.DataFrame(actual)

# print head of the dataframes
print(dfStandard.head())
print(dfActual.head())

# Collect every city and merchandise item mentioned in the standard and the
# actual routes, together with the length (number of trips) of each route.
cities = []
items = []
routeLengths = []
for frame in (dfStandard, dfActual):
    for _, row in frame.iterrows():
        route = row['route']
        for trip in route:
            cities.append(trip['from'])
            items.extend(trip['merchandise'].keys())
        # The destination of the last trip is the only city that never
        # appears as a 'from'.
        if len(route) > 0:
            cities.append(route[-1]['to'])
        routeLengths.append(len(route))

# One driver entry per actual route (duplicates are kept on purpose; the
# de-duplicated list is uniqueDrivers below).
drivers = [row['driver'] for _, row in dfActual.iterrows()]

# The defaults reproduce the values reported when there are no routes at all.
longestRoute = max(routeLengths, default=0)
shortestRoute = min(routeLengths, default=np.inf)

# find the unique cities, items and drivers
uniqueCities = sorted(set(cities))
uniqueItems = sorted(set(items))
uniqueDrivers = sorted(set(drivers))

print("Unique cities: ", uniqueCities)
print("Unique items: ", uniqueItems)
print("Unique drivers: ", uniqueDrivers)

print("Number of cities: ", len(uniqueCities))
print("Number of items: ", len(uniqueItems))
print("Number of drivers: ", len(uniqueDrivers))

print("Longest route: ", longestRoute)
print("Shortest route: ", shortestRoute)


   id                                              route
0  s0  [{'from': 'Velletri', 'to': 'Bolzano-Bozen', '...
1  s1  [{'from': 'Parma', 'to': 'Anzio', 'merchandise...
2  s2  [{'from': 'Savona', 'to': 'Sassari', 'merchand...
3  s3  [{'from': 'Avellino', 'to': 'Moncalieri', 'mer...
4  s4  [{'from': 'Varese', 'to': 'Ravenna', 'merchand...
   id driver sroute                                              route
0  a0      F     s0  [{'from': 'Velletri', 'to': 'Bolzano-Bozen', '...
1  a1      E     s0  [{'from': 'Velletri', 'to': 'Bolzano-Bozen', '...
2  a2      D     s0  [{'from': 'Velletri', 'to': 'Bolzano-Bozen', '...
3  a3      A     s0  [{'from': 'Velletri', 'to': 'Bolzano-Bozen', '...
4  a4      D     s0  [{'from': 'Velletri', 'to': 'Asti', 'merchandi...
Unique cities:  ['Acerra', 'Ancona', 'Andria', 'Anzio', 'Asti', 'Avellino', 'Bisceglie', 'Bolzano-Bozen', 'Brescia', 'Carpi', 'Carrara', 'Catania', "Cava de' Tirreni", 'Cesena', 'Crotone', 'Gallarate', 'Grosseto', 'Lamezia Terme', '

In [20]:
# Pool of candidate standard routes. Only the company's original routes are
# used for now.
# TODO: also load data/recStandard.json and append its routes to this pool.
all_standard_routes = standard #+ recommended_routes
drivers_routes = {}

# Number of standard routes reported for each driver.
TOP_ROUTES_PER_DRIVER = 5
# Destination of the generated report.
OUTPUT_FILE = 'data/driver.json'

# TODO: find the best std routes for each driver.
# NOTE: an earlier draft grouped, per driver, the standard routes assigned to
# that driver in the actual data and ranked them by route length; it was
# abandoned in favour of the placeholder below.
# Placeholder ranking shared by every driver until the real ranking exists.
routes = [
    {
        "id": "s0",
        "route": [
            {
                "from": "Velletri",
                "to": "Bolzano-Bozen",
                "merchandise": {},
            },
        ],
    },
]

# `drivers` holds one entry per actual route, so it contains duplicates.
# dict.fromkeys() visits each driver once while keeping first-appearance
# order, which is the order the entries end up in the output file.
for driver in dict.fromkeys(drivers):
    # Take the top 5 routes for each driver
    top_routes = routes[:TOP_ROUTES_PER_DRIVER]

    # Update the driver's routes in the dictionary
    drivers_routes[driver] = {'driver': driver, 'routes': [r['id'] for r in top_routes]}


# Convert the dictionary to a list for JSON serialization
result_list = list(drivers_routes.values())

# Write the result to driver.json
with open(OUTPUT_FILE, 'w', encoding='utf-8') as json_file:
    json.dump(result_list, json_file, indent=2)

print(f"JSON driver data has been written to {OUTPUT_FILE}")


JSON driver data has been written to data/driver.json
