# Data Mining


In [62]:
## get home directory
import os
# The kernel's current working directory is used as the project root:
# DATA_DIR in the constants cell is built by joining 'data' onto HOME,
# so the notebook is expected to be started from the project folder.
HOME = os.getcwd()
print('HOME: ',HOME)

HOME:  /Users/ericsuardi/Desktop/DataMiningProject23-24


## Create the dataset

In [63]:
import json
import random
import pandas as pd
import sys
import lxml
import string
import numpy as np

### Constants of the project

In [72]:
# --- Reproducibility ---------------------------------------------------------
# Every sampling step in this notebook goes through the `random` module, so
# seeding it here makes a fresh "Restart & Run All" regenerate the same files.
# Set RANDOM_SEED to None to get a different dataset on every run.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# --- Size of the generated dataset -------------------------------------------
MAX_ITEM_QUANTITY = 30      # upper bound (inclusive) for the quantity of a single item in a trip
SUBSAMPLE_CITY = 50         # cities kept after scraping; must be below the scraped total (original note: < 140)
SUBSAMPLE_ITEM = 20         # items kept from list_of_items; must be below its length (original note: < 230)
NUM_DRIVER = 10             # number of driver names to generate (original note: < 100)

NUM_STANDARD_ROUTES = 100   # number of standard routes to generate
FORWARD_EXPANSION = 25      # number of actual routes derived from each standard route
DIVERGENCE = 0.1            # probability of divergence from standard route (an alteration)

# --- Output location ---------------------------------------------------------
DATA_DIR = os.path.join(HOME,'data')
STANDARD_FILENAME = "standard_medium.json"
ACTUAL_FILENAME = "actual_medium.json"

### Web scraping to collect cities and items

#### Boilerplate functions

In [73]:
def generate_sequence_drivers_names(total):
    """
    Yield `total` driver names built from the uppercase ASCII alphabet.

    With at most 26 drivers the names are the bare letters 'A', 'B', ...
    With more, the alphabet is cycled and each name gets a zero-padded cycle
    number as suffix: 'A_0' ... 'Z_0', 'A_1', ...  The padding width is the
    number of digits of ``total // 26``.

    Parameters
    ----------
    total : int
        Number of names to produce.

    Yields
    ------
    str
        The next driver name.
    """
    letters = string.ascii_uppercase
    base = len(letters)
    width = len(str(total // base))
    needs_suffix = total > base

    for position in range(total):
        cycle, offset = divmod(position, base)
        name = letters[offset]
        if needs_suffix:
            # zero-pad the cycle number so every suffix has the same width
            name = name + '_' + str(cycle).zfill(width)
        yield name

In [74]:
# Scrape the Wikipedia page and keep the "City" column of the first table found on it.
list_of_cities = (
    pd.read_html('https://en.wikipedia.org/wiki/List_of_cities_in_Italy')[0]
    .City
    .to_list()
)

# 230 products
list_of_items = [
    'Air Freshener', 'Air Mattress', 'Air Purifier', 'Aluminum Foil', 'Apple', 'Baby Formula', 'Baby Wipes', 'Backpack', 'Bacon', 'Banana', 'Band-Aids', 'Bar Stools', 'Barware Set', 'Basketball', 'Batteries', 'Beach Chair', 'Beach Towel', 'Bed Sheets', 'Bedside Table', 'Beef', 'Bell Pepper', 'Bike', 'Biscuits', 'Blankets', 'Bleach', 'Blender', 'Blueberries', 'Bluetooth Speaker', 'Board Games', 'Bookshelf', 'Bread', 'Broccoli', 'Broom', 'Camera', 'Camp Stove', 'Can Opener', 'Candles', 'Canned Beans', 'Canned Soup', 'Canned Vegetables', 'Carrot', 'Cat Litter', 'Cereal', 'Chairs', 'Charcoal', 'Charging Cable', 'Cheese', 'Chewing Gum', 'Chicken', 'Chips', 'Chocolate', 'Coffee', 'Coffee Maker', 'Coffee Table', 'Cookies', 'Cookware Set', 'Cooler', 'Cooling Fan', 'Corkscrew', 'Cotton Swabs', 'Couch', 'Crackers', 'Cucumber', 'Curtains', 'Cutlery Set', 'Cutting Board', 'Desk', 'Diapers', 'Dining Table', 'Dish Soap', 'Dish Towels', 'Dishwasher Detergent', 'Diving Mask', 'Dresser', 'Dumbbells', 'Dustpan', 'Eggs', 'Envelopes', 'Exercise Ball', 'Fabric Softener', 'Fertilizer', 'Firewood', 'First Aid Kit', 'Fishing Rod', 'Fitness Tracker', 'Flashlight', 'Football', 'Frisbee', 'Garbage Bags', 'Garden Hose', 'Gloves', 'Glue', 'Goggles', 'Granola Bars', 'Grapes', 'Grill', 'Hand Sanitizer', 'Handbag', 'Hangers', 'Hat', 'Headphones', 'Heater', 'Helmet', 'Hiking Boots', 'Humidifier', 'Ice Cream', 'Ice Skates', 'Incense', 'Insect Repellent', 'Iron', 'Ironing Board', 'Jelly', 'Juice', 'Jump Rope', 'Ketchup', 'Knee Pads', 'Lamps', 'Laptop', 'Laundry Detergent', 'Lawn Mower', 'Lettuce', 'Light Bulbs', 'Lighter Fluid', 'Lock', 'Luggage', 'Markers', 'Matches', 'Mayonnaise', 'Measuring Cups', 'Microwave', 'Milk', 'Mirrors', 'Mixing Bowls', 'Mop', 'Mustard', 'Napkins', 'Notepads', 'Office Chair', 'Onion', 'Orange', 'Outdoor Furniture', 'Pain Relievers', 'Paper Towels', 'Pasta', 'Peanut Butter', 'Peanuts', 'Peeler', 'Pens', 'Pet Food', 'Phone Charger', 'Pickles', 'Picture Frames', 'Pillows', 
'Pizza', 'Plastic Wrap', 'Playing Cards', 'Portable Charger', 'Potato', 'Potted Plants', 'Printer Paper', 'Pruning Shears', 'Puzzle', 'Rain Jacket', 'Raincoat', 'Resistance Bands', 'Rice', 'Rollerblades', 'Rugs', 'Salad Dressing', 'Salmon', 'Sandals', 'Sausages', 'Scarf', 'Scissors', 'Shampoo', 'Shrimp', 'Skateboard', 'Ski Equipment', 'Sleeping Bag', 'Smartwatch', 'Snorkel', 'Snowboard', 'Soap', 'Soccer Ball', 'Soda', 'Spatula', 'Spinach', 'Stationary Bike', 'Storage Bins', 'Strawberries', 'Sun Hat', 'Sunglasses', 'Sunscreen', 'Surfboard', 'Swimsuit', 'TV Stand', 'Tackle Box', 'Tape', 'Tea', 'Tent', 'Thermometer', 'Throw Pillows', 'Tire Pump', 'Tissues', 'Toaster', 'Toilet Paper', 'Tomato', 'Tool Kit', 'Toothpaste', 'Tote Bag', 'Towels', 'Trail Mix', 'Trash Bags', 'Trash Cans', 'Travel Adapter', 'Travel Pillow', 'Treadmill', 'Tripod', 'Tweezers', 'Umbrella', 'Vacuum Cleaner', 'Wallet', 'Water Bottle', 'Watering Can', 'Watermelon', 'Wine Glasses', 'Winter Coat', 'Yoga Mat', 'Yogurt', 'Ziploc Bags'
]

# Driver names are generated sequentially, not randomly
# (bare letters, or letter + '_' + cycle number when more than 26 are requested).
drivers = [name for name in generate_sequence_drivers_names(NUM_DRIVER)]

## Subsampling
# Each sample size must be positive and strictly smaller than the pool it is drawn from.
assert 0 < SUBSAMPLE_CITY < len(list_of_cities), "SUBSAMPLE_CITY must be less than {}".format(len(list_of_cities))
assert 0 < SUBSAMPLE_ITEM < len(list_of_items), "SUBSAMPLE_ITEM must be less than {}".format(len(list_of_items))

# Cities are sampled first, then items; from here on both names hold the subsamples.
list_of_cities = random.sample(list_of_cities, SUBSAMPLE_CITY)
list_of_items = random.sample(list_of_items, SUBSAMPLE_ITEM)

print('number of cities: ', len(list_of_cities))
print("number of products: ", len(list_of_items))
print('number of drivers: ', len(drivers))

drivers:  ['A_0', 'B_0', 'C_0', 'D_0', 'E_0', 'F_0', 'G_0', 'H_0', 'I_0', 'J_0', 'K_0', 'L_0', 'M_0', 'N_0', 'O_0', 'P_0', 'Q_0', 'R_0', 'S_0', 'T_0', 'U_0', 'V_0', 'W_0', 'X_0', 'Y_0', 'Z_0', 'A_1', 'B_1', 'C_1', 'D_1']
number of cities:  50
number of products:  20
number of drivers:  30


In [67]:
def create_merchandise_dict(items):
    """
    Build a random merchandise dictionary for one trip.

    A non-empty random subset of `items` is drawn (its size comes from
    ``random.randint(1, len(items))``) and every chosen item is paired with a
    quantity from ``random.randint(1, MAX_ITEM_QUANTITY)``.

    Parameters
    ----------
    items : list
        Pool of item names to choose from.

    Returns
    -------
    dict
        Mapping item -> quantity.
    """
    how_many = random.randint(1, len(items))
    chosen = random.sample(items, how_many)
    return {name: random.randint(1, MAX_ITEM_QUANTITY) for name in chosen}

# generates the standard routes JSON file
def create_standard_routes(n_routes, cities, items, max_quantity=100, s_filename="standard.json"):
    """
    Generate random standard routes and write them to a JSON file in DATA_DIR.

    Every route visits a random number of distinct cities (between 2 and
    ``len(cities)``) in random order. Each pair of consecutive cities becomes
    a trip ``{'from': ..., 'to': ..., 'merchandise': {...}}`` whose merchandise
    is drawn by ``create_merchandise_dict(items)``. Routes are stored as
    ``{'id': 's<k>', 'route': [trip, ...]}`` with k = 0, 1, ...

    Parameters
    ----------
    n_routes : int
        Number of routes to generate.
    cities : list
        List of cities; at least two are needed to form a trip.
    items : list
        List of items.
    max_quantity : int
        Kept for backward compatibility.
        NOTE(review): this value is not applied; quantities are produced by
        create_merchandise_dict, which is bounded by MAX_ITEM_QUANTITY.
    s_filename : str
        Name of the output file, joined onto DATA_DIR.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If routes are requested but fewer than two cities are available.
    """
    # Fail with a readable message instead of randint's "empty range" error.
    if n_routes > 0 and len(cities) < 2:
        raise ValueError("at least two cities are required to build a route")

    data = []
    for route in range(n_routes):
        # number of cities to visit in a single route, from 2 up to all of them
        n_cities = random.randint(2, len(cities))
        sampled_cities = random.sample(cities, n_cities)

        # consecutive cities form the trips of the route
        single_route = [
            {'from': origin, 'to': destination, 'merchandise': create_merchandise_dict(items)}
            for origin, destination in zip(sampled_cities, sampled_cities[1:])
        ]

        data.append({'id': 's' + str(route), 'route': single_route})

    # Write the data to the JSON file, creating the target folder on first use
    file_path = os.path.join(DATA_DIR, s_filename)
    target_dir = os.path.dirname(file_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    with open(file_path, 'w') as json_file:
        json.dump(data, json_file, indent=2)

    print(f"JSON standard data has been written to {file_path}")

In [68]:
# not used
def permute_route(sroute):
    """
    Return a copy of a route whose cities are visited in a random order.

    A route made of n trips visits n + 1 cities (every 'from' plus the last
    'to'). All of them are shuffled and re-chained into n trips, so the result
    has the same length as the input and no city is lost. The merchandise of
    the k-th input trip is attached to the k-th output trip (the dictionaries
    are reused, not copied).

    Parameters
    ----------
    sroute : list
        List of trips, each a dict with 'from', 'to' and 'merchandise'.

    Returns
    -------
    list
        The permuted route; an empty list when `sroute` is empty.
    """
    if not sroute:
        return []

    # Include the final destination: collecting only the 'from' cities would
    # drop it and shorten the route by one trip.
    cities = [trip['from'] for trip in sroute] + [sroute[-1]['to']]
    shuffled = random.sample(cities, len(cities))

    return [
        {'from': origin, 'to': destination, 'merchandise': trip['merchandise']}
        for origin, destination, trip in zip(shuffled, shuffled[1:], sroute)
    ]

# Alter a route by adding/removing cities, adding/removing items, and changing item quantities
def alterate_route(sroute, cities, items, divergence=0.1):
    """
    Build an altered version of a standard route.

    The alteration happens in three passes:

    1. Removal - one random draw is made per trip of `sroute`; a trip whose
       draw is below ``divergence / 2`` is dropped. When the dropped trip is
       neither the first nor the last, the previously kept trip (if any) is
       re-linked to the origin of the next trip.
    2. Insertion - for each position of the pruned route whose draw lies in
       ``[divergence / 2, divergence)``, a city not yet on the route is chosen
       from `cities` and a new trip (with fresh merchandise from
       create_merchandise_dict) is inserted in front of that position.
    3. Merchandise - for every item of every trip a new draw decides whether
       the item is removed, gets a new quantity, or triggers the addition of
       another item taken from `items`.

    The trips of `sroute` are shallow-copied and their merchandise is replaced
    by an edited copy, so the input route is not modified.

    Parameters
    ----------
    sroute : list
        Trips of the standard route; each trip is a dict read through the
        keys 'from', 'to' and 'merchandise'.
    cities : list
        Cities that may be inserted into the route.
    items : list
        Items that may be added to a trip's merchandise.
    divergence : float
        Threshold that the random draws are compared against.

    Returns
    -------
    tuple
        ``(route, citiesRemoved, citiesAdded)`` where `route` is the altered
        list of trips (it can be empty when every trip was dropped),
        `citiesRemoved` is the number of removal draws divided by
        ``len(sroute)`` and `citiesAdded` is the number of insertion draws
        divided by the length of the pruned route (0 when that is empty).

    Raises
    ------
    ZeroDivisionError
        If `sroute` is empty.
    """
    # The altered route is built from shallow copies of the kept trips
    route = []
    counter = -1    # index of the most recently kept trip inside `route`
    rands = [random.random() for i in range(len(sroute))]   # one draw per standard trip
    counterRemoved = 0
    counterAdded = 0
    
    # Remove cities
    for i in range(len(sroute)):
        if rands[i] < divergence / 2:
            counterRemoved += 1
            # The first and the last trip are simply dropped; nothing has to be re-linked
            if i == 0 or i == len(sroute) - 1:
                continue
            else:
                if len(route) > 0:
                    route[counter]["to"] = sroute[i+1]["from"] # bridge the gap: the previous kept trip now ends where the next trip starts
        else:
            # Trip is kept: copy it so the edits below do not touch sroute
            counter += 1
            route.append(sroute[i].copy())
    
    # Cities already on the pruned route; they must not be inserted again
    banned_cities = [route[p]['from'] for p in range(len(route))] + [route[-1]["to"]] if len(route) > 0 else []
    
    # Add cities. Insertions are collected first and applied afterwards so that
    # the indices used in this loop keep referring to the pruned route.
    routes_to_add = []
    len_route = len(route)
    for i in range(len(route)):
        # NOTE(review): rands was drawn per trip of sroute but is indexed here by
        # position in the pruned route, so after a removal the draw at index i
        # belongs to a different original trip - confirm this is intended.
        if divergence / 2 <= rands[i] < divergence:
            # counted even when no city is left to insert
            counterAdded += 1
            # Add a city that is not already in the route
            available_cities = [city for city in cities if city not in banned_cities]
            if len(available_cities) > 0:
                new_city = random.choice(available_cities)
                assert new_city not in banned_cities, "new_city is already in cities_added"
                banned_cities.append(new_city)
                
                # New trip: old origin -> new city; the existing trip then starts from the new city
                routes_to_add.append( (i, {'from':route[i]['from'], 'to':new_city, 'merchandise':create_merchandise_dict(items)}) )
                route[i]['from'] = new_city
                
    # Example with insertions at positions 0 and 1 of [rome>milan, milan>naples, ...]:
    #   after the loop above: [GENOA>milan, TRENTO>naples, ...]  pending { rome>genoa, milan>trento }
    #   after the loop below: [rome>genoa, GENOA>milan, milan>trento, TRENTO>naples, ...]
    
    
    for j, (index, r) in enumerate(routes_to_add):
        # each earlier insertion shifted the remaining positions by one, hence index + j
        route.insert(index+j, r)
    
    rands = np.array(rands)
    citiesRemoved = counterRemoved / len(sroute)
    citiesAdded = counterAdded / len_route if len_route > 0 else 0

    # Add/change_quantity/remove items
    for i in range(len(route)):
        # edits go to a copy so the dict being iterated stays unchanged
        merchandise_copy = route[i]['merchandise'].copy()
        for item in route[i]['merchandise'].keys():
            choice = random.random()
            if choice < divergence / 3:
                # Remove an item, unless that would leave the trip without merchandise
                if len(merchandise_copy) > 1:
                    merchandise_copy.pop(item)
            elif choice < 2*divergence / 3:
                # Change the quantity of an item
                merchandise_copy[item] = random.randint(1, MAX_ITEM_QUANTITY)
            elif choice < divergence:
                # Add an item
                available_items = [item for item in items if item not in route[i]['merchandise'].copy().keys()] # candidates are items absent from the trip's original merchandise, so an item popped above is not re-added
                if len(available_items) > 0:
                    merchandise_copy[random.choice(available_items)] = random.randint(1, MAX_ITEM_QUANTITY)
        route[i]['merchandise'] = merchandise_copy.copy()

    return route, citiesRemoved, citiesAdded

# generates the actual routes JSON file
def create_actual_routes(drivers, cities, items, s_filename='standard.json', a_filename="actual.json", forward_expansion=10, divergence=0.1):
    '''
    Generates the actual routes JSON file from the standard routes file.

    Each standard route read from DATA_DIR/s_filename is altered
    `forward_expansion` times with alterate_route; every altered copy is
    assigned to a randomly chosen driver and stored as
    {'id': 'a<k>', 'driver': ..., 'sroute': <standard id>, 'route': [...]}.
    The result is written to DATA_DIR/a_filename and summary statistics are
    printed.

    Parameters
    ----------
    drivers : list
        List of drivers.
    cities : list
        List of cities, forwarded to alterate_route.
    items : list
        List of items, forwarded to alterate_route.
    s_filename : str
        Standard routes JSON file name.
    a_filename : str
        Actual routes JSON output file name.
    forward_expansion : int
        Number of routes to generate from a single standard route.
    divergence : float
        The probability that a driver will deviate from the standard route.

    Returns
    -------
    None
    '''
    routes_per_driver = np.zeros(len(drivers))
    removed_fractions = []
    added_fractions = []
    actual_routes = []

    # Read every standard route up front
    with open(os.path.join(DATA_DIR, s_filename), 'r') as source:
        standard_routes = json.load(source)

    for standard in standard_routes:
        standard_id = standard['id']

        for _ in range(forward_expansion):
            driver = random.choice(drivers)
            routes_per_driver[drivers.index(driver)] += 1

            altered, removed, added = alterate_route(standard['route'], cities, items, divergence)
            removed_fractions.append(removed)
            added_fractions.append(added)

            # ids are consecutive: the next id is the number of routes produced so far
            actual_routes.append({
                'id': 'a' + str(len(actual_routes)),
                'driver': driver,
                'sroute': standard_id,
                'route': altered,
            })

    # Write the data to the JSON file
    with open(os.path.join(DATA_DIR, a_filename), 'w') as target:
        json.dump(actual_routes, target, indent=2)

    # stats
    n_actual = len(actual_routes)
    print("number of actual routes: ", n_actual)
    print("number of drivers: ", len(drivers))

    # share of the generated routes handled by each driver
    print("\npercentage of routes per driver:")
    print(" expected: ", 1 / len(drivers))
    driver_stats = (
        ("   actual: ", np.mean),
        ("  std dev: ", np.std),
        ("      min: ", np.min),
        ("      max: ", np.max),
    )
    for label, reducer in driver_stats:
        print(label, reducer(routes_per_driver) / n_actual)

    print("\nnumber of cities removed per route:")
    print(" expected: ", divergence / 2)
    print("   actual: ", np.mean(np.array(removed_fractions)))

    print("\nnumber of cities added per route:")
    print(" expected: ", divergence / 2)
    print("   actual: ", np.mean(np.array(added_fractions)))

    print(f"\nJSON actual data has been written to {a_filename}")

In [69]:
# Generate the standard routes and store them under DATA_DIR / STANDARD_FILENAME
create_standard_routes(
    NUM_STANDARD_ROUTES,
    list_of_cities,
    list_of_items,
    s_filename=STANDARD_FILENAME,
)

JSON standard data has been written to /Users/ericsuardi/Desktop/DataMiningProject23-24/data/standard_medium.json


In [70]:
# Derive the actual routes from the standard ones and print the summary statistics
create_actual_routes(
    drivers,
    list_of_cities,
    list_of_items,
    s_filename=STANDARD_FILENAME,
    a_filename=ACTUAL_FILENAME,
    forward_expansion=FORWARD_EXPANSION,
    divergence=DIVERGENCE,
)

number of actual routes:  2500
number of drivers:  10

percentage of routes per driver:
 expected:  0.1
   actual:  0.1
  std dev:  0.0031799371062962867
      min:  0.0948
      max:  0.1064

number of cities removed per route:
 expected:  0.05
   actual:  0.05030245969223134

number of cities added per route:
 expected:  0.05
   actual:  0.05055284556159661

JSON actual data has been written to actual_medium.json
