# Data Mining


In [1]:
## get home directory
import os

# Resolve the project root. `__file__` only exists when this code runs as a
# script/module; a plain Jupyter kernel does not define it, so fall back to the
# kernel's working directory instead of failing with NameError.
try:
    HOME = os.path.dirname(os.path.realpath(__file__))
except NameError:
    HOME = os.getcwd()
print('HOME: ',HOME)

HOME:  /Users/ericsuardi/Desktop/DataMiningProject23-24


## Create the dataset

In [2]:
import json
import random
import pandas as pd
import sys
import lxml
import string
import numpy as np
from tqdm import tqdm

### Constants of the project

In [3]:
# --- Size of the generated dataset ------------------------------------------
MAX_ITEM_QUANTITY = 100      # upper bound (inclusive) for the quantity of a single item in a trip
SUBSAMPLE_CITY = 30         # cities kept after subsampling; must not exceed the scraped list (author's note: < 140)
SUBSAMPLE_ITEM = 30         # items kept after subsampling; must not exceed the 230 catalogue entries
NUM_DRIVER = 50           # number of driver names to generate (author's note: < 100)

MIN_CITIES = 3              # minimum number of cities in a route
SHUFFLE_DATA = False        # if True, routes are shuffled before being written to disk


# --- Route generation -------------------------------------------------------
NUM_STANDARD_ROUTES = 10   # number of standard routes to generate
FORWARD_EXPANSION = 5000      # number of actual (altered) routes generated from each standard route
# Probability that a driver diverges from the standard route (one alteration).
# The special value -1 draws a separate random divergence for every
# (driver, standard route) pair instead of using one global probability.
DIVERGENCE = -1

# --- Reproducibility --------------------------------------------------------
# Set SEED to an int to regenerate exactly the same dataset on every run.
# None keeps the previous behaviour (seeded from system entropy).
SEED = None
random.seed(SEED)

# --- Output location --------------------------------------------------------
DATA_DIR = os.path.join(HOME,'data')
STANDARD_FILENAME = "standard_heavy.json"
ACTUAL_FILENAME = "actual_heavy.json"

### Web scraping to collect cities and items

#### Boilerplate functions

In [4]:
def generate_sequence_drivers_names(total):
    """
    Lazily produce `total` driver labels from the uppercase ASCII alphabet.

    When `total` fits in one pass over the alphabet the labels are bare
    letters ('A', 'B', ...). Otherwise every label is '<letter>_<cycle>',
    where <cycle> counts completed passes over the alphabet and is zero-padded
    to the number of digits of `total // 26`.

    Parameters
    ----------
    total : int
        Number of labels to generate.

    Yields
    ------
    str
        The next driver label.
    """
    letters = string.ascii_uppercase
    n_letters = len(letters)
    pad_width = len(str(total // n_letters))
    needs_suffix = total > n_letters

    for idx in range(total):
        cycle, position = divmod(idx, n_letters)
        if needs_suffix:
            yield f"{letters[position]}_{cycle:0{pad_width}d}"
        else:
            yield letters[position]

In [5]:
# Scrape the Wikipedia page and keep the "City" column of the first table found
wiki_tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_cities_in_Italy')
list_of_cities = wiki_tables[0]["City"].to_list()

# Hand-written catalogue of 230 product names; it is randomly subsampled to
# SUBSAMPLE_ITEM entries further down in this cell.
list_of_items = [
    'Air Freshener', 'Air Mattress', 'Air Purifier', 'Aluminum Foil', 'Apple', 'Baby Formula', 'Baby Wipes', 'Backpack', 'Bacon', 'Banana', 'Band-Aids', 'Bar Stools', 'Barware Set', 'Basketball', 'Batteries', 'Beach Chair', 'Beach Towel', 'Bed Sheets', 'Bedside Table', 'Beef', 'Bell Pepper', 'Bike', 'Biscuits', 'Blankets', 'Bleach', 'Blender', 'Blueberries', 'Bluetooth Speaker', 'Board Games', 'Bookshelf', 'Bread', 'Broccoli', 'Broom', 'Camera', 'Camp Stove', 'Can Opener', 'Candles', 'Canned Beans', 'Canned Soup', 'Canned Vegetables', 'Carrot', 'Cat Litter', 'Cereal', 'Chairs', 'Charcoal', 'Charging Cable', 'Cheese', 'Chewing Gum', 'Chicken', 'Chips', 'Chocolate', 'Coffee', 'Coffee Maker', 'Coffee Table', 'Cookies', 'Cookware Set', 'Cooler', 'Cooling Fan', 'Corkscrew', 'Cotton Swabs', 'Couch', 'Crackers', 'Cucumber', 'Curtains', 'Cutlery Set', 'Cutting Board', 'Desk', 'Diapers', 'Dining Table', 'Dish Soap', 'Dish Towels', 'Dishwasher Detergent', 'Diving Mask', 'Dresser', 'Dumbbells', 'Dustpan', 'Eggs', 'Envelopes', 'Exercise Ball', 'Fabric Softener', 'Fertilizer', 'Firewood', 'First Aid Kit', 'Fishing Rod', 'Fitness Tracker', 'Flashlight', 'Football', 'Frisbee', 'Garbage Bags', 'Garden Hose', 'Gloves', 'Glue', 'Goggles', 'Granola Bars', 'Grapes', 'Grill', 'Hand Sanitizer', 'Handbag', 'Hangers', 'Hat', 'Headphones', 'Heater', 'Helmet', 'Hiking Boots', 'Humidifier', 'Ice Cream', 'Ice Skates', 'Incense', 'Insect Repellent', 'Iron', 'Ironing Board', 'Jelly', 'Juice', 'Jump Rope', 'Ketchup', 'Knee Pads', 'Lamps', 'Laptop', 'Laundry Detergent', 'Lawn Mower', 'Lettuce', 'Light Bulbs', 'Lighter Fluid', 'Lock', 'Luggage', 'Markers', 'Matches', 'Mayonnaise', 'Measuring Cups', 'Microwave', 'Milk', 'Mirrors', 'Mixing Bowls', 'Mop', 'Mustard', 'Napkins', 'Notepads', 'Office Chair', 'Onion', 'Orange', 'Outdoor Furniture', 'Pain Relievers', 'Paper Towels', 'Pasta', 'Peanut Butter', 'Peanuts', 'Peeler', 'Pens', 'Pet Food', 'Phone Charger', 'Pickles', 'Picture Frames', 'Pillows', 
'Pizza', 'Plastic Wrap', 'Playing Cards', 'Portable Charger', 'Potato', 'Potted Plants', 'Printer Paper', 'Pruning Shears', 'Puzzle', 'Rain Jacket', 'Raincoat', 'Resistance Bands', 'Rice', 'Rollerblades', 'Rugs', 'Salad Dressing', 'Salmon', 'Sandals', 'Sausages', 'Scarf', 'Scissors', 'Shampoo', 'Shrimp', 'Skateboard', 'Ski Equipment', 'Sleeping Bag', 'Smartwatch', 'Snorkel', 'Snowboard', 'Soap', 'Soccer Ball', 'Soda', 'Spatula', 'Spinach', 'Stationary Bike', 'Storage Bins', 'Strawberries', 'Sun Hat', 'Sunglasses', 'Sunscreen', 'Surfboard', 'Swimsuit', 'TV Stand', 'Tackle Box', 'Tape', 'Tea', 'Tent', 'Thermometer', 'Throw Pillows', 'Tire Pump', 'Tissues', 'Toaster', 'Toilet Paper', 'Tomato', 'Tool Kit', 'Toothpaste', 'Tote Bag', 'Towels', 'Trail Mix', 'Trash Bags', 'Trash Cans', 'Travel Adapter', 'Travel Pillow', 'Treadmill', 'Tripod', 'Tweezers', 'Umbrella', 'Vacuum Cleaner', 'Wallet', 'Water Bottle', 'Watering Can', 'Watermelon', 'Wine Glasses', 'Winter Coat', 'Yoga Mat', 'Yogurt', 'Ziploc Bags'
]

# generate the driver names (bare letters, or letter + numeric suffix when
# NUM_DRIVER exceeds the alphabet)
drivers = list(generate_sequence_drivers_names(NUM_DRIVER))

## Subsampling
# Validate the configuration first: the sample size must be positive and
# cannot exceed the size of the list it is drawn from.
assert 0 < SUBSAMPLE_CITY <= len(list_of_cities), "SUBSAMPLE_CITY must be between 1 and {} (inclusive)".format(len(list_of_cities))
assert 0 < SUBSAMPLE_ITEM <= len(list_of_items), "SUBSAMPLE_ITEM must be between 1 and {} (inclusive)".format(len(list_of_items))

# Draw the working subsets without replacement
list_of_cities = random.sample(list_of_cities, SUBSAMPLE_CITY)
list_of_items = random.sample(list_of_items, SUBSAMPLE_ITEM)

print('number of cities: ',len(list_of_cities))
print("number of products: ", len(list_of_items))
print('number of drivers: ', len(drivers))

number of cities:  30
number of products:  30
number of drivers:  50


In [6]:
def create_merchandise_dict(items):
    """
    Build a random merchandise mapping for one trip.

    A random number of entries (between 1 and len(items)) is drawn from
    `items` without replacement, and each one gets a random quantity between
    1 and MAX_ITEM_QUANTITY (both inclusive).

    Parameters
    ----------
    items : list
        Pool of item names to draw from.

    Returns
    -------
    dict
        item name -> quantity, with the entries inserted in sorted order.
    """
    how_many = random.randint(1, len(items))
    chosen = random.sample(items, how_many)
    amounts = [random.randint(1, MAX_ITEM_QUANTITY) for _ in chosen]

    merchandise = {}
    for name, amount in sorted(zip(chosen, amounts)):
        merchandise[name] = amount
    return merchandise

# generates the standard routes JSON file
def create_standard_routes(n_routes, cities, items, max_quantity=100, s_filename="standard.json"):
    """
    Generates a standard routes JSON file.

    Every route visits a random number of distinct cities (between MIN_CITIES
    and len(cities)) and is stored as the list of consecutive trips between
    them. Each trip carries a random merchandise dictionary.

    Parameters
    ----------
    n_routes : int
        Number of routes to generate.
    cities : list
        List of cities.
    items : list
        List of items.
    max_quantity : int
        Currently unused: quantities are bounded by the module-level
        MAX_ITEM_QUANTITY used by create_merchandise_dict. Kept so existing
        callers keep working.
    s_filename : str
        Name of the output file, created inside DATA_DIR.

    Returns
    -------
    None
        The routes are written to DATA_DIR/s_filename as a JSON list of
        {'id': 's<k>', 'route': [{'from', 'to', 'merchandise'}, ...]}.
    """

    data = []
    for route in range(n_routes):

        # number of cities to visit in this route: from MIN_CITIES up to all available cities
        n_cities = random.randint(MIN_CITIES, len(cities))
        sampled_cities = random.sample(cities, n_cities)

        # one trip per pair of consecutive cities, each with its own random merchandise
        single_route = [
            {'from': origin, 'to': destination, 'merchandise': create_merchandise_dict(items)}
            for origin, destination in zip(sampled_cities, sampled_cities[1:])
        ]

        data.append({'id':'s'+str(route), 'route':single_route})

    # shuffle the routes, in place
    if SHUFFLE_DATA:
        random.shuffle(data)

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
    os.makedirs(DATA_DIR, exist_ok=True)

    # Write the data to the JSON file (UTF-8, keeping non-ASCII city names readable)
    file_path = os.path.join(DATA_DIR, s_filename)
    with open(file_path, 'w', encoding="utf-8") as json_file:
        json.dump(data, json_file, indent=2, ensure_ascii=False)

    print(f"JSON standard data has been written to {file_path}")

In [7]:
# function to alterate a route by adding/removing cities, and adding/removing items, and changing the quantities of items
def alterate_route(sroute, cities, items, divergence=0.1):
    """
    Builds an altered copy of a route by removing cities, adding cities, and altering the items.

    The input route is never modified: every trip of the result is a new dict
    with its own merchandise dict.

    Args:
        sroute (list): The original route, a list of trips
            {'from': ..., 'to': ..., 'merchandise': {item: quantity}}.
        cities (list): The list of available cities.
        items (list): The list of available items.
        divergence (float, optional): The divergence factor that determines the probability of modifications. Defaults to 0.1.
            Per trip, a draw below divergence/2 removes a city and a draw in
            [divergence/2, divergence) inserts one. Per item, the three
            thirds of [0, divergence) select remove / change quantity / add.

    Returns:
        tuple: (route, cities_removed, cities_added, items_removed, items_added, items_changed)
            route is the altered route. cities_removed is the number of
            removals divided by len(sroute); cities_added is the number of
            cities actually inserted divided by the route length after the
            removals; the three item values are counts divided by the number
            of items inspected (0 when no item was inspected).

    Raises:
        AssertionError: if no trip is left after the removal step.
    """
    n_trips = len(sroute)
    # One uniform draw per standard trip, shared by the removal and the
    # insertion step so that a single draw cannot trigger both.
    rands = [random.random() for _ in range(n_trips)]
    counterRemoved = 0
    counterAdded = 0

    # --- Remove cities -------------------------------------------------------
    route = []
    if n_trips > MIN_CITIES - 1:
        # never go below MIN_CITIES cities (= MIN_CITIES - 1 trips)
        max_removals = n_trips - MIN_CITIES + 1
        for i, trip in enumerate(sroute):
            if rands[i] < divergence / 2 and counterRemoved < max_removals:
                counterRemoved += 1
                # Dropping the first/last trip just shortens the route. For a
                # middle trip the previously kept trip is re-routed to the
                # next city so the chain stays connected.
                if 0 < i < n_trips - 1 and route:
                    route[-1]["to"] = sroute[i + 1]["from"]
            else:
                route.append(trip.copy())
    else:
        # Too short to remove anything. Copy each trip dict (not just the
        # list) so the edits below cannot leak into the standard route or
        # into other actual routes generated from it.
        route = [trip.copy() for trip in sroute]

    assert len(route) > 0, "route is empty"

    # --- Add cities ----------------------------------------------------------
    # cities already on the route cannot be inserted again
    banned_cities = [trip['from'] for trip in route] + [route[-1]["to"]] if len(route) > 0 else []

    routes_to_add = []
    len_route = len(route)
    for i in range(len_route):
        if divergence / 2 <= rands[i] < divergence:
            available_cities = [city for city in cities if city not in banned_cities]

            if len(available_cities) > 0:
                # count only insertions that really happen
                counterAdded += 1
                new_city = random.choice(available_cities)
                banned_cities.append(new_city)

                # detour: <old from> -> new_city is inserted before trip i,
                # and trip i now departs from new_city
                routes_to_add.append( (i, {'from':route[i]['from'], 'to':new_city, 'merchandise':create_merchandise_dict(items)}) )
                route[i]['from'] = new_city

    # every earlier insertion shifts the later target positions by one
    for j, (index, r) in enumerate(routes_to_add):
        route.insert(index+j, r)

    citiesRemoved = counterRemoved / len(sroute)
    citiesAdded = counterAdded / len_route if len_route > 0 else 0

    # --- Add/change_quantity/remove items --------------------------------------
    counterItems = 0
    counterItemsRemoved = 0
    counterItemsAdded = 0
    counterItemsChanged = 0
    for trip in route:
        original_merchandise = trip['merchandise']
        merchandise_copy = original_merchandise.copy()
        for item in original_merchandise:
            counterItems += 1
            choice = random.random()
            if choice < divergence / 3:
                # Remove an item (but never leave a trip without merchandise)
                if len(merchandise_copy) > 1:
                    counterItemsRemoved += 1
                    merchandise_copy.pop(item)
            elif choice < 2*divergence / 3:
                # Change the quantity of an item
                counterItemsChanged += 1
                merchandise_copy[item] = random.randint(1, MAX_ITEM_QUANTITY)
            elif choice < divergence:
                # Add an item that is genuinely new: not in the original
                # merchandise (so a removed item is not re-added) and not
                # already added earlier in this pass.
                available_items = [
                    candidate for candidate in items
                    if candidate not in original_merchandise and candidate not in merchandise_copy
                ]
                if len(available_items) > 0:
                    counterItemsAdded += 1
                    merchandise_copy[random.choice(available_items)] = random.randint(1, MAX_ITEM_QUANTITY)
        trip['merchandise'] = merchandise_copy

    # Avoid division by zero
    if counterItems == 0:
        return route, citiesRemoved, citiesAdded, 0, 0, 0
    else:
        return route, citiesRemoved, citiesAdded, counterItemsRemoved / counterItems, counterItemsAdded / counterItems, counterItemsChanged / counterItems

# generates the actual routes JSON file
def create_actual_routes(drivers, cities, items, s_filename='standard.json', a_filename="actual.json", forward_expansion=10, divergence=0.1):
    '''
    Generates the actual routes JSON file.

    Every standard route read from DATA_DIR/s_filename is altered
    `forward_expansion` times with alterate_route, each time for a randomly
    chosen driver, and the results are written to DATA_DIR/a_filename.
    Summary statistics are printed at the end.

    Parameters
    ----------
    drivers : list
        List of drivers.
    cities : list
        List of cities.
    items : list
        List of items.
    s_filename : str
        Standard routes JSON file name (read from DATA_DIR).
    a_filename : str
        Actual routes JSON output file name (written to DATA_DIR).
    forward_expansion : int
        Number of routes to generate from a single standard route.
    divergence : float
        The probability that a driver will deviate from the standard route.
        The special value -1 draws an independent divergence in [0, 0.5] for
        every (driver, standard route) pair.

    Returns
    -------
    list of list of float
        divergence_drivers[d][s] is the divergence used for driver d on the
        s-th standard route (standard routes ordered by the numeric part of
        their id).
    '''
    actual_routes = []
    counter = 0
    counterDrivers = np.zeros(len(drivers))
    citiesRemovedList = []
    citiesAddedList = []
    itemsRemovedList = []
    itemsAddedList = []
    itemsChangedList = []

    # The standard file is written as UTF-8 with ensure_ascii=False, so it
    # must be read back as UTF-8 regardless of the platform's default encoding.
    file_path = os.path.join(DATA_DIR, s_filename)
    with open(file_path, 'r', encoding="utf-8") as json_file:
        # Load the JSON data
        standard_routes = json.load(json_file)


    # Extract the id of each route and store them in a list
    id_sroute_list = [route['id'] for route in standard_routes]
    # Sort the list by the integer part of the string
    id_sroute_list.sort(key=lambda s: int(s[1:]))
    print("id_sroute_list: ", id_sroute_list)

    # one divergence value per (driver, standard route) pair
    if divergence == -1:
        divergence_drivers = [[random.uniform(0,0.5) for i in range(len(id_sroute_list))] for j in range(len(drivers))]
    else:
        divergence_drivers = [[divergence for i in range(len(id_sroute_list))] for j in range(len(drivers))]
    print("divergence: ", divergence_drivers)


    for route in tqdm(standard_routes):

        id_sroute = route['id']
        # column of this standard route in divergence_drivers (loop-invariant)
        sroute_idx = id_sroute_list.index(id_sroute)

        for _ in range(forward_expansion):
            driver = random.choice(drivers)
            driver_idx = drivers.index(driver)
            counterDrivers[driver_idx] += 1
            altered_route, citiesRemoved, citiesAdded, itemsRemoved, itemsAdded, itemsChanged = alterate_route(route['route'], cities, items, divergence_drivers[driver_idx][sroute_idx])
            citiesRemovedList.append(citiesRemoved)
            citiesAddedList.append(citiesAdded)
            itemsRemovedList.append(itemsRemoved)
            itemsAddedList.append(itemsAdded)
            itemsChangedList.append(itemsChanged)

            a_route = {'id':'a'+str(counter), 'driver':driver, 'sroute':id_sroute, 'route':altered_route}
            actual_routes.append(a_route)
            counter += 1

    # shuffle the routes to avoid bias, in place
    if SHUFFLE_DATA:
        random.shuffle(actual_routes)

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
    os.makedirs(DATA_DIR, exist_ok=True)
    # Write the data to the JSON file
    print("Writing actual routes to file...")
    file_path = os.path.join(DATA_DIR, a_filename)
    with open(file_path, 'w', encoding="utf-8") as json_file:
        json.dump(actual_routes, json_file, indent=2, ensure_ascii=False)

    # stats
    print("number of actual routes: ", len(actual_routes))
    print("number of drivers: ", len(drivers))
    # per driver
    print("\npercentage of routes per driver:")
    print(" expected: ", 1 / len(drivers))
    print("   actual: ", np.mean(counterDrivers) / len(actual_routes))
    print("  std dev: ", np.std(counterDrivers) / len(actual_routes))
    print("      min: ", np.min(counterDrivers) / len(actual_routes))
    print("      max: ", np.max(counterDrivers) / len(actual_routes))

    # The expected values below only make sense for a single global divergence
    if divergence != -1:
        citiesRemovedList = np.array(citiesRemovedList)
        citiesAddedList = np.array(citiesAddedList)
        print("\nnumber of cities removed per route:")
        print(" expected: ", divergence / 2)
        print("   actual: ", np.mean(citiesRemovedList))

        print("\nnumber of cities added per route:")
        print(" expected: ", divergence / 2)
        print("   actual: ", np.mean(citiesAddedList))

        print("\nnumber of items removed per route:")
        print(" expected: ", divergence / 3)
        print("   actual: ", np.mean(itemsRemovedList))

        print("\nnumber of items added per route:")
        print(" expected: ", divergence / 3)
        print("   actual: ", np.mean(itemsAddedList))

        print("\nnumber of items changed per route:")
        print(" expected: ", divergence / 3)
        print("   actual: ", np.mean(itemsChangedList))


    print(f"\nJSON actual data has been written to {a_filename}")

    return divergence_drivers

In [8]:
# Generate the standard routes and write them to DATA_DIR/STANDARD_FILENAME
create_standard_routes(
    NUM_STANDARD_ROUTES,
    list_of_cities,
    list_of_items,
    s_filename=STANDARD_FILENAME,
)

JSON standard data has been written to /Users/ericsuardi/Desktop/DataMiningProject23-24/data/standard_heavy.json


In [9]:
# Expand every standard route into FORWARD_EXPANSION actual routes; the
# returned value is the per-driver, per-standard-route divergence table.
divergence = create_actual_routes(
    drivers,
    list_of_cities,
    list_of_items,
    s_filename=STANDARD_FILENAME,
    a_filename=ACTUAL_FILENAME,
    forward_expansion=FORWARD_EXPANSION,
    divergence=DIVERGENCE,
)

id_sroute_list:  ['s0', 's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9']
divergence:  [[0.393358611020405, 0.30076969441119195, 0.21201263538041326, 0.3383374417860294, 0.2370856334743358, 0.3381693881751204, 0.2418515200387787, 0.19778934395396114, 0.10850267335389746, 0.240387447430148], [0.3610024378315964, 0.3260727596552027, 0.26621582782493086, 0.04818333870128677, 0.47131228567069866, 0.41925728906039866, 0.06890576246601765, 0.09608058152457533, 0.0022954819041102192, 0.35315407732463283], [0.14265675579646986, 0.3500772118347257, 0.2164181761583644, 0.12235607333411164, 0.1272626000078474, 0.48388442863365905, 0.39130814983443707, 0.1685344206989957, 0.18747097332457718, 0.04045572436258221], [0.10756087836331962, 0.01965338826478641, 0.27809414372293323, 0.2130540657797228, 0.032998034772225016, 0.24922492375874117, 0.13645141925483395, 0.3850166770172182, 0.3496569713927953, 0.11618533355394145], [0.4239715940504185, 0.4067029344132307, 0.45178590925172446, 0.0821604712

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:06<00:00,  1.45it/s]


Writing actual routes to file...
number of actual routes:  50000
number of drivers:  50

percentage of routes per driver:
 expected:  0.02
   actual:  0.02
  std dev:  0.0005227925018590072
      min:  0.01904
      max:  0.02104

JSON actual data has been written to actual_heavy.json
