In [1]:
# Imports 
import warnings # need to be imported before other imports
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)

import os 
import csv
import sys
import math
import time
import copy
import glob 
import pickle
import shutil
import random
import ortools
import logging
import datetime
import matplotlib 
import numpy as np
import pandas as pd
import configparser   
from shapely import geometry
from functools import reduce
import matplotlib.pyplot as plt
# from geopy.geocoders import Nominatim  

# Notebook-wide matplotlib styling: large tick labels / fonts, fixed figure
# size and a slightly heavier black frame around every axes.
plt.rcParams.update({
    'xtick.labelsize': 26,
    'ytick.labelsize': 26,
    'font.size': '26',
    'figure.figsize': (10,7.5),
    'axes.edgecolor': "black",
    'axes.linewidth': 1.50,
})

High level view -- The workflow has $4$ stages:  
1. **Assignment** : Get the probability distribution corresponding to each driver.  

2. **File Generation** : Swap the drivers' files (a way of creating drivers with existing real-world information) and copy/create de_data (containing shift information) files.   

3. **Simulation** : Use last-mile delivery algorithm (FoodMatch) to simulate real-world deliveries. Store the simulation files.  

4. **Evaluation** : Evaluate the performance on the evaluation metrics (using income information from the simulation files of all $6$ days)

Each stage has been explicitly marked in this notebook. You can opt to separate each step by creating a script for each of these using this notebook.

In [2]:
# Parameters
# Run-level knobs; the cells below read these names as notebook globals.
city = 'A'            # takes values in {'A', 'B', 'C'} (the keys of the per-city data dicts)
flag = 0              # takes values in {0, 1, 2}
day_num = 1           # takes values in {1, 2, 3, 4, 5, 6}
w1, w2 = 0.6, 0.4     # w_i can take values in [0, 1] such that sum(w_i for i in {1, 2}) = 1
NUM_DRIVERS = 977     # Number of drivers to be chosen out of the intersection of drivers; -1 => take all drivers in driver_idf
NUM_SIM = 30          # Number of similar drivers for each driver to be considered in the fairness constraint
K_VAL = 10            # Number of nearest zones to which a driver can be assigned
wts = -1              # -1 => result file names carry no weight suffix; otherwise a two-digit code (64, 73 or 82) appended to them
algo = 'FairAssign'   # assignment algorithm; takes values in {'FairAssign', 'RoundRobin', 'LIPA'}

In [3]:
# IGNORE: 
# -------
""" 
city = str(sys.argv[1])             # takes values in {'A', 'B', 'C'}
day_num = str(sys.argv[2])          # takes values in {1, 2, 3, 4, 5, 6}
# w1, w2 = 0.6, 0.4                 # w_i can take values in [0, 1] such that sum(w_i for i in {1, 2}) = 1
# w1, w2 = 0.7, 0.3
# w1, w2 = 0.8, 0.2
NUM_DRIVERS = int(sys.argv[3])     # Number of drivers to be chosen out of the intersection of drivers; -1 => take all drivers in driver_idf
NUM_SIM = int(sys.argv[4])         # Number of similar drivers for each driver to be considered in the fairness constraint
K_VAL = int(sys.argv[5])
flag = int(sys.argv[6])            # takes values in {0, 1, 2} 

wts = int(sys.argv[7])              # 64 (or) 73 (or) 82
w1, w2 = (wts//10)/10, (wts%10)/10    # w1,w2=0.6,0.4 (or) w1,w2=0.7,0.3 (or) w1,w2=0.8,0.2

algo = 'FairAssign'
"""

" \ncity = str(sys.argv[1])             # takes values in {'A', 'B', 'C'}\nday_num = str(sys.argv[2])          # takes values in {1, 2, 3, 4, 5, 6}\n# w1, w2 = 0.6, 0.4                 # w_i can take values in [0, 1] such that sum(w_i for i in {1, 2}) = 1\n# w1, w2 = 0.7, 0.3\n# w1, w2 = 0.8, 0.2\nNUM_DRIVERS = int(sys.argv[3])     # Number of drivers to be chosen out of the intersection of drivers; -1 => take all drivers in driver_idf\nNUM_SIM = int(sys.argv[4])         # Number of similar drivers for each driver to be considered in the fairness constraint\nK_VAL = int(sys.argv[5])\nflag = int(sys.argv[6])            # takes values in {0, 1, 2} \n\nwts = int(sys.argv[7])              # 64 (or) 73 (or) 82\nw1, w2 = (wts//10)/10, (wts%10)/10    # w1,w2=0.6,0.4 (or) w1,w2=0.7,0.3 (or) w1,w2=0.8,0.2\n\nalgo = 'FairAssign'\n"

Exemplar input command:   

**ipython food_dlvry.ipynb A $1$ $1000$ $30$ $10$ $0$**   
**ipython food_dlvry.ipynb A $1$ $1000$ $30$ $7$ $1$**  
**ipython food_dlvry.ipynb A $1$ $1000$ $30$ $7$ $2$ $64$**    

Note that **File Generation** for all $6$ days needs to be done before simulation.  


In [4]:
# Input Summary: echo the run parameters so every log starts with its configuration.
print("======================= Input Summary =======================")
for label, value in (
    ("City:", city),
    ("Number of drivers:", NUM_DRIVERS),
    ("Number of similar drivers considered per driver:", NUM_SIM),
    ("Number of nearest zones for assignment:", K_VAL),
    ("Day (for file generation):", day_num),
):
    print(label, value)
print(f"w1={w1}, w2={w2}")
print("Algorithm (for evaluation):", algo)
print("==============================================================")

City: A
Number of drivers: 977
Number of similar drivers considered per driver: 30
Number of nearest zones for assignment: 10
Day (for file generation): 1
w1=0.6, w2=0.4
Algorithm (for evaluation): FairAssign


**flag**  
"flag" decides which distance metric/measure to consider:    
0: euclidean distance (or physical distance)   
1: rating   
2: combination of euclidean distance and rating   
   where 'w1' is weight given to euclidean distance and 'w2' is weight given to rating 


**algo**  
"algo" decides the assignment algorithm;    
It can take values in {'FairAssign', 'RoundRobin', 'LIPA'}

In [5]:
# assigns_path = "./assign_results"
# if os.path.exists(assigns_path):
#     shutil.rmtree(assigns_path)

# logs_path = "./logs"
# if os.path.exists(logs_path):
#     shutil.rmtree(logs_path)

In [6]:
# LOADING DATASETS
## All of the data here is UNANONYMIZED
import glob 

# Per-day driver location files, sorted so that list position follows the file name order.
driver_files_A = sorted(glob.glob("data/driver_locs/A/driver_data_A_day*.csv"))
driver_files_B = sorted(glob.glob("data/driver_locs/B/driver_data_B_day*.csv"))
driver_files_C = sorted(glob.glob("data/driver_locs/C/driver_data_C_day*.csv"))

# Every city must provide the same number of day files as city A.
num_days = len(driver_files_A)
assert len(driver_files_B)==num_days, "error in reading data or incomplete data"
assert len(driver_files_C)==num_days, "error in reading data or incomplete data"

# One DataFrame per day, per city.
driver_dfs_A = [pd.read_csv(day_file) for day_file in driver_files_A]
driver_dfs_B = [pd.read_csv(day_file) for day_file in driver_files_B]
driver_dfs_C = [pd.read_csv(day_file) for day_file in driver_files_C]
driver_dfs_dict = dict(A=driver_dfs_A, B=driver_dfs_B, C=driver_dfs_C)

# Zone table per city.
zone_df_A = pd.read_csv("data/zone_data/zone_data_A.csv")
zone_df_B = pd.read_csv("data/zone_data/zone_data_B.csv")
zone_df_C = pd.read_csv("data/zone_data/zone_data_C.csv")
zone_dfs_dict = dict(A=zone_df_A, B=zone_df_B, C=zone_df_C)

# Income table per city.
income_df_A = pd.read_csv("data/income_data/incomes_A.csv")
income_df_B = pd.read_csv("data/income_data/incomes_B.csv")
income_df_C = pd.read_csv("data/income_data/incomes_C.csv")
income_dfs_dict = dict(A=income_df_A, B=income_df_B, C=income_df_C)

# Original base-zone table per city.
base_zone_A = pd.read_csv("data/base_zones/A_base_zones.csv")
base_zone_B = pd.read_csv("data/base_zones/B_base_zones.csv")
base_zone_C = pd.read_csv("data/base_zones/C_base_zones.csv")
base_zones_dict = dict(A=base_zone_A, B=base_zone_B, C=base_zone_C)

**Nothing changes even on anonymizing 'base_zones', 'driver_data' and 'orders_data'**

In [None]:
# UTILITIES
# 'city' takes values in {'A', 'B', 'C'} (the keys of the data dicts loaded above)

def driver_union(drivers_dict):
    """
    Union of the drivers seen on any day for the globally selected `city`.

    drivers_dict: maps a city key to a list of per-day driver DataFrames; each
                  frame must carry the columns 'Unnamed: 0' and 'de_id'.
    Returns one row per distinct 'de_id' (its first occurrence across the
    stacked days), without the 'Unnamed: 0' column and with a fresh 0..n-1 index.
    """
    daily_frames = list(drivers_dict[city])

    stacked = pd.concat(daily_frames).drop(columns='Unnamed: 0')
    first_seen = stacked.drop_duplicates('de_id')

    # rebuild a clean positional index and discard the old one
    return first_seen.reset_index().drop(columns='index')


def driver_intersection(drivers_dict):
    """
    Finding intersection of all the drivers over the days

    drivers_dict: maps a city key to a list of per-day driver DataFrames, each
                  with at least the columns 'de_id', 'lat' and 'lng'.
    Returns a DataFrame with columns ['de_id', 'lat', 'lng'] holding the
    drivers present on every day of the globally selected `city`. Rows follow
    the order of the first day's frame and lat/lng are the first day's values.

    Raises ValueError when the city has no day frames at all.
    """
    daily_frames = list(drivers_dict[city])
    if not daily_frames:
        raise ValueError(f"no driver data available for city {city!r}")

    # Start from the first day's coordinates and keep only the drivers that
    # also appear on each later day. Only the key column of the later days is
    # merged in: chaining full-frame merges makes pandas generate the same
    # '_x'/'_y' suffixed column names again from the fourth frame on, which
    # newer pandas rejects with a MergeError, and with a single day no
    # 'lat_x' column exists at all.
    driver_idf = daily_frames[0][['de_id', 'lat', 'lng']]
    for day_df in daily_frames[1:]:
        driver_idf = pd.merge(driver_idf, day_df[['de_id']], on='de_id')

    return driver_idf.reset_index(drop=True)


def drivers_zones(drivers_dict, zones_dict):
    """
    Build the two location arrays fed to fair_clustering.

    Returns (driver_locs, zone_locs):
      driver_locs -- the ['lat', 'lng'] values of the drivers returned by
                     driver_intersection(drivers_dict)
      zone_locs   -- the ['lat', 'lng'] values of the zone table of the
                     globally selected `city`
    """
    common_drivers = driver_intersection(drivers_dict)
    driver_locs = common_drivers[['lat', 'lng']].values

    city_zones = zones_dict[city]
    zone_locs = np.array(city_zones[['lat', 'lng']])

    return driver_locs, zone_locs


def get_capacities(zones_dict):
    """
    Derive per-zone capacity bounds from the 'avg_cap' column of the zone
    table of the globally selected `city`.

    Returns (lower_caps, upper_caps), two new arrays with one entry per zone:
      lower_caps = 0.3 * avg_cap
      upper_caps = 1.0 * avg_cap
    """
    avg_cap = zones_dict[city]['avg_cap'].values

    # Both products allocate fresh arrays, so later in-place edits of one
    # bound never touch the other bound or the zone table.
    return 0.3*avg_cap, 1.0*avg_cap

In [None]:
# LOGGER

import logging

logs_path = "./logs"
# exist_ok=True removes the check-then-create race of exists()+mkdir().
os.makedirs(logs_path, exist_ok=True)

# Build the file name from logs_path so the directory is defined in one place only.
log_file = os.path.join(logs_path, f"{city}_{NUM_DRIVERS}_{NUM_SIM}.log")

# NOTE: basicConfig() does nothing if the root logger already has handlers
# (e.g. when this cell is re-run in the same kernel); restart the kernel to
# switch to a new log file.
logging.basicConfig(filename=log_file, format='%(asctime)s  %(message)s', filemode='w')
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

**Getting the inputs to fair_clustering**

In [None]:
# Drivers seen on any day (union) and on every day (intersection) for the selected city.
driver_udf = driver_union(driver_dfs_dict)
driver_idf = driver_intersection(driver_dfs_dict) 

# (lat, lng) arrays of the intersection drivers and of the zones.
driver_locs, zone_locs = drivers_zones(driver_dfs_dict, zone_dfs_dict)
num_drivers, num_centres = driver_locs.shape[0], zone_locs.shape[0]

# Per-zone lower / upper capacity arrays derived from 'avg_cap'.
lower_caps, upper_caps = get_capacities(zone_dfs_dict)

Here in the entire experimentation, we consider only those drivers who were present/delivered on $7$ days (see drivers_data). The number of such drivers for different cities is as follows:
- City $A$ : $977$
- City $B$ : $3655$ 
- City $C$ : $4365$   
   
These are also the number of drivers obtained in **driver_idf**. In the rest of the codebase, however, we use the first $6$ days of the above referenced $7$ days.

In [None]:
def euclidean_distance(d_loc, z_loc):
    """
    Straight-line distance between two points given as (lat, lng) pairs.

    Only the first two entries of `d_loc` and `z_loc` are used; the result is
    expressed in the same (coordinate) units as the inputs.
    """
    d_lat, d_lng = d_loc[0], d_loc[1]
    z_lat, z_lng = z_loc[0], z_loc[1]
    squared_sum = np.power(d_lat-z_lat, 2) + np.power(d_lng-z_lng, 2)
    return np.sqrt(squared_sum)

def L2Distance(data):
  """
  Pairwise Euclidean distance matrix.

  data: (n, d) array of latitude-longitude level locations
  Returns an (n, n) array whose [i, j] entry is the distance between rows j and i.
  """
  as_column = np.expand_dims(data, axis = 1)      # (n, 1, d): broadcasts against (n, d)
  squared_diffs = np.power(data - as_column, 2)   # (n, n, d)
  summed = np.abs(squared_diffs).sum(axis = 2)
  return np.power(summed, 0.5)

In [None]:
# Keep only the first NUM_DRIVERS drivers; -1, or a value larger than the
# number of available drivers, keeps every driver.
if NUM_DRIVERS!=-1 and NUM_DRIVERS<=driver_locs.shape[0]:
    driver_locs = driver_locs[:NUM_DRIVERS] 

# Pairwise driver-to-driver distances.
driver_dists = L2Distance(driver_locs)

# Adjust lower_caps for given NUM_DRIVERS
# Do not adjust upper_caps
num_drivers = driver_locs.shape[0]
print(f"Sum of lower capacities: {lower_caps.sum()}")
# Lower every zone's lower capacity by 50 (floored at 0) until the total fits
# the number of drivers. lower_caps is modified in place, so re-running this
# cell starts from the already reduced values.
while(lower_caps.sum()>num_drivers):
    # print(lower_caps)
    for idx in range(lower_caps.shape[0]):
        lower_caps[idx] = max(lower_caps[idx]-50, 0)

print(f"Sum of (adjusted) lower capacities: {lower_caps.sum()}")
print(f"Sum of upper capacities: {upper_caps.sum()}")
      
# NOTE(review): both comparisons are strict, so a driver count exactly equal to
# the summed lower (or upper) capacities is rejected here -- confirm that is intended.
assert num_drivers>lower_caps.sum() and num_drivers<upper_caps.sum(), \
"This set of num_drivers, lower_caps and upper_caps will lead to an infeasible solution !"

In [None]:
num_drivers, num_zones = driver_locs.shape[0], zone_locs.shape[0]

# dz_dist[d, z] = euclidean distance between driver d and zone z
dz_dist = np.zeros(shape=(num_drivers, num_zones))
for d_idx, driver in enumerate(driver_locs):
    dz_dist[d_idx] = [euclidean_distance(driver, zone) for zone in zone_locs]

# print(dz_dist.shape)

In [None]:
def get_prohibited_assigns(d_dist, k):
    """
    Mark, for every driver, the zones that are NOT among its k nearest ones.

    d_dist: [num_drivers x num_zones] matrix of driver-to-zone distances
    k:      number of nearest zones each driver may be assigned to

    Returns a float array of the same shape as `d_dist` holding 1 where the
    assignment is prohibited and 0 where it is allowed. With k >= num_zones
    nothing is prohibited; with k == 0 everything is.

    Raises ValueError for a negative k.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # Take the shape from the input itself rather than from notebook globals.
    dist_matrix = np.asarray(d_dist)
    n_drivers, n_zones = dist_matrix.shape
    prohibited_assignments = np.zeros(shape=(n_drivers, n_zones))

    # argpartition() rejects kth == n_zones; in that case every zone is allowed.
    if k >= n_zones:
        return prohibited_assignments

    for d_idx, zone_dists in enumerate(dist_matrix):
        partitioned = np.argpartition(zone_dists, k)
        # positions k.. hold the indices NOT corresponding to the k smallest distances
        prohibited_assignments[d_idx, partitioned[k:]] = 1

    return prohibited_assignments

In [None]:
# assigning ratings to sellers:
from scipy.stats import truncnorm
from numpy.random import SeedSequence 
from numpy.random import default_rng

def get_truncated_normal(mean, sd, low, upp):
    """
    Frozen scipy truncated-normal distribution with the given mean / standard
    deviation, restricted to the interval [low, upp].
    """
    # truncnorm expects its bounds in standard-deviation units around the mean
    lower_z = (low-mean)/sd
    upper_z = (upp-mean)/sd
    return truncnorm(lower_z, upper_z, loc=mean, scale=sd)

def generate_ratings(num_drivers):
    """
    Draw `num_drivers` synthetic ratings from a normal distribution
    (mean 3.5, sd 1) truncated to [0.0, 5.0], rounded to one decimal.

    The generator is re-seeded with a fixed value on every call, so repeated
    calls with the same `num_drivers` return the same list.
    """
    seed_value = 36778738061272522495168595294022739449 # arbitrary
    rating_dist = get_truncated_normal(3.5, 1, 0.0, 5.0)
    samples = rating_dist.rvs(num_drivers, random_state=default_rng(seed_value))
    return [round(sample, 1) for sample in samples]

def abs_difference(ratings):
    """
    Pairwise absolute differences: entry [i, j] is |ratings[j] - ratings[i]|.
    """
    as_column = np.expand_dims(ratings, axis=1)
    return abs(ratings - as_column)

def minmax(distance, fair_distance):
    """
    Min-max normalise a square distance matrix to [0, 1] and map
    `fair_distance` through the same affine transform.

    distance:      [n x n] numpy array of pairwise distances
    fair_distance: scalar threshold expressed in the units of `distance`

    Returns (normalised_distance, normalised_fair_distance).

    Raises ValueError when all entries of `distance` are equal (the scaling is
    undefined and would otherwise silently yield NaN/inf), or when `distance`
    is not square.
    """
    num_samples = len(distance)
    mx, mn = distance.max(), distance.min()
    span = mx - mn
    if span == 0:
        raise ValueError("minmax is undefined for a constant distance matrix (max == min)")

    # Vectorised scaling; the reshape keeps the original 'must be n x n' check.
    scaled = ((distance - mn) / span).reshape((num_samples, num_samples))
    fair_distance = (fair_distance - mn) / span
    return scaled, fair_distance

In [None]:
# fair_dist = driver_dists.mean()/dnr

# dnr to NUM_SIM maps: (tells the number of similar drivers per driver for a given dnr (hence fair_dist))
# manually curated for quick experimentation
if flag==0:
    # Per-city lookup {NUM_SIM: dnr}; the fair distance used later is
    # driver_dists.mean()/dnr. These dicts are only defined for flag == 0.
    sim2dnr_A = {80:4, 60:5, 40:7, 30:8, 20:11, 15:14, 10:21, 7:28, 5:53} 
    sim2dnr_B = {70:8, 60:9, 50:10, 40:12, 30:14, 20:19, 15:23, 10:32, 7:45, 5:65}
    sim2dnr_C = {90:10, 80:11, 70:12, 60:13, 50: 15, 40:17, 30:22, 20:32, 15:42, 10:65, 8:80}


These values are true only when all the drivers in "driver_idf" are considered !   
For "driver_udf", you can create these dicts separately by using the following (commented out) cell.    
If these dicts are not created then we'll take driver_dists.mean() as the fair distance by default.   
Recall, fair distance: the distance within which a pair of drivers is considered similar.

In [None]:
# IGNORE: 
# -------
"""
# used for creating sim2dnr_city
driver_dists = L2Distance(driver_locs)
fair_dist = driver_dists.mean()/7
# print(fair_dist)

num_similar_drivers = []
for idx in range(len(driver_dists)):
    curr_driver = driver_dists[idx]
    # the drivers 'similar' to this driver are the ones within fair_distance from this driver
    num_sim = curr_driver[curr_driver<=fair_dist].shape[0]
    # print(num_sim)
    num_similar_drivers.append(num_sim) 

# print(num_similar_drivers)
print("Number of similar drivers:", np.mean(num_similar_drivers))
"""

In [None]:
# IGNORE: 
# -------
"""
# used for creating sim2dnr_city
driver_dists = L2Distance(driver_locs)
ratings = generate_ratings(NUM_DRIVERS)
ratings_dists = abs_difference(ratings)
combined = w1*minmax(driver_dists, 0)[0] + w2*minmax(ratings_dists, 0)[0]
fair_dist = driver_dists.mean()/3
# print(fair_dist)

num_similar_drivers = []
for idx in range(len(combined)):
    curr_driver = combined[idx]
    # the drivers 'similar' to this driver are the ones within fair_distance from this driver
    num_sim = curr_driver[curr_driver<=fair_dist].shape[0]
    # print(num_sim)
    num_similar_drivers.append(num_sim) 

# print(num_similar_drivers)
print("Number of similar drivers:", np.mean(num_similar_drivers))
# """

In [None]:
# Map every zone id of the selected city to its row position in the zone table.
zone_ids = zone_dfs_dict[city]['zone_id']
zone_id2idx = dict(zip(zone_ids, range(len(zone_ids))))

---

**ALGORITHMS**

**FairAssign**

In [None]:
# will go with the default parameters of cplex:
from cplex import Cplex
# Throw-away model used only to display CPLEX's default simplex tolerances
# (feasibility, optimality, markowitz) as this cell's output.
model = Cplex()
model.parameters.simplex.tolerances.feasibility.get(),\
model.parameters.simplex.tolerances.optimality.get(),\
model.parameters.simplex.tolerances.markowitz.get()      

# model.parameters.workmem.set(10240) # 10GB  
# model.parameters.emphasis.memory.set(1)

In [None]:
# Fair Clustering - LPP contstraints and Cplex
from cplex import Cplex
# from lp_tools import *
from lp_tools_kn import * # with coefficient adjustment for k-nearest zone allocation

alpha_fair = 2

def fair_clustering(dataset, centres, lower_cap, upper_cap, fair_distance, prohibited_assignments):
  """
  Build the fair-assignment LP and solve it with CPLEX.

  dataset:                driver locations
  centres:                zone locations
  lower_cap, upper_cap:   per-zone capacity bounds
  fair_distance:          distance threshold passed to the fairness constraints
  prohibited_assignments: matrix passed to prepare_to_add_variables (lp_tools_kn)

  Besides its arguments the function reads the notebook globals `alpha_fair`,
  `ratings`, `flag` and `logger`; they must be defined before it is called.

  Returns a dict with the keys "status", "success" (status string),
  "objective" and "assignment" (the flat list of variable values).

  Raises RuntimeError when the solver status is not 1 (optimal).
  """
  # Step 1: create the solver instance with tightened simplex tolerances
  problem = Cplex()
  problem.parameters.simplex.tolerances.feasibility.set(float(1e-9))
  problem.parameters.simplex.tolerances.optimality.set(float(1e-9))
  problem.parameters.simplex.tolerances.markowitz.set(float(0.9999))
  problem.parameters.emphasis.memory.set(1)
  problem.parameters.workmem.set(10240)

  # Step 2: declare that this is a minimization problem
  problem.objective.set_sense(problem.objective.sense.minimize)

  # Step 3: declare and add the variables.
  #   objective:      list of coefficients (float) in the linear objective function
  #   lower_bound:    list of floats containing the lower bound of each variable
  #   upper_bound:    list of floats containing the upper bound of each variable
  #   variable_names: list of strings with the names of the variables
  # This is the "lp_tools_kn" signature (it takes the prohibited assignments);
  # the plain "lp_tools" variant is prepare_to_add_variables(dataset, centres).
  print("Adding Variables...")
  objective, lower_bound, upper_bound, variable_names, P,C = prepare_to_add_variables(dataset, centres, prohibited_assignments)
  problem.variables.add(
      obj = objective,
      lb = lower_bound,
      ub = upper_bound,
      names = variable_names
    )
  print("Variables Added !")

  # Step 4: declare and add the constraints, non-zero entry wise.
  #   coefficients: (row, column, value) triples of the constraint matrix
  #   senses:       "E" equals (=), "L" less than or equal (<=), "G" greater than or equal (>=)
  #   rhs:          list of floats with the right-hand side of each constraint
  #   row_names:    list of strings with the names of the constraints
  print("Adding Constraints...")
  rhs, senses, row_names, coefficients = prepare_to_add_constraints(dataset, centres, upper_cap,lower_cap, P,C, alpha_fair, fair_distance, ratings, flag)
  print("num_constraints:", len(senses))
  logger.info(f"\t\t\tnum_constraints = {len(senses)}")
  problem.linear_constraints.add(
      rhs = rhs,
      senses = senses,
      names = row_names
    )
  problem.linear_constraints.set_coefficients(coefficients)
  print("Constraints Added !")

  # Step 5: solve the problem
  problem.solve()

  # Check the status BEFORE reading objective / values, so that a non-optimal
  # run fails with a message naming the status instead of a solver error
  # raised by the value getters.
  # status is a number: 1 = optimal, 2 = unbounded ray, 3 = infeasible
  solution_status = problem.solution.get_status()
  status_string = problem.solution.get_status_string()
  if solution_status != 1:
    raise RuntimeError(f"Solution isn't optimal ! (status {solution_status}: {status_string})")

  result = {
    "status": solution_status,
    "success": status_string,
    "objective": problem.solution.get_objective_value(),
    "assignment": problem.solution.get_values(),
  }

  qm = problem.solution.quality_metric
  print("Solution Quality:", problem.solution.get_float_quality([qm.max_x, qm.max_primal_infeasibility]))
  print("Status:", status_string) # optimal, unbounded ray, infeasible

  return result

In [None]:
# Fair Assignment of drivers to the FFCs / warehouses
import copy
import dependent_rounding as dp

# configParser.read(configFilePath)

num_samples, num_centres = driver_locs.shape[0], zone_locs.shape[0]

def fair_assignment(prob_dis, driver_loc):
  '''
  Assign every driver to one zone by dependent rounding of its probability distribution.

  prob_dis:   [num_drivers x num_zones] matrix, the result of the Fair-LP
              program "fair_clustering"; it is deep-copied, never modified
  driver_loc: [num_drivers x 2] driver locations (lat, lng)

  Returns (driver_df, final_assignment):
    driver_df        -- columns "geolocation_lat", "geolocation_lng" and
                        'ffc_index' (the assigned zone index, -1 if unassigned)
    final_assignment -- the rounded matrix, rounded to 2 decimals

  Raises ValueError when the rounded matrix and `driver_loc` disagree on the
  number of drivers.
  '''
  prob_dist = copy.deepcopy(prob_dis)

  rounding = dp.DependentRounding(prob_dist)
  rounding._buildGraph(prob_dist)
  final_assignment = np.around(rounding.round(), 2)
  print(final_assignment)

  driver_df = pd.DataFrame(driver_loc,columns=["geolocation_lat","geolocation_lng"])
  driver_df['ffc_index'] = -1 # unassigned

  # Size the loop from the rounded matrix itself, not from notebook globals
  # that may describe a different run.
  n_rows = final_assignment.shape[0]
  if n_rows != len(driver_df):
    raise ValueError(f"got {n_rows} rounded rows for {len(driver_df)} driver locations")

  # an entry counts as 'assigned' when it lies within 0.2 of 1
  near_one = np.abs(final_assignment-1) < 0.2
  for i in range(n_rows):
    hits = np.flatnonzero(near_one[i])
    if hits.size:
      # keep the last matching zone, as the original nested scan did
      driver_df.at[i,'ffc_index'] = int(hits[-1])

  return driver_df, final_assignment


In [None]:
def sanityCheck(probs):
    """
    Repair tiny negative probabilities left behind by the LP solver.

    The solver may violate the lower bound of 0.0 by up to its feasibility
    tolerance. For every row, negative entries are set to 0 and their
    (negative) total is added to the row's largest entry, so the row sum is
    unchanged. `probs` is modified in place and also returned.

    Raises AssertionError if an entry is below -1e-9, or if the largest entry
    of a row is not positive after the correction.
    """
    for row in range(len(probs)):
        deficit = 0

        for col in range(len(probs[0])):
            value = probs[row][col]
            assert value >= -1e-9 # 1e-6

            if value < 0:
                deficit += value
                probs[row][col] = 0

        # move the removed (negative) mass onto the largest entry of the row
        top_col = np.argmax(probs[row])
        probs[row][top_col] += deficit

        assert probs[row][top_col] > 0

    return probs


def picklify(ds, filepath):
    """
    Serialise `ds` to `filepath` with pickle, overwriting any existing file.

    The file is opened in a `with` block so the handle is closed even when
    pickle.dump() raises. Returns None.
    """
    with open(filepath, "wb") as pickle_file:
        pickle.dump(ds, pickle_file)
    return

In [None]:
# main :
def FairAssign_solver(driver_locs, zone_locs, lower_cap, upper_cap, fair_distance, prohibited_assignments):
    """
    Run the whole FairAssign pipeline: Fair-LP -> sanity check -> dependent rounding.

    Returns (prob_dist, final_assignment):
      (None, None)      if the LP could not be solved or the sanity check failed
      (prob_dist, None) if only the dependent rounding failed
      (prob_dist, final_assignment) otherwise, where prob_dist is the
      [num_drivers x num_zones] probability matrix and final_assignment holds
      one zone index per driver.

    Failures are logged with their traceback via the notebook-level `logger`.
    Only `Exception` subclasses are caught, so KeyboardInterrupt / SystemExit
    still stop a long run.
    """
    # Dimensions come from the arguments rather than from notebook globals.
    n_drivers, n_zones = len(driver_locs), len(zone_locs)

    # Fair-LP:
    try:
        lp_output = fair_clustering(driver_locs, zone_locs, lower_cap, upper_cap, fair_distance, prohibited_assignments)
    except Exception:
        logger.exception("Solution Non-optimal (Unbounded Ray or Infeasible) !")
        return None, None
    # the first n_drivers*n_zones LP variables are the assignment probabilities
    prob_dis = np.reshape(lp_output['assignment'][:n_drivers*n_zones], (-1, n_zones))

    try:
        prob_dist = sanityCheck(copy.deepcopy(prob_dis)) # this might raise an assertion error
    except Exception:
        logger.exception("Sanity Check Assertion !")
        return None, None

    # Randomized Dependent Rounding:
    try:
        df = fair_assignment(prob_dist, driver_locs)[0] # this might raise an assertion error
        final_assignment = df['ffc_index'].values
    except Exception:
        logger.exception("Dependent Rounding Assertion !")
        return prob_dist, None

    return prob_dist, final_assignment
    

In [None]:
# COMMENT OUT THIS CELL IF THE ASSIGNMENT PICKLE FILE IS ALREADY PRESENT #

print(f"# drivers : {num_drivers}")
print(f"# zones : {num_zones}")

# Numbers of nearest zones to try. Every city currently uses the single value
# K_VAL; add more entries here for experimentation.
nk_list = [K_VAL]

print("How many nearest zones? :", nk_list)

# Denominator of the fair distance. Fall back to 1 (fair distance = mean
# driver distance) when the sim2dnr_* dicts were not created (NameError) or
# have no entry for this city / NUM_SIM (KeyError). Any other error surfaces.
try:
    fd_dnr = {'A': sim2dnr_A, 'B': sim2dnr_B, 'C': sim2dnr_C}[city][NUM_SIM]
except (NameError, KeyError):
    fd_dnr = 1

fd_list = [driver_dists.mean()/fd_dnr] # fair_distances
# `ratings` is read as a global by fair_clustering
ratings = generate_ratings(num_drivers)

num_runs = 1
# nk_list and fd_list contain hyperparameters

# k_fd_dict[k][fd_idx][num_run] -> {'p_dist': ..., 'assignment': ...}
k_fd_dict = {
    k: {
        fd_idx: {
            num_run: {'p_dist':None, 'assignment':None}
            for num_run in range(num_runs)
        }
        for fd_idx in range(len(fd_list))
    }
    for k in nk_list
}

# directory to store the results of FairAssign_solver
assign_results_path = f"assign_results/results_{city}/"
os.makedirs(assign_results_path, exist_ok=True)

# Shared file-name suffix; the weight code is appended only when it is set.
run_tag = f"{NUM_DRIVERS}_{NUM_SIM}_{K_VAL}_{flag}"
if wts!=-1:
    run_tag = f"{run_tag}_{wts}"

start = time.time()
for k in nk_list:
    logger.info(f"Considering k = {k} nearest zones")
    prhbtd_assigns = get_prohibited_assigns(dz_dist, k)

    for f_idx, fair_distance in enumerate(fd_list):
        logger.info(f"\tfair_distance = {fair_distance}")
        for num_run in range(num_runs):
            logger.info(f"\t\tnum_run = {num_run}")
            prob_dist, final_assignment = FairAssign_solver(driver_locs, zone_locs, lower_caps, upper_caps, fair_distance, prhbtd_assigns)
            k_fd_dict[k][f_idx][num_run]['p_dist'] = prob_dist
            k_fd_dict[k][f_idx][num_run]['assignment'] = final_assignment

    # Store intermediate results as well as fail-safe:
    # saving current state of "k_fd_dict":
    filepath = os.path.join(assign_results_path, f"dict_k={k}_{run_tag}.pickle")
    picklify(k_fd_dict, filepath)

end = time.time()
print(f"Execution time: {(end-start)/3600}hrs")
logger.info(f"Execution time: {(end-start)/3600}hrs")

final_file_path = os.path.join(assign_results_path, f"Assignments_{city}_{run_tag}.pickle")

picklify(k_fd_dict, final_file_path)
print("Assignments file saved in:", final_file_path)

In [None]:
def get_assignment(result_path, driver_locs, k):
    '''
    returns ffc_index (or zone index) for each driver based on the "FairAssign" assignment

    result_path: pickle file written by the assignment step; it holds
                 k_fd_dict[k][fd_idx][num_run] -> {'p_dist', 'assignment'}
    driver_locs: driver locations passed on to fair_assignment
    k:           number of nearest zones the stored run was computed for

    A fresh dependent rounding of the stored distribution [k][0][0] is drawn
    on every call.

    Raises ValueError when the stored distribution is None, i.e. the solver
    run that produced the file did not succeed.
    '''
    # SECURITY: pickle.load can execute arbitrary code -- only open result
    # files produced by this notebook.
    with open(result_path, "rb") as pickle_file:
        assignments = pickle.load(pickle_file)

    prob_dist = assignments[k][0][0]['p_dist']
    if prob_dist is None:
        raise ValueError(f"no probability distribution stored for k={k} in {result_path}")

    ## get assignment by applying dependent rounding:
    df = fair_assignment(prob_dist, driver_locs)[0]
    final_assignment = df['ffc_index'].values

    return final_assignment

# RoundRobin assignment while maintaining upper capacity bounds of zones only:
def round_robin_dist(day_num, driver_locs):
    """
    Baseline assignment that only respects the zones' upper capacities.

    Every driver starts at zone (day_num+1) % num_zones and moves on to the
    next zone (cyclically) while the current one has no capacity left.
    Capacities are read from the notebook global `upper_caps`, which is not modified.

    Returns a DataFrame with columns 'lat', 'lng' and 'bz_idx_rr' (zone index).

    Raises ValueError when the zones cannot hold all drivers (the search for a
    free zone would otherwise never terminate).

    NOTE(review): the start zone does not depend on the driver index, so zones
    are filled one after another rather than in turn -- confirm that this is
    the intended "round robin".
    """
    num_drivers = len(driver_locs)
    num_zones = len(upper_caps)

    rr_df = pd.DataFrame(driver_locs, columns=['lat', 'lng'])
    rr_df['bz_idx_rr'] = -1
    remaining_cap = copy.deepcopy(list(upper_caps))

    for i in range(num_drivers):
        if not any(cap > 0 for cap in remaining_cap):
            raise ValueError(f"zone capacities exhausted after assigning {i} of {num_drivers} drivers")
        zone = (day_num+1) % num_zones
        while(remaining_cap[zone]<=0):
            zone = (zone+1) % num_zones
        rr_df.at[i, 'bz_idx_rr'] = int(zone)
        remaining_cap[zone] -= 1
    return rr_df

# Random assignment while maintaining only upper capacity bounds of the zones:
def random_dist(day_num, driver_locs, upper_caps):
    """
    Baseline assignment: every driver gets a uniformly random zone that still
    has capacity left (only the upper capacities are respected).

    day_num:     seeds Python's `random` module with 1234+day_num, so a given
                 day always yields the same assignment
    driver_locs: [num_drivers x 2] driver locations (lat, lng)
    upper_caps:  per-zone upper capacities; not modified

    Returns a DataFrame with columns "lat", "lng" and 'bz_idx_rand' (zone index).

    Raises ValueError when the zones cannot hold all drivers (the rejection
    sampling would otherwise never terminate).
    """
    random.seed(1234+day_num)
    num_drivers = len(driver_locs)
    # the number of zones follows from the argument, not from a notebook global
    num_zones = len(upper_caps)

    rand_df = pd.DataFrame(driver_locs, columns=["lat", "lng"])
    rand_df['bz_idx_rand'] = -1
    remaining_cap = copy.deepcopy(list(upper_caps))

    for i in range(num_drivers):
        if not any(cap > 0 for cap in remaining_cap):
            raise ValueError(f"zone capacities exhausted after assigning {i} of {num_drivers} drivers")
        # rejection sampling: redraw until a zone with spare capacity is hit
        zone = random.randint(1, num_zones)-1
        while(remaining_cap[zone]<=0):
            zone = random.randint(1, num_zones)-1
        rand_df.at[i, 'bz_idx_rand'] = zone
        remaining_cap[zone] -= 1
    return rand_df

# LIPA while maintaining upper capacity bounds of zones only:
def lipa_dist(driver_locs, driver_prev_incomes, zone_prev_incomes):
    """
    LIPA baseline: walk through the drivers from lowest to highest previous
    income and put each one into the highest-income zone that still has
    capacity (only the upper capacities are respected).

    driver_locs:         [num_drivers x 2] driver locations (lat, lng)
    driver_prev_incomes: previous income per driver, aligned with driver_locs
    zone_prev_incomes:   previous income per zone, aligned with the notebook
                         global `upper_caps` (which is not modified)

    Returns a DataFrame with columns "lat", "lng" and 'bz_idx_lipa' (zone index).

    Raises ValueError when zone_prev_incomes and upper_caps differ in length,
    or when the zones cannot hold all drivers.
    """
    lipa_df = pd.DataFrame(driver_locs, columns=["lat", "lng"])
    lipa_df['bz_idx_lipa'] = -1

    remaining_cap = copy.deepcopy(upper_caps)

    driver_idx_inc = np.argsort(np.array(driver_prev_incomes))
    zone_idx_inc = np.argsort(np.array(zone_prev_incomes))
    if len(zone_idx_inc) != len(remaining_cap):
        raise ValueError(f"got {len(zone_idx_inc)} zone incomes for {len(remaining_cap)} zone capacities")

    # j points at the richest zone that may still have room; it only moves down
    j = len(zone_idx_inc)-1
    for i in driver_idx_inc:
        while j >= 0 and remaining_cap[zone_idx_inc[j]] <= 0:
            j = j-1
        if j < 0:
            # without this check j would wrap around via negative indexing
            raise ValueError("zone capacities exhausted before all drivers were assigned")
        zone = zone_idx_inc[j]
        lipa_df.at[i, 'bz_idx_lipa'] = zone
        remaining_cap[zone] -= 1
    return lipa_df

In [None]:
# Deliberate stop: this always-failing assert halts "Run All" after Step 1 so
# that the cells below are not executed unintentionally.
assert(False), "Probability distributions have been calculated! Now, just perform Step 2 (File generation) for all 6 days before going to Step 3 [refer to the instructions in the next part]" 

---

# **2. FILE GENERATION**

- Now we have the probability distributions corresponding to each driver. The next step is to round these distributions to get an assignment for 'day_num', and then run a last-mile delivery algorithm on top of this assignment.

- However, in order to perform simulations using a last-mile delivery algorithm such as [FoodMatch](https://github.com/idea-iitd/FoodMatch/tree/master/Swiggy) or [FairFoody](https://github.com/idea-iitd/fairfoody), we need to generate relevant "de_intervals" files that are required by those algorithms based on our assignment.

Step 2 (File Generation) needs to run separately for each of the $6$ days. On each day, for each driver $d$, sample a zone by rounding the probability distribution obtained for $d$ in Step 1 (Assignments). Store the assignments by each algo on each day (will be used later while computing the 'spatial stability' evaluation metric)    

In [None]:
num_days = 6
algos = ['FairAssign', 'RoundRobin', 'LIPA']

# Size the per-day lists from the drivers actually in use: NUM_DRIVERS may be
# -1 ("take all drivers"), and [-1]*NUM_DRIVERS would then be an empty list.
num_assigned_drivers = len(driver_locs)

global_assignments = {
    alg: {day: [-1]*num_assigned_drivers for day in range(1, num_days+1)}
    for alg in algos
}
# global_assignments : dict[dict[List]], i.e. global_assignments[algo][day_num] is a list with one entry per driver;
# its i-th element is the zone assigned to the i-th driver by the respective algorithm (-1 until filled in)
# we don't need to store driver ids in 'global_assignments' since they follow the same order as present in 'driver_idf'


The following cells (upto Step 3 (Simulations)) need to run for each **day_num** i.e., *day_num* $\in$ {1, 2, 3, 4, 5, 6} and each **algorithm** i.e., *algorithm* $\in$ algos, before going to Step 3.

In [None]:
# The aim of this cell is to get "assign_df"
# For every (algo, day_num) pair the loop builds `assign_df` (one row per driver, with the
# original zone index 'bz_idx' and the newly assigned zone index 'new_bz_idx') and stores the
# 'new_bz_idx' column in global_assignments[algo][day_num].
# After the loop finishes, `assign_df` is whatever the LAST (algo, day_num) iteration produced.
# NOTE(review): an algo outside {'FairAssign', 'RoundRobin', 'LIPA'} leaves assign_df as None,
# so the final line of the loop body would fail on `assign_df['new_bz_idx']`.
for algo in algos:
    for day_num in range(1, num_days+1):
        print(algo)
        assign_df = None

        if algo=='FairAssign':
            # NOTE(review): the path is hard-coded to "results_A" and does not use `city` — confirm for city 'B'.
            # NOTE(review): nothing in this branch depends on day_num; presumably the same
            # assignment is produced for every day — verify against get_assignment.
            result_path = "./assign_results/results_A/Assignments_10z_32simahm.pickle"
            assignment = get_assignment(result_path, driver_locs, K_VAL) 
            assign_df = copy.deepcopy(driver_idf[:NUM_DRIVERS]) 
            # original base zones
            assign_df = pd.merge(assign_df, base_zones_dict[city][['de_id', 'base_zone']], on='de_id')
            assign_df['bz_idx'] = assign_df['base_zone'].map(zone_id2idx)
            # base zones assigned by Fair Assign
            assign_df['new_bz_idx'] = assignment
            # number of drivers whose assigned zone differs from their original zone
            print(assign_df[assign_df['bz_idx']!=assign_df['new_bz_idx']].shape[0])
            
        elif algo=='RoundRobin':
            rr_df = round_robin_dist(day_num, driver_locs)
            assign_df = copy.deepcopy(driver_idf[:NUM_DRIVERS])
            # original base zones
            assign_df = pd.merge(assign_df, base_zones_dict[city][['de_id', 'base_zone']], on='de_id')
            assign_df['bz_idx'] = assign_df['base_zone'].map(zone_id2idx)
            # base zones assigned by "RoundRobin"
            # `.values` makes the assignment positional (row order), not index-aligned
            assign_df['new_bz_idx'] = rr_df['bz_idx_rr'].values

        elif algo=='LIPA':
            # use the first day of FoodMatch as the first day of LIPA 

            # Requires simulation results of previous day
            local_incomes_df = pd.DataFrame(columns=['de_id', 'day1', 'day2', 'day3', 'day4', 'day5', 'day6']) 
            local_incomes_df['de_id'] = driver_idf['de_id']

            day_incomes = {d_id:None for d_id in driver_idf['de_id']}

            # reads the dump of day (day_num-1); for day_num == 1 this is the file "sim_results_lipa0"
            sim_path = f'results/sim_results/sim_results_{city}/{NUM_DRIVERS}_{algo}/sim_results_lipa{day_num-1}'
            print(sim_path)

            # the dump is parsed as up to 8 unnamed comma-separated fields; malformed lines are skipped
            data = pd.read_csv(sim_path, names=["a", "b", "c", "d", "e", "f", "g", "h"], on_bad_lines='skip')
            # keep only "DELIVER" rows; the remaining fields b, c, d are renamed below
            data_deliver = data[data['a'] == "DELIVER"].drop(['a', 'e', 'f', 'g', 'h'], axis = 1)
            data_deliver.columns = ['order_id', 'delivered_time', 'vehicle_id'] 
            vehicle_ids = data_deliver['vehicle_id'].unique() 
            # vehicle_ids = driver_idf['de_id'].values
            # print(vehicle_ids)
            data_deliver_gb = data_deliver.groupby('vehicle_id')
            # "income" of a driver = number of DELIVER rows attributed to that driver
            for d_id in driver_idf['de_id']:
                try:
                    day_incomes[d_id] = int(data_deliver_gb.get_group(d_id).shape[0])
                except:
                    # handles the cases for which d_id is not present in data_deliver 
                    # NOTE(review): the bare except also hides every other error raised in the try body
                    continue
                            
            local_incomes_df[f'day{day_num}'] = local_incomes_df['de_id'].map(day_incomes) 
            # previous day incomes are used to determine the next day's assignment:
            # NOTE(review): fm_incomes_df is built here but not read again within this cell
            fm_incomes_df = pd.DataFrame(columns=['de_id', 'day1', 'day2', 'day3', 'day4', 'day5', 'day6']) 
            fm_incomes_df['de_id'] = driver_idf['de_id']

            fm_inc_df = copy.deepcopy(local_incomes_df)
            fm_inc_df = pd.merge(fm_inc_df, driver_idf, on='de_id')
            print(day_incomes, fm_inc_df)
            # find day 1 incomes of drivers (FairAssign):
            # NOTE(review): only column f'day{day_num}' was filled above, yet 'day1' is read here;
            # for day_num > 1 the 'day1' column of this freshly created frame holds no values — confirm intent.
            prev_incomes_df = copy.deepcopy(fm_inc_df[['de_id', 'lat', 'lng', 'day1']])
            driver_prev_incomes = prev_incomes_df['day1'].values

            # find day 1 number of orders in each zone (FairAssign):
            # NOTE(review): `day` is not assigned anywhere in this cell; it must already exist in the
            # kernel namespace (presumably day_num-1 or day_num was intended) — confirm.
            orders_data = pd.read_csv(f"data/orders_data/orders_{day}.csv") 
            df = pd.merge(data_deliver, orders_data, on='order_id') 
            cust_zones = df['customer_zone'].unique()
            cust_zones_gb = df.groupby('customer_zone')

            # delivered-order count per customer zone, restricted to zones listed in zone_ids
            orders_per_zone = {key:0 for key in cust_zones if key in zone_ids.values}
            for key in cust_zones:
                if key in zone_ids.values:
                    orders_per_zone[key] = cust_zones_gb.get_group(key).shape[0] 

            # orders_per_zone = {k: v for k, v in sorted(orders_per_zone.items(), key=lambda item: item[1])}
            # sorted by zone id so that the list below follows ascending zone-id order
            # NOTE(review): zones in zone_ids without any delivered order are absent from this list,
            # so its length may be smaller than the number of zones — verify against lipa_dist.
            orders_per_zone = {k:v for k, v in sorted(orders_per_zone.items())}
            zone_prev_incomes = [v for k, v in orders_per_zone.items()] 
            print(zone_prev_incomes)
            # -----------------------------------------------------------------------
            lipa_df = lipa_dist(driver_locs, driver_prev_incomes, zone_prev_incomes)
            print(driver_prev_incomes)
            assign_df = copy.deepcopy(driver_idf[:NUM_DRIVERS]) 
            # original base zones
            assign_df = pd.merge(assign_df, base_zones_dict[city][['de_id', 'base_zone']], on='de_id')
            assign_df['bz_idx'] = assign_df['base_zone'].map(zone_id2idx)
            # base zones assigned by LIPA
            # NOTE(review): unlike the RoundRobin branch this assignment is index-aligned (no `.values`)
            assign_df['bz_idx_lipa'] = lipa_df['bz_idx_lipa']
            assign_df['new_bz_idx'] = assign_df['bz_idx_lipa']

        # keep only the assigned zone indices; the driver order is the row order of assign_df
        global_assignments[algo][day_num] = assign_df['new_bz_idx'].values

Note that for the "LIPA" baseline, we need to have the FoodMatch simulation corresponding to day 1 beforehand. Also, unlike other baselines, LIPA assignment for day 'd' depends on the simulation done using LIPA assignment of day 'd-1'.

In [None]:
# Preview of assign_df as left behind by the last (algo, day_num) iteration of the loop above.
print(assign_df.head())

Generating the ./de_intervals directory required for the last-mile delivery algorithm 'FoodMatch'

In [None]:
# Day for which the file-generation cells below read and write the de_intervals files.
# NOTE(review): assign_df still comes from the last iteration of the assignment loop, whose
# day_num/algo need not match the value set here — confirm before generating files.
day_num = 1           # {1, 2, 3, 4, 5, 6}
# algo = 'FairAssign'   # {'FairAssign', 'RoundRobin', 'Random', 'LIPA'}

In [None]:
# Progress marker: start of the file-generation stage.
print("File generation starts ...")

In [None]:
# Random generation of locations within a given zone

# generating random locations withing a zone (given the zone boundary):
def random_loc_generator(zone_bdry):
  """Draw one random point from the bounding box of a zone boundary.

  Parameters:
    zone_bdry: boundary in the format accepted by path_related_preprocessing,
      i.e. whitespace-separated 'lat,lng' pairs.

  Returns:
    [lat, lng] sampled uniformly and independently between the minimum and
    maximum latitude / longitude of the boundary points. The point lies in the
    bounding box and is therefore not guaranteed to be inside the zone itself.

  Raises:
    ValueError: if the boundary contains no points (min()/max() of an empty list).
  """
  lats, longs = path_related_preprocessing(zone_bdry)
  # latitude is drawn before longitude (same order of RNG calls as before)
  new_lat = random.uniform(min(lats), max(lats))
  new_lng = random.uniform(min(longs), max(longs))
  return [new_lat, new_lng]

# Checking if a given location lies inside a given zone:
def path_related_preprocessing(path_bdry):
  """Parse a zone boundary string into separate latitude and longitude lists.

  Parameters:
    path_bdry: whitespace-separated 'lat,lng' pairs, e.g.
      '12.954619258010608,77.6149292592163 12.954680993923494,77.61640664016727 ....'
      The value is converted with str() first.

  Returns:
    (lats, longs): two lists of floats, in the order the pairs appear.

  Raises:
    ValueError: if a token does not consist of exactly two comma-separated numbers.
  """
  lats, longs = [], []
  for loc in str(path_bdry).split():
    lat, lng = loc.split(',')
    lats.append(float(lat))
    longs.append(float(lng))
  return lats, longs

def loc_in_zone(loc, zone_bdry):
  """Tell whether a location falls inside the convex hull of a zone boundary.

  Parameters:
    loc: sequence whose first two entries are (lat, lng).
    zone_bdry: boundary string understood by path_related_preprocessing.

  Returns:
    Result of shapely's `within` test of the point against the convex hull of
    the boundary points (the hull is used, not the boundary path itself).
  """
  boundary_lats, boundary_lngs = path_related_preprocessing(zone_bdry)
  hull = geometry.MultiPoint(list(zip(boundary_lats, boundary_lngs))).convex_hull
  candidate = geometry.Point(loc[0], loc[1])
  return candidate.within(hull)

# code to generate 'm' locations that lie within a given zone:
def generate_locs(m, zone_bdry):
    """Generate `m` random locations that pass the loc_in_zone check for a zone.

    Candidates come from random_loc_generator and are kept only when
    loc_in_zone accepts them (rejection sampling); the loop runs until `m`
    candidates have been accepted.

    Returns:
        List of `m` accepted [lat, lng] locations (empty when m <= 0).
    """
    accepted = []
    while len(accepted) < m:
        candidate = random_loc_generator(zone_bdry)
        if loc_in_zone(candidate, zone_bdry):
            accepted.append(candidate)
    return accepted

In [None]:
# Node table of the city; the cells below read its 'node_id', 'lat' and 'lng' columns
# when looking up the node closest to a randomly generated location.
location_df = pd.read_csv(f"data/location_data/location_{city}.csv")
# location_df = location_df.rename(columns={0:'node_id', 1:'lat', 2:'lng'})
# location_df.head()

In [None]:
# get the files corresponding to driver_idf:
# one de_intervals file per selected driver for the current day_num
de_idf_ids = driver_idf['de_id'][:NUM_DRIVERS]

de_idf_files = [
    'data/de_data/'+city +'_de_data/'+ str(day_num) + '/de_intervals/' + str(int(d_id)) + '.csv'
    for d_id in de_idf_ids
]

# get the node_ids:
orig_node_ids = []
na = 0 # number of files in de_idf_files which are not present in de_intervals/
# de_idf_ids is already limited to NUM_DRIVERS, so the file list is not sliced a second time
for file in de_idf_files:
    try:
        file_df = pd.read_csv(file)
    except FileNotFoundError:
        # Only a missing file means "driver has no shifts on this day"; any other read error
        # (corrupt file, permission problem, ...) is allowed to surface instead of silently
        # dropping the driver.
        na += 1
        d_id = int(file.split('/')[-1][:-4])
        # drop the driver so that assign_df stays row-aligned with orig_node_ids
        to_drop_idx = assign_df[assign_df['de_id']==d_id].index
        assign_df = assign_df.drop(to_drop_idx)
        continue
    # get starting node ids corresponding to all shifts, it will be useful for random generation for unswappable drivers
    # read_csv consumes the first line (num_shifts) as header, so the frame holds 2 rows per shift;
    # even rows look like "start_time start_node_id"
    num_shifts = int(file_df.shape[0]/2)
    node_id = [int(file_df.iloc[x*2].values[0].split()[1]) for x in range(num_shifts)]
    orig_node_ids.append(node_id)

# one list of start nodes per remaining driver; 'fa_node_id' starts out as a copy and is
# overwritten later for drivers whose zone changes
assign_df['node_id'] = orig_node_ids
assign_df['fa_node_id'] = assign_df['node_id']

print(f"{na}/{len(de_idf_files)} intersection drivers not found in ../{day_num}/de_intervals/")

In [None]:
# final file generation:
# Copy every available de_intervals file of the selected drivers into two fresh output
# directories: 'de_int_old' (untouched reference copy) and 'de_intervals' (working copy
# that later cells overwrite for drivers whose zone changes).
intersection_files = [
    'data/de_data/'+city+'_de_data/'+ str(day_num)+'/de_intervals/'+str(int(d_id))+'.csv'
    for d_id in driver_idf['de_id'][:NUM_DRIVERS]
]

# print(wts)
# the run directory carries the extra "_{wts}" suffix only when wts != -1
run_suffix = f'{flag}' if wts==-1 else f'{flag}_{wts}'
run_dir = f'data/de_data/{city}_de_data/{day_num}/{city}_{NUM_DRIVERS}_{NUM_SIM}_{K_VAL}_{run_suffix}'
old_dir_path = f'{run_dir}/de_int_old'
new_dir_path = f'{run_dir}/de_intervals'

# start from empty directories so that files of a previous run cannot leak into this one
for dir_path in (old_dir_path, new_dir_path):
    if os.path.exists(dir_path):
        shutil.rmtree(dir_path)
    os.makedirs(dir_path)

for file in intersection_files:
    d_id = int(file.split('/')[-1][:-4])
    try:
        file_df = pd.read_csv(file)
    except FileNotFoundError:
        # only a missing source file is skipped; other read errors are not hidden
        print("file_df not found:", file)
        continue

    curr_file = f'{d_id}.csv'
    old_path = os.path.join(old_dir_path, curr_file)
    new_path = os.path.join(new_dir_path, curr_file)

    file_df.to_csv(old_path, index=False)
    file_df.to_csv(new_path, index=False)

Format (meta-data) of csv files in ./de_intervals is as follows:   
1. num_shifts -----> number of shifts    
// num_shifts * 2 lines follow; 2 consecutive lines contain information of a single shift 
2. start_time start_node_id ----->    shift-1
3. end_time end_node_id ----->        shift-1
4. start_time start_node_id ----->    shift-2
5. end_time end_node_id ----->        shift-2 
.  
.  
.   
 
Since the last-mile delivery algorithms don't use end_node_id, end_node_id is set to -1 everywhere.   


We only update the start_node_id for drivers with randomly generated start locations.

In [None]:
# SWAPPING LOGIC:
# The swapping logic is applicable only to the drivers for which the new_bz_idx is different from bz_idx
rel_df = assign_df[assign_df['bz_idx'] != assign_df['new_bz_idx']].reset_index()
print("Number of drivers to be swapped:", rel_df.shape[0])
rel_df['paired_de'+str(day_num)] = None
# The pairing is written to and later read from the plain 'paired_de' column. Create it up
# front (NaN = "not paired") so that it exists even when no driver can be swapped.
rel_df['paired_de'] = np.nan

# Column arrays for the nearest-node scan; avoids a slow per-row .iloc lookup inside the loop.
node_lats = location_df['lat'].values
node_lngs = location_df['lng'].values
node_ids = location_df['node_id'].values


def _closest_node_id(loc):
    """Return the node_id of the location_df row closest to `loc`; on ties the last row wins."""
    min_dist = 1e9
    n_id = -1
    for node_lat, node_lng, node_id in zip(node_lats, node_lngs, node_ids):
        curr_dist = euclidean_distance(loc, [node_lat, node_lng])
        if curr_dist <= min_dist:
            min_dist = curr_dist
            n_id = node_id
    return n_id


# first assign swappable nodes:
# per original zone: how many relocated drivers come from it, their first-shift start nodes
# and their ids; these are the "vacated" start positions that can be handed to other drivers
bz_nodes_dict = {zone_idx:{'freq':0, 'nodes':[], 'paired_de_id':[]} for zone_idx, _ in enumerate(zone_ids)}
for idx in range(rel_df.shape[0]):
    z_id = int(rel_df.iloc[idx]['bz_idx'])
    n_id = int(rel_df.iloc[idx]['node_id'][0])
    paired_did = int(rel_df.iloc[idx]['de_id'])
    bz_nodes_dict[z_id]['freq'] += 1
    bz_nodes_dict[z_id]['nodes'].append(n_id)
    bz_nodes_dict[z_id]['paired_de_id'].append(paired_did)
# untouched snapshot of the pool; the loop below consumes bz_nodes_dict
bz_nodes_dict_store = copy.deepcopy(bz_nodes_dict)

# IF a required zone_id in new_bz_idx is present in bz_nodes_dict then use that data point
# ELSE generate a random location in the zone corresponding to the zone_id
num_random = 0 # number of drivers for whom random generation of location was done to get the corresponding start node
for idx in range(rel_df.shape[0]):
    print(idx, end=' ')
    z_id = int(rel_df.iloc[idx]['new_bz_idx'])
    zone_pool = bz_nodes_dict[z_id]
    if zone_pool['freq'] > 0:
        # take over the oldest unused start position vacated in the target zone (FIFO)
        rel_df.loc[idx, 'fa_node_id'] = zone_pool['nodes'].pop(0)
        rel_df.loc[idx, 'paired_de'] = zone_pool['paired_de_id'].pop(0)
        zone_pool['freq'] -= 1
    else:
        # randomly generate a location in the zone assigned by FairAssign
        zone_df = zone_dfs_dict[city]
        # z_id is a positional zone index (not a zone_id value), hence iloc
        zone_bdry = zone_df.iloc[z_id]['path']
        new_loc = generate_locs(1, zone_bdry)[0]

        # based on new_loc, get the closest node_id from location_df
        n_id = _closest_node_id(new_loc)
        num_random += 1
        num_shifts = int(len(rel_df.iloc[idx]['node_id']))
        # the same start node is used for every shift of this driver: starting from the same
        # area in each shift is more plausible than a new random location per shift
        n_idz = [int(n_id)]*num_shifts
        # .at is required to store a list inside a single cell
        rel_df.at[idx, 'fa_node_id'] = n_idz

print()
print(f"{rel_df.shape[0]-num_random} data points out of {rel_df.shape[0]} could be swapped !")
print(f"{num_random} data points were randomly generated !")

In [None]:
# IGNORE: the triple-quoted string below is a bare string expression, so none of the code
# inside it is executed; it only preserves an alternative version of the swapping logic.
# -------
"""
# Difference from next cell: For each shift of randomly generated drivers, the nodes are generated separately.
# Unlike the the next cell, in which the same (randomly generated) node is used for all shifts.

# SWAPPING LOGIC:
# The swapping logic is applicable only to the drivers for which the new_bz_idx is different from bz_idx
rel_df = assign_df[assign_df['bz_idx'] != assign_df['new_bz_idx']].reset_index()
rel_df['paired_de'+str(day_num)] = None

# first assign swappable nodes:
bz_nodes_dict = {zone_idx:{'freq':0, 'nodes':[], 'paired_de_id':[]} for zone_idx, _ in enumerate(zone_ids)} # it'll contain the frequency of each base zone in rel_df['bz_idx'] as well as the corresponding node_ids in a list

for idx in range(rel_df.shape[0]):
    z_id = int(rel_df.iloc[idx]['bz_idx'])
    n_id = int(rel_df.iloc[idx]['node_id'][0])
    paired_did = int(rel_df.iloc[idx]['de_id']) # remove later?

    bz_nodes_dict[z_id]['freq'] += 1
    bz_nodes_dict[z_id]['nodes'].append(n_id)
    bz_nodes_dict[z_id]['paired_de_id'].append(paired_did) # remove later?

bz_nodes_dict_store = copy.deepcopy(bz_nodes_dict)

# IF a required zone_id in new_bz_idx is present in bz_nodes_dict then use that data point
# ELSE generate a random location in the zone corresponding to the zone_id
num_random = 0 # number of drivers for whom random generation of location was done to get the corresponding start node
for idx in range(rel_df.shape[0]):
    print(idx, end=' ')
    z_id = int(rel_df.iloc[idx]['new_bz_idx'])
    if(bz_nodes_dict[z_id]['freq'] > 0):
        # use an existing data point
        # rel_df.iloc[idx]['fa_node_id'] = bz_nodes_dict[z_id]['nodes'][0] # this gives a warning
        rel_df.loc[idx, 'fa_node_id'] = bz_nodes_dict[z_id]['nodes'][0]
        rel_df.loc[idx, 'paired_de'+str(day_num)] = bz_nodes_dict[z_id]['paired_de_id'][0]
        print(bz_nodes_dict[z_id]['paired_de_id'][0])

        bz_nodes_dict[z_id]['freq'] -= 1
        bz_nodes_dict[z_id]['nodes'].pop(0)
        bz_nodes_dict[z_id]['paired_de_id'].pop(0) # remove later?
    
    else:
        # rel_df.loc[idx, 'fa_node_id'] = None
        # num_random += 1
        # continue
        
        # randomly generate a location in the zone assigned by FairAssign
        zone_df = zone_dfs_dict[city]
        # zone_bdry = zone_df[zone_df['zone_id']==z_id]['path'].values[0] # Wrong ! bcz z_id is the index of zone_id
        zone_bdry = zone_df.iloc[z_id]['path']


        # for each shift create a new location in the assigned zone itself 
        # this is a strong assumption but only done for a fraction of drivers, 
        # the good thing is that it is not done for the same set of drivers on all days AND
        # it is still a weaker assumption that writing our own proof of concept dummny vanilla last mile delivery algorithm
        num_shifts = int(len(rel_df.iloc[idx]['node_id']))
        n_idz = []
        for shift in range(num_shifts):
            new_loc = generate_locs(1, zone_bdry)[0] 

            # based on new_loc, get the closest node_id from location_df
            min_dist = 1e9
            n_id = -1
            for i in range(location_df.shape[0]):
                node_loc = [ location_df.iloc[i]['lat'], location_df.iloc[i]['lng'] ]
                curr_dist = euclidean_distance(new_loc, node_loc)
            
                if(curr_dist <= min_dist):
                    min_dist = curr_dist 
                    n_id = location_df.iloc[i]['node_id']
            
            n_idz.append(int(n_id)) 
        print(n_idz)
        # rel_df.iloc[idx]['fa_node_id'] = n_idz
        # rel_df.loc[idx, 'fa_node_id'] = n_idz
        # for some reason both of the above 2 lines don't work
        rel_df.at[idx, 'fa_node_id'] = n_idz
    
        num_random += 1

print()
print(f"{rel_df.shape[0]-num_random} data points out of {rel_df.shape[0]} could be swapped !")
print(f"{num_random} data points were randomly generated !")
"""

In [None]:
# the de_interval profiles of swappable or paired drivers are to be swapped 
# and that of drivers for whom random locations are generated, the profile is to be modified
rel_drivers = rel_df['de_id']

# source: original de_intervals of this day; destination: the run-specific de_intervals copy
src_dir = 'data/de_data/'+city + '_de_data/'+ str(day_num) + '/de_intervals/'
if wts==-1:
    out_dir = f'data/de_data/{city}_de_data/{day_num}/{city}_{NUM_DRIVERS}_{NUM_SIM}_{K_VAL}_{flag}/de_intervals'
else:
    out_dir = f'data/de_data/{city}_de_data/{day_num}/{city}_{NUM_DRIVERS}_{NUM_SIM}_{K_VAL}_{flag}_{wts}/de_intervals'

rel_files = [src_dir + str(int(d_id)) + '.csv' for d_id in rel_drivers]

# 'paired_de' only exists if at least one driver was paired; without it nobody is swapped
has_pairing = 'paired_de' in rel_df.columns

for file in rel_files:
    d_id = int(file.split('/')[-1][:-4])
    driver_rows = rel_df[rel_df['de_id']==d_id]
    swap_node = driver_rows['paired_de'].values[0] if has_pairing else None

    # pd.isna treats both NaN and None as "not paired" (np.isnan raises on None/object values)
    if not pd.isna(swap_node):
        # for those drivers who could be swapped: take over the paired driver's whole profile
        swap_with_file = src_dir + str(int(swap_node)) + '.csv'
        file_df = pd.read_csv(swap_with_file)
    else:
        # for those drivers whose starting nodes for each shift were randomly generated:
        # keep the driver's own shift times and replace only the start node of each shift
        print(file)
        file_df = pd.read_csv(file)
        num_shifts = int(file_df.shape[0]/2)
        new_start_nodes = driver_rows['fa_node_id'].values[0]
        for i in range(num_shifts):
            new_node = new_start_nodes[i]
            start_time = file_df.iloc[i*2].values[0].split()[0] # start time of i-th shift
            file_df.iloc[i*2] = str(start_time)+ ' ' + str(new_node)

    new_path = f'{out_dir}/{d_id}.csv'
    file_df.to_csv(new_path, index=False)

In [None]:
# Progress marker: end of the file-generation stage.
print("File generation ends!")

In [None]:
# Deliberate stop: prevents "Run All" from continuing into the evaluation cells before the
# file generation has been repeated for every day and the simulations have been run.
# Literal braces are doubled so that the day set is printed as text instead of being
# evaluated as an f-string expression.
assert False, (
    f"File generation for day_num={day_num} has been done! "
    f"Now perform file generation for day_num={day_num+1} while day_num is in {{1, 2, 3, 4, 5, 6}}"
)

---

**3. Simulations**       
---
Follow the given steps to apply the last-mile delivery algorithm on top of the new assignments and get the simulation dumps:     
- For _FoodMatch_, refer to https://github.com/idea-iitd/FoodMatch     

- Before running the simulation, however, first copy the 'de_intervals' directories from "data/de_data/{city}\_de_data/{day_num}/{city}\_{NUM_DRIVERS}\_{NUM_SIM}\_{K_VAL}\_{flag}/" (or "data/de_data/{city}\_de_data/{day_num}/{city}\_{NUM_DRIVERS}\_{NUM_SIM}\_{K_VAL}\_{flag}\_{wts}/" when `wts` is not -1) to "FoodMatch/Swiggy/data/data\_{city}_anonymized/food_data/{day_num}" for day_num in [0, 1, 2, 3, 4, 5, 6]

- save the simulation output corresponding to each day 'day_num' using the following nomenclature:   
"sim_results_{algo}{day_num}" for each day_num; day_num \in {1, 2, 3, 4, 5, 6} 

To get the results of 'FoodMatch' or 'FairFoody' without the new assignments made by FairAssign, do not copy the directories as directed in the previous step; instead, simulate directly on the data that FoodMatch and FairFoody provide.   

For evaluation, remember to save the simulations as "sim_results_{algo}{day_num}". Take algo='foodmatch' for FoodMatch (only) simulations and algo='fairfoody' for FairFoody (only) simulations.

---

# **4. Evaluation**

In [None]:
# Choose algorithm 
# `algo` is interpolated into the simulation-result paths read by get_incomes_df / get_cost
# and used as the key into global_assignments by spatial_stability.
# NOTE(review): a later comment in this notebook lists lower-case names ('foodmatch', ...);
# the value must match the directory/file names actually used for the dumps — confirm.
algo = 'FairAssign' # {'FairAssign', 'RoundRobin', 'LIPA', 'FoodMatch', 'FairFoody'}

In [None]:
# METRICS:

def gini_index(incomes):
    """Gini index of a set of incomes.

    Computes sum_i sum_j |x_i - x_j| / (2 * n * sum_i x_i).

    Parameters:
        incomes: 1-D array-like of incomes (NumPy array, pandas Series or list).

    Returns:
        The Gini index as a float; NaN when the input is empty or the incomes
        sum to zero (the ratio is undefined in that case).
    """
    values = np.sort(np.asarray(incomes, dtype=float).ravel())
    num = values.size
    total = values.sum()
    if num == 0 or total == 0:
        return float('nan')
    # For ascending x_(0) <= ... <= x_(n-1):
    #   sum_i sum_j |x_i - x_j| = 2 * sum_k (2k - n + 1) * x_(k)
    # which replaces the O(n^2) double loop by a sort plus one weighted sum.
    weights = 2 * np.arange(num) - num + 1
    pairwise_gap = 2.0 * np.sum(weights * values)
    return pairwise_gap / (2 * num * total)


def avg_distance(zone_labels, driver_locs, zone_locs):
    """
    returns the 'cost' of the assignment: the mean, over all drivers, of
    sqrt(euclidean_distance(driver location, location of the assigned zone)).

    zone_labels: indices of the assigned zones, one per driver;
                 a zone_label 'z' has location zone_locs[z]
    driver_locs: driver locations, indexed like zone_labels
    zone_locs:   zone locations, indexed by zone label

    Raises ZeroDivisionError when zone_labels is empty.

    NOTE(review): the square root is applied on top of euclidean_distance, so that
    helper presumably returns a squared distance — confirm against its definition.
    """
    # The pairwise driver-driver matrix (L2Distance) that used to be computed here was
    # never read, so that O(n^2) computation has been removed.
    num = len(zone_labels)
    dist = 0.0
    for i in range(num):
        assigned_zone = zone_labels[i]
        driver_loc, zone_loc = driver_locs[i], zone_locs[int(assigned_zone)]
        dist += np.sqrt(euclidean_distance(driver_loc, zone_loc))
    return dist/num
    

# def spatial_inequality_index(incomes, driver_locs, ratings, combined, fair_distance):
def spatial_inequality_index(incomes, driver_dists, fair_distance):
    """Spatial inequality of incomes among nearby drivers.

    For every driver i, the absolute income gaps to the drivers j > i with
    0 < driver_dists[i][j] <= fair_distance are averaged; the per-driver
    averages are summed and divided by the total income.

    Parameters:
        incomes: 1-D array of incomes (must provide `.sum()` and integer indexing).
        driver_dists: pairwise distance matrix indexed as driver_dists[i][j].
        fair_distance: largest distance at which two drivers are compared.

    Returns:
        The spatial inequality index.
    """
    n_drivers = len(incomes)
    income_total = incomes.sum()
    accumulated = 0.0
    for a in range(n_drivers):
        gap_sum = 0.0
        # starts slightly above zero so that a driver without neighbours contributes 0/1e-9 = 0
        neighbour_count = 1e-9
        for b in range(a + 1, n_drivers):
            separation = driver_dists[a][b]
            if not (0 < separation <= fair_distance):
                continue
            neighbour_count += 1
            gap_sum += abs(incomes[a] - incomes[b])
        accumulated += gap_sum / neighbour_count

    return accumulated / income_total


# def income_gap(incomes, driver_locs, ratings, combined, fair_distance):
def income_gap(incomes, driver_dists, fair_distance):
    """
    Average income difference per unit of (scaled) distance over driver pairs.

    For every pair i < j with a strictly positive distance, the ratio
    |incomes[i] - incomes[j]| / (alpha * driver_dists[i][j]) is accumulated
    (alpha = 100); the result is the mean of these ratios.

    Parameters:
        incomes: 1-D sequence of incomes, indexable by position.
        driver_dists: pairwise distance matrix (NumPy array or nested lists).
        fair_distance: accepted for interface compatibility but not used by the
            computation — every pair with a positive distance is included.
            NOTE(review): the original description mentions "within fair_distance";
            confirm whether a distance filter was intended.

    Returns:
        The mean ratio; 0.0 when no pair has a positive distance.
    """
    alpha = 100
    # np.asarray guarantees element-wise scaling; multiplying a plain Python list by
    # alpha would repeat the list instead of scaling the distances.
    scaled_dists = np.asarray(driver_dists) * alpha
    num = len(incomes)
    terms = 0.0
    num_pair_drivers = 1e-7 # NOT 0 => to avoid division by 0
    for i in range(num-1):
        for j in range(i+1, num):
            if scaled_dists[i][j] > 0:
                num_pair_drivers += 1
                terms += (abs(incomes[i]-incomes[j])/scaled_dists[i][j])
    return terms/num_pair_drivers

After getting the FoodMatch simulation results for all 6 days

In [None]:
def get_incomes_df(algo):
    '''
    Get the incomes (number of delivered orders) of all drivers on all 6 days.

    input string: algo
        name used in the simulation dump paths, which are read from
        results/sim_results/sim_results_{city}/{NUM_DRIVERS}_{algo}/sim_results_{algo}{day}

    returns: DataFrame with one row per driver in driver_idf and the columns
        'de_id', 'day1' ... 'day6' (number of "DELIVER" rows of that driver on that day;
        missing when the driver has none) and 'num_orders' (row-wise sum of the day columns).
    '''
    day_cols = ['day1', 'day2', 'day3', 'day4', 'day5', 'day6']
    local_incomes_df = pd.DataFrame(columns=['de_id'] + day_cols)
    local_incomes_df['de_id'] = driver_idf['de_id']

    num_days = 6
    for day in range(1, num_days+1):
        sim_path = f'results/sim_results/sim_results_{city}/{NUM_DRIVERS}_{algo}/sim_results_{algo}{day}'
        # the dump is parsed as up to 8 unnamed comma-separated fields; malformed lines are skipped
        data = pd.read_csv(sim_path, names=["a", "b", "c", "d", "e", "f", "g", "h"], on_bad_lines='skip')
        data_deliver = data[data['a'] == "DELIVER"].drop(['a', 'e', 'f', 'g', 'h'], axis = 1)
        data_deliver.columns = ['order_id', 'delivered_time', 'vehicle_id']
        data_deliver_gb = data_deliver.groupby('vehicle_id')

        # Start from an empty mapping for EVERY day: otherwise a driver without deliveries
        # on this day would silently inherit the count of the previous day.
        day_incomes = {d_id: None for d_id in driver_idf['de_id']}
        for d_id in driver_idf['de_id']:
            try:
                day_incomes[d_id] = int(data_deliver_gb.get_group(d_id).shape[0])
            except KeyError:
                # d_id is not present in data_deliver => no delivery on this day
                continue
        local_incomes_df[f'day{day}'] = local_incomes_df['de_id'].map(day_incomes)

    local_incomes_df['num_orders'] = local_incomes_df[day_cols].sum(axis=1)

    return copy.deepcopy(local_incomes_df)

In [None]:
# Restrict driver_idf to the first NUM_DRIVERS rows for the evaluation cells below.
# This rebinds the name, so cells above that are re-run afterwards see the truncated frame.
# NOTE(review): with NUM_DRIVERS == -1 this slice drops the last row instead of keeping all
# drivers — confirm how "-1 => take all drivers" is meant to be handled.
driver_idf = driver_idf[:NUM_DRIVERS]

In [None]:
# algo can take values in {'foodmatch', 'fairfoody', 'fairassign', 'roundrobin', 'lipa'}
# NOTE(review): the algorithm-selection cell lists capitalised names; the value has to match
# the names used in the simulation-result paths — confirm.
# Per-driver daily order counts for `algo`, joined with the driver attributes of driver_idf
# (the 'lat' / 'lng' columns are read from the result further below).
incomes_df = pd.merge(get_incomes_df(algo), driver_idf, on='de_id')

Calculate metrics

In [None]:
# generating driver_dists
# pairwise driver-driver matrix produced by L2Distance from driver_locs
driver_dists = L2Distance(driver_locs)

# generating ratings
num_drivers = driver_dists.shape[0]
# NOTE(review): presumably synthetic (possibly random) ratings, one per driver — verify
# against generate_ratings and seed the RNG if reproducibility is required.
ratings = generate_ratings(num_drivers)

# generating combined 
ratings_matrix = abs_difference(ratings)
# NOTE(review): normalized_dists and normalized_ratings are computed but not used below;
# `_combined` mixes the UN-normalised matrices with weights w1, w2. Confirm whether the
# normalised versions were intended (and keep it consistent with the assignment stage).
normalized_dists = minmax(driver_dists, 0)[0]
normalized_ratings = minmax(ratings_matrix, 0)[0]
_combined = w1*driver_dists + w2*ratings_matrix 
combined = minmax(_combined, 0)[0]

In [None]:
# Inputs of the metric functions, taken column-wise from incomes_df (same row order for all three).
lats = incomes_df['lat']
longs = incomes_df['lng']
# "income" of a driver = total number of orders over all days
incomes = incomes_df['num_orders']

In [None]:
def metrics(lats, longs, incomes, ratings, combined, fair_dist, dists=None):
    """Compute (gini, spatial inequality index, income gap) for one fairness radius.

    Parameters:
        lats, longs: driver coordinates; kept for interface compatibility, not used
            by the current distance-matrix based metric functions.
        incomes: per-driver incomes (converted to a NumPy array).
        ratings, combined: kept for interface compatibility, not used here.
        fair_dist: distance threshold forwarded to the pairwise metrics.
        dists: pairwise driver "distance" matrix to evaluate on; when omitted the
            module-level `driver_dists` (the matrix selected via `flag`) is used.

    Returns:
        Tuple (gini, sp_idx, inc_gp).
    """
    if dists is None:
        dists = driver_dists
    incomes = np.array(incomes)
    gini = gini_index(incomes)
    # both functions take (incomes, distance matrix, threshold); calling them with the
    # older five-argument form raises a TypeError
    sp_idx = spatial_inequality_index(incomes, dists, fair_dist)
    inc_gp = income_gap(incomes, dists, fair_dist)
    return gini, sp_idx, inc_gp

In [None]:
# Pick the pairwise matrix used as "distance" between drivers, depending on `flag`:
# 0 keeps driver_dists as is, 1 uses the ratings matrix, 2 the combined matrix.
if flag==0:
    driver_dists = driver_dists
elif flag==1:
    driver_dists = ratings_matrix
elif flag==2:
    driver_dists = combined

def num_sim_drivers(fd):
    '''
    Average number of similar drivers per driver, i.e. the mean over all rows of the
    module-level driver_dists of the number of entries that are <= fd
    (every entry of the row is counted, including the driver's own entry).
    '''
    per_driver_counts = [
        driver_dists[row_idx][driver_dists[row_idx] <= fd].shape[0]
        for row_idx in range(len(driver_dists))
    ]
    return np.mean(per_driver_counts)

In [None]:
# calculating the metrics at different fair_dist values: 
# base fairness radius per flag; eval_results divides it by 1..10
if flag==0:
    fair_dist = driver_dists.mean()/8
elif flag==1:
    fair_dist = 1
elif flag==2:
    fair_dist = 0.04
    
def eval_results(lats, longs, incomes):
    """Evaluate all metrics for the radii fair_dist/1, fair_dist/2, ..., fair_dist/10.

    Uses the module-level fair_dist, ratings and combined.

    Returns:
        DataFrame with one row per radius and the columns
        'fair_dist', 'sim_drivers', 'gini', 'spatial_ineq', 'income_gap'.
    """
    rows = []
    for divisor in range(1, 11):
        radius = fair_dist/divisor
        gini, sp_idx, inc_gp = metrics(lats, longs, incomes, ratings, combined, radius)
        rows.append([radius, num_sim_drivers(radius), gini, sp_idx, inc_gp])
    return pd.DataFrame(
        rows,
        columns=['fair_dist', 'sim_drivers', 'gini', 'spatial_ineq', 'income_gap'],
    )

# FINAL RESULTS:
# metrics table for the currently selected `algo` (the variable name is kept for every algo)
fafm_results_df = eval_results(lats, longs, incomes)

In [None]:
# Show the metrics table: one row per evaluated fairness radius.
print("Final Results:")
print(fafm_results_df)

Calculating Avg. Distance (or Cost)

In [None]:
def distance_this_driver(locs, v_id):
    """Split the distance covered by one driver into a first-mile and a last-mile part.

    Parameters:
        locs: sequence of locations visited by the driver, in order (non-empty).
        v_id: driver id, looked up in the 'de_id' column of driver_idf.

    Returns:
        (first_mile_dist, last_mile_dist): euclidean_distance between the first entry of
        `locs` and the driver's (lat, lng) in driver_idf, and the sum of
        euclidean_distance over consecutive entries of `locs`.
    """
    # driver's initial location:
    start_loc = driver_idf[driver_idf['de_id']==v_id][['lat', 'lng']].values[0]
    first_mile_dist = euclidean_distance(locs[0], start_loc)

    last_mile_dist = 0
    for prev_loc, curr_loc in zip(locs[:-1], locs[1:]):
        last_mile_dist += euclidean_distance(prev_loc, curr_loc)
    return first_mile_dist, last_mile_dist

In [None]:
# number of simulated days over which the cost is averaged (passed to get_cost below)
num_days = 6

def get_cost(algo, num_days):
    '''
    Average first-mile and last-mile cost per active driver, averaged over num_days days.

    algo: str
        name used in the simulation dump file names 'sim_results_{algo}{day}'
    num_days: int
        days 1..num_days are evaluated

    returns: (first_mile_cost, last_mile_cost, cost) with cost = first + last mile cost

    NOTE(review): the dumps are read from the current working directory, whereas
    get_incomes_df reads them from results/sim_results/... — confirm the intended location.
    '''
    first_mile_cost = 0.0 # over all num_days days
    last_mile_cost = 0.0 # over all num_days days
    for day in range(1, num_days+1):
        total_first_mile = 0
        total_last_mile = 0
        orders_data = pd.read_csv(f"data/orders_data/orders_{day}.csv")
        # use the loop variable: with the global day_num every iteration read the same dump
        sim_path = f'sim_results_{algo}{day}'
        data = pd.read_csv(sim_path, names=["a", "b", "c", "d", "e", "f", "g", "h"], on_bad_lines='skip')
        data_deliver = data[data['a'] == "DELIVER"].drop(['a', 'e', 'f', 'g', 'h'], axis = 1)
        data_deliver.columns = ['order_id', 'delivered_time', 'vehicle_id']

        vehicle_ids = data_deliver['vehicle_id'].unique()

        # attach the customer location of every delivered order
        df = pd.merge(data_deliver, orders_data, on='order_id')
        c_locs = df['customer_lat_lng'].values
        df['cust_lat'] = [float(loc.split(',')[0]) for loc in c_locs]
        df['cust_lng'] = [float(loc.split(',')[1]) for loc in c_locs]

        df_gb = df.groupby('vehicle_id') # Don't group on 'de_id'
        for v_id in vehicle_ids:
            curr_group = df_gb.get_group(v_id)
            first_mile_dist, last_mile_dist = distance_this_driver(curr_group[['cust_lat', 'cust_lng']].values, v_id)
            total_first_mile += first_mile_dist
            total_last_mile += last_mile_dist
        # avg cost for this day; days without any delivery contribute 0
        if len(vehicle_ids)>0:
            first_mile_cost += (total_first_mile/len(vehicle_ids))
            last_mile_cost += (total_last_mile/len(vehicle_ids))
    # avg cost over all days:
    first_mile_cost = first_mile_cost/num_days
    last_mile_cost = last_mile_cost/num_days
    cost = first_mile_cost + last_mile_cost
    return first_mile_cost, last_mile_cost, cost

In [None]:
# first-mile cost: distance travelled to reach the assigned zone center (proxy for zone) from home
# last-mile cost: distance travelled to pick-up and deliver orders 
# total cost: sum of first-mile cost and last-mile cost
# NOTE(review): as implemented in get_cost / distance_this_driver, the first-mile term is the
# distance from the driver's location in driver_idf to the customer location of the first
# delivered order, and the last-mile term sums distances between consecutive customer
# locations — confirm that this matches the description above.

first_mile, last_mile, total = get_cost(algo, num_days)

print("Assignment cost:", first_mile)
print("Delivery cost:", last_mile)
print("Total Cost:", total) 

In [None]:
from typing import List, Dict

def ind_spatial_stability(assign : List):
    """
    Spatial stability score of one driver's sequence of assigned zones.

    Input:
        assign: 1 x num_days
        list of assigned zones
    Output:
        score = H * R, where H is the natural-log entropy of the zone frequency
        distribution in 'assign' and R is the number of day-to-day zone changes.
        The score is 0 when the same zone is assigned on every day.
    """
    ## Frequency distribution entropy
    _, counts = np.unique(assign, return_counts=True)
    probs = counts/counts.sum()
    entropy = (-1.0 * probs * np.log(probs)).sum() # log_e

    ## num_zone_changes
    switches = 0
    for prev_zone, curr_zone in zip(assign[:-1], assign[1:]):
        if curr_zone != prev_zone:
            switches += 1

    return entropy * switches


def spatial_stability(algo : str, all_assignments : Dict[str, Dict[int, List]]):
    """
    Mean per-driver spatial stability score of one algorithm.

    Input:
        algo: key into 'all_assignments', e.g. one of
              {'FairAssign', 'RoundRobin', 'Random', 'LIPA'}
        all_assignments: Dict({algo (str) : Dict({day_num (int) : assigned zones of all drivers})})
              the days must be keyed 1..num_days and every day must hold
              NUM_DRIVERS entries in the same driver order
    Output:
        Spatial stability metric value for 'algo': the mean of
        ind_spatial_stability over all NUM_DRIVERS drivers
    """
    assignments = all_assignments[algo] # {day_num : list of assignments}
    num_days = len(assignments)

    # row d-1 holds the assignment of all drivers on day d
    driver_assigns = np.zeros((num_days, NUM_DRIVERS))
    for day_num in range(1, num_days+1):
        driver_assigns[day_num-1] = assignments[day_num]

    driver_assigns = driver_assigns.T # shape: NUM_DRIVERS x num_days
    per_driver_scores = [
        ind_spatial_stability(driver_assigns[d_idx]) for d_idx in range(NUM_DRIVERS)
    ]
    return np.mean(per_driver_scores)

In [None]:
# Spatial stability of the selected `algo`; requires global_assignments[algo] to have been
# filled for every day by the assignment loop.
ss = spatial_stability(algo, global_assignments)
print("Spatial Stability:", ss)

---