In [1]:
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import json
import os
import math
import heapq
import seaborn as sns
import matplotlib.pyplot as plt
import csv

import random

In [2]:
cwd = os.getcwd()
print(cwd)

/Users/m1m1024/Desktop/cs330/cs330-casestudy


# Load Data

In [4]:
with open(f'{cwd}/adjacency 2.json') as adjacency_f:
    adjacency = json.load(adjacency_f)
    
with open(f'{cwd}/node_data.json') as node_f:
    nodes = json.load(node_f)

#nodes_df = pd.DataFrame(nodes).T

# Functions

In [5]:
def haversine_distance(lat1, lon1, lat2, lon2):
    '''
    Haversine distance metric between two points
    '''
    
    #earth radius in km
    radius = 6371.0

    #convert lat lon into radians from degress
    lat1 = math.radians(lat1)
    lon1 = math.radians(lon1)
    lat2 = math.radians(lat2)
    lon2 = math.radians(lon2)

    #haversine from wikipedia
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = radius * c

    return distance

In [6]:
def get_nearest_node(lat, lon):
    '''
    Find nearest node via Haversine distance
    TODO: Find more efficient nearest node method, how account for time if very far from nearest node?
    '''
    min_dist = float('inf')
    nearest_node = ''
    for node in nodes:
        dist = haversine_distance(nodes[node]['lat'], nodes[node]['lon'], lat, lon)
        if dist < min_dist:
            min_dist = dist
            nearest_node = node
    return nearest_node

In [7]:
def get_datetime(time_string):
    '''
    Convert any datetime strings to datetime
    '''
    datetime_format = "%m/%d/%Y %H:%M:%S"
    time = datetime.strptime(time_string, datetime_format)
    return time

In [12]:
def djikstras(source, dest, dt, day_type):
    '''
    Djikstra's algorithm
    Returns: dict of minimum distances to nodes and dict of node parents
    '''
    distance = {} #distances to each node
    parent = {} #
   # print("datetime ", dt)
    hour = dt.hour
#     print("source ", source)
#     print("dest ", dest)
 
    
    #initialize all distances to infinity
    for node in nodes:
        distance[node] = float('inf')
    distance[source] = 0
    pq = [(0, source)]
    while pq:
        node_dist, node = heapq.heappop(pq)
        if node == dest: #early stopping
            break
        if node in adjacency:
            for neighbor in adjacency[node]:
                for edge in adjacency[node][neighbor]:
                    if edge['hour'] == hour and edge['day_type'] == day_type:
                        edge_dist = edge['time']
                        if distance[neighbor] > node_dist + edge_dist:
                            #print("edge dist ", edge_dist)
                           # print("node dist ", node_dist)
                            distance[neighbor] = node_dist + edge_dist
                           # print(distance[neighbor])
                            parent[neighbor] = node
                            heapq.heappush(pq, (distance[neighbor], neighbor))
    return distance, parent

In [9]:
def get_path(parent, source_node, dest_node):
    '''
    Recover path using list of node parents, start at destination and end at source
    Returns: list of nodes in path (reverse order)
    '''
    path = [dest_node]
    current_node = dest_node
    while current_node != source_node:
        path.append(parent[current_node])
        current_node = parent[current_node]
    return path

In [26]:
# def get_driver(p, dt, day_type, start_time):
#     '''
#     Get the next driver from the driver PQ (Task 2 based on straight line distance)
#     Update relevant driver and passenger variables and get the time to traverse from driver location to passenger pickup
#     Returns: Driver match
#     '''
#     p.total_time = 0
#     d_match = None
#     min_dist = float('inf')
    
#     while driver_q:
#         d_time, idx, d = heapq.heappop(driver_q) #(datetime, id, driver object)
#         if d_time <= p.appear: #is available
#             dist = haversine_distance(d.source[0], d.source[1], p.source[0], p.source[1])
#             if dist < min_dist:
#                 min_dist = dist
#                 d_match = d
#         else:
#             #push back to heap since not time yet
#             heapq.heappush(driver_q, (d_time, d.idx, d))
#             break
    


def get_driver(p, dt, day_type, start_time):
    '''
    Get the next driver from the driver PQ (Task 3 based on estimated time)
    Update relevant driver and passenger variables and get the time to traverse from driver location to passenger pickup
    Returns: Driver match
    '''
    p.total_time = 0
    d_match_tuple = None #(driver available, id, driver object)
    d_match = None #driver
    min_dist = float('inf')
    availablesize = 0
    all_potential_candidates = []
    
    while driver_q:
        d_candidate = heapq.heappop(driver_q) #(datetime, id, driver object)
        d_time, idx, d = d_candidate
        if d_time <= start_time: #is available
            availablesize += 1
            if check_active(start_time, d): #driver active or been sitting for too long
                dist = haversine_distance(nodes[d.current_node]['lat'], nodes[d.current_node]['lon'], p.source[0], p.source[1])
                if dist < min_dist: #if better match
                    min_dist = dist
                    if d_match_tuple:
                        all_potential_candidates.append(d_match_tuple)
                        # heapq.heappush(driver_q, d_match_tuple) #put current match back into q
                    d_match = d
                    d_match_tuple = d_candidate
                else:
                    all_potential_candidates.append(d_candidate)
                    # heapq.heappush(driver_q, d_candidate) #not a better match, put back in q
            else:
                completed_drivers.append(d)
                driver_profits.append(d.get_profit())
        else:
            #push back to heap since not actually available yet
            heapq.heappush(driver_q, (d_time, d.idx, d))
            break


    for i in range(len(all_potential_candidates)):
           heapq.heappush(driver_q, all_potential_candidates[i])
    

    wait_time = 0
    if d_match is None and driver_q: #check if no match found (no drivers available), choose next available
        available_time, idx, d_match = heapq.heappop(driver_q) #(datetime, id, driver object) 
        time_diff = available_time - p.appear
        wait_time = time_diff.total_seconds()/3600 #get number of hours passenger has to wait for next driver to be available

    with open('driver_order2.txt', 'a') as file:
            file.write(f"Driver {d_match.idx}: matched with passenger {p.idx} and available driver # is {availablesize} \n")

        
    time, _ = djikstras(d_match.current_node, p.source_node, p.appear, day_type)
    pickup_time = time[p.source_node]
    p.total_time += wait_time
    d_match.current_node = p.source_node
    d_match.pickup_time += pickup_time
    p.total_time += pickup_time
    with open('pwaittime2.txt', 'a') as file:
        file.write(f"passenger {p.idx}: waiting {p.total_time} \n")
    return d_match, pickup_time + wait_time

In [11]:
# def check_active(current_time, d):
#     driver_leaves = 0.8
#     val = random.random()
#     if val < driver_leaves:
#         with open('driver_order.txt', 'a') as file:
#             file.write(f"Driver {d.idx}: still in queue \n")
#         return True #still active
#     else:
#         with open('driver_order.txt', 'a') as file:
#             file.write(f"Driver {d.idx}: exited \n")
#         return False #driver is inactive
def check_active(current_time, d):
    if d.ride_time != 0:
        max_active_time = timedelta(hours=8) #check if driver has been active for over 8 hours, reference passenger appear for current time
        time_active = current_time - d.appear
        
        
        probability = 0.0

        if time_active >= max_active_time:
            probability = 1.0

        # Calculate probability based on hours active
        else:
            hours_active = time_active.total_seconds() / 3600  # Convert to hours
            probability = hours_active / 8


        if random.random() < probability:
            with open('driver_order2.txt', 'a') as file:
                file.write(f"Driver {d.idx}: exited the queue \n")
            return False
        else:
            with open('driver_order2.txt', 'a') as file:
                file.write(f"Driver {d.idx}: still in queue \n")
            return True
    else:
        return True

In [13]:
def ride():
    '''
    Main function for going through passenger and driver queues
    '''

    global passenger_index
    global day_type
    global pass_time
    global current_time
    
    while passenger_index < len(passenger_q) and driver_q:
        _, _, p = passenger_q[passenger_index]  #(datetime, id, passenger object)
        
        d, pickup_time = get_driver(p, p.appear, day_type, pass_time)
        current_time = p.appear + timedelta(hours=pickup_time) #account for time to pickup
        pass_time = p.appear + timedelta(hours=pickup_time) #account for time to pickup
        
        time, parent = djikstras(p.source_node, p.dest_node, current_time, day_type)
        ride_time = time[p.dest_node]
        p.total_time += ride_time
        completed_passengers.append(p)
        passenger_times.append(p.total_time)
        d.source_node = p.dest_node
        d.source = p.dest
        d.ride_time += ride_time
        
        if math.isfinite(p.total_time):
            current_time = timedelta(hours=ride_time) + current_time
        passenger_index += 1
        if check_active(current_time, d):
            heapq.heappush(driver_q, (current_time, d.idx, d))       
        else:
            completed_drivers.append(d)
            driver_profits.append(d.get_profit())
    
    while driver_q:
        _, _, d = heapq.heappop(driver_q)
        completed_drivers.append(d)
        driver_profits.append(d.get_profit())
        
    return completed_passengers, completed_drivers, passenger_times, driver_profits

# Classes (Driver, Passenger)

In [None]:
class driver:
    def __init__(self, index, appear, source_lat, source_lon):
        self.idx = index
        self.appear = get_datetime(appear)
        self.source = (source_lat, source_lon)
        self.ride_time = 0 #total time logged driving passengers
        self.pickup_time = 0 #total time logged driving to passengers
        self.source_node = get_nearest_node(self.source[0], self.source[1])
        self.current_node = self.source_node #update as performing routes

    def get_profit(self):
        self.profit = self.ride_time - self.pickup_time
        return self.profit

In [None]:
class passenger:
    def __init__(self, index, appear, source_lat, source_lon, dest_lat, dest_lon):
        self.idx = index
        self.appear = get_datetime(appear)
        self.source = (source_lat, source_lon)
        self.dest = (dest_lat, dest_lon)
        self.total_time = 0 #total time logged for passenger (time appear to time drop off)
        self.source_node = get_nearest_node(self.source[0], self.source[1])
        self.dest_node = get_nearest_node(self.dest[0], self.dest[1])

# MUST DELETE THIS, ONLY HERE NOW FOR TESTING PURPOSES TO RUN QUICKER

In [27]:
class Node:
    def __init__(self, point, left=None, right=None):
        self.point = point
        self.left = left
        self.right = right

class KDTree:
    def __init__(self, points):
        self.root = self._build_kdtree(points)

    def _build_kdtree(self, points, depth=0):
        if not points:
            return None

        axis = depth % 2  # 2 dimensions (alternating between latitude and longitude)

        sorted_points = sorted(points, key=lambda point: point[axis])
        median = len(sorted_points) // 2

        return Node(
            sorted_points[median],
            self._build_kdtree(sorted_points[:median], depth + 1),
            self._build_kdtree(sorted_points[median + 1 :], depth + 1),
        )

    def find_nearest(self, target):
        best = [None, float("inf")]

        def search(node, depth=0):
            if node is None:
                return

            axis = depth % 2
            current_point = node.point
            distance = haversine_distance(current_point[0], current_point[1], target[0], target[1])

            if distance < best[1]:
                best[0] = current_point
                best[1] = distance

            if target[axis] < current_point[axis]:
                search(node.left, depth + 1)
            else:
                search(node.right, depth + 1)

            other_side = abs(target[axis] - current_point[axis])
            if other_side < best[1]:
                if target[axis] < current_point[axis]:
                    search(node.right, depth + 1)
                else:
                    search(node.left, depth + 1)

        search(self.root)
        return best[0]

coordinates = [(node_info['lat'], node_info['lon']) for node_info in nodes.values()]
coordinate_to_key = {coord: key for key, coord in zip(nodes.keys(), coordinates)}
kdtree = KDTree(coordinates)


In [28]:
class passenger:
    def __init__(self, index, appear, source_lat, source_lon, dest_lat, dest_lon):
        self.idx = index
        self.appear = get_datetime(appear)
        self.source = (source_lat, source_lon)
        self.dest = (dest_lat, dest_lon)
        self.total_time = 0 #total time logged for passenger (time appear to time drop off)
        self.source_node = coordinate_to_key.get(kdtree.find_nearest((self.source[0], self.source[1])))
        self.dest_node = coordinate_to_key.get(kdtree.find_nearest((self.dest[0], self.dest[1])))

class driver:
    def __init__(self, index, appear, source_lat, source_lon):
        self.idx = index
        self.appear = get_datetime(appear)
        self.source = (source_lat, source_lon)
        self.ride_time = 0 #total time logged driving passengers
        self.pickup_time = 0 #total time logged driving to passengers
        self.source_node = coordinate_to_key.get(kdtree.find_nearest((self.source[0], self.source[1])))
        self.current_node = self.source_node #update as performing routes

    def get_profit(self):
        self.profit = self.ride_time - self.pickup_time
        return self.profit

In [16]:
passenger_q = []

with open(f'{cwd}/passengers.csv', 'r') as csvfile:
    reader_variable = csv.reader(csvfile, delimiter=",")
    first_line = csvfile.readline()
    for index, row in enumerate(reader_variable):
        p = passenger(index, row[0], float(row[1]), float(row[2]), float(row[3]), float(row[4]))
        passenger_q.append((p.appear, p.idx, p))

In [29]:
driver_q = []

with open(f'{cwd}/drivers.csv', 'r') as csvfile:
    reader_variable = csv.reader(csvfile, delimiter=",")
    first_line = csvfile.readline()
    for index, row in enumerate(reader_variable):
        d = driver(index, row[0], float(row[1]), float(row[2]))
        driver_q.append((d.appear, d.idx, d))
        
heapq.heapify(driver_q)

# Run Task 2

In [None]:
passenger_q = []

with open(f'{cwd}/passengers.csv', 'r') as csvfile:
    reader_variable = csv.reader(csvfile, delimiter=",")
    first_line = csvfile.readline()
    for index, row in enumerate(reader_variable):
        if index > 100:
            break
        p = passenger(index, row[0], float(row[1]), float(row[2]), float(row[3]), float(row[4]))
        passenger_q.append((p.appear, p.idx, p))

In [None]:
driver_q = []

with open(f'{cwd}/drivers.csv', 'r') as csvfile:
    reader_variable = csv.reader(csvfile, delimiter=",")
    first_line = csvfile.readline()
    for index, row in enumerate(reader_variable):
        if index > 10:
            break
        d = driver(index, row[0], float(row[1]), float(row[2]))
        driver_q.append((d.appear, d.idx, d))
        
heapq.heapify(driver_q)

In [30]:
completed_passengers = []
completed_drivers = []
passenger_times = []
driver_profits = []
day_type = 'weekday'
start_time = passenger_q[0][0]
current_time = start_time
pass_time = start_time
passenger_index = 0  # Track the passenger queue index

completed_passengers, completed_drivers, passenger_times, driver_profits = ride()

IndexError: index out of range

In [33]:
print('mean passenger times for t2: ',sum(passenger_times)/len(passenger_times))
print('mean driver profit for t2',sum(driver_profits)/len(driver_profits))
print('number of passengers worked t2', len(passenger_times))
print('sanity check for number of drivers t2',len(driver_profits))
print('stdev of passenger times t2',np.std(passenger_times))
print('stdev of driver profits t2',np.std(driver_profits))

mean passenger times for t2:  1.3819932496227862
mean driver profit for t2 0.3092945207336275
number of passengers worked t2 4719
sanity check for number of drivers t2 499
stdev of passenger times t2 0.8671914079656321
stdev of driver profits t2 0.7030898149613322


In [None]:
file_path = 'passenger_times2.txt'

# Open the file in write mode
with open(file_path, 'w') as file:
    # Write each value in passenger_times list to the file
    for time in passenger_times:
        file.write(str(time) + '\n')

In [None]:
file_path = 'driver_profits2.txt'

# Open the file in write mode
with open(file_path, 'w') as file:
    # Write each value in passenger_times list to the file
    for profit in driver_profits:
        file.write(str(profit) + '\n')

# Djikstra's Test
Single passenger and single driver

In [None]:
nodes_df = pd.DataFrame(nodes).T
nodes_df.head()

In [None]:
#test passenger
with open(f'{cwd}/passengers.csv', 'r') as csvfile:
    reader_variable = csv.reader(csvfile, delimiter=",")
    first_line = csvfile.readline()
    for index, row in enumerate(reader_variable):
        if index > 0:
            break
        p1 = passenger(index, row[0], float(row[1]), float(row[2]), float(row[3]), float(row[4]))

print(vars(p1))

In [None]:
#test driver
with open(f'{cwd}/drivers.csv', 'r') as csvfile:
    reader_variable = csv.reader(csvfile, delimiter=",")
    first_line = csvfile.readline()
    for index, row in enumerate(reader_variable):
        if index > 0:
            break
        d1 = driver(index, row[0], float(row[1]), float(row[2]))

print(vars(d1))

In [None]:
print('wait time (hours)', (d1.appear-p1.appear).total_seconds()/3600)
print(d1.appear)
pickup_times, pickup_parent = djikstras(d1.source_node, p1.source_node, p1.appear, 'weekday', p1.appear)
pickup_time = pickup_times[p1.source_node]
print('pickup time (hours):', pickup_time) #get time from source to destination
pickup_path = get_path(pickup_parent, d1.source_node, p1.source_node) #retrieve path from parent dict
pickup_path_coords = nodes_df[nodes_df.index.isin(pickup_path)] #retrieve path coordinates (to verify graphically)
print('pickup path trajectory:')
print(pickup_path_coords.head(5))
current_time = p1.appear + timedelta(hours=pickup_time)
d1.pickup_time += pickup_time

time, parent = djikstras(p1.source_node, p1.dest_node, current_time, 'weekday', current_time)
print('route time (hours):', time[p1.dest_node]) #get time from source to destination
path = get_path(parent, p1.source_node, p1.dest_node) #retrieve path from parent dict
path_coords = nodes_df[nodes_df.index.isin(path)] #retrieve path coordinates (to verify graphically)
print('path trajectory:')
d1.ride_time = time[p1.dest_node]
print(path_coords.head(5))

In [None]:
d1.get_profit()

In [None]:
#plot nodes and path coords
plt.figure(figsize=(20, 20))
sns.scatterplot(nodes_df, x='lon', y='lat', size=1, alpha=0.6, linewidth=0, label='node', color='grey')
sns.lineplot(path_coords, x='lon', y='lat', color='red', alpha=1, label='ride path', lw=3)
sns.lineplot(pickup_path_coords, x='lon', y='lat', color='blue', alpha=1, label='pickup path', lw=3)
#plt.plot([p1.source[1]], [p1.source[0]], marker='*', color='orange', markersize=20, label='p source')
#plt.plot([d1.source[1]], [d1.source[0]], marker='^', color='orange', markersize=20, label='d source')
# plt.plot([p1.dest[1]], [p1.dest[0]], marker='*', color='blue', markersize=20, label='p dest')
plt.legend()