# ALMRRC2021 Data Exploration

## Imports

In [3]:
import os
import json
import time
import random


import pandas as pd
import numpy as np
import plotly.graph_objects as go
import utm
from dotenv import load_dotenv

load_dotenv()

# local
from almrcc_score import score



In [4]:
LOCAL = True

if LOCAL:
    # only unix
    PATH = !pwd
    PATH = PATH[0]

    path='../data/almrrc2021-data-training/model_build_inputs_10_percent'
    def get_file(file_name,  ) -> str:
        with open(os.path.join(path, file_name), "r") as f:
            return f.read()

else:
    from google.cloud import storage

    client = storage.Client()
    bucket = client.get_bucket('research-data-staging')
    

    path = 'almrrc2021-data-training/model_build_inputs_10_percent'
    def get_file(file_name,) -> str:
        return bucket.blob(os.path.join(path, file_name)).download_as_string().decode('utf-8')




## Read in the JSON

In [5]:
route_data = json.loads(
    get_file(file_name="route_data.json")
)
time_data = json.loads(
    get_file(file_name="travel_times.json")
)
package_data = json.loads(
    get_file(file_name="package_data.json")
)
actual_sequence = json.loads(
    get_file(file_name="actual_sequences.json")
)


### Find a Route ID Present in All Data

In [28]:
for k in list(route_data.keys())[random.randint(0, len(route_data.keys())-1):]:
    if all(k in _d.keys() for _d in [route_data, time_data, package_data, actual_sequence]):
        print(k)
        GLOBAL_ROUTE_ID = k
        break

RouteID_91b46004-2e9f-4df1-a3f8-ab49128126f1


## Convert the JSON data to a pandas dataframe

In [10]:
route_df = pd.DataFrame.from_records(
    (
        {
            "stop_id": k,
            "route_id": route_id,
            **v,
        }
        for route_id in route_data.keys()
        for k, v in route_data[route_id]["stops"].items()
    )
)


In [11]:
travel_times = pd.DataFrame.from_records(
    {"stop_id": k, "time": v, "route_id": route_id}
    for route_id, time_data in time_data.items()
    for k, v in time_data.items()
)


## Create the Travel Time Matrices for Each Route

In [12]:
def _add_row(x, square_df) -> None:
    square_df.loc[x["stop_id"], list(x["time"].keys())] = list(x["time"].values())


travel_times["time_matrix"] = ""
df_map = {}
lat_long_map = {}
route_df_grouper = route_df.groupby("route_id")
# actual_sequence_grouper = actual_sequence.groupby("route_id")

for r_id, _df in travel_times.groupby("route_id"):
    # get the actual sequence for this route
    _actual = actual_sequence.get(r_id)
    # create a square matrix of all the stop ids
    _square_df = pd.DataFrame(index=_df.stop_id.unique(), columns=_df.stop_id.unique())
    # convert the json travel time format to  a matrix
    _df.apply(_add_row, axis=1, square_df=_square_df)
    # sort the matrix by moving the depot to the top
    df_map[r_id] = _square_df.sort_index(key=lambda x: x.map(_actual['actual'])).sort_index(axis=1, key=lambda x: x.map(_actual['actual']))
    # create a map to get the lat long for each stop
    lat_long_map[r_id] = (
        route_df_grouper.get_group(r_id)[["stop_id", "lat", "lng"]]
        .set_index("stop_id")
        .to_dict("index")
    )

# drop all but the first row of the matrix 
travel_times = travel_times.groupby("route_id").first()
travel_times.drop(["stop_id", "time"], axis=1, inplace=True)


In [13]:
travel_times["time_matrix"] = travel_times.index.map(df_map)
travel_times["lat_lon_map"] = travel_times.index.map(lat_long_map)


## Find Optimal Path using Amazon Travel Times

Using Google's OR Tools


In [29]:
"""Simple Travelling Salesperson Problem (TSP) between cities."""

from ortools.constraint_solver import routing_enums_pb2
from ortools.constraint_solver import pywrapcp


def create_data_model(tt_matrix):
    """Stores the data for the problem."""
    return {
        "distance_matrix": tt_matrix,
        "num_vehicles": 1,
        "depot": 0,
    }


def get_sequence(manager, routing, solution) -> list:
    """Prints solution on console."""
    index = routing.Start(0)
    sequence = []
    while not routing.IsEnd(index):
        sequence.append(manager.IndexToNode(index))
        index = solution.Value(routing.NextVar(index))
    sequence.append(manager.IndexToNode(index))
    return sequence


def print_solution(manager, routing, solution):
    """Prints solution on console."""
    print(f"Objective: {solution.ObjectiveValue()} miles")
    index = routing.Start(0)
    plan_output = "Route for vehicle 0:\n"
    route_distance = 0
    while not routing.IsEnd(index):
        plan_output += f" {manager.IndexToNode(index)} ->"
        previous_index = index
        index = solution.Value(routing.NextVar(index))
        route_distance += routing.GetArcCostForVehicle(previous_index, index, 0)
    plan_output += f" {manager.IndexToNode(index)}\n"
    print(plan_output)
    plan_output += f"Route distance: {route_distance}miles\n"


def or_route(tt_matrix):
    """Entry point of the program."""
    # Instantiate the data problem.
    data = create_data_model(tt_matrix)

    # Create the routing index manager.
    manager = pywrapcp.RoutingIndexManager(
        len(data["distance_matrix"]), data["num_vehicles"], data["depot"]
    )

    # Create Routing Model.
    routing = pywrapcp.RoutingModel(manager)

    def distance_callback(from_index, to_index):
        """Returns the distance between the two nodes."""
        # Convert from routing variable Index to distance matrix NodeIndex.
        from_node = manager.IndexToNode(from_index)
        to_node = manager.IndexToNode(to_index)
        return data["distance_matrix"][from_node][to_node]

    transit_callback_index = routing.RegisterTransitCallback(distance_callback)

    # Define cost of each arc.
    routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)

    # Setting first solution heuristic.
    search_parameters = pywrapcp.DefaultRoutingSearchParameters()
    search_parameters.first_solution_strategy = (
        routing_enums_pb2.FirstSolutionStrategy.PATH_CHEAPEST_ARC
    )

    # Solve the problem.
    solution = routing.SolveWithParameters(search_parameters)

    # Print solution on console.
    if solution:
        print_solution(manager, routing, solution)
    
    return get_sequence(manager, routing, solution)


sol = or_route(travel_times.loc[
    GLOBAL_ROUTE_ID, "time_matrix"
].values)


Objective: 9145 miles
Route for vehicle 0:
 0 -> 36 -> 35 -> 34 -> 33 -> 32 -> 31 -> 30 -> 29 -> 28 -> 27 -> 26 -> 49 -> 50 -> 48 -> 60 -> 61 -> 66 -> 65 -> 62 -> 64 -> 63 -> 17 -> 20 -> 19 -> 16 -> 15 -> 14 -> 13 -> 53 -> 54 -> 52 -> 51 -> 47 -> 56 -> 55 -> 57 -> 58 -> 67 -> 78 -> 79 -> 81 -> 82 -> 83 -> 89 -> 92 -> 93 -> 94 -> 96 -> 95 -> 90 -> 88 -> 69 -> 72 -> 71 -> 84 -> 85 -> 12 -> 73 -> 74 -> 75 -> 76 -> 70 -> 77 -> 80 -> 68 -> 87 -> 86 -> 91 -> 127 -> 128 -> 126 -> 121 -> 123 -> 122 -> 124 -> 125 -> 120 -> 119 -> 118 -> 117 -> 114 -> 113 -> 108 -> 109 -> 110 -> 111 -> 112 -> 107 -> 105 -> 100 -> 103 -> 102 -> 101 -> 116 -> 98 -> 115 -> 99 -> 104 -> 106 -> 97 -> 9 -> 10 -> 8 -> 3 -> 1 -> 2 -> 4 -> 5 -> 6 -> 7 -> 11 -> 21 -> 22 -> 23 -> 24 -> 25 -> 18 -> 59 -> 39 -> 37 -> 38 -> 40 -> 46 -> 45 -> 44 -> 43 -> 42 -> 41 -> 0



## Pull OpenRouteService Travel Times

In [30]:
import openrouteservice as ors
ors_client = ors.Client(key=os.environ["ORS_KEY"])

def get_ors_travel_time(locations):
    return ors_client.distance_matrix(locations, profile='driving-hgv', metrics=['duration'])

In [31]:
mapper = travel_times.loc[
    GLOBAL_ROUTE_ID, 'lat_lon_map'
]


ordered_index = travel_times.loc[
        GLOBAL_ROUTE_ID, "time_matrix"
    ].index

ors_tt_df = travel_times.loc[
        GLOBAL_ROUTE_ID, "time_matrix"
    ].copy()


location_pairs = ordered_index.map(lambda x: (mapper[x]['lng'], mapper[x]['lat'])).values.tolist()


# chunk the location pairs into groups of 50, as the maximum allowed is 3500 in a pull, 
chunk_size = 50

chunked_arrays = np.array_split(location_pairs, len(location_pairs) // chunk_size + 1)
# preserve the indexes as well
chunked_indices = np.array_split(ordered_index, len(ordered_index) // chunk_size + 1)

for chunk_location, chunk_ind in zip(chunked_arrays, chunked_indices):
    assert len(chunk_location) == len(chunk_ind)
    res = get_ors_travel_time(chunk_location.tolist())
    ors_tt_df.loc[chunk_ind, chunk_ind] = res['durations']
    time.sleep(random.random() * 0.5)

# location_combinations = list(itertools.product(location_pairs, repeat=2))

In [32]:
ors_sol = or_route(ors_tt_df.values)

Objective: 7739 miles
Route for vehicle 0:
 0 -> 59 -> 40 -> 45 -> 46 -> 44 -> 43 -> 42 -> 41 -> 38 -> 37 -> 39 -> 36 -> 35 -> 34 -> 33 -> 32 -> 31 -> 30 -> 29 -> 28 -> 27 -> 26 -> 25 -> 24 -> 23 -> 22 -> 21 -> 20 -> 19 -> 18 -> 17 -> 16 -> 15 -> 13 -> 14 -> 61 -> 60 -> 49 -> 48 -> 50 -> 47 -> 51 -> 52 -> 53 -> 54 -> 56 -> 55 -> 57 -> 58 -> 66 -> 65 -> 64 -> 63 -> 62 -> 67 -> 80 -> 78 -> 79 -> 81 -> 68 -> 82 -> 83 -> 88 -> 89 -> 90 -> 92 -> 93 -> 94 -> 95 -> 96 -> 91 -> 128 -> 127 -> 86 -> 87 -> 69 -> 70 -> 77 -> 76 -> 75 -> 74 -> 73 -> 72 -> 71 -> 84 -> 85 -> 12 -> 8 -> 2 -> 3 -> 7 -> 6 -> 11 -> 5 -> 4 -> 1 -> 9 -> 10 -> 113 -> 114 -> 112 -> 111 -> 110 -> 109 -> 108 -> 107 -> 97 -> 106 -> 105 -> 104 -> 103 -> 101 -> 102 -> 100 -> 99 -> 116 -> 98 -> 115 -> 117 -> 119 -> 123 -> 122 -> 125 -> 124 -> 121 -> 120 -> 118 -> 126 -> 0



### Map the Sequence Idx to the Stop Ids

In [33]:
seq_names = travel_times.loc[
    GLOBAL_ROUTE_ID, "time_matrix"
].index

seq_ors = ors_tt_df.index

# create a simple df of the actual sequence
sequence_df = pd.DataFrame(index=actual_sequence[GLOBAL_ROUTE_ID]['actual'].values(), data=actual_sequence[GLOBAL_ROUTE_ID]['actual'].keys(), columns=['actual'])
sequence_df.sort_index(inplace=True)

# write the ortools solution to the dataframe.
sequence_df['ortools'] = pd.DataFrame(index=range(len(sol)), data=[seq_names[i] for i in sol])
sequence_df['ors_ortools'] = pd.DataFrame(index=range(len(ors_sol)), data=[seq_ors[i] for i in ors_sol])

### Plot the Sequences

This is not a meaningful way to plot...

In [34]:
_df = _df.sort_index()

fig = go.Figure(data=[go.Scatter(x=sequence_df.index, y=sequence_df.actual, mode='lines', name="actual")])
fig.add_trace(
    go.Scatter(x=sequence_df.index, y=sequence_df.ortools, mode='lines', name='ortools')
)
fig.add_trace(
    go.Scatter(x=sequence_df.index, y=sequence_df.ors_ortools, mode='lines', name='ors_ortools')
)

fig.show()

### Score the OR Tools solution, accoring to Amazon's algorithm

In [35]:
score(
    sequence_df.ortools.values.tolist(),
    sequence_df.actual.values.tolist(),
    time_data[GLOBAL_ROUTE_ID]
)

0.26906561140111984

### Score the OR Tools solution w/ ORS TT Matrix, accoring to Amazon's algorithm

In [36]:
score(
    sequence_df.ors_ortools.values.tolist(),
    sequence_df.actual.values.tolist(),
    time_data[GLOBAL_ROUTE_ID]
)

0.0734948762155341

In [37]:
mapper = travel_times.loc[
    GLOBAL_ROUTE_ID, 'lat_lon_map'
]

for sol_type in ['ortools', 'actual', 'ors_ortools']:
    sequence_df[f'{sol_type}_lat'] = sequence_df[sol_type].map(lambda x: mapper[x]['lat'])
    sequence_df[f'{sol_type}_lon'] = sequence_df[sol_type].map(lambda x: mapper[x]['lng'])

    # get UTM x/y 
    x, y, _, _ = utm.from_latlon(sequence_df[f'{sol_type}_lat'].values, sequence_df[f'{sol_type}_lon'].values)
    sequence_df[f'{sol_type}_utm_x'] = x
    sequence_df[f'{sol_type}_utm_y'] = y



In [38]:
import numpy as np

# color_bar = go.ColorBar(
#     title="Order",
    
# )

fig = go.Figure(
    data=[go.Scattermapbox(
        lat=sequence_df["actual_lat"],
        lon=sequence_df["actual_lon"],
        mode="lines+markers",
        name="actual",
        line=dict(width=4),
        marker=go.scattermapbox.Marker(
            # size=new_df['cum_time'] * 10,
            # size=np.log(sequence_df.index.values + 1) * 10,
            size=sequence_df.index.values / 10,
            # colorbar=color_bar,
        ),
        # text=new_df["cum_time"],
        hoverinfo="text",
        text=sequence_df.index.values,
    ), 
    go.Scattermapbox(
        lat=sequence_df["ortools_lat"],
        lon=sequence_df["ortools_lon"],
        mode="lines+markers",
        name="ortools",
        line=dict(width=4),
        marker=go.scattermapbox.Marker(
            # size=new_df['cum_time'] * 10,
            # size=np.log(sequence_df.index.values + 1) * 10,
            size=sequence_df.index.values / 10
            # colorbar=color_bar,
        ),
        hoverinfo="text",
        text=sequence_df.index.values,
    ),
    go.Scattermapbox(
        lat=sequence_df["ors_ortools_lat"],
        lon=sequence_df["ors_ortools_lon"],
        mode="lines+markers",
        name="ors_otools",
        line=dict(width=4),
        marker=go.scattermapbox.Marker(
            # size=new_df['cum_time'] * 10,
            # size=np.log(sequence_df.index.values + 1) * 10,
            size=sequence_df.index.values / 10
            # colorbar=color_bar,
        ),
        hoverinfo="text",
        text=sequence_df.index.values,
    )
    ]
)

fig.update_layout(
    # autosize=True,
    height=600,
    width=1000,
    hovermode="closest",
    mapbox=go.layout.Mapbox(
        accesstoken=os.environ["MAPBOX_KEY"],
        # style="mapbox://styles/max-schrader/ck8t1cmmc02wk1it9rv28iyte",
        style="mapbox://styles/max-schrader/cl6lhvrfw001516pkh3s6iv7l",
        bearing=0,
        center=go.layout.mapbox.Center(
            lat=sequence_df["ortools_lat"].mean(), lon=sequence_df["ortools_lon"].mean()
        ),
        pitch=0,
        zoom=12,
        
    ),
    # margin=go.layout.Margin(l=0, r=0, t=0, b=0),
)
fig.show()


In [39]:
fig = go.Figure(data=[
    go.Scatter(x=sequence_df.actual_utm_x, y=sequence_df.actual_utm_y, mode='lines', name="actual"),
    go.Scatter(x=sequence_df.ortools_utm_x, y=sequence_df.ortools_utm_y, mode='lines', name='ortools'),
    go.Scatter(x=sequence_df.ors_ortools_utm_x, y=sequence_df.ors_ortools_utm_y, mode='lines', name='ors_ortools')
    ]
)

fig.show()