# ALMRRC2021 Data Exploration


In [1]:
LOCAL = True

if not LOCAL:
    !pip install plotly
    !conda install -c conda-forge -c plotly jupyter-dash
    !pip install python-dotenv
    !pip install utm
    !pip install ortools
    !pip install openrouteservice
    !git clone "https://github.com/mschrader15/amazon-routing-challenge.git"
    

## Imports


In [2]:
import os
import sys
import pathlib
import json
import time
import random
import itertools


import pandas as pd
import numpy as np
import plotly.graph_objects as go
import utm
from dotenv import load_dotenv

if not LOCAL:
    from io import StringIO
    from google.cloud import storage

    client = storage.Client()
    bucket = client.get_bucket("kale-dataproc-notebook")
    load_dotenv(
        stream=StringIO(bucket.blob("shared/navish/.env").download_as_string().decode())
    )

    # hacky for now until we create a proper library
    sys.path.append("/amazon-routing-challenge/plotting")

else:

    def find_root(path):
        if os.path.split(path)[-1] != "amazon-routing-challenge":
            return find_root(os.path.split(path)[0])
        return path

    sys.path.append(find_root(pathlib.Path().absolute()))
    load_dotenv()

from almrcc_tools.almrcc_score import score


In [3]:
if LOCAL:
    # only unix
    PATH = !pwd
    PATH = PATH[0]

    path='../data/almrrc2021-data-training/model_build_inputs_10_percent'
    def get_file(file_name,  ) -> str:
        with open(os.path.join(path, file_name), "r") as f:
            return f.read()

else:
    
    from google.cloud import storage

    client = storage.Client()
    bucket = client.get_bucket('research-data-staging')
    

    path = 'almrrc2021/model_build_inputs_10_percent'
    def get_file(file_name,) -> str:
        return bucket.blob(os.path.join(path, file_name)).download_as_string().decode('utf-8')




## Read in the JSON


In [5]:
route_data = json.loads(get_file(file_name="route_data.json"))
time_data = json.loads(get_file(file_name="travel_times.json"))
package_data = json.loads(get_file(file_name="package_data.json"))
actual_sequence = json.loads(get_file(file_name="actual_sequences.json"))
lkh_sequence = json.loads(get_file(file_name="proposed_sequences.json"))  # this is solved from lkh

In [8]:
package_data

{'RouteID_3f166f0e-fd2e-47ab-96a0-6cbc99cc6eef': {'AE': {'PackageID_ac1927f3-812c-4fc9-be7b-f786e2f1567b': {'scan_status': 'DELIVERED',
    'time_window': {'start_time_utc': nan, 'end_time_utc': nan},
    'planned_service_time_seconds': 78.0,
    'dimensions': {'depth_cm': 26.2, 'height_cm': 6.9, 'width_cm': 23.6}}},
  'AG': {},
  'AH': {'PackageID_6a8081fd-4eb8-4256-a15f-c2c598473945': {'scan_status': 'DELIVERED',
    'time_window': {'start_time_utc': nan, 'end_time_utc': nan},
    'planned_service_time_seconds': 125.7,
    'dimensions': {'depth_cm': 24.0, 'height_cm': 1.4, 'width_cm': 16.4}},
   'PackageID_8bd134c9-e4e2-4608-84b9-240f6b3cf914': {'scan_status': 'DELIVERED',
    'time_window': {'start_time_utc': nan, 'end_time_utc': nan},
    'planned_service_time_seconds': 125.7,
    'dimensions': {'depth_cm': 24.1, 'height_cm': 4.1, 'width_cm': 16.5}},
   'PackageID_584af658-954f-42cd-9955-208a69e49322': {'scan_status': 'DELIVERED',
    'time_window': {'start_time_utc': nan, 'end_tim

### Find a Route ID Present in All Data


In [5]:
for k in list(route_data.keys())[random.randint(0, len(route_data.keys()) - 1) :]:
    if all(
        k in _d.keys() for _d in [route_data, time_data, package_data, actual_sequence]
    ):
        print(k)
        GLOBAL_ROUTE_ID = k
        break


RouteID_fffd257c-3041-4736-be7a-5efea8af1173


## Convert the JSON data to a pandas dataframe


In [6]:
route_df = pd.DataFrame.from_records(
    (
        {
            "stop_id": k,
            "route_id": route_id,
            **v,
        }
        for route_id in route_data.keys()
        for k, v in route_data[route_id]["stops"].items()
    )
)


In [7]:
travel_times = pd.DataFrame.from_records(
    {"stop_id": k, "time": v, "route_id": route_id}
    for route_id, time_data in time_data.items()
    for k, v in time_data.items()
)


In [None]:
travel_times = pd.DataFrame.from_records(
    {"stop_id": k, "time": v, "route_id": route_id}
    for route_id, time_data in time_data.items()
    for k, v in time_data.items()
)

## Create the Travel Time Matrices for Each Route


In [8]:
def _add_row(x, square_df) -> None:
    square_df.loc[x["stop_id"], list(x["time"].keys())] = list(x["time"].values())


travel_times["time_matrix"] = ""
df_map = {}
lat_long_map = {}
route_df_grouper = route_df.groupby("route_id")
# actual_sequence_grouper = actual_sequence.groupby("route_id")

for r_id, _df in travel_times.groupby("route_id"):
    # get the actual sequence for this route
    _actual = actual_sequence.get(r_id)
    # create a square matrix of all the stop ids
    _square_df = pd.DataFrame(index=_df.stop_id.unique(), columns=_df.stop_id.unique())
    # convert the json travel time format to  a matrix
    _df.apply(_add_row, axis=1, square_df=_square_df)
    # sort the matrix by moving the depot to the top
    df_map[r_id] = _square_df.sort_index(
        key=lambda x: x.map(_actual["actual"])
    ).sort_index(axis=1, key=lambda x: x.map(_actual["actual"]))
    # create a map to get the lat long for each stop
    lat_long_map[r_id] = (
        route_df_grouper.get_group(r_id)[["stop_id", "lat", "lng"]]
        .set_index("stop_id")
        .to_dict("index")
    )

# drop all but the first row of the matrix
travel_times = travel_times.groupby("route_id").first()
travel_times.drop(["stop_id", "time"], axis=1, inplace=True)


In [9]:
travel_times["time_matrix"] = travel_times.index.map(df_map)
travel_times["lat_lon_map"] = travel_times.index.map(lat_long_map)


## Find Optimal Path using Amazon Travel Times

Using Google's OR Tools


In [10]:
"""Simple Travelling Salesperson Problem (TSP) between cities."""

from ortools.constraint_solver import routing_enums_pb2
from ortools.constraint_solver import pywrapcp


def create_data_model(tt_matrix):
    """Stores the data for the problem."""
    return {
        "distance_matrix": tt_matrix,
        "num_vehicles": 1,
        "depot": 0,
    }


def get_sequence(manager, routing, solution) -> list:
    """Prints solution on console."""
    index = routing.Start(0)
    sequence = []
    while not routing.IsEnd(index):
        sequence.append(manager.IndexToNode(index))
        index = solution.Value(routing.NextVar(index))
    sequence.append(manager.IndexToNode(index))
    return sequence


def print_solution(manager, routing, solution):
    """Prints solution on console."""
    print(f"Objective: {solution.ObjectiveValue()} miles")
    index = routing.Start(0)
    plan_output = "Route for vehicle 0:\n"
    route_distance = 0
    while not routing.IsEnd(index):
        plan_output += f" {manager.IndexToNode(index)} ->"
        previous_index = index
        index = solution.Value(routing.NextVar(index))
        route_distance += routing.GetArcCostForVehicle(previous_index, index, 0)
    plan_output += f" {manager.IndexToNode(index)}\n"
    print(plan_output)
    plan_output += f"Route distance: {route_distance}miles\n"


def or_route(tt_matrix):
    """Entry point of the program."""
    # Instantiate the data problem.
    data = create_data_model(tt_matrix)

    # Create the routing index manager.
    manager = pywrapcp.RoutingIndexManager(
        len(data["distance_matrix"]), data["num_vehicles"], data["depot"]
    )

    # Create Routing Model.
    routing = pywrapcp.RoutingModel(manager)

    def distance_callback(from_index, to_index):
        """Returns the distance between the two nodes."""
        # Convert from routing variable Index to distance matrix NodeIndex.
        from_node = manager.IndexToNode(from_index)
        to_node = manager.IndexToNode(to_index)
        return data["distance_matrix"][from_node][to_node]

    transit_callback_index = routing.RegisterTransitCallback(distance_callback)

    # Define cost of each arc.
    routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)

    # Setting first solution heuristic.
    search_parameters = pywrapcp.DefaultRoutingSearchParameters()
    search_parameters.first_solution_strategy = (
        routing_enums_pb2.FirstSolutionStrategy.PATH_CHEAPEST_ARC
    )

    # Solve the problem.
    solution = routing.SolveWithParameters(search_parameters)

    # Print solution on console.
    if solution:
        print_solution(manager, routing, solution)

    return get_sequence(manager, routing, solution)


sol = or_route(travel_times.loc[GLOBAL_ROUTE_ID, "time_matrix"].values)


Objective: 15508 miles
Route for vehicle 0:
 0 -> 11 -> 12 -> 13 -> 14 -> 16 -> 15 -> 26 -> 25 -> 24 -> 27 -> 28 -> 29 -> 30 -> 31 -> 32 -> 33 -> 34 -> 38 -> 37 -> 43 -> 44 -> 46 -> 45 -> 53 -> 54 -> 55 -> 56 -> 57 -> 58 -> 59 -> 61 -> 62 -> 63 -> 64 -> 66 -> 65 -> 67 -> 68 -> 69 -> 70 -> 71 -> 72 -> 73 -> 78 -> 77 -> 76 -> 75 -> 74 -> 172 -> 171 -> 91 -> 92 -> 96 -> 95 -> 94 -> 93 -> 97 -> 98 -> 100 -> 99 -> 101 -> 88 -> 89 -> 90 -> 87 -> 86 -> 85 -> 84 -> 83 -> 82 -> 81 -> 80 -> 79 -> 167 -> 166 -> 165 -> 164 -> 163 -> 169 -> 168 -> 170 -> 162 -> 161 -> 129 -> 128 -> 127 -> 126 -> 125 -> 108 -> 102 -> 112 -> 113 -> 111 -> 110 -> 109 -> 103 -> 104 -> 105 -> 124 -> 123 -> 107 -> 106 -> 114 -> 115 -> 116 -> 117 -> 118 -> 120 -> 173 -> 119 -> 121 -> 122 -> 133 -> 136 -> 135 -> 134 -> 137 -> 138 -> 139 -> 140 -> 141 -> 142 -> 143 -> 144 -> 145 -> 146 -> 147 -> 148 -> 149 -> 150 -> 151 -> 152 -> 153 -> 154 -> 155 -> 156 -> 157 -> 158 -> 159 -> 160 -> 130 -> 131 -> 132 -> 60 -> 47 -> 48 -> 

## Pull OpenRouteService Travel Times


In [151]:
import openrouteservice as ors

ors_client = ors.Client(key=os.environ["ORS_KEY"])


def get_ors_travel_time(locations, destination_indexes):
    return ors_client.distance_matrix(
        locations,
        destinations=destination_indexes,
        profile="driving-hgv",
        metrics=["duration", "distance"],
    )


In [1]:
170 * 170

28900

In [152]:
mapper = travel_times.loc[GLOBAL_ROUTE_ID, "lat_lon_map"]


ordered_index = travel_times.loc[GLOBAL_ROUTE_ID, "time_matrix"].index

ors_tt_df = travel_times.loc[GLOBAL_ROUTE_ID, "time_matrix"].copy()

ors_tt_df.loc[:, :] = 0


location_pairs = ordered_index.map(
    lambda x: (mapper[x]["lng"], mapper[x]["lat"])
).values.tolist()

#  the length of the location pairs must be ATLEAST less than the ORS limit of 3500 per call
assert len(location_pairs) <= 3500


start_ind = 0
end_ind = 3500 // len(location_pairs)
while start_ind < len(location_pairs) - 1:
    # print(f"{start_ind} : {end_ind}")

    try:
        ors_tt_df.iloc[:, start_ind:end_ind] = get_ors_travel_time(
            location_pairs, list(range(start_ind, end_ind))
        )["durations"]
    except Warning as e:
        print("sleeping due to rate limiter", e)
        time.sleep(60)

    start_ind = end_ind
    end_ind += 3500 // len(location_pairs)
    end_ind = min(end_ind, len(location_pairs))


In [153]:
ors_sol = or_route(ors_tt_df.values)


Objective: 16066 miles
Route for vehicle 0:
 0 -> 4 -> 5 -> 3 -> 6 -> 7 -> 8 -> 9 -> 10 -> 11 -> 12 -> 13 -> 14 -> 16 -> 15 -> 1 -> 2 -> 17 -> 18 -> 19 -> 20 -> 21 -> 22 -> 23 -> 24 -> 25 -> 26 -> 27 -> 28 -> 29 -> 30 -> 31 -> 35 -> 34 -> 33 -> 32 -> 36 -> 37 -> 38 -> 50 -> 49 -> 47 -> 48 -> 52 -> 51 -> 39 -> 40 -> 41 -> 42 -> 43 -> 44 -> 46 -> 45 -> 53 -> 54 -> 55 -> 56 -> 57 -> 58 -> 59 -> 61 -> 62 -> 63 -> 64 -> 66 -> 65 -> 67 -> 68 -> 69 -> 70 -> 71 -> 72 -> 73 -> 60 -> 78 -> 77 -> 76 -> 75 -> 74 -> 81 -> 82 -> 83 -> 84 -> 85 -> 86 -> 87 -> 88 -> 89 -> 90 -> 80 -> 79 -> 172 -> 171 -> 91 -> 92 -> 96 -> 95 -> 94 -> 93 -> 97 -> 98 -> 100 -> 99 -> 101 -> 111 -> 113 -> 112 -> 102 -> 108 -> 106 -> 107 -> 110 -> 109 -> 103 -> 104 -> 105 -> 124 -> 123 -> 122 -> 114 -> 115 -> 116 -> 117 -> 118 -> 119 -> 173 -> 120 -> 121 -> 125 -> 126 -> 127 -> 128 -> 130 -> 131 -> 132 -> 160 -> 159 -> 158 -> 157 -> 156 -> 155 -> 154 -> 153 -> 152 -> 151 -> 150 -> 149 -> 148 -> 147 -> 146 -> 145 -> 144 -> 1

### Map the Sequence Idx to the Stop Ids


In [158]:
seq_names = travel_times.loc[GLOBAL_ROUTE_ID, "time_matrix"].index

seq_ors = ors_tt_df.index

# create a simple df of the actual sequence
sequence_df = pd.DataFrame(
    index=actual_sequence[GLOBAL_ROUTE_ID]["actual"].values(),
    data=actual_sequence[GLOBAL_ROUTE_ID]["actual"].keys(),
    columns=["actual"],
)
sequence_df.sort_index(inplace=True)

# write the ortools solution to the dataframe.
sequence_df["ortools"] = pd.DataFrame(
    index=range(len(sol)), data=[seq_names[i] for i in sol]
)
sequence_df["ors_ortools"] = pd.DataFrame(
    index=range(len(ors_sol)), data=[seq_ors[i] for i in ors_sol]
)
sequence_df["lkh"] = pd.DataFrame(
    index=lkh_sequence[GLOBAL_ROUTE_ID]["proposed"].values(),
    data=lkh_sequence[GLOBAL_ROUTE_ID]["proposed"].keys(),
)


### Plot the Sequences

This is not a meaningful way to plot...


In [159]:
_df = _df.sort_index()

fig = go.Figure(
    data=[
        go.Scatter(
            x=sequence_df.index, y=sequence_df.actual, mode="lines", name="actual"
        )
    ]
)
fig.add_trace(
    go.Scatter(x=sequence_df.index, y=sequence_df.ortools, mode="lines", name="ortools")
)
fig.add_trace(
    go.Scatter(
        x=sequence_df.index, y=sequence_df.ors_ortools, mode="lines", name="ors_ortools"
    )
)
fig.add_trace(
    go.Scatter(x=sequence_df.index, y=sequence_df.lkh, mode="lines", name="lkh")
)

fig.show()


### Score the OR Tools solution, accoring to Amazon's algorithm


In [160]:
score(
    sequence_df.ortools.values.tolist(),
    sequence_df.actual.values.tolist(),
    time_data[GLOBAL_ROUTE_ID],
)


0.10219952459160135

### Score the OR Tools solution w/ ORS TT Matrix, accoring to Amazon's algorithm


In [161]:
score(
    sequence_df.ors_ortools.values.tolist(),
    sequence_df.actual.values.tolist(),
    time_data[GLOBAL_ROUTE_ID],
)


0.007540257354676536

### Score the LKH Algorithm


In [162]:
score(
    sequence_df.ors_ortools.values.tolist(),
    sequence_df.lkh.values.tolist(),
    time_data[GLOBAL_ROUTE_ID],
)


0.006876336263749528

In [163]:
mapper = travel_times.loc[GLOBAL_ROUTE_ID, "lat_lon_map"]

for sol_type in ["ortools", "actual", "ors_ortools", "lkh"]:
    sequence_df[f"{sol_type}_lat"] = sequence_df[sol_type].map(
        lambda x: mapper[x]["lat"]
    )
    sequence_df[f"{sol_type}_lon"] = sequence_df[sol_type].map(
        lambda x: mapper[x]["lng"]
    )

    # get UTM x/y
    x, y, _, _ = utm.from_latlon(
        sequence_df[f"{sol_type}_lat"].values, sequence_df[f"{sol_type}_lon"].values
    )
    sequence_df[f"{sol_type}_utm_x"] = x
    sequence_df[f"{sol_type}_utm_y"] = y


In [177]:
# px.colors.diverging.RdBu


In [193]:
import numpy as np
import plotly.express as px

colors = px.colors.diverging.RdBu
# color_bar = go.ColorBar(
#     title="Order",

# )
sequence_df["order_string"] = sequence_df.index.map(lambda x: str(x))

fig = go.Figure()

for i, sol_type in enumerate(["actual", "lkh", "ortools", "ors_ortools"]):

    fig.add_trace(
        go.Scattermapbox(
            name=sol_type,
            lat=sequence_df[f"{sol_type}_lat"],
            lon=sequence_df[f"{sol_type}_lon"],
            mode="text+lines",
            text=sequence_df["order_string"],
            # marker=dict(w)
            line=dict(color=colors[i]),
            textfont=dict(family="sans serif", size=22, color=colors[i]),
        )
    )

fig.update_layout(
    # autosize=True,
    height=600,
    width=1000,
    hovermode="closest",
    mapbox=go.layout.Mapbox(
        accesstoken=os.environ["MAPBOX_KEY"],
        # style="mapbox://styles/max-schrader/ck8t1cmmc02wk1it9rv28iyte",
        style="mapbox://styles/max-schrader/cl6lhvrfw001516pkh3s6iv7l",
        bearing=0,
        center=go.layout.mapbox.Center(
            lat=sequence_df["ortools_lat"].mean(), lon=sequence_df["ortools_lon"].mean()
        ),
        pitch=0,
        zoom=12,
    ),
    # margin=go.layout.Margin(l=0, r=0, t=0, b=0),
)


# # add the annotations
# for i, row in sequence_df.iterrows():
#     # for sol_type in ['ortools', 'actu/al', 'ors_ortools', 'lkh']:
#         fig.add_annotation(x=row['ortools_lat'], y=row['ortools_lon'],
#             text="Text annotation with arrow",
#             showarrow=True,
#             arrowhead=1)


fig.show()


In [180]:
# fig = go.Figure(data=[
#     go.Scatter(x=sequence_df.actual_utm_x, y=sequence_df.actual_utm_y, mode='lines', name="actual"),
#     go.Scatter(x=sequence_df.ortools_utm_x, y=sequence_df.ortools_utm_y, mode='lines', name='ortools'),
#     go.Scatter(x=sequence_df.ors_ortools_utm_x, y=sequence_df.ors_ortools_utm_y, mode='lines', name='ors_ortools')
#     ]
# )

# fig.show()


In [187]:
sequence_df[f"{sol_type}_lat"].iloc[
                    1:5 + 1,
                ],

(1    42.191658
 2    42.191888
 3    42.193048
 4    42.193136
 5    42.191895
 Name: ors_ortools_lat, dtype: float64,)

## Animation


In [201]:
fig = go.Figure()

colors = ["blue", "red"]

for i, sol_type in enumerate(['ors_ortools', 'actual']):
    fig.add_trace(
        go.Scattermapbox(
            name=sol_type,
            lat=[sequence_df[f"{sol_type}_lat"].iloc[0]],
            lon=[sequence_df[f"{sol_type}_lon"].iloc[0]],
            mode="text+lines",
            text=[sequence_df["order_string"].iloc[0]],
            # marker=dict(w)
            line=dict(color=colors[i]),
            textfont=dict(family="sans serif", size=22, color=colors[i]),
        )
    )


sol_type = "ors_ortools"

frames = [
    go.Frame(
        data=[
            go.Scattermapbox(
                lat=sequence_df[f"{sol_type}_lat"].iloc[
                    :k + 1,
                ],
                lon=sequence_df[f"{sol_type}_lon"].iloc[:k + 1],
                mode="lines",
                text=sequence_df["order_string"].iloc[:k + 1],
            # marker=dict(w)
                line=dict(color=colors[0], width=3),
                # textfont=dict(family="sans serif", size=22, color=colors[i]),
                
            ),
            go.Scattermapbox(
                lat=sequence_df[f"actual_lat"].iloc[
                    :k + 1,
                ],
                lon=sequence_df[f"actual_lon"].iloc[:k + 1],
                mode="lines",
                text=sequence_df["order_string"].iloc[:k + 1],
            # marker=dict(w)
                line=dict(color=colors[1], width=5),
                # textfont=dict(family="sans serif", size=22, color=colors[i]),
                
            )
        ],
        traces=[0, 1],
        name=f"frame{k}",
    )
    for k in range(len(sequence_df))
]


fig.update(frames=frames)
sliders = [
    dict(
        steps=[
            dict(
                method="animate",
                args=[
                    [f"frame{k}", ],
                    dict(
                        mode="immediate",
                        frame=dict(
                            duration=100, redraw=True
                        ),  # could update the transition time to be scaled by the predicted travel time
                        transition=dict(duration=0),
                    ),
                ],
                label="{:d}".format(k),
            )
            for k in range(len(sequence_df))
        ],
        transition=dict(duration=0),
        x=0,  # slider starting position
        y=0,
        currentvalue=dict(
            font=dict(size=12), prefix="Point: ", visible=True, xanchor="center"
        ),
        len=1.0,
    )
]

fig.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            showactive=False,
            y=0,
            x=1.05,
            xanchor="right",
            yanchor="top",
            pad=dict(t=0, r=10),
            buttons=[
                dict(
                    label="Play",
                    method="animate",
                    args=[
                        None,
                        dict(
                            frame=dict(duration=200, redraw=True),
                            transition=dict(duration=0),
                            fromcurrent=True,
                            mode="immediate",
                        ),
                    ],
                )
            ],
        )
    ],
    sliders=sliders,
)

fig.update_layout(
    # autosize=True,
    height=600,
    width=1000,
    hovermode="closest",
    mapbox=go.layout.Mapbox(
        accesstoken=os.environ["MAPBOX_KEY"],
        # style="mapbox://styles/max-schrader/ck8t1cmmc02wk1it9rv28iyte",
        style="mapbox://styles/max-schrader/cl6lhvrfw001516pkh3s6iv7l",
        bearing=0,
        center=go.layout.mapbox.Center(
            lat=sequence_df["ortools_lat"].mean(), lon=sequence_df["ortools_lon"].mean()
        ),
        pitch=0,
        zoom=12,
    ),
)


fig.show()
